diff --git a/.github/labeler.yml b/.github/labeler.yml index 6617acbf9187e..be78eba4baf8b 100644 --- a/.github/labeler.yml +++ b/.github/labeler.yml @@ -93,9 +93,9 @@ SQL: - changed-files: - all-globs-to-any-file: [ '**/sql/**/*', - '!python/pyspark/sql/avro/**/*', - '!python/pyspark/sql/streaming/**/*', - '!python/pyspark/sql/tests/streaming/test_streaming*.py' + '!python/**/avro/**/*', + '!python/**/protobuf/**/*', + '!python/**/streaming/**/*' ] - any-glob-to-any-file: [ 'common/unsafe/**/*', @@ -119,7 +119,7 @@ AVRO: - changed-files: - any-glob-to-any-file: [ 'connector/avro/**/*', - 'python/pyspark/sql/avro/**/*' + 'python/**/avro/**/*' ] DSTREAM: @@ -152,9 +152,8 @@ ML: MLLIB: - changed-files: - any-glob-to-any-file: [ - '**/spark/mllib/**/*', - 'mllib-local/**/*', - 'python/pyspark/mllib/**/*' + '**/mllib/**/*', + 'mllib-local/**/*' ] STRUCTURED STREAMING: @@ -162,8 +161,7 @@ STRUCTURED STREAMING: - any-glob-to-any-file: [ '**/sql/**/streaming/**/*', 'connector/kafka-0-10-sql/**/*', - 'python/pyspark/sql/streaming/**/*', - 'python/pyspark/sql/tests/streaming/test_streaming*.py', + 'python/pyspark/sql/**/streaming/**/*', '**/*streaming.R' ] @@ -226,13 +224,12 @@ CONNECT: - any-glob-to-any-file: [ 'sql/connect/**/*', 'connector/connect/**/*', - 'python/pyspark/sql/**/connect/**/*', - 'python/pyspark/ml/**/connect/**/*' + 'python/**/connect/**/*' ] PROTOBUF: - changed-files: - any-glob-to-any-file: [ 'connector/protobuf/**/*', - 'python/pyspark/sql/protobuf/**/*' + 'python/**/protobuf/**/*' ] diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 3117872e21680..ef11c8416b0ae 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -41,7 +41,7 @@ on: description: Additional environment variables to set when running the tests. Should be in JSON format. 
required: false type: string - default: '{}' + default: '{"PYSPARK_IMAGE_TO_TEST": "python-311", "PYTHON_TO_TEST": "python3.11"}' jobs: description: >- Jobs to run, and should be in JSON format. The values should be matched with the job's key defined @@ -64,6 +64,8 @@ jobs: image_lint_url_link: ${{ steps.infra-image-link.outputs.image_lint_url_link }} image_sparkr_url: ${{ steps.infra-image-sparkr-outputs.outputs.image_sparkr_url }} image_sparkr_url_link: ${{ steps.infra-image-link.outputs.image_sparkr_url_link }} + image_pyspark_url: ${{ steps.infra-image-pyspark-outputs.outputs.image_pyspark_url }} + image_pyspark_url_link: ${{ steps.infra-image-link.outputs.image_pyspark_url_link }} steps: - name: Checkout Spark repository uses: actions/checkout@v4 @@ -82,10 +84,11 @@ jobs: id: set-outputs run: | if [ -z "${{ inputs.jobs }}" ]; then - pyspark_modules=`cd dev && python -c "import sparktestsupport.modules as m; print(','.join(m.name for m in m.all_modules if m.name.startswith('pyspark')))"` + pyspark_modules=`cd dev && python -c "import sparktestsupport.modules as m; print(','.join(m.name for m in m.all_modules if m.name.startswith('pyspark') and not m.name.startswith('pyspark-pandas')))"` + pyspark_pandas_modules=`cd dev && python -c "import sparktestsupport.modules as m; print(','.join(m.name for m in m.all_modules if m.name.startswith('pyspark-pandas')))"` pyspark=`./dev/is-changed.py -m $pyspark_modules` + pandas=`./dev/is-changed.py -m $pyspark_pandas_modules` if [[ "${{ github.repository }}" != 'apache/spark' ]]; then - pandas=$pyspark yarn=`./dev/is-changed.py -m yarn` kubernetes=`./dev/is-changed.py -m kubernetes` sparkr=`./dev/is-changed.py -m sparkr` @@ -132,6 +135,28 @@ jobs: precondition="${precondition//$'\n'/}" echo "required=$precondition" >> $GITHUB_OUTPUT fi + - name: Check envs + id: check-envs + if: inputs.branch != 'branch-3.5' + env: ${{ fromJSON(inputs.envs) }} + run: | + if [[ "${{ fromJson(steps.set-outputs.outputs.required).pyspark }}" == 
'true' || "${{ fromJson(steps.set-outputs.outputs.required).pyspark-pandas }}" == 'true' ]]; then + if [[ "${{ env.PYSPARK_IMAGE_TO_TEST }}" == "" ]]; then + echo "PYSPARK_IMAGE_TO_TEST is required when pyspark is enabled." + exit 1 + fi + PYSPARK_IMAGE_PATH="dev/spark-test-image/${{ env.PYSPARK_IMAGE_TO_TEST }}/Dockerfile" + if [ -f $PYSPARK_IMAGE_PATH ]; then + echo "Dockerfile $PYSPARK_IMAGE_PATH exists." + else + echo "Dockerfile $PYSPARK_IMAGE_PATH does NOT exist." + exit 1 + fi + if [[ "${{ env.PYTHON_TO_TEST }}" == "" ]]; then + echo "PYTHON_TO_TEST is required when pyspark is enabled." + exit 1 + fi + fi - name: Generate infra image URL id: infra-image-outputs run: | @@ -164,8 +189,19 @@ jobs: IMG_NAME="apache-spark-ci-image-sparkr:${{ inputs.branch }}-${{ github.run_id }}" IMG_URL="ghcr.io/$REPO_OWNER/$IMG_NAME" echo "image_sparkr_url=$IMG_URL" >> $GITHUB_OUTPUT + - name: Generate infra image URL (PySpark ${{ env.PYSPARK_IMAGE_TO_TEST }}) + id: infra-image-pyspark-outputs + if: ${{ env.PYSPARK_IMAGE_TO_TEST }} + env: ${{ fromJSON(inputs.envs) }} + run: | + # Convert to lowercase to meet Docker repo name requirement + REPO_OWNER=$(echo "${{ github.repository_owner }}" | tr '[:upper:]' '[:lower:]') + IMG_NAME="apache-spark-ci-image-pyspark-${{ env.PYSPARK_IMAGE_TO_TEST }}:${{ inputs.branch }}-${{ github.run_id }}" + IMG_URL="ghcr.io/$REPO_OWNER/$IMG_NAME" + echo "image_pyspark_url=$IMG_URL" >> $GITHUB_OUTPUT - name: Link the docker images id: infra-image-link + env: ${{ fromJSON(inputs.envs) }} run: | # Set the image URL for job "docs" # Should delete the link and directly use image_docs_url after SPARK 3.x EOL @@ -173,10 +209,12 @@ jobs: echo "image_docs_url_link=${{ steps.infra-image-outputs.outputs.image_url }}" >> $GITHUB_OUTPUT echo "image_lint_url_link=${{ steps.infra-image-outputs.outputs.image_url }}" >> $GITHUB_OUTPUT echo "image_sparkr_url_link=${{ steps.infra-image-outputs.outputs.image_url }}" >> $GITHUB_OUTPUT + echo "image_pyspark_url_link=${{ 
steps.infra-image-outputs.outputs.image_url }}" >> $GITHUB_OUTPUT else echo "image_docs_url_link=${{ steps.infra-image-docs-outputs.outputs.image_docs_url }}" >> $GITHUB_OUTPUT echo "image_lint_url_link=${{ steps.infra-image-lint-outputs.outputs.image_lint_url }}" >> $GITHUB_OUTPUT echo "image_sparkr_url_link=${{ steps.infra-image-sparkr-outputs.outputs.image_sparkr_url }}" >> $GITHUB_OUTPUT + echo "image_pyspark_url_link=${{ steps.infra-image-pyspark-outputs.outputs.image_pyspark_url }}" >> $GITHUB_OUTPUT fi # Build: build Spark and run the tests for specified modules. @@ -204,7 +242,7 @@ jobs: - >- api, catalyst, hive-thriftserver - >- - mllib-local, mllib, graphx + mllib-local, mllib, graphx, profiler - >- streaming, sql-kafka-0-10, streaming-kafka-0-10, streaming-kinesis-asl, kubernetes, hadoop-cloud, spark-ganglia-lgpl, protobuf, connect @@ -257,8 +295,6 @@ jobs: INCLUDED_TAGS: ${{ matrix.included-tags }} HADOOP_PROFILE: ${{ matrix.hadoop }} HIVE_PROFILE: ${{ matrix.hive }} - # GitHub Actions' default miniconda to use in pip packaging test. - CONDA_PREFIX: /usr/share/miniconda GITHUB_PREV_SHA: ${{ github.event.before }} SPARK_LOCAL_IP: localhost NOLINT_ON_COMPILE: true @@ -320,7 +356,7 @@ jobs: - name: Install Python packages (Python 3.11) if: (contains(matrix.modules, 'sql') && !contains(matrix.modules, 'sql-')) || contains(matrix.modules, 'connect') run: | - python3.11 -m pip install 'numpy>=1.20.0' pyarrow pandas scipy unittest-xml-reporting 'lxml==4.9.4' 'grpcio==1.67.0' 'grpcio-status==1.67.0' 'protobuf==5.28.3' + python3.11 -m pip install 'numpy>=1.20.0' pyarrow pandas scipy unittest-xml-reporting 'lxml==4.9.4' 'grpcio==1.67.0' 'grpcio-status==1.67.0' 'protobuf==5.29.1' python3.11 -m pip list # Run the tests. 
- name: Run tests @@ -355,7 +391,9 @@ jobs: needs: precondition if: >- fromJson(needs.precondition.outputs.required).pyspark == 'true' || + fromJson(needs.precondition.outputs.required).pyspark-pandas == 'true' || fromJson(needs.precondition.outputs.required).lint == 'true' || + fromJson(needs.precondition.outputs.required).docs == 'true' || fromJson(needs.precondition.outputs.required).sparkr == 'true' runs-on: ubuntu-latest permissions: @@ -385,7 +423,8 @@ jobs: uses: docker/setup-qemu-action@v3 - name: Set up Docker Buildx uses: docker/setup-buildx-action@v3 - - name: Build and push + - name: Build and push for branch-3.5 + if: inputs.branch == 'branch-3.5' id: docker_build uses: docker/build-push-action@v6 with: @@ -396,7 +435,7 @@ jobs: # Use the infra image cache to speed up cache-from: type=registry,ref=ghcr.io/apache/spark/apache-spark-github-action-image-cache:${{ inputs.branch }} - name: Build and push (Documentation) - if: hashFiles('dev/spark-test-image/docs/Dockerfile') != '' + if: ${{ inputs.branch != 'branch-3.5' && fromJson(needs.precondition.outputs.required).docs == 'true' && hashFiles('dev/spark-test-image/docs/Dockerfile') != '' }} id: docker_build_docs uses: docker/build-push-action@v6 with: @@ -407,7 +446,7 @@ jobs: # Use the infra image cache to speed up cache-from: type=registry,ref=ghcr.io/apache/spark/apache-spark-github-action-image-docs-cache:${{ inputs.branch }} - name: Build and push (Linter) - if: hashFiles('dev/spark-test-image/lint/Dockerfile') != '' + if: ${{ inputs.branch != 'branch-3.5' && fromJson(needs.precondition.outputs.required).lint == 'true' && hashFiles('dev/spark-test-image/lint/Dockerfile') != '' }} id: docker_build_lint uses: docker/build-push-action@v6 with: @@ -418,7 +457,7 @@ jobs: # Use the infra image cache to speed up cache-from: type=registry,ref=ghcr.io/apache/spark/apache-spark-github-action-image-lint-cache:${{ inputs.branch }} - name: Build and push (SparkR) - if: 
hashFiles('dev/spark-test-image/sparkr/Dockerfile') != '' + if: ${{ inputs.branch != 'branch-3.5' && fromJson(needs.precondition.outputs.required).sparkr == 'true' && hashFiles('dev/spark-test-image/sparkr/Dockerfile') != '' }} id: docker_build_sparkr uses: docker/build-push-action@v6 with: @@ -428,17 +467,29 @@ jobs: ${{ needs.precondition.outputs.image_sparkr_url }} # Use the infra image cache to speed up cache-from: type=registry,ref=ghcr.io/apache/spark/apache-spark-github-action-image-sparkr-cache:${{ inputs.branch }} + - name: Build and push (PySpark with ${{ env.PYSPARK_IMAGE_TO_TEST }}) + if: ${{ inputs.branch != 'branch-3.5' && (fromJson(needs.precondition.outputs.required).pyspark == 'true' || fromJson(needs.precondition.outputs.required).pyspark-pandas == 'true') && env.PYSPARK_IMAGE_TO_TEST != '' }} + id: docker_build_pyspark + env: ${{ fromJSON(inputs.envs) }} + uses: docker/build-push-action@v6 + with: + context: ./dev/spark-test-image/${{ env.PYSPARK_IMAGE_TO_TEST }}/ + push: true + tags: | + ${{ needs.precondition.outputs.image_pyspark_url }} + # Use the infra image cache to speed up + cache-from: type=registry,ref=ghcr.io/apache/spark/apache-spark-github-action-image-pyspark-${{ env.PYSPARK_IMAGE_TO_TEST }}-cache:${{ inputs.branch }} pyspark: needs: [precondition, infra-image] # always run if pyspark == 'true', even infra-image is skip (such as non-master job) - if: (!cancelled()) && fromJson(needs.precondition.outputs.required).pyspark == 'true' + if: (!cancelled()) && (fromJson(needs.precondition.outputs.required).pyspark == 'true' || fromJson(needs.precondition.outputs.required).pyspark-pandas == 'true') name: "Build modules: ${{ matrix.modules }}" runs-on: ubuntu-latest timeout-minutes: 180 container: - image: ${{ needs.precondition.outputs.image_url }} + image: ${{ needs.precondition.outputs.image_pyspark_url_link }} strategy: fail-fast: false matrix: @@ -448,7 +499,7 @@ jobs: - >- pyspark-sql, pyspark-resource, pyspark-testing - >- - 
pyspark-core, pyspark-errors, pyspark-streaming + pyspark-core, pyspark-errors, pyspark-streaming, pyspark-logger - >- pyspark-mllib, pyspark-ml, pyspark-ml-connect - >- @@ -466,6 +517,13 @@ jobs: - >- pyspark-pandas-connect-part3 exclude: + # Always run if pyspark == 'true', even infra-image is skip (such as non-master job) + # In practice, the build will run in individual PR, but not against the individual commit + # in Apache Spark repository. + - modules: ${{ fromJson(needs.precondition.outputs.required).pyspark != 'true' && 'pyspark-sql, pyspark-resource, pyspark-testing' }} + - modules: ${{ fromJson(needs.precondition.outputs.required).pyspark != 'true' && 'pyspark-core, pyspark-errors, pyspark-streaming, pyspark-logger' }} + - modules: ${{ fromJson(needs.precondition.outputs.required).pyspark != 'true' && 'pyspark-mllib, pyspark-ml, pyspark-ml-connect' }} + - modules: ${{ fromJson(needs.precondition.outputs.required).pyspark != 'true' && 'pyspark-connect' }} # Always run if pyspark-pandas == 'true', even infra-image is skip (such as non-master job) # In practice, the build will run in individual PR, but not against the individual commit # in Apache Spark repository. @@ -477,11 +535,8 @@ jobs: - modules: ${{ fromJson(needs.precondition.outputs.required).pyspark-pandas != 'true' && 'pyspark-pandas-connect-part3' }} env: MODULES_TO_TEST: ${{ matrix.modules }} - PYTHON_TO_TEST: 'python3.11' HADOOP_PROFILE: ${{ inputs.hadoop }} HIVE_PROFILE: hive2.3 - # GitHub Actions' default miniconda to use in pip packaging test. 
- CONDA_PREFIX: /usr/share/miniconda GITHUB_PREV_SHA: ${{ github.event.before }} SPARK_LOCAL_IP: localhost SKIP_UNIDOC: true @@ -534,6 +589,7 @@ jobs: distribution: zulu java-version: ${{ matrix.java }} - name: List Python packages (${{ env.PYTHON_TO_TEST }}) + if: ${{ env.PYTHON_TO_TEST != '' }} env: ${{ fromJSON(inputs.envs) }} shell: 'script -q -e -c "bash {0}"' run: | @@ -542,12 +598,18 @@ jobs: echo $py $py -m pip list done + - name: Install Conda for pip packaging test + if: contains(matrix.modules, 'pyspark-errors') + uses: conda-incubator/setup-miniconda@v3 + with: + miniforge-version: latest # Run the tests. - name: Run tests env: ${{ fromJSON(inputs.envs) }} shell: 'script -q -e -c "bash {0}"' run: | if [[ "$MODULES_TO_TEST" == *"pyspark-errors"* ]]; then + export PATH=$CONDA/bin:$PATH export SKIP_PACKAGING=false echo "Python Packaging Tests Enabled!" fi @@ -559,11 +621,14 @@ jobs: fi - name: Upload coverage to Codecov if: fromJSON(inputs.envs).PYSPARK_CODECOV == 'true' - uses: codecov/codecov-action@v4 + uses: codecov/codecov-action@v5 + env: + CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }} with: files: ./python/coverage.xml flags: unittests name: PySpark + verbose: true - name: Upload test results to report env: ${{ fromJSON(inputs.envs) }} if: always() @@ -692,7 +757,7 @@ jobs: python-version: '3.11' - name: Install dependencies for Python CodeGen check run: | - python3.11 -m pip install 'black==23.9.1' 'protobuf==5.28.3' 'mypy==1.8.0' 'mypy-protobuf==3.3.0' + python3.11 -m pip install 'black==23.12.1' 'protobuf==5.29.1' 'mypy==1.8.0' 'mypy-protobuf==3.3.0' python3.11 -m pip list - name: Python CodeGen check for branch-3.5 if: inputs.branch == 'branch-3.5' @@ -1043,6 +1108,7 @@ jobs: spark.sql.autoBroadcastJoinThreshold=-1 spark.sql.join.forceApplyShuffledHashJoin=true - name: Run TPC-DS queries on collated data + if: inputs.branch != 'branch-3.5' run: | SPARK_TPCDS_DATA=`pwd`/tpcds-sf-1 build/sbt "sql/testOnly 
org.apache.spark.sql.TPCDSCollationQueryTestSuite" - name: Upload test results to report @@ -1174,6 +1240,7 @@ jobs: - name: Start Minikube uses: medyagh/setup-minikube@v0.0.18 with: + kubernetes-version: "1.32.0" # Github Action limit cpu:2, memory: 6947MB, limit to 2U6G for better resource statistic cpus: 2 memory: 6144m @@ -1192,7 +1259,7 @@ jobs: if [[ "${{ inputs.branch }}" == 'branch-3.5' ]]; then kubectl apply -f https://raw.githubusercontent.com/volcano-sh/volcano/v1.7.0/installer/volcano-development.yaml || true else - kubectl apply -f https://raw.githubusercontent.com/volcano-sh/volcano/v1.9.0/installer/volcano-development.yaml || true + kubectl apply -f https://raw.githubusercontent.com/volcano-sh/volcano/v1.10.0/installer/volcano-development.yaml || true fi eval $(minikube docker-env) build/sbt -Phadoop-3 -Psparkr -Pkubernetes -Pvolcano -Pkubernetes-integration-tests -Dspark.kubernetes.test.volcanoMaxConcurrencyJobNum=1 -Dtest.exclude.tags=local "kubernetes-integration-tests/test" diff --git a/.github/workflows/build_branch35.yml b/.github/workflows/build_branch35.yml index 2ec080d5722c1..4757ca3c574f5 100644 --- a/.github/workflows/build_branch35.yml +++ b/.github/workflows/build_branch35.yml @@ -22,6 +22,7 @@ name: "Build (branch-3.5, Scala 2.13, Hadoop 3, JDK 8)" on: schedule: - cron: '0 11 * * *' + workflow_dispatch: jobs: run-build: @@ -37,6 +38,7 @@ jobs: envs: >- { "SCALA_PROFILE": "scala2.13", + "PYSPARK_IMAGE_TO_TEST": "", "PYTHON_TO_TEST": "", "ORACLE_DOCKER_IMAGE_NAME": "gvenzl/oracle-xe:21.3.0" } diff --git a/.github/workflows/build_branch35_python.yml b/.github/workflows/build_branch35_python.yml index 1585534d33ba9..452a55f3bc2c1 100644 --- a/.github/workflows/build_branch35_python.yml +++ b/.github/workflows/build_branch35_python.yml @@ -22,6 +22,7 @@ name: "Build / Python-only (branch-3.5)" on: schedule: - cron: '0 11 * * *' + workflow_dispatch: jobs: run-build: @@ -36,6 +37,7 @@ jobs: hadoop: hadoop3 envs: >- { + 
"PYSPARK_IMAGE_TO_TEST": "", "PYTHON_TO_TEST": "" } jobs: >- diff --git a/.github/workflows/build_coverage.yml b/.github/workflows/build_coverage.yml index 64f65bd777a02..007d9ce99c847 100644 --- a/.github/workflows/build_coverage.yml +++ b/.github/workflows/build_coverage.yml @@ -22,6 +22,7 @@ name: "Build / Coverage (master, Scala 2.13, Hadoop 3, JDK 17)" on: schedule: - cron: '0 10 * * *' + workflow_dispatch: jobs: run-build: @@ -36,6 +37,7 @@ jobs: hadoop: hadoop3 envs: >- { + "PYSPARK_IMAGE_TO_TEST": "python-311", "PYTHON_TO_TEST": "python3.11", "PYSPARK_CODECOV": "true" } diff --git a/.github/workflows/build_infra_images_cache.yml b/.github/workflows/build_infra_images_cache.yml index a6beacedeebd4..ac139147beb91 100644 --- a/.github/workflows/build_infra_images_cache.yml +++ b/.github/workflows/build_infra_images_cache.yml @@ -30,9 +30,16 @@ on: - 'dev/spark-test-image/docs/Dockerfile' - 'dev/spark-test-image/lint/Dockerfile' - 'dev/spark-test-image/sparkr/Dockerfile' + - 'dev/spark-test-image/pypy-310/Dockerfile' + - 'dev/spark-test-image/python-309/Dockerfile' + - 'dev/spark-test-image/python-310/Dockerfile' + - 'dev/spark-test-image/python-311/Dockerfile' + - 'dev/spark-test-image/python-312/Dockerfile' + - 'dev/spark-test-image/python-313/Dockerfile' - '.github/workflows/build_infra_images_cache.yml' # Create infra image when cutting down branches/tags create: + workflow_dispatch: jobs: main: if: github.repository == 'apache/spark' @@ -102,3 +109,107 @@ jobs: - name: Image digest (SparkR) if: hashFiles('dev/spark-test-image/sparkr/Dockerfile') != '' run: echo ${{ steps.docker_build_sparkr.outputs.digest }} + - name: Build and push (PySpark with old dependencies) + if: hashFiles('dev/spark-test-image/python-minimum/Dockerfile') != '' + id: docker_build_pyspark_python_minimum + uses: docker/build-push-action@v6 + with: + context: ./dev/spark-test-image/python-minimum/ + push: true + tags: 
ghcr.io/apache/spark/apache-spark-github-action-image-pyspark-python-minimum-cache:${{ github.ref_name }}-static + cache-from: type=registry,ref=ghcr.io/apache/spark/apache-spark-github-action-image-pyspark-python-minimum-cache:${{ github.ref_name }} + cache-to: type=registry,ref=ghcr.io/apache/spark/apache-spark-github-action-image-pyspark-python-minimum-cache:${{ github.ref_name }},mode=max + - name: Image digest (PySpark with old dependencies) + if: hashFiles('dev/spark-test-image/python-minimum/Dockerfile') != '' + run: echo ${{ steps.docker_build_pyspark_python_minimum.outputs.digest }} + - name: Build and push (PySpark PS with old dependencies) + if: hashFiles('dev/spark-test-image/python-ps-minimum/Dockerfile') != '' + id: docker_build_pyspark_python_ps_minimum + uses: docker/build-push-action@v6 + with: + context: ./dev/spark-test-image/python-ps-minimum/ + push: true + tags: ghcr.io/apache/spark/apache-spark-github-action-image-pyspark-python-ps-minimum-cache:${{ github.ref_name }}-static + cache-from: type=registry,ref=ghcr.io/apache/spark/apache-spark-github-action-image-pyspark-python-ps-minimum-cache:${{ github.ref_name }} + cache-to: type=registry,ref=ghcr.io/apache/spark/apache-spark-github-action-image-pyspark-python-ps-minimum-cache:${{ github.ref_name }},mode=max + - name: Image digest (PySpark PS with old dependencies) + if: hashFiles('dev/spark-test-image/python-ps-minimum/Dockerfile') != '' + run: echo ${{ steps.docker_build_pyspark_python_ps_minimum.outputs.digest }} + - name: Build and push (PySpark with PyPy 3.10) + if: hashFiles('dev/spark-test-image/pypy-310/Dockerfile') != '' + id: docker_build_pyspark_pypy_310 + uses: docker/build-push-action@v6 + with: + context: ./dev/spark-test-image/pypy-310/ + push: true + tags: ghcr.io/apache/spark/apache-spark-github-action-image-pyspark-pypy-310-cache:${{ github.ref_name }}-static + cache-from: type=registry,ref=ghcr.io/apache/spark/apache-spark-github-action-image-pyspark-pypy-310-cache:${{ 
github.ref_name }} + cache-to: type=registry,ref=ghcr.io/apache/spark/apache-spark-github-action-image-pyspark-pypy-310-cache:${{ github.ref_name }},mode=max + - name: Image digest (PySpark with PyPy 3.10) + if: hashFiles('dev/spark-test-image/pypy-310/Dockerfile') != '' + run: echo ${{ steps.docker_build_pyspark_pypy_310.outputs.digest }} + - name: Build and push (PySpark with Python 3.9) + if: hashFiles('dev/spark-test-image/python-309/Dockerfile') != '' + id: docker_build_pyspark_python_309 + uses: docker/build-push-action@v6 + with: + context: ./dev/spark-test-image/python-309/ + push: true + tags: ghcr.io/apache/spark/apache-spark-github-action-image-pyspark-python-309-cache:${{ github.ref_name }}-static + cache-from: type=registry,ref=ghcr.io/apache/spark/apache-spark-github-action-image-pyspark-python-309-cache:${{ github.ref_name }} + cache-to: type=registry,ref=ghcr.io/apache/spark/apache-spark-github-action-image-pyspark-python-309-cache:${{ github.ref_name }},mode=max + - name: Image digest (PySpark with Python 3.9) + if: hashFiles('dev/spark-test-image/python-309/Dockerfile') != '' + run: echo ${{ steps.docker_build_pyspark_python_309.outputs.digest }} + - name: Build and push (PySpark with Python 3.10) + if: hashFiles('dev/spark-test-image/python-310/Dockerfile') != '' + id: docker_build_pyspark_python_310 + uses: docker/build-push-action@v6 + with: + context: ./dev/spark-test-image/python-310/ + push: true + tags: ghcr.io/apache/spark/apache-spark-github-action-image-pyspark-python-310-cache:${{ github.ref_name }}-static + cache-from: type=registry,ref=ghcr.io/apache/spark/apache-spark-github-action-image-pyspark-python-310-cache:${{ github.ref_name }} + cache-to: type=registry,ref=ghcr.io/apache/spark/apache-spark-github-action-image-pyspark-python-310-cache:${{ github.ref_name }},mode=max + - name: Image digest (PySpark with Python 3.10) + if: hashFiles('dev/spark-test-image/python-310/Dockerfile') != '' + run: echo ${{ 
steps.docker_build_pyspark_python_310.outputs.digest }} + - name: Build and push (PySpark with Python 3.11) + if: hashFiles('dev/spark-test-image/python-311/Dockerfile') != '' + id: docker_build_pyspark_python_311 + uses: docker/build-push-action@v6 + with: + context: ./dev/spark-test-image/python-311/ + push: true + tags: ghcr.io/apache/spark/apache-spark-github-action-image-pyspark-python-311-cache:${{ github.ref_name }}-static + cache-from: type=registry,ref=ghcr.io/apache/spark/apache-spark-github-action-image-pyspark-python-311-cache:${{ github.ref_name }} + cache-to: type=registry,ref=ghcr.io/apache/spark/apache-spark-github-action-image-pyspark-python-311-cache:${{ github.ref_name }},mode=max + - name: Image digest (PySpark with Python 3.11) + if: hashFiles('dev/spark-test-image/python-311/Dockerfile') != '' + run: echo ${{ steps.docker_build_pyspark_python_311.outputs.digest }} + - name: Build and push (PySpark with Python 3.12) + if: hashFiles('dev/spark-test-image/python-312/Dockerfile') != '' + id: docker_build_pyspark_python_312 + uses: docker/build-push-action@v6 + with: + context: ./dev/spark-test-image/python-312/ + push: true + tags: ghcr.io/apache/spark/apache-spark-github-action-image-pyspark-python-312-cache:${{ github.ref_name }}-static + cache-from: type=registry,ref=ghcr.io/apache/spark/apache-spark-github-action-image-pyspark-python-312-cache:${{ github.ref_name }} + cache-to: type=registry,ref=ghcr.io/apache/spark/apache-spark-github-action-image-pyspark-python-312-cache:${{ github.ref_name }},mode=max + - name: Image digest (PySpark with Python 3.12) + if: hashFiles('dev/spark-test-image/python-312/Dockerfile') != '' + run: echo ${{ steps.docker_build_pyspark_python_312.outputs.digest }} + - name: Build and push (PySpark with Python 3.13) + if: hashFiles('dev/spark-test-image/python-313/Dockerfile') != '' + id: docker_build_pyspark_python_313 + uses: docker/build-push-action@v6 + with: + context: ./dev/spark-test-image/python-313/ + push: 
true + tags: ghcr.io/apache/spark/apache-spark-github-action-image-pyspark-python-313-cache:${{ github.ref_name }}-static + cache-from: type=registry,ref=ghcr.io/apache/spark/apache-spark-github-action-image-pyspark-python-313-cache:${{ github.ref_name }} + cache-to: type=registry,ref=ghcr.io/apache/spark/apache-spark-github-action-image-pyspark-python-313-cache:${{ github.ref_name }},mode=max + - name: Image digest (PySpark with Python 3.13) + if: hashFiles('dev/spark-test-image/python-313/Dockerfile') != '' + run: echo ${{ steps.docker_build_pyspark_python_313.outputs.digest }} diff --git a/.github/workflows/build_java21.yml b/.github/workflows/build_java21.yml index 871e1a9c07ef0..51ece691f9284 100644 --- a/.github/workflows/build_java21.yml +++ b/.github/workflows/build_java21.yml @@ -22,6 +22,7 @@ name: "Build (master, Scala 2.13, Hadoop 3, JDK 21)" on: schedule: - cron: '0 4 * * *' + workflow_dispatch: jobs: run-build: @@ -36,6 +37,8 @@ jobs: hadoop: hadoop3 envs: >- { + "PYSPARK_IMAGE_TO_TEST": "python-311", + "PYTHON_TO_TEST": "python3.11", "SKIP_MIMA": "true", "SKIP_UNIDOC": "true", "DEDICATED_JVM_SBT_TESTS": "org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormatV1Suite,org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormatV2Suite,org.apache.spark.sql.execution.datasources.orc.OrcSourceV1Suite,org.apache.spark.sql.execution.datasources.orc.OrcSourceV2Suite" diff --git a/.github/workflows/build_maven.yml b/.github/workflows/build_maven.yml index b5546c61eb11b..e047390add6f9 100644 --- a/.github/workflows/build_maven.yml +++ b/.github/workflows/build_maven.yml @@ -22,6 +22,7 @@ name: "Build / Maven (master, Scala 2.13, Hadoop 3, JDK 17)" on: schedule: - cron: '0 13 * * *' + workflow_dispatch: jobs: run-build: diff --git a/.github/workflows/build_maven_java21.yml b/.github/workflows/build_maven_java21.yml index 127904145464b..9fbc7b84383f0 100644 --- a/.github/workflows/build_maven_java21.yml +++ 
b/.github/workflows/build_maven_java21.yml @@ -22,6 +22,7 @@ name: "Build / Maven (master, Scala 2.13, Hadoop 3, JDK 21)" on: schedule: - cron: '0 14 * * *' + workflow_dispatch: jobs: run-build: diff --git a/.github/workflows/build_maven_java21_macos15.yml b/.github/workflows/build_maven_java21_macos15.yml index cc6d0ea4e90da..377a67191ab49 100644 --- a/.github/workflows/build_maven_java21_macos15.yml +++ b/.github/workflows/build_maven_java21_macos15.yml @@ -22,6 +22,7 @@ name: "Build / Maven (master, Scala 2.13, Hadoop 3, JDK 21, MacOS-15)" on: schedule: - cron: '0 20 */2 * *' + workflow_dispatch: jobs: run-build: diff --git a/.github/workflows/build_non_ansi.yml b/.github/workflows/build_non_ansi.yml index 4ac2a589f4f81..31654476ea3f8 100644 --- a/.github/workflows/build_non_ansi.yml +++ b/.github/workflows/build_non_ansi.yml @@ -22,6 +22,7 @@ name: "Build / Non-ANSI (master, Hadoop 3, JDK 17, Scala 2.13)" on: schedule: - cron: '0 1 * * *' + workflow_dispatch: jobs: run-build: @@ -36,6 +37,8 @@ jobs: hadoop: hadoop3 envs: >- { + "PYSPARK_IMAGE_TO_TEST": "python-311", + "PYTHON_TO_TEST": "python3.11", "SPARK_ANSI_SQL_MODE": "false", } jobs: >- diff --git a/.github/workflows/build_python_3.10.yml b/.github/workflows/build_python_3.10.yml index 5ae37fbc9120e..9b0c90c5c7747 100644 --- a/.github/workflows/build_python_3.10.yml +++ b/.github/workflows/build_python_3.10.yml @@ -22,6 +22,7 @@ name: "Build / Python-only (master, Python 3.10)" on: schedule: - cron: '0 17 * * *' + workflow_dispatch: jobs: run-build: @@ -36,6 +37,7 @@ jobs: hadoop: hadoop3 envs: >- { + "PYSPARK_IMAGE_TO_TEST": "python-310", "PYTHON_TO_TEST": "python3.10" } jobs: >- diff --git a/.github/workflows/build_python_3.11_macos.yml b/.github/workflows/build_python_3.11_macos.yml index 4caae55b5fea8..57902e4871ffa 100644 --- a/.github/workflows/build_python_3.11_macos.yml +++ b/.github/workflows/build_python_3.11_macos.yml @@ -22,6 +22,7 @@ name: "Build / Python-only (master, Python 3.11, MacOS)" on: 
schedule: - cron: '0 21 * * *' + workflow_dispatch: jobs: run-build: diff --git a/.github/workflows/build_python_3.12.yml b/.github/workflows/build_python_3.12.yml index e1fd45a7d8838..e0c04700554ca 100644 --- a/.github/workflows/build_python_3.12.yml +++ b/.github/workflows/build_python_3.12.yml @@ -22,6 +22,7 @@ name: "Build / Python-only (master, Python 3.12)" on: schedule: - cron: '0 19 * * *' + workflow_dispatch: jobs: run-build: @@ -36,6 +37,7 @@ jobs: hadoop: hadoop3 envs: >- { + "PYSPARK_IMAGE_TO_TEST": "python-312", "PYTHON_TO_TEST": "python3.12" } jobs: >- diff --git a/.github/workflows/build_python_3.13.yml b/.github/workflows/build_python_3.13.yml index 6f67cf383584f..e85b1577f323f 100644 --- a/.github/workflows/build_python_3.13.yml +++ b/.github/workflows/build_python_3.13.yml @@ -22,6 +22,7 @@ name: "Build / Python-only (master, Python 3.13)" on: schedule: - cron: '0 20 * * *' + workflow_dispatch: jobs: run-build: @@ -36,6 +37,7 @@ jobs: hadoop: hadoop3 envs: >- { + "PYSPARK_IMAGE_TO_TEST": "python-313", "PYTHON_TO_TEST": "python3.13" } jobs: >- diff --git a/.github/workflows/build_python_3.9.yml b/.github/workflows/build_python_3.9.yml index b2401fcf2aa14..0df17699140ed 100644 --- a/.github/workflows/build_python_3.9.yml +++ b/.github/workflows/build_python_3.9.yml @@ -22,6 +22,7 @@ name: "Build / Python-only (master, Python 3.9)" on: schedule: - cron: '0 21 * * *' + workflow_dispatch: jobs: run-build: @@ -36,6 +37,7 @@ jobs: hadoop: hadoop3 envs: >- { + "PYSPARK_IMAGE_TO_TEST": "python-309", "PYTHON_TO_TEST": "python3.9" } jobs: >- diff --git a/.github/workflows/build_python_connect.yml b/.github/workflows/build_python_connect.yml index d57a0c2b91623..311907558f6e2 100644 --- a/.github/workflows/build_python_connect.yml +++ b/.github/workflows/build_python_connect.yml @@ -22,6 +22,7 @@ name: Build / Spark Connect Python-only (master, Python 3.11) on: schedule: - cron: '0 19 * * *' + workflow_dispatch: jobs: # Build: build Spark and run the tests 
for specified modules using SBT @@ -82,7 +83,7 @@ jobs: sed -i 's/rootLogger.level = info/rootLogger.level = warn/g' conf/log4j2.properties # Start a Spark Connect server for local - PYTHONPATH="python/lib/pyspark.zip:python/lib/py4j-0.10.9.7-src.zip:$PYTHONPATH" ./sbin/start-connect-server.sh \ + PYTHONPATH="python/lib/pyspark.zip:python/lib/py4j-0.10.9.9-src.zip:$PYTHONPATH" ./sbin/start-connect-server.sh \ --driver-java-options "-Dlog4j.configurationFile=file:$GITHUB_WORKSPACE/conf/log4j2.properties" \ --jars "`find connector/protobuf/target -name spark-protobuf-*SNAPSHOT.jar`,`find connector/avro/target -name spark-avro*SNAPSHOT.jar`" @@ -93,7 +94,7 @@ jobs: # Several tests related to catalog requires to run them sequencially, e.g., writing a table in a listener. ./python/run-tests --parallelism=1 --python-executables=python3 --modules pyspark-connect,pyspark-ml-connect # None of tests are dependent on each other in Pandas API on Spark so run them in parallel - ./python/run-tests --parallelism=2 --python-executables=python3 --modules pyspark-pandas-connect-part0,pyspark-pandas-connect-part1,pyspark-pandas-connect-part2,pyspark-pandas-connect-part3 + ./python/run-tests --parallelism=1 --python-executables=python3 --modules pyspark-pandas-connect-part0,pyspark-pandas-connect-part1,pyspark-pandas-connect-part2,pyspark-pandas-connect-part3 # Stop Spark Connect server. 
./sbin/stop-connect-server.sh @@ -101,7 +102,7 @@ jobs: mv pyspark.back python/pyspark # Start a Spark Connect server for local-cluster - PYTHONPATH="python/lib/pyspark.zip:python/lib/py4j-0.10.9.7-src.zip:$PYTHONPATH" ./sbin/start-connect-server.sh \ + PYTHONPATH="python/lib/pyspark.zip:python/lib/py4j-0.10.9.9-src.zip:$PYTHONPATH" ./sbin/start-connect-server.sh \ --master "local-cluster[2, 4, 1024]" \ --driver-java-options "-Dlog4j.configurationFile=file:$GITHUB_WORKSPACE/conf/log4j2.properties" \ --jars "`find connector/protobuf/target -name spark-protobuf-*SNAPSHOT.jar`,`find connector/avro/target -name spark-avro*SNAPSHOT.jar`" diff --git a/.github/workflows/build_python_connect35.yml b/.github/workflows/build_python_connect35.yml index 4b7a6b82b9527..ba77f2dff75a9 100644 --- a/.github/workflows/build_python_connect35.yml +++ b/.github/workflows/build_python_connect35.yml @@ -22,6 +22,7 @@ name: Build / Spark Connect Python-only (master-server, 35-client, Python 3.11) on: schedule: - cron: '0 21 * * *' + workflow_dispatch: jobs: # Build: build Spark and run the tests for specified modules using SBT @@ -70,7 +71,7 @@ jobs: pip install 'numpy==1.25.1' 'pyarrow==12.0.1' 'pandas<=2.0.3' scipy unittest-xml-reporting plotly>=4.8 'mlflow>=2.3.1' coverage 'matplotlib==3.7.2' openpyxl 'memory-profiler==0.60.0' 'scikit-learn==1.1.*' # Add Python deps for Spark Connect. 
- pip install 'grpcio==1.67.0' 'grpcio-status==1.67.0' 'protobuf==5.28.3' 'googleapis-common-protos==1.65.0' 'graphviz==0.20.3' + pip install 'grpcio==1.67.0' 'grpcio-status==1.67.0' 'protobuf==5.29.1' 'googleapis-common-protos==1.65.0' 'graphviz==0.20.3' # Add torch as a testing dependency for TorchDistributor pip install 'torch==2.0.1' 'torchvision==0.15.2' torcheval @@ -85,7 +86,7 @@ jobs: sed -i 's/rootLogger.level = info/rootLogger.level = warn/g' conf/log4j2.properties # Start a Spark Connect server for local - PYTHONPATH="python/lib/pyspark.zip:python/lib/py4j-0.10.9.7-src.zip:$PYTHONPATH" ./sbin/start-connect-server.sh \ + PYTHONPATH="python/lib/pyspark.zip:python/lib/py4j-0.10.9.9-src.zip:$PYTHONPATH" ./sbin/start-connect-server.sh \ --driver-java-options "-Dlog4j.configurationFile=file:$GITHUB_WORKSPACE/conf/log4j2.properties" \ --jars "`find connector/protobuf/target -name spark-protobuf-*SNAPSHOT.jar`,`find connector/avro/target -name spark-avro*SNAPSHOT.jar`" @@ -98,7 +99,7 @@ jobs: # Run branch-3.5 tests ./python/run-tests --parallelism=1 --python-executables=python3 --modules pyspark-connect # None of tests are dependent on each other in Pandas API on Spark so run them in parallel - ./python/run-tests --parallelism=2 --python-executables=python3 --modules pyspark-pandas-connect,pyspark-pandas-slow-connect + ./python/run-tests --parallelism=1 --python-executables=python3 --modules pyspark-pandas-connect,pyspark-pandas-slow-connect - name: Upload test results to report if: always() uses: actions/upload-artifact@v4 diff --git a/.github/workflows/build_python_minimum.yml b/.github/workflows/build_python_minimum.yml new file mode 100644 index 0000000000000..0efd2ad8265f7 --- /dev/null +++ b/.github/workflows/build_python_minimum.yml @@ -0,0 +1,46 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. 
See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# + +name: "Build / Python-only (master, Python with old dependencies)" + +on: + schedule: + - cron: '0 9 * * *' + workflow_dispatch: + +jobs: + run-build: + permissions: + packages: write + name: Run + uses: ./.github/workflows/build_and_test.yml + if: github.repository == 'apache/spark' + with: + java: 17 + branch: master + hadoop: hadoop3 + envs: >- + { + "PYSPARK_IMAGE_TO_TEST": "python-minimum", + "PYTHON_TO_TEST": "python3.9" + } + jobs: >- + { + "pyspark": "true" + } diff --git a/.github/workflows/build_python_ps_minimum.yml b/.github/workflows/build_python_ps_minimum.yml new file mode 100644 index 0000000000000..742d578e27418 --- /dev/null +++ b/.github/workflows/build_python_ps_minimum.yml @@ -0,0 +1,47 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# + +name: "Build / Python-only (master, Python PS with old dependencies)" + +on: + schedule: + - cron: '0 10 * * *' + workflow_dispatch: + +jobs: + run-build: + permissions: + packages: write + name: Run + uses: ./.github/workflows/build_and_test.yml + if: github.repository == 'apache/spark' + with: + java: 17 + branch: master + hadoop: hadoop3 + envs: >- + { + "PYSPARK_IMAGE_TO_TEST": "python-ps-minimum", + "PYTHON_TO_TEST": "python3.9" + } + jobs: >- + { + "pyspark": "true", + "pyspark-pandas": "true" + } diff --git a/.github/workflows/build_python_pypy3.10.yml b/.github/workflows/build_python_pypy3.10.yml index 163af2f4aec8b..0bd2ef03ce77c 100644 --- a/.github/workflows/build_python_pypy3.10.yml +++ b/.github/workflows/build_python_pypy3.10.yml @@ -22,6 +22,7 @@ name: "Build / Python-only (master, PyPy 3.10)" on: schedule: - cron: '0 15 * * *' + workflow_dispatch: jobs: run-build: @@ -36,6 +37,7 @@ jobs: hadoop: hadoop3 envs: >- { + "PYSPARK_IMAGE_TO_TEST": "pypy-310", "PYTHON_TO_TEST": "pypy3" } jobs: >- diff --git a/.github/workflows/build_rockdb_as_ui_backend.yml b/.github/workflows/build_rockdb_as_ui_backend.yml index 96009c41dbbf9..1d9a079e72643 100644 --- a/.github/workflows/build_rockdb_as_ui_backend.yml +++ b/.github/workflows/build_rockdb_as_ui_backend.yml @@ -22,6 +22,7 @@ name: "Build / RocksDB as UI Backend (master, Hadoop 3, JDK 17, Scala 2.13)" on: schedule: - cron: '0 6 * * *' + workflow_dispatch: jobs: run-build: @@ -36,6 +37,8 @@ jobs: hadoop: hadoop3 envs: >- { + "PYSPARK_IMAGE_TO_TEST": "python-311", + "PYTHON_TO_TEST": "python3.11", 
"LIVE_UI_LOCAL_STORE_DIR": "/tmp/kvStore", } jobs: >- diff --git a/.github/workflows/build_sparkr_window.yml b/.github/workflows/build_sparkr_window.yml index b97251a461715..b28e81908549f 100644 --- a/.github/workflows/build_sparkr_window.yml +++ b/.github/workflows/build_sparkr_window.yml @@ -21,6 +21,7 @@ name: "Build / SparkR-only (master, 4.4.2, windows-2022)" on: schedule: - cron: '0 17 * * *' + workflow_dispatch: jobs: build: diff --git a/.github/workflows/maven_test.yml b/.github/workflows/maven_test.yml index 6965fb4968af3..206806a7a0ed7 100644 --- a/.github/workflows/maven_test.yml +++ b/.github/workflows/maven_test.yml @@ -178,7 +178,7 @@ jobs: - name: Install Python packages (Python 3.11) if: (contains(matrix.modules, 'sql#core')) || contains(matrix.modules, 'connect') run: | - python3.11 -m pip install 'numpy>=1.20.0' pyarrow pandas scipy unittest-xml-reporting 'grpcio==1.67.0' 'grpcio-status==1.67.0' 'protobuf==5.28.3' + python3.11 -m pip install 'numpy>=1.20.0' pyarrow pandas scipy unittest-xml-reporting 'grpcio==1.67.0' 'grpcio-status==1.67.0' 'protobuf==5.29.1' python3.11 -m pip list # Run the tests. 
- name: Run tests diff --git a/.github/workflows/pages.yml b/.github/workflows/pages.yml index 8729012c2b8d2..4bcc275064d3c 100644 --- a/.github/workflows/pages.yml +++ b/.github/workflows/pages.yml @@ -62,8 +62,8 @@ jobs: run: | pip install 'sphinx==4.5.0' mkdocs 'pydata_sphinx_theme>=0.13' sphinx-copybutton nbsphinx numpydoc jinja2 markupsafe 'pyzmq<24.0.0' \ ipython ipython_genutils sphinx_plotly_directive 'numpy>=1.20.0' pyarrow 'pandas==2.2.3' 'plotly>=4.8' 'docutils<0.18.0' \ - 'flake8==3.9.0' 'mypy==1.8.0' 'pytest==7.1.3' 'pytest-mypy-plugins==1.9.3' 'black==23.9.1' \ - 'pandas-stubs==1.2.0.53' 'grpcio==1.62.0' 'grpcio-status==1.62.0' 'grpc-stubs==1.24.11' 'googleapis-common-protos-stubs==2.2.0' \ + 'flake8==3.9.0' 'mypy==1.8.0' 'pytest==7.1.3' 'pytest-mypy-plugins==1.9.3' 'black==23.12.1' \ + 'pandas-stubs==1.2.0.53' 'grpcio==1.67.0' 'grpcio-status==1.67.0' 'protobuf==5.29.1' 'grpc-stubs==1.24.11' 'googleapis-common-protos-stubs==2.2.0' \ 'sphinxcontrib-applehelp==1.0.4' 'sphinxcontrib-devhelp==1.0.2' 'sphinxcontrib-htmlhelp==2.0.1' 'sphinxcontrib-qthelp==1.0.3' 'sphinxcontrib-serializinghtml==1.1.5' - name: Install Ruby for documentation generation uses: ruby/setup-ruby@v1 diff --git a/.github/workflows/python_macos_test.yml b/.github/workflows/python_macos_test.yml index cca133dab541a..231816750236b 100644 --- a/.github/workflows/python_macos_test.yml +++ b/.github/workflows/python_macos_test.yml @@ -134,7 +134,7 @@ jobs: python${{matrix.python}} -m pip install --ignore-installed 'blinker>=1.6.2' python${{matrix.python}} -m pip install --ignore-installed 'six==1.16.0' python${{matrix.python}} -m pip install numpy 'pyarrow>=15.0.0' 'six==1.16.0' 'pandas==2.2.3' scipy 'plotly>=4.8' 'mlflow>=2.8.1' coverage matplotlib openpyxl 'memory-profiler>=0.61.0' 'scikit-learn>=1.3.2' unittest-xml-reporting && \ - python${{matrix.python}} -m pip install 'grpcio==1.67.0' 'grpcio-status==1.67.0' 'protobuf==5.28.3' 'googleapis-common-protos==1.65.0' 'graphviz==0.20.3' && 
\ + python${{matrix.python}} -m pip install 'grpcio==1.67.0' 'grpcio-status==1.67.0' 'protobuf==5.29.1' 'googleapis-common-protos==1.65.0' 'graphviz==0.20.3' && \ python${{matrix.python}} -m pip cache purge && \ python${{matrix.python}} -m pip list # Run the tests. diff --git a/.github/workflows/update_build_status.yml b/.github/workflows/update_build_status.yml index d0a50b2b4aa74..542fa567dea69 100644 --- a/.github/workflows/update_build_status.yml +++ b/.github/workflows/update_build_status.yml @@ -72,7 +72,7 @@ jobs: } catch (error) { console.error(error) // Run not found. This can happen when the PR author removes GitHub Actions runs or - // disalbes GitHub Actions. + // disables GitHub Actions. continue } diff --git a/LICENSE-binary b/LICENSE-binary index 40d28fbe71e6b..5cf099cb4d3c4 100644 --- a/LICENSE-binary +++ b/LICENSE-binary @@ -286,6 +286,10 @@ io.netty:netty-transport-classes-kqueue io.netty:netty-transport-native-epoll io.netty:netty-transport-native-kqueue io.netty:netty-transport-native-unix-common +io.vertx:vertx-auth-common +io.vertx:vertx-core +io.vertx:vertx-web-client +io.vertx:vertx-web-common jakarta.inject:jakarta.inject-api jakarta.validation:jakarta.validation-api javax.jdo:jdo-api diff --git a/R/pkg/R/functions.R b/R/pkg/R/functions.R index 9c825a99be180..e320981783ecc 100644 --- a/R/pkg/R/functions.R +++ b/R/pkg/R/functions.R @@ -2922,7 +2922,7 @@ setClassUnion("characterOrstructTypeOrColumn", c("character", "structType", "Col #' @details #' \code{from_json}: Parses a column containing a JSON string into a Column of \code{structType} #' with the specified \code{schema} or array of \code{structType} if \code{as.json.array} is set -#' to \code{TRUE}. If the string is unparseable, the Column will contain the value NA. +#' to \code{TRUE}. If the string is unparsable, the Column will contain the value NA. #' #' @rdname column_collection_functions #' @param as.json.array indicating if input string is JSON array of objects or a single object. 
@@ -3004,7 +3004,7 @@ setMethod("schema_of_json", signature(x = "characterOrColumn"), #' @details #' \code{from_csv}: Parses a column containing a CSV string into a Column of \code{structType} #' with the specified \code{schema}. -#' If the string is unparseable, the Column will contain the value NA. +#' If the string is unparsable, the Column will contain the value NA. #' #' @rdname column_collection_functions #' @aliases from_csv from_csv,Column,characterOrstructTypeOrColumn-method diff --git a/R/pkg/R/serialize.R b/R/pkg/R/serialize.R index 61e174de9ac56..4ccec991bb07b 100644 --- a/R/pkg/R/serialize.R +++ b/R/pkg/R/serialize.R @@ -60,7 +60,7 @@ writeObject <- function(con, object, writeType = TRUE) { if (type %in% c("integer", "character", "logical", "double", "numeric")) { if (is.na(object[[1]])) { # Uses the first element for now to keep the behavior same as R before - # 4.2.0. This is wrong because we should differenciate c(NA) from a + # 4.2.0. This is wrong because we should differentiate c(NA) from a # single NA as the former means array(null) and the latter means null # in Spark SQL. However, it requires non-trivial comparison to distinguish # both in R. We should ideally fix this. 
diff --git a/bin/pyspark b/bin/pyspark index 2f08f78369159..650d913eea028 100755 --- a/bin/pyspark +++ b/bin/pyspark @@ -77,7 +77,7 @@ fi # Add the PySpark classes to the Python path: export PYTHONPATH="${SPARK_HOME}/python/:$PYTHONPATH" -export PYTHONPATH="${SPARK_HOME}/python/lib/py4j-0.10.9.7-src.zip:$PYTHONPATH" +export PYTHONPATH="${SPARK_HOME}/python/lib/py4j-0.10.9.9-src.zip:$PYTHONPATH" # Load the PySpark shell.py script when ./pyspark is used interactively: export OLD_PYTHONSTARTUP="$PYTHONSTARTUP" diff --git a/bin/pyspark2.cmd b/bin/pyspark2.cmd index 232813b4ffdd6..9f55d772a25cf 100644 --- a/bin/pyspark2.cmd +++ b/bin/pyspark2.cmd @@ -30,7 +30,7 @@ if "x%PYSPARK_DRIVER_PYTHON%"=="x" ( ) set PYTHONPATH=%SPARK_HOME%\python;%PYTHONPATH% -set PYTHONPATH=%SPARK_HOME%\python\lib\py4j-0.10.9.7-src.zip;%PYTHONPATH% +set PYTHONPATH=%SPARK_HOME%\python\lib\py4j-0.10.9.9-src.zip;%PYTHONPATH% set OLD_PYTHONSTARTUP=%PYTHONSTARTUP% set PYTHONSTARTUP=%SPARK_HOME%\python\pyspark\shell.py diff --git a/binder/Dockerfile b/binder/Dockerfile index 6e3dd9155fb7a..2d5c30a9a92e0 100644 --- a/binder/Dockerfile +++ b/binder/Dockerfile @@ -22,8 +22,8 @@ RUN pip install --no-cache notebook jupyterlab # create user with a home directory ARG NB_USER ARG NB_UID -ENV USER ${NB_USER} -ENV HOME /home/${NB_USER} +ENV USER=${NB_USER} +ENV HOME=/home/${NB_USER} RUN adduser --disabled-password \ --gecos "Default user" \ diff --git a/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/RemoteBlockPushResolver.java b/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/RemoteBlockPushResolver.java index 02a38eac5b409..6e9bd548f5327 100644 --- a/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/RemoteBlockPushResolver.java +++ b/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/RemoteBlockPushResolver.java @@ -251,17 +251,17 @@ AppShufflePartitionInfo getOrCreateAppShufflePartitionInfo( // Higher shuffleMergeId seen for 
the shuffle ID meaning new stage attempt is being // run for the shuffle ID. Close and clean up old shuffleMergeId files, // happens in the indeterminate stage retries - AppAttemptShuffleMergeId currrentAppAttemptShuffleMergeId = + AppAttemptShuffleMergeId currentAppAttemptShuffleMergeId = new AppAttemptShuffleMergeId(appShuffleInfo.appId, appShuffleInfo.attemptId, shuffleId, latestShuffleMergeId); logger.info("{}: creating a new shuffle merge metadata since received " + "shuffleMergeId {} is higher than latest shuffleMergeId {}", MDC.of(LogKeys.APP_ATTEMPT_SHUFFLE_MERGE_ID$.MODULE$, - currrentAppAttemptShuffleMergeId), + currentAppAttemptShuffleMergeId), MDC.of(LogKeys.SHUFFLE_MERGE_ID$.MODULE$, shuffleMergeId), MDC.of(LogKeys.LATEST_SHUFFLE_MERGE_ID$.MODULE$, latestShuffleMergeId)); submitCleanupTask(() -> - closeAndDeleteOutdatedPartitions(currrentAppAttemptShuffleMergeId, + closeAndDeleteOutdatedPartitions(currentAppAttemptShuffleMergeId, mergePartitionsInfo.shuffleMergePartitions)); return new AppShuffleMergePartitionsInfo(shuffleMergeId, false); } else { diff --git a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationFactory.java b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationFactory.java index 4064f830e92d8..81448dc95a374 100644 --- a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationFactory.java +++ b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationFactory.java @@ -415,18 +415,6 @@ private static Collation fetchCollation(int collationId) { } } - /** - * Method for constructing errors thrown on providing invalid collation name. 
- */ - protected static SparkException collationInvalidNameException(String collationName) { - Map params = new HashMap<>(); - final int maxSuggestions = 3; - params.put("collationName", collationName); - params.put("proposals", getClosestSuggestionsOnInvalidName(collationName, maxSuggestions)); - return new SparkException("COLLATION_INVALID_NAME", - SparkException.constructMessageParams(params), null); - } - private static int collationNameToId(String collationName) throws SparkException { // Collation names provided by user are treated as case-insensitive. String collationNameUpper = collationName.toUpperCase(); @@ -479,9 +467,6 @@ private enum CaseSensitivity { */ private static final int CASE_SENSITIVITY_MASK = 0b1; - private static final String UTF8_BINARY_COLLATION_NAME = "UTF8_BINARY"; - private static final String UTF8_LCASE_COLLATION_NAME = "UTF8_LCASE"; - private static final int UTF8_BINARY_COLLATION_ID = new CollationSpecUTF8(CaseSensitivity.UNSPECIFIED, SpaceTrimming.NONE).collationId; private static final int UTF8_LCASE_COLLATION_ID = @@ -667,9 +652,9 @@ protected CollationMeta buildCollationMeta() { protected String normalizedCollationName() { StringBuilder builder = new StringBuilder(); if(caseSensitivity == CaseSensitivity.UNSPECIFIED){ - builder.append(UTF8_BINARY_COLLATION_NAME); + builder.append(CollationNames.UTF8_BINARY); } else{ - builder.append(UTF8_LCASE_COLLATION_NAME); + builder.append(CollationNames.UTF8_LCASE); } if (spaceTrimming != SpaceTrimming.NONE) { builder.append('_'); @@ -681,12 +666,12 @@ protected String normalizedCollationName() { static List listCollations() { CollationIdentifier UTF8_BINARY_COLLATION_IDENT = new CollationIdentifier( PROVIDER_SPARK, - UTF8_BINARY_COLLATION_NAME, + CollationNames.UTF8_BINARY, CollationSpecICU.ICU_VERSION ); CollationIdentifier UTF8_LCASE_COLLATION_IDENT = new CollationIdentifier( PROVIDER_SPARK, - UTF8_LCASE_COLLATION_NAME, + CollationNames.UTF8_LCASE, CollationSpecICU.ICU_VERSION ); return 
Arrays.asList(UTF8_BINARY_COLLATION_IDENT, UTF8_LCASE_COLLATION_IDENT); @@ -770,7 +755,7 @@ private enum AccentSensitivity { VersionInfo.ICU_VERSION.getMinor()); static { - ICULocaleMap.put("UNICODE", ULocale.ROOT); + ICULocaleMap.put(CollationNames.UNICODE, ULocale.ROOT); // ICU-implemented `ULocale`s which have corresponding `Collator` installed. ULocale[] locales = Collator.getAvailableULocales(); // Build locale names in format: language["_" optional script]["_" optional country code]. @@ -818,13 +803,13 @@ private enum AccentSensitivity { } private static final int UNICODE_COLLATION_ID = new CollationSpecICU( - "UNICODE", + CollationNames.UNICODE, CaseSensitivity.CS, AccentSensitivity.AS, SpaceTrimming.NONE).collationId; private static final int UNICODE_CI_COLLATION_ID = new CollationSpecICU( - "UNICODE", + CollationNames.UNICODE, CaseSensitivity.CI, AccentSensitivity.AS, SpaceTrimming.NONE).collationId; @@ -1185,6 +1170,52 @@ public static int collationNameToId(String collationName) throws SparkException return Collation.CollationSpec.collationNameToId(collationName); } + /** + * Returns the resolved fully qualified collation name. + */ + public static String resolveFullyQualifiedName(String[] collationName) throws SparkException { + // If collation name has only one part, then we don't need to do any name resolution. + if (collationName.length == 1) return collationName[0]; + else { + // Currently we only support builtin collation names with fixed catalog `SYSTEM` and + // schema `BUILTIN`. + if (collationName.length != 3 || + !CollationFactory.CATALOG.equalsIgnoreCase(collationName[0]) || + !CollationFactory.SCHEMA.equalsIgnoreCase(collationName[1])) { + // Throw exception with original (before case conversion) collation name. + throw CollationFactory.collationInvalidNameException( + collationName.length != 0 ? 
collationName[collationName.length - 1] : ""); + } + return collationName[2]; + } + } + + /** + * Method for constructing errors thrown on providing invalid collation name. + */ + public static SparkException collationInvalidNameException(String collationName) { + Map params = new HashMap<>(); + final int maxSuggestions = 3; + params.put("collationName", collationName); + params.put("proposals", getClosestSuggestionsOnInvalidName(collationName, maxSuggestions)); + return new SparkException("COLLATION_INVALID_NAME", + SparkException.constructMessageParams(params), null); + } + + + + /** + * Returns the fully qualified collation name for the given collation ID. + */ + public static String fullyQualifiedName(int collationId) { + Collation.CollationSpec.DefinitionOrigin definitionOrigin = + Collation.CollationSpec.getDefinitionOrigin(collationId); + // Currently only predefined collations are supported. + assert definitionOrigin == Collation.CollationSpec.DefinitionOrigin.PREDEFINED; + return String.format("%s.%s.%s", CATALOG, SCHEMA, + Collation.CollationSpec.fetchCollation(collationId).collationName); + } + public static boolean isCaseInsensitive(int collationId) { return Collation.CollationSpecICU.fromCollationId(collationId).caseSensitivity == Collation.CollationSpecICU.CaseSensitivity.CI; diff --git a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationNames.java b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationNames.java new file mode 100644 index 0000000000000..11e9e1a87e713 --- /dev/null +++ b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationNames.java @@ -0,0 +1,25 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.catalyst.util; + +public class CollationNames { + public static final String UTF8_BINARY = "UTF8_BINARY"; + public static final String UTF8_LCASE = "UTF8_LCASE"; + public static final String UNICODE = "UNICODE"; + public static final String UNICODE_CI = "UNICODE_CI"; +} diff --git a/common/unsafe/src/main/java/org/apache/spark/unsafe/types/ByteArray.java b/common/unsafe/src/main/java/org/apache/spark/unsafe/types/ByteArray.java index aae47aa963201..f12408fb49313 100644 --- a/common/unsafe/src/main/java/org/apache/spark/unsafe/types/ByteArray.java +++ b/common/unsafe/src/main/java/org/apache/spark/unsafe/types/ByteArray.java @@ -135,27 +135,57 @@ public static byte[] subStringSQL(byte[] bytes, int pos, int len) { return Arrays.copyOfRange(bytes, start, end); } + /** + * Concatenate multiple byte arrays into one. + * If one of the inputs is null then null will be returned. + * + * @param inputs byte arrays to concatenate + * @return the concatenated byte array or null if one of the arguments is null + */ public static byte[] concat(byte[]... inputs) { + return concatWS(EMPTY_BYTE, inputs); + } + + /** + * Concatenate multiple byte arrays with a given delimiter. + * If the delimiter or one of the inputs is null then null will be returned. 
+ * + * @param delimiter byte array to be placed between each input + * @param inputs byte arrays to concatenate + * @return the concatenated byte array or null if one of the arguments is null + */ + public static byte[] concatWS(byte[] delimiter, byte[]... inputs) { + if (delimiter == null) { + return null; + } // Compute the total length of the result long totalLength = 0; for (byte[] input : inputs) { if (input != null) { - totalLength += input.length; + totalLength += input.length + delimiter.length; } else { return null; } } - + if (totalLength > 0) totalLength -= delimiter.length; // Allocate a new byte array, and copy the inputs one by one into it final byte[] result = new byte[Ints.checkedCast(totalLength)]; int offset = 0; - for (byte[] input : inputs) { + for (int i = 0; i < inputs.length; i++) { + byte[] input = inputs[i]; int len = input.length; Platform.copyMemory( input, Platform.BYTE_ARRAY_OFFSET, result, Platform.BYTE_ARRAY_OFFSET + offset, len); offset += len; + if (delimiter.length > 0 && i < inputs.length - 1) { + Platform.copyMemory( + delimiter, Platform.BYTE_ARRAY_OFFSET, + result, Platform.BYTE_ARRAY_OFFSET + offset, + delimiter.length); + offset += delimiter.length; + } } return result; } diff --git a/common/unsafe/src/test/java/org/apache/spark/unsafe/array/ByteArraySuite.java b/common/unsafe/src/test/java/org/apache/spark/unsafe/array/ByteArraySuite.java index aff619175ff7b..5e221b4e359d4 100644 --- a/common/unsafe/src/test/java/org/apache/spark/unsafe/array/ByteArraySuite.java +++ b/common/unsafe/src/test/java/org/apache/spark/unsafe/array/ByteArraySuite.java @@ -67,4 +67,59 @@ public void testCompareBinary() { byte[] y4 = new byte[]{(byte) 100, (byte) 200}; Assertions.assertEquals(0, ByteArray.compareBinary(x4, y4)); } + + @Test + public void testConcat() { + byte[] x1 = new byte[]{(byte) 1, (byte) 2, (byte) 3}; + byte[] y1 = new byte[]{(byte) 4, (byte) 5, (byte) 6}; + byte[] result1 = ByteArray.concat(x1, y1); + byte[] expected1 = new 
byte[]{(byte) 1, (byte) 2, (byte) 3, (byte) 4, (byte) 5, (byte) 6}; + Assertions.assertArrayEquals(expected1, result1); + + byte[] x2 = new byte[]{(byte) 1, (byte) 2, (byte) 3}; + byte[] y2 = new byte[0]; + byte[] result2 = ByteArray.concat(x2, y2); + byte[] expected2 = new byte[]{(byte) 1, (byte) 2, (byte) 3}; + Assertions.assertArrayEquals(expected2, result2); + + byte[] x3 = new byte[0]; + byte[] y3 = new byte[]{(byte) 4, (byte) 5, (byte) 6}; + byte[] result3 = ByteArray.concat(x3, y3); + byte[] expected3 = new byte[]{(byte) 4, (byte) 5, (byte) 6}; + Assertions.assertArrayEquals(expected3, result3); + + byte[] x4 = new byte[]{(byte) 1, (byte) 2, (byte) 3}; + byte[] y4 = null; + byte[] result4 = ByteArray.concat(x4, y4); + Assertions.assertArrayEquals(null, result4); + } + + @Test + public void testConcatWS() { + byte[] separator = new byte[]{(byte) 42}; + + byte[] x1 = new byte[]{(byte) 1, (byte) 2, (byte) 3}; + byte[] y1 = new byte[]{(byte) 4, (byte) 5, (byte) 6}; + byte[] result1 = ByteArray.concatWS(separator, x1, y1); + byte[] expected1 = new byte[]{(byte) 1, (byte) 2, (byte) 3, (byte) 42, + (byte) 4, (byte) 5, (byte) 6}; + Assertions.assertArrayEquals(expected1, result1); + + byte[] x2 = new byte[]{(byte) 1, (byte) 2, (byte) 3}; + byte[] y2 = new byte[0]; + byte[] result2 = ByteArray.concatWS(separator, x2, y2); + byte[] expected2 = new byte[]{(byte) 1, (byte) 2, (byte) 3, (byte) 42}; + Assertions.assertArrayEquals(expected2, result2); + + byte[] x3 = new byte[0]; + byte[] y3 = new byte[]{(byte) 4, (byte) 5, (byte) 6}; + byte[] result3 = ByteArray.concatWS(separator, x3, y3); + byte[] expected3 = new byte[]{(byte) 42, (byte) 4, (byte) 5, (byte) 6}; + Assertions.assertArrayEquals(expected3, result3); + + byte[] x4 = new byte[]{(byte) 1, (byte) 2, (byte) 3}; + byte[] y4 = null; + byte[] result4 = ByteArray.concatWS(separator, x4, y4); + Assertions.assertArrayEquals(null, result4); + } } diff --git 
a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java index a696da8cf45b8..1db163c1c822d 100644 --- a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java +++ b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java @@ -26,6 +26,7 @@ import java.util.Map; import static org.junit.jupiter.api.Assertions.*; +import static org.apache.spark.sql.catalyst.util.CollationNames.*; // checkstyle.off: AvoidEscapedUnicodeCharacters public class CollationSupportSuite { @@ -37,7 +38,7 @@ public class CollationSupportSuite { * the specified collations (as often seen in some pass-through Spark expressions). */ private final String[] testSupportedCollations = - {"UTF8_BINARY", "UTF8_LCASE", "UNICODE", "UNICODE_CI"}; + {UTF8_BINARY, UTF8_LCASE, UNICODE, UNICODE_CI}; /** * Collation-aware UTF8String comparison and equality check. @@ -86,82 +87,82 @@ public void testCompare() throws SparkException { assertCompare("a", "ä", collationName, -1); } // Advanced tests. - assertCompare("äü", "bü", "UTF8_BINARY", 1); - assertCompare("bxx", "bü", "UTF8_BINARY", -1); - assertCompare("äü", "bü", "UTF8_LCASE", 1); - assertCompare("bxx", "bü", "UTF8_LCASE", -1); - assertCompare("äü", "bü", "UNICODE", -1); - assertCompare("bxx", "bü", "UNICODE", 1); - assertCompare("äü", "bü", "UNICODE_CI", -1); - assertCompare("bxx", "bü", "UNICODE_CI", 1); + assertCompare("äü", "bü", UTF8_BINARY, 1); + assertCompare("bxx", "bü", UTF8_BINARY, -1); + assertCompare("äü", "bü", UTF8_LCASE, 1); + assertCompare("bxx", "bü", UTF8_LCASE, -1); + assertCompare("äü", "bü", UNICODE, -1); + assertCompare("bxx", "bü", UNICODE, 1); + assertCompare("äü", "bü", UNICODE_CI, -1); + assertCompare("bxx", "bü", UNICODE_CI, 1); assertCompare("cČć", "ČćC", "SR_CI_AI", 0); // Case variation. 
- assertCompare("AbCd", "aBcD", "UTF8_BINARY", -1); - assertCompare("ABCD", "abcd", "UTF8_LCASE", 0); - assertCompare("AbcD", "aBCd", "UNICODE", 1); - assertCompare("abcd", "ABCD", "UNICODE_CI", 0); + assertCompare("AbCd", "aBcD", UTF8_BINARY, -1); + assertCompare("ABCD", "abcd", UTF8_LCASE, 0); + assertCompare("AbcD", "aBCd", UNICODE, 1); + assertCompare("abcd", "ABCD", UNICODE_CI, 0); // Accent variation. - assertCompare("aBćD", "ABĆD", "UTF8_BINARY", 1); - assertCompare("AbCδ", "ABCΔ", "UTF8_LCASE", 0); - assertCompare("äBCd", "ÄBCD", "UNICODE", -1); - assertCompare("Ab́cD", "AB́CD", "UNICODE_CI", 0); + assertCompare("aBćD", "ABĆD", UTF8_BINARY, 1); + assertCompare("AbCδ", "ABCΔ", UTF8_LCASE, 0); + assertCompare("äBCd", "ÄBCD", UNICODE, -1); + assertCompare("Ab́cD", "AB́CD", UNICODE_CI, 0); assertCompare("ÈÉÊË", "EeEe", "AF_CI_AI", 0); // One-to-many case mapping (e.g. Turkish dotted I). - assertCompare("i\u0307", "İ", "UTF8_BINARY", -1); - assertCompare("İ", "i\u0307", "UTF8_BINARY", 1); - assertCompare("i\u0307", "İ", "UTF8_LCASE", 0); - assertCompare("İ", "i\u0307", "UTF8_LCASE", 0); - assertCompare("i\u0307", "İ", "UNICODE", -1); - assertCompare("İ", "i\u0307", "UNICODE", 1); - assertCompare("i\u0307", "İ", "UNICODE_CI", 0); - assertCompare("İ", "i\u0307", "UNICODE_CI", 0); - assertCompare("i\u0307İ", "i\u0307İ", "UTF8_LCASE", 0); - assertCompare("i\u0307İ", "İi\u0307", "UTF8_LCASE", 0); - assertCompare("İi\u0307", "i\u0307İ", "UTF8_LCASE", 0); - assertCompare("İi\u0307", "İi\u0307", "UTF8_LCASE", 0); - assertCompare("i\u0307İ", "i\u0307İ", "UNICODE_CI", 0); - assertCompare("i\u0307İ", "İi\u0307", "UNICODE_CI", 0); - assertCompare("İi\u0307", "i\u0307İ", "UNICODE_CI", 0); - assertCompare("İi\u0307", "İi\u0307", "UNICODE_CI", 0); + assertCompare("i\u0307", "İ", UTF8_BINARY, -1); + assertCompare("İ", "i\u0307", UTF8_BINARY, 1); + assertCompare("i\u0307", "İ", UTF8_LCASE, 0); + assertCompare("İ", "i\u0307", UTF8_LCASE, 0); + assertCompare("i\u0307", "İ", 
UNICODE, -1); + assertCompare("İ", "i\u0307", UNICODE, 1); + assertCompare("i\u0307", "İ", UNICODE_CI, 0); + assertCompare("İ", "i\u0307", UNICODE_CI, 0); + assertCompare("i\u0307İ", "i\u0307İ", UTF8_LCASE, 0); + assertCompare("i\u0307İ", "İi\u0307", UTF8_LCASE, 0); + assertCompare("İi\u0307", "i\u0307İ", UTF8_LCASE, 0); + assertCompare("İi\u0307", "İi\u0307", UTF8_LCASE, 0); + assertCompare("i\u0307İ", "i\u0307İ", UNICODE_CI, 0); + assertCompare("i\u0307İ", "İi\u0307", UNICODE_CI, 0); + assertCompare("İi\u0307", "i\u0307İ", UNICODE_CI, 0); + assertCompare("İi\u0307", "İi\u0307", UNICODE_CI, 0); // Conditional case mapping (e.g. Greek sigmas). - assertCompare("ς", "σ", "UTF8_BINARY", -1); - assertCompare("ς", "Σ", "UTF8_BINARY", 1); - assertCompare("σ", "Σ", "UTF8_BINARY", 1); - assertCompare("ς", "σ", "UTF8_LCASE", 0); - assertCompare("ς", "Σ", "UTF8_LCASE", 0); - assertCompare("σ", "Σ", "UTF8_LCASE", 0); - assertCompare("ς", "σ", "UNICODE", 1); - assertCompare("ς", "Σ", "UNICODE", 1); - assertCompare("σ", "Σ", "UNICODE", -1); - assertCompare("ς", "σ", "UNICODE_CI", 0); - assertCompare("ς", "Σ", "UNICODE_CI", 0); - assertCompare("σ", "Σ", "UNICODE_CI", 0); + assertCompare("ς", "σ", UTF8_BINARY, -1); + assertCompare("ς", "Σ", UTF8_BINARY, 1); + assertCompare("σ", "Σ", UTF8_BINARY, 1); + assertCompare("ς", "σ", UTF8_LCASE, 0); + assertCompare("ς", "Σ", UTF8_LCASE, 0); + assertCompare("σ", "Σ", UTF8_LCASE, 0); + assertCompare("ς", "σ", UNICODE, 1); + assertCompare("ς", "Σ", UNICODE, 1); + assertCompare("σ", "Σ", UNICODE, -1); + assertCompare("ς", "σ", UNICODE_CI, 0); + assertCompare("ς", "Σ", UNICODE_CI, 0); + assertCompare("σ", "Σ", UNICODE_CI, 0); // Surrogate pairs. 
- assertCompare("a🙃b🙃c", "aaaaa", "UTF8_BINARY", 1); - assertCompare("a🙃b🙃c", "aaaaa", "UTF8_LCASE", 1); - assertCompare("a🙃b🙃c", "aaaaa", "UNICODE", -1); // != UTF8_BINARY - assertCompare("a🙃b🙃c", "aaaaa", "UNICODE_CI", -1); // != UTF8_LCASE - assertCompare("a🙃b🙃c", "a🙃b🙃c", "UTF8_BINARY", 0); - assertCompare("a🙃b🙃c", "a🙃b🙃c", "UTF8_LCASE", 0); - assertCompare("a🙃b🙃c", "a🙃b🙃c", "UNICODE", 0); - assertCompare("a🙃b🙃c", "a🙃b🙃c", "UNICODE_CI", 0); - assertCompare("a🙃b🙃c", "a🙃b🙃d", "UTF8_BINARY", -1); - assertCompare("a🙃b🙃c", "a🙃b🙃d", "UTF8_LCASE", -1); - assertCompare("a🙃b🙃c", "a🙃b🙃d", "UNICODE", -1); - assertCompare("a🙃b🙃c", "a🙃b🙃d", "UNICODE_CI", -1); + assertCompare("a🙃b🙃c", "aaaaa", UTF8_BINARY, 1); + assertCompare("a🙃b🙃c", "aaaaa", UTF8_LCASE, 1); + assertCompare("a🙃b🙃c", "aaaaa", UNICODE, -1); // != UTF8_BINARY + assertCompare("a🙃b🙃c", "aaaaa", UNICODE_CI, -1); // != UTF8_LCASE + assertCompare("a🙃b🙃c", "a🙃b🙃c", UTF8_BINARY, 0); + assertCompare("a🙃b🙃c", "a🙃b🙃c", UTF8_LCASE, 0); + assertCompare("a🙃b🙃c", "a🙃b🙃c", UNICODE, 0); + assertCompare("a🙃b🙃c", "a🙃b🙃c", UNICODE_CI, 0); + assertCompare("a🙃b🙃c", "a🙃b🙃d", UTF8_BINARY, -1); + assertCompare("a🙃b🙃c", "a🙃b🙃d", UTF8_LCASE, -1); + assertCompare("a🙃b🙃c", "a🙃b🙃d", UNICODE, -1); + assertCompare("a🙃b🙃c", "a🙃b🙃d", UNICODE_CI, -1); // Maximum code point. int maxCodePoint = Character.MAX_CODE_POINT; String maxCodePointStr = new String(Character.toChars(maxCodePoint)); for (int i = 0; i < maxCodePoint && Character.isValidCodePoint(i); ++i) { - assertCompare(new String(Character.toChars(i)), maxCodePointStr, "UTF8_BINARY", -1); - assertCompare(new String(Character.toChars(i)), maxCodePointStr, "UTF8_LCASE", -1); + assertCompare(new String(Character.toChars(i)), maxCodePointStr, UTF8_BINARY, -1); + assertCompare(new String(Character.toChars(i)), maxCodePointStr, UTF8_LCASE, -1); } // Minimum code point. 
int minCodePoint = Character.MIN_CODE_POINT; String minCodePointStr = new String(Character.toChars(minCodePoint)); for (int i = minCodePoint + 1; i <= maxCodePoint && Character.isValidCodePoint(i); ++i) { - assertCompare(new String(Character.toChars(i)), minCodePointStr, "UTF8_BINARY", 1); - assertCompare(new String(Character.toChars(i)), minCodePointStr, "UTF8_LCASE", 1); + assertCompare(new String(Character.toChars(i)), minCodePointStr, UTF8_BINARY, 1); + assertCompare(new String(Character.toChars(i)), minCodePointStr, UTF8_LCASE, 1); } } @@ -302,201 +303,201 @@ public void testContains() throws SparkException { assertContains("Здраво", "Здраво", collationName, true); } // Advanced tests. - assertContains("abcde", "bcd", "UTF8_BINARY", true); - assertContains("abcde", "bde", "UTF8_BINARY", false); - assertContains("abcde", "fgh", "UTF8_BINARY", false); - assertContains("abcde", "abcde", "UNICODE", true); - assertContains("abcde", "aBcDe", "UNICODE", false); - assertContains("abcde", "fghij", "UNICODE", false); - assertContains("abcde", "C", "UTF8_LCASE", true); - assertContains("abcde", "AbCdE", "UTF8_LCASE", true); - assertContains("abcde", "X", "UTF8_LCASE", false); - assertContains("abcde", "c", "UNICODE_CI", true); - assertContains("abcde", "bCD", "UNICODE_CI", true); - assertContains("abcde", "123", "UNICODE_CI", false); - assertContains("ab世De", "b世D", "UTF8_BINARY", true); - assertContains("ab世De", "B世d", "UTF8_BINARY", false); - assertContains("äbćδe", "bćδ", "UTF8_BINARY", true); - assertContains("äbćδe", "BcΔ", "UTF8_BINARY", false); - assertContains("ab世De", "ab世De", "UNICODE", true); - assertContains("ab世De", "AB世dE", "UNICODE", false); - assertContains("äbćδe", "äbćδe", "UNICODE", true); - assertContains("äbćδe", "ÄBcΔÉ", "UNICODE", false); - assertContains("ab世De", "b世D", "UTF8_LCASE", true); - assertContains("ab世De", "B世d", "UTF8_LCASE", true); - assertContains("äbćδe", "bćδ", "UTF8_LCASE", true); - assertContains("äbćδe", "BcΔ", "UTF8_LCASE", 
false); - assertContains("ab世De", "ab世De", "UNICODE_CI", true); - assertContains("ab世De", "AB世dE", "UNICODE_CI", true); - assertContains("äbćδe", "ÄbćδE", "UNICODE_CI", true); - assertContains("äbćδe", "ÄBcΔÉ", "UNICODE_CI", false); - assertContains("The Kelvin.", "Kelvin", "UTF8_LCASE", true); - assertContains("The Kelvin.", "Kelvin", "UTF8_LCASE", true); - assertContains("The KKelvin.", "KKelvin", "UTF8_LCASE", true); - assertContains("2 Kelvin.", "2 Kelvin", "UTF8_LCASE", true); - assertContains("2 Kelvin.", "2 Kelvin", "UTF8_LCASE", true); - assertContains("The KKelvin.", "KKelvin,", "UTF8_LCASE", false); + assertContains("abcde", "bcd", UTF8_BINARY, true); + assertContains("abcde", "bde", UTF8_BINARY, false); + assertContains("abcde", "fgh", UTF8_BINARY, false); + assertContains("abcde", "abcde", UNICODE, true); + assertContains("abcde", "aBcDe", UNICODE, false); + assertContains("abcde", "fghij", UNICODE, false); + assertContains("abcde", "C", UTF8_LCASE, true); + assertContains("abcde", "AbCdE", UTF8_LCASE, true); + assertContains("abcde", "X", UTF8_LCASE, false); + assertContains("abcde", "c", UNICODE_CI, true); + assertContains("abcde", "bCD", UNICODE_CI, true); + assertContains("abcde", "123", UNICODE_CI, false); + assertContains("ab世De", "b世D", UTF8_BINARY, true); + assertContains("ab世De", "B世d", UTF8_BINARY, false); + assertContains("äbćδe", "bćδ", UTF8_BINARY, true); + assertContains("äbćδe", "BcΔ", UTF8_BINARY, false); + assertContains("ab世De", "ab世De", UNICODE, true); + assertContains("ab世De", "AB世dE", UNICODE, false); + assertContains("äbćδe", "äbćδe", UNICODE, true); + assertContains("äbćδe", "ÄBcΔÉ", UNICODE, false); + assertContains("ab世De", "b世D", UTF8_LCASE, true); + assertContains("ab世De", "B世d", UTF8_LCASE, true); + assertContains("äbćδe", "bćδ", UTF8_LCASE, true); + assertContains("äbćδe", "BcΔ", UTF8_LCASE, false); + assertContains("ab世De", "ab世De", UNICODE_CI, true); + assertContains("ab世De", "AB世dE", UNICODE_CI, true); + 
assertContains("äbćδe", "ÄbćδE", UNICODE_CI, true); + assertContains("äbćδe", "ÄBcΔÉ", UNICODE_CI, false); + assertContains("The Kelvin.", "Kelvin", UTF8_LCASE, true); + assertContains("The Kelvin.", "Kelvin", UTF8_LCASE, true); + assertContains("The KKelvin.", "KKelvin", UTF8_LCASE, true); + assertContains("2 Kelvin.", "2 Kelvin", UTF8_LCASE, true); + assertContains("2 Kelvin.", "2 Kelvin", UTF8_LCASE, true); + assertContains("The KKelvin.", "KKelvin,", UTF8_LCASE, false); assertContains("abčćd", "ABCCD", "SR_CI_AI", true); // Case variation. - assertContains("aBcDe", "bcd", "UTF8_BINARY", false); - assertContains("aBcDe", "BcD", "UTF8_BINARY", true); - assertContains("aBcDe", "abcde", "UNICODE", false); - assertContains("aBcDe", "aBcDe", "UNICODE", true); - assertContains("aBcDe", "bcd", "UTF8_LCASE", true); - assertContains("aBcDe", "BCD", "UTF8_LCASE", true); - assertContains("aBcDe", "abcde", "UNICODE_CI", true); - assertContains("aBcDe", "AbCdE", "UNICODE_CI", true); + assertContains("aBcDe", "bcd", UTF8_BINARY, false); + assertContains("aBcDe", "BcD", UTF8_BINARY, true); + assertContains("aBcDe", "abcde", UNICODE, false); + assertContains("aBcDe", "aBcDe", UNICODE, true); + assertContains("aBcDe", "bcd", UTF8_LCASE, true); + assertContains("aBcDe", "BCD", UTF8_LCASE, true); + assertContains("aBcDe", "abcde", UNICODE_CI, true); + assertContains("aBcDe", "AbCdE", UNICODE_CI, true); // Accent variation. 
- assertContains("aBcDe", "bćd", "UTF8_BINARY", false); - assertContains("aBcDe", "BćD", "UTF8_BINARY", false); - assertContains("aBcDe", "abćde", "UNICODE", false); - assertContains("aBcDe", "aBćDe", "UNICODE", false); - assertContains("aBcDe", "bćd", "UTF8_LCASE", false); - assertContains("aBcDe", "BĆD", "UTF8_LCASE", false); - assertContains("aBcDe", "abćde", "UNICODE_CI", false); - assertContains("aBcDe", "AbĆdE", "UNICODE_CI", false); + assertContains("aBcDe", "bćd", UTF8_BINARY, false); + assertContains("aBcDe", "BćD", UTF8_BINARY, false); + assertContains("aBcDe", "abćde", UNICODE, false); + assertContains("aBcDe", "aBćDe", UNICODE, false); + assertContains("aBcDe", "bćd", UTF8_LCASE, false); + assertContains("aBcDe", "BĆD", UTF8_LCASE, false); + assertContains("aBcDe", "abćde", UNICODE_CI, false); + assertContains("aBcDe", "AbĆdE", UNICODE_CI, false); assertContains("abEEE", "Bèêë", "AF_CI_AI", true); // One-to-many case mapping (e.g. Turkish dotted I). - assertContains("i\u0307", "i", "UNICODE_CI", false); - assertContains("i\u0307", "\u0307", "UNICODE_CI", false); - assertContains("i\u0307", "İ", "UNICODE_CI", true); - assertContains("İ", "i", "UNICODE_CI", false); - assertContains("adi̇os", "io", "UNICODE_CI", false); - assertContains("adi̇os", "Io", "UNICODE_CI", false); - assertContains("adi̇os", "i\u0307o", "UNICODE_CI", true); - assertContains("adi̇os", "İo", "UNICODE_CI", true); - assertContains("adİos", "io", "UNICODE_CI", false); - assertContains("adİos", "Io", "UNICODE_CI", false); - assertContains("adİos", "i\u0307o", "UNICODE_CI", true); - assertContains("adİos", "İo", "UNICODE_CI", true); - assertContains("i\u0307", "i", "UTF8_LCASE", true); // != UNICODE_CI - assertContains("İ", "\u0307", "UTF8_LCASE", false); - assertContains("İ", "i", "UTF8_LCASE", false); - assertContains("i\u0307", "\u0307", "UTF8_LCASE", true); // != UNICODE_CI - assertContains("i\u0307", "İ", "UTF8_LCASE", true); - assertContains("İ", "i", "UTF8_LCASE", false); - 
assertContains("adi̇os", "io", "UTF8_LCASE", false); - assertContains("adi̇os", "Io", "UTF8_LCASE", false); - assertContains("adi̇os", "i\u0307o", "UTF8_LCASE", true); - assertContains("adi̇os", "İo", "UTF8_LCASE", true); - assertContains("adİos", "io", "UTF8_LCASE", false); - assertContains("adİos", "Io", "UTF8_LCASE", false); - assertContains("adİos", "i\u0307o", "UTF8_LCASE", true); - assertContains("adİos", "İo", "UTF8_LCASE", true); + assertContains("i\u0307", "i", UNICODE_CI, false); + assertContains("i\u0307", "\u0307", UNICODE_CI, false); + assertContains("i\u0307", "İ", UNICODE_CI, true); + assertContains("İ", "i", UNICODE_CI, false); + assertContains("adi̇os", "io", UNICODE_CI, false); + assertContains("adi̇os", "Io", UNICODE_CI, false); + assertContains("adi̇os", "i\u0307o", UNICODE_CI, true); + assertContains("adi̇os", "İo", UNICODE_CI, true); + assertContains("adİos", "io", UNICODE_CI, false); + assertContains("adİos", "Io", UNICODE_CI, false); + assertContains("adİos", "i\u0307o", UNICODE_CI, true); + assertContains("adİos", "İo", UNICODE_CI, true); + assertContains("i\u0307", "i", UTF8_LCASE, true); // != UNICODE_CI + assertContains("İ", "\u0307", UTF8_LCASE, false); + assertContains("İ", "i", UTF8_LCASE, false); + assertContains("i\u0307", "\u0307", UTF8_LCASE, true); // != UNICODE_CI + assertContains("i\u0307", "İ", UTF8_LCASE, true); + assertContains("İ", "i", UTF8_LCASE, false); + assertContains("adi̇os", "io", UTF8_LCASE, false); + assertContains("adi̇os", "Io", UTF8_LCASE, false); + assertContains("adi̇os", "i\u0307o", UTF8_LCASE, true); + assertContains("adi̇os", "İo", UTF8_LCASE, true); + assertContains("adİos", "io", UTF8_LCASE, false); + assertContains("adİos", "Io", UTF8_LCASE, false); + assertContains("adİos", "i\u0307o", UTF8_LCASE, true); + assertContains("adİos", "İo", UTF8_LCASE, true); // Conditional case mapping (e.g. Greek sigmas). 
- assertContains("σ", "σ", "UTF8_BINARY", true); - assertContains("σ", "ς", "UTF8_BINARY", false); - assertContains("σ", "Σ", "UTF8_BINARY", false); - assertContains("ς", "σ", "UTF8_BINARY", false); - assertContains("ς", "ς", "UTF8_BINARY", true); - assertContains("ς", "Σ", "UTF8_BINARY", false); - assertContains("Σ", "σ", "UTF8_BINARY", false); - assertContains("Σ", "ς", "UTF8_BINARY", false); - assertContains("Σ", "Σ", "UTF8_BINARY", true); - assertContains("σ", "σ", "UTF8_LCASE", true); - assertContains("σ", "ς", "UTF8_LCASE", true); - assertContains("σ", "Σ", "UTF8_LCASE", true); - assertContains("ς", "σ", "UTF8_LCASE", true); - assertContains("ς", "ς", "UTF8_LCASE", true); - assertContains("ς", "Σ", "UTF8_LCASE", true); - assertContains("Σ", "σ", "UTF8_LCASE", true); - assertContains("Σ", "ς", "UTF8_LCASE", true); - assertContains("Σ", "Σ", "UTF8_LCASE", true); - assertContains("σ", "σ", "UNICODE", true); - assertContains("σ", "ς", "UNICODE", false); - assertContains("σ", "Σ", "UNICODE", false); - assertContains("ς", "σ", "UNICODE", false); - assertContains("ς", "ς", "UNICODE", true); - assertContains("ς", "Σ", "UNICODE", false); - assertContains("Σ", "σ", "UNICODE", false); - assertContains("Σ", "ς", "UNICODE", false); - assertContains("Σ", "Σ", "UNICODE", true); - assertContains("σ", "σ", "UNICODE_CI", true); - assertContains("σ", "ς", "UNICODE_CI", true); - assertContains("σ", "Σ", "UNICODE_CI", true); - assertContains("ς", "σ", "UNICODE_CI", true); - assertContains("ς", "ς", "UNICODE_CI", true); - assertContains("ς", "Σ", "UNICODE_CI", true); - assertContains("Σ", "σ", "UNICODE_CI", true); - assertContains("Σ", "ς", "UNICODE_CI", true); - assertContains("Σ", "Σ", "UNICODE_CI", true); - assertContains("ΣΑΛΑΤΑ", "Σ", "UTF8_BINARY", true); - assertContains("ΣΑΛΑΤΑ", "σ", "UTF8_BINARY", false); - assertContains("ΣΑΛΑΤΑ", "ς", "UTF8_BINARY", false); - assertContains("ΘΑΛΑΣΣΙΝΟΣ", "Σ", "UTF8_BINARY", true); - assertContains("ΘΑΛΑΣΣΙΝΟΣ", "σ", "UTF8_BINARY", 
false); - assertContains("ΘΑΛΑΣΣΙΝΟΣ", "ς", "UTF8_BINARY", false); - assertContains("ΣΑΛΑΤΑ", "Σ", "UTF8_LCASE", true); - assertContains("ΣΑΛΑΤΑ", "σ", "UTF8_LCASE", true); - assertContains("ΣΑΛΑΤΑ", "ς", "UTF8_LCASE", true); - assertContains("ΘΑΛΑΣΣΙΝΟΣ", "Σ", "UTF8_LCASE", true); - assertContains("ΘΑΛΑΣΣΙΝΟΣ", "σ", "UTF8_LCASE", true); - assertContains("ΘΑΛΑΣΣΙΝΟΣ", "ς", "UTF8_LCASE", true); - assertContains("ΣΑΛΑΤΑ", "Σ", "UNICODE", true); - assertContains("ΣΑΛΑΤΑ", "σ", "UNICODE", false); - assertContains("ΣΑΛΑΤΑ", "ς", "UNICODE", false); - assertContains("ΘΑΛΑΣΣΙΝΟΣ", "Σ", "UNICODE", true); - assertContains("ΘΑΛΑΣΣΙΝΟΣ", "σ", "UNICODE", false); - assertContains("ΘΑΛΑΣΣΙΝΟΣ", "ς", "UNICODE", false); - assertContains("ΣΑΛΑΤΑ", "Σ", "UNICODE_CI", true); - assertContains("ΣΑΛΑΤΑ", "σ", "UNICODE_CI", true); - assertContains("ΣΑΛΑΤΑ", "ς", "UNICODE_CI", true); - assertContains("ΘΑΛΑΣΣΙΝΟΣ", "Σ", "UNICODE_CI", true); - assertContains("ΘΑΛΑΣΣΙΝΟΣ", "σ", "UNICODE_CI", true); - assertContains("ΘΑΛΑΣΣΙΝΟΣ", "ς", "UNICODE_CI", true); + assertContains("σ", "σ", UTF8_BINARY, true); + assertContains("σ", "ς", UTF8_BINARY, false); + assertContains("σ", "Σ", UTF8_BINARY, false); + assertContains("ς", "σ", UTF8_BINARY, false); + assertContains("ς", "ς", UTF8_BINARY, true); + assertContains("ς", "Σ", UTF8_BINARY, false); + assertContains("Σ", "σ", UTF8_BINARY, false); + assertContains("Σ", "ς", UTF8_BINARY, false); + assertContains("Σ", "Σ", UTF8_BINARY, true); + assertContains("σ", "σ", UTF8_LCASE, true); + assertContains("σ", "ς", UTF8_LCASE, true); + assertContains("σ", "Σ", UTF8_LCASE, true); + assertContains("ς", "σ", UTF8_LCASE, true); + assertContains("ς", "ς", UTF8_LCASE, true); + assertContains("ς", "Σ", UTF8_LCASE, true); + assertContains("Σ", "σ", UTF8_LCASE, true); + assertContains("Σ", "ς", UTF8_LCASE, true); + assertContains("Σ", "Σ", UTF8_LCASE, true); + assertContains("σ", "σ", UNICODE, true); + assertContains("σ", "ς", UNICODE, false); + assertContains("σ", "Σ", 
UNICODE, false); + assertContains("ς", "σ", UNICODE, false); + assertContains("ς", "ς", UNICODE, true); + assertContains("ς", "Σ", UNICODE, false); + assertContains("Σ", "σ", UNICODE, false); + assertContains("Σ", "ς", UNICODE, false); + assertContains("Σ", "Σ", UNICODE, true); + assertContains("σ", "σ", UNICODE_CI, true); + assertContains("σ", "ς", UNICODE_CI, true); + assertContains("σ", "Σ", UNICODE_CI, true); + assertContains("ς", "σ", UNICODE_CI, true); + assertContains("ς", "ς", UNICODE_CI, true); + assertContains("ς", "Σ", UNICODE_CI, true); + assertContains("Σ", "σ", UNICODE_CI, true); + assertContains("Σ", "ς", UNICODE_CI, true); + assertContains("Σ", "Σ", UNICODE_CI, true); + assertContains("ΣΑΛΑΤΑ", "Σ", UTF8_BINARY, true); + assertContains("ΣΑΛΑΤΑ", "σ", UTF8_BINARY, false); + assertContains("ΣΑΛΑΤΑ", "ς", UTF8_BINARY, false); + assertContains("ΘΑΛΑΣΣΙΝΟΣ", "Σ", UTF8_BINARY, true); + assertContains("ΘΑΛΑΣΣΙΝΟΣ", "σ", UTF8_BINARY, false); + assertContains("ΘΑΛΑΣΣΙΝΟΣ", "ς", UTF8_BINARY, false); + assertContains("ΣΑΛΑΤΑ", "Σ", UTF8_LCASE, true); + assertContains("ΣΑΛΑΤΑ", "σ", UTF8_LCASE, true); + assertContains("ΣΑΛΑΤΑ", "ς", UTF8_LCASE, true); + assertContains("ΘΑΛΑΣΣΙΝΟΣ", "Σ", UTF8_LCASE, true); + assertContains("ΘΑΛΑΣΣΙΝΟΣ", "σ", UTF8_LCASE, true); + assertContains("ΘΑΛΑΣΣΙΝΟΣ", "ς", UTF8_LCASE, true); + assertContains("ΣΑΛΑΤΑ", "Σ", UNICODE, true); + assertContains("ΣΑΛΑΤΑ", "σ", UNICODE, false); + assertContains("ΣΑΛΑΤΑ", "ς", UNICODE, false); + assertContains("ΘΑΛΑΣΣΙΝΟΣ", "Σ", UNICODE, true); + assertContains("ΘΑΛΑΣΣΙΝΟΣ", "σ", UNICODE, false); + assertContains("ΘΑΛΑΣΣΙΝΟΣ", "ς", UNICODE, false); + assertContains("ΣΑΛΑΤΑ", "Σ", UNICODE_CI, true); + assertContains("ΣΑΛΑΤΑ", "σ", UNICODE_CI, true); + assertContains("ΣΑΛΑΤΑ", "ς", UNICODE_CI, true); + assertContains("ΘΑΛΑΣΣΙΝΟΣ", "Σ", UNICODE_CI, true); + assertContains("ΘΑΛΑΣΣΙΝΟΣ", "σ", UNICODE_CI, true); + assertContains("ΘΑΛΑΣΣΙΝΟΣ", "ς", UNICODE_CI, true); // Surrogate pairs. 
- assertContains("a🙃b🙃c", "x", "UTF8_BINARY", false); - assertContains("a🙃b🙃c", "x", "UTF8_LCASE", false); - assertContains("a🙃b🙃c", "x", "UNICODE", false); - assertContains("a🙃b🙃c", "x", "UNICODE_CI", false); - assertContains("a🙃b🙃c", "b", "UTF8_BINARY", true); - assertContains("a🙃b🙃c", "b", "UTF8_LCASE", true); - assertContains("a🙃b🙃c", "b", "UNICODE", true); - assertContains("a🙃b🙃c", "b", "UNICODE_CI", true); - assertContains("a🙃b🙃c", "a🙃b", "UTF8_BINARY", true); - assertContains("a🙃b🙃c", "a🙃b", "UTF8_LCASE", true); - assertContains("a🙃b🙃c", "a🙃b", "UNICODE", true); - assertContains("a🙃b🙃c", "a🙃b", "UNICODE_CI", true); - assertContains("a🙃b🙃c", "b🙃c", "UTF8_BINARY", true); - assertContains("a🙃b🙃c", "b🙃c", "UTF8_LCASE", true); - assertContains("a🙃b🙃c", "b🙃c", "UNICODE", true); - assertContains("a🙃b🙃c", "b🙃c", "UNICODE_CI", true); - assertContains("a🙃b🙃c", "a🙃b🙃c", "UTF8_BINARY", true); - assertContains("a🙃b🙃c", "a🙃b🙃c", "UTF8_LCASE", true); - assertContains("a🙃b🙃c", "a🙃b🙃c", "UNICODE", true); - assertContains("a🙃b🙃c", "a🙃b🙃c", "UNICODE_CI", true); - assertContains("😀😆😃😄", "😄😆", "UTF8_BINARY", false); - assertContains("😀😆😃😄", "😄😆", "UTF8_LCASE", false); - assertContains("😀😆😃😄", "😄😆", "UNICODE", false); - assertContains("😀😆😃😄", "😄😆", "UNICODE_CI", false); - assertContains("😀😆😃😄", "😆😃", "UTF8_BINARY", true); - assertContains("😀😆😃😄", "😆😃", "UTF8_LCASE", true); - assertContains("😀😆😃😄", "😆😃", "UNICODE", true); - assertContains("😀😆😃😄", "😆😃", "UNICODE_CI", true); - assertContains("😀😆😃😄", "😀😆", "UTF8_BINARY", true); - assertContains("😀😆😃😄", "😀😆", "UTF8_LCASE", true); - assertContains("😀😆😃😄", "😀😆", "UNICODE", true); - assertContains("😀😆😃😄", "😀😆", "UNICODE_CI", true); - assertContains("😀😆😃😄", "😃😄", "UTF8_BINARY", true); - assertContains("😀😆😃😄", "😃😄", "UTF8_LCASE", true); - assertContains("😀😆😃😄", "😃😄", "UNICODE", true); - assertContains("😀😆😃😄", "😃😄", "UNICODE_CI", true); - assertContains("😀😆😃😄", "😀😆😃😄", "UTF8_BINARY", true); - assertContains("😀😆😃😄", "😀😆😃😄", "UTF8_LCASE", 
true); - assertContains("😀😆😃😄", "😀😆😃😄", "UNICODE", true); - assertContains("😀😆😃😄", "😀😆😃😄", "UNICODE_CI", true); - assertContains("𐐅", "𐐅", "UTF8_BINARY", true); - assertContains("𐐅", "𐐅", "UTF8_LCASE", true); - assertContains("𐐅", "𐐅", "UNICODE", true); - assertContains("𐐅", "𐐅", "UNICODE_CI", true); - assertContains("𐐅", "𐐭", "UTF8_BINARY", false); - assertContains("𐐅", "𐐭", "UTF8_LCASE", true); - assertContains("𐐅", "𐐭", "UNICODE", false); - assertContains("𐐅", "𐐭", "UNICODE_CI", true); - assertContains("𝔸", "𝔸", "UTF8_BINARY", true); - assertContains("𝔸", "𝔸", "UTF8_LCASE", true); - assertContains("𝔸", "𝔸", "UNICODE", true); - assertContains("𝔸", "𝔸", "UNICODE_CI", true); + assertContains("a🙃b🙃c", "x", UTF8_BINARY, false); + assertContains("a🙃b🙃c", "x", UTF8_LCASE, false); + assertContains("a🙃b🙃c", "x", UNICODE, false); + assertContains("a🙃b🙃c", "x", UNICODE_CI, false); + assertContains("a🙃b🙃c", "b", UTF8_BINARY, true); + assertContains("a🙃b🙃c", "b", UTF8_LCASE, true); + assertContains("a🙃b🙃c", "b", UNICODE, true); + assertContains("a🙃b🙃c", "b", UNICODE_CI, true); + assertContains("a🙃b🙃c", "a🙃b", UTF8_BINARY, true); + assertContains("a🙃b🙃c", "a🙃b", UTF8_LCASE, true); + assertContains("a🙃b🙃c", "a🙃b", UNICODE, true); + assertContains("a🙃b🙃c", "a🙃b", UNICODE_CI, true); + assertContains("a🙃b🙃c", "b🙃c", UTF8_BINARY, true); + assertContains("a🙃b🙃c", "b🙃c", UTF8_LCASE, true); + assertContains("a🙃b🙃c", "b🙃c", UNICODE, true); + assertContains("a🙃b🙃c", "b🙃c", UNICODE_CI, true); + assertContains("a🙃b🙃c", "a🙃b🙃c", UTF8_BINARY, true); + assertContains("a🙃b🙃c", "a🙃b🙃c", UTF8_LCASE, true); + assertContains("a🙃b🙃c", "a🙃b🙃c", UNICODE, true); + assertContains("a🙃b🙃c", "a🙃b🙃c", UNICODE_CI, true); + assertContains("😀😆😃😄", "😄😆", UTF8_BINARY, false); + assertContains("😀😆😃😄", "😄😆", UTF8_LCASE, false); + assertContains("😀😆😃😄", "😄😆", UNICODE, false); + assertContains("😀😆😃😄", "😄😆", UNICODE_CI, false); + assertContains("😀😆😃😄", "😆😃", UTF8_BINARY, true); + assertContains("😀😆😃😄", "😆😃", 
UTF8_LCASE, true); + assertContains("😀😆😃😄", "😆😃", UNICODE, true); + assertContains("😀😆😃😄", "😆😃", UNICODE_CI, true); + assertContains("😀😆😃😄", "😀😆", UTF8_BINARY, true); + assertContains("😀😆😃😄", "😀😆", UTF8_LCASE, true); + assertContains("😀😆😃😄", "😀😆", UNICODE, true); + assertContains("😀😆😃😄", "😀😆", UNICODE_CI, true); + assertContains("😀😆😃😄", "😃😄", UTF8_BINARY, true); + assertContains("😀😆😃😄", "😃😄", UTF8_LCASE, true); + assertContains("😀😆😃😄", "😃😄", UNICODE, true); + assertContains("😀😆😃😄", "😃😄", UNICODE_CI, true); + assertContains("😀😆😃😄", "😀😆😃😄", UTF8_BINARY, true); + assertContains("😀😆😃😄", "😀😆😃😄", UTF8_LCASE, true); + assertContains("😀😆😃😄", "😀😆😃😄", UNICODE, true); + assertContains("😀😆😃😄", "😀😆😃😄", UNICODE_CI, true); + assertContains("𐐅", "𐐅", UTF8_BINARY, true); + assertContains("𐐅", "𐐅", UTF8_LCASE, true); + assertContains("𐐅", "𐐅", UNICODE, true); + assertContains("𐐅", "𐐅", UNICODE_CI, true); + assertContains("𐐅", "𐐭", UTF8_BINARY, false); + assertContains("𐐅", "𐐭", UTF8_LCASE, true); + assertContains("𐐅", "𐐭", UNICODE, false); + assertContains("𐐅", "𐐭", UNICODE_CI, true); + assertContains("𝔸", "𝔸", UTF8_BINARY, true); + assertContains("𝔸", "𝔸", UTF8_LCASE, true); + assertContains("𝔸", "𝔸", UNICODE, true); + assertContains("𝔸", "𝔸", UNICODE_CI, true); } /** @@ -549,211 +550,211 @@ public void testStartsWith() throws SparkException { assertStartsWith("Здраво", "Здраво", collationName, true); } // Advanced tests. 
- assertStartsWith("abcde", "abc", "UTF8_BINARY", true); - assertStartsWith("abcde", "abd", "UTF8_BINARY", false); - assertStartsWith("abcde", "fgh", "UTF8_BINARY", false); - assertStartsWith("abcde", "abcde", "UNICODE", true); - assertStartsWith("abcde", "aBcDe", "UNICODE", false); - assertStartsWith("abcde", "fghij", "UNICODE", false); - assertStartsWith("abcde", "A", "UTF8_LCASE", true); - assertStartsWith("abcde", "AbCdE", "UTF8_LCASE", true); - assertStartsWith("abcde", "X", "UTF8_LCASE", false); - assertStartsWith("abcde", "a", "UNICODE_CI", true); - assertStartsWith("abcde", "aBC", "UNICODE_CI", true); - assertStartsWith("abcde", "bcd", "UNICODE_CI", false); - assertStartsWith("abcde", "123", "UNICODE_CI", false); - assertStartsWith("ab世De", "ab世", "UTF8_BINARY", true); - assertStartsWith("ab世De", "aB世", "UTF8_BINARY", false); - assertStartsWith("äbćδe", "äbć", "UTF8_BINARY", true); - assertStartsWith("äbćδe", "äBc", "UTF8_BINARY", false); - assertStartsWith("ab世De", "ab世De", "UNICODE", true); - assertStartsWith("ab世De", "AB世dE", "UNICODE", false); - assertStartsWith("äbćδe", "äbćδe", "UNICODE", true); - assertStartsWith("äbćδe", "ÄBcΔÉ", "UNICODE", false); - assertStartsWith("ab世De", "ab世", "UTF8_LCASE", true); - assertStartsWith("ab世De", "aB世", "UTF8_LCASE", true); - assertStartsWith("äbćδe", "äbć", "UTF8_LCASE", true); - assertStartsWith("äbćδe", "äBc", "UTF8_LCASE", false); - assertStartsWith("ab世De", "ab世De", "UNICODE_CI", true); - assertStartsWith("ab世De", "AB世dE", "UNICODE_CI", true); - assertStartsWith("äbćδe", "ÄbćδE", "UNICODE_CI", true); - assertStartsWith("äbćδe", "ÄBcΔÉ", "UNICODE_CI", false); - assertStartsWith("Kelvin.", "Kelvin", "UTF8_LCASE", true); - assertStartsWith("Kelvin.", "Kelvin", "UTF8_LCASE", true); - assertStartsWith("KKelvin.", "KKelvin", "UTF8_LCASE", true); - assertStartsWith("2 Kelvin.", "2 Kelvin", "UTF8_LCASE", true); - assertStartsWith("2 Kelvin.", "2 Kelvin", "UTF8_LCASE", true); - assertStartsWith("KKelvin.", "KKelvin,", 
"UTF8_LCASE", false); + assertStartsWith("abcde", "abc", UTF8_BINARY, true); + assertStartsWith("abcde", "abd", UTF8_BINARY, false); + assertStartsWith("abcde", "fgh", UTF8_BINARY, false); + assertStartsWith("abcde", "abcde", UNICODE, true); + assertStartsWith("abcde", "aBcDe", UNICODE, false); + assertStartsWith("abcde", "fghij", UNICODE, false); + assertStartsWith("abcde", "A", UTF8_LCASE, true); + assertStartsWith("abcde", "AbCdE", UTF8_LCASE, true); + assertStartsWith("abcde", "X", UTF8_LCASE, false); + assertStartsWith("abcde", "a", UNICODE_CI, true); + assertStartsWith("abcde", "aBC", UNICODE_CI, true); + assertStartsWith("abcde", "bcd", UNICODE_CI, false); + assertStartsWith("abcde", "123", UNICODE_CI, false); + assertStartsWith("ab世De", "ab世", UTF8_BINARY, true); + assertStartsWith("ab世De", "aB世", UTF8_BINARY, false); + assertStartsWith("äbćδe", "äbć", UTF8_BINARY, true); + assertStartsWith("äbćδe", "äBc", UTF8_BINARY, false); + assertStartsWith("ab世De", "ab世De", UNICODE, true); + assertStartsWith("ab世De", "AB世dE", UNICODE, false); + assertStartsWith("äbćδe", "äbćδe", UNICODE, true); + assertStartsWith("äbćδe", "ÄBcΔÉ", UNICODE, false); + assertStartsWith("ab世De", "ab世", UTF8_LCASE, true); + assertStartsWith("ab世De", "aB世", UTF8_LCASE, true); + assertStartsWith("äbćδe", "äbć", UTF8_LCASE, true); + assertStartsWith("äbćδe", "äBc", UTF8_LCASE, false); + assertStartsWith("ab世De", "ab世De", UNICODE_CI, true); + assertStartsWith("ab世De", "AB世dE", UNICODE_CI, true); + assertStartsWith("äbćδe", "ÄbćδE", UNICODE_CI, true); + assertStartsWith("äbćδe", "ÄBcΔÉ", UNICODE_CI, false); + assertStartsWith("Kelvin.", "Kelvin", UTF8_LCASE, true); + assertStartsWith("Kelvin.", "Kelvin", UTF8_LCASE, true); + assertStartsWith("KKelvin.", "KKelvin", UTF8_LCASE, true); + assertStartsWith("2 Kelvin.", "2 Kelvin", UTF8_LCASE, true); + assertStartsWith("2 Kelvin.", "2 Kelvin", UTF8_LCASE, true); + assertStartsWith("KKelvin.", "KKelvin,", UTF8_LCASE, false); assertStartsWith("Ћао", 
"Ца", "sr_Cyrl_CI_AI", false); assertStartsWith("Ћао", "ћа", "sr_Cyrl_CI_AI", true); assertStartsWith("Ćao", "Ca", "SR_CI", false); assertStartsWith("Ćao", "Ca", "SR_CI_AI", true); assertStartsWith("Ćao", "Ća", "SR", true); // Case variation. - assertStartsWith("aBcDe", "abc", "UTF8_BINARY", false); - assertStartsWith("aBcDe", "aBc", "UTF8_BINARY", true); - assertStartsWith("aBcDe", "abcde", "UNICODE", false); - assertStartsWith("aBcDe", "aBcDe", "UNICODE", true); - assertStartsWith("aBcDe", "abc", "UTF8_LCASE", true); - assertStartsWith("aBcDe", "ABC", "UTF8_LCASE", true); - assertStartsWith("aBcDe", "abcde", "UNICODE_CI", true); - assertStartsWith("aBcDe", "AbCdE", "UNICODE_CI", true); + assertStartsWith("aBcDe", "abc", UTF8_BINARY, false); + assertStartsWith("aBcDe", "aBc", UTF8_BINARY, true); + assertStartsWith("aBcDe", "abcde", UNICODE, false); + assertStartsWith("aBcDe", "aBcDe", UNICODE, true); + assertStartsWith("aBcDe", "abc", UTF8_LCASE, true); + assertStartsWith("aBcDe", "ABC", UTF8_LCASE, true); + assertStartsWith("aBcDe", "abcde", UNICODE_CI, true); + assertStartsWith("aBcDe", "AbCdE", UNICODE_CI, true); // Accent variation. 
- assertStartsWith("aBcDe", "abć", "UTF8_BINARY", false); - assertStartsWith("aBcDe", "aBć", "UTF8_BINARY", false); - assertStartsWith("aBcDe", "abćde", "UNICODE", false); - assertStartsWith("aBcDe", "aBćDe", "UNICODE", false); - assertStartsWith("aBcDe", "abć", "UTF8_LCASE", false); - assertStartsWith("aBcDe", "ABĆ", "UTF8_LCASE", false); - assertStartsWith("aBcDe", "abćde", "UNICODE_CI", false); - assertStartsWith("aBcDe", "AbĆdE", "UNICODE_CI", false); + assertStartsWith("aBcDe", "abć", UTF8_BINARY, false); + assertStartsWith("aBcDe", "aBć", UTF8_BINARY, false); + assertStartsWith("aBcDe", "abćde", UNICODE, false); + assertStartsWith("aBcDe", "aBćDe", UNICODE, false); + assertStartsWith("aBcDe", "abć", UTF8_LCASE, false); + assertStartsWith("aBcDe", "ABĆ", UTF8_LCASE, false); + assertStartsWith("aBcDe", "abćde", UNICODE_CI, false); + assertStartsWith("aBcDe", "AbĆdE", UNICODE_CI, false); // One-to-many case mapping (e.g. Turkish dotted I). - assertStartsWith("i\u0307", "i", "UNICODE_CI", false); - assertStartsWith("i\u0307", "İ", "UNICODE_CI", true); - assertStartsWith("İ", "i", "UNICODE_CI", false); - assertStartsWith("İİİ", "i̇i̇", "UNICODE_CI", true); - assertStartsWith("İİİ", "i̇i", "UNICODE_CI", false); - assertStartsWith("İi̇İ", "i̇İ", "UNICODE_CI", true); - assertStartsWith("i̇İi̇i̇", "İi̇İi", "UNICODE_CI", false); - assertStartsWith("i̇onic", "io", "UNICODE_CI", false); - assertStartsWith("i̇onic", "Io", "UNICODE_CI", false); - assertStartsWith("i̇onic", "i\u0307o", "UNICODE_CI", true); - assertStartsWith("i̇onic", "İo", "UNICODE_CI", true); - assertStartsWith("İonic", "io", "UNICODE_CI", false); - assertStartsWith("İonic", "Io", "UNICODE_CI", false); - assertStartsWith("İonic", "i\u0307o", "UNICODE_CI", true); - assertStartsWith("İonic", "İo", "UNICODE_CI", true); - assertStartsWith("i\u0307", "i", "UTF8_LCASE", true); // != UNICODE_CI - assertStartsWith("i\u0307", "İ", "UTF8_LCASE", true); - assertStartsWith("İ", "i", "UTF8_LCASE", false); - 
assertStartsWith("İİİ", "i̇i̇", "UTF8_LCASE", true); - assertStartsWith("İİİ", "i̇i", "UTF8_LCASE", false); - assertStartsWith("İi̇İ", "i̇İ", "UTF8_LCASE", true); - assertStartsWith("i̇İi̇i̇", "İi̇İi", "UTF8_LCASE", true); // != UNICODE_CI - assertStartsWith("i̇onic", "io", "UTF8_LCASE", false); - assertStartsWith("i̇onic", "Io", "UTF8_LCASE", false); - assertStartsWith("i̇onic", "i\u0307o", "UTF8_LCASE", true); - assertStartsWith("i̇onic", "İo", "UTF8_LCASE", true); - assertStartsWith("İonic", "io", "UTF8_LCASE", false); - assertStartsWith("İonic", "Io", "UTF8_LCASE", false); - assertStartsWith("İonic", "i\u0307o", "UTF8_LCASE", true); - assertStartsWith("İonic", "İo", "UTF8_LCASE", true); - assertStartsWith("oİ", "oİ", "UTF8_LCASE", true); - assertStartsWith("oİ", "oi̇", "UTF8_LCASE", true); + assertStartsWith("i\u0307", "i", UNICODE_CI, false); + assertStartsWith("i\u0307", "İ", UNICODE_CI, true); + assertStartsWith("İ", "i", UNICODE_CI, false); + assertStartsWith("İİİ", "i̇i̇", UNICODE_CI, true); + assertStartsWith("İİİ", "i̇i", UNICODE_CI, false); + assertStartsWith("İi̇İ", "i̇İ", UNICODE_CI, true); + assertStartsWith("i̇İi̇i̇", "İi̇İi", UNICODE_CI, false); + assertStartsWith("i̇onic", "io", UNICODE_CI, false); + assertStartsWith("i̇onic", "Io", UNICODE_CI, false); + assertStartsWith("i̇onic", "i\u0307o", UNICODE_CI, true); + assertStartsWith("i̇onic", "İo", UNICODE_CI, true); + assertStartsWith("İonic", "io", UNICODE_CI, false); + assertStartsWith("İonic", "Io", UNICODE_CI, false); + assertStartsWith("İonic", "i\u0307o", UNICODE_CI, true); + assertStartsWith("İonic", "İo", UNICODE_CI, true); + assertStartsWith("i\u0307", "i", UTF8_LCASE, true); // != UNICODE_CI + assertStartsWith("i\u0307", "İ", UTF8_LCASE, true); + assertStartsWith("İ", "i", UTF8_LCASE, false); + assertStartsWith("İİİ", "i̇i̇", UTF8_LCASE, true); + assertStartsWith("İİİ", "i̇i", UTF8_LCASE, false); + assertStartsWith("İi̇İ", "i̇İ", UTF8_LCASE, true); + assertStartsWith("i̇İi̇i̇", "İi̇İi", 
UTF8_LCASE, true); // != UNICODE_CI + assertStartsWith("i̇onic", "io", UTF8_LCASE, false); + assertStartsWith("i̇onic", "Io", UTF8_LCASE, false); + assertStartsWith("i̇onic", "i\u0307o", UTF8_LCASE, true); + assertStartsWith("i̇onic", "İo", UTF8_LCASE, true); + assertStartsWith("İonic", "io", UTF8_LCASE, false); + assertStartsWith("İonic", "Io", UTF8_LCASE, false); + assertStartsWith("İonic", "i\u0307o", UTF8_LCASE, true); + assertStartsWith("İonic", "İo", UTF8_LCASE, true); + assertStartsWith("oİ", "oİ", UTF8_LCASE, true); + assertStartsWith("oİ", "oi̇", UTF8_LCASE, true); // Conditional case mapping (e.g. Greek sigmas). - assertStartsWith("σ", "σ", "UTF8_BINARY", true); - assertStartsWith("σ", "ς", "UTF8_BINARY", false); - assertStartsWith("σ", "Σ", "UTF8_BINARY", false); - assertStartsWith("ς", "σ", "UTF8_BINARY", false); - assertStartsWith("ς", "ς", "UTF8_BINARY", true); - assertStartsWith("ς", "Σ", "UTF8_BINARY", false); - assertStartsWith("Σ", "σ", "UTF8_BINARY", false); - assertStartsWith("Σ", "ς", "UTF8_BINARY", false); - assertStartsWith("Σ", "Σ", "UTF8_BINARY", true); - assertStartsWith("σ", "σ", "UTF8_LCASE", true); - assertStartsWith("σ", "ς", "UTF8_LCASE", true); - assertStartsWith("σ", "Σ", "UTF8_LCASE", true); - assertStartsWith("ς", "σ", "UTF8_LCASE", true); - assertStartsWith("ς", "ς", "UTF8_LCASE", true); - assertStartsWith("ς", "Σ", "UTF8_LCASE", true); - assertStartsWith("Σ", "σ", "UTF8_LCASE", true); - assertStartsWith("Σ", "ς", "UTF8_LCASE", true); - assertStartsWith("Σ", "Σ", "UTF8_LCASE", true); - assertStartsWith("σ", "σ", "UNICODE", true); - assertStartsWith("σ", "ς", "UNICODE", false); - assertStartsWith("σ", "Σ", "UNICODE", false); - assertStartsWith("ς", "σ", "UNICODE", false); - assertStartsWith("ς", "ς", "UNICODE", true); - assertStartsWith("ς", "Σ", "UNICODE", false); - assertStartsWith("Σ", "σ", "UNICODE", false); - assertStartsWith("Σ", "ς", "UNICODE", false); - assertStartsWith("Σ", "Σ", "UNICODE", true); - assertStartsWith("σ", 
"σ", "UNICODE_CI", true); - assertStartsWith("σ", "ς", "UNICODE_CI", true); - assertStartsWith("σ", "Σ", "UNICODE_CI", true); - assertStartsWith("ς", "σ", "UNICODE_CI", true); - assertStartsWith("ς", "ς", "UNICODE_CI", true); - assertStartsWith("ς", "Σ", "UNICODE_CI", true); - assertStartsWith("Σ", "σ", "UNICODE_CI", true); - assertStartsWith("Σ", "ς", "UNICODE_CI", true); - assertStartsWith("Σ", "Σ", "UNICODE_CI", true); - assertStartsWith("ΣΑΛΑΤΑ", "Σ", "UTF8_BINARY", true); - assertStartsWith("ΣΑΛΑΤΑ", "σ", "UTF8_BINARY", false); - assertStartsWith("ΣΑΛΑΤΑ", "ς", "UTF8_BINARY", false); - assertStartsWith("ΘΑΛΑΣΣΙΝΟΣ", "Σ", "UTF8_BINARY", false); - assertStartsWith("ΘΑΛΑΣΣΙΝΟΣ", "σ", "UTF8_BINARY", false); - assertStartsWith("ΘΑΛΑΣΣΙΝΟΣ", "ς", "UTF8_BINARY", false); - assertStartsWith("ΣΑΛΑΤΑ", "Σ", "UTF8_LCASE", true); - assertStartsWith("ΣΑΛΑΤΑ", "σ", "UTF8_LCASE", true); - assertStartsWith("ΣΑΛΑΤΑ", "ς", "UTF8_LCASE", true); - assertStartsWith("ΘΑΛΑΣΣΙΝΟΣ", "Σ", "UTF8_LCASE", false); - assertStartsWith("ΘΑΛΑΣΣΙΝΟΣ", "σ", "UTF8_LCASE", false); - assertStartsWith("ΘΑΛΑΣΣΙΝΟΣ", "ς", "UTF8_LCASE", false); - assertStartsWith("ΣΑΛΑΤΑ", "Σ", "UNICODE", true); - assertStartsWith("ΣΑΛΑΤΑ", "σ", "UNICODE", false); - assertStartsWith("ΣΑΛΑΤΑ", "ς", "UNICODE", false); - assertStartsWith("ΘΑΛΑΣΣΙΝΟΣ", "Σ", "UNICODE", false); - assertStartsWith("ΘΑΛΑΣΣΙΝΟΣ", "σ", "UNICODE", false); - assertStartsWith("ΘΑΛΑΣΣΙΝΟΣ", "ς", "UNICODE", false); - assertStartsWith("ΣΑΛΑΤΑ", "Σ", "UNICODE_CI", true); - assertStartsWith("ΣΑΛΑΤΑ", "σ", "UNICODE_CI", true); - assertStartsWith("ΣΑΛΑΤΑ", "ς", "UNICODE_CI", true); - assertStartsWith("ΘΑΛΑΣΣΙΝΟΣ", "Σ", "UNICODE_CI", false); - assertStartsWith("ΘΑΛΑΣΣΙΝΟΣ", "σ", "UNICODE_CI", false); - assertStartsWith("ΘΑΛΑΣΣΙΝΟΣ", "ς", "UNICODE_CI", false); + assertStartsWith("σ", "σ", UTF8_BINARY, true); + assertStartsWith("σ", "ς", UTF8_BINARY, false); + assertStartsWith("σ", "Σ", UTF8_BINARY, false); + assertStartsWith("ς", "σ", UTF8_BINARY, false); + 
assertStartsWith("ς", "ς", UTF8_BINARY, true); + assertStartsWith("ς", "Σ", UTF8_BINARY, false); + assertStartsWith("Σ", "σ", UTF8_BINARY, false); + assertStartsWith("Σ", "ς", UTF8_BINARY, false); + assertStartsWith("Σ", "Σ", UTF8_BINARY, true); + assertStartsWith("σ", "σ", UTF8_LCASE, true); + assertStartsWith("σ", "ς", UTF8_LCASE, true); + assertStartsWith("σ", "Σ", UTF8_LCASE, true); + assertStartsWith("ς", "σ", UTF8_LCASE, true); + assertStartsWith("ς", "ς", UTF8_LCASE, true); + assertStartsWith("ς", "Σ", UTF8_LCASE, true); + assertStartsWith("Σ", "σ", UTF8_LCASE, true); + assertStartsWith("Σ", "ς", UTF8_LCASE, true); + assertStartsWith("Σ", "Σ", UTF8_LCASE, true); + assertStartsWith("σ", "σ", UNICODE, true); + assertStartsWith("σ", "ς", UNICODE, false); + assertStartsWith("σ", "Σ", UNICODE, false); + assertStartsWith("ς", "σ", UNICODE, false); + assertStartsWith("ς", "ς", UNICODE, true); + assertStartsWith("ς", "Σ", UNICODE, false); + assertStartsWith("Σ", "σ", UNICODE, false); + assertStartsWith("Σ", "ς", UNICODE, false); + assertStartsWith("Σ", "Σ", UNICODE, true); + assertStartsWith("σ", "σ", UNICODE_CI, true); + assertStartsWith("σ", "ς", UNICODE_CI, true); + assertStartsWith("σ", "Σ", UNICODE_CI, true); + assertStartsWith("ς", "σ", UNICODE_CI, true); + assertStartsWith("ς", "ς", UNICODE_CI, true); + assertStartsWith("ς", "Σ", UNICODE_CI, true); + assertStartsWith("Σ", "σ", UNICODE_CI, true); + assertStartsWith("Σ", "ς", UNICODE_CI, true); + assertStartsWith("Σ", "Σ", UNICODE_CI, true); + assertStartsWith("ΣΑΛΑΤΑ", "Σ", UTF8_BINARY, true); + assertStartsWith("ΣΑΛΑΤΑ", "σ", UTF8_BINARY, false); + assertStartsWith("ΣΑΛΑΤΑ", "ς", UTF8_BINARY, false); + assertStartsWith("ΘΑΛΑΣΣΙΝΟΣ", "Σ", UTF8_BINARY, false); + assertStartsWith("ΘΑΛΑΣΣΙΝΟΣ", "σ", UTF8_BINARY, false); + assertStartsWith("ΘΑΛΑΣΣΙΝΟΣ", "ς", UTF8_BINARY, false); + assertStartsWith("ΣΑΛΑΤΑ", "Σ", UTF8_LCASE, true); + assertStartsWith("ΣΑΛΑΤΑ", "σ", UTF8_LCASE, true); + assertStartsWith("ΣΑΛΑΤΑ", 
"ς", UTF8_LCASE, true); + assertStartsWith("ΘΑΛΑΣΣΙΝΟΣ", "Σ", UTF8_LCASE, false); + assertStartsWith("ΘΑΛΑΣΣΙΝΟΣ", "σ", UTF8_LCASE, false); + assertStartsWith("ΘΑΛΑΣΣΙΝΟΣ", "ς", UTF8_LCASE, false); + assertStartsWith("ΣΑΛΑΤΑ", "Σ", UNICODE, true); + assertStartsWith("ΣΑΛΑΤΑ", "σ", UNICODE, false); + assertStartsWith("ΣΑΛΑΤΑ", "ς", UNICODE, false); + assertStartsWith("ΘΑΛΑΣΣΙΝΟΣ", "Σ", UNICODE, false); + assertStartsWith("ΘΑΛΑΣΣΙΝΟΣ", "σ", UNICODE, false); + assertStartsWith("ΘΑΛΑΣΣΙΝΟΣ", "ς", UNICODE, false); + assertStartsWith("ΣΑΛΑΤΑ", "Σ", UNICODE_CI, true); + assertStartsWith("ΣΑΛΑΤΑ", "σ", UNICODE_CI, true); + assertStartsWith("ΣΑΛΑΤΑ", "ς", UNICODE_CI, true); + assertStartsWith("ΘΑΛΑΣΣΙΝΟΣ", "Σ", UNICODE_CI, false); + assertStartsWith("ΘΑΛΑΣΣΙΝΟΣ", "σ", UNICODE_CI, false); + assertStartsWith("ΘΑΛΑΣΣΙΝΟΣ", "ς", UNICODE_CI, false); // Surrogate pairs. - assertStartsWith("a🙃b🙃c", "x", "UTF8_BINARY", false); - assertStartsWith("a🙃b🙃c", "x", "UTF8_LCASE", false); - assertStartsWith("a🙃b🙃c", "x", "UNICODE", false); - assertStartsWith("a🙃b🙃c", "x", "UNICODE_CI", false); - assertStartsWith("a🙃b🙃c", "b", "UTF8_BINARY", false); - assertStartsWith("a🙃b🙃c", "b", "UTF8_LCASE", false); - assertStartsWith("a🙃b🙃c", "b", "UNICODE", false); - assertStartsWith("a🙃b🙃c", "b", "UNICODE_CI", false); - assertStartsWith("a🙃b🙃c", "a🙃b", "UTF8_BINARY", true); - assertStartsWith("a🙃b🙃c", "a🙃b", "UTF8_LCASE", true); - assertStartsWith("a🙃b🙃c", "a🙃b", "UNICODE", true); - assertStartsWith("a🙃b🙃c", "a🙃b", "UNICODE_CI", true); - assertStartsWith("a🙃b🙃c", "b🙃c", "UTF8_BINARY", false); - assertStartsWith("a🙃b🙃c", "b🙃c", "UTF8_LCASE", false); - assertStartsWith("a🙃b🙃c", "b🙃c", "UNICODE", false); - assertStartsWith("a🙃b🙃c", "b🙃c", "UNICODE_CI", false); - assertStartsWith("a🙃b🙃c", "a🙃b🙃c", "UTF8_BINARY", true); - assertStartsWith("a🙃b🙃c", "a🙃b🙃c", "UTF8_LCASE", true); - assertStartsWith("a🙃b🙃c", "a🙃b🙃c", "UNICODE", true); - assertStartsWith("a🙃b🙃c", "a🙃b🙃c", "UNICODE_CI", true); - 
assertStartsWith("😀😆😃😄", "😄😆", "UTF8_BINARY", false); - assertStartsWith("😀😆😃😄", "😄😆", "UTF8_LCASE", false); - assertStartsWith("😀😆😃😄", "😄😆", "UNICODE", false); - assertStartsWith("😀😆😃😄", "😄😆", "UNICODE_CI", false); - assertStartsWith("😀😆😃😄", "😆😃", "UTF8_BINARY", false); - assertStartsWith("😀😆😃😄", "😆😃", "UTF8_LCASE", false); - assertStartsWith("😀😆😃😄", "😆😃", "UNICODE", false); - assertStartsWith("😀😆😃😄", "😆😃", "UNICODE_CI", false); - assertStartsWith("😀😆😃😄", "😀😆", "UTF8_BINARY", true); - assertStartsWith("😀😆😃😄", "😀😆", "UTF8_LCASE", true); - assertStartsWith("😀😆😃😄", "😀😆", "UNICODE", true); - assertStartsWith("😀😆😃😄", "😀😆", "UNICODE_CI", true); - assertStartsWith("😀😆😃😄", "😃😄", "UTF8_BINARY", false); - assertStartsWith("😀😆😃😄", "😃😄", "UTF8_LCASE", false); - assertStartsWith("😀😆😃😄", "😃😄", "UNICODE", false); - assertStartsWith("😀😆😃😄", "😃😄", "UNICODE_CI", false); - assertStartsWith("😀😆😃😄", "😀😆😃😄", "UTF8_BINARY", true); - assertStartsWith("😀😆😃😄", "😀😆😃😄", "UTF8_LCASE", true); - assertStartsWith("😀😆😃😄", "😀😆😃😄", "UNICODE", true); - assertStartsWith("😀😆😃😄", "😀😆😃😄", "UNICODE_CI", true); - assertStartsWith("𐐅", "𐐅", "UTF8_BINARY", true); - assertStartsWith("𐐅", "𐐅", "UTF8_LCASE", true); - assertStartsWith("𐐅", "𐐅", "UNICODE", true); - assertStartsWith("𐐅", "𐐅", "UNICODE_CI", true); - assertStartsWith("𐐅", "𐐭", "UTF8_BINARY", false); - assertStartsWith("𐐅", "𐐭", "UTF8_LCASE", true); - assertStartsWith("𐐅", "𐐭", "UNICODE", false); - assertStartsWith("𐐅", "𐐭", "UNICODE_CI", true); - assertStartsWith("𝔸", "𝔸", "UTF8_BINARY", true); - assertStartsWith("𝔸", "𝔸", "UTF8_LCASE", true); - assertStartsWith("𝔸", "𝔸", "UNICODE", true); - assertStartsWith("𝔸", "𝔸", "UNICODE_CI", true); + assertStartsWith("a🙃b🙃c", "x", UTF8_BINARY, false); + assertStartsWith("a🙃b🙃c", "x", UTF8_LCASE, false); + assertStartsWith("a🙃b🙃c", "x", UNICODE, false); + assertStartsWith("a🙃b🙃c", "x", UNICODE_CI, false); + assertStartsWith("a🙃b🙃c", "b", UTF8_BINARY, false); + assertStartsWith("a🙃b🙃c", "b", UTF8_LCASE, 
false); + assertStartsWith("a🙃b🙃c", "b", UNICODE, false); + assertStartsWith("a🙃b🙃c", "b", UNICODE_CI, false); + assertStartsWith("a🙃b🙃c", "a🙃b", UTF8_BINARY, true); + assertStartsWith("a🙃b🙃c", "a🙃b", UTF8_LCASE, true); + assertStartsWith("a🙃b🙃c", "a🙃b", UNICODE, true); + assertStartsWith("a🙃b🙃c", "a🙃b", UNICODE_CI, true); + assertStartsWith("a🙃b🙃c", "b🙃c", UTF8_BINARY, false); + assertStartsWith("a🙃b🙃c", "b🙃c", UTF8_LCASE, false); + assertStartsWith("a🙃b🙃c", "b🙃c", UNICODE, false); + assertStartsWith("a🙃b🙃c", "b🙃c", UNICODE_CI, false); + assertStartsWith("a🙃b🙃c", "a🙃b🙃c", UTF8_BINARY, true); + assertStartsWith("a🙃b🙃c", "a🙃b🙃c", UTF8_LCASE, true); + assertStartsWith("a🙃b🙃c", "a🙃b🙃c", UNICODE, true); + assertStartsWith("a🙃b🙃c", "a🙃b🙃c", UNICODE_CI, true); + assertStartsWith("😀😆😃😄", "😄😆", UTF8_BINARY, false); + assertStartsWith("😀😆😃😄", "😄😆", UTF8_LCASE, false); + assertStartsWith("😀😆😃😄", "😄😆", UNICODE, false); + assertStartsWith("😀😆😃😄", "😄😆", UNICODE_CI, false); + assertStartsWith("😀😆😃😄", "😆😃", UTF8_BINARY, false); + assertStartsWith("😀😆😃😄", "😆😃", UTF8_LCASE, false); + assertStartsWith("😀😆😃😄", "😆😃", UNICODE, false); + assertStartsWith("😀😆😃😄", "😆😃", UNICODE_CI, false); + assertStartsWith("😀😆😃😄", "😀😆", UTF8_BINARY, true); + assertStartsWith("😀😆😃😄", "😀😆", UTF8_LCASE, true); + assertStartsWith("😀😆😃😄", "😀😆", UNICODE, true); + assertStartsWith("😀😆😃😄", "😀😆", UNICODE_CI, true); + assertStartsWith("😀😆😃😄", "😃😄", UTF8_BINARY, false); + assertStartsWith("😀😆😃😄", "😃😄", UTF8_LCASE, false); + assertStartsWith("😀😆😃😄", "😃😄", UNICODE, false); + assertStartsWith("😀😆😃😄", "😃😄", UNICODE_CI, false); + assertStartsWith("😀😆😃😄", "😀😆😃😄", UTF8_BINARY, true); + assertStartsWith("😀😆😃😄", "😀😆😃😄", UTF8_LCASE, true); + assertStartsWith("😀😆😃😄", "😀😆😃😄", UNICODE, true); + assertStartsWith("😀😆😃😄", "😀😆😃😄", UNICODE_CI, true); + assertStartsWith("𐐅", "𐐅", UTF8_BINARY, true); + assertStartsWith("𐐅", "𐐅", UTF8_LCASE, true); + assertStartsWith("𐐅", "𐐅", UNICODE, true); + assertStartsWith("𐐅", "𐐅", UNICODE_CI, 
true); + assertStartsWith("𐐅", "𐐭", UTF8_BINARY, false); + assertStartsWith("𐐅", "𐐭", UTF8_LCASE, true); + assertStartsWith("𐐅", "𐐭", UNICODE, false); + assertStartsWith("𐐅", "𐐭", UNICODE_CI, true); + assertStartsWith("𝔸", "𝔸", UTF8_BINARY, true); + assertStartsWith("𝔸", "𝔸", UTF8_LCASE, true); + assertStartsWith("𝔸", "𝔸", UNICODE, true); + assertStartsWith("𝔸", "𝔸", UNICODE_CI, true); } /** @@ -806,212 +807,212 @@ public void testEndsWith() throws SparkException { assertEndsWith("Здраво", "Здраво", collationName, true); } // Advanced tests. - assertEndsWith("abcde", "cde", "UTF8_BINARY", true); - assertEndsWith("abcde", "bde", "UTF8_BINARY", false); - assertEndsWith("abcde", "fgh", "UTF8_BINARY", false); - assertEndsWith("abcde", "abcde", "UNICODE", true); - assertEndsWith("abcde", "aBcDe", "UNICODE", false); - assertEndsWith("abcde", "fghij", "UNICODE", false); - assertEndsWith("abcde", "E", "UTF8_LCASE", true); - assertEndsWith("abcde", "AbCdE", "UTF8_LCASE", true); - assertEndsWith("abcde", "X", "UTF8_LCASE", false); - assertEndsWith("abcde", "e", "UNICODE_CI", true); - assertEndsWith("abcde", "CDe", "UNICODE_CI", true); - assertEndsWith("abcde", "bcd", "UNICODE_CI", false); - assertEndsWith("abcde", "123", "UNICODE_CI", false); - assertEndsWith("ab世De", "世De", "UTF8_BINARY", true); - assertEndsWith("ab世De", "世dE", "UTF8_BINARY", false); - assertEndsWith("äbćδe", "ćδe", "UTF8_BINARY", true); - assertEndsWith("äbćδe", "cΔé", "UTF8_BINARY", false); - assertEndsWith("ab世De", "ab世De", "UNICODE", true); - assertEndsWith("ab世De", "AB世dE", "UNICODE", false); - assertEndsWith("äbćδe", "äbćδe", "UNICODE", true); - assertEndsWith("äbćδe", "ÄBcΔÉ", "UNICODE", false); - assertEndsWith("ab世De", "世De", "UTF8_LCASE", true); - assertEndsWith("ab世De", "世dE", "UTF8_LCASE", true); - assertEndsWith("äbćδe", "ćδe", "UTF8_LCASE", true); - assertEndsWith("äbćδe", "cδE", "UTF8_LCASE", false); - assertEndsWith("ab世De", "ab世De", "UNICODE_CI", true); - assertEndsWith("ab世De", "AB世dE", 
"UNICODE_CI", true); - assertEndsWith("äbćδe", "ÄbćδE", "UNICODE_CI", true); - assertEndsWith("äbćδe", "ÄBcΔÉ", "UNICODE_CI", false); - assertEndsWith("The Kelvin", "Kelvin", "UTF8_LCASE", true); - assertEndsWith("The Kelvin", "Kelvin", "UTF8_LCASE", true); - assertEndsWith("The KKelvin", "KKelvin", "UTF8_LCASE", true); - assertEndsWith("The 2 Kelvin", "2 Kelvin", "UTF8_LCASE", true); - assertEndsWith("The 2 Kelvin", "2 Kelvin", "UTF8_LCASE", true); - assertEndsWith("The KKelvin", "KKelvin,", "UTF8_LCASE", false); + assertEndsWith("abcde", "cde", UTF8_BINARY, true); + assertEndsWith("abcde", "bde", UTF8_BINARY, false); + assertEndsWith("abcde", "fgh", UTF8_BINARY, false); + assertEndsWith("abcde", "abcde", UNICODE, true); + assertEndsWith("abcde", "aBcDe", UNICODE, false); + assertEndsWith("abcde", "fghij", UNICODE, false); + assertEndsWith("abcde", "E", UTF8_LCASE, true); + assertEndsWith("abcde", "AbCdE", UTF8_LCASE, true); + assertEndsWith("abcde", "X", UTF8_LCASE, false); + assertEndsWith("abcde", "e", UNICODE_CI, true); + assertEndsWith("abcde", "CDe", UNICODE_CI, true); + assertEndsWith("abcde", "bcd", UNICODE_CI, false); + assertEndsWith("abcde", "123", UNICODE_CI, false); + assertEndsWith("ab世De", "世De", UTF8_BINARY, true); + assertEndsWith("ab世De", "世dE", UTF8_BINARY, false); + assertEndsWith("äbćδe", "ćδe", UTF8_BINARY, true); + assertEndsWith("äbćδe", "cΔé", UTF8_BINARY, false); + assertEndsWith("ab世De", "ab世De", UNICODE, true); + assertEndsWith("ab世De", "AB世dE", UNICODE, false); + assertEndsWith("äbćδe", "äbćδe", UNICODE, true); + assertEndsWith("äbćδe", "ÄBcΔÉ", UNICODE, false); + assertEndsWith("ab世De", "世De", UTF8_LCASE, true); + assertEndsWith("ab世De", "世dE", UTF8_LCASE, true); + assertEndsWith("äbćδe", "ćδe", UTF8_LCASE, true); + assertEndsWith("äbćδe", "cδE", UTF8_LCASE, false); + assertEndsWith("ab世De", "ab世De", UNICODE_CI, true); + assertEndsWith("ab世De", "AB世dE", UNICODE_CI, true); + assertEndsWith("äbćδe", "ÄbćδE", UNICODE_CI, true); + 
assertEndsWith("äbćδe", "ÄBcΔÉ", UNICODE_CI, false); + assertEndsWith("The Kelvin", "Kelvin", UTF8_LCASE, true); + assertEndsWith("The Kelvin", "Kelvin", UTF8_LCASE, true); + assertEndsWith("The KKelvin", "KKelvin", UTF8_LCASE, true); + assertEndsWith("The 2 Kelvin", "2 Kelvin", UTF8_LCASE, true); + assertEndsWith("The 2 Kelvin", "2 Kelvin", UTF8_LCASE, true); + assertEndsWith("The KKelvin", "KKelvin,", UTF8_LCASE, false); assertEndsWith("Ћевапчићи", "цици", "sr_Cyrl_CI_AI", false); assertEndsWith("Ћевапчићи", "чИЋи", "sr_Cyrl_CI_AI", true); assertEndsWith("Ćevapčići", "cici", "SR_CI", false); assertEndsWith("Ćevapčići", "cici", "SR_CI_AI", true); assertEndsWith("Ćevapčići", "čići", "SR", true); // Case variation. - assertEndsWith("aBcDe", "cde", "UTF8_BINARY", false); - assertEndsWith("aBcDe", "cDe", "UTF8_BINARY", true); - assertEndsWith("aBcDe", "abcde", "UNICODE", false); - assertEndsWith("aBcDe", "aBcDe", "UNICODE", true); - assertEndsWith("aBcDe", "cde", "UTF8_LCASE", true); - assertEndsWith("aBcDe", "CDE", "UTF8_LCASE", true); - assertEndsWith("aBcDe", "abcde", "UNICODE_CI", true); - assertEndsWith("aBcDe", "AbCdE", "UNICODE_CI", true); + assertEndsWith("aBcDe", "cde", UTF8_BINARY, false); + assertEndsWith("aBcDe", "cDe", UTF8_BINARY, true); + assertEndsWith("aBcDe", "abcde", UNICODE, false); + assertEndsWith("aBcDe", "aBcDe", UNICODE, true); + assertEndsWith("aBcDe", "cde", UTF8_LCASE, true); + assertEndsWith("aBcDe", "CDE", UTF8_LCASE, true); + assertEndsWith("aBcDe", "abcde", UNICODE_CI, true); + assertEndsWith("aBcDe", "AbCdE", UNICODE_CI, true); // Accent variation. 
- assertEndsWith("aBcDe", "ćde", "UTF8_BINARY", false); - assertEndsWith("aBcDe", "ćDe", "UTF8_BINARY", false); - assertEndsWith("aBcDe", "abćde", "UNICODE", false); - assertEndsWith("aBcDe", "aBćDe", "UNICODE", false); - assertEndsWith("aBcDe", "ćde", "UTF8_LCASE", false); - assertEndsWith("aBcDe", "ĆDE", "UTF8_LCASE", false); - assertEndsWith("aBcDe", "abćde", "UNICODE_CI", false); - assertEndsWith("aBcDe", "AbĆdE", "UNICODE_CI", false); + assertEndsWith("aBcDe", "ćde", UTF8_BINARY, false); + assertEndsWith("aBcDe", "ćDe", UTF8_BINARY, false); + assertEndsWith("aBcDe", "abćde", UNICODE, false); + assertEndsWith("aBcDe", "aBćDe", UNICODE, false); + assertEndsWith("aBcDe", "ćde", UTF8_LCASE, false); + assertEndsWith("aBcDe", "ĆDE", UTF8_LCASE, false); + assertEndsWith("aBcDe", "abćde", UNICODE_CI, false); + assertEndsWith("aBcDe", "AbĆdE", UNICODE_CI, false); // One-to-many case mapping (e.g. Turkish dotted I). - assertEndsWith("i\u0307", "\u0307", "UNICODE_CI", false); - assertEndsWith("i\u0307", "İ", "UNICODE_CI", true); - assertEndsWith("İ", "i", "UNICODE_CI", false); - assertEndsWith("İİİ", "i̇i̇", "UNICODE_CI", true); - assertEndsWith("İİİ", "ii̇", "UNICODE_CI", false); - assertEndsWith("İi̇İ", "İi̇", "UNICODE_CI", true); - assertEndsWith("i̇İi̇i̇", "\u0307İi̇İ", "UNICODE_CI", false); - assertEndsWith("the i\u0307o", "io", "UNICODE_CI", false); - assertEndsWith("the i\u0307o", "Io", "UNICODE_CI", false); - assertEndsWith("the i\u0307o", "i\u0307o", "UNICODE_CI", true); - assertEndsWith("the i\u0307o", "İo", "UNICODE_CI", true); - assertEndsWith("the İo", "io", "UNICODE_CI", false); - assertEndsWith("the İo", "Io", "UNICODE_CI", false); - assertEndsWith("the İo", "i\u0307o", "UNICODE_CI", true); - assertEndsWith("the İo", "İo", "UNICODE_CI", true); - assertEndsWith("i\u0307", "\u0307", "UTF8_LCASE", true); // != UNICODE_CI - assertEndsWith("i\u0307", "İ", "UTF8_LCASE", true); - assertEndsWith("İ", "\u0307", "UTF8_LCASE", false); - assertEndsWith("İİİ", "i̇i̇", 
"UTF8_LCASE", true); - assertEndsWith("İİİ", "ii̇", "UTF8_LCASE", false); - assertEndsWith("İi̇İ", "İi̇", "UTF8_LCASE", true); - assertEndsWith("i̇İi̇i̇", "\u0307İi̇İ", "UTF8_LCASE", true); // != UNICODE_CI - assertEndsWith("i̇İi̇i̇", "\u0307İİ", "UTF8_LCASE", false); - assertEndsWith("the i\u0307o", "io", "UTF8_LCASE", false); - assertEndsWith("the i\u0307o", "Io", "UTF8_LCASE", false); - assertEndsWith("the i\u0307o", "i\u0307o", "UTF8_LCASE", true); - assertEndsWith("the i\u0307o", "İo", "UTF8_LCASE", true); - assertEndsWith("the İo", "io", "UTF8_LCASE", false); - assertEndsWith("the İo", "Io", "UTF8_LCASE", false); - assertEndsWith("the İo", "i\u0307o", "UTF8_LCASE", true); - assertEndsWith("the İo", "İo", "UTF8_LCASE", true); - assertEndsWith("İo", "İo", "UTF8_LCASE", true); - assertEndsWith("İo", "i̇o", "UTF8_LCASE", true); + assertEndsWith("i\u0307", "\u0307", UNICODE_CI, false); + assertEndsWith("i\u0307", "İ", UNICODE_CI, true); + assertEndsWith("İ", "i", UNICODE_CI, false); + assertEndsWith("İİİ", "i̇i̇", UNICODE_CI, true); + assertEndsWith("İİİ", "ii̇", UNICODE_CI, false); + assertEndsWith("İi̇İ", "İi̇", UNICODE_CI, true); + assertEndsWith("i̇İi̇i̇", "\u0307İi̇İ", UNICODE_CI, false); + assertEndsWith("the i\u0307o", "io", UNICODE_CI, false); + assertEndsWith("the i\u0307o", "Io", UNICODE_CI, false); + assertEndsWith("the i\u0307o", "i\u0307o", UNICODE_CI, true); + assertEndsWith("the i\u0307o", "İo", UNICODE_CI, true); + assertEndsWith("the İo", "io", UNICODE_CI, false); + assertEndsWith("the İo", "Io", UNICODE_CI, false); + assertEndsWith("the İo", "i\u0307o", UNICODE_CI, true); + assertEndsWith("the İo", "İo", UNICODE_CI, true); + assertEndsWith("i\u0307", "\u0307", UTF8_LCASE, true); // != UNICODE_CI + assertEndsWith("i\u0307", "İ", UTF8_LCASE, true); + assertEndsWith("İ", "\u0307", UTF8_LCASE, false); + assertEndsWith("İİİ", "i̇i̇", UTF8_LCASE, true); + assertEndsWith("İİİ", "ii̇", UTF8_LCASE, false); + assertEndsWith("İi̇İ", "İi̇", UTF8_LCASE, 
true); + assertEndsWith("i̇İi̇i̇", "\u0307İi̇İ", UTF8_LCASE, true); // != UNICODE_CI + assertEndsWith("i̇İi̇i̇", "\u0307İİ", UTF8_LCASE, false); + assertEndsWith("the i\u0307o", "io", UTF8_LCASE, false); + assertEndsWith("the i\u0307o", "Io", UTF8_LCASE, false); + assertEndsWith("the i\u0307o", "i\u0307o", UTF8_LCASE, true); + assertEndsWith("the i\u0307o", "İo", UTF8_LCASE, true); + assertEndsWith("the İo", "io", UTF8_LCASE, false); + assertEndsWith("the İo", "Io", UTF8_LCASE, false); + assertEndsWith("the İo", "i\u0307o", UTF8_LCASE, true); + assertEndsWith("the İo", "İo", UTF8_LCASE, true); + assertEndsWith("İo", "İo", UTF8_LCASE, true); + assertEndsWith("İo", "i̇o", UTF8_LCASE, true); // Conditional case mapping (e.g. Greek sigmas). - assertEndsWith("σ", "σ", "UTF8_BINARY", true); - assertEndsWith("σ", "ς", "UTF8_BINARY", false); - assertEndsWith("σ", "Σ", "UTF8_BINARY", false); - assertEndsWith("ς", "σ", "UTF8_BINARY", false); - assertEndsWith("ς", "ς", "UTF8_BINARY", true); - assertEndsWith("ς", "Σ", "UTF8_BINARY", false); - assertEndsWith("Σ", "σ", "UTF8_BINARY", false); - assertEndsWith("Σ", "ς", "UTF8_BINARY", false); - assertEndsWith("Σ", "Σ", "UTF8_BINARY", true); - assertEndsWith("σ", "σ", "UTF8_LCASE", true); - assertEndsWith("σ", "ς", "UTF8_LCASE", true); - assertEndsWith("σ", "Σ", "UTF8_LCASE", true); - assertEndsWith("ς", "σ", "UTF8_LCASE", true); - assertEndsWith("ς", "ς", "UTF8_LCASE", true); - assertEndsWith("ς", "Σ", "UTF8_LCASE", true); - assertEndsWith("Σ", "σ", "UTF8_LCASE", true); - assertEndsWith("Σ", "ς", "UTF8_LCASE", true); - assertEndsWith("Σ", "Σ", "UTF8_LCASE", true); - assertEndsWith("σ", "σ", "UNICODE", true); - assertEndsWith("σ", "ς", "UNICODE", false); - assertEndsWith("σ", "Σ", "UNICODE", false); - assertEndsWith("ς", "σ", "UNICODE", false); - assertEndsWith("ς", "ς", "UNICODE", true); - assertEndsWith("ς", "Σ", "UNICODE", false); - assertEndsWith("Σ", "σ", "UNICODE", false); - assertEndsWith("Σ", "ς", "UNICODE", false); - 
assertEndsWith("Σ", "Σ", "UNICODE", true); - assertEndsWith("σ", "σ", "UNICODE_CI", true); - assertEndsWith("σ", "ς", "UNICODE_CI", true); - assertEndsWith("σ", "Σ", "UNICODE_CI", true); - assertEndsWith("ς", "σ", "UNICODE_CI", true); - assertEndsWith("ς", "ς", "UNICODE_CI", true); - assertEndsWith("ς", "Σ", "UNICODE_CI", true); - assertEndsWith("Σ", "σ", "UNICODE_CI", true); - assertEndsWith("Σ", "ς", "UNICODE_CI", true); - assertEndsWith("Σ", "Σ", "UNICODE_CI", true); - assertEndsWith("ΣΑΛΑΤΑ", "Σ", "UTF8_BINARY", false); - assertEndsWith("ΣΑΛΑΤΑ", "σ", "UTF8_BINARY", false); - assertEndsWith("ΣΑΛΑΤΑ", "ς", "UTF8_BINARY", false); - assertEndsWith("ΘΑΛΑΣΣΙΝΟΣ", "Σ", "UTF8_BINARY", true); - assertEndsWith("ΘΑΛΑΣΣΙΝΟΣ", "σ", "UTF8_BINARY", false); - assertEndsWith("ΘΑΛΑΣΣΙΝΟΣ", "ς", "UTF8_BINARY", false); - assertEndsWith("ΣΑΛΑΤΑ", "Σ", "UTF8_LCASE", false); - assertEndsWith("ΣΑΛΑΤΑ", "σ", "UTF8_LCASE", false); - assertEndsWith("ΣΑΛΑΤΑ", "ς", "UTF8_LCASE", false); - assertEndsWith("ΘΑΛΑΣΣΙΝΟΣ", "Σ", "UTF8_LCASE", true); - assertEndsWith("ΘΑΛΑΣΣΙΝΟΣ", "σ", "UTF8_LCASE", true); - assertEndsWith("ΘΑΛΑΣΣΙΝΟΣ", "ς", "UTF8_LCASE", true); - assertEndsWith("ΣΑΛΑΤΑ", "Σ", "UNICODE", false); - assertEndsWith("ΣΑΛΑΤΑ", "σ", "UNICODE", false); - assertEndsWith("ΣΑΛΑΤΑ", "ς", "UNICODE", false); - assertEndsWith("ΘΑΛΑΣΣΙΝΟΣ", "Σ", "UNICODE", true); - assertEndsWith("ΘΑΛΑΣΣΙΝΟΣ", "σ", "UNICODE", false); - assertEndsWith("ΘΑΛΑΣΣΙΝΟΣ", "ς", "UNICODE", false); - assertEndsWith("ΣΑΛΑΤΑ", "Σ", "UNICODE_CI", false); - assertEndsWith("ΣΑΛΑΤΑ", "σ", "UNICODE_CI", false); - assertEndsWith("ΣΑΛΑΤΑ", "ς", "UNICODE_CI", false); - assertEndsWith("ΘΑΛΑΣΣΙΝΟΣ", "Σ", "UNICODE_CI", true); - assertEndsWith("ΘΑΛΑΣΣΙΝΟΣ", "σ", "UNICODE_CI", true); - assertEndsWith("ΘΑΛΑΣΣΙΝΟΣ", "ς", "UNICODE_CI", true); + assertEndsWith("σ", "σ", UTF8_BINARY, true); + assertEndsWith("σ", "ς", UTF8_BINARY, false); + assertEndsWith("σ", "Σ", UTF8_BINARY, false); + assertEndsWith("ς", "σ", UTF8_BINARY, false); + 
assertEndsWith("ς", "ς", UTF8_BINARY, true); + assertEndsWith("ς", "Σ", UTF8_BINARY, false); + assertEndsWith("Σ", "σ", UTF8_BINARY, false); + assertEndsWith("Σ", "ς", UTF8_BINARY, false); + assertEndsWith("Σ", "Σ", UTF8_BINARY, true); + assertEndsWith("σ", "σ", UTF8_LCASE, true); + assertEndsWith("σ", "ς", UTF8_LCASE, true); + assertEndsWith("σ", "Σ", UTF8_LCASE, true); + assertEndsWith("ς", "σ", UTF8_LCASE, true); + assertEndsWith("ς", "ς", UTF8_LCASE, true); + assertEndsWith("ς", "Σ", UTF8_LCASE, true); + assertEndsWith("Σ", "σ", UTF8_LCASE, true); + assertEndsWith("Σ", "ς", UTF8_LCASE, true); + assertEndsWith("Σ", "Σ", UTF8_LCASE, true); + assertEndsWith("σ", "σ", UNICODE, true); + assertEndsWith("σ", "ς", UNICODE, false); + assertEndsWith("σ", "Σ", UNICODE, false); + assertEndsWith("ς", "σ", UNICODE, false); + assertEndsWith("ς", "ς", UNICODE, true); + assertEndsWith("ς", "Σ", UNICODE, false); + assertEndsWith("Σ", "σ", UNICODE, false); + assertEndsWith("Σ", "ς", UNICODE, false); + assertEndsWith("Σ", "Σ", UNICODE, true); + assertEndsWith("σ", "σ", UNICODE_CI, true); + assertEndsWith("σ", "ς", UNICODE_CI, true); + assertEndsWith("σ", "Σ", UNICODE_CI, true); + assertEndsWith("ς", "σ", UNICODE_CI, true); + assertEndsWith("ς", "ς", UNICODE_CI, true); + assertEndsWith("ς", "Σ", UNICODE_CI, true); + assertEndsWith("Σ", "σ", UNICODE_CI, true); + assertEndsWith("Σ", "ς", UNICODE_CI, true); + assertEndsWith("Σ", "Σ", UNICODE_CI, true); + assertEndsWith("ΣΑΛΑΤΑ", "Σ", UTF8_BINARY, false); + assertEndsWith("ΣΑΛΑΤΑ", "σ", UTF8_BINARY, false); + assertEndsWith("ΣΑΛΑΤΑ", "ς", UTF8_BINARY, false); + assertEndsWith("ΘΑΛΑΣΣΙΝΟΣ", "Σ", UTF8_BINARY, true); + assertEndsWith("ΘΑΛΑΣΣΙΝΟΣ", "σ", UTF8_BINARY, false); + assertEndsWith("ΘΑΛΑΣΣΙΝΟΣ", "ς", UTF8_BINARY, false); + assertEndsWith("ΣΑΛΑΤΑ", "Σ", UTF8_LCASE, false); + assertEndsWith("ΣΑΛΑΤΑ", "σ", UTF8_LCASE, false); + assertEndsWith("ΣΑΛΑΤΑ", "ς", UTF8_LCASE, false); + assertEndsWith("ΘΑΛΑΣΣΙΝΟΣ", "Σ", UTF8_LCASE, true); + 
assertEndsWith("ΘΑΛΑΣΣΙΝΟΣ", "σ", UTF8_LCASE, true); + assertEndsWith("ΘΑΛΑΣΣΙΝΟΣ", "ς", UTF8_LCASE, true); + assertEndsWith("ΣΑΛΑΤΑ", "Σ", UNICODE, false); + assertEndsWith("ΣΑΛΑΤΑ", "σ", UNICODE, false); + assertEndsWith("ΣΑΛΑΤΑ", "ς", UNICODE, false); + assertEndsWith("ΘΑΛΑΣΣΙΝΟΣ", "Σ", UNICODE, true); + assertEndsWith("ΘΑΛΑΣΣΙΝΟΣ", "σ", UNICODE, false); + assertEndsWith("ΘΑΛΑΣΣΙΝΟΣ", "ς", UNICODE, false); + assertEndsWith("ΣΑΛΑΤΑ", "Σ", UNICODE_CI, false); + assertEndsWith("ΣΑΛΑΤΑ", "σ", UNICODE_CI, false); + assertEndsWith("ΣΑΛΑΤΑ", "ς", UNICODE_CI, false); + assertEndsWith("ΘΑΛΑΣΣΙΝΟΣ", "Σ", UNICODE_CI, true); + assertEndsWith("ΘΑΛΑΣΣΙΝΟΣ", "σ", UNICODE_CI, true); + assertEndsWith("ΘΑΛΑΣΣΙΝΟΣ", "ς", UNICODE_CI, true); // Surrogate pairs. - assertEndsWith("a🙃b🙃c", "x", "UTF8_BINARY", false); - assertEndsWith("a🙃b🙃c", "x", "UTF8_LCASE", false); - assertEndsWith("a🙃b🙃c", "x", "UNICODE", false); - assertEndsWith("a🙃b🙃c", "x", "UNICODE_CI", false); - assertEndsWith("a🙃b🙃c", "b", "UTF8_BINARY", false); - assertEndsWith("a🙃b🙃c", "b", "UTF8_LCASE", false); - assertEndsWith("a🙃b🙃c", "b", "UNICODE", false); - assertEndsWith("a🙃b🙃c", "b", "UNICODE_CI", false); - assertEndsWith("a🙃b🙃c", "a🙃b", "UTF8_BINARY", false); - assertEndsWith("a🙃b🙃c", "a🙃b", "UTF8_LCASE", false); - assertEndsWith("a🙃b🙃c", "a🙃b", "UNICODE", false); - assertEndsWith("a🙃b🙃c", "a🙃b", "UNICODE_CI", false); - assertEndsWith("a🙃b🙃c", "b🙃c", "UTF8_BINARY", true); - assertEndsWith("a🙃b🙃c", "b🙃c", "UTF8_LCASE", true); - assertEndsWith("a🙃b🙃c", "b🙃c", "UNICODE", true); - assertEndsWith("a🙃b🙃c", "b🙃c", "UNICODE_CI", true); - assertEndsWith("a🙃b🙃c", "a🙃b🙃c", "UTF8_BINARY", true); - assertEndsWith("a🙃b🙃c", "a🙃b🙃c", "UTF8_LCASE", true); - assertEndsWith("a🙃b🙃c", "a🙃b🙃c", "UNICODE", true); - assertEndsWith("a🙃b🙃c", "a🙃b🙃c", "UNICODE_CI", true); - assertEndsWith("😀😆😃😄", "😄😆", "UTF8_BINARY", false); - assertEndsWith("😀😆😃😄", "😄😆", "UTF8_LCASE", false); - assertEndsWith("😀😆😃😄", "😄😆", "UNICODE", false); - 
assertEndsWith("😀😆😃😄", "😄😆", "UNICODE_CI", false); - assertEndsWith("😀😆😃😄", "😆😃", "UTF8_BINARY", false); - assertEndsWith("😀😆😃😄", "😆😃", "UTF8_LCASE", false); - assertEndsWith("😀😆😃😄", "😆😃", "UNICODE", false); - assertEndsWith("😀😆😃😄", "😆😃", "UNICODE_CI", false); - assertEndsWith("😀😆😃😄", "😀😆", "UTF8_BINARY", false); - assertEndsWith("😀😆😃😄", "😀😆", "UTF8_LCASE", false); - assertEndsWith("😀😆😃😄", "😀😆", "UNICODE", false); - assertEndsWith("😀😆😃😄", "😀😆", "UNICODE_CI", false); - assertEndsWith("😀😆😃😄", "😃😄", "UTF8_BINARY", true); - assertEndsWith("😀😆😃😄", "😃😄", "UTF8_LCASE", true); - assertEndsWith("😀😆😃😄", "😃😄", "UNICODE", true); - assertEndsWith("😀😆😃😄", "😃😄", "UNICODE_CI", true); - assertEndsWith("😀😆😃😄", "😀😆😃😄", "UTF8_BINARY", true); - assertEndsWith("😀😆😃😄", "😀😆😃😄", "UTF8_LCASE", true); - assertEndsWith("😀😆😃😄", "😀😆😃😄", "UNICODE", true); - assertEndsWith("😀😆😃😄", "😀😆😃😄", "UNICODE_CI", true); - assertEndsWith("𐐅", "𐐅", "UTF8_BINARY", true); - assertEndsWith("𐐅", "𐐅", "UTF8_LCASE", true); - assertEndsWith("𐐅", "𐐅", "UNICODE", true); - assertEndsWith("𐐅", "𐐅", "UNICODE_CI", true); - assertEndsWith("𐐅", "𐐭", "UTF8_BINARY", false); - assertEndsWith("𐐅", "𐐭", "UTF8_LCASE", true); - assertEndsWith("𐐅", "𐐭", "UNICODE", false); - assertEndsWith("𐐅", "𐐭", "UNICODE_CI", true); - assertEndsWith("𝔸", "𝔸", "UTF8_BINARY", true); - assertEndsWith("𝔸", "𝔸", "UTF8_LCASE", true); - assertEndsWith("𝔸", "𝔸", "UNICODE", true); - assertEndsWith("𝔸", "𝔸", "UNICODE_CI", true); + assertEndsWith("a🙃b🙃c", "x", UTF8_BINARY, false); + assertEndsWith("a🙃b🙃c", "x", UTF8_LCASE, false); + assertEndsWith("a🙃b🙃c", "x", UNICODE, false); + assertEndsWith("a🙃b🙃c", "x", UNICODE_CI, false); + assertEndsWith("a🙃b🙃c", "b", UTF8_BINARY, false); + assertEndsWith("a🙃b🙃c", "b", UTF8_LCASE, false); + assertEndsWith("a🙃b🙃c", "b", UNICODE, false); + assertEndsWith("a🙃b🙃c", "b", UNICODE_CI, false); + assertEndsWith("a🙃b🙃c", "a🙃b", UTF8_BINARY, false); + assertEndsWith("a🙃b🙃c", "a🙃b", UTF8_LCASE, false); + assertEndsWith("a🙃b🙃c", 
"a🙃b", UNICODE, false); + assertEndsWith("a🙃b🙃c", "a🙃b", UNICODE_CI, false); + assertEndsWith("a🙃b🙃c", "b🙃c", UTF8_BINARY, true); + assertEndsWith("a🙃b🙃c", "b🙃c", UTF8_LCASE, true); + assertEndsWith("a🙃b🙃c", "b🙃c", UNICODE, true); + assertEndsWith("a🙃b🙃c", "b🙃c", UNICODE_CI, true); + assertEndsWith("a🙃b🙃c", "a🙃b🙃c", UTF8_BINARY, true); + assertEndsWith("a🙃b🙃c", "a🙃b🙃c", UTF8_LCASE, true); + assertEndsWith("a🙃b🙃c", "a🙃b🙃c", UNICODE, true); + assertEndsWith("a🙃b🙃c", "a🙃b🙃c", UNICODE_CI, true); + assertEndsWith("😀😆😃😄", "😄😆", UTF8_BINARY, false); + assertEndsWith("😀😆😃😄", "😄😆", UTF8_LCASE, false); + assertEndsWith("😀😆😃😄", "😄😆", UNICODE, false); + assertEndsWith("😀😆😃😄", "😄😆", UNICODE_CI, false); + assertEndsWith("😀😆😃😄", "😆😃", UTF8_BINARY, false); + assertEndsWith("😀😆😃😄", "😆😃", UTF8_LCASE, false); + assertEndsWith("😀😆😃😄", "😆😃", UNICODE, false); + assertEndsWith("😀😆😃😄", "😆😃", UNICODE_CI, false); + assertEndsWith("😀😆😃😄", "😀😆", UTF8_BINARY, false); + assertEndsWith("😀😆😃😄", "😀😆", UTF8_LCASE, false); + assertEndsWith("😀😆😃😄", "😀😆", UNICODE, false); + assertEndsWith("😀😆😃😄", "😀😆", UNICODE_CI, false); + assertEndsWith("😀😆😃😄", "😃😄", UTF8_BINARY, true); + assertEndsWith("😀😆😃😄", "😃😄", UTF8_LCASE, true); + assertEndsWith("😀😆😃😄", "😃😄", UNICODE, true); + assertEndsWith("😀😆😃😄", "😃😄", UNICODE_CI, true); + assertEndsWith("😀😆😃😄", "😀😆😃😄", UTF8_BINARY, true); + assertEndsWith("😀😆😃😄", "😀😆😃😄", UTF8_LCASE, true); + assertEndsWith("😀😆😃😄", "😀😆😃😄", UNICODE, true); + assertEndsWith("😀😆😃😄", "😀😆😃😄", UNICODE_CI, true); + assertEndsWith("𐐅", "𐐅", UTF8_BINARY, true); + assertEndsWith("𐐅", "𐐅", UTF8_LCASE, true); + assertEndsWith("𐐅", "𐐅", UNICODE, true); + assertEndsWith("𐐅", "𐐅", UNICODE_CI, true); + assertEndsWith("𐐅", "𐐭", UTF8_BINARY, false); + assertEndsWith("𐐅", "𐐭", UTF8_LCASE, true); + assertEndsWith("𐐅", "𐐭", UNICODE, false); + assertEndsWith("𐐅", "𐐭", UNICODE_CI, true); + assertEndsWith("𝔸", "𝔸", UTF8_BINARY, true); + assertEndsWith("𝔸", "𝔸", UTF8_LCASE, true); + assertEndsWith("𝔸", "𝔸", 
UNICODE, true); + assertEndsWith("𝔸", "𝔸", UNICODE_CI, true); } /** @@ -1057,158 +1058,158 @@ public void testStringSplitSQL() throws SparkException { var array_AOB = new UTF8String[] { UTF8String.fromString("A𐐅B") }; var array_AoB = new UTF8String[] { UTF8String.fromString("A𐐭B") }; // Empty strings. - assertStringSplitSQL("", "", "UTF8_BINARY", empty_match); - assertStringSplitSQL("abc", "", "UTF8_BINARY", array_abc); - assertStringSplitSQL("", "abc", "UTF8_BINARY", empty_match); - assertStringSplitSQL("", "", "UNICODE", empty_match); - assertStringSplitSQL("abc", "", "UNICODE", array_abc); - assertStringSplitSQL("", "abc", "UNICODE", empty_match); - assertStringSplitSQL("", "", "UTF8_LCASE", empty_match); - assertStringSplitSQL("abc", "", "UTF8_LCASE", array_abc); - assertStringSplitSQL("", "abc", "UTF8_LCASE", empty_match); - assertStringSplitSQL("", "", "UNICODE_CI", empty_match); - assertStringSplitSQL("abc", "", "UNICODE_CI", array_abc); - assertStringSplitSQL("", "abc", "UNICODE_CI", empty_match); + assertStringSplitSQL("", "", UTF8_BINARY, empty_match); + assertStringSplitSQL("abc", "", UTF8_BINARY, array_abc); + assertStringSplitSQL("", "abc", UTF8_BINARY, empty_match); + assertStringSplitSQL("", "", UNICODE, empty_match); + assertStringSplitSQL("abc", "", UNICODE, array_abc); + assertStringSplitSQL("", "abc", UNICODE, empty_match); + assertStringSplitSQL("", "", UTF8_LCASE, empty_match); + assertStringSplitSQL("abc", "", UTF8_LCASE, array_abc); + assertStringSplitSQL("", "abc", UTF8_LCASE, empty_match); + assertStringSplitSQL("", "", UNICODE_CI, empty_match); + assertStringSplitSQL("abc", "", UNICODE_CI, array_abc); + assertStringSplitSQL("", "abc", UNICODE_CI, empty_match); // Basic tests. 
- assertStringSplitSQL("1a2", "a", "UTF8_BINARY", array_1_2); - assertStringSplitSQL("1a2", "A", "UTF8_BINARY", array_1a2); - assertStringSplitSQL("1a2", "b", "UTF8_BINARY", array_1a2); - assertStringSplitSQL("1a2", "1a2", "UNICODE", full_match); - assertStringSplitSQL("1a2", "1A2", "UNICODE", array_1a2); - assertStringSplitSQL("1a2", "3b4", "UNICODE", array_1a2); - assertStringSplitSQL("1a2", "A", "UTF8_LCASE", array_1_2); - assertStringSplitSQL("1a2", "1A2", "UTF8_LCASE", full_match); - assertStringSplitSQL("1a2", "X", "UTF8_LCASE", array_1a2); - assertStringSplitSQL("1a2", "a", "UNICODE_CI", array_1_2); - assertStringSplitSQL("1a2", "A", "UNICODE_CI", array_1_2); - assertStringSplitSQL("1a2", "1A2", "UNICODE_CI", full_match); - assertStringSplitSQL("1a2", "123", "UNICODE_CI", array_1a2); + assertStringSplitSQL("1a2", "a", UTF8_BINARY, array_1_2); + assertStringSplitSQL("1a2", "A", UTF8_BINARY, array_1a2); + assertStringSplitSQL("1a2", "b", UTF8_BINARY, array_1a2); + assertStringSplitSQL("1a2", "1a2", UNICODE, full_match); + assertStringSplitSQL("1a2", "1A2", UNICODE, array_1a2); + assertStringSplitSQL("1a2", "3b4", UNICODE, array_1a2); + assertStringSplitSQL("1a2", "A", UTF8_LCASE, array_1_2); + assertStringSplitSQL("1a2", "1A2", UTF8_LCASE, full_match); + assertStringSplitSQL("1a2", "X", UTF8_LCASE, array_1a2); + assertStringSplitSQL("1a2", "a", UNICODE_CI, array_1_2); + assertStringSplitSQL("1a2", "A", UNICODE_CI, array_1_2); + assertStringSplitSQL("1a2", "1A2", UNICODE_CI, full_match); + assertStringSplitSQL("1a2", "123", UNICODE_CI, array_1a2); // Advanced tests. 
- assertStringSplitSQL("äb世De", "b世D", "UTF8_BINARY", array_a_e); - assertStringSplitSQL("äb世De", "B世d", "UTF8_BINARY", array_special); - assertStringSplitSQL("äbćδe", "bćδ", "UTF8_BINARY", array_a_e); - assertStringSplitSQL("äbćδe", "BcΔ", "UTF8_BINARY", array_abcde); - assertStringSplitSQL("äb世De", "äb世De", "UNICODE", full_match); - assertStringSplitSQL("äb世De", "äB世de", "UNICODE", array_special); - assertStringSplitSQL("äbćδe", "äbćδe", "UNICODE", full_match); - assertStringSplitSQL("äbćδe", "ÄBcΔÉ", "UNICODE", array_abcde); - assertStringSplitSQL("äb世De", "b世D", "UTF8_LCASE", array_a_e); - assertStringSplitSQL("äb世De", "B世d", "UTF8_LCASE", array_a_e); - assertStringSplitSQL("äbćδe", "bćδ", "UTF8_LCASE", array_a_e); - assertStringSplitSQL("äbćδe", "BcΔ", "UTF8_LCASE", array_abcde); - assertStringSplitSQL("äb世De", "ab世De", "UNICODE_CI", array_special); - assertStringSplitSQL("äb世De", "AB世dE", "UNICODE_CI", array_special); - assertStringSplitSQL("äbćδe", "ÄbćδE", "UNICODE_CI", full_match); - assertStringSplitSQL("äbćδe", "ÄBcΔÉ", "UNICODE_CI", array_abcde); + assertStringSplitSQL("äb世De", "b世D", UTF8_BINARY, array_a_e); + assertStringSplitSQL("äb世De", "B世d", UTF8_BINARY, array_special); + assertStringSplitSQL("äbćδe", "bćδ", UTF8_BINARY, array_a_e); + assertStringSplitSQL("äbćδe", "BcΔ", UTF8_BINARY, array_abcde); + assertStringSplitSQL("äb世De", "äb世De", UNICODE, full_match); + assertStringSplitSQL("äb世De", "äB世de", UNICODE, array_special); + assertStringSplitSQL("äbćδe", "äbćδe", UNICODE, full_match); + assertStringSplitSQL("äbćδe", "ÄBcΔÉ", UNICODE, array_abcde); + assertStringSplitSQL("äb世De", "b世D", UTF8_LCASE, array_a_e); + assertStringSplitSQL("äb世De", "B世d", UTF8_LCASE, array_a_e); + assertStringSplitSQL("äbćδe", "bćδ", UTF8_LCASE, array_a_e); + assertStringSplitSQL("äbćδe", "BcΔ", UTF8_LCASE, array_abcde); + assertStringSplitSQL("äb世De", "ab世De", UNICODE_CI, array_special); + assertStringSplitSQL("äb世De", "AB世dE", UNICODE_CI, array_special); + 
assertStringSplitSQL("äbćδe", "ÄbćδE", UNICODE_CI, full_match); + assertStringSplitSQL("äbćδe", "ÄBcΔÉ", UNICODE_CI, array_abcde); // Case variation. - assertStringSplitSQL("AaXbB", "x", "UTF8_BINARY", array_AaXbB); - assertStringSplitSQL("AaXbB", "X", "UTF8_BINARY", array_Aa_bB); - assertStringSplitSQL("AaXbB", "axb", "UNICODE", array_AaXbB); - assertStringSplitSQL("AaXbB", "aXb", "UNICODE", array_A_B); - assertStringSplitSQL("AaXbB", "axb", "UTF8_LCASE", array_A_B); - assertStringSplitSQL("AaXbB", "AXB", "UTF8_LCASE", array_A_B); - assertStringSplitSQL("AaXbB", "axb", "UNICODE_CI", array_A_B); - assertStringSplitSQL("AaXbB", "AxB", "UNICODE_CI", array_A_B); + assertStringSplitSQL("AaXbB", "x", UTF8_BINARY, array_AaXbB); + assertStringSplitSQL("AaXbB", "X", UTF8_BINARY, array_Aa_bB); + assertStringSplitSQL("AaXbB", "axb", UNICODE, array_AaXbB); + assertStringSplitSQL("AaXbB", "aXb", UNICODE, array_A_B); + assertStringSplitSQL("AaXbB", "axb", UTF8_LCASE, array_A_B); + assertStringSplitSQL("AaXbB", "AXB", UTF8_LCASE, array_A_B); + assertStringSplitSQL("AaXbB", "axb", UNICODE_CI, array_A_B); + assertStringSplitSQL("AaXbB", "AxB", UNICODE_CI, array_A_B); // Accent variation. 
- assertStringSplitSQL("aBcDe", "bćd", "UTF8_BINARY", array_aBcDe); - assertStringSplitSQL("aBcDe", "BćD", "UTF8_BINARY", array_aBcDe); - assertStringSplitSQL("aBcDe", "abćde", "UNICODE", array_aBcDe); - assertStringSplitSQL("aBcDe", "aBćDe", "UNICODE", array_aBcDe); - assertStringSplitSQL("aBcDe", "bćd", "UTF8_LCASE", array_aBcDe); - assertStringSplitSQL("aBcDe", "BĆD", "UTF8_LCASE", array_aBcDe); - assertStringSplitSQL("aBcDe", "abćde", "UNICODE_CI", array_aBcDe); - assertStringSplitSQL("aBcDe", "AbĆdE", "UNICODE_CI", array_aBcDe); + assertStringSplitSQL("aBcDe", "bćd", UTF8_BINARY, array_aBcDe); + assertStringSplitSQL("aBcDe", "BćD", UTF8_BINARY, array_aBcDe); + assertStringSplitSQL("aBcDe", "abćde", UNICODE, array_aBcDe); + assertStringSplitSQL("aBcDe", "aBćDe", UNICODE, array_aBcDe); + assertStringSplitSQL("aBcDe", "bćd", UTF8_LCASE, array_aBcDe); + assertStringSplitSQL("aBcDe", "BĆD", UTF8_LCASE, array_aBcDe); + assertStringSplitSQL("aBcDe", "abćde", UNICODE_CI, array_aBcDe); + assertStringSplitSQL("aBcDe", "AbĆdE", UNICODE_CI, array_aBcDe); // One-to-many case mapping (e.g. Turkish dotted I). 
- assertStringSplitSQL("İ", "i", "UTF8_BINARY", array_Turkish_uppercase_dotted_I); - assertStringSplitSQL("İ", "i", "UTF8_LCASE", array_Turkish_uppercase_dotted_I); - assertStringSplitSQL("İ", "i", "UNICODE", array_Turkish_uppercase_dotted_I); - assertStringSplitSQL("İ", "i", "UNICODE_CI", array_Turkish_uppercase_dotted_I); - assertStringSplitSQL("İ", "\u0307", "UTF8_BINARY", array_Turkish_uppercase_dotted_I); - assertStringSplitSQL("İ", "\u0307", "UTF8_LCASE", array_Turkish_uppercase_dotted_I); - assertStringSplitSQL("İ", "\u0307", "UNICODE", array_Turkish_uppercase_dotted_I); - assertStringSplitSQL("İ", "\u0307", "UNICODE_CI", array_Turkish_uppercase_dotted_I); - assertStringSplitSQL("i\u0307", "i", "UTF8_BINARY", array_dot); - assertStringSplitSQL("i\u0307", "i", "UTF8_LCASE", array_dot); - assertStringSplitSQL("i\u0307", "i", "UNICODE", array_Turkish_lowercase_dotted_i); - assertStringSplitSQL("i\u0307", "i", "UNICODE_CI", array_Turkish_lowercase_dotted_i); - assertStringSplitSQL("i\u0307", "\u0307", "UTF8_BINARY", array_i); - assertStringSplitSQL("i\u0307", "\u0307", "UTF8_LCASE", array_i); - assertStringSplitSQL("i\u0307", "\u0307", "UNICODE", array_Turkish_lowercase_dotted_i); - assertStringSplitSQL("i\u0307", "\u0307", "UNICODE_CI", array_Turkish_lowercase_dotted_i); - assertStringSplitSQL("AİB", "İ", "UTF8_BINARY", array_A_B); - assertStringSplitSQL("AİB", "İ", "UTF8_LCASE", array_A_B); - assertStringSplitSQL("AİB", "İ", "UNICODE", array_A_B); - assertStringSplitSQL("AİB", "İ", "UNICODE_CI", array_A_B); - assertStringSplitSQL("AİB", "i\u0307", "UTF8_BINARY", array_AIB); - assertStringSplitSQL("AİB", "i\u0307", "UTF8_LCASE", array_A_B); - assertStringSplitSQL("AİB", "i\u0307", "UNICODE", array_AIB); - assertStringSplitSQL("AİB", "i\u0307", "UNICODE_CI", array_A_B); - assertStringSplitSQL("Ai\u0307B", "İ", "UTF8_BINARY", array_AiB); - assertStringSplitSQL("Ai\u0307B", "İ", "UTF8_LCASE", array_A_B); - assertStringSplitSQL("Ai\u0307B", "İ", "UNICODE", 
array_AiB); - assertStringSplitSQL("Ai\u0307B", "İ", "UNICODE_CI", array_A_B); - assertStringSplitSQL("Ai\u0307B", "i\u0307", "UTF8_BINARY", array_A_B); - assertStringSplitSQL("Ai\u0307B", "i\u0307", "UTF8_LCASE", array_A_B); - assertStringSplitSQL("Ai\u0307B", "i\u0307", "UNICODE", array_A_B); - assertStringSplitSQL("Ai\u0307B", "i\u0307", "UNICODE_CI", array_A_B); + assertStringSplitSQL("İ", "i", UTF8_BINARY, array_Turkish_uppercase_dotted_I); + assertStringSplitSQL("İ", "i", UTF8_LCASE, array_Turkish_uppercase_dotted_I); + assertStringSplitSQL("İ", "i", UNICODE, array_Turkish_uppercase_dotted_I); + assertStringSplitSQL("İ", "i", UNICODE_CI, array_Turkish_uppercase_dotted_I); + assertStringSplitSQL("İ", "\u0307", UTF8_BINARY, array_Turkish_uppercase_dotted_I); + assertStringSplitSQL("İ", "\u0307", UTF8_LCASE, array_Turkish_uppercase_dotted_I); + assertStringSplitSQL("İ", "\u0307", UNICODE, array_Turkish_uppercase_dotted_I); + assertStringSplitSQL("İ", "\u0307", UNICODE_CI, array_Turkish_uppercase_dotted_I); + assertStringSplitSQL("i\u0307", "i", UTF8_BINARY, array_dot); + assertStringSplitSQL("i\u0307", "i", UTF8_LCASE, array_dot); + assertStringSplitSQL("i\u0307", "i", UNICODE, array_Turkish_lowercase_dotted_i); + assertStringSplitSQL("i\u0307", "i", UNICODE_CI, array_Turkish_lowercase_dotted_i); + assertStringSplitSQL("i\u0307", "\u0307", UTF8_BINARY, array_i); + assertStringSplitSQL("i\u0307", "\u0307", UTF8_LCASE, array_i); + assertStringSplitSQL("i\u0307", "\u0307", UNICODE, array_Turkish_lowercase_dotted_i); + assertStringSplitSQL("i\u0307", "\u0307", UNICODE_CI, array_Turkish_lowercase_dotted_i); + assertStringSplitSQL("AİB", "İ", UTF8_BINARY, array_A_B); + assertStringSplitSQL("AİB", "İ", UTF8_LCASE, array_A_B); + assertStringSplitSQL("AİB", "İ", UNICODE, array_A_B); + assertStringSplitSQL("AİB", "İ", UNICODE_CI, array_A_B); + assertStringSplitSQL("AİB", "i\u0307", UTF8_BINARY, array_AIB); + assertStringSplitSQL("AİB", "i\u0307", UTF8_LCASE, array_A_B); + 
assertStringSplitSQL("AİB", "i\u0307", UNICODE, array_AIB); + assertStringSplitSQL("AİB", "i\u0307", UNICODE_CI, array_A_B); + assertStringSplitSQL("Ai\u0307B", "İ", UTF8_BINARY, array_AiB); + assertStringSplitSQL("Ai\u0307B", "İ", UTF8_LCASE, array_A_B); + assertStringSplitSQL("Ai\u0307B", "İ", UNICODE, array_AiB); + assertStringSplitSQL("Ai\u0307B", "İ", UNICODE_CI, array_A_B); + assertStringSplitSQL("Ai\u0307B", "i\u0307", UTF8_BINARY, array_A_B); + assertStringSplitSQL("Ai\u0307B", "i\u0307", UTF8_LCASE, array_A_B); + assertStringSplitSQL("Ai\u0307B", "i\u0307", UNICODE, array_A_B); + assertStringSplitSQL("Ai\u0307B", "i\u0307", UNICODE_CI, array_A_B); // Conditional case mapping (e.g. Greek sigmas). - assertStringSplitSQL("σ", "σ", "UTF8_BINARY", full_match); - assertStringSplitSQL("σ", "σ", "UTF8_LCASE", full_match); - assertStringSplitSQL("σ", "σ", "UNICODE", full_match); - assertStringSplitSQL("σ", "σ", "UNICODE_CI", full_match); - assertStringSplitSQL("σ", "ς", "UTF8_BINARY", array_small_nonfinal_sigma); - assertStringSplitSQL("σ", "ς", "UTF8_LCASE", full_match); - assertStringSplitSQL("σ", "ς", "UNICODE", array_small_nonfinal_sigma); - assertStringSplitSQL("σ", "ς", "UNICODE_CI", full_match); - assertStringSplitSQL("σ", "Σ", "UTF8_BINARY", array_small_nonfinal_sigma); - assertStringSplitSQL("σ", "Σ", "UTF8_LCASE", full_match); - assertStringSplitSQL("σ", "Σ", "UNICODE", array_small_nonfinal_sigma); - assertStringSplitSQL("σ", "Σ", "UNICODE_CI", full_match); - assertStringSplitSQL("ς", "σ", "UTF8_BINARY", array_small_final_sigma); - assertStringSplitSQL("ς", "σ", "UTF8_LCASE", full_match); - assertStringSplitSQL("ς", "σ", "UNICODE", array_small_final_sigma); - assertStringSplitSQL("ς", "σ", "UNICODE_CI", full_match); - assertStringSplitSQL("ς", "ς", "UTF8_BINARY", full_match); - assertStringSplitSQL("ς", "ς", "UTF8_LCASE", full_match); - assertStringSplitSQL("ς", "ς", "UNICODE", full_match); - assertStringSplitSQL("ς", "ς", "UNICODE_CI", full_match); - 
assertStringSplitSQL("ς", "Σ", "UTF8_BINARY", array_small_final_sigma); - assertStringSplitSQL("ς", "Σ", "UTF8_LCASE", full_match); - assertStringSplitSQL("ς", "Σ", "UNICODE", array_small_final_sigma); - assertStringSplitSQL("ς", "Σ", "UNICODE_CI", full_match); - assertStringSplitSQL("Σ", "σ", "UTF8_BINARY", array_capital_sigma); - assertStringSplitSQL("Σ", "σ", "UTF8_LCASE", full_match); - assertStringSplitSQL("Σ", "σ", "UNICODE", array_capital_sigma); - assertStringSplitSQL("Σ", "σ", "UNICODE_CI", full_match); - assertStringSplitSQL("Σ", "ς", "UTF8_BINARY", array_capital_sigma); - assertStringSplitSQL("Σ", "ς", "UTF8_LCASE", full_match); - assertStringSplitSQL("Σ", "ς", "UNICODE", array_capital_sigma); - assertStringSplitSQL("Σ", "ς", "UNICODE_CI", full_match); - assertStringSplitSQL("Σ", "Σ", "UTF8_BINARY", full_match); - assertStringSplitSQL("Σ", "Σ", "UTF8_LCASE", full_match); - assertStringSplitSQL("Σ", "Σ", "UNICODE", full_match); - assertStringSplitSQL("Σ", "Σ", "UNICODE_CI", full_match); + assertStringSplitSQL("σ", "σ", UTF8_BINARY, full_match); + assertStringSplitSQL("σ", "σ", UTF8_LCASE, full_match); + assertStringSplitSQL("σ", "σ", UNICODE, full_match); + assertStringSplitSQL("σ", "σ", UNICODE_CI, full_match); + assertStringSplitSQL("σ", "ς", UTF8_BINARY, array_small_nonfinal_sigma); + assertStringSplitSQL("σ", "ς", UTF8_LCASE, full_match); + assertStringSplitSQL("σ", "ς", UNICODE, array_small_nonfinal_sigma); + assertStringSplitSQL("σ", "ς", UNICODE_CI, full_match); + assertStringSplitSQL("σ", "Σ", UTF8_BINARY, array_small_nonfinal_sigma); + assertStringSplitSQL("σ", "Σ", UTF8_LCASE, full_match); + assertStringSplitSQL("σ", "Σ", UNICODE, array_small_nonfinal_sigma); + assertStringSplitSQL("σ", "Σ", UNICODE_CI, full_match); + assertStringSplitSQL("ς", "σ", UTF8_BINARY, array_small_final_sigma); + assertStringSplitSQL("ς", "σ", UTF8_LCASE, full_match); + assertStringSplitSQL("ς", "σ", UNICODE, array_small_final_sigma); + assertStringSplitSQL("ς", "σ", 
UNICODE_CI, full_match); + assertStringSplitSQL("ς", "ς", UTF8_BINARY, full_match); + assertStringSplitSQL("ς", "ς", UTF8_LCASE, full_match); + assertStringSplitSQL("ς", "ς", UNICODE, full_match); + assertStringSplitSQL("ς", "ς", UNICODE_CI, full_match); + assertStringSplitSQL("ς", "Σ", UTF8_BINARY, array_small_final_sigma); + assertStringSplitSQL("ς", "Σ", UTF8_LCASE, full_match); + assertStringSplitSQL("ς", "Σ", UNICODE, array_small_final_sigma); + assertStringSplitSQL("ς", "Σ", UNICODE_CI, full_match); + assertStringSplitSQL("Σ", "σ", UTF8_BINARY, array_capital_sigma); + assertStringSplitSQL("Σ", "σ", UTF8_LCASE, full_match); + assertStringSplitSQL("Σ", "σ", UNICODE, array_capital_sigma); + assertStringSplitSQL("Σ", "σ", UNICODE_CI, full_match); + assertStringSplitSQL("Σ", "ς", UTF8_BINARY, array_capital_sigma); + assertStringSplitSQL("Σ", "ς", UTF8_LCASE, full_match); + assertStringSplitSQL("Σ", "ς", UNICODE, array_capital_sigma); + assertStringSplitSQL("Σ", "ς", UNICODE_CI, full_match); + assertStringSplitSQL("Σ", "Σ", UTF8_BINARY, full_match); + assertStringSplitSQL("Σ", "Σ", UTF8_LCASE, full_match); + assertStringSplitSQL("Σ", "Σ", UNICODE, full_match); + assertStringSplitSQL("Σ", "Σ", UNICODE_CI, full_match); // Surrogate pairs. 
- assertStringSplitSQL("a🙃b🙃c", "🙃", "UTF8_BINARY", array_a_b_c); - assertStringSplitSQL("a🙃b🙃c", "🙃", "UTF8_LCASE", array_a_b_c); - assertStringSplitSQL("a🙃b🙃c", "🙃", "UNICODE", array_a_b_c); - assertStringSplitSQL("a🙃b🙃c", "🙃", "UNICODE_CI", array_a_b_c); - assertStringSplitSQL("😀😆😃😄", "😆😃", "UTF8_BINARY", array_emojis); - assertStringSplitSQL("😀😆😃😄", "😆😃", "UTF8_LCASE", array_emojis); - assertStringSplitSQL("😀😆😃😄", "😆😃", "UNICODE", array_emojis); - assertStringSplitSQL("😀😆😃😄", "😆😃", "UNICODE_CI", array_emojis); - assertStringSplitSQL("A𐐅B", "𐐅", "UTF8_BINARY", array_A_B); - assertStringSplitSQL("A𐐅B", "𐐅", "UTF8_LCASE", array_A_B); - assertStringSplitSQL("A𐐅B", "𐐅", "UNICODE", array_A_B); - assertStringSplitSQL("A𐐅B", "𐐅", "UNICODE_CI", array_A_B); - assertStringSplitSQL("A𐐅B", "𐐭", "UTF8_BINARY", array_AOB); - assertStringSplitSQL("A𐐅B", "𐐭", "UTF8_LCASE", array_A_B); - assertStringSplitSQL("A𐐅B", "𐐭", "UNICODE", array_AOB); - assertStringSplitSQL("A𐐅B", "𐐭", "UNICODE_CI", array_A_B); - assertStringSplitSQL("A𐐭B", "𐐅", "UTF8_BINARY", array_AoB); - assertStringSplitSQL("A𐐭B", "𐐅", "UTF8_LCASE", array_A_B); - assertStringSplitSQL("A𐐭B", "𐐅", "UNICODE", array_AoB); - assertStringSplitSQL("A𐐭B", "𐐅", "UNICODE_CI", array_A_B); + assertStringSplitSQL("a🙃b🙃c", "🙃", UTF8_BINARY, array_a_b_c); + assertStringSplitSQL("a🙃b🙃c", "🙃", UTF8_LCASE, array_a_b_c); + assertStringSplitSQL("a🙃b🙃c", "🙃", UNICODE, array_a_b_c); + assertStringSplitSQL("a🙃b🙃c", "🙃", UNICODE_CI, array_a_b_c); + assertStringSplitSQL("😀😆😃😄", "😆😃", UTF8_BINARY, array_emojis); + assertStringSplitSQL("😀😆😃😄", "😆😃", UTF8_LCASE, array_emojis); + assertStringSplitSQL("😀😆😃😄", "😆😃", UNICODE, array_emojis); + assertStringSplitSQL("😀😆😃😄", "😆😃", UNICODE_CI, array_emojis); + assertStringSplitSQL("A𐐅B", "𐐅", UTF8_BINARY, array_A_B); + assertStringSplitSQL("A𐐅B", "𐐅", UTF8_LCASE, array_A_B); + assertStringSplitSQL("A𐐅B", "𐐅", UNICODE, array_A_B); + assertStringSplitSQL("A𐐅B", "𐐅", UNICODE_CI, array_A_B); + 
assertStringSplitSQL("A𐐅B", "𐐭", UTF8_BINARY, array_AOB); + assertStringSplitSQL("A𐐅B", "𐐭", UTF8_LCASE, array_A_B); + assertStringSplitSQL("A𐐅B", "𐐭", UNICODE, array_AOB); + assertStringSplitSQL("A𐐅B", "𐐭", UNICODE_CI, array_A_B); + assertStringSplitSQL("A𐐭B", "𐐅", UTF8_BINARY, array_AoB); + assertStringSplitSQL("A𐐭B", "𐐅", UTF8_LCASE, array_A_B); + assertStringSplitSQL("A𐐭B", "𐐅", UNICODE, array_AoB); + assertStringSplitSQL("A𐐭B", "𐐅", UNICODE_CI, array_A_B); } /** @@ -1391,156 +1392,156 @@ public void testInitCap() throws SparkException { assertInitCap("θαλασσινος", collationName, "Θαλασσινος"); } // Advanced tests. - assertInitCap("aBćDe", "UTF8_BINARY", "Abćde"); - assertInitCap("aBćDe", "UTF8_LCASE", "Abćde"); - assertInitCap("aBćDe", "UNICODE", "Abćde"); - assertInitCap("aBćDe", "UNICODE_CI", "Abćde"); - assertInitCap("ab世De", "UTF8_BINARY", "Ab世de"); - assertInitCap("ab世De", "UTF8_LCASE", "Ab世De"); - assertInitCap("ab世De", "UNICODE", "Ab世De"); - assertInitCap("ab世De", "UNICODE_CI", "Ab世De"); - assertInitCap("äbćδe", "UTF8_BINARY", "Äbćδe"); - assertInitCap("äbćδe", "UTF8_LCASE", "Äbćδe"); - assertInitCap("äbćδe", "UNICODE", "Äbćδe"); - assertInitCap("äbćδe", "UNICODE_CI", "Äbćδe"); - assertInitCap("ÄBĆΔE", "UTF8_BINARY", "Äbćδe"); - assertInitCap("ÄBĆΔE", "UTF8_LCASE", "Äbćδe"); - assertInitCap("ÄBĆΔE", "UNICODE", "Äbćδe"); - assertInitCap("ÄBĆΔE", "UNICODE_CI", "Äbćδe"); + assertInitCap("aBćDe", UTF8_BINARY, "Abćde"); + assertInitCap("aBćDe", UTF8_LCASE, "Abćde"); + assertInitCap("aBćDe", UNICODE, "Abćde"); + assertInitCap("aBćDe", UNICODE_CI, "Abćde"); + assertInitCap("ab世De", UTF8_BINARY, "Ab世de"); + assertInitCap("ab世De", UTF8_LCASE, "Ab世De"); + assertInitCap("ab世De", UNICODE, "Ab世De"); + assertInitCap("ab世De", UNICODE_CI, "Ab世De"); + assertInitCap("äbćδe", UTF8_BINARY, "Äbćδe"); + assertInitCap("äbćδe", UTF8_LCASE, "Äbćδe"); + assertInitCap("äbćδe", UNICODE, "Äbćδe"); + assertInitCap("äbćδe", UNICODE_CI, "Äbćδe"); + assertInitCap("ÄBĆΔE", UTF8_BINARY, 
"Äbćδe"); + assertInitCap("ÄBĆΔE", UTF8_LCASE, "Äbćδe"); + assertInitCap("ÄBĆΔE", UNICODE, "Äbćδe"); + assertInitCap("ÄBĆΔE", UNICODE_CI, "Äbćδe"); assertInitCap("êéfgh", "AF_CI_AI", "Êéfgh"); assertInitCap("öoAÄ", "DE_CI_AI", "Öoaä"); // Case-variable character length - assertInitCap("İo", "UTF8_BINARY", "İo", "I\u0307o"); - assertInitCap("İo", "UTF8_LCASE", "İo"); - assertInitCap("İo", "UNICODE", "İo"); - assertInitCap("İo", "UNICODE_CI", "İo"); - assertInitCap("i\u0307o", "UTF8_BINARY", "I\u0307o"); - assertInitCap("i\u0307o", "UTF8_LCASE", "I\u0307o"); - assertInitCap("i\u0307o", "UNICODE", "I\u0307o"); - assertInitCap("i\u0307o", "UNICODE_CI", "I\u0307o"); + assertInitCap("İo", UTF8_BINARY, "İo", "I\u0307o"); + assertInitCap("İo", UTF8_LCASE, "İo"); + assertInitCap("İo", UNICODE, "İo"); + assertInitCap("İo", UNICODE_CI, "İo"); + assertInitCap("i\u0307o", UTF8_BINARY, "I\u0307o"); + assertInitCap("i\u0307o", UTF8_LCASE, "I\u0307o"); + assertInitCap("i\u0307o", UNICODE, "I\u0307o"); + assertInitCap("i\u0307o", UNICODE_CI, "I\u0307o"); // Different possible word boundaries - assertInitCap("aB 世 de", "UTF8_BINARY", "Ab 世 De"); - assertInitCap("aB 世 de", "UTF8_LCASE", "Ab 世 De"); - assertInitCap("aB 世 de", "UNICODE", "Ab 世 De"); - assertInitCap("aB 世 de", "UNICODE_CI", "Ab 世 De"); + assertInitCap("aB 世 de", UTF8_BINARY, "Ab 世 De"); + assertInitCap("aB 世 de", UTF8_LCASE, "Ab 世 De"); + assertInitCap("aB 世 de", UNICODE, "Ab 世 De"); + assertInitCap("aB 世 de", UNICODE_CI, "Ab 世 De"); // One-to-many case mapping (e.g. Turkish dotted I). 
- assertInitCap("İ", "UTF8_BINARY", "İ", "I\u0307"); - assertInitCap("İ", "UTF8_LCASE", "İ"); - assertInitCap("İ", "UNICODE", "İ"); - assertInitCap("İ", "UNICODE_CI", "İ"); - assertInitCap("I\u0307", "UTF8_BINARY","I\u0307"); - assertInitCap("I\u0307", "UTF8_LCASE","I\u0307"); - assertInitCap("I\u0307", "UNICODE","I\u0307"); - assertInitCap("I\u0307", "UNICODE_CI","I\u0307"); - assertInitCap("İonic", "UTF8_BINARY", "İonic", "I\u0307onic"); - assertInitCap("İonic", "UTF8_LCASE", "İonic"); - assertInitCap("İonic", "UNICODE", "İonic"); - assertInitCap("İonic", "UNICODE_CI", "İonic"); - assertInitCap("i\u0307onic", "UTF8_BINARY","I\u0307onic"); - assertInitCap("i\u0307onic", "UTF8_LCASE","I\u0307onic"); - assertInitCap("i\u0307onic", "UNICODE","I\u0307onic"); - assertInitCap("i\u0307onic", "UNICODE_CI","I\u0307onic"); - assertInitCap("FIDELİO", "UTF8_BINARY", "Fideli\u0307o"); - assertInitCap("FIDELİO", "UTF8_LCASE", "Fideli\u0307o"); - assertInitCap("FIDELİO", "UNICODE", "Fideli\u0307o"); - assertInitCap("FIDELİO", "UNICODE_CI", "Fideli\u0307o"); + assertInitCap("İ", UTF8_BINARY, "İ", "I\u0307"); + assertInitCap("İ", UTF8_LCASE, "İ"); + assertInitCap("İ", UNICODE, "İ"); + assertInitCap("İ", UNICODE_CI, "İ"); + assertInitCap("I\u0307", UTF8_BINARY,"I\u0307"); + assertInitCap("I\u0307", UTF8_LCASE,"I\u0307"); + assertInitCap("I\u0307", UNICODE,"I\u0307"); + assertInitCap("I\u0307", UNICODE_CI,"I\u0307"); + assertInitCap("İonic", UTF8_BINARY, "İonic", "I\u0307onic"); + assertInitCap("İonic", UTF8_LCASE, "İonic"); + assertInitCap("İonic", UNICODE, "İonic"); + assertInitCap("İonic", UNICODE_CI, "İonic"); + assertInitCap("i\u0307onic", UTF8_BINARY,"I\u0307onic"); + assertInitCap("i\u0307onic", UTF8_LCASE,"I\u0307onic"); + assertInitCap("i\u0307onic", UNICODE,"I\u0307onic"); + assertInitCap("i\u0307onic", UNICODE_CI,"I\u0307onic"); + assertInitCap("FIDELİO", UTF8_BINARY, "Fideli\u0307o"); + assertInitCap("FIDELİO", UTF8_LCASE, "Fideli\u0307o"); + assertInitCap("FIDELİO", 
UNICODE, "Fideli\u0307o"); + assertInitCap("FIDELİO", UNICODE_CI, "Fideli\u0307o"); // Surrogate pairs. - assertInitCap("a🙃B🙃c", "UTF8_BINARY", "A🙃b🙃c"); - assertInitCap("a🙃B🙃c", "UTF8_LCASE", "A🙃B🙃C"); - assertInitCap("a🙃B🙃c", "UNICODE", "A🙃B🙃C"); - assertInitCap("a🙃B🙃c", "UNICODE_CI", "A🙃B🙃C"); - assertInitCap("😄 😆", "UTF8_BINARY", "😄 😆"); - assertInitCap("😄 😆", "UTF8_LCASE", "😄 😆"); - assertInitCap("😄 😆", "UNICODE", "😄 😆"); - assertInitCap("😄 😆", "UNICODE_CI", "😄 😆"); - assertInitCap("😀😆😃😄", "UTF8_BINARY", "😀😆😃😄"); - assertInitCap("😀😆😃😄", "UTF8_LCASE", "😀😆😃😄"); - assertInitCap("😀😆😃😄", "UNICODE", "😀😆😃😄"); - assertInitCap("😀😆😃😄", "UNICODE_CI", "😀😆😃😄"); - assertInitCap("𝔸", "UTF8_BINARY", "𝔸"); - assertInitCap("𝔸", "UTF8_LCASE", "𝔸"); - assertInitCap("𝔸", "UNICODE", "𝔸"); - assertInitCap("𝔸", "UNICODE_CI", "𝔸"); - assertInitCap("𐐅", "UTF8_BINARY", "\uD801\uDC05", "𐐭"); - assertInitCap("𐐅", "UTF8_LCASE", "𐐅"); - assertInitCap("𐐅", "UNICODE", "𐐅"); - assertInitCap("𐐅", "UNICODE_CI", "𐐅"); - assertInitCap("𐐭", "UTF8_BINARY", "\uD801\uDC05", "𐐭"); - assertInitCap("𐐭", "UTF8_LCASE", "𐐅"); - assertInitCap("𐐭", "UNICODE", "𐐅"); - assertInitCap("𐐭", "UNICODE_CI", "𐐅"); - assertInitCap("𐐭𝔸", "UTF8_BINARY", "\uD801\uDC05\uD835\uDD38", "𐐭𝔸"); - assertInitCap("𐐭𝔸", "UTF8_LCASE", "𐐅𝔸"); - assertInitCap("𐐭𝔸", "UNICODE", "𐐅𝔸"); - assertInitCap("𐐭𝔸", "UNICODE_CI", "𐐅𝔸"); + assertInitCap("a🙃B🙃c", UTF8_BINARY, "A🙃b🙃c"); + assertInitCap("a🙃B🙃c", UTF8_LCASE, "A🙃B🙃C"); + assertInitCap("a🙃B🙃c", UNICODE, "A🙃B🙃C"); + assertInitCap("a🙃B🙃c", UNICODE_CI, "A🙃B🙃C"); + assertInitCap("😄 😆", UTF8_BINARY, "😄 😆"); + assertInitCap("😄 😆", UTF8_LCASE, "😄 😆"); + assertInitCap("😄 😆", UNICODE, "😄 😆"); + assertInitCap("😄 😆", UNICODE_CI, "😄 😆"); + assertInitCap("😀😆😃😄", UTF8_BINARY, "😀😆😃😄"); + assertInitCap("😀😆😃😄", UTF8_LCASE, "😀😆😃😄"); + assertInitCap("😀😆😃😄", UNICODE, "😀😆😃😄"); + assertInitCap("😀😆😃😄", UNICODE_CI, "😀😆😃😄"); + assertInitCap("𝔸", UTF8_BINARY, "𝔸"); + assertInitCap("𝔸", UTF8_LCASE, "𝔸"); + 
assertInitCap("𝔸", UNICODE, "𝔸"); + assertInitCap("𝔸", UNICODE_CI, "𝔸"); + assertInitCap("𐐅", UTF8_BINARY, "\uD801\uDC05", "𐐭"); + assertInitCap("𐐅", UTF8_LCASE, "𐐅"); + assertInitCap("𐐅", UNICODE, "𐐅"); + assertInitCap("𐐅", UNICODE_CI, "𐐅"); + assertInitCap("𐐭", UTF8_BINARY, "\uD801\uDC05", "𐐭"); + assertInitCap("𐐭", UTF8_LCASE, "𐐅"); + assertInitCap("𐐭", UNICODE, "𐐅"); + assertInitCap("𐐭", UNICODE_CI, "𐐅"); + assertInitCap("𐐭𝔸", UTF8_BINARY, "\uD801\uDC05\uD835\uDD38", "𐐭𝔸"); + assertInitCap("𐐭𝔸", UTF8_LCASE, "𐐅𝔸"); + assertInitCap("𐐭𝔸", UNICODE, "𐐅𝔸"); + assertInitCap("𐐭𝔸", UNICODE_CI, "𐐅𝔸"); // Ligatures. - assertInitCap("ß fi ffi ff st ῗ", "UTF8_BINARY", "Ss Fi Ffi Ff St Ϊ͂", "ß fi ffi ff st ῗ"); - assertInitCap("ß fi ffi ff st ῗ", "UTF8_LCASE", "Ss Fi Ffi Ff St \u0399\u0308\u0342"); - assertInitCap("ß fi ffi ff st ῗ", "UNICODE", "Ss Fi Ffi Ff St \u0399\u0308\u0342"); - assertInitCap("ß fi ffi ff st ῗ", "UNICODE", "Ss Fi Ffi Ff St \u0399\u0308\u0342"); - assertInitCap("œ ǽ", "UTF8_BINARY", "Œ Ǽ", "Œ Ǽ"); + assertInitCap("ß fi ffi ff st ῗ", UTF8_BINARY, "Ss Fi Ffi Ff St Ϊ͂", "ß fi ffi ff st ῗ"); + assertInitCap("ß fi ffi ff st ῗ", UTF8_LCASE, "Ss Fi Ffi Ff St \u0399\u0308\u0342"); + assertInitCap("ß fi ffi ff st ῗ", UNICODE, "Ss Fi Ffi Ff St \u0399\u0308\u0342"); + assertInitCap("ß fi ffi ff st ῗ", UNICODE_CI, "Ss Fi Ffi Ff St \u0399\u0308\u0342"); + assertInitCap("œ ǽ", UTF8_BINARY, "Œ Ǽ", "Œ Ǽ"); // Different possible word boundaries. - assertInitCap("a b c", "UTF8_BINARY", "A B C"); - assertInitCap("a b c", "UNICODE", "A B C"); - assertInitCap("a b c", "UTF8_LCASE", "A B C"); - assertInitCap("a b c", "UNICODE_CI", "A B C"); - assertInitCap("a.b,c", "UTF8_BINARY", "A.b,c"); - assertInitCap("a.b,c", "UNICODE", "A.b,C"); - assertInitCap("a.b,c", "UTF8_LCASE", "A.b,C"); - assertInitCap("a.b,c", "UNICODE_CI", "A.b,C"); - assertInitCap("a. b-c", "UTF8_BINARY", "A. B-c"); - assertInitCap("a. b-c", "UNICODE", "A. B-C"); - assertInitCap("a. b-c", "UTF8_LCASE", "A. 
B-C"); - assertInitCap("a. b-c", "UNICODE_CI", "A. B-C"); - assertInitCap("a?b世c", "UTF8_BINARY", "A?b世c"); - assertInitCap("a?b世c", "UNICODE", "A?B世C"); - assertInitCap("a?b世c", "UTF8_LCASE", "A?B世C"); - assertInitCap("a?b世c", "UNICODE_CI", "A?B世C"); + assertInitCap("a b c", UTF8_BINARY, "A B C"); + assertInitCap("a b c", UNICODE, "A B C"); + assertInitCap("a b c", UTF8_LCASE, "A B C"); + assertInitCap("a b c", UNICODE_CI, "A B C"); + assertInitCap("a.b,c", UTF8_BINARY, "A.b,c"); + assertInitCap("a.b,c", UNICODE, "A.b,C"); + assertInitCap("a.b,c", UTF8_LCASE, "A.b,C"); + assertInitCap("a.b,c", UNICODE_CI, "A.b,C"); + assertInitCap("a. b-c", UTF8_BINARY, "A. B-c"); + assertInitCap("a. b-c", UNICODE, "A. B-C"); + assertInitCap("a. b-c", UTF8_LCASE, "A. B-C"); + assertInitCap("a. b-c", UNICODE_CI, "A. B-C"); + assertInitCap("a?b世c", UTF8_BINARY, "A?b世c"); + assertInitCap("a?b世c", UNICODE, "A?B世C"); + assertInitCap("a?b世c", UTF8_LCASE, "A?B世C"); + assertInitCap("a?b世c", UNICODE_CI, "A?B世C"); // Titlecase characters that are different from uppercase characters. 
- assertInitCap("dzDZDz", "UTF8_BINARY", "Dzdzdz"); - assertInitCap("dzDZDz", "UNICODE", "Dzdzdz"); - assertInitCap("dzDZDz", "UTF8_LCASE", "Dzdzdz"); - assertInitCap("dzDZDz", "UNICODE_CI", "Dzdzdz"); - assertInitCap("džaba Ljubav NJegova", "UTF8_BINARY", "Džaba Ljubav Njegova"); - assertInitCap("džaba Ljubav NJegova", "UNICODE", "Džaba Ljubav Njegova"); - assertInitCap("džaba Ljubav NJegova", "UTF8_LCASE", "Džaba Ljubav Njegova"); - assertInitCap("džaba Ljubav NJegova", "UNICODE_CI", "Džaba Ljubav Njegova"); - assertInitCap("ß fi ffi ff st ΣΗΜΕΡΙΝΟΣ ΑΣΗΜΕΝΙΟΣ İOTA", "UTF8_BINARY", + assertInitCap("dzDZDz", UTF8_BINARY, "Dzdzdz"); + assertInitCap("dzDZDz", UNICODE, "Dzdzdz"); + assertInitCap("dzDZDz", UTF8_LCASE, "Dzdzdz"); + assertInitCap("dzDZDz", UNICODE_CI, "Dzdzdz"); + assertInitCap("džaba Ljubav NJegova", UTF8_BINARY, "Džaba Ljubav Njegova"); + assertInitCap("džaba Ljubav NJegova", UNICODE, "Džaba Ljubav Njegova"); + assertInitCap("džaba Ljubav NJegova", UTF8_LCASE, "Džaba Ljubav Njegova"); + assertInitCap("džaba Ljubav NJegova", UNICODE_CI, "Džaba Ljubav Njegova"); + assertInitCap("ß fi ffi ff st ΣΗΜΕΡΙΝΟΣ ΑΣΗΜΕΝΙΟΣ İOTA", UTF8_BINARY, "Ss Fi Ffi Ff St Σημερινος Ασημενιος İota","ß fi ffi ff st Σημερινος Ασημενιος I\u0307ota"); - assertInitCap("ß fi ffi ff st ΣΗΜΕΡΙΝΟΣ ΑΣΗΜΕΝΙΟΣ İOTA", "UTF8_LCASE", + assertInitCap("ß fi ffi ff st ΣΗΜΕΡΙΝΟΣ ΑΣΗΜΕΝΙΟΣ İOTA", UTF8_LCASE, "Ss Fi Ffi Ff St Σημερινος Ασημενιος İota"); - assertInitCap("ß fi ffi ff st ΣΗΜΕΡΙΝΟΣ ΑΣΗΜΕΝΙΟΣ İOTA", "UNICODE", + assertInitCap("ß fi ffi ff st ΣΗΜΕΡΙΝΟΣ ΑΣΗΜΕΝΙΟΣ İOTA", UNICODE, "Ss Fi Ffi Ff St Σημερινος Ασημενιος İota"); - assertInitCap("ß fi ffi ff st ΣΗΜΕΡςΙΝΟΣ ΑΣΗΜΕΝΙΟΣ İOTA", "UNICODE_CI", + assertInitCap("ß fi ffi ff st ΣΗΜΕΡςΙΝΟΣ ΑΣΗΜΕΝΙΟΣ İOTA", UNICODE_CI, "Ss Fi Ffi Ff St Σημερςινος Ασημενιος İota"); // Characters that map to multiple characters when titlecased and lowercased. 
- assertInitCap("ß fi ffi ff st İOTA", "UTF8_BINARY", "Ss Fi Ffi Ff St İota", "ß fi ffi ff st İota"); - assertInitCap("ß fi ffi ff st OİOTA", "UTF8_BINARY", + assertInitCap("ß fi ffi ff st İOTA", UTF8_BINARY, "Ss Fi Ffi Ff St İota", "ß fi ffi ff st İota"); + assertInitCap("ß fi ffi ff st OİOTA", UTF8_BINARY, "Ss Fi Ffi Ff St Oi\u0307ota", "ß fi ffi ff st Oi̇ota"); // Lowercasing Greek letter sigma ('Σ') when case-ignorable character present. - assertInitCap("`Σ", "UTF8_BINARY", "`σ", "`σ"); - assertInitCap("1`Σ`` AΣ", "UTF8_BINARY", "1`σ`` Aς", "1`σ`` Aς"); - assertInitCap("a1`Σ``", "UTF8_BINARY", "A1`σ``", "A1`σ``"); - assertInitCap("a`Σ``", "UTF8_BINARY", "A`ς``", "A`σ``"); - assertInitCap("a`Σ``1", "UTF8_BINARY", "A`ς``1", "A`σ``1"); - assertInitCap("a`Σ``A", "UTF8_BINARY", "A`σ``a", "A`σ``a"); - assertInitCap("ΘΑ�Σ�ΟΣ�", "UTF8_BINARY", "Θα�σ�ος�", "Θα�σ�ος�"); - assertInitCap("ΘΑᵩΣ�ΟᵩΣᵩ�", "UTF8_BINARY", "Θαᵩς�οᵩςᵩ�", "Θαᵩς�οᵩςᵩ�"); - assertInitCap("ΘΑ�ᵩΣ�ΟᵩΣᵩ�", "UTF8_BINARY", "Θα�ᵩσ�οᵩςᵩ�", "Θα�ᵩσ�οᵩςᵩ�"); - assertInitCap("ΘΑ�ᵩΣᵩ�ΟᵩΣᵩ�", "UTF8_BINARY", "Θα�ᵩσᵩ�οᵩςᵩ�", "Θα�ᵩσᵩ�οᵩςᵩ�"); - assertInitCap("ΘΑ�Σ�Ο�Σ�", "UTF8_BINARY", "Θα�σ�ο�σ�", "Θα�σ�ο�σ�"); + assertInitCap("`Σ", UTF8_BINARY, "`σ", "`σ"); + assertInitCap("1`Σ`` AΣ", UTF8_BINARY, "1`σ`` Aς", "1`σ`` Aς"); + assertInitCap("a1`Σ``", UTF8_BINARY, "A1`σ``", "A1`σ``"); + assertInitCap("a`Σ``", UTF8_BINARY, "A`ς``", "A`σ``"); + assertInitCap("a`Σ``1", UTF8_BINARY, "A`ς``1", "A`σ``1"); + assertInitCap("a`Σ``A", UTF8_BINARY, "A`σ``a", "A`σ``a"); + assertInitCap("ΘΑ�Σ�ΟΣ�", UTF8_BINARY, "Θα�σ�ος�", "Θα�σ�ος�"); + assertInitCap("ΘΑᵩΣ�ΟᵩΣᵩ�", UTF8_BINARY, "Θαᵩς�οᵩςᵩ�", "Θαᵩς�οᵩςᵩ�"); + assertInitCap("ΘΑ�ᵩΣ�ΟᵩΣᵩ�", UTF8_BINARY, "Θα�ᵩσ�οᵩςᵩ�", "Θα�ᵩσ�οᵩςᵩ�"); + assertInitCap("ΘΑ�ᵩΣᵩ�ΟᵩΣᵩ�", UTF8_BINARY, "Θα�ᵩσᵩ�οᵩςᵩ�", "Θα�ᵩσᵩ�οᵩςᵩ�"); + assertInitCap("ΘΑ�Σ�Ο�Σ�", UTF8_BINARY, "Θα�σ�ο�σ�", "Θα�σ�ο�σ�"); // Disallowed bytes and invalid sequences. 
assertInitCap(UTF8String.fromBytes(new byte[] { (byte)0xC0, (byte)0xC1, (byte)0xF5}).toString(), - "UTF8_BINARY", "���", "���"); + UTF8_BINARY, "���", "���"); assertInitCap(UTF8String.fromBytes( new byte[]{(byte)0xC0, (byte)0xC1, (byte)0xF5, 0x20, 0x61, 0x41, (byte)0xC0}).toString(), - "UTF8_BINARY", + UTF8_BINARY, "��� Aa�", "��� Aa�"); assertInitCap(UTF8String.fromBytes(new byte[]{(byte)0xC2,(byte)0xC2}).toString(), - "UTF8_BINARY", "��", "��"); + UTF8_BINARY, "��", "��"); assertInitCap(UTF8String.fromBytes( new byte[]{0x61, 0x41, (byte)0xC2, (byte)0xC2, 0x41}).toString(), - "UTF8_BINARY", + UTF8_BINARY, "Aa��a", "Aa��a"); } @@ -1559,147 +1560,147 @@ private void assertStringInstr(String string, String substring, @Test public void testStringInstr() throws SparkException { // Empty strings. - assertStringInstr("", "", "UTF8_BINARY", 1); - assertStringInstr("", "", "UTF8_LCASE", 1); - assertStringInstr("", "", "UNICODE_CI", 1); - assertStringInstr("", "", "UNICODE", 1); - assertStringInstr("a", "", "UTF8_BINARY", 1); - assertStringInstr("a", "", "UTF8_LCASE", 1); - assertStringInstr("a", "", "UNICODE", 1); - assertStringInstr("a", "", "UNICODE_CI", 1); - assertStringInstr("", "x", "UTF8_BINARY", 0); - assertStringInstr("", "x", "UTF8_LCASE", 0); - assertStringInstr("", "x", "UNICODE", 0); - assertStringInstr("", "x", "UNICODE_CI", 0); + assertStringInstr("", "", UTF8_BINARY, 1); + assertStringInstr("", "", UTF8_LCASE, 1); + assertStringInstr("", "", UNICODE_CI, 1); + assertStringInstr("", "", UNICODE, 1); + assertStringInstr("a", "", UTF8_BINARY, 1); + assertStringInstr("a", "", UTF8_LCASE, 1); + assertStringInstr("a", "", UNICODE, 1); + assertStringInstr("a", "", UNICODE_CI, 1); + assertStringInstr("", "x", UTF8_BINARY, 0); + assertStringInstr("", "x", UTF8_LCASE, 0); + assertStringInstr("", "x", UNICODE, 0); + assertStringInstr("", "x", UNICODE_CI, 0); // Basic tests. 
- assertStringInstr("aaads", "aa", "UTF8_BINARY", 1); - assertStringInstr("aaads", "aa", "UTF8_LCASE", 1); - assertStringInstr("aaads", "aa", "UNICODE", 1); - assertStringInstr("aaads", "aa", "UNICODE_CI", 1); - assertStringInstr("aaads", "ds", "UTF8_BINARY", 4); - assertStringInstr("aaads", "ds", "UTF8_LCASE", 4); - assertStringInstr("aaads", "ds", "UNICODE", 4); - assertStringInstr("aaads", "ds", "UNICODE_CI", 4); - assertStringInstr("aaads", "Aa", "UTF8_BINARY", 0); - assertStringInstr("aaads", "Aa", "UTF8_LCASE", 1); - assertStringInstr("aaads", "Aa", "UNICODE", 0); - assertStringInstr("aaads", "Aa", "UNICODE_CI", 1); - assertStringInstr("aaaDs", "de", "UTF8_BINARY", 0); - assertStringInstr("aaaDs", "de", "UTF8_LCASE", 0); - assertStringInstr("aaaDs", "de", "UNICODE", 0); - assertStringInstr("aaaDs", "de", "UNICODE_CI", 0); - assertStringInstr("aaaDs", "ds", "UTF8_BINARY", 0); - assertStringInstr("aaaDs", "ds", "UTF8_LCASE", 4); - assertStringInstr("aaaDs", "ds", "UNICODE", 0); - assertStringInstr("aaaDs", "ds", "UNICODE_CI", 4); - assertStringInstr("aaadS", "Ds", "UTF8_BINARY", 0); - assertStringInstr("aaadS", "Ds", "UTF8_LCASE", 4); - assertStringInstr("aaadS", "Ds", "UNICODE", 0); - assertStringInstr("aaadS", "Ds", "UNICODE_CI", 4); + assertStringInstr("aaads", "aa", UTF8_BINARY, 1); + assertStringInstr("aaads", "aa", UTF8_LCASE, 1); + assertStringInstr("aaads", "aa", UNICODE, 1); + assertStringInstr("aaads", "aa", UNICODE_CI, 1); + assertStringInstr("aaads", "ds", UTF8_BINARY, 4); + assertStringInstr("aaads", "ds", UTF8_LCASE, 4); + assertStringInstr("aaads", "ds", UNICODE, 4); + assertStringInstr("aaads", "ds", UNICODE_CI, 4); + assertStringInstr("aaads", "Aa", UTF8_BINARY, 0); + assertStringInstr("aaads", "Aa", UTF8_LCASE, 1); + assertStringInstr("aaads", "Aa", UNICODE, 0); + assertStringInstr("aaads", "Aa", UNICODE_CI, 1); + assertStringInstr("aaaDs", "de", UTF8_BINARY, 0); + assertStringInstr("aaaDs", "de", UTF8_LCASE, 0); + assertStringInstr("aaaDs", 
"de", UNICODE, 0); + assertStringInstr("aaaDs", "de", UNICODE_CI, 0); + assertStringInstr("aaaDs", "ds", UTF8_BINARY, 0); + assertStringInstr("aaaDs", "ds", UTF8_LCASE, 4); + assertStringInstr("aaaDs", "ds", UNICODE, 0); + assertStringInstr("aaaDs", "ds", UNICODE_CI, 4); + assertStringInstr("aaadS", "Ds", UTF8_BINARY, 0); + assertStringInstr("aaadS", "Ds", UTF8_LCASE, 4); + assertStringInstr("aaadS", "Ds", UNICODE, 0); + assertStringInstr("aaadS", "Ds", UNICODE_CI, 4); assertStringInstr("aaaČŠčšcs", "cs", "SR", 8); assertStringInstr("aaaČŠčšcs", "cs", "SR_CI_AI", 4); // Advanced tests. - assertStringInstr("test大千世界X大千世界", "大千", "UTF8_BINARY", 5); - assertStringInstr("test大千世界X大千世界", "大千", "UTF8_LCASE", 5); - assertStringInstr("test大千世界X大千世界", "大千", "UNICODE", 5); - assertStringInstr("test大千世界X大千世界", "大千", "UNICODE_CI", 5); - assertStringInstr("test大千世界X大千世界", "界X", "UTF8_BINARY", 8); - assertStringInstr("test大千世界X大千世界", "界X", "UTF8_LCASE", 8); - assertStringInstr("test大千世界X大千世界", "界X", "UNICODE", 8); - assertStringInstr("test大千世界X大千世界", "界X", "UNICODE_CI", 8); - assertStringInstr("test大千世界X大千世界", "界x", "UTF8_BINARY", 0); - assertStringInstr("test大千世界X大千世界", "界x", "UTF8_LCASE", 8); - assertStringInstr("test大千世界X大千世界", "界x", "UNICODE", 0); - assertStringInstr("test大千世界X大千世界", "界x", "UNICODE_CI", 8); - assertStringInstr("test大千世界X大千世界", "界y", "UTF8_BINARY", 0); - assertStringInstr("test大千世界X大千世界", "界y", "UTF8_LCASE", 0); - assertStringInstr("test大千世界X大千世界", "界y", "UNICODE", 0); - assertStringInstr("test大千世界X大千世界", "界y", "UNICODE_CI", 0); + assertStringInstr("test大千世界X大千世界", "大千", UTF8_BINARY, 5); + assertStringInstr("test大千世界X大千世界", "大千", UTF8_LCASE, 5); + assertStringInstr("test大千世界X大千世界", "大千", UNICODE, 5); + assertStringInstr("test大千世界X大千世界", "大千", UNICODE_CI, 5); + assertStringInstr("test大千世界X大千世界", "界X", UTF8_BINARY, 8); + assertStringInstr("test大千世界X大千世界", "界X", UTF8_LCASE, 8); + assertStringInstr("test大千世界X大千世界", "界X", UNICODE, 8); + 
assertStringInstr("test大千世界X大千世界", "界X", UNICODE_CI, 8); + assertStringInstr("test大千世界X大千世界", "界x", UTF8_BINARY, 0); + assertStringInstr("test大千世界X大千世界", "界x", UTF8_LCASE, 8); + assertStringInstr("test大千世界X大千世界", "界x", UNICODE, 0); + assertStringInstr("test大千世界X大千世界", "界x", UNICODE_CI, 8); + assertStringInstr("test大千世界X大千世界", "界y", UTF8_BINARY, 0); + assertStringInstr("test大千世界X大千世界", "界y", UTF8_LCASE, 0); + assertStringInstr("test大千世界X大千世界", "界y", UNICODE, 0); + assertStringInstr("test大千世界X大千世界", "界y", UNICODE_CI, 0); // One-to-many case mapping (e.g. Turkish dotted I). - assertStringInstr("i\u0307", "i", "UNICODE_CI", 0); - assertStringInstr("i\u0307", "\u0307", "UNICODE_CI", 0); - assertStringInstr("i\u0307", "İ", "UNICODE_CI", 1); - assertStringInstr("İ", "i", "UNICODE_CI", 0); - assertStringInstr("İoi̇o12", "i\u0307o", "UNICODE_CI", 1); - assertStringInstr("i̇oİo12", "İo", "UNICODE_CI", 1); - assertStringInstr("abİoi̇o", "i\u0307o", "UNICODE_CI", 3); - assertStringInstr("abi̇oİo", "İo", "UNICODE_CI", 3); - assertStringInstr("ai̇oxXİo", "Xx", "UNICODE_CI", 5); - assertStringInstr("aİoi̇oxx", "XX", "UNICODE_CI", 7); - assertStringInstr("i\u0307", "i", "UTF8_LCASE", 1); // != UNICODE_CI - assertStringInstr("i\u0307", "\u0307", "UTF8_LCASE", 2); // != UNICODE_CI - assertStringInstr("i\u0307", "İ", "UTF8_LCASE", 1); - assertStringInstr("İ", "i", "UTF8_LCASE", 0); - assertStringInstr("İoi̇o12", "i\u0307o", "UTF8_LCASE", 1); - assertStringInstr("i̇oİo12", "İo", "UTF8_LCASE", 1); - assertStringInstr("abİoi̇o", "i\u0307o", "UTF8_LCASE", 3); - assertStringInstr("abi̇oİo", "İo", "UTF8_LCASE", 3); - assertStringInstr("abI\u0307oi̇o", "İo", "UTF8_LCASE", 3); - assertStringInstr("ai̇oxXİo", "Xx", "UTF8_LCASE", 5); - assertStringInstr("abİoi̇o", "\u0307o", "UTF8_LCASE", 6); - assertStringInstr("aİoi̇oxx", "XX", "UTF8_LCASE", 7); + assertStringInstr("i\u0307", "i", UNICODE_CI, 0); + assertStringInstr("i\u0307", "\u0307", UNICODE_CI, 0); + assertStringInstr("i\u0307", "İ", 
UNICODE_CI, 1); + assertStringInstr("İ", "i", UNICODE_CI, 0); + assertStringInstr("İoi̇o12", "i\u0307o", UNICODE_CI, 1); + assertStringInstr("i̇oİo12", "İo", UNICODE_CI, 1); + assertStringInstr("abİoi̇o", "i\u0307o", UNICODE_CI, 3); + assertStringInstr("abi̇oİo", "İo", UNICODE_CI, 3); + assertStringInstr("ai̇oxXİo", "Xx", UNICODE_CI, 5); + assertStringInstr("aİoi̇oxx", "XX", UNICODE_CI, 7); + assertStringInstr("i\u0307", "i", UTF8_LCASE, 1); // != UNICODE_CI + assertStringInstr("i\u0307", "\u0307", UTF8_LCASE, 2); // != UNICODE_CI + assertStringInstr("i\u0307", "İ", UTF8_LCASE, 1); + assertStringInstr("İ", "i", UTF8_LCASE, 0); + assertStringInstr("İoi̇o12", "i\u0307o", UTF8_LCASE, 1); + assertStringInstr("i̇oİo12", "İo", UTF8_LCASE, 1); + assertStringInstr("abİoi̇o", "i\u0307o", UTF8_LCASE, 3); + assertStringInstr("abi̇oİo", "İo", UTF8_LCASE, 3); + assertStringInstr("abI\u0307oi̇o", "İo", UTF8_LCASE, 3); + assertStringInstr("ai̇oxXİo", "Xx", UTF8_LCASE, 5); + assertStringInstr("abİoi̇o", "\u0307o", UTF8_LCASE, 6); + assertStringInstr("aİoi̇oxx", "XX", UTF8_LCASE, 7); // Conditional case mapping (e.g. Greek sigmas). 
- assertStringInstr("σ", "σ", "UTF8_BINARY", 1); - assertStringInstr("σ", "ς", "UTF8_BINARY", 0); - assertStringInstr("σ", "Σ", "UTF8_BINARY", 0); - assertStringInstr("ς", "σ", "UTF8_BINARY", 0); - assertStringInstr("ς", "ς", "UTF8_BINARY", 1); - assertStringInstr("ς", "Σ", "UTF8_BINARY", 0); - assertStringInstr("Σ", "σ", "UTF8_BINARY", 0); - assertStringInstr("Σ", "ς", "UTF8_BINARY", 0); - assertStringInstr("Σ", "Σ", "UTF8_BINARY", 1); - assertStringInstr("σ", "σ", "UTF8_LCASE", 1); - assertStringInstr("σ", "ς", "UTF8_LCASE", 1); - assertStringInstr("σ", "Σ", "UTF8_LCASE", 1); - assertStringInstr("ς", "σ", "UTF8_LCASE", 1); - assertStringInstr("ς", "ς", "UTF8_LCASE", 1); - assertStringInstr("ς", "Σ", "UTF8_LCASE", 1); - assertStringInstr("Σ", "σ", "UTF8_LCASE", 1); - assertStringInstr("Σ", "ς", "UTF8_LCASE", 1); - assertStringInstr("Σ", "Σ", "UTF8_LCASE", 1); - assertStringInstr("σ", "σ", "UNICODE", 1); - assertStringInstr("σ", "ς", "UNICODE", 0); - assertStringInstr("σ", "Σ", "UNICODE", 0); - assertStringInstr("ς", "σ", "UNICODE", 0); - assertStringInstr("ς", "ς", "UNICODE", 1); - assertStringInstr("ς", "Σ", "UNICODE", 0); - assertStringInstr("Σ", "σ", "UNICODE", 0); - assertStringInstr("Σ", "ς", "UNICODE", 0); - assertStringInstr("Σ", "Σ", "UNICODE", 1); - assertStringInstr("σ", "σ", "UNICODE_CI", 1); - assertStringInstr("σ", "ς", "UNICODE_CI", 1); - assertStringInstr("σ", "Σ", "UNICODE_CI", 1); - assertStringInstr("ς", "σ", "UNICODE_CI", 1); - assertStringInstr("ς", "ς", "UNICODE_CI", 1); - assertStringInstr("ς", "Σ", "UNICODE_CI", 1); - assertStringInstr("Σ", "σ", "UNICODE_CI", 1); - assertStringInstr("Σ", "ς", "UNICODE_CI", 1); - assertStringInstr("Σ", "Σ", "UNICODE_CI", 1); + assertStringInstr("σ", "σ", UTF8_BINARY, 1); + assertStringInstr("σ", "ς", UTF8_BINARY, 0); + assertStringInstr("σ", "Σ", UTF8_BINARY, 0); + assertStringInstr("ς", "σ", UTF8_BINARY, 0); + assertStringInstr("ς", "ς", UTF8_BINARY, 1); + assertStringInstr("ς", "Σ", UTF8_BINARY, 0); + 
assertStringInstr("Σ", "σ", UTF8_BINARY, 0); + assertStringInstr("Σ", "ς", UTF8_BINARY, 0); + assertStringInstr("Σ", "Σ", UTF8_BINARY, 1); + assertStringInstr("σ", "σ", UTF8_LCASE, 1); + assertStringInstr("σ", "ς", UTF8_LCASE, 1); + assertStringInstr("σ", "Σ", UTF8_LCASE, 1); + assertStringInstr("ς", "σ", UTF8_LCASE, 1); + assertStringInstr("ς", "ς", UTF8_LCASE, 1); + assertStringInstr("ς", "Σ", UTF8_LCASE, 1); + assertStringInstr("Σ", "σ", UTF8_LCASE, 1); + assertStringInstr("Σ", "ς", UTF8_LCASE, 1); + assertStringInstr("Σ", "Σ", UTF8_LCASE, 1); + assertStringInstr("σ", "σ", UNICODE, 1); + assertStringInstr("σ", "ς", UNICODE, 0); + assertStringInstr("σ", "Σ", UNICODE, 0); + assertStringInstr("ς", "σ", UNICODE, 0); + assertStringInstr("ς", "ς", UNICODE, 1); + assertStringInstr("ς", "Σ", UNICODE, 0); + assertStringInstr("Σ", "σ", UNICODE, 0); + assertStringInstr("Σ", "ς", UNICODE, 0); + assertStringInstr("Σ", "Σ", UNICODE, 1); + assertStringInstr("σ", "σ", UNICODE_CI, 1); + assertStringInstr("σ", "ς", UNICODE_CI, 1); + assertStringInstr("σ", "Σ", UNICODE_CI, 1); + assertStringInstr("ς", "σ", UNICODE_CI, 1); + assertStringInstr("ς", "ς", UNICODE_CI, 1); + assertStringInstr("ς", "Σ", UNICODE_CI, 1); + assertStringInstr("Σ", "σ", UNICODE_CI, 1); + assertStringInstr("Σ", "ς", UNICODE_CI, 1); + assertStringInstr("Σ", "Σ", UNICODE_CI, 1); // Surrogate pairs. 
- assertStringInstr("a🙃b", "a", "UTF8_BINARY", 1); - assertStringInstr("a🙃b", "a", "UTF8_LCASE", 1); - assertStringInstr("a🙃b", "a", "UNICODE", 1); - assertStringInstr("a🙃b", "a", "UNICODE_CI", 1); - assertStringInstr("a🙃b", "🙃", "UTF8_BINARY", 2); - assertStringInstr("a🙃b", "🙃", "UTF8_LCASE", 2); - assertStringInstr("a🙃b", "🙃", "UNICODE", 2); - assertStringInstr("a🙃b", "🙃", "UNICODE_CI", 2); - assertStringInstr("a🙃b", "b", "UTF8_BINARY", 3); - assertStringInstr("a🙃b", "b", "UTF8_LCASE", 3); - assertStringInstr("a🙃b", "b", "UNICODE", 3); - assertStringInstr("a🙃b", "b", "UNICODE_CI", 3); - assertStringInstr("a🙃🙃b", "🙃", "UTF8_BINARY", 2); - assertStringInstr("a🙃🙃b", "🙃", "UTF8_LCASE", 2); - assertStringInstr("a🙃🙃b", "🙃", "UNICODE", 2); - assertStringInstr("a🙃🙃b", "🙃", "UNICODE_CI", 2); - assertStringInstr("a🙃🙃b", "b", "UTF8_BINARY", 4); - assertStringInstr("a🙃🙃b", "b", "UTF8_LCASE", 4); - assertStringInstr("a🙃🙃b", "b", "UNICODE", 4); - assertStringInstr("a🙃🙃b", "b", "UNICODE_CI", 4); - assertStringInstr("a🙃x🙃b", "b", "UTF8_BINARY", 5); - assertStringInstr("a🙃x🙃b", "b", "UTF8_LCASE", 5); - assertStringInstr("a🙃x🙃b", "b", "UNICODE", 5); - assertStringInstr("a🙃x🙃b", "b", "UNICODE_CI", 5); + assertStringInstr("a🙃b", "a", UTF8_BINARY, 1); + assertStringInstr("a🙃b", "a", UTF8_LCASE, 1); + assertStringInstr("a🙃b", "a", UNICODE, 1); + assertStringInstr("a🙃b", "a", UNICODE_CI, 1); + assertStringInstr("a🙃b", "🙃", UTF8_BINARY, 2); + assertStringInstr("a🙃b", "🙃", UTF8_LCASE, 2); + assertStringInstr("a🙃b", "🙃", UNICODE, 2); + assertStringInstr("a🙃b", "🙃", UNICODE_CI, 2); + assertStringInstr("a🙃b", "b", UTF8_BINARY, 3); + assertStringInstr("a🙃b", "b", UTF8_LCASE, 3); + assertStringInstr("a🙃b", "b", UNICODE, 3); + assertStringInstr("a🙃b", "b", UNICODE_CI, 3); + assertStringInstr("a🙃🙃b", "🙃", UTF8_BINARY, 2); + assertStringInstr("a🙃🙃b", "🙃", UTF8_LCASE, 2); + assertStringInstr("a🙃🙃b", "🙃", UNICODE, 2); + assertStringInstr("a🙃🙃b", "🙃", UNICODE_CI, 2); + assertStringInstr("a🙃🙃b", 
"b", UTF8_BINARY, 4); + assertStringInstr("a🙃🙃b", "b", UTF8_LCASE, 4); + assertStringInstr("a🙃🙃b", "b", UNICODE, 4); + assertStringInstr("a🙃🙃b", "b", UNICODE_CI, 4); + assertStringInstr("a🙃x🙃b", "b", UTF8_BINARY, 5); + assertStringInstr("a🙃x🙃b", "b", UTF8_LCASE, 5); + assertStringInstr("a🙃x🙃b", "b", UNICODE, 5); + assertStringInstr("a🙃x🙃b", "b", UNICODE_CI, 5); } /** @@ -1717,256 +1718,256 @@ private void assertFindInSet(String word, UTF8String set, String collationName, @Test public void testFindInSet() throws SparkException { // Empty strings. - assertFindInSet("", UTF8String.fromString(""), "UTF8_BINARY", 1); - assertFindInSet("", UTF8String.fromString(""), "UTF8_LCASE", 1); - assertFindInSet("", UTF8String.fromString(""), "UNICODE", 1); - assertFindInSet("", UTF8String.fromString(""), "UNICODE_CI", 1); - assertFindInSet("", UTF8String.fromString("abc,b,ab,c,def"), "UTF8_BINARY", 0); - assertFindInSet("", UTF8String.fromString("abc,b,ab,c,def"), "UTF8_LCASE", 0); - assertFindInSet("", UTF8String.fromString("abc,b,ab,c,def"), "UNICODE", 0); - assertFindInSet("", UTF8String.fromString("abc,b,ab,c,def"), "UNICODE_CI", 0); - assertFindInSet("", UTF8String.fromString(",abc,b,ab,c,def"), "UTF8_BINARY", 1); - assertFindInSet("", UTF8String.fromString(",abc,b,ab,c,def"), "UTF8_LCASE", 1); - assertFindInSet("", UTF8String.fromString(",abc,b,ab,c,def"), "UNICODE", 1); - assertFindInSet("", UTF8String.fromString(",abc,b,ab,c,def"), "UNICODE_CI", 1); - assertFindInSet("", UTF8String.fromString("abc,b,ab,c,def,"), "UTF8_BINARY", 6); - assertFindInSet("", UTF8String.fromString("abc,b,ab,c,def,"), "UTF8_LCASE", 6); - assertFindInSet("", UTF8String.fromString("abc,b,ab,c,def,"), "UNICODE", 6); - assertFindInSet("", UTF8String.fromString("abc,b,ab,c,def,"), "UNICODE_CI", 6); - assertFindInSet("", UTF8String.fromString("abc"), "UTF8_BINARY", 0); - assertFindInSet("", UTF8String.fromString("abc"), "UTF8_LCASE", 0); - assertFindInSet("", UTF8String.fromString("abc"), "UNICODE", 0); 
- assertFindInSet("", UTF8String.fromString("abc"), "UNICODE_CI", 0); + assertFindInSet("", UTF8String.fromString(""), UTF8_BINARY, 1); + assertFindInSet("", UTF8String.fromString(""), UTF8_LCASE, 1); + assertFindInSet("", UTF8String.fromString(""), UNICODE, 1); + assertFindInSet("", UTF8String.fromString(""), UNICODE_CI, 1); + assertFindInSet("", UTF8String.fromString("abc,b,ab,c,def"), UTF8_BINARY, 0); + assertFindInSet("", UTF8String.fromString("abc,b,ab,c,def"), UTF8_LCASE, 0); + assertFindInSet("", UTF8String.fromString("abc,b,ab,c,def"), UNICODE, 0); + assertFindInSet("", UTF8String.fromString("abc,b,ab,c,def"), UNICODE_CI, 0); + assertFindInSet("", UTF8String.fromString(",abc,b,ab,c,def"), UTF8_BINARY, 1); + assertFindInSet("", UTF8String.fromString(",abc,b,ab,c,def"), UTF8_LCASE, 1); + assertFindInSet("", UTF8String.fromString(",abc,b,ab,c,def"), UNICODE, 1); + assertFindInSet("", UTF8String.fromString(",abc,b,ab,c,def"), UNICODE_CI, 1); + assertFindInSet("", UTF8String.fromString("abc,b,ab,c,def,"), UTF8_BINARY, 6); + assertFindInSet("", UTF8String.fromString("abc,b,ab,c,def,"), UTF8_LCASE, 6); + assertFindInSet("", UTF8String.fromString("abc,b,ab,c,def,"), UNICODE, 6); + assertFindInSet("", UTF8String.fromString("abc,b,ab,c,def,"), UNICODE_CI, 6); + assertFindInSet("", UTF8String.fromString("abc"), UTF8_BINARY, 0); + assertFindInSet("", UTF8String.fromString("abc"), UTF8_LCASE, 0); + assertFindInSet("", UTF8String.fromString("abc"), UNICODE, 0); + assertFindInSet("", UTF8String.fromString("abc"), UNICODE_CI, 0); // Basic tests. 
- assertFindInSet("xx", UTF8String.fromString("xx"), "UTF8_BINARY", 1); - assertFindInSet("xx", UTF8String.fromString("xx"), "UTF8_LCASE", 1); - assertFindInSet("xx", UTF8String.fromString("xx"), "UNICODE", 1); - assertFindInSet("xx", UTF8String.fromString("xx"), "UNICODE_CI", 1); - assertFindInSet("a", UTF8String.fromString("abc,b,ab,c,def"), "UTF8_BINARY", 0); - assertFindInSet("a", UTF8String.fromString("abc,b,ab,c,def"), "UTF8_LCASE", 0); - assertFindInSet("a", UTF8String.fromString("abc,b,ab,c,def"), "UNICODE", 0); - assertFindInSet("a", UTF8String.fromString("abc,b,ab,c,def"), "UNICODE_CI", 0); - assertFindInSet("abc", UTF8String.fromString("abc,b,ab,c,def"), "UTF8_BINARY", 1); - assertFindInSet("abc", UTF8String.fromString("abc,b,ab,c,def"), "UTF8_LCASE", 1); - assertFindInSet("abc", UTF8String.fromString("abc,b,ab,c,def"), "UNICODE", 1); - assertFindInSet("abc", UTF8String.fromString("abc,b,ab,c,def"), "UNICODE_CI", 1); - assertFindInSet("abcd", UTF8String.fromString("abc,b,ab,c,def"), "UTF8_BINARY", 0); - assertFindInSet("abcd", UTF8String.fromString("abc,b,ab,c,def"), "UTF8_LCASE", 0); - assertFindInSet("abcd", UTF8String.fromString("abc,b,ab,c,def"), "UNICODE", 0); - assertFindInSet("abcd", UTF8String.fromString("abc,b,ab,c,def"), "UNICODE_CI", 0); - assertFindInSet("def", UTF8String.fromString("abc,b,ab,c,def"), "UTF8_BINARY", 5); - assertFindInSet("def", UTF8String.fromString("abc,b,ab,c,def"), "UTF8_LCASE", 5); - assertFindInSet("def", UTF8String.fromString("abc,b,ab,c,def"), "UNICODE", 5); - assertFindInSet("def", UTF8String.fromString("abc,b,ab,c,def"), "UNICODE_CI", 5); - assertFindInSet("xyz", UTF8String.fromString("abc,b,ab,c,def"), "UTF8_BINARY", 0); - assertFindInSet("xyz", UTF8String.fromString("abc,b,ab,c,def"), "UTF8_LCASE", 0); - assertFindInSet("xyz", UTF8String.fromString("abc,b,ab,c,def"), "UNICODE", 0); - assertFindInSet("xyz", UTF8String.fromString("abc,b,ab,c,def"), "UNICODE_CI", 0); - assertFindInSet("Ab", 
UTF8String.fromString("abc,b,ab,c,def"), "UTF8_BINARY", 0); - assertFindInSet("Ab", UTF8String.fromString("abc,b,ab,c,def"), "UTF8_LCASE", 3); - assertFindInSet("Ab", UTF8String.fromString("abc,b,ab,c,def"), "UNICODE", 0); - assertFindInSet("Ab", UTF8String.fromString("abc,b,ab,c,def"), "UNICODE_CI", 3); - assertFindInSet("d,ef", UTF8String.fromString("abc,b,ab,c,def"), "UTF8_BINARY", 0); - assertFindInSet("d,ef", UTF8String.fromString("abc,b,ab,c,def"), "UTF8_LCASE", 0); - assertFindInSet("d,ef", UTF8String.fromString("abc,b,ab,c,def"), "UNICODE", 0); - assertFindInSet("d,ef", UTF8String.fromString("abc,b,ab,c,def"), "UNICODE_CI", 0); - assertFindInSet("C", UTF8String.fromString("abc,b,ab,c,def"), "UTF8_BINARY", 0); - assertFindInSet("C", UTF8String.fromString("abc,b,ab,c,def"), "UTF8_LCASE", 4); - assertFindInSet("C", UTF8String.fromString("abc,b,ab,c,def"), "UNICODE", 0); - assertFindInSet("C", UTF8String.fromString("abc,b,ab,c,def"), "UNICODE_CI", 4); + assertFindInSet("xx", UTF8String.fromString("xx"), UTF8_BINARY, 1); + assertFindInSet("xx", UTF8String.fromString("xx"), UTF8_LCASE, 1); + assertFindInSet("xx", UTF8String.fromString("xx"), UNICODE, 1); + assertFindInSet("xx", UTF8String.fromString("xx"), UNICODE_CI, 1); + assertFindInSet("a", UTF8String.fromString("abc,b,ab,c,def"), UTF8_BINARY, 0); + assertFindInSet("a", UTF8String.fromString("abc,b,ab,c,def"), UTF8_LCASE, 0); + assertFindInSet("a", UTF8String.fromString("abc,b,ab,c,def"), UNICODE, 0); + assertFindInSet("a", UTF8String.fromString("abc,b,ab,c,def"), UNICODE_CI, 0); + assertFindInSet("abc", UTF8String.fromString("abc,b,ab,c,def"), UTF8_BINARY, 1); + assertFindInSet("abc", UTF8String.fromString("abc,b,ab,c,def"), UTF8_LCASE, 1); + assertFindInSet("abc", UTF8String.fromString("abc,b,ab,c,def"), UNICODE, 1); + assertFindInSet("abc", UTF8String.fromString("abc,b,ab,c,def"), UNICODE_CI, 1); + assertFindInSet("abcd", UTF8String.fromString("abc,b,ab,c,def"), UTF8_BINARY, 0); + assertFindInSet("abcd", 
UTF8String.fromString("abc,b,ab,c,def"), UTF8_LCASE, 0); + assertFindInSet("abcd", UTF8String.fromString("abc,b,ab,c,def"), UNICODE, 0); + assertFindInSet("abcd", UTF8String.fromString("abc,b,ab,c,def"), UNICODE_CI, 0); + assertFindInSet("def", UTF8String.fromString("abc,b,ab,c,def"), UTF8_BINARY, 5); + assertFindInSet("def", UTF8String.fromString("abc,b,ab,c,def"), UTF8_LCASE, 5); + assertFindInSet("def", UTF8String.fromString("abc,b,ab,c,def"), UNICODE, 5); + assertFindInSet("def", UTF8String.fromString("abc,b,ab,c,def"), UNICODE_CI, 5); + assertFindInSet("xyz", UTF8String.fromString("abc,b,ab,c,def"), UTF8_BINARY, 0); + assertFindInSet("xyz", UTF8String.fromString("abc,b,ab,c,def"), UTF8_LCASE, 0); + assertFindInSet("xyz", UTF8String.fromString("abc,b,ab,c,def"), UNICODE, 0); + assertFindInSet("xyz", UTF8String.fromString("abc,b,ab,c,def"), UNICODE_CI, 0); + assertFindInSet("Ab", UTF8String.fromString("abc,b,ab,c,def"), UTF8_BINARY, 0); + assertFindInSet("Ab", UTF8String.fromString("abc,b,ab,c,def"), UTF8_LCASE, 3); + assertFindInSet("Ab", UTF8String.fromString("abc,b,ab,c,def"), UNICODE, 0); + assertFindInSet("Ab", UTF8String.fromString("abc,b,ab,c,def"), UNICODE_CI, 3); + assertFindInSet("d,ef", UTF8String.fromString("abc,b,ab,c,def"), UTF8_BINARY, 0); + assertFindInSet("d,ef", UTF8String.fromString("abc,b,ab,c,def"), UTF8_LCASE, 0); + assertFindInSet("d,ef", UTF8String.fromString("abc,b,ab,c,def"), UNICODE, 0); + assertFindInSet("d,ef", UTF8String.fromString("abc,b,ab,c,def"), UNICODE_CI, 0); + assertFindInSet("C", UTF8String.fromString("abc,b,ab,c,def"), UTF8_BINARY, 0); + assertFindInSet("C", UTF8String.fromString("abc,b,ab,c,def"), UTF8_LCASE, 4); + assertFindInSet("C", UTF8String.fromString("abc,b,ab,c,def"), UNICODE, 0); + assertFindInSet("C", UTF8String.fromString("abc,b,ab,c,def"), UNICODE_CI, 4); // Advanced tests. 
- assertFindInSet("大", UTF8String.fromString("test,大千,世,界X,大,千,世界"), "UTF8_BINARY", 5); - assertFindInSet("大", UTF8String.fromString("test,大千,世,界X,大,千,世界"), "UTF8_LCASE", 5); - assertFindInSet("大", UTF8String.fromString("test,大千,世,界X,大,千,世界"), "UNICODE", 5); - assertFindInSet("大", UTF8String.fromString("test,大千,世,界X,大,千,世界"), "UNICODE_CI", 5); - assertFindInSet("界x", UTF8String.fromString("test,大千,世,界X,大,千,世界"), "UTF8_BINARY", 0); - assertFindInSet("界x", UTF8String.fromString("test,大千,世,界X,大,千,世界"), "UTF8_LCASE", 4); - assertFindInSet("界x", UTF8String.fromString("test,大千,世,界X,大,千,世界"), "UNICODE", 0); - assertFindInSet("界x", UTF8String.fromString("test,大千,世,界X,大,千,世界"), "UNICODE_CI", 4); - assertFindInSet("界x", UTF8String.fromString("test,大千,界Xx,世,界X,大,千,世界"), "UTF8_BINARY", 0); - assertFindInSet("界x", UTF8String.fromString("test,大千,界Xx,世,界X,大,千,世界"), "UTF8_LCASE", 5); - assertFindInSet("界x", UTF8String.fromString("test,大千,界Xx,世,界X,大,千,世界"), "UNICODE", 0); - assertFindInSet("界x", UTF8String.fromString("test,大千,界Xx,世,界X,大,千,世界"), "UNICODE_CI", 5); + assertFindInSet("大", UTF8String.fromString("test,大千,世,界X,大,千,世界"), UTF8_BINARY, 5); + assertFindInSet("大", UTF8String.fromString("test,大千,世,界X,大,千,世界"), UTF8_LCASE, 5); + assertFindInSet("大", UTF8String.fromString("test,大千,世,界X,大,千,世界"), UNICODE, 5); + assertFindInSet("大", UTF8String.fromString("test,大千,世,界X,大,千,世界"), UNICODE_CI, 5); + assertFindInSet("界x", UTF8String.fromString("test,大千,世,界X,大,千,世界"), UTF8_BINARY, 0); + assertFindInSet("界x", UTF8String.fromString("test,大千,世,界X,大,千,世界"), UTF8_LCASE, 4); + assertFindInSet("界x", UTF8String.fromString("test,大千,世,界X,大,千,世界"), UNICODE, 0); + assertFindInSet("界x", UTF8String.fromString("test,大千,世,界X,大,千,世界"), UNICODE_CI, 4); + assertFindInSet("界x", UTF8String.fromString("test,大千,界Xx,世,界X,大,千,世界"), UTF8_BINARY, 0); + assertFindInSet("界x", UTF8String.fromString("test,大千,界Xx,世,界X,大,千,世界"), UTF8_LCASE, 5); + assertFindInSet("界x", UTF8String.fromString("test,大千,界Xx,世,界X,大,千,世界"), 
UNICODE, 0); + assertFindInSet("界x", UTF8String.fromString("test,大千,界Xx,世,界X,大,千,世界"), UNICODE_CI, 5); // One-to-many case mapping (e.g. Turkish dotted I). - assertFindInSet("i\u0307", UTF8String.fromString("İ"), "UTF8_BINARY", 0); - assertFindInSet("i\u0307", UTF8String.fromString("İ"), "UTF8_LCASE", 1); - assertFindInSet("i\u0307", UTF8String.fromString("İ"), "UNICODE", 0); - assertFindInSet("i\u0307", UTF8String.fromString("İ"), "UNICODE_CI", 1); - assertFindInSet("i", UTF8String.fromString("İ"), "UTF8_BINARY", 0); - assertFindInSet("i", UTF8String.fromString("İ"), "UTF8_LCASE", 0); - assertFindInSet("i", UTF8String.fromString("İ"), "UNICODE", 0); - assertFindInSet("i", UTF8String.fromString("İ"), "UNICODE_CI", 0); - assertFindInSet("i\u0307", UTF8String.fromString("i\u0307"), "UTF8_BINARY", 1); - assertFindInSet("i\u0307", UTF8String.fromString("i\u0307"), "UTF8_LCASE", 1); - assertFindInSet("i\u0307", UTF8String.fromString("i\u0307"), "UNICODE", 1); - assertFindInSet("i\u0307", UTF8String.fromString("i\u0307"), "UNICODE_CI", 1); - assertFindInSet("i", UTF8String.fromString("i\u0307"), "UTF8_BINARY", 0); - assertFindInSet("i", UTF8String.fromString("i\u0307"), "UTF8_LCASE", 0); - assertFindInSet("i", UTF8String.fromString("i\u0307"), "UNICODE", 0); - assertFindInSet("i", UTF8String.fromString("i\u0307"), "UNICODE_CI", 0); - assertFindInSet("i\u0307", UTF8String.fromString("İ,"), "UTF8_BINARY", 0); - assertFindInSet("i\u0307", UTF8String.fromString("İ,"), "UTF8_LCASE", 1); - assertFindInSet("i\u0307", UTF8String.fromString("İ,"), "UNICODE", 0); - assertFindInSet("i\u0307", UTF8String.fromString("İ,"), "UNICODE_CI", 1); - assertFindInSet("i", UTF8String.fromString("İ,"), "UTF8_BINARY", 0); - assertFindInSet("i", UTF8String.fromString("İ,"), "UTF8_LCASE", 0); - assertFindInSet("i", UTF8String.fromString("İ,"), "UNICODE", 0); - assertFindInSet("i", UTF8String.fromString("İ,"), "UNICODE_CI", 0); - assertFindInSet("i\u0307", UTF8String.fromString("i\u0307,"), 
"UTF8_BINARY", 1); - assertFindInSet("i\u0307", UTF8String.fromString("i\u0307,"), "UTF8_LCASE", 1); - assertFindInSet("i\u0307", UTF8String.fromString("i\u0307,"), "UNICODE", 1); - assertFindInSet("i\u0307", UTF8String.fromString("i\u0307,"), "UNICODE_CI", 1); - assertFindInSet("i", UTF8String.fromString("i\u0307,"), "UTF8_BINARY", 0); - assertFindInSet("i", UTF8String.fromString("i\u0307,"), "UTF8_LCASE", 0); - assertFindInSet("i", UTF8String.fromString("i\u0307,"), "UNICODE", 0); - assertFindInSet("i", UTF8String.fromString("i\u0307,"), "UNICODE_CI", 0); - assertFindInSet("i\u0307", UTF8String.fromString("ab,İ"), "UTF8_BINARY", 0); - assertFindInSet("i\u0307", UTF8String.fromString("ab,İ"), "UTF8_LCASE", 2); - assertFindInSet("i\u0307", UTF8String.fromString("ab,İ"), "UNICODE", 0); - assertFindInSet("i\u0307", UTF8String.fromString("ab,İ"), "UNICODE_CI", 2); - assertFindInSet("i", UTF8String.fromString("ab,İ"), "UTF8_BINARY", 0); - assertFindInSet("i", UTF8String.fromString("ab,İ"), "UTF8_LCASE", 0); - assertFindInSet("i", UTF8String.fromString("ab,İ"), "UNICODE", 0); - assertFindInSet("i", UTF8String.fromString("ab,İ"), "UNICODE_CI", 0); - assertFindInSet("i\u0307", UTF8String.fromString("ab,i\u0307"), "UTF8_BINARY", 2); - assertFindInSet("i\u0307", UTF8String.fromString("ab,i\u0307"), "UTF8_LCASE", 2); - assertFindInSet("i\u0307", UTF8String.fromString("ab,i\u0307"), "UNICODE", 2); - assertFindInSet("i\u0307", UTF8String.fromString("ab,i\u0307"), "UNICODE_CI", 2); - assertFindInSet("i", UTF8String.fromString("ab,i\u0307"), "UTF8_BINARY", 0); - assertFindInSet("i", UTF8String.fromString("ab,i\u0307"), "UTF8_LCASE", 0); - assertFindInSet("i", UTF8String.fromString("ab,i\u0307"), "UNICODE", 0); - assertFindInSet("i", UTF8String.fromString("ab,i\u0307"), "UNICODE_CI", 0); - assertFindInSet("İ", UTF8String.fromString("ab,i\u0307"), "UTF8_BINARY", 0); - assertFindInSet("İ", UTF8String.fromString("ab,i\u0307"), "UTF8_LCASE", 2); - assertFindInSet("İ", 
UTF8String.fromString("ab,i\u0307"), "UNICODE", 0); - assertFindInSet("İ", UTF8String.fromString("ab,i\u0307"), "UNICODE_CI", 2); - assertFindInSet("i\u0307", UTF8String.fromString("ab,İ,12"), "UTF8_BINARY", 0); - assertFindInSet("i\u0307", UTF8String.fromString("ab,İ,12"), "UTF8_LCASE", 2); - assertFindInSet("i\u0307", UTF8String.fromString("ab,İ,12"), "UNICODE", 0); - assertFindInSet("i\u0307", UTF8String.fromString("ab,İ,12"), "UNICODE_CI", 2); - assertFindInSet("i", UTF8String.fromString("ab,İ,12"), "UTF8_BINARY", 0); - assertFindInSet("i", UTF8String.fromString("ab,İ,12"), "UTF8_LCASE", 0); - assertFindInSet("i", UTF8String.fromString("ab,İ,12"), "UNICODE", 0); - assertFindInSet("i", UTF8String.fromString("ab,İ,12"), "UNICODE_CI", 0); - assertFindInSet("i\u0307", UTF8String.fromString("ab,i\u0307,12"), "UTF8_BINARY", 2); - assertFindInSet("i\u0307", UTF8String.fromString("ab,i\u0307,12"), "UTF8_LCASE", 2); - assertFindInSet("i\u0307", UTF8String.fromString("ab,i\u0307,12"), "UNICODE", 2); - assertFindInSet("i\u0307", UTF8String.fromString("ab,i\u0307,12"), "UNICODE_CI", 2); - assertFindInSet("i", UTF8String.fromString("ab,i\u0307,12"), "UTF8_BINARY", 0); - assertFindInSet("i", UTF8String.fromString("ab,i\u0307,12"), "UTF8_LCASE", 0); - assertFindInSet("i", UTF8String.fromString("ab,i\u0307,12"), "UNICODE", 0); - assertFindInSet("i", UTF8String.fromString("ab,i\u0307,12"), "UNICODE_CI", 0); - assertFindInSet("i\u0307o", UTF8String.fromString("ab,İo,12"), "UTF8_BINARY", 0); - assertFindInSet("i\u0307o", UTF8String.fromString("ab,İo,12"), "UTF8_LCASE", 2); - assertFindInSet("i\u0307o", UTF8String.fromString("ab,İo,12"), "UNICODE", 0); - assertFindInSet("i\u0307o", UTF8String.fromString("ab,İo,12"), "UNICODE_CI", 2); - assertFindInSet("İo", UTF8String.fromString("ab,i\u0307o,12"), "UTF8_BINARY", 0); - assertFindInSet("İo", UTF8String.fromString("ab,i\u0307o,12"), "UTF8_LCASE", 2); - assertFindInSet("İo", UTF8String.fromString("ab,i\u0307o,12"), "UNICODE", 0); - 
assertFindInSet("İo", UTF8String.fromString("ab,i\u0307o,12"), "UNICODE_CI", 2); + assertFindInSet("i\u0307", UTF8String.fromString("İ"), UTF8_BINARY, 0); + assertFindInSet("i\u0307", UTF8String.fromString("İ"), UTF8_LCASE, 1); + assertFindInSet("i\u0307", UTF8String.fromString("İ"), UNICODE, 0); + assertFindInSet("i\u0307", UTF8String.fromString("İ"), UNICODE_CI, 1); + assertFindInSet("i", UTF8String.fromString("İ"), UTF8_BINARY, 0); + assertFindInSet("i", UTF8String.fromString("İ"), UTF8_LCASE, 0); + assertFindInSet("i", UTF8String.fromString("İ"), UNICODE, 0); + assertFindInSet("i", UTF8String.fromString("İ"), UNICODE_CI, 0); + assertFindInSet("i\u0307", UTF8String.fromString("i\u0307"), UTF8_BINARY, 1); + assertFindInSet("i\u0307", UTF8String.fromString("i\u0307"), UTF8_LCASE, 1); + assertFindInSet("i\u0307", UTF8String.fromString("i\u0307"), UNICODE, 1); + assertFindInSet("i\u0307", UTF8String.fromString("i\u0307"), UNICODE_CI, 1); + assertFindInSet("i", UTF8String.fromString("i\u0307"), UTF8_BINARY, 0); + assertFindInSet("i", UTF8String.fromString("i\u0307"), UTF8_LCASE, 0); + assertFindInSet("i", UTF8String.fromString("i\u0307"), UNICODE, 0); + assertFindInSet("i", UTF8String.fromString("i\u0307"), UNICODE_CI, 0); + assertFindInSet("i\u0307", UTF8String.fromString("İ,"), UTF8_BINARY, 0); + assertFindInSet("i\u0307", UTF8String.fromString("İ,"), UTF8_LCASE, 1); + assertFindInSet("i\u0307", UTF8String.fromString("İ,"), UNICODE, 0); + assertFindInSet("i\u0307", UTF8String.fromString("İ,"), UNICODE_CI, 1); + assertFindInSet("i", UTF8String.fromString("İ,"), UTF8_BINARY, 0); + assertFindInSet("i", UTF8String.fromString("İ,"), UTF8_LCASE, 0); + assertFindInSet("i", UTF8String.fromString("İ,"), UNICODE, 0); + assertFindInSet("i", UTF8String.fromString("İ,"), UNICODE_CI, 0); + assertFindInSet("i\u0307", UTF8String.fromString("i\u0307,"), UTF8_BINARY, 1); + assertFindInSet("i\u0307", UTF8String.fromString("i\u0307,"), UTF8_LCASE, 1); + assertFindInSet("i\u0307", 
UTF8String.fromString("i\u0307,"), UNICODE, 1); + assertFindInSet("i\u0307", UTF8String.fromString("i\u0307,"), UNICODE_CI, 1); + assertFindInSet("i", UTF8String.fromString("i\u0307,"), UTF8_BINARY, 0); + assertFindInSet("i", UTF8String.fromString("i\u0307,"), UTF8_LCASE, 0); + assertFindInSet("i", UTF8String.fromString("i\u0307,"), UNICODE, 0); + assertFindInSet("i", UTF8String.fromString("i\u0307,"), UNICODE_CI, 0); + assertFindInSet("i\u0307", UTF8String.fromString("ab,İ"), UTF8_BINARY, 0); + assertFindInSet("i\u0307", UTF8String.fromString("ab,İ"), UTF8_LCASE, 2); + assertFindInSet("i\u0307", UTF8String.fromString("ab,İ"), UNICODE, 0); + assertFindInSet("i\u0307", UTF8String.fromString("ab,İ"), UNICODE_CI, 2); + assertFindInSet("i", UTF8String.fromString("ab,İ"), UTF8_BINARY, 0); + assertFindInSet("i", UTF8String.fromString("ab,İ"), UTF8_LCASE, 0); + assertFindInSet("i", UTF8String.fromString("ab,İ"), UNICODE, 0); + assertFindInSet("i", UTF8String.fromString("ab,İ"), UNICODE_CI, 0); + assertFindInSet("i\u0307", UTF8String.fromString("ab,i\u0307"), UTF8_BINARY, 2); + assertFindInSet("i\u0307", UTF8String.fromString("ab,i\u0307"), UTF8_LCASE, 2); + assertFindInSet("i\u0307", UTF8String.fromString("ab,i\u0307"), UNICODE, 2); + assertFindInSet("i\u0307", UTF8String.fromString("ab,i\u0307"), UNICODE_CI, 2); + assertFindInSet("i", UTF8String.fromString("ab,i\u0307"), UTF8_BINARY, 0); + assertFindInSet("i", UTF8String.fromString("ab,i\u0307"), UTF8_LCASE, 0); + assertFindInSet("i", UTF8String.fromString("ab,i\u0307"), UNICODE, 0); + assertFindInSet("i", UTF8String.fromString("ab,i\u0307"), UNICODE_CI, 0); + assertFindInSet("İ", UTF8String.fromString("ab,i\u0307"), UTF8_BINARY, 0); + assertFindInSet("İ", UTF8String.fromString("ab,i\u0307"), UTF8_LCASE, 2); + assertFindInSet("İ", UTF8String.fromString("ab,i\u0307"), UNICODE, 0); + assertFindInSet("İ", UTF8String.fromString("ab,i\u0307"), UNICODE_CI, 2); + assertFindInSet("i\u0307", UTF8String.fromString("ab,İ,12"), 
UTF8_BINARY, 0); + assertFindInSet("i\u0307", UTF8String.fromString("ab,İ,12"), UTF8_LCASE, 2); + assertFindInSet("i\u0307", UTF8String.fromString("ab,İ,12"), UNICODE, 0); + assertFindInSet("i\u0307", UTF8String.fromString("ab,İ,12"), UNICODE_CI, 2); + assertFindInSet("i", UTF8String.fromString("ab,İ,12"), UTF8_BINARY, 0); + assertFindInSet("i", UTF8String.fromString("ab,İ,12"), UTF8_LCASE, 0); + assertFindInSet("i", UTF8String.fromString("ab,İ,12"), UNICODE, 0); + assertFindInSet("i", UTF8String.fromString("ab,İ,12"), UNICODE_CI, 0); + assertFindInSet("i\u0307", UTF8String.fromString("ab,i\u0307,12"), UTF8_BINARY, 2); + assertFindInSet("i\u0307", UTF8String.fromString("ab,i\u0307,12"), UTF8_LCASE, 2); + assertFindInSet("i\u0307", UTF8String.fromString("ab,i\u0307,12"), UNICODE, 2); + assertFindInSet("i\u0307", UTF8String.fromString("ab,i\u0307,12"), UNICODE_CI, 2); + assertFindInSet("i", UTF8String.fromString("ab,i\u0307,12"), UTF8_BINARY, 0); + assertFindInSet("i", UTF8String.fromString("ab,i\u0307,12"), UTF8_LCASE, 0); + assertFindInSet("i", UTF8String.fromString("ab,i\u0307,12"), UNICODE, 0); + assertFindInSet("i", UTF8String.fromString("ab,i\u0307,12"), UNICODE_CI, 0); + assertFindInSet("i\u0307o", UTF8String.fromString("ab,İo,12"), UTF8_BINARY, 0); + assertFindInSet("i\u0307o", UTF8String.fromString("ab,İo,12"), UTF8_LCASE, 2); + assertFindInSet("i\u0307o", UTF8String.fromString("ab,İo,12"), UNICODE, 0); + assertFindInSet("i\u0307o", UTF8String.fromString("ab,İo,12"), UNICODE_CI, 2); + assertFindInSet("İo", UTF8String.fromString("ab,i\u0307o,12"), UTF8_BINARY, 0); + assertFindInSet("İo", UTF8String.fromString("ab,i\u0307o,12"), UTF8_LCASE, 2); + assertFindInSet("İo", UTF8String.fromString("ab,i\u0307o,12"), UNICODE, 0); + assertFindInSet("İo", UTF8String.fromString("ab,i\u0307o,12"), UNICODE_CI, 2); // Conditional case mapping (e.g. Greek sigmas). 
- assertFindInSet("σ", UTF8String.fromString("σ"), "UTF8_BINARY", 1); - assertFindInSet("σ", UTF8String.fromString("ς"), "UTF8_BINARY", 0); - assertFindInSet("σ", UTF8String.fromString("Σ"), "UTF8_BINARY", 0); - assertFindInSet("ς", UTF8String.fromString("σ"), "UTF8_BINARY", 0); - assertFindInSet("ς", UTF8String.fromString("ς"), "UTF8_BINARY", 1); - assertFindInSet("ς", UTF8String.fromString("Σ"), "UTF8_BINARY", 0); - assertFindInSet("Σ", UTF8String.fromString("σ"), "UTF8_BINARY", 0); - assertFindInSet("Σ", UTF8String.fromString("ς"), "UTF8_BINARY", 0); - assertFindInSet("Σ", UTF8String.fromString("Σ"), "UTF8_BINARY", 1); - assertFindInSet("σ", UTF8String.fromString("σ"), "UTF8_LCASE", 1); - assertFindInSet("σ", UTF8String.fromString("ς"), "UTF8_LCASE", 1); - assertFindInSet("σ", UTF8String.fromString("Σ"), "UTF8_LCASE", 1); - assertFindInSet("ς", UTF8String.fromString("σ"), "UTF8_LCASE", 1); - assertFindInSet("ς", UTF8String.fromString("ς"), "UTF8_LCASE", 1); - assertFindInSet("ς", UTF8String.fromString("Σ"), "UTF8_LCASE", 1); - assertFindInSet("Σ", UTF8String.fromString("σ"), "UTF8_LCASE", 1); - assertFindInSet("Σ", UTF8String.fromString("ς"), "UTF8_LCASE", 1); - assertFindInSet("Σ", UTF8String.fromString("Σ"), "UTF8_LCASE", 1); - assertFindInSet("σ", UTF8String.fromString("σ"), "UNICODE", 1); - assertFindInSet("σ", UTF8String.fromString("ς"), "UNICODE", 0); - assertFindInSet("σ", UTF8String.fromString("Σ"), "UNICODE", 0); - assertFindInSet("ς", UTF8String.fromString("σ"), "UNICODE", 0); - assertFindInSet("ς", UTF8String.fromString("ς"), "UNICODE", 1); - assertFindInSet("ς", UTF8String.fromString("Σ"), "UNICODE", 0); - assertFindInSet("Σ", UTF8String.fromString("σ"), "UNICODE", 0); - assertFindInSet("Σ", UTF8String.fromString("ς"), "UNICODE", 0); - assertFindInSet("Σ", UTF8String.fromString("Σ"), "UNICODE", 1); - assertFindInSet("σ", UTF8String.fromString("σ"), "UNICODE_CI", 1); - assertFindInSet("σ", UTF8String.fromString("ς"), "UNICODE_CI", 1); - 
assertFindInSet("σ", UTF8String.fromString("Σ"), "UNICODE_CI", 1); - assertFindInSet("ς", UTF8String.fromString("σ"), "UNICODE_CI", 1); - assertFindInSet("ς", UTF8String.fromString("ς"), "UNICODE_CI", 1); - assertFindInSet("ς", UTF8String.fromString("Σ"), "UNICODE_CI", 1); - assertFindInSet("Σ", UTF8String.fromString("σ"), "UNICODE_CI", 1); - assertFindInSet("Σ", UTF8String.fromString("ς"), "UNICODE_CI", 1); - assertFindInSet("Σ", UTF8String.fromString("Σ"), "UNICODE_CI", 1); + assertFindInSet("σ", UTF8String.fromString("σ"), UTF8_BINARY, 1); + assertFindInSet("σ", UTF8String.fromString("ς"), UTF8_BINARY, 0); + assertFindInSet("σ", UTF8String.fromString("Σ"), UTF8_BINARY, 0); + assertFindInSet("ς", UTF8String.fromString("σ"), UTF8_BINARY, 0); + assertFindInSet("ς", UTF8String.fromString("ς"), UTF8_BINARY, 1); + assertFindInSet("ς", UTF8String.fromString("Σ"), UTF8_BINARY, 0); + assertFindInSet("Σ", UTF8String.fromString("σ"), UTF8_BINARY, 0); + assertFindInSet("Σ", UTF8String.fromString("ς"), UTF8_BINARY, 0); + assertFindInSet("Σ", UTF8String.fromString("Σ"), UTF8_BINARY, 1); + assertFindInSet("σ", UTF8String.fromString("σ"), UTF8_LCASE, 1); + assertFindInSet("σ", UTF8String.fromString("ς"), UTF8_LCASE, 1); + assertFindInSet("σ", UTF8String.fromString("Σ"), UTF8_LCASE, 1); + assertFindInSet("ς", UTF8String.fromString("σ"), UTF8_LCASE, 1); + assertFindInSet("ς", UTF8String.fromString("ς"), UTF8_LCASE, 1); + assertFindInSet("ς", UTF8String.fromString("Σ"), UTF8_LCASE, 1); + assertFindInSet("Σ", UTF8String.fromString("σ"), UTF8_LCASE, 1); + assertFindInSet("Σ", UTF8String.fromString("ς"), UTF8_LCASE, 1); + assertFindInSet("Σ", UTF8String.fromString("Σ"), UTF8_LCASE, 1); + assertFindInSet("σ", UTF8String.fromString("σ"), UNICODE, 1); + assertFindInSet("σ", UTF8String.fromString("ς"), UNICODE, 0); + assertFindInSet("σ", UTF8String.fromString("Σ"), UNICODE, 0); + assertFindInSet("ς", UTF8String.fromString("σ"), UNICODE, 0); + assertFindInSet("ς", 
UTF8String.fromString("ς"), UNICODE, 1); + assertFindInSet("ς", UTF8String.fromString("Σ"), UNICODE, 0); + assertFindInSet("Σ", UTF8String.fromString("σ"), UNICODE, 0); + assertFindInSet("Σ", UTF8String.fromString("ς"), UNICODE, 0); + assertFindInSet("Σ", UTF8String.fromString("Σ"), UNICODE, 1); + assertFindInSet("σ", UTF8String.fromString("σ"), UNICODE_CI, 1); + assertFindInSet("σ", UTF8String.fromString("ς"), UNICODE_CI, 1); + assertFindInSet("σ", UTF8String.fromString("Σ"), UNICODE_CI, 1); + assertFindInSet("ς", UTF8String.fromString("σ"), UNICODE_CI, 1); + assertFindInSet("ς", UTF8String.fromString("ς"), UNICODE_CI, 1); + assertFindInSet("ς", UTF8String.fromString("Σ"), UNICODE_CI, 1); + assertFindInSet("Σ", UTF8String.fromString("σ"), UNICODE_CI, 1); + assertFindInSet("Σ", UTF8String.fromString("ς"), UNICODE_CI, 1); + assertFindInSet("Σ", UTF8String.fromString("Σ"), UNICODE_CI, 1); // Surrogate pairs. - assertFindInSet("a", UTF8String.fromString("a🙃,b,🙃c"), "UTF8_BINARY", 0); - assertFindInSet("a", UTF8String.fromString("a🙃,b,🙃c"), "UTF8_LCASE", 0); - assertFindInSet("a", UTF8String.fromString("a🙃,b,🙃c"), "UNICODE", 0); - assertFindInSet("a", UTF8String.fromString("a🙃,b,🙃c"), "UNICODE_CI", 0); - assertFindInSet("a🙃", UTF8String.fromString("a🙃,b,🙃c"), "UTF8_BINARY", 1); - assertFindInSet("a🙃", UTF8String.fromString("a🙃,b,🙃c"), "UTF8_LCASE", 1); - assertFindInSet("a🙃", UTF8String.fromString("a🙃,b,🙃c"), "UNICODE", 1); - assertFindInSet("a🙃", UTF8String.fromString("a🙃,b,🙃c"), "UNICODE_CI", 1); - assertFindInSet("b", UTF8String.fromString("a🙃,b,🙃c"), "UTF8_BINARY", 2); - assertFindInSet("b", UTF8String.fromString("a🙃,b,🙃c"), "UTF8_LCASE", 2); - assertFindInSet("b", UTF8String.fromString("a🙃,b,🙃c"), "UNICODE", 2); - assertFindInSet("b", UTF8String.fromString("a🙃,b,🙃c"), "UNICODE_CI", 2); - assertFindInSet("🙃c", UTF8String.fromString("a🙃,b,🙃c"), "UTF8_BINARY", 3); - assertFindInSet("🙃c", UTF8String.fromString("a🙃,b,🙃c"), "UTF8_LCASE", 3); - assertFindInSet("🙃c", 
UTF8String.fromString("a🙃,b,🙃c"), "UNICODE", 3); - assertFindInSet("🙃c", UTF8String.fromString("a🙃,b,🙃c"), "UNICODE_CI", 3); - assertFindInSet("😄😆", UTF8String.fromString("😀😆,😃😄"), "UTF8_BINARY", 0); - assertFindInSet("😄😆", UTF8String.fromString("😀😆,😃😄"), "UTF8_LCASE", 0); - assertFindInSet("😄😆", UTF8String.fromString("😀😆,😃😄"), "UNICODE", 0); - assertFindInSet("😄😆", UTF8String.fromString("😀😆,😃😄"), "UNICODE_CI", 0); - assertFindInSet("😀😆", UTF8String.fromString("😀😆,😃😄"), "UTF8_BINARY", 1); - assertFindInSet("😀😆", UTF8String.fromString("😀😆,😃😄"), "UTF8_LCASE", 1); - assertFindInSet("😀😆", UTF8String.fromString("😀😆,😃😄"), "UNICODE", 1); - assertFindInSet("😀😆", UTF8String.fromString("😀😆,😃😄"), "UNICODE_CI", 1); - assertFindInSet("😃😄", UTF8String.fromString("😀😆,😃😄"), "UTF8_BINARY", 2); - assertFindInSet("😃😄", UTF8String.fromString("😀😆,😃😄"), "UTF8_LCASE", 2); - assertFindInSet("😃😄", UTF8String.fromString("😀😆,😃😄"), "UNICODE", 2); - assertFindInSet("😃😄", UTF8String.fromString("😀😆,😃😄"), "UNICODE_CI", 2); - assertFindInSet("x", UTF8String.fromString("a,𐐅,𝔸"), "UTF8_BINARY", 0); - assertFindInSet("x", UTF8String.fromString("a,𐐅,𝔸"), "UTF8_LCASE", 0); - assertFindInSet("x", UTF8String.fromString("a,𐐅,𝔸"), "UNICODE", 0); - assertFindInSet("x", UTF8String.fromString("a,𐐅,𝔸"), "UNICODE_CI", 0); - assertFindInSet("a", UTF8String.fromString("a,𐐅,𝔸"), "UTF8_BINARY", 1); - assertFindInSet("a", UTF8String.fromString("a,𐐅,𝔸"), "UTF8_LCASE", 1); - assertFindInSet("a", UTF8String.fromString("a,𐐅,𝔸"), "UNICODE", 1); - assertFindInSet("a", UTF8String.fromString("a,𐐅,𝔸"), "UNICODE_CI", 1); - assertFindInSet("A", UTF8String.fromString("a,𐐅,𝔸"), "UTF8_BINARY", 0); - assertFindInSet("A", UTF8String.fromString("a,𐐅,𝔸"), "UTF8_LCASE", 1); - assertFindInSet("A", UTF8String.fromString("a,𐐅,𝔸"), "UNICODE", 0); - assertFindInSet("A", UTF8String.fromString("a,𐐅,𝔸"), "UNICODE_CI", 1); - assertFindInSet("𝔸", UTF8String.fromString("a,𐐅,𝔸"), "UTF8_BINARY", 3); - assertFindInSet("𝔸", 
UTF8String.fromString("a,𐐅,𝔸"), "UTF8_LCASE", 3); - assertFindInSet("𝔸", UTF8String.fromString("a,𐐅,𝔸"), "UNICODE", 3); - assertFindInSet("𝔸", UTF8String.fromString("a,𐐅,𝔸"), "UNICODE_CI", 1); - assertFindInSet("𐐅", UTF8String.fromString("a,𐐅,𝔸"), "UTF8_BINARY", 2); - assertFindInSet("𐐅", UTF8String.fromString("a,𐐅,𝔸"), "UTF8_LCASE", 2); - assertFindInSet("𐐅", UTF8String.fromString("a,𐐅,𝔸"), "UNICODE", 2); - assertFindInSet("𐐅", UTF8String.fromString("a,𐐅,𝔸"), "UNICODE_CI", 2); - assertFindInSet("𐐭", UTF8String.fromString("a,𐐅,𝔸"), "UTF8_BINARY", 0); - assertFindInSet("𐐭", UTF8String.fromString("a,𐐅,𝔸"), "UTF8_LCASE", 2); - assertFindInSet("𐐭", UTF8String.fromString("a,𐐅,𝔸"), "UNICODE", 0); - assertFindInSet("𐐭", UTF8String.fromString("a,𐐅,𝔸"), "UNICODE_CI", 2); + assertFindInSet("a", UTF8String.fromString("a🙃,b,🙃c"), UTF8_BINARY, 0); + assertFindInSet("a", UTF8String.fromString("a🙃,b,🙃c"), UTF8_LCASE, 0); + assertFindInSet("a", UTF8String.fromString("a🙃,b,🙃c"), UNICODE, 0); + assertFindInSet("a", UTF8String.fromString("a🙃,b,🙃c"), UNICODE_CI, 0); + assertFindInSet("a🙃", UTF8String.fromString("a🙃,b,🙃c"), UTF8_BINARY, 1); + assertFindInSet("a🙃", UTF8String.fromString("a🙃,b,🙃c"), UTF8_LCASE, 1); + assertFindInSet("a🙃", UTF8String.fromString("a🙃,b,🙃c"), UNICODE, 1); + assertFindInSet("a🙃", UTF8String.fromString("a🙃,b,🙃c"), UNICODE_CI, 1); + assertFindInSet("b", UTF8String.fromString("a🙃,b,🙃c"), UTF8_BINARY, 2); + assertFindInSet("b", UTF8String.fromString("a🙃,b,🙃c"), UTF8_LCASE, 2); + assertFindInSet("b", UTF8String.fromString("a🙃,b,🙃c"), UNICODE, 2); + assertFindInSet("b", UTF8String.fromString("a🙃,b,🙃c"), UNICODE_CI, 2); + assertFindInSet("🙃c", UTF8String.fromString("a🙃,b,🙃c"), UTF8_BINARY, 3); + assertFindInSet("🙃c", UTF8String.fromString("a🙃,b,🙃c"), UTF8_LCASE, 3); + assertFindInSet("🙃c", UTF8String.fromString("a🙃,b,🙃c"), UNICODE, 3); + assertFindInSet("🙃c", UTF8String.fromString("a🙃,b,🙃c"), UNICODE_CI, 3); + assertFindInSet("😄😆", UTF8String.fromString("😀😆,😃😄"), 
UTF8_BINARY, 0); + assertFindInSet("😄😆", UTF8String.fromString("😀😆,😃😄"), UTF8_LCASE, 0); + assertFindInSet("😄😆", UTF8String.fromString("😀😆,😃😄"), UNICODE, 0); + assertFindInSet("😄😆", UTF8String.fromString("😀😆,😃😄"), UNICODE_CI, 0); + assertFindInSet("😀😆", UTF8String.fromString("😀😆,😃😄"), UTF8_BINARY, 1); + assertFindInSet("😀😆", UTF8String.fromString("😀😆,😃😄"), UTF8_LCASE, 1); + assertFindInSet("😀😆", UTF8String.fromString("😀😆,😃😄"), UNICODE, 1); + assertFindInSet("😀😆", UTF8String.fromString("😀😆,😃😄"), UNICODE_CI, 1); + assertFindInSet("😃😄", UTF8String.fromString("😀😆,😃😄"), UTF8_BINARY, 2); + assertFindInSet("😃😄", UTF8String.fromString("😀😆,😃😄"), UTF8_LCASE, 2); + assertFindInSet("😃😄", UTF8String.fromString("😀😆,😃😄"), UNICODE, 2); + assertFindInSet("😃😄", UTF8String.fromString("😀😆,😃😄"), UNICODE_CI, 2); + assertFindInSet("x", UTF8String.fromString("a,𐐅,𝔸"), UTF8_BINARY, 0); + assertFindInSet("x", UTF8String.fromString("a,𐐅,𝔸"), UTF8_LCASE, 0); + assertFindInSet("x", UTF8String.fromString("a,𐐅,𝔸"), UNICODE, 0); + assertFindInSet("x", UTF8String.fromString("a,𐐅,𝔸"), UNICODE_CI, 0); + assertFindInSet("a", UTF8String.fromString("a,𐐅,𝔸"), UTF8_BINARY, 1); + assertFindInSet("a", UTF8String.fromString("a,𐐅,𝔸"), UTF8_LCASE, 1); + assertFindInSet("a", UTF8String.fromString("a,𐐅,𝔸"), UNICODE, 1); + assertFindInSet("a", UTF8String.fromString("a,𐐅,𝔸"), UNICODE_CI, 1); + assertFindInSet("A", UTF8String.fromString("a,𐐅,𝔸"), UTF8_BINARY, 0); + assertFindInSet("A", UTF8String.fromString("a,𐐅,𝔸"), UTF8_LCASE, 1); + assertFindInSet("A", UTF8String.fromString("a,𐐅,𝔸"), UNICODE, 0); + assertFindInSet("A", UTF8String.fromString("a,𐐅,𝔸"), UNICODE_CI, 1); + assertFindInSet("𝔸", UTF8String.fromString("a,𐐅,𝔸"), UTF8_BINARY, 3); + assertFindInSet("𝔸", UTF8String.fromString("a,𐐅,𝔸"), UTF8_LCASE, 3); + assertFindInSet("𝔸", UTF8String.fromString("a,𐐅,𝔸"), UNICODE, 3); + assertFindInSet("𝔸", UTF8String.fromString("a,𐐅,𝔸"), UNICODE_CI, 1); + assertFindInSet("𐐅", UTF8String.fromString("a,𐐅,𝔸"), UTF8_BINARY, 
2); + assertFindInSet("𐐅", UTF8String.fromString("a,𐐅,𝔸"), UTF8_LCASE, 2); + assertFindInSet("𐐅", UTF8String.fromString("a,𐐅,𝔸"), UNICODE, 2); + assertFindInSet("𐐅", UTF8String.fromString("a,𐐅,𝔸"), UNICODE_CI, 2); + assertFindInSet("𐐭", UTF8String.fromString("a,𐐅,𝔸"), UTF8_BINARY, 0); + assertFindInSet("𐐭", UTF8String.fromString("a,𐐅,𝔸"), UTF8_LCASE, 2); + assertFindInSet("𐐭", UTF8String.fromString("a,𐐅,𝔸"), UNICODE, 0); + assertFindInSet("𐐭", UTF8String.fromString("a,𐐅,𝔸"), UNICODE_CI, 2); // Invalid UTF8 strings assertFindInSet("C", UTF8String.fromBytes( new byte[] { 0x41, (byte) 0xC2, 0x2C, 0x42, 0x2C, 0x43, 0x2C, 0x43, 0x2C, 0x56 }), - "UTF8_BINARY", 3); + UTF8_BINARY, 3); assertFindInSet("c", UTF8String.fromBytes( new byte[] { 0x41, (byte) 0xC2, 0x2C, 0x42, 0x2C, 0x43, 0x2C, 0x43, 0x2C, 0x56 }), - "UTF8_LCASE", 2); + UTF8_LCASE, 2); assertFindInSet("C", UTF8String.fromBytes( new byte[] { 0x41, (byte) 0xC2, 0x2C, 0x42, 0x2C, 0x43, 0x2C, 0x43, 0x2C, 0x56 }), - "UNICODE", 2); + UNICODE, 2); assertFindInSet("c", UTF8String.fromBytes( new byte[] { 0x41, (byte) 0xC2, 0x2C, 0x42, 0x2C, 0x43, 0x2C, 0x43, 0x2C, 0x56 }), - "UNICODE_CI", 2); + UNICODE_CI, 2); } /** @@ -1986,145 +1987,145 @@ private void assertStringReplace(String source, String search, String replace, @Test public void testStringReplace() throws SparkException { // Empty strings. 
- assertStringReplace("", "", "", "UTF8_BINARY", ""); - assertStringReplace("", "", "", "UTF8_LCASE", ""); - assertStringReplace("", "", "", "UNICODE", ""); - assertStringReplace("", "", "", "UNICODE_CI", ""); - assertStringReplace("abc", "", "", "UTF8_BINARY", "abc"); - assertStringReplace("abc", "", "", "UTF8_LCASE", "abc"); - assertStringReplace("abc", "", "", "UNICODE", "abc"); - assertStringReplace("abc", "", "", "UNICODE_CI", "abc"); - assertStringReplace("", "x", "", "UTF8_BINARY", ""); - assertStringReplace("", "x", "", "UTF8_LCASE", ""); - assertStringReplace("", "x", "", "UNICODE", ""); - assertStringReplace("", "x", "", "UNICODE_CI", ""); - assertStringReplace("", "", "x", "UTF8_BINARY", ""); - assertStringReplace("", "", "x", "UTF8_LCASE", ""); - assertStringReplace("", "", "x", "UNICODE", ""); - assertStringReplace("", "", "x", "UNICODE_CI", ""); - assertStringReplace("", "b", "x", "UTF8_BINARY", ""); - assertStringReplace("", "b", "x", "UTF8_LCASE", ""); - assertStringReplace("", "b", "x", "UNICODE", ""); - assertStringReplace("", "b", "x", "UNICODE_CI", ""); - assertStringReplace("abc", "b", "", "UTF8_BINARY", "ac"); - assertStringReplace("abc", "b", "", "UTF8_LCASE", "ac"); - assertStringReplace("abc", "b", "", "UNICODE", "ac"); - assertStringReplace("abc", "b", "", "UNICODE_CI", "ac"); - assertStringReplace("abc", "", "x", "UTF8_BINARY", "abc"); - assertStringReplace("abc", "", "x", "UTF8_LCASE", "abc"); - assertStringReplace("abc", "", "x", "UNICODE", "abc"); - assertStringReplace("abc", "", "x", "UNICODE_CI", "abc"); + assertStringReplace("", "", "", UTF8_BINARY, ""); + assertStringReplace("", "", "", UTF8_LCASE, ""); + assertStringReplace("", "", "", UNICODE, ""); + assertStringReplace("", "", "", UNICODE_CI, ""); + assertStringReplace("abc", "", "", UTF8_BINARY, "abc"); + assertStringReplace("abc", "", "", UTF8_LCASE, "abc"); + assertStringReplace("abc", "", "", UNICODE, "abc"); + assertStringReplace("abc", "", "", UNICODE_CI, "abc"); + 
assertStringReplace("", "x", "", UTF8_BINARY, ""); + assertStringReplace("", "x", "", UTF8_LCASE, ""); + assertStringReplace("", "x", "", UNICODE, ""); + assertStringReplace("", "x", "", UNICODE_CI, ""); + assertStringReplace("", "", "x", UTF8_BINARY, ""); + assertStringReplace("", "", "x", UTF8_LCASE, ""); + assertStringReplace("", "", "x", UNICODE, ""); + assertStringReplace("", "", "x", UNICODE_CI, ""); + assertStringReplace("", "b", "x", UTF8_BINARY, ""); + assertStringReplace("", "b", "x", UTF8_LCASE, ""); + assertStringReplace("", "b", "x", UNICODE, ""); + assertStringReplace("", "b", "x", UNICODE_CI, ""); + assertStringReplace("abc", "b", "", UTF8_BINARY, "ac"); + assertStringReplace("abc", "b", "", UTF8_LCASE, "ac"); + assertStringReplace("abc", "b", "", UNICODE, "ac"); + assertStringReplace("abc", "b", "", UNICODE_CI, "ac"); + assertStringReplace("abc", "", "x", UTF8_BINARY, "abc"); + assertStringReplace("abc", "", "x", UTF8_LCASE, "abc"); + assertStringReplace("abc", "", "x", UNICODE, "abc"); + assertStringReplace("abc", "", "x", UNICODE_CI, "abc"); // Basic tests. 
- assertStringReplace("replace", "pl", "", "UTF8_BINARY", "reace"); - assertStringReplace("replace", "pl", "", "UTF8_LCASE", "reace"); - assertStringReplace("replace", "pl", "", "UNICODE", "reace"); - assertStringReplace("replace", "pl", "", "UNICODE_CI", "reace"); - assertStringReplace("replace", "", "123", "UTF8_BINARY", "replace"); - assertStringReplace("replace", "", "123", "UTF8_LCASE", "replace"); - assertStringReplace("replace", "", "123", "UNICODE", "replace"); - assertStringReplace("replace", "", "123", "UNICODE_CI", "replace"); - assertStringReplace("abcabc", "b", "12", "UTF8_BINARY", "a12ca12c"); - assertStringReplace("abcabc", "b", "12", "UTF8_LCASE", "a12ca12c"); - assertStringReplace("abcabc", "b", "12", "UNICODE", "a12ca12c"); - assertStringReplace("abcabc", "b", "12", "UNICODE_CI", "a12ca12c"); - assertStringReplace("replace", "plx", "123", "UTF8_BINARY", "replace"); - assertStringReplace("replace", "plx", "123", "UTF8_LCASE", "replace"); - assertStringReplace("replace", "plx", "123", "UNICODE", "replace"); - assertStringReplace("replace", "plx", "123", "UNICODE_CI", "replace"); - assertStringReplace("Replace", "re", "", "UTF8_BINARY", "Replace"); - assertStringReplace("Replace", "re", "", "UTF8_LCASE", "place"); - assertStringReplace("Replace", "re", "", "UNICODE", "Replace"); - assertStringReplace("Replace", "re", "", "UNICODE_CI", "place"); - assertStringReplace("abcdabcd", "Bc", "", "UTF8_BINARY", "abcdabcd"); - assertStringReplace("abcdabcd", "Bc", "", "UTF8_LCASE", "adad"); - assertStringReplace("abcdabcd", "Bc", "", "UNICODE", "abcdabcd"); - assertStringReplace("abcdabcd", "Bc", "", "UNICODE_CI", "adad"); - assertStringReplace("AbcdabCd", "Bc", "", "UTF8_BINARY", "AbcdabCd"); - assertStringReplace("AbcdabCd", "Bc", "", "UTF8_LCASE", "Adad"); - assertStringReplace("AbcdabCd", "Bc", "", "UNICODE", "AbcdabCd"); - assertStringReplace("AbcdabCd", "Bc", "", "UNICODE_CI", "Adad"); + assertStringReplace("replace", "pl", "", UTF8_BINARY, "reace"); + 
assertStringReplace("replace", "pl", "", UTF8_LCASE, "reace"); + assertStringReplace("replace", "pl", "", UNICODE, "reace"); + assertStringReplace("replace", "pl", "", UNICODE_CI, "reace"); + assertStringReplace("replace", "", "123", UTF8_BINARY, "replace"); + assertStringReplace("replace", "", "123", UTF8_LCASE, "replace"); + assertStringReplace("replace", "", "123", UNICODE, "replace"); + assertStringReplace("replace", "", "123", UNICODE_CI, "replace"); + assertStringReplace("abcabc", "b", "12", UTF8_BINARY, "a12ca12c"); + assertStringReplace("abcabc", "b", "12", UTF8_LCASE, "a12ca12c"); + assertStringReplace("abcabc", "b", "12", UNICODE, "a12ca12c"); + assertStringReplace("abcabc", "b", "12", UNICODE_CI, "a12ca12c"); + assertStringReplace("replace", "plx", "123", UTF8_BINARY, "replace"); + assertStringReplace("replace", "plx", "123", UTF8_LCASE, "replace"); + assertStringReplace("replace", "plx", "123", UNICODE, "replace"); + assertStringReplace("replace", "plx", "123", UNICODE_CI, "replace"); + assertStringReplace("Replace", "re", "", UTF8_BINARY, "Replace"); + assertStringReplace("Replace", "re", "", UTF8_LCASE, "place"); + assertStringReplace("Replace", "re", "", UNICODE, "Replace"); + assertStringReplace("Replace", "re", "", UNICODE_CI, "place"); + assertStringReplace("abcdabcd", "Bc", "", UTF8_BINARY, "abcdabcd"); + assertStringReplace("abcdabcd", "Bc", "", UTF8_LCASE, "adad"); + assertStringReplace("abcdabcd", "Bc", "", UNICODE, "abcdabcd"); + assertStringReplace("abcdabcd", "Bc", "", UNICODE_CI, "adad"); + assertStringReplace("AbcdabCd", "Bc", "", UTF8_BINARY, "AbcdabCd"); + assertStringReplace("AbcdabCd", "Bc", "", UTF8_LCASE, "Adad"); + assertStringReplace("AbcdabCd", "Bc", "", UNICODE, "AbcdabCd"); + assertStringReplace("AbcdabCd", "Bc", "", UNICODE_CI, "Adad"); // Advanced tests. 
- assertStringReplace("abcdabcd", "bc", "", "UTF8_BINARY", "adad"); - assertStringReplace("r世eplace", "pl", "123", "UTF8_BINARY", "r世e123ace"); - assertStringReplace("世Replace", "re", "", "UTF8_BINARY", "世Replace"); - assertStringReplace("r世eplace", "pl", "xx", "UTF8_LCASE", "r世exxace"); - assertStringReplace("repl世ace", "PL", "AB", "UTF8_LCASE", "reAB世ace"); - assertStringReplace("re世place", "世", "x", "UTF8_LCASE", "rexplace"); - assertStringReplace("re世place", "plx", "123", "UNICODE", "re世place"); - assertStringReplace("replace世", "", "123", "UNICODE", "replace世"); - assertStringReplace("aBc世abc", "b", "12", "UNICODE", "aBc世a12c"); - assertStringReplace("aBc世abc", "b", "12", "UNICODE_CI", "a12c世a12c"); - assertStringReplace("a世Bcdabcd", "bC", "", "UNICODE_CI", "a世dad"); - assertStringReplace("repl世ace", "Pl", "", "UNICODE_CI", "re世ace"); + assertStringReplace("abcdabcd", "bc", "", UTF8_BINARY, "adad"); + assertStringReplace("r世eplace", "pl", "123", UTF8_BINARY, "r世e123ace"); + assertStringReplace("世Replace", "re", "", UTF8_BINARY, "世Replace"); + assertStringReplace("r世eplace", "pl", "xx", UTF8_LCASE, "r世exxace"); + assertStringReplace("repl世ace", "PL", "AB", UTF8_LCASE, "reAB世ace"); + assertStringReplace("re世place", "世", "x", UTF8_LCASE, "rexplace"); + assertStringReplace("re世place", "plx", "123", UNICODE, "re世place"); + assertStringReplace("replace世", "", "123", UNICODE, "replace世"); + assertStringReplace("aBc世abc", "b", "12", UNICODE, "aBc世a12c"); + assertStringReplace("aBc世abc", "b", "12", UNICODE_CI, "a12c世a12c"); + assertStringReplace("a世Bcdabcd", "bC", "", UNICODE_CI, "a世dad"); + assertStringReplace("repl世ace", "Pl", "", UNICODE_CI, "re世ace"); assertStringReplace("abcčšdabĆŠscd", "cs", "", "SR_CI_AI", "abcdabscd"); // One-to-many case mapping (e.g. Turkish dotted I). 
- assertStringReplace("abi̇12", "i", "X", "UNICODE_CI", "abi̇12"); - assertStringReplace("abi̇12", "\u0307", "X", "UNICODE_CI", "abi̇12"); - assertStringReplace("abi̇12", "İ", "X", "UNICODE_CI", "abX12"); - assertStringReplace("abİ12", "i", "X", "UNICODE_CI", "abİ12"); - assertStringReplace("İi̇İi̇İi̇", "i\u0307", "x", "UNICODE_CI", "xxxxxx"); - assertStringReplace("İi̇İi̇İi̇", "i", "x", "UNICODE_CI", "İi̇İi̇İi̇"); - assertStringReplace("abİo12i̇o", "i\u0307o", "xx", "UNICODE_CI", "abxx12xx"); - assertStringReplace("abi̇o12i̇o", "İo", "yy", "UNICODE_CI", "abyy12yy"); - assertStringReplace("abi̇12", "i", "X", "UTF8_LCASE", "abX\u030712"); // != UNICODE_CI - assertStringReplace("abi̇12", "\u0307", "X", "UTF8_LCASE", "abiX12"); // != UNICODE_CI - assertStringReplace("abi̇12", "İ", "X", "UTF8_LCASE", "abX12"); - assertStringReplace("abİ12", "i", "X", "UTF8_LCASE", "abİ12"); - assertStringReplace("İi̇İi̇İi̇", "i\u0307", "x", "UTF8_LCASE", "xxxxxx"); - assertStringReplace("İi̇İi̇İi̇", "i", "x", "UTF8_LCASE", + assertStringReplace("abi̇12", "i", "X", UNICODE_CI, "abi̇12"); + assertStringReplace("abi̇12", "\u0307", "X", UNICODE_CI, "abi̇12"); + assertStringReplace("abi̇12", "İ", "X", UNICODE_CI, "abX12"); + assertStringReplace("abİ12", "i", "X", UNICODE_CI, "abİ12"); + assertStringReplace("İi̇İi̇İi̇", "i\u0307", "x", UNICODE_CI, "xxxxxx"); + assertStringReplace("İi̇İi̇İi̇", "i", "x", UNICODE_CI, "İi̇İi̇İi̇"); + assertStringReplace("abİo12i̇o", "i\u0307o", "xx", UNICODE_CI, "abxx12xx"); + assertStringReplace("abi̇o12i̇o", "İo", "yy", UNICODE_CI, "abyy12yy"); + assertStringReplace("abi̇12", "i", "X", UTF8_LCASE, "abX\u030712"); // != UNICODE_CI + assertStringReplace("abi̇12", "\u0307", "X", UTF8_LCASE, "abiX12"); // != UNICODE_CI + assertStringReplace("abi̇12", "İ", "X", UTF8_LCASE, "abX12"); + assertStringReplace("abİ12", "i", "X", UTF8_LCASE, "abİ12"); + assertStringReplace("İi̇İi̇İi̇", "i\u0307", "x", UTF8_LCASE, "xxxxxx"); + assertStringReplace("İi̇İi̇İi̇", "i", "x", 
UTF8_LCASE, "İx\u0307İx\u0307İx\u0307"); // != UNICODE_CI - assertStringReplace("abİo12i̇o", "i\u0307o", "xx", "UTF8_LCASE", "abxx12xx"); - assertStringReplace("abi̇o12i̇o", "İo", "yy", "UTF8_LCASE", "abyy12yy"); + assertStringReplace("abİo12i̇o", "i\u0307o", "xx", UTF8_LCASE, "abxx12xx"); + assertStringReplace("abi̇o12i̇o", "İo", "yy", UTF8_LCASE, "abyy12yy"); // Conditional case mapping (e.g. Greek sigmas). - assertStringReplace("σ", "σ", "x", "UTF8_BINARY", "x"); - assertStringReplace("σ", "ς", "x", "UTF8_BINARY", "σ"); - assertStringReplace("σ", "Σ", "x", "UTF8_BINARY", "σ"); - assertStringReplace("ς", "σ", "x", "UTF8_BINARY", "ς"); - assertStringReplace("ς", "ς", "x", "UTF8_BINARY", "x"); - assertStringReplace("ς", "Σ", "x", "UTF8_BINARY", "ς"); - assertStringReplace("Σ", "σ", "x", "UTF8_BINARY", "Σ"); - assertStringReplace("Σ", "ς", "x", "UTF8_BINARY", "Σ"); - assertStringReplace("Σ", "Σ", "x", "UTF8_BINARY", "x"); - assertStringReplace("σ", "σ", "x", "UTF8_LCASE", "x"); - assertStringReplace("σ", "ς", "x", "UTF8_LCASE", "x"); - assertStringReplace("σ", "Σ", "x", "UTF8_LCASE", "x"); - assertStringReplace("ς", "σ", "x", "UTF8_LCASE", "x"); - assertStringReplace("ς", "ς", "x", "UTF8_LCASE", "x"); - assertStringReplace("ς", "Σ", "x", "UTF8_LCASE", "x"); - assertStringReplace("Σ", "σ", "x", "UTF8_LCASE", "x"); - assertStringReplace("Σ", "ς", "x", "UTF8_LCASE", "x"); - assertStringReplace("Σ", "Σ", "x", "UTF8_LCASE", "x"); - assertStringReplace("σ", "σ", "x", "UNICODE", "x"); - assertStringReplace("σ", "ς", "x", "UNICODE", "σ"); - assertStringReplace("σ", "Σ", "x", "UNICODE", "σ"); - assertStringReplace("ς", "σ", "x", "UNICODE", "ς"); - assertStringReplace("ς", "ς", "x", "UNICODE", "x"); - assertStringReplace("ς", "Σ", "x", "UNICODE", "ς"); - assertStringReplace("Σ", "σ", "x", "UNICODE", "Σ"); - assertStringReplace("Σ", "ς", "x", "UNICODE", "Σ"); - assertStringReplace("Σ", "Σ", "x", "UNICODE", "x"); - assertStringReplace("σ", "σ", "x", "UNICODE_CI", "x"); - 
assertStringReplace("σ", "ς", "x", "UNICODE_CI", "x"); - assertStringReplace("σ", "Σ", "x", "UNICODE_CI", "x"); - assertStringReplace("ς", "σ", "x", "UNICODE_CI", "x"); - assertStringReplace("ς", "ς", "x", "UNICODE_CI", "x"); - assertStringReplace("ς", "Σ", "x", "UNICODE_CI", "x"); - assertStringReplace("Σ", "σ", "x", "UNICODE_CI", "x"); - assertStringReplace("Σ", "ς", "x", "UNICODE_CI", "x"); - assertStringReplace("Σ", "Σ", "x", "UNICODE_CI", "x"); + assertStringReplace("σ", "σ", "x", UTF8_BINARY, "x"); + assertStringReplace("σ", "ς", "x", UTF8_BINARY, "σ"); + assertStringReplace("σ", "Σ", "x", UTF8_BINARY, "σ"); + assertStringReplace("ς", "σ", "x", UTF8_BINARY, "ς"); + assertStringReplace("ς", "ς", "x", UTF8_BINARY, "x"); + assertStringReplace("ς", "Σ", "x", UTF8_BINARY, "ς"); + assertStringReplace("Σ", "σ", "x", UTF8_BINARY, "Σ"); + assertStringReplace("Σ", "ς", "x", UTF8_BINARY, "Σ"); + assertStringReplace("Σ", "Σ", "x", UTF8_BINARY, "x"); + assertStringReplace("σ", "σ", "x", UTF8_LCASE, "x"); + assertStringReplace("σ", "ς", "x", UTF8_LCASE, "x"); + assertStringReplace("σ", "Σ", "x", UTF8_LCASE, "x"); + assertStringReplace("ς", "σ", "x", UTF8_LCASE, "x"); + assertStringReplace("ς", "ς", "x", UTF8_LCASE, "x"); + assertStringReplace("ς", "Σ", "x", UTF8_LCASE, "x"); + assertStringReplace("Σ", "σ", "x", UTF8_LCASE, "x"); + assertStringReplace("Σ", "ς", "x", UTF8_LCASE, "x"); + assertStringReplace("Σ", "Σ", "x", UTF8_LCASE, "x"); + assertStringReplace("σ", "σ", "x", UNICODE, "x"); + assertStringReplace("σ", "ς", "x", UNICODE, "σ"); + assertStringReplace("σ", "Σ", "x", UNICODE, "σ"); + assertStringReplace("ς", "σ", "x", UNICODE, "ς"); + assertStringReplace("ς", "ς", "x", UNICODE, "x"); + assertStringReplace("ς", "Σ", "x", UNICODE, "ς"); + assertStringReplace("Σ", "σ", "x", UNICODE, "Σ"); + assertStringReplace("Σ", "ς", "x", UNICODE, "Σ"); + assertStringReplace("Σ", "Σ", "x", UNICODE, "x"); + assertStringReplace("σ", "σ", "x", UNICODE_CI, "x"); + 
assertStringReplace("σ", "ς", "x", UNICODE_CI, "x"); + assertStringReplace("σ", "Σ", "x", UNICODE_CI, "x"); + assertStringReplace("ς", "σ", "x", UNICODE_CI, "x"); + assertStringReplace("ς", "ς", "x", UNICODE_CI, "x"); + assertStringReplace("ς", "Σ", "x", UNICODE_CI, "x"); + assertStringReplace("Σ", "σ", "x", UNICODE_CI, "x"); + assertStringReplace("Σ", "ς", "x", UNICODE_CI, "x"); + assertStringReplace("Σ", "Σ", "x", UNICODE_CI, "x"); // Surrogate pairs. - assertStringReplace("a🙃b", "a", "x", "UTF8_BINARY", "x🙃b"); - assertStringReplace("a🙃b", "b", "x", "UTF8_BINARY", "a🙃x"); - assertStringReplace("a🙃b", "🙃", "x", "UTF8_BINARY", "axb"); - assertStringReplace("a🙃b", "b", "c", "UTF8_LCASE", "a🙃c"); - assertStringReplace("a🙃b", "b", "x", "UTF8_LCASE", "a🙃x"); - assertStringReplace("a🙃b", "🙃", "x", "UTF8_LCASE", "axb"); - assertStringReplace("a🙃b", "b", "c", "UNICODE", "a🙃c"); - assertStringReplace("a🙃b", "b", "x", "UNICODE", "a🙃x"); - assertStringReplace("a🙃b", "🙃", "x", "UNICODE", "axb"); - assertStringReplace("a🙃b", "b", "c", "UNICODE_CI", "a🙃c"); - assertStringReplace("a🙃b", "b", "x", "UNICODE_CI", "a🙃x"); - assertStringReplace("a🙃b", "🙃", "x", "UNICODE_CI", "axb"); + assertStringReplace("a🙃b", "a", "x", UTF8_BINARY, "x🙃b"); + assertStringReplace("a🙃b", "b", "x", UTF8_BINARY, "a🙃x"); + assertStringReplace("a🙃b", "🙃", "x", UTF8_BINARY, "axb"); + assertStringReplace("a🙃b", "b", "c", UTF8_LCASE, "a🙃c"); + assertStringReplace("a🙃b", "b", "x", UTF8_LCASE, "a🙃x"); + assertStringReplace("a🙃b", "🙃", "x", UTF8_LCASE, "axb"); + assertStringReplace("a🙃b", "b", "c", UNICODE, "a🙃c"); + assertStringReplace("a🙃b", "b", "x", UNICODE, "a🙃x"); + assertStringReplace("a🙃b", "🙃", "x", UNICODE, "axb"); + assertStringReplace("a🙃b", "b", "c", UNICODE_CI, "a🙃c"); + assertStringReplace("a🙃b", "b", "x", UNICODE_CI, "a🙃x"); + assertStringReplace("a🙃b", "🙃", "x", UNICODE_CI, "axb"); } /** @@ -2145,293 +2146,293 @@ private void assertStringLocate(String substring, String string, int start, @Test 
public void testStringLocate() throws SparkException { // Empty strings. - assertStringLocate("", "", -1, "UTF8_BINARY", 1); - assertStringLocate("", "", -1, "UTF8_LCASE", 1); - assertStringLocate("", "", -1, "UNICODE", 1); - assertStringLocate("", "", -1, "UNICODE_CI", 1); - assertStringLocate("", "", 0, "UTF8_BINARY", 1); - assertStringLocate("", "", 0, "UTF8_LCASE", 1); - assertStringLocate("", "", 0, "UNICODE", 1); - assertStringLocate("", "", 0, "UNICODE_CI", 1); - assertStringLocate("", "", 1, "UTF8_BINARY", 1); - assertStringLocate("", "", 1, "UTF8_LCASE", 1); - assertStringLocate("", "", 1, "UNICODE", 1); - assertStringLocate("", "", 1, "UNICODE_CI", 1); - assertStringLocate("a", "", -1, "UTF8_BINARY", 0); - assertStringLocate("a", "", -1, "UTF8_LCASE", 0); - assertStringLocate("a", "", -1, "UNICODE", 0); - assertStringLocate("a", "", -1, "UNICODE_CI", 0); - assertStringLocate("a", "", 0, "UTF8_BINARY", 0); - assertStringLocate("a", "", 0, "UTF8_LCASE", 0); - assertStringLocate("a", "", 0, "UNICODE", 0); - assertStringLocate("a", "", 0, "UNICODE_CI", 0); - assertStringLocate("a", "", 1, "UTF8_BINARY", 0); - assertStringLocate("a", "", 1, "UTF8_LCASE", 0); - assertStringLocate("a", "", 1, "UNICODE", 0); - assertStringLocate("a", "", 1, "UNICODE_CI", 0); - assertStringLocate("", "x", -1, "UTF8_BINARY", 1); - assertStringLocate("", "x", -1, "UTF8_LCASE", 1); - assertStringLocate("", "x", -1, "UNICODE", 1); - assertStringLocate("", "x", -1, "UNICODE_CI", 1); - assertStringLocate("", "x", 0, "UTF8_BINARY", 1); - assertStringLocate("", "x", 0, "UTF8_LCASE", 1); - assertStringLocate("", "x", 0, "UNICODE", 1); - assertStringLocate("", "x", 0, "UNICODE_CI", 1); - assertStringLocate("", "x", 1, "UTF8_BINARY", 1); - assertStringLocate("", "x", 1, "UTF8_LCASE", 1); - assertStringLocate("", "x", 1, "UNICODE", 1); - assertStringLocate("", "x", 1, "UNICODE_CI", 1); + assertStringLocate("", "", -1, UTF8_BINARY, 1); + assertStringLocate("", "", -1, UTF8_LCASE, 1); + 
assertStringLocate("", "", -1, UNICODE, 1); + assertStringLocate("", "", -1, UNICODE_CI, 1); + assertStringLocate("", "", 0, UTF8_BINARY, 1); + assertStringLocate("", "", 0, UTF8_LCASE, 1); + assertStringLocate("", "", 0, UNICODE, 1); + assertStringLocate("", "", 0, UNICODE_CI, 1); + assertStringLocate("", "", 1, UTF8_BINARY, 1); + assertStringLocate("", "", 1, UTF8_LCASE, 1); + assertStringLocate("", "", 1, UNICODE, 1); + assertStringLocate("", "", 1, UNICODE_CI, 1); + assertStringLocate("a", "", -1, UTF8_BINARY, 0); + assertStringLocate("a", "", -1, UTF8_LCASE, 0); + assertStringLocate("a", "", -1, UNICODE, 0); + assertStringLocate("a", "", -1, UNICODE_CI, 0); + assertStringLocate("a", "", 0, UTF8_BINARY, 0); + assertStringLocate("a", "", 0, UTF8_LCASE, 0); + assertStringLocate("a", "", 0, UNICODE, 0); + assertStringLocate("a", "", 0, UNICODE_CI, 0); + assertStringLocate("a", "", 1, UTF8_BINARY, 0); + assertStringLocate("a", "", 1, UTF8_LCASE, 0); + assertStringLocate("a", "", 1, UNICODE, 0); + assertStringLocate("a", "", 1, UNICODE_CI, 0); + assertStringLocate("", "x", -1, UTF8_BINARY, 1); + assertStringLocate("", "x", -1, UTF8_LCASE, 1); + assertStringLocate("", "x", -1, UNICODE, 1); + assertStringLocate("", "x", -1, UNICODE_CI, 1); + assertStringLocate("", "x", 0, UTF8_BINARY, 1); + assertStringLocate("", "x", 0, UTF8_LCASE, 1); + assertStringLocate("", "x", 0, UNICODE, 1); + assertStringLocate("", "x", 0, UNICODE_CI, 1); + assertStringLocate("", "x", 1, UTF8_BINARY, 1); + assertStringLocate("", "x", 1, UTF8_LCASE, 1); + assertStringLocate("", "x", 1, UNICODE, 1); + assertStringLocate("", "x", 1, UNICODE_CI, 1); // Basic tests. 
- assertStringLocate("aa", "aaads", 1, "UTF8_BINARY", 1); - assertStringLocate("aa", "aaads", 1, "UTF8_LCASE", 1); - assertStringLocate("aa", "aaads", 1, "UNICODE", 1); - assertStringLocate("aa", "aaads", 1, "UNICODE_CI", 1); - assertStringLocate("aa", "aaads", 2, "UTF8_BINARY", 2); - assertStringLocate("aa", "aaads", 2, "UTF8_LCASE", 2); - assertStringLocate("aa", "aaads", 2, "UNICODE", 2); - assertStringLocate("aa", "aaads", 2, "UNICODE_CI", 2); - assertStringLocate("aa", "aaads", 3, "UTF8_BINARY", 0); - assertStringLocate("aa", "aaads", 3, "UTF8_LCASE", 0); - assertStringLocate("aa", "aaads", 3, "UNICODE", 0); - assertStringLocate("aa", "aaads", 3, "UNICODE_CI", 0); - assertStringLocate("Aa", "aaads", 1, "UTF8_BINARY", 0); - assertStringLocate("Aa", "aaads", 1, "UTF8_LCASE", 1); - assertStringLocate("Aa", "aaads", 1, "UNICODE", 0); - assertStringLocate("Aa", "aaads", 1, "UNICODE_CI", 1); - assertStringLocate("Aa", "aaads", 2, "UTF8_BINARY", 0); - assertStringLocate("Aa", "aaads", 2, "UTF8_LCASE", 2); - assertStringLocate("Aa", "aaads", 2, "UNICODE", 0); - assertStringLocate("Aa", "aaads", 2, "UNICODE_CI", 2); - assertStringLocate("Aa", "aaads", 3, "UTF8_BINARY", 0); - assertStringLocate("Aa", "aaads", 3, "UTF8_LCASE", 0); - assertStringLocate("Aa", "aaads", 3, "UNICODE", 0); - assertStringLocate("Aa", "aaads", 3, "UNICODE_CI", 0); - assertStringLocate("Aa", "aAads", 1, "UTF8_BINARY", 2); - assertStringLocate("Aa", "aAads", 1, "UTF8_LCASE", 1); - assertStringLocate("Aa", "aAads", 1, "UNICODE", 2); - assertStringLocate("Aa", "aAads", 1, "UNICODE_CI", 1); - assertStringLocate("AA", "aaads", 1, "UTF8_BINARY", 0); - assertStringLocate("AA", "aaads", 1, "UTF8_LCASE", 1); - assertStringLocate("AA", "aaads", 1, "UNICODE", 0); - assertStringLocate("AA", "aaads", 1, "UNICODE_CI", 1); - assertStringLocate("aa", "aAads", 2, "UTF8_BINARY", 0); - assertStringLocate("aa", "aAads", 2, "UTF8_LCASE", 2); - assertStringLocate("aa", "aAads", 2, "UNICODE", 0); - 
assertStringLocate("aa", "aAads", 2, "UNICODE_CI", 2); - assertStringLocate("aa", "aaAds", 3, "UTF8_BINARY", 0); - assertStringLocate("aa", "aaAds", 3, "UTF8_LCASE", 0); - assertStringLocate("aa", "aaAds", 3, "UNICODE", 0); - assertStringLocate("aa", "aaAds", 3, "UNICODE_CI", 0); - assertStringLocate("abC", "abcabc", 1, "UTF8_BINARY", 0); - assertStringLocate("abC", "abcabc", 1, "UTF8_LCASE", 1); - assertStringLocate("abC", "abcabc", 1, "UNICODE", 0); - assertStringLocate("abC", "abcabc", 1, "UNICODE_CI", 1); - assertStringLocate("abC", "abCabc", 2, "UTF8_BINARY", 0); - assertStringLocate("abC", "abCabc", 2, "UTF8_LCASE", 4); - assertStringLocate("abC", "abCabc", 2, "UNICODE", 0); - assertStringLocate("abC", "abCabc", 2, "UNICODE_CI", 4); - assertStringLocate("abc", "abcabc", 1, "UTF8_BINARY", 1); - assertStringLocate("abc", "abcabc", 1, "UTF8_LCASE", 1); - assertStringLocate("abc", "abcabc", 1, "UNICODE", 1); - assertStringLocate("abc", "abcabc", 1, "UNICODE_CI", 1); - assertStringLocate("abc", "abcabc", 2, "UTF8_BINARY", 4); - assertStringLocate("abc", "abcabc", 2, "UTF8_LCASE", 4); - assertStringLocate("abc", "abcabc", 2, "UNICODE", 4); - assertStringLocate("abc", "abcabc", 2, "UNICODE_CI", 4); - assertStringLocate("abc", "abcabc", 3, "UTF8_BINARY", 4); - assertStringLocate("abc", "abcabc", 3, "UTF8_LCASE", 4); - assertStringLocate("abc", "abcabc", 3, "UNICODE", 4); - assertStringLocate("abc", "abcabc", 3, "UNICODE_CI", 4); - assertStringLocate("abc", "abcabc", 4, "UTF8_BINARY", 4); - assertStringLocate("abc", "abcabc", 4, "UTF8_LCASE", 4); - assertStringLocate("abc", "abcabc", 4, "UNICODE", 4); - assertStringLocate("abc", "abcabc", 4, "UNICODE_CI", 4); - assertStringLocate("aa", "Aaads", 1, "UTF8_BINARY", 2); - assertStringLocate("aa", "Aaads", 1, "UTF8_LCASE", 1); - assertStringLocate("aa", "Aaads", 1, "UNICODE", 2); - assertStringLocate("aa", "Aaads", 1, "UNICODE_CI", 1); + assertStringLocate("aa", "aaads", 1, UTF8_BINARY, 1); + assertStringLocate("aa", 
"aaads", 1, UTF8_LCASE, 1); + assertStringLocate("aa", "aaads", 1, UNICODE, 1); + assertStringLocate("aa", "aaads", 1, UNICODE_CI, 1); + assertStringLocate("aa", "aaads", 2, UTF8_BINARY, 2); + assertStringLocate("aa", "aaads", 2, UTF8_LCASE, 2); + assertStringLocate("aa", "aaads", 2, UNICODE, 2); + assertStringLocate("aa", "aaads", 2, UNICODE_CI, 2); + assertStringLocate("aa", "aaads", 3, UTF8_BINARY, 0); + assertStringLocate("aa", "aaads", 3, UTF8_LCASE, 0); + assertStringLocate("aa", "aaads", 3, UNICODE, 0); + assertStringLocate("aa", "aaads", 3, UNICODE_CI, 0); + assertStringLocate("Aa", "aaads", 1, UTF8_BINARY, 0); + assertStringLocate("Aa", "aaads", 1, UTF8_LCASE, 1); + assertStringLocate("Aa", "aaads", 1, UNICODE, 0); + assertStringLocate("Aa", "aaads", 1, UNICODE_CI, 1); + assertStringLocate("Aa", "aaads", 2, UTF8_BINARY, 0); + assertStringLocate("Aa", "aaads", 2, UTF8_LCASE, 2); + assertStringLocate("Aa", "aaads", 2, UNICODE, 0); + assertStringLocate("Aa", "aaads", 2, UNICODE_CI, 2); + assertStringLocate("Aa", "aaads", 3, UTF8_BINARY, 0); + assertStringLocate("Aa", "aaads", 3, UTF8_LCASE, 0); + assertStringLocate("Aa", "aaads", 3, UNICODE, 0); + assertStringLocate("Aa", "aaads", 3, UNICODE_CI, 0); + assertStringLocate("Aa", "aAads", 1, UTF8_BINARY, 2); + assertStringLocate("Aa", "aAads", 1, UTF8_LCASE, 1); + assertStringLocate("Aa", "aAads", 1, UNICODE, 2); + assertStringLocate("Aa", "aAads", 1, UNICODE_CI, 1); + assertStringLocate("AA", "aaads", 1, UTF8_BINARY, 0); + assertStringLocate("AA", "aaads", 1, UTF8_LCASE, 1); + assertStringLocate("AA", "aaads", 1, UNICODE, 0); + assertStringLocate("AA", "aaads", 1, UNICODE_CI, 1); + assertStringLocate("aa", "aAads", 2, UTF8_BINARY, 0); + assertStringLocate("aa", "aAads", 2, UTF8_LCASE, 2); + assertStringLocate("aa", "aAads", 2, UNICODE, 0); + assertStringLocate("aa", "aAads", 2, UNICODE_CI, 2); + assertStringLocate("aa", "aaAds", 3, UTF8_BINARY, 0); + assertStringLocate("aa", "aaAds", 3, UTF8_LCASE, 0); + 
assertStringLocate("aa", "aaAds", 3, UNICODE, 0); + assertStringLocate("aa", "aaAds", 3, UNICODE_CI, 0); + assertStringLocate("abC", "abcabc", 1, UTF8_BINARY, 0); + assertStringLocate("abC", "abcabc", 1, UTF8_LCASE, 1); + assertStringLocate("abC", "abcabc", 1, UNICODE, 0); + assertStringLocate("abC", "abcabc", 1, UNICODE_CI, 1); + assertStringLocate("abC", "abCabc", 2, UTF8_BINARY, 0); + assertStringLocate("abC", "abCabc", 2, UTF8_LCASE, 4); + assertStringLocate("abC", "abCabc", 2, UNICODE, 0); + assertStringLocate("abC", "abCabc", 2, UNICODE_CI, 4); + assertStringLocate("abc", "abcabc", 1, UTF8_BINARY, 1); + assertStringLocate("abc", "abcabc", 1, UTF8_LCASE, 1); + assertStringLocate("abc", "abcabc", 1, UNICODE, 1); + assertStringLocate("abc", "abcabc", 1, UNICODE_CI, 1); + assertStringLocate("abc", "abcabc", 2, UTF8_BINARY, 4); + assertStringLocate("abc", "abcabc", 2, UTF8_LCASE, 4); + assertStringLocate("abc", "abcabc", 2, UNICODE, 4); + assertStringLocate("abc", "abcabc", 2, UNICODE_CI, 4); + assertStringLocate("abc", "abcabc", 3, UTF8_BINARY, 4); + assertStringLocate("abc", "abcabc", 3, UTF8_LCASE, 4); + assertStringLocate("abc", "abcabc", 3, UNICODE, 4); + assertStringLocate("abc", "abcabc", 3, UNICODE_CI, 4); + assertStringLocate("abc", "abcabc", 4, UTF8_BINARY, 4); + assertStringLocate("abc", "abcabc", 4, UTF8_LCASE, 4); + assertStringLocate("abc", "abcabc", 4, UNICODE, 4); + assertStringLocate("abc", "abcabc", 4, UNICODE_CI, 4); + assertStringLocate("aa", "Aaads", 1, UTF8_BINARY, 2); + assertStringLocate("aa", "Aaads", 1, UTF8_LCASE, 1); + assertStringLocate("aa", "Aaads", 1, UNICODE, 2); + assertStringLocate("aa", "Aaads", 1, UNICODE_CI, 1); assertStringLocate("ćČ", "CćČČćCČĆČcČcććČč", 3, "SR", 14); assertStringLocate("ćČ", "CćČČćCČĆČcČcććČč", 3, "SR_CI_AI", 3); // Advanced tests. 
- assertStringLocate("界x", "test大千世界X大千世界", 1, "UTF8_BINARY", 0); - assertStringLocate("界X", "test大千世界X大千世界", 1, "UTF8_BINARY", 8); - assertStringLocate("界", "test大千世界X大千世界", 13, "UTF8_BINARY", 13); - assertStringLocate("界x", "test大千世界X大千世界", 1, "UTF8_LCASE", 8); - assertStringLocate("界X", "test大千世界Xtest大千世界", 1, "UTF8_LCASE", 8); - assertStringLocate("界", "test大千世界X大千世界", 13, "UTF8_LCASE", 13); - assertStringLocate("大千", "test大千世界大千世界", 1, "UTF8_LCASE", 5); - assertStringLocate("大千", "test大千世界大千世界", 9, "UTF8_LCASE", 9); - assertStringLocate("大千", "大千世界大千世界", 1, "UTF8_LCASE", 1); - assertStringLocate("界x", "test大千世界X大千世界", 1, "UNICODE", 0); - assertStringLocate("界X", "test大千世界X大千世界", 1, "UNICODE", 8); - assertStringLocate("界", "test大千世界X大千世界", 13, "UNICODE", 13); - assertStringLocate("界x", "test大千世界X大千世界", 1, "UNICODE_CI", 8); - assertStringLocate("界", "test大千世界X大千世界", 13, "UNICODE_CI", 13); - assertStringLocate("大千", "test大千世界大千世界", 1, "UNICODE_CI", 5); - assertStringLocate("大千", "test大千世界大千世界", 9, "UNICODE_CI", 9); - assertStringLocate("大千", "大千世界大千世界", 1, "UNICODE_CI", 1); + assertStringLocate("界x", "test大千世界X大千世界", 1, UTF8_BINARY, 0); + assertStringLocate("界X", "test大千世界X大千世界", 1, UTF8_BINARY, 8); + assertStringLocate("界", "test大千世界X大千世界", 13, UTF8_BINARY, 13); + assertStringLocate("界x", "test大千世界X大千世界", 1, UTF8_LCASE, 8); + assertStringLocate("界X", "test大千世界Xtest大千世界", 1, UTF8_LCASE, 8); + assertStringLocate("界", "test大千世界X大千世界", 13, UTF8_LCASE, 13); + assertStringLocate("大千", "test大千世界大千世界", 1, UTF8_LCASE, 5); + assertStringLocate("大千", "test大千世界大千世界", 9, UTF8_LCASE, 9); + assertStringLocate("大千", "大千世界大千世界", 1, UTF8_LCASE, 1); + assertStringLocate("界x", "test大千世界X大千世界", 1, UNICODE, 0); + assertStringLocate("界X", "test大千世界X大千世界", 1, UNICODE, 8); + assertStringLocate("界", "test大千世界X大千世界", 13, UNICODE, 13); + assertStringLocate("界x", "test大千世界X大千世界", 1, UNICODE_CI, 8); + assertStringLocate("界", "test大千世界X大千世界", 13, UNICODE_CI, 13); + assertStringLocate("大千", 
"test大千世界大千世界", 1, UNICODE_CI, 5); + assertStringLocate("大千", "test大千世界大千世界", 9, UNICODE_CI, 9); + assertStringLocate("大千", "大千世界大千世界", 1, UNICODE_CI, 1); // One-to-many case mapping (e.g. Turkish dotted I). - assertStringLocate("\u0307", "i\u0307", 1, "UTF8_BINARY", 2); - assertStringLocate("\u0307", "İ", 1, "UTF8_LCASE", 0); // != UTF8_BINARY - assertStringLocate("i", "i\u0307", 1, "UNICODE_CI", 0); - assertStringLocate("\u0307", "i\u0307", 1, "UNICODE_CI", 0); - assertStringLocate("i\u0307", "i", 1, "UNICODE_CI", 0); - assertStringLocate("İ", "i\u0307", 1, "UNICODE_CI", 1); - assertStringLocate("İ", "i", 1, "UNICODE_CI", 0); - assertStringLocate("i", "i\u0307", 1, "UTF8_LCASE", 1); // != UNICODE_CI - assertStringLocate("\u0307", "i\u0307", 1, "UTF8_LCASE", 2); // != UNICODE_CI - assertStringLocate("i\u0307", "i", 1, "UTF8_LCASE", 0); - assertStringLocate("İ", "i\u0307", 1, "UTF8_LCASE", 1); - assertStringLocate("İ", "i", 1, "UTF8_LCASE", 0); - assertStringLocate("i\u0307o", "İo世界大千世界", 1, "UNICODE_CI", 1); - assertStringLocate("i\u0307o", "大千İo世界大千世界", 1, "UNICODE_CI", 3); - assertStringLocate("i\u0307o", "世界İo大千世界大千İo", 4, "UNICODE_CI", 11); - assertStringLocate("İo", "i̇o世界大千世界", 1, "UNICODE_CI", 1); - assertStringLocate("İo", "大千i̇o世界大千世界", 1, "UNICODE_CI", 3); - assertStringLocate("İo", "世界i̇o大千世界大千i̇o", 4, "UNICODE_CI", 12); + assertStringLocate("\u0307", "i\u0307", 1, UTF8_BINARY, 2); + assertStringLocate("\u0307", "İ", 1, UTF8_LCASE, 0); // != UTF8_BINARY + assertStringLocate("i", "i\u0307", 1, UNICODE_CI, 0); + assertStringLocate("\u0307", "i\u0307", 1, UNICODE_CI, 0); + assertStringLocate("i\u0307", "i", 1, UNICODE_CI, 0); + assertStringLocate("İ", "i\u0307", 1, UNICODE_CI, 1); + assertStringLocate("İ", "i", 1, UNICODE_CI, 0); + assertStringLocate("i", "i\u0307", 1, UTF8_LCASE, 1); // != UNICODE_CI + assertStringLocate("\u0307", "i\u0307", 1, UTF8_LCASE, 2); // != UNICODE_CI + assertStringLocate("i\u0307", "i", 1, UTF8_LCASE, 0); + 
assertStringLocate("İ", "i\u0307", 1, UTF8_LCASE, 1); + assertStringLocate("İ", "i", 1, UTF8_LCASE, 0); + assertStringLocate("i\u0307o", "İo世界大千世界", 1, UNICODE_CI, 1); + assertStringLocate("i\u0307o", "大千İo世界大千世界", 1, UNICODE_CI, 3); + assertStringLocate("i\u0307o", "世界İo大千世界大千İo", 4, UNICODE_CI, 11); + assertStringLocate("İo", "i̇o世界大千世界", 1, UNICODE_CI, 1); + assertStringLocate("İo", "大千i̇o世界大千世界", 1, UNICODE_CI, 3); + assertStringLocate("İo", "世界i̇o大千世界大千i̇o", 4, UNICODE_CI, 12); // Conditional case mapping (e.g. Greek sigmas). - assertStringLocate("σ", "σ", 1, "UTF8_BINARY", 1); - assertStringLocate("σ", "ς", 1, "UTF8_BINARY", 0); - assertStringLocate("σ", "Σ", 1, "UTF8_BINARY", 0); - assertStringLocate("ς", "σ", 1, "UTF8_BINARY", 0); - assertStringLocate("ς", "ς", 1, "UTF8_BINARY", 1); - assertStringLocate("ς", "Σ", 1, "UTF8_BINARY", 0); - assertStringLocate("Σ", "σ", 1, "UTF8_BINARY", 0); - assertStringLocate("Σ", "ς", 1, "UTF8_BINARY", 0); - assertStringLocate("Σ", "Σ", 1, "UTF8_BINARY", 1); - assertStringLocate("σ", "σ", 1, "UTF8_LCASE", 1); - assertStringLocate("σ", "ς", 1, "UTF8_LCASE", 1); - assertStringLocate("σ", "Σ", 1, "UTF8_LCASE", 1); - assertStringLocate("ς", "σ", 1, "UTF8_LCASE", 1); - assertStringLocate("ς", "ς", 1, "UTF8_LCASE", 1); - assertStringLocate("ς", "Σ", 1, "UTF8_LCASE", 1); - assertStringLocate("Σ", "σ", 1, "UTF8_LCASE", 1); - assertStringLocate("Σ", "ς", 1, "UTF8_LCASE", 1); - assertStringLocate("Σ", "Σ", 1, "UTF8_LCASE", 1); - assertStringLocate("σ", "σ", 1, "UNICODE", 1); - assertStringLocate("σ", "ς", 1, "UNICODE", 0); - assertStringLocate("σ", "Σ", 1, "UNICODE", 0); - assertStringLocate("ς", "σ", 1, "UNICODE", 0); - assertStringLocate("ς", "ς", 1, "UNICODE", 1); - assertStringLocate("ς", "Σ", 1, "UNICODE", 0); - assertStringLocate("Σ", "σ", 1, "UNICODE", 0); - assertStringLocate("Σ", "ς", 1, "UNICODE", 0); - assertStringLocate("Σ", "Σ", 1, "UNICODE", 1); - assertStringLocate("σ", "σ", 1, "UNICODE_CI", 1); - 
assertStringLocate("σ", "ς", 1, "UNICODE_CI", 1); - assertStringLocate("σ", "Σ", 1, "UNICODE_CI", 1); - assertStringLocate("ς", "σ", 1, "UNICODE_CI", 1); - assertStringLocate("ς", "ς", 1, "UNICODE_CI", 1); - assertStringLocate("ς", "Σ", 1, "UNICODE_CI", 1); - assertStringLocate("Σ", "σ", 1, "UNICODE_CI", 1); - assertStringLocate("Σ", "ς", 1, "UNICODE_CI", 1); - assertStringLocate("Σ", "Σ", 1, "UNICODE_CI", 1); + assertStringLocate("σ", "σ", 1, UTF8_BINARY, 1); + assertStringLocate("σ", "ς", 1, UTF8_BINARY, 0); + assertStringLocate("σ", "Σ", 1, UTF8_BINARY, 0); + assertStringLocate("ς", "σ", 1, UTF8_BINARY, 0); + assertStringLocate("ς", "ς", 1, UTF8_BINARY, 1); + assertStringLocate("ς", "Σ", 1, UTF8_BINARY, 0); + assertStringLocate("Σ", "σ", 1, UTF8_BINARY, 0); + assertStringLocate("Σ", "ς", 1, UTF8_BINARY, 0); + assertStringLocate("Σ", "Σ", 1, UTF8_BINARY, 1); + assertStringLocate("σ", "σ", 1, UTF8_LCASE, 1); + assertStringLocate("σ", "ς", 1, UTF8_LCASE, 1); + assertStringLocate("σ", "Σ", 1, UTF8_LCASE, 1); + assertStringLocate("ς", "σ", 1, UTF8_LCASE, 1); + assertStringLocate("ς", "ς", 1, UTF8_LCASE, 1); + assertStringLocate("ς", "Σ", 1, UTF8_LCASE, 1); + assertStringLocate("Σ", "σ", 1, UTF8_LCASE, 1); + assertStringLocate("Σ", "ς", 1, UTF8_LCASE, 1); + assertStringLocate("Σ", "Σ", 1, UTF8_LCASE, 1); + assertStringLocate("σ", "σ", 1, UNICODE, 1); + assertStringLocate("σ", "ς", 1, UNICODE, 0); + assertStringLocate("σ", "Σ", 1, UNICODE, 0); + assertStringLocate("ς", "σ", 1, UNICODE, 0); + assertStringLocate("ς", "ς", 1, UNICODE, 1); + assertStringLocate("ς", "Σ", 1, UNICODE, 0); + assertStringLocate("Σ", "σ", 1, UNICODE, 0); + assertStringLocate("Σ", "ς", 1, UNICODE, 0); + assertStringLocate("Σ", "Σ", 1, UNICODE, 1); + assertStringLocate("σ", "σ", 1, UNICODE_CI, 1); + assertStringLocate("σ", "ς", 1, UNICODE_CI, 1); + assertStringLocate("σ", "Σ", 1, UNICODE_CI, 1); + assertStringLocate("ς", "σ", 1, UNICODE_CI, 1); + assertStringLocate("ς", "ς", 1, UNICODE_CI, 1); + 
assertStringLocate("ς", "Σ", 1, UNICODE_CI, 1); + assertStringLocate("Σ", "σ", 1, UNICODE_CI, 1); + assertStringLocate("Σ", "ς", 1, UNICODE_CI, 1); + assertStringLocate("Σ", "Σ", 1, UNICODE_CI, 1); // Surrogate pairs. - assertStringLocate("a", "a🙃b", 1, "UTF8_BINARY", 1); - assertStringLocate("a", "a🙃b", 1, "UTF8_LCASE", 1); - assertStringLocate("a", "a🙃b", 1, "UNICODE", 1); - assertStringLocate("a", "a🙃b", 1, "UNICODE_CI", 1); - assertStringLocate("a", "a🙃b", 2, "UTF8_BINARY", 0); - assertStringLocate("a", "a🙃b", 2, "UTF8_LCASE", 0); - assertStringLocate("a", "a🙃b", 2, "UNICODE", 0); - assertStringLocate("a", "a🙃b", 2, "UNICODE_CI", 0); - assertStringLocate("a", "a🙃b", 3, "UTF8_BINARY", 0); - assertStringLocate("a", "a🙃b", 3, "UTF8_LCASE", 0); - assertStringLocate("a", "a🙃b", 3, "UNICODE", 0); - assertStringLocate("a", "a🙃b", 3, "UNICODE_CI", 0); - assertStringLocate("🙃", "a🙃b", 1, "UTF8_BINARY", 2); - assertStringLocate("🙃", "a🙃b", 1, "UTF8_LCASE", 2); - assertStringLocate("🙃", "a🙃b", 1, "UNICODE", 2); - assertStringLocate("🙃", "a🙃b", 1, "UNICODE_CI", 2); - assertStringLocate("🙃", "a🙃b", 2, "UTF8_BINARY", 2); - assertStringLocate("🙃", "a🙃b", 2, "UTF8_LCASE", 2); - assertStringLocate("🙃", "a🙃b", 2, "UNICODE", 2); - assertStringLocate("🙃", "a🙃b", 2, "UNICODE_CI", 2); - assertStringLocate("🙃", "a🙃b", 3, "UTF8_BINARY", 0); - assertStringLocate("🙃", "a🙃b", 3, "UTF8_LCASE", 0); - assertStringLocate("🙃", "a🙃b", 3, "UNICODE", 0); - assertStringLocate("🙃", "a🙃b", 3, "UNICODE_CI", 0); - assertStringLocate("b", "a🙃b", 1, "UTF8_BINARY", 3); - assertStringLocate("b", "a🙃b", 1, "UTF8_LCASE", 3); - assertStringLocate("b", "a🙃b", 1, "UNICODE", 3); - assertStringLocate("b", "a🙃b", 1, "UNICODE_CI", 3); - assertStringLocate("b", "a🙃b", 2, "UTF8_BINARY", 3); - assertStringLocate("b", "a🙃b", 2, "UTF8_LCASE", 3); - assertStringLocate("b", "a🙃b", 2, "UNICODE", 3); - assertStringLocate("b", "a🙃b", 2, "UNICODE_CI", 3); - assertStringLocate("b", "a🙃b", 3, "UTF8_BINARY", 3); - 
assertStringLocate("b", "a🙃b", 3, "UTF8_LCASE", 3); - assertStringLocate("b", "a🙃b", 3, "UNICODE", 3); - assertStringLocate("b", "a🙃b", 3, "UNICODE_CI", 3); - assertStringLocate("🙃", "a🙃🙃b", 1, "UTF8_BINARY", 2); - assertStringLocate("🙃", "a🙃🙃b", 1, "UTF8_LCASE", 2); - assertStringLocate("🙃", "a🙃🙃b", 1, "UNICODE", 2); - assertStringLocate("🙃", "a🙃🙃b", 1, "UNICODE_CI", 2); - assertStringLocate("🙃", "a🙃🙃b", 2, "UTF8_BINARY", 2); - assertStringLocate("🙃", "a🙃🙃b", 2, "UTF8_LCASE", 2); - assertStringLocate("🙃", "a🙃🙃b", 2, "UNICODE", 2); - assertStringLocate("🙃", "a🙃🙃b", 2, "UNICODE_CI", 2); - assertStringLocate("🙃", "a🙃🙃b", 3, "UTF8_BINARY", 3); - assertStringLocate("🙃", "a🙃🙃b", 3, "UTF8_LCASE", 3); - assertStringLocate("🙃", "a🙃🙃b", 3, "UNICODE", 3); - assertStringLocate("🙃", "a🙃🙃b", 3, "UNICODE_CI", 3); - assertStringLocate("🙃", "a🙃🙃b", 4, "UTF8_BINARY", 0); - assertStringLocate("🙃", "a🙃🙃b", 4, "UTF8_LCASE", 0); - assertStringLocate("🙃", "a🙃🙃b", 4, "UNICODE", 0); - assertStringLocate("🙃", "a🙃🙃b", 4, "UNICODE_CI", 0); - assertStringLocate("b", "a🙃🙃b", 1, "UTF8_BINARY", 4); - assertStringLocate("b", "a🙃🙃b", 1, "UTF8_LCASE", 4); - assertStringLocate("b", "a🙃🙃b", 1, "UNICODE", 4); - assertStringLocate("b", "a🙃🙃b", 1, "UNICODE_CI", 4); - assertStringLocate("b", "a🙃🙃b", 2, "UTF8_BINARY", 4); - assertStringLocate("b", "a🙃🙃b", 2, "UTF8_LCASE", 4); - assertStringLocate("b", "a🙃🙃b", 2, "UNICODE", 4); - assertStringLocate("b", "a🙃🙃b", 2, "UNICODE_CI", 4); - assertStringLocate("b", "a🙃🙃b", 3, "UTF8_BINARY", 4); - assertStringLocate("b", "a🙃🙃b", 3, "UTF8_LCASE", 4); - assertStringLocate("b", "a🙃🙃b", 3, "UNICODE", 4); - assertStringLocate("b", "a🙃🙃b", 3, "UNICODE_CI", 4); - assertStringLocate("b", "a🙃🙃b", 4, "UTF8_BINARY", 4); - assertStringLocate("b", "a🙃🙃b", 4, "UTF8_LCASE", 4); - assertStringLocate("b", "a🙃🙃b", 4, "UNICODE", 4); - assertStringLocate("b", "a🙃🙃b", 4, "UNICODE_CI", 4); - assertStringLocate("b", "a🙃x🙃b", 1, "UTF8_BINARY", 5); - assertStringLocate("b", "a🙃x🙃b", 1, 
"UTF8_LCASE", 5); - assertStringLocate("b", "a🙃x🙃b", 1, "UNICODE", 5); - assertStringLocate("b", "a🙃x🙃b", 1, "UNICODE_CI", 5); - assertStringLocate("b", "a🙃x🙃b", 2, "UTF8_BINARY", 5); - assertStringLocate("b", "a🙃x🙃b", 2, "UTF8_LCASE", 5); - assertStringLocate("b", "a🙃x🙃b", 2, "UNICODE", 5); - assertStringLocate("b", "a🙃x🙃b", 2, "UNICODE_CI", 5); - assertStringLocate("b", "a🙃x🙃b", 3, "UTF8_BINARY", 5); - assertStringLocate("b", "a🙃x🙃b", 3, "UTF8_LCASE", 5); - assertStringLocate("b", "a🙃x🙃b", 3, "UNICODE", 5); - assertStringLocate("b", "a🙃x🙃b", 3, "UNICODE_CI", 5); - assertStringLocate("b", "a🙃x🙃b", 4, "UTF8_BINARY", 5); - assertStringLocate("b", "a🙃x🙃b", 4, "UTF8_LCASE", 5); - assertStringLocate("b", "a🙃x🙃b", 4, "UNICODE", 5); - assertStringLocate("b", "a🙃x🙃b", 4, "UNICODE_CI", 5); + assertStringLocate("a", "a🙃b", 1, UTF8_BINARY, 1); + assertStringLocate("a", "a🙃b", 1, UTF8_LCASE, 1); + assertStringLocate("a", "a🙃b", 1, UNICODE, 1); + assertStringLocate("a", "a🙃b", 1, UNICODE_CI, 1); + assertStringLocate("a", "a🙃b", 2, UTF8_BINARY, 0); + assertStringLocate("a", "a🙃b", 2, UTF8_LCASE, 0); + assertStringLocate("a", "a🙃b", 2, UNICODE, 0); + assertStringLocate("a", "a🙃b", 2, UNICODE_CI, 0); + assertStringLocate("a", "a🙃b", 3, UTF8_BINARY, 0); + assertStringLocate("a", "a🙃b", 3, UTF8_LCASE, 0); + assertStringLocate("a", "a🙃b", 3, UNICODE, 0); + assertStringLocate("a", "a🙃b", 3, UNICODE_CI, 0); + assertStringLocate("🙃", "a🙃b", 1, UTF8_BINARY, 2); + assertStringLocate("🙃", "a🙃b", 1, UTF8_LCASE, 2); + assertStringLocate("🙃", "a🙃b", 1, UNICODE, 2); + assertStringLocate("🙃", "a🙃b", 1, UNICODE_CI, 2); + assertStringLocate("🙃", "a🙃b", 2, UTF8_BINARY, 2); + assertStringLocate("🙃", "a🙃b", 2, UTF8_LCASE, 2); + assertStringLocate("🙃", "a🙃b", 2, UNICODE, 2); + assertStringLocate("🙃", "a🙃b", 2, UNICODE_CI, 2); + assertStringLocate("🙃", "a🙃b", 3, UTF8_BINARY, 0); + assertStringLocate("🙃", "a🙃b", 3, UTF8_LCASE, 0); + assertStringLocate("🙃", "a🙃b", 3, UNICODE, 0); + 
assertStringLocate("🙃", "a🙃b", 3, UNICODE_CI, 0); + assertStringLocate("b", "a🙃b", 1, UTF8_BINARY, 3); + assertStringLocate("b", "a🙃b", 1, UTF8_LCASE, 3); + assertStringLocate("b", "a🙃b", 1, UNICODE, 3); + assertStringLocate("b", "a🙃b", 1, UNICODE_CI, 3); + assertStringLocate("b", "a🙃b", 2, UTF8_BINARY, 3); + assertStringLocate("b", "a🙃b", 2, UTF8_LCASE, 3); + assertStringLocate("b", "a🙃b", 2, UNICODE, 3); + assertStringLocate("b", "a🙃b", 2, UNICODE_CI, 3); + assertStringLocate("b", "a🙃b", 3, UTF8_BINARY, 3); + assertStringLocate("b", "a🙃b", 3, UTF8_LCASE, 3); + assertStringLocate("b", "a🙃b", 3, UNICODE, 3); + assertStringLocate("b", "a🙃b", 3, UNICODE_CI, 3); + assertStringLocate("🙃", "a🙃🙃b", 1, UTF8_BINARY, 2); + assertStringLocate("🙃", "a🙃🙃b", 1, UTF8_LCASE, 2); + assertStringLocate("🙃", "a🙃🙃b", 1, UNICODE, 2); + assertStringLocate("🙃", "a🙃🙃b", 1, UNICODE_CI, 2); + assertStringLocate("🙃", "a🙃🙃b", 2, UTF8_BINARY, 2); + assertStringLocate("🙃", "a🙃🙃b", 2, UTF8_LCASE, 2); + assertStringLocate("🙃", "a🙃🙃b", 2, UNICODE, 2); + assertStringLocate("🙃", "a🙃🙃b", 2, UNICODE_CI, 2); + assertStringLocate("🙃", "a🙃🙃b", 3, UTF8_BINARY, 3); + assertStringLocate("🙃", "a🙃🙃b", 3, UTF8_LCASE, 3); + assertStringLocate("🙃", "a🙃🙃b", 3, UNICODE, 3); + assertStringLocate("🙃", "a🙃🙃b", 3, UNICODE_CI, 3); + assertStringLocate("🙃", "a🙃🙃b", 4, UTF8_BINARY, 0); + assertStringLocate("🙃", "a🙃🙃b", 4, UTF8_LCASE, 0); + assertStringLocate("🙃", "a🙃🙃b", 4, UNICODE, 0); + assertStringLocate("🙃", "a🙃🙃b", 4, UNICODE_CI, 0); + assertStringLocate("b", "a🙃🙃b", 1, UTF8_BINARY, 4); + assertStringLocate("b", "a🙃🙃b", 1, UTF8_LCASE, 4); + assertStringLocate("b", "a🙃🙃b", 1, UNICODE, 4); + assertStringLocate("b", "a🙃🙃b", 1, UNICODE_CI, 4); + assertStringLocate("b", "a🙃🙃b", 2, UTF8_BINARY, 4); + assertStringLocate("b", "a🙃🙃b", 2, UTF8_LCASE, 4); + assertStringLocate("b", "a🙃🙃b", 2, UNICODE, 4); + assertStringLocate("b", "a🙃🙃b", 2, UNICODE_CI, 4); + assertStringLocate("b", "a🙃🙃b", 3, UTF8_BINARY, 4); + 
assertStringLocate("b", "a🙃🙃b", 3, UTF8_LCASE, 4); + assertStringLocate("b", "a🙃🙃b", 3, UNICODE, 4); + assertStringLocate("b", "a🙃🙃b", 3, UNICODE_CI, 4); + assertStringLocate("b", "a🙃🙃b", 4, UTF8_BINARY, 4); + assertStringLocate("b", "a🙃🙃b", 4, UTF8_LCASE, 4); + assertStringLocate("b", "a🙃🙃b", 4, UNICODE, 4); + assertStringLocate("b", "a🙃🙃b", 4, UNICODE_CI, 4); + assertStringLocate("b", "a🙃x🙃b", 1, UTF8_BINARY, 5); + assertStringLocate("b", "a🙃x🙃b", 1, UTF8_LCASE, 5); + assertStringLocate("b", "a🙃x🙃b", 1, UNICODE, 5); + assertStringLocate("b", "a🙃x🙃b", 1, UNICODE_CI, 5); + assertStringLocate("b", "a🙃x🙃b", 2, UTF8_BINARY, 5); + assertStringLocate("b", "a🙃x🙃b", 2, UTF8_LCASE, 5); + assertStringLocate("b", "a🙃x🙃b", 2, UNICODE, 5); + assertStringLocate("b", "a🙃x🙃b", 2, UNICODE_CI, 5); + assertStringLocate("b", "a🙃x🙃b", 3, UTF8_BINARY, 5); + assertStringLocate("b", "a🙃x🙃b", 3, UTF8_LCASE, 5); + assertStringLocate("b", "a🙃x🙃b", 3, UNICODE, 5); + assertStringLocate("b", "a🙃x🙃b", 3, UNICODE_CI, 5); + assertStringLocate("b", "a🙃x🙃b", 4, UTF8_BINARY, 5); + assertStringLocate("b", "a🙃x🙃b", 4, UTF8_LCASE, 5); + assertStringLocate("b", "a🙃x🙃b", 4, UNICODE, 5); + assertStringLocate("b", "a🙃x🙃b", 4, UNICODE_CI, 5); // Out of bounds test cases. 
- assertStringLocate("a", "asd", 4, "UTF8_BINARY", 0); - assertStringLocate("a", "asd", 4, "UTF8_LCASE", 0); - assertStringLocate("a", "asd", 4, "UNICODE", 0); - assertStringLocate("a", "asd", 4, "UNICODE_CI", 0); - assertStringLocate("a", "asd", 100, "UTF8_BINARY", 0); - assertStringLocate("a", "asd", 100, "UTF8_LCASE", 0); - assertStringLocate("a", "asd", 100, "UNICODE", 0); - assertStringLocate("a", "asd", 100, "UNICODE_CI", 0); - assertStringLocate("a", "🙃🙃", 4, "UTF8_BINARY", 0); - assertStringLocate("a", "🙃🙃", 4, "UTF8_LCASE", 0); - assertStringLocate("a", "🙃🙃", 4, "UNICODE", 0); - assertStringLocate("a", "🙃🙃", 4, "UNICODE_CI", 0); - assertStringLocate("", "asd", 100, "UTF8_BINARY", 1); - assertStringLocate("", "asd", 100, "UTF8_LCASE", 1); - assertStringLocate("", "asd", 100, "UNICODE", 1); - assertStringLocate("", "asd", 100, "UNICODE_CI", 1); - assertStringLocate("asd", "", 100, "UTF8_BINARY", 0); - assertStringLocate("asd", "", 100, "UTF8_LCASE", 0); - assertStringLocate("asd", "", 100, "UNICODE", 0); - assertStringLocate("asd", "", 100, "UNICODE_CI", 0); + assertStringLocate("a", "asd", 4, UTF8_BINARY, 0); + assertStringLocate("a", "asd", 4, UTF8_LCASE, 0); + assertStringLocate("a", "asd", 4, UNICODE, 0); + assertStringLocate("a", "asd", 4, UNICODE_CI, 0); + assertStringLocate("a", "asd", 100, UTF8_BINARY, 0); + assertStringLocate("a", "asd", 100, UTF8_LCASE, 0); + assertStringLocate("a", "asd", 100, UNICODE, 0); + assertStringLocate("a", "asd", 100, UNICODE_CI, 0); + assertStringLocate("a", "🙃🙃", 4, UTF8_BINARY, 0); + assertStringLocate("a", "🙃🙃", 4, UTF8_LCASE, 0); + assertStringLocate("a", "🙃🙃", 4, UNICODE, 0); + assertStringLocate("a", "🙃🙃", 4, UNICODE_CI, 0); + assertStringLocate("", "asd", 100, UTF8_BINARY, 1); + assertStringLocate("", "asd", 100, UTF8_LCASE, 1); + assertStringLocate("", "asd", 100, UNICODE, 1); + assertStringLocate("", "asd", 100, UNICODE_CI, 1); + assertStringLocate("asd", "", 100, UTF8_BINARY, 0); + assertStringLocate("asd", "", 
100, UTF8_LCASE, 0); + assertStringLocate("asd", "", 100, UNICODE, 0); + assertStringLocate("asd", "", 100, UNICODE_CI, 0); } /** @@ -2450,292 +2451,292 @@ private void assertSubstringIndex(String string, String delimiter, int count, @Test public void testSubstringIndex() throws SparkException { // Empty strings. - assertSubstringIndex("", "", 0, "UTF8_BINARY", ""); - assertSubstringIndex("", "", 0, "UTF8_LCASE", ""); - assertSubstringIndex("", "", 0, "UNICODE", ""); - assertSubstringIndex("", "", 0, "UNICODE_CI", ""); - assertSubstringIndex("", "", 1, "UTF8_BINARY", ""); - assertSubstringIndex("", "", 1, "UTF8_LCASE", ""); - assertSubstringIndex("", "", 1, "UNICODE", ""); - assertSubstringIndex("", "", 1, "UNICODE_CI", ""); - assertSubstringIndex("", "", -1, "UTF8_BINARY", ""); - assertSubstringIndex("", "", -1, "UTF8_LCASE", ""); - assertSubstringIndex("", "", -1, "UNICODE", ""); - assertSubstringIndex("", "", -1, "UNICODE_CI", ""); - assertSubstringIndex("", "x", 0, "UTF8_BINARY", ""); - assertSubstringIndex("", "x", 0, "UTF8_LCASE", ""); - assertSubstringIndex("", "x", 0, "UNICODE", ""); - assertSubstringIndex("", "x", 0, "UNICODE_CI", ""); - assertSubstringIndex("", "x", 1, "UTF8_BINARY", ""); - assertSubstringIndex("", "x", 1, "UTF8_LCASE", ""); - assertSubstringIndex("", "x", 1, "UNICODE", ""); - assertSubstringIndex("", "x", 1, "UNICODE_CI", ""); - assertSubstringIndex("", "x", -1, "UTF8_BINARY", ""); - assertSubstringIndex("", "x", -1, "UTF8_LCASE", ""); - assertSubstringIndex("", "x", -1, "UNICODE", ""); - assertSubstringIndex("", "x", -1, "UNICODE_CI", ""); - assertSubstringIndex("abc", "", 0, "UTF8_BINARY", ""); - assertSubstringIndex("abc", "", 0, "UTF8_LCASE", ""); - assertSubstringIndex("abc", "", 0, "UNICODE", ""); - assertSubstringIndex("abc", "", 0, "UNICODE_CI", ""); - assertSubstringIndex("abc", "", 1, "UTF8_BINARY", ""); - assertSubstringIndex("abc", "", 1, "UTF8_LCASE", ""); - assertSubstringIndex("abc", "", 1, "UNICODE", ""); - 
assertSubstringIndex("abc", "", 1, "UNICODE_CI", ""); - assertSubstringIndex("abc", "", -1, "UTF8_BINARY", ""); - assertSubstringIndex("abc", "", -1, "UTF8_LCASE", ""); - assertSubstringIndex("abc", "", -1, "UNICODE", ""); - assertSubstringIndex("abc", "", -1, "UNICODE_CI", ""); + assertSubstringIndex("", "", 0, UTF8_BINARY, ""); + assertSubstringIndex("", "", 0, UTF8_LCASE, ""); + assertSubstringIndex("", "", 0, UNICODE, ""); + assertSubstringIndex("", "", 0, UNICODE_CI, ""); + assertSubstringIndex("", "", 1, UTF8_BINARY, ""); + assertSubstringIndex("", "", 1, UTF8_LCASE, ""); + assertSubstringIndex("", "", 1, UNICODE, ""); + assertSubstringIndex("", "", 1, UNICODE_CI, ""); + assertSubstringIndex("", "", -1, UTF8_BINARY, ""); + assertSubstringIndex("", "", -1, UTF8_LCASE, ""); + assertSubstringIndex("", "", -1, UNICODE, ""); + assertSubstringIndex("", "", -1, UNICODE_CI, ""); + assertSubstringIndex("", "x", 0, UTF8_BINARY, ""); + assertSubstringIndex("", "x", 0, UTF8_LCASE, ""); + assertSubstringIndex("", "x", 0, UNICODE, ""); + assertSubstringIndex("", "x", 0, UNICODE_CI, ""); + assertSubstringIndex("", "x", 1, UTF8_BINARY, ""); + assertSubstringIndex("", "x", 1, UTF8_LCASE, ""); + assertSubstringIndex("", "x", 1, UNICODE, ""); + assertSubstringIndex("", "x", 1, UNICODE_CI, ""); + assertSubstringIndex("", "x", -1, UTF8_BINARY, ""); + assertSubstringIndex("", "x", -1, UTF8_LCASE, ""); + assertSubstringIndex("", "x", -1, UNICODE, ""); + assertSubstringIndex("", "x", -1, UNICODE_CI, ""); + assertSubstringIndex("abc", "", 0, UTF8_BINARY, ""); + assertSubstringIndex("abc", "", 0, UTF8_LCASE, ""); + assertSubstringIndex("abc", "", 0, UNICODE, ""); + assertSubstringIndex("abc", "", 0, UNICODE_CI, ""); + assertSubstringIndex("abc", "", 1, UTF8_BINARY, ""); + assertSubstringIndex("abc", "", 1, UTF8_LCASE, ""); + assertSubstringIndex("abc", "", 1, UNICODE, ""); + assertSubstringIndex("abc", "", 1, UNICODE_CI, ""); + assertSubstringIndex("abc", "", -1, UTF8_BINARY, ""); + 
assertSubstringIndex("abc", "", -1, UTF8_LCASE, ""); + assertSubstringIndex("abc", "", -1, UNICODE, ""); + assertSubstringIndex("abc", "", -1, UNICODE_CI, ""); // Basic tests. - assertSubstringIndex("axbxc", "a", 1, "UTF8_BINARY", ""); - assertSubstringIndex("axbxc", "a", 1, "UTF8_LCASE", ""); - assertSubstringIndex("axbxc", "a", 1, "UNICODE", ""); - assertSubstringIndex("axbxc", "a", 1, "UNICODE_CI", ""); - assertSubstringIndex("axbxc", "x", 1, "UTF8_BINARY", "a"); - assertSubstringIndex("axbxc", "x", 1, "UTF8_LCASE", "a"); - assertSubstringIndex("axbxc", "x", 1, "UNICODE", "a"); - assertSubstringIndex("axbxc", "x", 1, "UNICODE_CI", "a"); - assertSubstringIndex("axbxc", "b", 1, "UTF8_BINARY", "ax"); - assertSubstringIndex("axbxc", "b", 1, "UTF8_LCASE", "ax"); - assertSubstringIndex("axbxc", "b", 1, "UNICODE", "ax"); - assertSubstringIndex("axbxc", "b", 1, "UNICODE_CI", "ax"); - assertSubstringIndex("axbxc", "x", 2, "UTF8_BINARY", "axb"); - assertSubstringIndex("axbxc", "x", 2, "UTF8_LCASE", "axb"); - assertSubstringIndex("axbxc", "x", 2, "UNICODE", "axb"); - assertSubstringIndex("axbxc", "x", 2, "UNICODE_CI", "axb"); - assertSubstringIndex("axbxc", "c", 1, "UTF8_BINARY", "axbx"); - assertSubstringIndex("axbxc", "c", 1, "UTF8_LCASE", "axbx"); - assertSubstringIndex("axbxc", "c", 1, "UNICODE", "axbx"); - assertSubstringIndex("axbxc", "c", 1, "UNICODE_CI", "axbx"); - assertSubstringIndex("axbxc", "x", 3, "UTF8_BINARY", "axbxc"); - assertSubstringIndex("axbxc", "x", 3, "UTF8_LCASE", "axbxc"); - assertSubstringIndex("axbxc", "x", 3, "UNICODE", "axbxc"); - assertSubstringIndex("axbxc", "x", 3, "UNICODE_CI", "axbxc"); - assertSubstringIndex("axbxc", "d", 1, "UTF8_BINARY", "axbxc"); - assertSubstringIndex("axbxc", "d", 1, "UTF8_LCASE", "axbxc"); - assertSubstringIndex("axbxc", "d", 1, "UNICODE", "axbxc"); - assertSubstringIndex("axbxc", "d", 1, "UNICODE_CI", "axbxc"); - assertSubstringIndex("axbxc", "c", -1, "UTF8_BINARY", ""); - assertSubstringIndex("axbxc", "c", -1, 
"UTF8_LCASE", ""); - assertSubstringIndex("axbxc", "c", -1, "UNICODE", ""); - assertSubstringIndex("axbxc", "c", -1, "UNICODE_CI", ""); - assertSubstringIndex("axbxc", "x", -1, "UTF8_BINARY", "c"); - assertSubstringIndex("axbxc", "x", -1, "UTF8_LCASE", "c"); - assertSubstringIndex("axbxc", "x", -1, "UNICODE", "c"); - assertSubstringIndex("axbxc", "x", -1, "UNICODE_CI", "c"); - assertSubstringIndex("axbxc", "b", -1, "UTF8_BINARY", "xc"); - assertSubstringIndex("axbxc", "b", -1, "UTF8_LCASE", "xc"); - assertSubstringIndex("axbxc", "b", -1, "UNICODE", "xc"); - assertSubstringIndex("axbxc", "b", -1, "UNICODE_CI", "xc"); - assertSubstringIndex("axbxc", "x", -2, "UTF8_BINARY", "bxc"); - assertSubstringIndex("axbxc", "x", -2, "UTF8_LCASE", "bxc"); - assertSubstringIndex("axbxc", "x", -2, "UNICODE", "bxc"); - assertSubstringIndex("axbxc", "x", -2, "UNICODE_CI", "bxc"); - assertSubstringIndex("axbxc", "a", -1, "UTF8_BINARY", "xbxc"); - assertSubstringIndex("axbxc", "a", -1, "UTF8_LCASE", "xbxc"); - assertSubstringIndex("axbxc", "a", -1, "UNICODE", "xbxc"); - assertSubstringIndex("axbxc", "a", -1, "UNICODE_CI", "xbxc"); - assertSubstringIndex("axbxc", "x", -3, "UTF8_BINARY", "axbxc"); - assertSubstringIndex("axbxc", "x", -3, "UTF8_LCASE", "axbxc"); - assertSubstringIndex("axbxc", "x", -3, "UNICODE", "axbxc"); - assertSubstringIndex("axbxc", "x", -3, "UNICODE_CI", "axbxc"); - assertSubstringIndex("axbxc", "d", -1, "UTF8_BINARY", "axbxc"); - assertSubstringIndex("axbxc", "d", -1, "UTF8_LCASE", "axbxc"); - assertSubstringIndex("axbxc", "d", -1, "UNICODE", "axbxc"); - assertSubstringIndex("axbxc", "d", -1, "UNICODE_CI", "axbxc"); + assertSubstringIndex("axbxc", "a", 1, UTF8_BINARY, ""); + assertSubstringIndex("axbxc", "a", 1, UTF8_LCASE, ""); + assertSubstringIndex("axbxc", "a", 1, UNICODE, ""); + assertSubstringIndex("axbxc", "a", 1, UNICODE_CI, ""); + assertSubstringIndex("axbxc", "x", 1, UTF8_BINARY, "a"); + assertSubstringIndex("axbxc", "x", 1, UTF8_LCASE, "a"); + 
assertSubstringIndex("axbxc", "x", 1, UNICODE, "a"); + assertSubstringIndex("axbxc", "x", 1, UNICODE_CI, "a"); + assertSubstringIndex("axbxc", "b", 1, UTF8_BINARY, "ax"); + assertSubstringIndex("axbxc", "b", 1, UTF8_LCASE, "ax"); + assertSubstringIndex("axbxc", "b", 1, UNICODE, "ax"); + assertSubstringIndex("axbxc", "b", 1, UNICODE_CI, "ax"); + assertSubstringIndex("axbxc", "x", 2, UTF8_BINARY, "axb"); + assertSubstringIndex("axbxc", "x", 2, UTF8_LCASE, "axb"); + assertSubstringIndex("axbxc", "x", 2, UNICODE, "axb"); + assertSubstringIndex("axbxc", "x", 2, UNICODE_CI, "axb"); + assertSubstringIndex("axbxc", "c", 1, UTF8_BINARY, "axbx"); + assertSubstringIndex("axbxc", "c", 1, UTF8_LCASE, "axbx"); + assertSubstringIndex("axbxc", "c", 1, UNICODE, "axbx"); + assertSubstringIndex("axbxc", "c", 1, UNICODE_CI, "axbx"); + assertSubstringIndex("axbxc", "x", 3, UTF8_BINARY, "axbxc"); + assertSubstringIndex("axbxc", "x", 3, UTF8_LCASE, "axbxc"); + assertSubstringIndex("axbxc", "x", 3, UNICODE, "axbxc"); + assertSubstringIndex("axbxc", "x", 3, UNICODE_CI, "axbxc"); + assertSubstringIndex("axbxc", "d", 1, UTF8_BINARY, "axbxc"); + assertSubstringIndex("axbxc", "d", 1, UTF8_LCASE, "axbxc"); + assertSubstringIndex("axbxc", "d", 1, UNICODE, "axbxc"); + assertSubstringIndex("axbxc", "d", 1, UNICODE_CI, "axbxc"); + assertSubstringIndex("axbxc", "c", -1, UTF8_BINARY, ""); + assertSubstringIndex("axbxc", "c", -1, UTF8_LCASE, ""); + assertSubstringIndex("axbxc", "c", -1, UNICODE, ""); + assertSubstringIndex("axbxc", "c", -1, UNICODE_CI, ""); + assertSubstringIndex("axbxc", "x", -1, UTF8_BINARY, "c"); + assertSubstringIndex("axbxc", "x", -1, UTF8_LCASE, "c"); + assertSubstringIndex("axbxc", "x", -1, UNICODE, "c"); + assertSubstringIndex("axbxc", "x", -1, UNICODE_CI, "c"); + assertSubstringIndex("axbxc", "b", -1, UTF8_BINARY, "xc"); + assertSubstringIndex("axbxc", "b", -1, UTF8_LCASE, "xc"); + assertSubstringIndex("axbxc", "b", -1, UNICODE, "xc"); + assertSubstringIndex("axbxc", "b", -1, 
UNICODE_CI, "xc"); + assertSubstringIndex("axbxc", "x", -2, UTF8_BINARY, "bxc"); + assertSubstringIndex("axbxc", "x", -2, UTF8_LCASE, "bxc"); + assertSubstringIndex("axbxc", "x", -2, UNICODE, "bxc"); + assertSubstringIndex("axbxc", "x", -2, UNICODE_CI, "bxc"); + assertSubstringIndex("axbxc", "a", -1, UTF8_BINARY, "xbxc"); + assertSubstringIndex("axbxc", "a", -1, UTF8_LCASE, "xbxc"); + assertSubstringIndex("axbxc", "a", -1, UNICODE, "xbxc"); + assertSubstringIndex("axbxc", "a", -1, UNICODE_CI, "xbxc"); + assertSubstringIndex("axbxc", "x", -3, UTF8_BINARY, "axbxc"); + assertSubstringIndex("axbxc", "x", -3, UTF8_LCASE, "axbxc"); + assertSubstringIndex("axbxc", "x", -3, UNICODE, "axbxc"); + assertSubstringIndex("axbxc", "x", -3, UNICODE_CI, "axbxc"); + assertSubstringIndex("axbxc", "d", -1, UTF8_BINARY, "axbxc"); + assertSubstringIndex("axbxc", "d", -1, UTF8_LCASE, "axbxc"); + assertSubstringIndex("axbxc", "d", -1, UNICODE, "axbxc"); + assertSubstringIndex("axbxc", "d", -1, UNICODE_CI, "axbxc"); // Advanced tests. 
- assertSubstringIndex("wwwgapachegorg", "g", -3, "UTF8_BINARY", "apachegorg"); - assertSubstringIndex("www||apache||org", "||", 2, "UTF8_BINARY", "www||apache"); - assertSubstringIndex("aaaaaaaaaa", "aa", 2, "UTF8_BINARY", "a"); - assertSubstringIndex("AaAaAaAaAa", "aa", 2, "UTF8_LCASE", "A"); - assertSubstringIndex("www.apache.org", ".", 3, "UTF8_LCASE", "www.apache.org"); - assertSubstringIndex("wwwXapacheXorg", "x", 2, "UTF8_LCASE", "wwwXapache"); - assertSubstringIndex("wwwxapachexorg", "X", 1, "UTF8_LCASE", "www"); - assertSubstringIndex("www.apache.org", ".", 0, "UTF8_LCASE", ""); - assertSubstringIndex("www.apache.ORG", ".", -3, "UTF8_LCASE", "www.apache.ORG"); - assertSubstringIndex("wwwGapacheGorg", "g", 1, "UTF8_LCASE", "www"); - assertSubstringIndex("wwwGapacheGorg", "g", 3, "UTF8_LCASE", "wwwGapacheGor"); - assertSubstringIndex("gwwwGapacheGorg", "g", 3, "UTF8_LCASE", "gwwwGapache"); - assertSubstringIndex("wwwGapacheGorg", "g", -3, "UTF8_LCASE", "apacheGorg"); - assertSubstringIndex("wwwmapacheMorg", "M", -2, "UTF8_LCASE", "apacheMorg"); - assertSubstringIndex("www.apache.org", ".", -1, "UTF8_LCASE", "org"); - assertSubstringIndex("www.apache.org.", ".", -1, "UTF8_LCASE", ""); - assertSubstringIndex("", ".", -2, "UTF8_LCASE", ""); - assertSubstringIndex("test大千世界X大千世界", "x", -1, "UTF8_LCASE", "大千世界"); - assertSubstringIndex("test大千世界X大千世界", "X", 1, "UTF8_LCASE", "test大千世界"); - assertSubstringIndex("test大千世界大千世界", "千", 2, "UTF8_LCASE", "test大千世界大"); - assertSubstringIndex("www||APACHE||org", "||", 2, "UTF8_LCASE", "www||APACHE"); - assertSubstringIndex("www||APACHE||org", "||", -1, "UTF8_LCASE", "org"); - assertSubstringIndex("AaAaAaAaAa", "Aa", 2, "UNICODE", "Aa"); - assertSubstringIndex("wwwYapacheyorg", "y", 3, "UNICODE", "wwwYapacheyorg"); - assertSubstringIndex("www.apache.org", ".", 2, "UNICODE", "www.apache"); - assertSubstringIndex("wwwYapacheYorg", "Y", 1, "UNICODE", "www"); - assertSubstringIndex("wwwYapacheYorg", "y", 1, "UNICODE", 
"wwwYapacheYorg"); - assertSubstringIndex("wwwGapacheGorg", "g", 1, "UNICODE", "wwwGapacheGor"); - assertSubstringIndex("GwwwGapacheGorG", "G", 3, "UNICODE", "GwwwGapache"); - assertSubstringIndex("wwwGapacheGorG", "G", -3, "UNICODE", "apacheGorG"); - assertSubstringIndex("www.apache.org", ".", 0, "UNICODE", ""); - assertSubstringIndex("www.apache.org", ".", -3, "UNICODE", "www.apache.org"); - assertSubstringIndex("www.apache.org", ".", -2, "UNICODE", "apache.org"); - assertSubstringIndex("www.apache.org", ".", -1, "UNICODE", "org"); - assertSubstringIndex("", ".", -2, "UNICODE", ""); - assertSubstringIndex("test大千世界X大千世界", "X", -1, "UNICODE", "大千世界"); - assertSubstringIndex("test大千世界X大千世界", "X", 1, "UNICODE", "test大千世界"); - assertSubstringIndex("大x千世界大千世x界", "x", 1, "UNICODE", "大"); - assertSubstringIndex("大x千世界大千世x界", "x", -1, "UNICODE", "界"); - assertSubstringIndex("大x千世界大千世x界", "x", -2, "UNICODE", "千世界大千世x界"); - assertSubstringIndex("大千世界大千世界", "千", 2, "UNICODE", "大千世界大"); - assertSubstringIndex("www||apache||org", "||", 2, "UNICODE", "www||apache"); - assertSubstringIndex("AaAaAaAaAa", "aa", 2, "UNICODE_CI", "A"); - assertSubstringIndex("www.apache.org", ".", 3, "UNICODE_CI", "www.apache.org"); - assertSubstringIndex("wwwXapacheXorg", "x", 2, "UNICODE_CI", "wwwXapache"); - assertSubstringIndex("wwwxapacheXorg", "X", 1, "UNICODE_CI", "www"); - assertSubstringIndex("www.apache.org", ".", 0, "UNICODE_CI", ""); - assertSubstringIndex("wwwGapacheGorg", "G", 3, "UNICODE_CI", "wwwGapacheGor"); - assertSubstringIndex("gwwwGapacheGorg", "g", 3, "UNICODE_CI", "gwwwGapache"); - assertSubstringIndex("gwwwGapacheGorg", "g", -3, "UNICODE_CI", "apacheGorg"); - assertSubstringIndex("www.apache.ORG", ".", -3, "UNICODE_CI", "www.apache.ORG"); - assertSubstringIndex("wwwmapacheMorg", "M", -2, "UNICODE_CI", "apacheMorg"); - assertSubstringIndex("www.apache.org", ".", -1, "UNICODE_CI", "org"); - assertSubstringIndex("", ".", -2, "UNICODE_CI", ""); - 
assertSubstringIndex("test大千世界X大千世界", "X", -1, "UNICODE_CI", "大千世界"); - assertSubstringIndex("test大千世界X大千世界", "X", 1, "UNICODE_CI", "test大千世界"); - assertSubstringIndex("test大千世界大千世界", "千", 2, "UNICODE_CI", "test大千世界大"); - assertSubstringIndex("www||APACHE||org", "||", 2, "UNICODE_CI", "www||APACHE"); + assertSubstringIndex("wwwgapachegorg", "g", -3, UTF8_BINARY, "apachegorg"); + assertSubstringIndex("www||apache||org", "||", 2, UTF8_BINARY, "www||apache"); + assertSubstringIndex("aaaaaaaaaa", "aa", 2, UTF8_BINARY, "a"); + assertSubstringIndex("AaAaAaAaAa", "aa", 2, UTF8_LCASE, "A"); + assertSubstringIndex("www.apache.org", ".", 3, UTF8_LCASE, "www.apache.org"); + assertSubstringIndex("wwwXapacheXorg", "x", 2, UTF8_LCASE, "wwwXapache"); + assertSubstringIndex("wwwxapachexorg", "X", 1, UTF8_LCASE, "www"); + assertSubstringIndex("www.apache.org", ".", 0, UTF8_LCASE, ""); + assertSubstringIndex("www.apache.ORG", ".", -3, UTF8_LCASE, "www.apache.ORG"); + assertSubstringIndex("wwwGapacheGorg", "g", 1, UTF8_LCASE, "www"); + assertSubstringIndex("wwwGapacheGorg", "g", 3, UTF8_LCASE, "wwwGapacheGor"); + assertSubstringIndex("gwwwGapacheGorg", "g", 3, UTF8_LCASE, "gwwwGapache"); + assertSubstringIndex("wwwGapacheGorg", "g", -3, UTF8_LCASE, "apacheGorg"); + assertSubstringIndex("wwwmapacheMorg", "M", -2, UTF8_LCASE, "apacheMorg"); + assertSubstringIndex("www.apache.org", ".", -1, UTF8_LCASE, "org"); + assertSubstringIndex("www.apache.org.", ".", -1, UTF8_LCASE, ""); + assertSubstringIndex("", ".", -2, UTF8_LCASE, ""); + assertSubstringIndex("test大千世界X大千世界", "x", -1, UTF8_LCASE, "大千世界"); + assertSubstringIndex("test大千世界X大千世界", "X", 1, UTF8_LCASE, "test大千世界"); + assertSubstringIndex("test大千世界大千世界", "千", 2, UTF8_LCASE, "test大千世界大"); + assertSubstringIndex("www||APACHE||org", "||", 2, UTF8_LCASE, "www||APACHE"); + assertSubstringIndex("www||APACHE||org", "||", -1, UTF8_LCASE, "org"); + assertSubstringIndex("AaAaAaAaAa", "Aa", 2, UNICODE, "Aa"); + 
assertSubstringIndex("wwwYapacheyorg", "y", 3, UNICODE, "wwwYapacheyorg"); + assertSubstringIndex("www.apache.org", ".", 2, UNICODE, "www.apache"); + assertSubstringIndex("wwwYapacheYorg", "Y", 1, UNICODE, "www"); + assertSubstringIndex("wwwYapacheYorg", "y", 1, UNICODE, "wwwYapacheYorg"); + assertSubstringIndex("wwwGapacheGorg", "g", 1, UNICODE, "wwwGapacheGor"); + assertSubstringIndex("GwwwGapacheGorG", "G", 3, UNICODE, "GwwwGapache"); + assertSubstringIndex("wwwGapacheGorG", "G", -3, UNICODE, "apacheGorG"); + assertSubstringIndex("www.apache.org", ".", 0, UNICODE, ""); + assertSubstringIndex("www.apache.org", ".", -3, UNICODE, "www.apache.org"); + assertSubstringIndex("www.apache.org", ".", -2, UNICODE, "apache.org"); + assertSubstringIndex("www.apache.org", ".", -1, UNICODE, "org"); + assertSubstringIndex("", ".", -2, UNICODE, ""); + assertSubstringIndex("test大千世界X大千世界", "X", -1, UNICODE, "大千世界"); + assertSubstringIndex("test大千世界X大千世界", "X", 1, UNICODE, "test大千世界"); + assertSubstringIndex("大x千世界大千世x界", "x", 1, UNICODE, "大"); + assertSubstringIndex("大x千世界大千世x界", "x", -1, UNICODE, "界"); + assertSubstringIndex("大x千世界大千世x界", "x", -2, UNICODE, "千世界大千世x界"); + assertSubstringIndex("大千世界大千世界", "千", 2, UNICODE, "大千世界大"); + assertSubstringIndex("www||apache||org", "||", 2, UNICODE, "www||apache"); + assertSubstringIndex("AaAaAaAaAa", "aa", 2, UNICODE_CI, "A"); + assertSubstringIndex("www.apache.org", ".", 3, UNICODE_CI, "www.apache.org"); + assertSubstringIndex("wwwXapacheXorg", "x", 2, UNICODE_CI, "wwwXapache"); + assertSubstringIndex("wwwxapacheXorg", "X", 1, UNICODE_CI, "www"); + assertSubstringIndex("www.apache.org", ".", 0, UNICODE_CI, ""); + assertSubstringIndex("wwwGapacheGorg", "G", 3, UNICODE_CI, "wwwGapacheGor"); + assertSubstringIndex("gwwwGapacheGorg", "g", 3, UNICODE_CI, "gwwwGapache"); + assertSubstringIndex("gwwwGapacheGorg", "g", -3, UNICODE_CI, "apacheGorg"); + assertSubstringIndex("www.apache.ORG", ".", -3, UNICODE_CI, "www.apache.ORG"); + 
assertSubstringIndex("wwwmapacheMorg", "M", -2, UNICODE_CI, "apacheMorg"); + assertSubstringIndex("www.apache.org", ".", -1, UNICODE_CI, "org"); + assertSubstringIndex("", ".", -2, UNICODE_CI, ""); + assertSubstringIndex("test大千世界X大千世界", "X", -1, UNICODE_CI, "大千世界"); + assertSubstringIndex("test大千世界X大千世界", "X", 1, UNICODE_CI, "test大千世界"); + assertSubstringIndex("test大千世界大千世界", "千", 2, UNICODE_CI, "test大千世界大"); + assertSubstringIndex("www||APACHE||org", "||", 2, UNICODE_CI, "www||APACHE"); assertSubstringIndex("wwwèapacheËorg", "Ê", -3, "AF_CI_AI", "apacheËorg"); // One-to-many case mapping (e.g. Turkish dotted I). - assertSubstringIndex("abİo12", "i\u0307o", 1, "UNICODE_CI", "ab"); - assertSubstringIndex("abİo12", "i\u0307o", -1, "UNICODE_CI", "12"); - assertSubstringIndex("abi̇o12", "İo", 1, "UNICODE_CI", "ab"); - assertSubstringIndex("abi̇o12", "İo", -1, "UNICODE_CI", "12"); - assertSubstringIndex("ai̇bi̇o12", "İo", 1, "UNICODE_CI", "ai̇b"); - assertSubstringIndex("ai̇bi̇o12i̇o", "İo", 2, "UNICODE_CI", "ai̇bi̇o12"); - assertSubstringIndex("ai̇bi̇o12i̇o", "İo", -1, "UNICODE_CI", ""); - assertSubstringIndex("ai̇bi̇o12i̇o", "İo", -2, "UNICODE_CI", "12i̇o"); - assertSubstringIndex("ai̇bi̇oİo12İoi̇o", "İo", -4, "UNICODE_CI", "İo12İoi̇o"); - assertSubstringIndex("ai̇bi̇oİo12İoi̇o", "i\u0307o", -4, "UNICODE_CI", "İo12İoi̇o"); - assertSubstringIndex("ai̇bİoi̇o12i̇oİo", "İo", -4, "UNICODE_CI", "i̇o12i̇oİo"); - assertSubstringIndex("ai̇bİoi̇o12i̇oİo", "i\u0307o", -4, "UNICODE_CI", "i̇o12i̇oİo"); - assertSubstringIndex("abi̇12", "i", 1, "UNICODE_CI", "abi̇12"); - assertSubstringIndex("abi̇12", "\u0307", 1, "UNICODE_CI", "abi̇12"); - assertSubstringIndex("abi̇12", "İ", 1, "UNICODE_CI", "ab"); - assertSubstringIndex("abİ12", "i", 1, "UNICODE_CI", "abİ12"); - assertSubstringIndex("ai̇bi̇oİo12İoi̇o", "İo", -4, "UNICODE_CI", "İo12İoi̇o"); - assertSubstringIndex("ai̇bi̇oİo12İoi̇o", "i\u0307o", -4, "UNICODE_CI", "İo12İoi̇o"); - assertSubstringIndex("ai̇bİoi̇o12i̇oİo", "İo", -4, 
"UNICODE_CI", "i̇o12i̇oİo"); - assertSubstringIndex("ai̇bİoi̇o12i̇oİo", "i\u0307o", -4, "UNICODE_CI", "i̇o12i̇oİo"); - assertSubstringIndex("ai̇bi̇oİo12İoi̇o", "İo", 3, "UNICODE_CI", "ai̇bi̇oİo12"); - assertSubstringIndex("ai̇bi̇oİo12İoi̇o", "i\u0307o", 3, "UNICODE_CI", "ai̇bi̇oİo12"); - assertSubstringIndex("ai̇bİoi̇o12i̇oİo", "İo", 3, "UNICODE_CI", "ai̇bİoi̇o12"); - assertSubstringIndex("ai̇bİoi̇o12i̇oİo", "i\u0307o", 3, "UNICODE_CI", "ai̇bİoi̇o12"); - assertSubstringIndex("abi̇12", "i", 1, "UTF8_LCASE", "ab"); // != UNICODE_CI - assertSubstringIndex("abi̇12", "\u0307", 1, "UTF8_LCASE", "abi"); // != UNICODE_CI - assertSubstringIndex("abi̇12", "İ", 1, "UTF8_LCASE", "ab"); - assertSubstringIndex("abİ12", "i", 1, "UTF8_LCASE", "abİ12"); - assertSubstringIndex("ai̇bi̇oİo12İoi̇o", "İo", -4, "UTF8_LCASE", "İo12İoi̇o"); - assertSubstringIndex("ai̇bi̇oİo12İoi̇o", "i\u0307o", -4, "UTF8_LCASE", "İo12İoi̇o"); - assertSubstringIndex("ai̇bİoi̇o12i̇oİo", "İo", -4, "UTF8_LCASE", "i̇o12i̇oİo"); - assertSubstringIndex("ai̇bİoi̇o12i̇oİo", "i\u0307o", -4, "UTF8_LCASE", "i̇o12i̇oİo"); - assertSubstringIndex("bİoi̇o12i̇o", "\u0307oi", 1, "UTF8_LCASE", "bİoi̇o12i̇o"); - assertSubstringIndex("ai̇bi̇oİo12İoi̇o", "İo", 3, "UTF8_LCASE", "ai̇bi̇oİo12"); - assertSubstringIndex("ai̇bi̇oİo12İoi̇o", "i\u0307o", 3, "UTF8_LCASE", "ai̇bi̇oİo12"); - assertSubstringIndex("ai̇bİoi̇o12i̇oİo", "İo", 3, "UTF8_LCASE", "ai̇bİoi̇o12"); - assertSubstringIndex("ai̇bİoi̇o12i̇oİo", "i\u0307o", 3, "UTF8_LCASE", "ai̇bİoi̇o12"); - assertSubstringIndex("bİoi̇o12i̇o", "\u0307oi", 1, "UTF8_LCASE", "bİoi̇o12i̇o"); + assertSubstringIndex("abİo12", "i\u0307o", 1, UNICODE_CI, "ab"); + assertSubstringIndex("abİo12", "i\u0307o", -1, UNICODE_CI, "12"); + assertSubstringIndex("abi̇o12", "İo", 1, UNICODE_CI, "ab"); + assertSubstringIndex("abi̇o12", "İo", -1, UNICODE_CI, "12"); + assertSubstringIndex("ai̇bi̇o12", "İo", 1, UNICODE_CI, "ai̇b"); + assertSubstringIndex("ai̇bi̇o12i̇o", "İo", 2, UNICODE_CI, "ai̇bi̇o12"); + 
assertSubstringIndex("ai̇bi̇o12i̇o", "İo", -1, UNICODE_CI, ""); + assertSubstringIndex("ai̇bi̇o12i̇o", "İo", -2, UNICODE_CI, "12i̇o"); + assertSubstringIndex("ai̇bi̇oİo12İoi̇o", "İo", -4, UNICODE_CI, "İo12İoi̇o"); + assertSubstringIndex("ai̇bi̇oİo12İoi̇o", "i\u0307o", -4, UNICODE_CI, "İo12İoi̇o"); + assertSubstringIndex("ai̇bİoi̇o12i̇oİo", "İo", -4, UNICODE_CI, "i̇o12i̇oİo"); + assertSubstringIndex("ai̇bİoi̇o12i̇oİo", "i\u0307o", -4, UNICODE_CI, "i̇o12i̇oİo"); + assertSubstringIndex("abi̇12", "i", 1, UNICODE_CI, "abi̇12"); + assertSubstringIndex("abi̇12", "\u0307", 1, UNICODE_CI, "abi̇12"); + assertSubstringIndex("abi̇12", "İ", 1, UNICODE_CI, "ab"); + assertSubstringIndex("abİ12", "i", 1, UNICODE_CI, "abİ12"); + assertSubstringIndex("ai̇bi̇oİo12İoi̇o", "İo", -4, UNICODE_CI, "İo12İoi̇o"); + assertSubstringIndex("ai̇bi̇oİo12İoi̇o", "i\u0307o", -4, UNICODE_CI, "İo12İoi̇o"); + assertSubstringIndex("ai̇bİoi̇o12i̇oİo", "İo", -4, UNICODE_CI, "i̇o12i̇oİo"); + assertSubstringIndex("ai̇bİoi̇o12i̇oİo", "i\u0307o", -4, UNICODE_CI, "i̇o12i̇oİo"); + assertSubstringIndex("ai̇bi̇oİo12İoi̇o", "İo", 3, UNICODE_CI, "ai̇bi̇oİo12"); + assertSubstringIndex("ai̇bi̇oİo12İoi̇o", "i\u0307o", 3, UNICODE_CI, "ai̇bi̇oİo12"); + assertSubstringIndex("ai̇bİoi̇o12i̇oİo", "İo", 3, UNICODE_CI, "ai̇bİoi̇o12"); + assertSubstringIndex("ai̇bİoi̇o12i̇oİo", "i\u0307o", 3, UNICODE_CI, "ai̇bİoi̇o12"); + assertSubstringIndex("abi̇12", "i", 1, UTF8_LCASE, "ab"); // != UNICODE_CI + assertSubstringIndex("abi̇12", "\u0307", 1, UTF8_LCASE, "abi"); // != UNICODE_CI + assertSubstringIndex("abi̇12", "İ", 1, UTF8_LCASE, "ab"); + assertSubstringIndex("abİ12", "i", 1, UTF8_LCASE, "abİ12"); + assertSubstringIndex("ai̇bi̇oİo12İoi̇o", "İo", -4, UTF8_LCASE, "İo12İoi̇o"); + assertSubstringIndex("ai̇bi̇oİo12İoi̇o", "i\u0307o", -4, UTF8_LCASE, "İo12İoi̇o"); + assertSubstringIndex("ai̇bİoi̇o12i̇oİo", "İo", -4, UTF8_LCASE, "i̇o12i̇oİo"); + assertSubstringIndex("ai̇bİoi̇o12i̇oİo", "i\u0307o", -4, UTF8_LCASE, "i̇o12i̇oİo"); + 
assertSubstringIndex("bİoi̇o12i̇o", "\u0307oi", 1, UTF8_LCASE, "bİoi̇o12i̇o"); + assertSubstringIndex("ai̇bi̇oİo12İoi̇o", "İo", 3, UTF8_LCASE, "ai̇bi̇oİo12"); + assertSubstringIndex("ai̇bi̇oİo12İoi̇o", "i\u0307o", 3, UTF8_LCASE, "ai̇bi̇oİo12"); + assertSubstringIndex("ai̇bİoi̇o12i̇oİo", "İo", 3, UTF8_LCASE, "ai̇bİoi̇o12"); + assertSubstringIndex("ai̇bİoi̇o12i̇oİo", "i\u0307o", 3, UTF8_LCASE, "ai̇bİoi̇o12"); + assertSubstringIndex("bİoi̇o12i̇o", "\u0307oi", 1, UTF8_LCASE, "bİoi̇o12i̇o"); // Conditional case mapping (e.g. Greek sigmas). - assertSubstringIndex("σ", "σ", 1, "UTF8_BINARY", ""); - assertSubstringIndex("σ", "ς", 1, "UTF8_BINARY", "σ"); - assertSubstringIndex("σ", "Σ", 1, "UTF8_BINARY", "σ"); - assertSubstringIndex("ς", "σ", 1, "UTF8_BINARY", "ς"); - assertSubstringIndex("ς", "ς", 1, "UTF8_BINARY", ""); - assertSubstringIndex("ς", "Σ", 1, "UTF8_BINARY", "ς"); - assertSubstringIndex("Σ", "σ", 1, "UTF8_BINARY", "Σ"); - assertSubstringIndex("Σ", "ς", 1, "UTF8_BINARY", "Σ"); - assertSubstringIndex("Σ", "Σ", 1, "UTF8_BINARY", ""); - assertSubstringIndex("σ", "σ", 1, "UTF8_LCASE", ""); - assertSubstringIndex("σ", "ς", 1, "UTF8_LCASE", ""); - assertSubstringIndex("σ", "Σ", 1, "UTF8_LCASE", ""); - assertSubstringIndex("ς", "σ", 1, "UTF8_LCASE", ""); - assertSubstringIndex("ς", "ς", 1, "UTF8_LCASE", ""); - assertSubstringIndex("ς", "Σ", 1, "UTF8_LCASE", ""); - assertSubstringIndex("Σ", "σ", 1, "UTF8_LCASE", ""); - assertSubstringIndex("Σ", "ς", 1, "UTF8_LCASE", ""); - assertSubstringIndex("Σ", "Σ", 1, "UTF8_LCASE", ""); - assertSubstringIndex("σ", "σ", 1, "UNICODE", ""); - assertSubstringIndex("σ", "ς", 1, "UNICODE", "σ"); - assertSubstringIndex("σ", "Σ", 1, "UNICODE", "σ"); - assertSubstringIndex("ς", "σ", 1, "UNICODE", "ς"); - assertSubstringIndex("ς", "ς", 1, "UNICODE", ""); - assertSubstringIndex("ς", "Σ", 1, "UNICODE", "ς"); - assertSubstringIndex("Σ", "σ", 1, "UNICODE", "Σ"); - assertSubstringIndex("Σ", "ς", 1, "UNICODE", "Σ"); - assertSubstringIndex("Σ", 
"Σ", 1, "UNICODE", ""); - assertSubstringIndex("σ", "σ", 1, "UNICODE_CI", ""); - assertSubstringIndex("σ", "ς", 1, "UNICODE_CI", ""); - assertSubstringIndex("σ", "Σ", 1, "UNICODE_CI", ""); - assertSubstringIndex("ς", "σ", 1, "UNICODE_CI", ""); - assertSubstringIndex("ς", "ς", 1, "UNICODE_CI", ""); - assertSubstringIndex("ς", "Σ", 1, "UNICODE_CI", ""); - assertSubstringIndex("Σ", "σ", 1, "UNICODE_CI", ""); - assertSubstringIndex("Σ", "ς", 1, "UNICODE_CI", ""); - assertSubstringIndex("Σ", "Σ", 1, "UNICODE_CI", ""); + assertSubstringIndex("σ", "σ", 1, UTF8_BINARY, ""); + assertSubstringIndex("σ", "ς", 1, UTF8_BINARY, "σ"); + assertSubstringIndex("σ", "Σ", 1, UTF8_BINARY, "σ"); + assertSubstringIndex("ς", "σ", 1, UTF8_BINARY, "ς"); + assertSubstringIndex("ς", "ς", 1, UTF8_BINARY, ""); + assertSubstringIndex("ς", "Σ", 1, UTF8_BINARY, "ς"); + assertSubstringIndex("Σ", "σ", 1, UTF8_BINARY, "Σ"); + assertSubstringIndex("Σ", "ς", 1, UTF8_BINARY, "Σ"); + assertSubstringIndex("Σ", "Σ", 1, UTF8_BINARY, ""); + assertSubstringIndex("σ", "σ", 1, UTF8_LCASE, ""); + assertSubstringIndex("σ", "ς", 1, UTF8_LCASE, ""); + assertSubstringIndex("σ", "Σ", 1, UTF8_LCASE, ""); + assertSubstringIndex("ς", "σ", 1, UTF8_LCASE, ""); + assertSubstringIndex("ς", "ς", 1, UTF8_LCASE, ""); + assertSubstringIndex("ς", "Σ", 1, UTF8_LCASE, ""); + assertSubstringIndex("Σ", "σ", 1, UTF8_LCASE, ""); + assertSubstringIndex("Σ", "ς", 1, UTF8_LCASE, ""); + assertSubstringIndex("Σ", "Σ", 1, UTF8_LCASE, ""); + assertSubstringIndex("σ", "σ", 1, UNICODE, ""); + assertSubstringIndex("σ", "ς", 1, UNICODE, "σ"); + assertSubstringIndex("σ", "Σ", 1, UNICODE, "σ"); + assertSubstringIndex("ς", "σ", 1, UNICODE, "ς"); + assertSubstringIndex("ς", "ς", 1, UNICODE, ""); + assertSubstringIndex("ς", "Σ", 1, UNICODE, "ς"); + assertSubstringIndex("Σ", "σ", 1, UNICODE, "Σ"); + assertSubstringIndex("Σ", "ς", 1, UNICODE, "Σ"); + assertSubstringIndex("Σ", "Σ", 1, UNICODE, ""); + assertSubstringIndex("σ", "σ", 1, UNICODE_CI, ""); + 
assertSubstringIndex("σ", "ς", 1, UNICODE_CI, ""); + assertSubstringIndex("σ", "Σ", 1, UNICODE_CI, ""); + assertSubstringIndex("ς", "σ", 1, UNICODE_CI, ""); + assertSubstringIndex("ς", "ς", 1, UNICODE_CI, ""); + assertSubstringIndex("ς", "Σ", 1, UNICODE_CI, ""); + assertSubstringIndex("Σ", "σ", 1, UNICODE_CI, ""); + assertSubstringIndex("Σ", "ς", 1, UNICODE_CI, ""); + assertSubstringIndex("Σ", "Σ", 1, UNICODE_CI, ""); // Surrogate pairs. - assertSubstringIndex("a🙃b🙃c", "a", 1, "UTF8_BINARY", ""); - assertSubstringIndex("a🙃b🙃c", "a", 1, "UTF8_LCASE", ""); - assertSubstringIndex("a🙃b🙃c", "a", 1, "UNICODE", ""); - assertSubstringIndex("a🙃b🙃c", "a", 1, "UNICODE_CI", ""); - assertSubstringIndex("a🙃b🙃c", "🙃", 1, "UTF8_BINARY", "a"); - assertSubstringIndex("a🙃b🙃c", "🙃", 1, "UTF8_LCASE", "a"); - assertSubstringIndex("a🙃b🙃c", "🙃", 1, "UNICODE", "a"); - assertSubstringIndex("a🙃b🙃c", "🙃", 1, "UNICODE_CI", "a"); - assertSubstringIndex("a🙃b🙃c", "b", 1, "UTF8_BINARY", "a🙃"); - assertSubstringIndex("a🙃b🙃c", "b", 1, "UTF8_LCASE", "a🙃"); - assertSubstringIndex("a🙃b🙃c", "b", 1, "UNICODE", "a🙃"); - assertSubstringIndex("a🙃b🙃c", "b", 1, "UNICODE_CI", "a🙃"); - assertSubstringIndex("a🙃b🙃c", "🙃", 2, "UTF8_BINARY", "a🙃b"); - assertSubstringIndex("a🙃b🙃c", "🙃", 2, "UTF8_LCASE", "a🙃b"); - assertSubstringIndex("a🙃b🙃c", "🙃", 2, "UNICODE", "a🙃b"); - assertSubstringIndex("a🙃b🙃c", "🙃", 2, "UNICODE_CI", "a🙃b"); - assertSubstringIndex("a🙃b🙃c", "c", 1, "UTF8_BINARY", "a🙃b🙃"); - assertSubstringIndex("a🙃b🙃c", "c", 1, "UTF8_LCASE", "a🙃b🙃"); - assertSubstringIndex("a🙃b🙃c", "c", 1, "UNICODE", "a🙃b🙃"); - assertSubstringIndex("a🙃b🙃c", "c", 1, "UNICODE_CI", "a🙃b🙃"); - assertSubstringIndex("a🙃b🙃c", "🙃", 3, "UTF8_BINARY", "a🙃b🙃c"); - assertSubstringIndex("a🙃b🙃c", "🙃", 3, "UTF8_LCASE", "a🙃b🙃c"); - assertSubstringIndex("a🙃b🙃c", "🙃", 3, "UNICODE", "a🙃b🙃c"); - assertSubstringIndex("a🙃b🙃c", "🙃", 3, "UNICODE_CI", "a🙃b🙃c"); - assertSubstringIndex("a🙃b🙃c", "d", 1, "UTF8_BINARY", "a🙃b🙃c"); - 
assertSubstringIndex("a🙃b🙃c", "d", 1, "UTF8_LCASE", "a🙃b🙃c"); - assertSubstringIndex("a🙃b🙃c", "d", 1, "UNICODE", "a🙃b🙃c"); - assertSubstringIndex("a🙃b🙃c", "d", 1, "UNICODE_CI", "a🙃b🙃c"); - assertSubstringIndex("a🙃b🙃c", "c", -1, "UTF8_BINARY", ""); - assertSubstringIndex("a🙃b🙃c", "c", -1, "UTF8_LCASE", ""); - assertSubstringIndex("a🙃b🙃c", "c", -1, "UNICODE", ""); - assertSubstringIndex("a🙃b🙃c", "c", -1, "UNICODE_CI", ""); - assertSubstringIndex("a🙃b🙃c", "🙃", -1, "UTF8_BINARY", "c"); - assertSubstringIndex("a🙃b🙃c", "🙃", -1, "UTF8_LCASE", "c"); - assertSubstringIndex("a🙃b🙃c", "🙃", -1, "UNICODE", "c"); - assertSubstringIndex("a🙃b🙃c", "🙃", -1, "UNICODE_CI", "c"); - assertSubstringIndex("a🙃b🙃c", "b", -1, "UTF8_BINARY", "🙃c"); - assertSubstringIndex("a🙃b🙃c", "b", -1, "UTF8_LCASE", "🙃c"); - assertSubstringIndex("a🙃b🙃c", "b", -1, "UNICODE", "🙃c"); - assertSubstringIndex("a🙃b🙃c", "b", -1, "UNICODE_CI", "🙃c"); - assertSubstringIndex("a🙃b🙃c", "🙃", -2, "UTF8_BINARY", "b🙃c"); - assertSubstringIndex("a🙃b🙃c", "🙃", -2, "UTF8_LCASE", "b🙃c"); - assertSubstringIndex("a🙃b🙃c", "🙃", -2, "UNICODE", "b🙃c"); - assertSubstringIndex("a🙃b🙃c", "🙃", -2, "UNICODE_CI", "b🙃c"); - assertSubstringIndex("a🙃b🙃c", "a", -1, "UTF8_BINARY", "🙃b🙃c"); - assertSubstringIndex("a🙃b🙃c", "a", -1, "UTF8_LCASE", "🙃b🙃c"); - assertSubstringIndex("a🙃b🙃c", "a", -1, "UNICODE", "🙃b🙃c"); - assertSubstringIndex("a🙃b🙃c", "a", -1, "UNICODE_CI", "🙃b🙃c"); - assertSubstringIndex("a🙃b🙃c", "🙃", -3, "UTF8_BINARY", "a🙃b🙃c"); - assertSubstringIndex("a🙃b🙃c", "🙃", -3, "UTF8_LCASE", "a🙃b🙃c"); - assertSubstringIndex("a🙃b🙃c", "🙃", -3, "UNICODE", "a🙃b🙃c"); - assertSubstringIndex("a🙃b🙃c", "🙃", -3, "UNICODE_CI", "a🙃b🙃c"); - assertSubstringIndex("a🙃b🙃c", "d", -1, "UTF8_BINARY", "a🙃b🙃c"); - assertSubstringIndex("a🙃b🙃c", "d", -1, "UTF8_LCASE", "a🙃b🙃c"); - assertSubstringIndex("a🙃b🙃c", "d", -1, "UNICODE", "a🙃b🙃c"); - assertSubstringIndex("a🙃b🙃c", "d", -1, "UNICODE_CI", "a🙃b🙃c"); + assertSubstringIndex("a🙃b🙃c", "a", 1, UTF8_BINARY, ""); + 
assertSubstringIndex("a🙃b🙃c", "a", 1, UTF8_LCASE, ""); + assertSubstringIndex("a🙃b🙃c", "a", 1, UNICODE, ""); + assertSubstringIndex("a🙃b🙃c", "a", 1, UNICODE_CI, ""); + assertSubstringIndex("a🙃b🙃c", "🙃", 1, UTF8_BINARY, "a"); + assertSubstringIndex("a🙃b🙃c", "🙃", 1, UTF8_LCASE, "a"); + assertSubstringIndex("a🙃b🙃c", "🙃", 1, UNICODE, "a"); + assertSubstringIndex("a🙃b🙃c", "🙃", 1, UNICODE_CI, "a"); + assertSubstringIndex("a🙃b🙃c", "b", 1, UTF8_BINARY, "a🙃"); + assertSubstringIndex("a🙃b🙃c", "b", 1, UTF8_LCASE, "a🙃"); + assertSubstringIndex("a🙃b🙃c", "b", 1, UNICODE, "a🙃"); + assertSubstringIndex("a🙃b🙃c", "b", 1, UNICODE_CI, "a🙃"); + assertSubstringIndex("a🙃b🙃c", "🙃", 2, UTF8_BINARY, "a🙃b"); + assertSubstringIndex("a🙃b🙃c", "🙃", 2, UTF8_LCASE, "a🙃b"); + assertSubstringIndex("a🙃b🙃c", "🙃", 2, UNICODE, "a🙃b"); + assertSubstringIndex("a🙃b🙃c", "🙃", 2, UNICODE_CI, "a🙃b"); + assertSubstringIndex("a🙃b🙃c", "c", 1, UTF8_BINARY, "a🙃b🙃"); + assertSubstringIndex("a🙃b🙃c", "c", 1, UTF8_LCASE, "a🙃b🙃"); + assertSubstringIndex("a🙃b🙃c", "c", 1, UNICODE, "a🙃b🙃"); + assertSubstringIndex("a🙃b🙃c", "c", 1, UNICODE_CI, "a🙃b🙃"); + assertSubstringIndex("a🙃b🙃c", "🙃", 3, UTF8_BINARY, "a🙃b🙃c"); + assertSubstringIndex("a🙃b🙃c", "🙃", 3, UTF8_LCASE, "a🙃b🙃c"); + assertSubstringIndex("a🙃b🙃c", "🙃", 3, UNICODE, "a🙃b🙃c"); + assertSubstringIndex("a🙃b🙃c", "🙃", 3, UNICODE_CI, "a🙃b🙃c"); + assertSubstringIndex("a🙃b🙃c", "d", 1, UTF8_BINARY, "a🙃b🙃c"); + assertSubstringIndex("a🙃b🙃c", "d", 1, UTF8_LCASE, "a🙃b🙃c"); + assertSubstringIndex("a🙃b🙃c", "d", 1, UNICODE, "a🙃b🙃c"); + assertSubstringIndex("a🙃b🙃c", "d", 1, UNICODE_CI, "a🙃b🙃c"); + assertSubstringIndex("a🙃b🙃c", "c", -1, UTF8_BINARY, ""); + assertSubstringIndex("a🙃b🙃c", "c", -1, UTF8_LCASE, ""); + assertSubstringIndex("a🙃b🙃c", "c", -1, UNICODE, ""); + assertSubstringIndex("a🙃b🙃c", "c", -1, UNICODE_CI, ""); + assertSubstringIndex("a🙃b🙃c", "🙃", -1, UTF8_BINARY, "c"); + assertSubstringIndex("a🙃b🙃c", "🙃", -1, UTF8_LCASE, "c"); + assertSubstringIndex("a🙃b🙃c", "🙃", -1, 
UNICODE, "c"); + assertSubstringIndex("a🙃b🙃c", "🙃", -1, UNICODE_CI, "c"); + assertSubstringIndex("a🙃b🙃c", "b", -1, UTF8_BINARY, "🙃c"); + assertSubstringIndex("a🙃b🙃c", "b", -1, UTF8_LCASE, "🙃c"); + assertSubstringIndex("a🙃b🙃c", "b", -1, UNICODE, "🙃c"); + assertSubstringIndex("a🙃b🙃c", "b", -1, UNICODE_CI, "🙃c"); + assertSubstringIndex("a🙃b🙃c", "🙃", -2, UTF8_BINARY, "b🙃c"); + assertSubstringIndex("a🙃b🙃c", "🙃", -2, UTF8_LCASE, "b🙃c"); + assertSubstringIndex("a🙃b🙃c", "🙃", -2, UNICODE, "b🙃c"); + assertSubstringIndex("a🙃b🙃c", "🙃", -2, UNICODE_CI, "b🙃c"); + assertSubstringIndex("a🙃b🙃c", "a", -1, UTF8_BINARY, "🙃b🙃c"); + assertSubstringIndex("a🙃b🙃c", "a", -1, UTF8_LCASE, "🙃b🙃c"); + assertSubstringIndex("a🙃b🙃c", "a", -1, UNICODE, "🙃b🙃c"); + assertSubstringIndex("a🙃b🙃c", "a", -1, UNICODE_CI, "🙃b🙃c"); + assertSubstringIndex("a🙃b🙃c", "🙃", -3, UTF8_BINARY, "a🙃b🙃c"); + assertSubstringIndex("a🙃b🙃c", "🙃", -3, UTF8_LCASE, "a🙃b🙃c"); + assertSubstringIndex("a🙃b🙃c", "🙃", -3, UNICODE, "a🙃b🙃c"); + assertSubstringIndex("a🙃b🙃c", "🙃", -3, UNICODE_CI, "a🙃b🙃c"); + assertSubstringIndex("a🙃b🙃c", "d", -1, UTF8_BINARY, "a🙃b🙃c"); + assertSubstringIndex("a🙃b🙃c", "d", -1, UTF8_LCASE, "a🙃b🙃c"); + assertSubstringIndex("a🙃b🙃c", "d", -1, UNICODE, "a🙃b🙃c"); + assertSubstringIndex("a🙃b🙃c", "d", -1, UNICODE_CI, "a🙃b🙃c"); } /** @@ -2776,279 +2777,279 @@ private void assertStringTrim(String collationName, String sourceString, String @Test public void testStringTrim() throws SparkException { // Basic tests. 
- assertStringTrim("UTF8_BINARY", "", "", ""); - assertStringTrim("UTF8_BINARY", "", "xyz", ""); - assertStringTrim("UTF8_BINARY", "asd", "", "asd"); - assertStringTrim("UTF8_BINARY", "asd", null, "asd"); - assertStringTrim("UTF8_BINARY", " asd ", null, "asd"); - assertStringTrim("UTF8_BINARY", " a世a ", null, "a世a"); - assertStringTrim("UTF8_BINARY", "asd", "x", "asd"); - assertStringTrim("UTF8_BINARY", "xxasdxx", "x", "asd"); - assertStringTrim("UTF8_BINARY", "xa世ax", "x", "a世a"); - assertStringTrim("UTF8_LCASE", "", "", ""); - assertStringTrim("UTF8_LCASE", "", "xyz", ""); - assertStringTrim("UTF8_LCASE", "asd", "", "asd"); - assertStringTrim("UTF8_LCASE", "asd", null, "asd"); - assertStringTrim("UTF8_LCASE", " asd ", null, "asd"); - assertStringTrim("UTF8_LCASE", " a世a ", null, "a世a"); - assertStringTrim("UTF8_LCASE", "asd", "x", "asd"); - assertStringTrim("UTF8_LCASE", "xxasdxx", "x", "asd"); - assertStringTrim("UTF8_LCASE", "xa世ax", "x", "a世a"); - assertStringTrim("UNICODE", "", "", ""); - assertStringTrim("UNICODE", "", "xyz", ""); - assertStringTrim("UNICODE", "asd", "", "asd"); - assertStringTrim("UNICODE", "asd", null, "asd"); - assertStringTrim("UNICODE", " asd ", null, "asd"); - assertStringTrim("UNICODE", " a世a ", null, "a世a"); - assertStringTrim("UNICODE", "asd", "x", "asd"); - assertStringTrim("UNICODE", "xxasdxx", "x", "asd"); - assertStringTrim("UNICODE", "xa世ax", "x", "a世a"); - assertStringTrim("UNICODE_CI", "", "", ""); - assertStringTrim("UNICODE_CI", "", "xyz", ""); - assertStringTrim("UNICODE_CI", "asd", "", "asd"); - assertStringTrim("UNICODE_CI", "asd", null, "asd"); - assertStringTrim("UNICODE_CI", " asd ", null, "asd"); - assertStringTrim("UNICODE_CI", " a世a ", null, "a世a"); - assertStringTrim("UNICODE_CI", "asd", "x", "asd"); - assertStringTrim("UNICODE_CI", "xxasdxx", "x", "asd"); - assertStringTrim("UNICODE_CI", "xa世ax", "x", "a世a"); + assertStringTrim(UTF8_BINARY, "", "", ""); + assertStringTrim(UTF8_BINARY, "", "xyz", ""); + 
assertStringTrim(UTF8_BINARY, "asd", "", "asd"); + assertStringTrim(UTF8_BINARY, "asd", null, "asd"); + assertStringTrim(UTF8_BINARY, " asd ", null, "asd"); + assertStringTrim(UTF8_BINARY, " a世a ", null, "a世a"); + assertStringTrim(UTF8_BINARY, "asd", "x", "asd"); + assertStringTrim(UTF8_BINARY, "xxasdxx", "x", "asd"); + assertStringTrim(UTF8_BINARY, "xa世ax", "x", "a世a"); + assertStringTrim(UTF8_LCASE, "", "", ""); + assertStringTrim(UTF8_LCASE, "", "xyz", ""); + assertStringTrim(UTF8_LCASE, "asd", "", "asd"); + assertStringTrim(UTF8_LCASE, "asd", null, "asd"); + assertStringTrim(UTF8_LCASE, " asd ", null, "asd"); + assertStringTrim(UTF8_LCASE, " a世a ", null, "a世a"); + assertStringTrim(UTF8_LCASE, "asd", "x", "asd"); + assertStringTrim(UTF8_LCASE, "xxasdxx", "x", "asd"); + assertStringTrim(UTF8_LCASE, "xa世ax", "x", "a世a"); + assertStringTrim(UNICODE, "", "", ""); + assertStringTrim(UNICODE, "", "xyz", ""); + assertStringTrim(UNICODE, "asd", "", "asd"); + assertStringTrim(UNICODE, "asd", null, "asd"); + assertStringTrim(UNICODE, " asd ", null, "asd"); + assertStringTrim(UNICODE, " a世a ", null, "a世a"); + assertStringTrim(UNICODE, "asd", "x", "asd"); + assertStringTrim(UNICODE, "xxasdxx", "x", "asd"); + assertStringTrim(UNICODE, "xa世ax", "x", "a世a"); + assertStringTrim(UNICODE_CI, "", "", ""); + assertStringTrim(UNICODE_CI, "", "xyz", ""); + assertStringTrim(UNICODE_CI, "asd", "", "asd"); + assertStringTrim(UNICODE_CI, "asd", null, "asd"); + assertStringTrim(UNICODE_CI, " asd ", null, "asd"); + assertStringTrim(UNICODE_CI, " a世a ", null, "a世a"); + assertStringTrim(UNICODE_CI, "asd", "x", "asd"); + assertStringTrim(UNICODE_CI, "xxasdxx", "x", "asd"); + assertStringTrim(UNICODE_CI, "xa世ax", "x", "a世a"); // Case variation. 
- assertStringTrim("UTF8_BINARY", "asd", "A", "asd"); - assertStringTrim("UTF8_BINARY", "ddsXXXaa", "asd", "XXX"); - assertStringTrim("UTF8_BINARY", "ASD", "a", "ASD"); - assertStringTrim("UTF8_LCASE", "asd", "A", "sd"); - assertStringTrim("UTF8_LCASE", "ASD", "a", "SD"); - assertStringTrim("UTF8_LCASE", "ddsXXXaa", "ASD", "XXX"); - assertStringTrim("UNICODE", "asd", "A", "asd"); - assertStringTrim("UNICODE", "ASD", "a", "ASD"); - assertStringTrim("UNICODE", "ddsXXXaa", "asd", "XXX"); - assertStringTrim("UNICODE_CI", "asd", "A", "sd"); - assertStringTrim("UNICODE_CI", "ASD", "a", "SD"); - assertStringTrim("UNICODE_CI", "ddsXXXaa", "ASD", "XXX"); + assertStringTrim(UTF8_BINARY, "asd", "A", "asd"); + assertStringTrim(UTF8_BINARY, "ddsXXXaa", "asd", "XXX"); + assertStringTrim(UTF8_BINARY, "ASD", "a", "ASD"); + assertStringTrim(UTF8_LCASE, "asd", "A", "sd"); + assertStringTrim(UTF8_LCASE, "ASD", "a", "SD"); + assertStringTrim(UTF8_LCASE, "ddsXXXaa", "ASD", "XXX"); + assertStringTrim(UNICODE, "asd", "A", "asd"); + assertStringTrim(UNICODE, "ASD", "a", "ASD"); + assertStringTrim(UNICODE, "ddsXXXaa", "asd", "XXX"); + assertStringTrim(UNICODE_CI, "asd", "A", "sd"); + assertStringTrim(UNICODE_CI, "ASD", "a", "SD"); + assertStringTrim(UNICODE_CI, "ddsXXXaa", "ASD", "XXX"); assertStringTrim("SR_CI_AI", "cSCšćČXXXsčšČŠsć", "čš", "XXX"); // One-to-many case mapping (e.g. Turkish dotted I).. 
- assertStringTrim("UTF8_BINARY", "ẞaaaẞ", "ß", "ẞaaaẞ"); - assertStringTrim("UTF8_BINARY", "ßaaaß", "ẞ", "ßaaaß"); - assertStringTrim("UTF8_BINARY", "Ëaaaẞ", "Ëẞ", "aaa"); - assertStringTrim("UTF8_LCASE", "ẞaaaẞ", "ß", "aaa"); - assertStringTrim("UTF8_LCASE", "ßaaaß", "ẞ", "aaa"); - assertStringTrim("UTF8_LCASE", "Ëaaaẞ", "Ëẞ", "aaa"); - assertStringTrim("UNICODE", "ẞaaaẞ", "ß", "ẞaaaẞ"); - assertStringTrim("UNICODE", "ßaaaß", "ẞ", "ßaaaß"); - assertStringTrim("UNICODE", "Ëaaaẞ", "Ëẞ", "aaa"); - assertStringTrim("UNICODE_CI", "ẞaaaẞ", "ß", "aaa"); - assertStringTrim("UNICODE_CI", "ßaaaß", "ẞ", "aaa"); - assertStringTrim("UNICODE_CI", "Ëaaaẞ", "Ëẞ", "aaa"); + assertStringTrim(UTF8_BINARY, "ẞaaaẞ", "ß", "ẞaaaẞ"); + assertStringTrim(UTF8_BINARY, "ßaaaß", "ẞ", "ßaaaß"); + assertStringTrim(UTF8_BINARY, "Ëaaaẞ", "Ëẞ", "aaa"); + assertStringTrim(UTF8_LCASE, "ẞaaaẞ", "ß", "aaa"); + assertStringTrim(UTF8_LCASE, "ßaaaß", "ẞ", "aaa"); + assertStringTrim(UTF8_LCASE, "Ëaaaẞ", "Ëẞ", "aaa"); + assertStringTrim(UNICODE, "ẞaaaẞ", "ß", "ẞaaaẞ"); + assertStringTrim(UNICODE, "ßaaaß", "ẞ", "ßaaaß"); + assertStringTrim(UNICODE, "Ëaaaẞ", "Ëẞ", "aaa"); + assertStringTrim(UNICODE_CI, "ẞaaaẞ", "ß", "aaa"); + assertStringTrim(UNICODE_CI, "ßaaaß", "ẞ", "aaa"); + assertStringTrim(UNICODE_CI, "Ëaaaẞ", "Ëẞ", "aaa"); // One-to-many case mapping (e.g. Turkish dotted I). 
- assertStringTrim("UTF8_BINARY", "i", "i", ""); - assertStringTrim("UTF8_BINARY", "iii", "I", "iii"); - assertStringTrim("UTF8_BINARY", "I", "iii", "I"); - assertStringTrim("UTF8_BINARY", "ixi", "i", "x"); - assertStringTrim("UTF8_BINARY", "i", "İ", "i"); - assertStringTrim("UTF8_BINARY", "i\u0307", "İ", "i\u0307"); - assertStringTrim("UTF8_BINARY", "ii\u0307", "İi", "\u0307"); - assertStringTrim("UTF8_BINARY", "iii\u0307", "İi", "\u0307"); - assertStringTrim("UTF8_BINARY", "iiii\u0307", "iİ", "\u0307"); - assertStringTrim("UTF8_BINARY", "ii\u0307ii\u0307", "iİ", "\u0307ii\u0307"); - assertStringTrim("UTF8_BINARY", "i\u0307", "i", "\u0307"); - assertStringTrim("UTF8_BINARY", "i\u0307", "\u0307", "i"); - assertStringTrim("UTF8_BINARY", "i\u0307", "i\u0307", ""); - assertStringTrim("UTF8_BINARY", "i\u0307i\u0307", "i\u0307", ""); - assertStringTrim("UTF8_BINARY", "i\u0307\u0307", "i\u0307", ""); - assertStringTrim("UTF8_BINARY", "i\u0307i", "i\u0307", ""); - assertStringTrim("UTF8_BINARY", "i\u0307i", "İ", "i\u0307i"); - assertStringTrim("UTF8_BINARY", "i\u0307İ", "i\u0307", "İ"); - assertStringTrim("UTF8_BINARY", "i\u0307İ", "İ", "i\u0307"); - assertStringTrim("UTF8_BINARY", "İ", "İ", ""); - assertStringTrim("UTF8_BINARY", "IXi", "İ", "IXi"); - assertStringTrim("UTF8_BINARY", "ix\u0307", "Ixİ", "ix\u0307"); - assertStringTrim("UTF8_BINARY", "i\u0307x", "IXİ", "i\u0307x"); - assertStringTrim("UTF8_BINARY", "i\u0307x", "ix\u0307İ", ""); - assertStringTrim("UTF8_BINARY", "İ", "i", "İ"); - assertStringTrim("UTF8_BINARY", "İ", "\u0307", "İ"); - assertStringTrim("UTF8_BINARY", "Ixİ", "i\u0307", "Ixİ"); - assertStringTrim("UTF8_BINARY", "IXİ", "ix\u0307", "IXİ"); - assertStringTrim("UTF8_BINARY", "xi\u0307", "\u0307IX", "xi"); - assertStringTrim("UTF8_LCASE", "i", "i", ""); - assertStringTrim("UTF8_LCASE", "iii", "I", ""); - assertStringTrim("UTF8_LCASE", "I", "iii", ""); - assertStringTrim("UTF8_LCASE", "ixi", "i", "x"); - assertStringTrim("UTF8_LCASE", "i", "İ", "i"); - 
assertStringTrim("UTF8_LCASE", "i\u0307", "İ", ""); - assertStringTrim("UTF8_LCASE", "ii\u0307", "İi", ""); - assertStringTrim("UTF8_LCASE", "iii\u0307", "İi", ""); - assertStringTrim("UTF8_LCASE", "iiii\u0307", "iİ", ""); - assertStringTrim("UTF8_LCASE", "ii\u0307ii\u0307", "iİ", ""); - assertStringTrim("UTF8_LCASE", "i\u0307", "i", "\u0307"); - assertStringTrim("UTF8_LCASE", "i\u0307", "\u0307", "i"); - assertStringTrim("UTF8_LCASE", "i\u0307", "i\u0307", ""); - assertStringTrim("UTF8_LCASE", "i\u0307i\u0307", "i\u0307", ""); - assertStringTrim("UTF8_LCASE", "i\u0307\u0307", "i\u0307", ""); - assertStringTrim("UTF8_LCASE", "i\u0307i", "i\u0307", ""); - assertStringTrim("UTF8_LCASE", "i\u0307i", "İ", "i"); - assertStringTrim("UTF8_LCASE", "i\u0307İ", "i\u0307", "İ"); - assertStringTrim("UTF8_LCASE", "i\u0307İ", "İ", ""); - assertStringTrim("UTF8_LCASE", "İ", "İ", ""); - assertStringTrim("UTF8_LCASE", "IXi", "İ", "IXi"); - assertStringTrim("UTF8_LCASE", "ix\u0307", "Ixİ", "\u0307"); - assertStringTrim("UTF8_LCASE", "i\u0307x", "IXİ", ""); - assertStringTrim("UTF8_LCASE", "i\u0307x", "I\u0307xİ", ""); - assertStringTrim("UTF8_LCASE", "İ", "i", "İ"); - assertStringTrim("UTF8_LCASE", "İ", "\u0307", "İ"); - assertStringTrim("UTF8_LCASE", "Ixİ", "i\u0307", "xİ"); - assertStringTrim("UTF8_LCASE", "IXİ", "ix\u0307", "İ"); - assertStringTrim("UTF8_LCASE", "xi\u0307", "\u0307IX", ""); - assertStringTrim("UNICODE", "i", "i", ""); - assertStringTrim("UNICODE", "iii", "I", "iii"); - assertStringTrim("UNICODE", "I", "iii", "I"); - assertStringTrim("UNICODE", "ixi", "i", "x"); - assertStringTrim("UNICODE", "i", "İ", "i"); - assertStringTrim("UNICODE", "i\u0307", "İ", "i\u0307"); - assertStringTrim("UNICODE", "ii\u0307", "İi", "i\u0307"); - assertStringTrim("UNICODE", "iii\u0307", "İi", "i\u0307"); - assertStringTrim("UNICODE", "iiii\u0307", "iİ", "i\u0307"); - assertStringTrim("UNICODE", "ii\u0307ii\u0307", "iİ", "i\u0307ii\u0307"); - assertStringTrim("UNICODE", "i\u0307", "i", 
"i\u0307"); - assertStringTrim("UNICODE", "i\u0307", "\u0307", "i\u0307"); - assertStringTrim("UNICODE", "i\u0307", "i\u0307", "i\u0307"); - assertStringTrim("UNICODE", "i\u0307i\u0307", "i\u0307", "i\u0307i\u0307"); - assertStringTrim("UNICODE", "i\u0307\u0307", "i\u0307", "i\u0307\u0307"); - assertStringTrim("UNICODE", "i\u0307i", "i\u0307", "i\u0307"); - assertStringTrim("UNICODE", "i\u0307i", "İ", "i\u0307i"); - assertStringTrim("UNICODE", "i\u0307İ", "i\u0307", "i\u0307İ"); - assertStringTrim("UNICODE", "i\u0307İ", "İ", "i\u0307"); - assertStringTrim("UNICODE", "İ", "İ", ""); - assertStringTrim("UNICODE", "IXi", "İ", "IXi"); - assertStringTrim("UNICODE", "ix\u0307", "Ixİ", "ix\u0307"); - assertStringTrim("UNICODE", "i\u0307x", "IXİ", "i\u0307x"); - assertStringTrim("UNICODE", "i\u0307x", "ix\u0307İ", "i\u0307"); - assertStringTrim("UNICODE", "İ", "i", "İ"); - assertStringTrim("UNICODE", "İ", "\u0307", "İ"); - assertStringTrim("UNICODE", "i\u0307", "i\u0307", "i\u0307"); - assertStringTrim("UNICODE", "Ixİ", "i\u0307", "Ixİ"); - assertStringTrim("UNICODE", "IXİ", "ix\u0307", "IXİ"); - assertStringTrim("UNICODE", "xi\u0307", "\u0307IX", "xi\u0307"); - assertStringTrim("UNICODE_CI", "i", "i", ""); - assertStringTrim("UNICODE_CI", "iii", "I", ""); - assertStringTrim("UNICODE_CI", "I", "iii", ""); - assertStringTrim("UNICODE_CI", "ixi", "i", "x"); - assertStringTrim("UNICODE_CI", "i", "İ", "i"); - assertStringTrim("UNICODE_CI", "i\u0307", "İ", ""); - assertStringTrim("UNICODE_CI", "ii\u0307", "İi", ""); - assertStringTrim("UNICODE_CI", "iii\u0307", "İi", ""); - assertStringTrim("UNICODE_CI", "iiii\u0307", "iİ", ""); - assertStringTrim("UNICODE_CI", "ii\u0307ii\u0307", "iİ", ""); - assertStringTrim("UNICODE_CI", "i\u0307", "i", "i\u0307"); - assertStringTrim("UNICODE_CI", "i\u0307", "\u0307", "i\u0307"); - assertStringTrim("UNICODE_CI", "i\u0307", "i\u0307", "i\u0307"); - assertStringTrim("UNICODE_CI", "i\u0307i\u0307", "i\u0307", "i\u0307i\u0307"); - 
assertStringTrim("UNICODE_CI", "i\u0307\u0307", "i\u0307", "i\u0307\u0307"); - assertStringTrim("UNICODE_CI", "i\u0307i", "i\u0307", "i\u0307"); - assertStringTrim("UNICODE_CI", "i\u0307i", "İ", "i"); - assertStringTrim("UNICODE_CI", "i\u0307İ", "i\u0307", "i\u0307İ"); - assertStringTrim("UNICODE_CI", "i\u0307İ", "İ", ""); - assertStringTrim("UNICODE_CI", "İ", "İ", ""); - assertStringTrim("UNICODE_CI", "IXi", "İ", "IXi"); - assertStringTrim("UNICODE_CI", "ix\u0307", "Ixİ", "x\u0307"); - assertStringTrim("UNICODE_CI", "i\u0307x", "IXİ", ""); - assertStringTrim("UNICODE_CI", "i\u0307x", "I\u0307xİ", ""); - assertStringTrim("UNICODE_CI", "İ", "i", "İ"); - assertStringTrim("UNICODE_CI", "İ", "\u0307", "İ"); - assertStringTrim("UNICODE_CI", "i\u0307", "i\u0307", "i\u0307"); - assertStringTrim("UNICODE_CI", "Ixİ", "i\u0307", "xİ"); - assertStringTrim("UNICODE_CI", "IXİ", "ix\u0307", "İ"); - assertStringTrim("UNICODE_CI", "xi\u0307", "\u0307IX", "i\u0307"); + assertStringTrim(UTF8_BINARY, "i", "i", ""); + assertStringTrim(UTF8_BINARY, "iii", "I", "iii"); + assertStringTrim(UTF8_BINARY, "I", "iii", "I"); + assertStringTrim(UTF8_BINARY, "ixi", "i", "x"); + assertStringTrim(UTF8_BINARY, "i", "İ", "i"); + assertStringTrim(UTF8_BINARY, "i\u0307", "İ", "i\u0307"); + assertStringTrim(UTF8_BINARY, "ii\u0307", "İi", "\u0307"); + assertStringTrim(UTF8_BINARY, "iii\u0307", "İi", "\u0307"); + assertStringTrim(UTF8_BINARY, "iiii\u0307", "iİ", "\u0307"); + assertStringTrim(UTF8_BINARY, "ii\u0307ii\u0307", "iİ", "\u0307ii\u0307"); + assertStringTrim(UTF8_BINARY, "i\u0307", "i", "\u0307"); + assertStringTrim(UTF8_BINARY, "i\u0307", "\u0307", "i"); + assertStringTrim(UTF8_BINARY, "i\u0307", "i\u0307", ""); + assertStringTrim(UTF8_BINARY, "i\u0307i\u0307", "i\u0307", ""); + assertStringTrim(UTF8_BINARY, "i\u0307\u0307", "i\u0307", ""); + assertStringTrim(UTF8_BINARY, "i\u0307i", "i\u0307", ""); + assertStringTrim(UTF8_BINARY, "i\u0307i", "İ", "i\u0307i"); + assertStringTrim(UTF8_BINARY, 
"i\u0307İ", "i\u0307", "İ"); + assertStringTrim(UTF8_BINARY, "i\u0307İ", "İ", "i\u0307"); + assertStringTrim(UTF8_BINARY, "İ", "İ", ""); + assertStringTrim(UTF8_BINARY, "IXi", "İ", "IXi"); + assertStringTrim(UTF8_BINARY, "ix\u0307", "Ixİ", "ix\u0307"); + assertStringTrim(UTF8_BINARY, "i\u0307x", "IXİ", "i\u0307x"); + assertStringTrim(UTF8_BINARY, "i\u0307x", "ix\u0307İ", ""); + assertStringTrim(UTF8_BINARY, "İ", "i", "İ"); + assertStringTrim(UTF8_BINARY, "İ", "\u0307", "İ"); + assertStringTrim(UTF8_BINARY, "Ixİ", "i\u0307", "Ixİ"); + assertStringTrim(UTF8_BINARY, "IXİ", "ix\u0307", "IXİ"); + assertStringTrim(UTF8_BINARY, "xi\u0307", "\u0307IX", "xi"); + assertStringTrim(UTF8_LCASE, "i", "i", ""); + assertStringTrim(UTF8_LCASE, "iii", "I", ""); + assertStringTrim(UTF8_LCASE, "I", "iii", ""); + assertStringTrim(UTF8_LCASE, "ixi", "i", "x"); + assertStringTrim(UTF8_LCASE, "i", "İ", "i"); + assertStringTrim(UTF8_LCASE, "i\u0307", "İ", ""); + assertStringTrim(UTF8_LCASE, "ii\u0307", "İi", ""); + assertStringTrim(UTF8_LCASE, "iii\u0307", "İi", ""); + assertStringTrim(UTF8_LCASE, "iiii\u0307", "iİ", ""); + assertStringTrim(UTF8_LCASE, "ii\u0307ii\u0307", "iİ", ""); + assertStringTrim(UTF8_LCASE, "i\u0307", "i", "\u0307"); + assertStringTrim(UTF8_LCASE, "i\u0307", "\u0307", "i"); + assertStringTrim(UTF8_LCASE, "i\u0307", "i\u0307", ""); + assertStringTrim(UTF8_LCASE, "i\u0307i\u0307", "i\u0307", ""); + assertStringTrim(UTF8_LCASE, "i\u0307\u0307", "i\u0307", ""); + assertStringTrim(UTF8_LCASE, "i\u0307i", "i\u0307", ""); + assertStringTrim(UTF8_LCASE, "i\u0307i", "İ", "i"); + assertStringTrim(UTF8_LCASE, "i\u0307İ", "i\u0307", "İ"); + assertStringTrim(UTF8_LCASE, "i\u0307İ", "İ", ""); + assertStringTrim(UTF8_LCASE, "İ", "İ", ""); + assertStringTrim(UTF8_LCASE, "IXi", "İ", "IXi"); + assertStringTrim(UTF8_LCASE, "ix\u0307", "Ixİ", "\u0307"); + assertStringTrim(UTF8_LCASE, "i\u0307x", "IXİ", ""); + assertStringTrim(UTF8_LCASE, "i\u0307x", "I\u0307xİ", ""); + 
assertStringTrim(UTF8_LCASE, "İ", "i", "İ"); + assertStringTrim(UTF8_LCASE, "İ", "\u0307", "İ"); + assertStringTrim(UTF8_LCASE, "Ixİ", "i\u0307", "xİ"); + assertStringTrim(UTF8_LCASE, "IXİ", "ix\u0307", "İ"); + assertStringTrim(UTF8_LCASE, "xi\u0307", "\u0307IX", ""); + assertStringTrim(UNICODE, "i", "i", ""); + assertStringTrim(UNICODE, "iii", "I", "iii"); + assertStringTrim(UNICODE, "I", "iii", "I"); + assertStringTrim(UNICODE, "ixi", "i", "x"); + assertStringTrim(UNICODE, "i", "İ", "i"); + assertStringTrim(UNICODE, "i\u0307", "İ", "i\u0307"); + assertStringTrim(UNICODE, "ii\u0307", "İi", "i\u0307"); + assertStringTrim(UNICODE, "iii\u0307", "İi", "i\u0307"); + assertStringTrim(UNICODE, "iiii\u0307", "iİ", "i\u0307"); + assertStringTrim(UNICODE, "ii\u0307ii\u0307", "iİ", "i\u0307ii\u0307"); + assertStringTrim(UNICODE, "i\u0307", "i", "i\u0307"); + assertStringTrim(UNICODE, "i\u0307", "\u0307", "i\u0307"); + assertStringTrim(UNICODE, "i\u0307", "i\u0307", "i\u0307"); + assertStringTrim(UNICODE, "i\u0307i\u0307", "i\u0307", "i\u0307i\u0307"); + assertStringTrim(UNICODE, "i\u0307\u0307", "i\u0307", "i\u0307\u0307"); + assertStringTrim(UNICODE, "i\u0307i", "i\u0307", "i\u0307"); + assertStringTrim(UNICODE, "i\u0307i", "İ", "i\u0307i"); + assertStringTrim(UNICODE, "i\u0307İ", "i\u0307", "i\u0307İ"); + assertStringTrim(UNICODE, "i\u0307İ", "İ", "i\u0307"); + assertStringTrim(UNICODE, "İ", "İ", ""); + assertStringTrim(UNICODE, "IXi", "İ", "IXi"); + assertStringTrim(UNICODE, "ix\u0307", "Ixİ", "ix\u0307"); + assertStringTrim(UNICODE, "i\u0307x", "IXİ", "i\u0307x"); + assertStringTrim(UNICODE, "i\u0307x", "ix\u0307İ", "i\u0307"); + assertStringTrim(UNICODE, "İ", "i", "İ"); + assertStringTrim(UNICODE, "İ", "\u0307", "İ"); + assertStringTrim(UNICODE, "i\u0307", "i\u0307", "i\u0307"); + assertStringTrim(UNICODE, "Ixİ", "i\u0307", "Ixİ"); + assertStringTrim(UNICODE, "IXİ", "ix\u0307", "IXİ"); + assertStringTrim(UNICODE, "xi\u0307", "\u0307IX", "xi\u0307"); + 
assertStringTrim(UNICODE_CI, "i", "i", ""); + assertStringTrim(UNICODE_CI, "iii", "I", ""); + assertStringTrim(UNICODE_CI, "I", "iii", ""); + assertStringTrim(UNICODE_CI, "ixi", "i", "x"); + assertStringTrim(UNICODE_CI, "i", "İ", "i"); + assertStringTrim(UNICODE_CI, "i\u0307", "İ", ""); + assertStringTrim(UNICODE_CI, "ii\u0307", "İi", ""); + assertStringTrim(UNICODE_CI, "iii\u0307", "İi", ""); + assertStringTrim(UNICODE_CI, "iiii\u0307", "iİ", ""); + assertStringTrim(UNICODE_CI, "ii\u0307ii\u0307", "iİ", ""); + assertStringTrim(UNICODE_CI, "i\u0307", "i", "i\u0307"); + assertStringTrim(UNICODE_CI, "i\u0307", "\u0307", "i\u0307"); + assertStringTrim(UNICODE_CI, "i\u0307", "i\u0307", "i\u0307"); + assertStringTrim(UNICODE_CI, "i\u0307i\u0307", "i\u0307", "i\u0307i\u0307"); + assertStringTrim(UNICODE_CI, "i\u0307\u0307", "i\u0307", "i\u0307\u0307"); + assertStringTrim(UNICODE_CI, "i\u0307i", "i\u0307", "i\u0307"); + assertStringTrim(UNICODE_CI, "i\u0307i", "İ", "i"); + assertStringTrim(UNICODE_CI, "i\u0307İ", "i\u0307", "i\u0307İ"); + assertStringTrim(UNICODE_CI, "i\u0307İ", "İ", ""); + assertStringTrim(UNICODE_CI, "İ", "İ", ""); + assertStringTrim(UNICODE_CI, "IXi", "İ", "IXi"); + assertStringTrim(UNICODE_CI, "ix\u0307", "Ixİ", "x\u0307"); + assertStringTrim(UNICODE_CI, "i\u0307x", "IXİ", ""); + assertStringTrim(UNICODE_CI, "i\u0307x", "I\u0307xİ", ""); + assertStringTrim(UNICODE_CI, "İ", "i", "İ"); + assertStringTrim(UNICODE_CI, "İ", "\u0307", "İ"); + assertStringTrim(UNICODE_CI, "i\u0307", "i\u0307", "i\u0307"); + assertStringTrim(UNICODE_CI, "Ixİ", "i\u0307", "xİ"); + assertStringTrim(UNICODE_CI, "IXİ", "ix\u0307", "İ"); + assertStringTrim(UNICODE_CI, "xi\u0307", "\u0307IX", "i\u0307"); // Conditional case mapping (e.g. Greek sigmas). 
- assertStringTrim("UTF8_BINARY", "ςxς", "σ", "ςxς"); - assertStringTrim("UTF8_BINARY", "ςxς", "ς", "x"); - assertStringTrim("UTF8_BINARY", "ςxς", "Σ", "ςxς"); - assertStringTrim("UTF8_BINARY", "σxσ", "σ", "x"); - assertStringTrim("UTF8_BINARY", "σxσ", "ς", "σxσ"); - assertStringTrim("UTF8_BINARY", "σxσ", "Σ", "σxσ"); - assertStringTrim("UTF8_BINARY", "ΣxΣ", "σ", "ΣxΣ"); - assertStringTrim("UTF8_BINARY", "ΣxΣ", "ς", "ΣxΣ"); - assertStringTrim("UTF8_BINARY", "ΣxΣ", "Σ", "x"); - assertStringTrim("UTF8_LCASE", "ςxς", "σ", "x"); - assertStringTrim("UTF8_LCASE", "ςxς", "ς", "x"); - assertStringTrim("UTF8_LCASE", "ςxς", "Σ", "x"); - assertStringTrim("UTF8_LCASE", "σxσ", "σ", "x"); - assertStringTrim("UTF8_LCASE", "σxσ", "ς", "x"); - assertStringTrim("UTF8_LCASE", "σxσ", "Σ", "x"); - assertStringTrim("UTF8_LCASE", "ΣxΣ", "σ", "x"); - assertStringTrim("UTF8_LCASE", "ΣxΣ", "ς", "x"); - assertStringTrim("UTF8_LCASE", "ΣxΣ", "Σ", "x"); - assertStringTrim("UNICODE", "ςxς", "σ", "ςxς"); - assertStringTrim("UNICODE", "ςxς", "ς", "x"); - assertStringTrim("UNICODE", "ςxς", "Σ", "ςxς"); - assertStringTrim("UNICODE", "σxσ", "σ", "x"); - assertStringTrim("UNICODE", "σxσ", "ς", "σxσ"); - assertStringTrim("UNICODE", "σxσ", "Σ", "σxσ"); - assertStringTrim("UNICODE", "ΣxΣ", "σ", "ΣxΣ"); - assertStringTrim("UNICODE", "ΣxΣ", "ς", "ΣxΣ"); - assertStringTrim("UNICODE", "ΣxΣ", "Σ", "x"); - assertStringTrim("UNICODE_CI", "ςxς", "σ", "x"); - assertStringTrim("UNICODE_CI", "ςxς", "ς", "x"); - assertStringTrim("UNICODE_CI", "ςxς", "Σ", "x"); - assertStringTrim("UNICODE_CI", "σxσ", "σ", "x"); - assertStringTrim("UNICODE_CI", "σxσ", "ς", "x"); - assertStringTrim("UNICODE_CI", "σxσ", "Σ", "x"); - assertStringTrim("UNICODE_CI", "ΣxΣ", "σ", "x"); - assertStringTrim("UNICODE_CI", "ΣxΣ", "ς", "x"); - assertStringTrim("UNICODE_CI", "ΣxΣ", "Σ", "x"); + assertStringTrim(UTF8_BINARY, "ςxς", "σ", "ςxς"); + assertStringTrim(UTF8_BINARY, "ςxς", "ς", "x"); + assertStringTrim(UTF8_BINARY, "ςxς", "Σ", "ςxς"); + 
assertStringTrim(UTF8_BINARY, "σxσ", "σ", "x"); + assertStringTrim(UTF8_BINARY, "σxσ", "ς", "σxσ"); + assertStringTrim(UTF8_BINARY, "σxσ", "Σ", "σxσ"); + assertStringTrim(UTF8_BINARY, "ΣxΣ", "σ", "ΣxΣ"); + assertStringTrim(UTF8_BINARY, "ΣxΣ", "ς", "ΣxΣ"); + assertStringTrim(UTF8_BINARY, "ΣxΣ", "Σ", "x"); + assertStringTrim(UTF8_LCASE, "ςxς", "σ", "x"); + assertStringTrim(UTF8_LCASE, "ςxς", "ς", "x"); + assertStringTrim(UTF8_LCASE, "ςxς", "Σ", "x"); + assertStringTrim(UTF8_LCASE, "σxσ", "σ", "x"); + assertStringTrim(UTF8_LCASE, "σxσ", "ς", "x"); + assertStringTrim(UTF8_LCASE, "σxσ", "Σ", "x"); + assertStringTrim(UTF8_LCASE, "ΣxΣ", "σ", "x"); + assertStringTrim(UTF8_LCASE, "ΣxΣ", "ς", "x"); + assertStringTrim(UTF8_LCASE, "ΣxΣ", "Σ", "x"); + assertStringTrim(UNICODE, "ςxς", "σ", "ςxς"); + assertStringTrim(UNICODE, "ςxς", "ς", "x"); + assertStringTrim(UNICODE, "ςxς", "Σ", "ςxς"); + assertStringTrim(UNICODE, "σxσ", "σ", "x"); + assertStringTrim(UNICODE, "σxσ", "ς", "σxσ"); + assertStringTrim(UNICODE, "σxσ", "Σ", "σxσ"); + assertStringTrim(UNICODE, "ΣxΣ", "σ", "ΣxΣ"); + assertStringTrim(UNICODE, "ΣxΣ", "ς", "ΣxΣ"); + assertStringTrim(UNICODE, "ΣxΣ", "Σ", "x"); + assertStringTrim(UNICODE_CI, "ςxς", "σ", "x"); + assertStringTrim(UNICODE_CI, "ςxς", "ς", "x"); + assertStringTrim(UNICODE_CI, "ςxς", "Σ", "x"); + assertStringTrim(UNICODE_CI, "σxσ", "σ", "x"); + assertStringTrim(UNICODE_CI, "σxσ", "ς", "x"); + assertStringTrim(UNICODE_CI, "σxσ", "Σ", "x"); + assertStringTrim(UNICODE_CI, "ΣxΣ", "σ", "x"); + assertStringTrim(UNICODE_CI, "ΣxΣ", "ς", "x"); + assertStringTrim(UNICODE_CI, "ΣxΣ", "Σ", "x"); // Unicode normalization. 
- assertStringTrim("UTF8_BINARY", "åβγδa\u030A", "å", "βγδa\u030A"); - assertStringTrim("UTF8_LCASE", "åβγδa\u030A", "Å", "βγδa\u030A"); - assertStringTrim("UNICODE", "åβγδa\u030A", "å", "βγδ"); - assertStringTrim("UNICODE_CI", "åβγδa\u030A", "Å", "βγδ"); + assertStringTrim(UTF8_BINARY, "åβγδa\u030A", "å", "βγδa\u030A"); + assertStringTrim(UTF8_LCASE, "åβγδa\u030A", "Å", "βγδa\u030A"); + assertStringTrim(UNICODE, "åβγδa\u030A", "å", "βγδ"); + assertStringTrim(UNICODE_CI, "åβγδa\u030A", "Å", "βγδ"); // Surrogate pairs. - assertStringTrim("UTF8_BINARY", "a🙃b🙃c", "🙃", "a🙃b🙃c"); - assertStringTrim("UTF8_LCASE", "a🙃b🙃c", "🙃", "a🙃b🙃c"); - assertStringTrim("UNICODE", "a🙃b🙃c", "🙃", "a🙃b🙃c"); - assertStringTrim("UNICODE_CI", "a🙃b🙃c", "🙃", "a🙃b🙃c"); - assertStringTrim("UTF8_BINARY", "a🙃b🙃c", "ac", "🙃b🙃"); - assertStringTrim("UTF8_LCASE", "a🙃b🙃c", "ac", "🙃b🙃"); - assertStringTrim("UNICODE", "a🙃b🙃c", "ac", "🙃b🙃"); - assertStringTrim("UNICODE_CI", "a🙃b🙃c", "ac", "🙃b🙃"); - assertStringTrim("UTF8_BINARY", "a🙃b🙃c", "a🙃c", "b"); - assertStringTrim("UTF8_LCASE", "a🙃b🙃c", "a🙃c", "b"); - assertStringTrim("UNICODE", "a🙃b🙃c", "a🙃c", "b"); - assertStringTrim("UNICODE_CI", "a🙃b🙃c", "a🙃c", "b"); - assertStringTrim("UTF8_BINARY", "a🙃b🙃c", "abc🙃", ""); - assertStringTrim("UTF8_LCASE", "a🙃b🙃c", "abc🙃", ""); - assertStringTrim("UNICODE", "a🙃b🙃c", "abc🙃", ""); - assertStringTrim("UNICODE_CI", "a🙃b🙃c", "abc🙃", ""); - assertStringTrim("UTF8_BINARY", "😀😆😃😄", "😀😄", "😆😃"); - assertStringTrim("UTF8_LCASE", "😀😆😃😄", "😀😄", "😆😃"); - assertStringTrim("UNICODE", "😀😆😃😄", "😀😄", "😆😃"); - assertStringTrim("UNICODE_CI", "😀😆😃😄", "😀😄", "😆😃"); - assertStringTrim("UTF8_BINARY", "😀😆😃😄", "😃😄", "😀😆"); - assertStringTrim("UTF8_LCASE", "😀😆😃😄", "😃😄", "😀😆"); - assertStringTrim("UNICODE", "😀😆😃😄", "😃😄", "😀😆"); - assertStringTrim("UNICODE_CI", "😀😆😃😄", "😃😄", "😀😆"); - assertStringTrim("UTF8_BINARY", "😀😆😃😄", "😀😆😃😄", ""); - assertStringTrim("UTF8_LCASE", "😀😆😃😄", "😀😆😃😄", ""); - assertStringTrim("UNICODE", "😀😆😃😄", "😀😆😃😄", ""); - 
assertStringTrim("UNICODE_CI", "😀😆😃😄", "😀😆😃😄", ""); - assertStringTrim("UTF8_BINARY", "𐐅", "𐐅", ""); - assertStringTrim("UTF8_LCASE", "𐐅", "𐐅", ""); - assertStringTrim("UNICODE", "𐐅", "𐐅", ""); - assertStringTrim("UNICODE_CI", "𐐅", "𐐅", ""); - assertStringTrim("UTF8_BINARY", "𐐅", "𐐭", "𐐅"); - assertStringTrim("UTF8_LCASE", "𐐅", "𐐭", ""); - assertStringTrim("UNICODE", "𐐅", "𐐭", "𐐅"); - assertStringTrim("UNICODE_CI", "𐐅", "𐐭", ""); - assertStringTrim("UTF8_BINARY", "𝔸", "𝔸", ""); - assertStringTrim("UTF8_LCASE", "𝔸", "𝔸", ""); - assertStringTrim("UNICODE", "𝔸", "𝔸", ""); - assertStringTrim("UNICODE_CI", "𝔸", "𝔸", ""); - assertStringTrim("UTF8_BINARY", "𝔸", "A", "𝔸"); - assertStringTrim("UTF8_LCASE", "𝔸", "A", "𝔸"); - assertStringTrim("UNICODE", "𝔸", "A", "𝔸"); - assertStringTrim("UNICODE_CI", "𝔸", "A", ""); - assertStringTrim("UTF8_BINARY", "𝔸", "a", "𝔸"); - assertStringTrim("UTF8_LCASE", "𝔸", "a", "𝔸"); - assertStringTrim("UNICODE", "𝔸", "a", "𝔸"); - assertStringTrim("UNICODE_CI", "𝔸", "a", ""); + assertStringTrim(UTF8_BINARY, "a🙃b🙃c", "🙃", "a🙃b🙃c"); + assertStringTrim(UTF8_LCASE, "a🙃b🙃c", "🙃", "a🙃b🙃c"); + assertStringTrim(UNICODE, "a🙃b🙃c", "🙃", "a🙃b🙃c"); + assertStringTrim(UNICODE_CI, "a🙃b🙃c", "🙃", "a🙃b🙃c"); + assertStringTrim(UTF8_BINARY, "a🙃b🙃c", "ac", "🙃b🙃"); + assertStringTrim(UTF8_LCASE, "a🙃b🙃c", "ac", "🙃b🙃"); + assertStringTrim(UNICODE, "a🙃b🙃c", "ac", "🙃b🙃"); + assertStringTrim(UNICODE_CI, "a🙃b🙃c", "ac", "🙃b🙃"); + assertStringTrim(UTF8_BINARY, "a🙃b🙃c", "a🙃c", "b"); + assertStringTrim(UTF8_LCASE, "a🙃b🙃c", "a🙃c", "b"); + assertStringTrim(UNICODE, "a🙃b🙃c", "a🙃c", "b"); + assertStringTrim(UNICODE_CI, "a🙃b🙃c", "a🙃c", "b"); + assertStringTrim(UTF8_BINARY, "a🙃b🙃c", "abc🙃", ""); + assertStringTrim(UTF8_LCASE, "a🙃b🙃c", "abc🙃", ""); + assertStringTrim(UNICODE, "a🙃b🙃c", "abc🙃", ""); + assertStringTrim(UNICODE_CI, "a🙃b🙃c", "abc🙃", ""); + assertStringTrim(UTF8_BINARY, "😀😆😃😄", "😀😄", "😆😃"); + assertStringTrim(UTF8_LCASE, "😀😆😃😄", "😀😄", "😆😃"); + assertStringTrim(UNICODE, 
"😀😆😃😄", "😀😄", "😆😃"); + assertStringTrim(UNICODE_CI, "😀😆😃😄", "😀😄", "😆😃"); + assertStringTrim(UTF8_BINARY, "😀😆😃😄", "😃😄", "😀😆"); + assertStringTrim(UTF8_LCASE, "😀😆😃😄", "😃😄", "😀😆"); + assertStringTrim(UNICODE, "😀😆😃😄", "😃😄", "😀😆"); + assertStringTrim(UNICODE_CI, "😀😆😃😄", "😃😄", "😀😆"); + assertStringTrim(UTF8_BINARY, "😀😆😃😄", "😀😆😃😄", ""); + assertStringTrim(UTF8_LCASE, "😀😆😃😄", "😀😆😃😄", ""); + assertStringTrim(UNICODE, "😀😆😃😄", "😀😆😃😄", ""); + assertStringTrim(UNICODE_CI, "😀😆😃😄", "😀😆😃😄", ""); + assertStringTrim(UTF8_BINARY, "𐐅", "𐐅", ""); + assertStringTrim(UTF8_LCASE, "𐐅", "𐐅", ""); + assertStringTrim(UNICODE, "𐐅", "𐐅", ""); + assertStringTrim(UNICODE_CI, "𐐅", "𐐅", ""); + assertStringTrim(UTF8_BINARY, "𐐅", "𐐭", "𐐅"); + assertStringTrim(UTF8_LCASE, "𐐅", "𐐭", ""); + assertStringTrim(UNICODE, "𐐅", "𐐭", "𐐅"); + assertStringTrim(UNICODE_CI, "𐐅", "𐐭", ""); + assertStringTrim(UTF8_BINARY, "𝔸", "𝔸", ""); + assertStringTrim(UTF8_LCASE, "𝔸", "𝔸", ""); + assertStringTrim(UNICODE, "𝔸", "𝔸", ""); + assertStringTrim(UNICODE_CI, "𝔸", "𝔸", ""); + assertStringTrim(UTF8_BINARY, "𝔸", "A", "𝔸"); + assertStringTrim(UTF8_LCASE, "𝔸", "A", "𝔸"); + assertStringTrim(UNICODE, "𝔸", "A", "𝔸"); + assertStringTrim(UNICODE_CI, "𝔸", "A", ""); + assertStringTrim(UTF8_BINARY, "𝔸", "a", "𝔸"); + assertStringTrim(UTF8_LCASE, "𝔸", "a", "𝔸"); + assertStringTrim(UNICODE, "𝔸", "a", "𝔸"); + assertStringTrim(UNICODE_CI, "𝔸", "a", ""); } /** @@ -3078,277 +3079,277 @@ private void assertStringTrimLeft(String collationName, String sourceString, Str @Test public void testStringTrimLeft() throws SparkException { // Basic tests - UTF8_BINARY. 
- assertStringTrimLeft("UTF8_BINARY", "", "", ""); - assertStringTrimLeft("UTF8_BINARY", "", "xyz", ""); - assertStringTrimLeft("UTF8_BINARY", "asd", "", "asd"); - assertStringTrimLeft("UTF8_BINARY", "asd", null, "asd"); - assertStringTrimLeft("UTF8_BINARY", " asd ", null, "asd "); - assertStringTrimLeft("UTF8_BINARY", " a世a ", null, "a世a "); - assertStringTrimLeft("UTF8_BINARY", "asd", "x", "asd"); - assertStringTrimLeft("UTF8_BINARY", "xxasdxx", "x", "asdxx"); - assertStringTrimLeft("UTF8_BINARY", "xa世ax", "x", "a世ax"); + assertStringTrimLeft(UTF8_BINARY, "", "", ""); + assertStringTrimLeft(UTF8_BINARY, "", "xyz", ""); + assertStringTrimLeft(UTF8_BINARY, "asd", "", "asd"); + assertStringTrimLeft(UTF8_BINARY, "asd", null, "asd"); + assertStringTrimLeft(UTF8_BINARY, " asd ", null, "asd "); + assertStringTrimLeft(UTF8_BINARY, " a世a ", null, "a世a "); + assertStringTrimLeft(UTF8_BINARY, "asd", "x", "asd"); + assertStringTrimLeft(UTF8_BINARY, "xxasdxx", "x", "asdxx"); + assertStringTrimLeft(UTF8_BINARY, "xa世ax", "x", "a世ax"); // Basic tests - UTF8_LCASE. 
- assertStringTrimLeft("UTF8_LCASE", "", "", ""); - assertStringTrimLeft("UTF8_LCASE", "", "xyz", ""); - assertStringTrimLeft("UTF8_LCASE", "asd", "", "asd"); - assertStringTrimLeft("UTF8_LCASE", "asd", null, "asd"); - assertStringTrimLeft("UTF8_LCASE", " asd ", null, "asd "); - assertStringTrimLeft("UTF8_LCASE", " a世a ", null, "a世a "); - assertStringTrimLeft("UTF8_LCASE", "asd", "x", "asd"); - assertStringTrimLeft("UTF8_LCASE", "xxasdxx", "x", "asdxx"); - assertStringTrimLeft("UTF8_LCASE", "xa世ax", "x", "a世ax"); + assertStringTrimLeft(UTF8_LCASE, "", "", ""); + assertStringTrimLeft(UTF8_LCASE, "", "xyz", ""); + assertStringTrimLeft(UTF8_LCASE, "asd", "", "asd"); + assertStringTrimLeft(UTF8_LCASE, "asd", null, "asd"); + assertStringTrimLeft(UTF8_LCASE, " asd ", null, "asd "); + assertStringTrimLeft(UTF8_LCASE, " a世a ", null, "a世a "); + assertStringTrimLeft(UTF8_LCASE, "asd", "x", "asd"); + assertStringTrimLeft(UTF8_LCASE, "xxasdxx", "x", "asdxx"); + assertStringTrimLeft(UTF8_LCASE, "xa世ax", "x", "a世ax"); // Basic tests - UNICODE. 
- assertStringTrimLeft("UNICODE", "", "", ""); - assertStringTrimLeft("UNICODE", "", "xyz", ""); - assertStringTrimLeft("UNICODE", "asd", "", "asd"); - assertStringTrimLeft("UNICODE", "asd", null, "asd"); - assertStringTrimLeft("UNICODE", " asd ", null, "asd "); - assertStringTrimLeft("UNICODE", " a世a ", null, "a世a "); - assertStringTrimLeft("UNICODE", "asd", "x", "asd"); - assertStringTrimLeft("UNICODE", "xxasdxx", "x", "asdxx"); - assertStringTrimLeft("UNICODE", "xa世ax", "x", "a世ax"); + assertStringTrimLeft(UNICODE, "", "", ""); + assertStringTrimLeft(UNICODE, "", "xyz", ""); + assertStringTrimLeft(UNICODE, "asd", "", "asd"); + assertStringTrimLeft(UNICODE, "asd", null, "asd"); + assertStringTrimLeft(UNICODE, " asd ", null, "asd "); + assertStringTrimLeft(UNICODE, " a世a ", null, "a世a "); + assertStringTrimLeft(UNICODE, "asd", "x", "asd"); + assertStringTrimLeft(UNICODE, "xxasdxx", "x", "asdxx"); + assertStringTrimLeft(UNICODE, "xa世ax", "x", "a世ax"); // Basic tests - UNICODE_CI. - assertStringTrimLeft("UNICODE_CI", "", "", ""); - assertStringTrimLeft("UNICODE_CI", "", "xyz", ""); - assertStringTrimLeft("UNICODE_CI", "asd", "", "asd"); - assertStringTrimLeft("UNICODE_CI", "asd", null, "asd"); - assertStringTrimLeft("UNICODE_CI", " asd ", null, "asd "); - assertStringTrimLeft("UNICODE_CI", " a世a ", null, "a世a "); - assertStringTrimLeft("UNICODE_CI", "asd", "x", "asd"); - assertStringTrimLeft("UNICODE_CI", "xxasdxx", "x", "asdxx"); - assertStringTrimLeft("UNICODE_CI", "xa世ax", "x", "a世ax"); + assertStringTrimLeft(UNICODE_CI, "", "", ""); + assertStringTrimLeft(UNICODE_CI, "", "xyz", ""); + assertStringTrimLeft(UNICODE_CI, "asd", "", "asd"); + assertStringTrimLeft(UNICODE_CI, "asd", null, "asd"); + assertStringTrimLeft(UNICODE_CI, " asd ", null, "asd "); + assertStringTrimLeft(UNICODE_CI, " a世a ", null, "a世a "); + assertStringTrimLeft(UNICODE_CI, "asd", "x", "asd"); + assertStringTrimLeft(UNICODE_CI, "xxasdxx", "x", "asdxx"); + assertStringTrimLeft(UNICODE_CI, 
"xa世ax", "x", "a世ax"); // Case variation. - assertStringTrimLeft("UTF8_BINARY", "ddsXXXaa", "asd", "XXXaa"); - assertStringTrimLeft("UTF8_LCASE", "ddsXXXaa", "aSd", "XXXaa"); - assertStringTrimLeft("UNICODE", "ddsXXXaa", "asd", "XXXaa"); - assertStringTrimLeft("UNICODE_CI", "ddsXXXaa", "aSd", "XXXaa"); + assertStringTrimLeft(UTF8_BINARY, "ddsXXXaa", "asd", "XXXaa"); + assertStringTrimLeft(UTF8_LCASE, "ddsXXXaa", "aSd", "XXXaa"); + assertStringTrimLeft(UNICODE, "ddsXXXaa", "asd", "XXXaa"); + assertStringTrimLeft(UNICODE_CI, "ddsXXXaa", "aSd", "XXXaa"); // One-to-many case mapping (e.g. Turkish dotted I).. - assertStringTrimLeft("UTF8_BINARY", "ẞaaaẞ", "ß", "ẞaaaẞ"); - assertStringTrimLeft("UTF8_BINARY", "ßaaaß", "ẞ", "ßaaaß"); - assertStringTrimLeft("UTF8_BINARY", "Ëaaaẞ", "Ëẞ", "aaaẞ"); - assertStringTrimLeft("UTF8_LCASE", "ẞaaaẞ", "ß", "aaaẞ"); - assertStringTrimLeft("UTF8_LCASE", "ßaaaß", "ẞ", "aaaß"); - assertStringTrimLeft("UTF8_LCASE", "Ëaaaẞ", "Ëẞ", "aaaẞ"); - assertStringTrimLeft("UNICODE", "ẞaaaẞ", "ß", "ẞaaaẞ"); - assertStringTrimLeft("UNICODE", "ßaaaß", "ẞ", "ßaaaß"); - assertStringTrimLeft("UNICODE", "Ëaaaẞ", "Ëẞ", "aaaẞ"); - assertStringTrimLeft("UNICODE_CI", "ẞaaaẞ", "ß", "aaaẞ"); - assertStringTrimLeft("UNICODE_CI", "ßaaaß", "ẞ", "aaaß"); - assertStringTrimLeft("UNICODE_CI", "Ëaaaẞ", "Ëẞ", "aaaẞ"); + assertStringTrimLeft(UTF8_BINARY, "ẞaaaẞ", "ß", "ẞaaaẞ"); + assertStringTrimLeft(UTF8_BINARY, "ßaaaß", "ẞ", "ßaaaß"); + assertStringTrimLeft(UTF8_BINARY, "Ëaaaẞ", "Ëẞ", "aaaẞ"); + assertStringTrimLeft(UTF8_LCASE, "ẞaaaẞ", "ß", "aaaẞ"); + assertStringTrimLeft(UTF8_LCASE, "ßaaaß", "ẞ", "aaaß"); + assertStringTrimLeft(UTF8_LCASE, "Ëaaaẞ", "Ëẞ", "aaaẞ"); + assertStringTrimLeft(UNICODE, "ẞaaaẞ", "ß", "ẞaaaẞ"); + assertStringTrimLeft(UNICODE, "ßaaaß", "ẞ", "ßaaaß"); + assertStringTrimLeft(UNICODE, "Ëaaaẞ", "Ëẞ", "aaaẞ"); + assertStringTrimLeft(UNICODE_CI, "ẞaaaẞ", "ß", "aaaẞ"); + assertStringTrimLeft(UNICODE_CI, "ßaaaß", "ẞ", "aaaß"); + 
assertStringTrimLeft(UNICODE_CI, "Ëaaaẞ", "Ëẞ", "aaaẞ"); // One-to-many case mapping (e.g. Turkish dotted I). - assertStringTrimLeft("UTF8_BINARY", "i", "i", ""); - assertStringTrimLeft("UTF8_BINARY", "iii", "I", "iii"); - assertStringTrimLeft("UTF8_BINARY", "I", "iii", "I"); - assertStringTrimLeft("UTF8_BINARY", "ixi", "i", "xi"); - assertStringTrimLeft("UTF8_BINARY", "i", "İ", "i"); - assertStringTrimLeft("UTF8_BINARY", "i\u0307", "İ", "i\u0307"); - assertStringTrimLeft("UTF8_BINARY", "ii\u0307", "İi", "\u0307"); - assertStringTrimLeft("UTF8_BINARY", "iii\u0307", "İi", "\u0307"); - assertStringTrimLeft("UTF8_BINARY", "iiii\u0307", "iİ", "\u0307"); - assertStringTrimLeft("UTF8_BINARY", "ii\u0307ii\u0307", "iİ", "\u0307ii\u0307"); - assertStringTrimLeft("UTF8_BINARY", "i\u0307", "i", "\u0307"); - assertStringTrimLeft("UTF8_BINARY", "i\u0307", "\u0307", "i\u0307"); - assertStringTrimLeft("UTF8_BINARY", "i\u0307", "i\u0307", ""); - assertStringTrimLeft("UTF8_BINARY", "i\u0307i\u0307", "i\u0307", ""); - assertStringTrimLeft("UTF8_BINARY", "i\u0307\u0307", "i\u0307", ""); - assertStringTrimLeft("UTF8_BINARY", "i\u0307i", "i\u0307", ""); - assertStringTrimLeft("UTF8_BINARY", "i\u0307i", "İ", "i\u0307i"); - assertStringTrimLeft("UTF8_BINARY", "i\u0307İ", "i\u0307", "İ"); - assertStringTrimLeft("UTF8_BINARY", "i\u0307İ", "İ", "i\u0307İ"); - assertStringTrimLeft("UTF8_BINARY", "İ", "İ", ""); - assertStringTrimLeft("UTF8_BINARY", "IXi", "İ", "IXi"); - assertStringTrimLeft("UTF8_BINARY", "ix\u0307", "Ixİ", "ix\u0307"); - assertStringTrimLeft("UTF8_BINARY", "i\u0307x", "IXİ", "i\u0307x"); - assertStringTrimLeft("UTF8_BINARY", "i\u0307x", "ix\u0307İ", ""); - assertStringTrimLeft("UTF8_BINARY", "İ", "i", "İ"); - assertStringTrimLeft("UTF8_BINARY", "İ", "\u0307", "İ"); - assertStringTrimLeft("UTF8_BINARY", "Ixİ", "i\u0307", "Ixİ"); - assertStringTrimLeft("UTF8_BINARY", "IXİ", "ix\u0307", "IXİ"); - assertStringTrimLeft("UTF8_BINARY", "xi\u0307", "\u0307IX", "xi\u0307"); - 
assertStringTrimLeft("UTF8_LCASE", "i", "i", ""); - assertStringTrimLeft("UTF8_LCASE", "iii", "I", ""); - assertStringTrimLeft("UTF8_LCASE", "I", "iii", ""); - assertStringTrimLeft("UTF8_LCASE", "ixi", "i", "xi"); - assertStringTrimLeft("UTF8_LCASE", "i", "İ", "i"); - assertStringTrimLeft("UTF8_LCASE", "i\u0307", "İ", ""); - assertStringTrimLeft("UTF8_LCASE", "ii\u0307", "İi", ""); - assertStringTrimLeft("UTF8_LCASE", "iii\u0307", "İi", ""); - assertStringTrimLeft("UTF8_LCASE", "iiii\u0307", "iİ", ""); - assertStringTrimLeft("UTF8_LCASE", "ii\u0307ii\u0307", "iİ", ""); - assertStringTrimLeft("UTF8_LCASE", "i\u0307", "i", "\u0307"); - assertStringTrimLeft("UTF8_LCASE", "i\u0307", "\u0307", "i\u0307"); - assertStringTrimLeft("UTF8_LCASE", "i\u0307", "i\u0307", ""); - assertStringTrimLeft("UTF8_LCASE", "i\u0307i\u0307", "i\u0307", ""); - assertStringTrimLeft("UTF8_LCASE", "i\u0307\u0307", "i\u0307", ""); - assertStringTrimLeft("UTF8_LCASE", "i\u0307i", "i\u0307", ""); - assertStringTrimLeft("UTF8_LCASE", "i\u0307i", "İ", "i"); - assertStringTrimLeft("UTF8_LCASE", "i\u0307İ", "i\u0307", "İ"); - assertStringTrimLeft("UTF8_LCASE", "i\u0307İ", "İ", ""); - assertStringTrimLeft("UTF8_LCASE", "İ", "İ", ""); - assertStringTrimLeft("UTF8_LCASE", "IXi", "İ", "IXi"); - assertStringTrimLeft("UTF8_LCASE", "ix\u0307", "Ixİ", "\u0307"); - assertStringTrimLeft("UTF8_LCASE", "i\u0307x", "IXİ", ""); - assertStringTrimLeft("UTF8_LCASE", "i\u0307x", "I\u0307xİ", ""); - assertStringTrimLeft("UTF8_LCASE", "İ", "i", "İ"); - assertStringTrimLeft("UTF8_LCASE", "İ", "\u0307", "İ"); - assertStringTrimLeft("UTF8_LCASE", "Ixİ", "i\u0307", "xİ"); - assertStringTrimLeft("UTF8_LCASE", "IXİ", "ix\u0307", "İ"); - assertStringTrimLeft("UTF8_LCASE", "xi\u0307", "\u0307IX", ""); - assertStringTrimLeft("UNICODE", "i", "i", ""); - assertStringTrimLeft("UNICODE", "iii", "I", "iii"); - assertStringTrimLeft("UNICODE", "I", "iii", "I"); - assertStringTrimLeft("UNICODE", "ixi", "i", "xi"); - 
assertStringTrimLeft("UNICODE", "i", "İ", "i"); - assertStringTrimLeft("UNICODE", "i\u0307", "İ", "i\u0307"); - assertStringTrimLeft("UNICODE", "ii\u0307", "İi", "i\u0307"); - assertStringTrimLeft("UNICODE", "iii\u0307", "İi", "i\u0307"); - assertStringTrimLeft("UNICODE", "iiii\u0307", "iİ", "i\u0307"); - assertStringTrimLeft("UNICODE", "ii\u0307ii\u0307", "iİ", "i\u0307ii\u0307"); - assertStringTrimLeft("UNICODE", "i\u0307", "i", "i\u0307"); - assertStringTrimLeft("UNICODE", "i\u0307", "\u0307", "i\u0307"); - assertStringTrimLeft("UNICODE", "i\u0307", "i\u0307", "i\u0307"); - assertStringTrimLeft("UNICODE", "i\u0307i\u0307", "i\u0307", "i\u0307i\u0307"); - assertStringTrimLeft("UNICODE", "i\u0307\u0307", "i\u0307", "i\u0307\u0307"); - assertStringTrimLeft("UNICODE", "i\u0307i", "i\u0307", "i\u0307i"); - assertStringTrimLeft("UNICODE", "i\u0307i", "İ", "i\u0307i"); - assertStringTrimLeft("UNICODE", "i\u0307İ", "i\u0307", "i\u0307İ"); - assertStringTrimLeft("UNICODE", "i\u0307İ", "İ", "i\u0307İ"); - assertStringTrimLeft("UNICODE", "İ", "İ", ""); - assertStringTrimLeft("UNICODE", "IXi", "İ", "IXi"); - assertStringTrimLeft("UNICODE", "ix\u0307", "Ixİ", "ix\u0307"); - assertStringTrimLeft("UNICODE", "i\u0307x", "IXİ", "i\u0307x"); - assertStringTrimLeft("UNICODE", "i\u0307x", "ix\u0307İ", "i\u0307x"); - assertStringTrimLeft("UNICODE", "İ", "i", "İ"); - assertStringTrimLeft("UNICODE", "İ", "\u0307", "İ"); - assertStringTrimLeft("UNICODE", "i\u0307", "i\u0307", "i\u0307"); - assertStringTrimLeft("UNICODE", "Ixİ", "i\u0307", "Ixİ"); - assertStringTrimLeft("UNICODE", "IXİ", "ix\u0307", "IXİ"); - assertStringTrimLeft("UNICODE", "xi\u0307", "\u0307IX", "xi\u0307"); - assertStringTrimLeft("UNICODE_CI", "i", "i", ""); - assertStringTrimLeft("UNICODE_CI", "iii", "I", ""); - assertStringTrimLeft("UNICODE_CI", "I", "iii", ""); - assertStringTrimLeft("UNICODE_CI", "ixi", "i", "xi"); - assertStringTrimLeft("UNICODE_CI", "i", "İ", "i"); - assertStringTrimLeft("UNICODE_CI", 
"i\u0307", "İ", ""); - assertStringTrimLeft("UNICODE_CI", "ii\u0307", "İi", ""); - assertStringTrimLeft("UNICODE_CI", "iii\u0307", "İi", ""); - assertStringTrimLeft("UNICODE_CI", "iiii\u0307", "iİ", ""); - assertStringTrimLeft("UNICODE_CI", "ii\u0307ii\u0307", "iİ", ""); - assertStringTrimLeft("UNICODE_CI", "i\u0307", "i", "i\u0307"); - assertStringTrimLeft("UNICODE_CI", "i\u0307", "\u0307", "i\u0307"); - assertStringTrimLeft("UNICODE_CI", "i\u0307", "i\u0307", "i\u0307"); - assertStringTrimLeft("UNICODE_CI", "i\u0307i\u0307", "i\u0307", "i\u0307i\u0307"); - assertStringTrimLeft("UNICODE_CI", "i\u0307\u0307", "i\u0307", "i\u0307\u0307"); - assertStringTrimLeft("UNICODE_CI", "i\u0307i", "i\u0307", "i\u0307i"); - assertStringTrimLeft("UNICODE_CI", "i\u0307i", "İ", "i"); - assertStringTrimLeft("UNICODE_CI", "i\u0307İ", "i\u0307", "i\u0307İ"); - assertStringTrimLeft("UNICODE_CI", "i\u0307İ", "İ", ""); - assertStringTrimLeft("UNICODE_CI", "İ", "İ", ""); - assertStringTrimLeft("UNICODE_CI", "IXi", "İ", "IXi"); - assertStringTrimLeft("UNICODE_CI", "ix\u0307", "Ixİ", "x\u0307"); - assertStringTrimLeft("UNICODE_CI", "i\u0307x", "IXİ", ""); - assertStringTrimLeft("UNICODE_CI", "i\u0307x", "I\u0307xİ", ""); - assertStringTrimLeft("UNICODE_CI", "İ", "i", "İ"); - assertStringTrimLeft("UNICODE_CI", "İ", "\u0307", "İ"); - assertStringTrimLeft("UNICODE_CI", "i\u0307", "i\u0307", "i\u0307"); - assertStringTrimLeft("UNICODE_CI", "Ixİ", "i\u0307", "xİ"); - assertStringTrimLeft("UNICODE_CI", "IXİ", "ix\u0307", "İ"); - assertStringTrimLeft("UNICODE_CI", "xi\u0307", "\u0307IX", "i\u0307"); + assertStringTrimLeft(UTF8_BINARY, "i", "i", ""); + assertStringTrimLeft(UTF8_BINARY, "iii", "I", "iii"); + assertStringTrimLeft(UTF8_BINARY, "I", "iii", "I"); + assertStringTrimLeft(UTF8_BINARY, "ixi", "i", "xi"); + assertStringTrimLeft(UTF8_BINARY, "i", "İ", "i"); + assertStringTrimLeft(UTF8_BINARY, "i\u0307", "İ", "i\u0307"); + assertStringTrimLeft(UTF8_BINARY, "ii\u0307", "İi", "\u0307"); + 
assertStringTrimLeft(UTF8_BINARY, "iii\u0307", "İi", "\u0307"); + assertStringTrimLeft(UTF8_BINARY, "iiii\u0307", "iİ", "\u0307"); + assertStringTrimLeft(UTF8_BINARY, "ii\u0307ii\u0307", "iİ", "\u0307ii\u0307"); + assertStringTrimLeft(UTF8_BINARY, "i\u0307", "i", "\u0307"); + assertStringTrimLeft(UTF8_BINARY, "i\u0307", "\u0307", "i\u0307"); + assertStringTrimLeft(UTF8_BINARY, "i\u0307", "i\u0307", ""); + assertStringTrimLeft(UTF8_BINARY, "i\u0307i\u0307", "i\u0307", ""); + assertStringTrimLeft(UTF8_BINARY, "i\u0307\u0307", "i\u0307", ""); + assertStringTrimLeft(UTF8_BINARY, "i\u0307i", "i\u0307", ""); + assertStringTrimLeft(UTF8_BINARY, "i\u0307i", "İ", "i\u0307i"); + assertStringTrimLeft(UTF8_BINARY, "i\u0307İ", "i\u0307", "İ"); + assertStringTrimLeft(UTF8_BINARY, "i\u0307İ", "İ", "i\u0307İ"); + assertStringTrimLeft(UTF8_BINARY, "İ", "İ", ""); + assertStringTrimLeft(UTF8_BINARY, "IXi", "İ", "IXi"); + assertStringTrimLeft(UTF8_BINARY, "ix\u0307", "Ixİ", "ix\u0307"); + assertStringTrimLeft(UTF8_BINARY, "i\u0307x", "IXİ", "i\u0307x"); + assertStringTrimLeft(UTF8_BINARY, "i\u0307x", "ix\u0307İ", ""); + assertStringTrimLeft(UTF8_BINARY, "İ", "i", "İ"); + assertStringTrimLeft(UTF8_BINARY, "İ", "\u0307", "İ"); + assertStringTrimLeft(UTF8_BINARY, "Ixİ", "i\u0307", "Ixİ"); + assertStringTrimLeft(UTF8_BINARY, "IXİ", "ix\u0307", "IXİ"); + assertStringTrimLeft(UTF8_BINARY, "xi\u0307", "\u0307IX", "xi\u0307"); + assertStringTrimLeft(UTF8_LCASE, "i", "i", ""); + assertStringTrimLeft(UTF8_LCASE, "iii", "I", ""); + assertStringTrimLeft(UTF8_LCASE, "I", "iii", ""); + assertStringTrimLeft(UTF8_LCASE, "ixi", "i", "xi"); + assertStringTrimLeft(UTF8_LCASE, "i", "İ", "i"); + assertStringTrimLeft(UTF8_LCASE, "i\u0307", "İ", ""); + assertStringTrimLeft(UTF8_LCASE, "ii\u0307", "İi", ""); + assertStringTrimLeft(UTF8_LCASE, "iii\u0307", "İi", ""); + assertStringTrimLeft(UTF8_LCASE, "iiii\u0307", "iİ", ""); + assertStringTrimLeft(UTF8_LCASE, "ii\u0307ii\u0307", "iİ", ""); + 
assertStringTrimLeft(UTF8_LCASE, "i\u0307", "i", "\u0307"); + assertStringTrimLeft(UTF8_LCASE, "i\u0307", "\u0307", "i\u0307"); + assertStringTrimLeft(UTF8_LCASE, "i\u0307", "i\u0307", ""); + assertStringTrimLeft(UTF8_LCASE, "i\u0307i\u0307", "i\u0307", ""); + assertStringTrimLeft(UTF8_LCASE, "i\u0307\u0307", "i\u0307", ""); + assertStringTrimLeft(UTF8_LCASE, "i\u0307i", "i\u0307", ""); + assertStringTrimLeft(UTF8_LCASE, "i\u0307i", "İ", "i"); + assertStringTrimLeft(UTF8_LCASE, "i\u0307İ", "i\u0307", "İ"); + assertStringTrimLeft(UTF8_LCASE, "i\u0307İ", "İ", ""); + assertStringTrimLeft(UTF8_LCASE, "İ", "İ", ""); + assertStringTrimLeft(UTF8_LCASE, "IXi", "İ", "IXi"); + assertStringTrimLeft(UTF8_LCASE, "ix\u0307", "Ixİ", "\u0307"); + assertStringTrimLeft(UTF8_LCASE, "i\u0307x", "IXİ", ""); + assertStringTrimLeft(UTF8_LCASE, "i\u0307x", "I\u0307xİ", ""); + assertStringTrimLeft(UTF8_LCASE, "İ", "i", "İ"); + assertStringTrimLeft(UTF8_LCASE, "İ", "\u0307", "İ"); + assertStringTrimLeft(UTF8_LCASE, "Ixİ", "i\u0307", "xİ"); + assertStringTrimLeft(UTF8_LCASE, "IXİ", "ix\u0307", "İ"); + assertStringTrimLeft(UTF8_LCASE, "xi\u0307", "\u0307IX", ""); + assertStringTrimLeft(UNICODE, "i", "i", ""); + assertStringTrimLeft(UNICODE, "iii", "I", "iii"); + assertStringTrimLeft(UNICODE, "I", "iii", "I"); + assertStringTrimLeft(UNICODE, "ixi", "i", "xi"); + assertStringTrimLeft(UNICODE, "i", "İ", "i"); + assertStringTrimLeft(UNICODE, "i\u0307", "İ", "i\u0307"); + assertStringTrimLeft(UNICODE, "ii\u0307", "İi", "i\u0307"); + assertStringTrimLeft(UNICODE, "iii\u0307", "İi", "i\u0307"); + assertStringTrimLeft(UNICODE, "iiii\u0307", "iİ", "i\u0307"); + assertStringTrimLeft(UNICODE, "ii\u0307ii\u0307", "iİ", "i\u0307ii\u0307"); + assertStringTrimLeft(UNICODE, "i\u0307", "i", "i\u0307"); + assertStringTrimLeft(UNICODE, "i\u0307", "\u0307", "i\u0307"); + assertStringTrimLeft(UNICODE, "i\u0307", "i\u0307", "i\u0307"); + assertStringTrimLeft(UNICODE, "i\u0307i\u0307", "i\u0307", "i\u0307i\u0307"); 
+ assertStringTrimLeft(UNICODE, "i\u0307\u0307", "i\u0307", "i\u0307\u0307"); + assertStringTrimLeft(UNICODE, "i\u0307i", "i\u0307", "i\u0307i"); + assertStringTrimLeft(UNICODE, "i\u0307i", "İ", "i\u0307i"); + assertStringTrimLeft(UNICODE, "i\u0307İ", "i\u0307", "i\u0307İ"); + assertStringTrimLeft(UNICODE, "i\u0307İ", "İ", "i\u0307İ"); + assertStringTrimLeft(UNICODE, "İ", "İ", ""); + assertStringTrimLeft(UNICODE, "IXi", "İ", "IXi"); + assertStringTrimLeft(UNICODE, "ix\u0307", "Ixİ", "ix\u0307"); + assertStringTrimLeft(UNICODE, "i\u0307x", "IXİ", "i\u0307x"); + assertStringTrimLeft(UNICODE, "i\u0307x", "ix\u0307İ", "i\u0307x"); + assertStringTrimLeft(UNICODE, "İ", "i", "İ"); + assertStringTrimLeft(UNICODE, "İ", "\u0307", "İ"); + assertStringTrimLeft(UNICODE, "i\u0307", "i\u0307", "i\u0307"); + assertStringTrimLeft(UNICODE, "Ixİ", "i\u0307", "Ixİ"); + assertStringTrimLeft(UNICODE, "IXİ", "ix\u0307", "IXİ"); + assertStringTrimLeft(UNICODE, "xi\u0307", "\u0307IX", "xi\u0307"); + assertStringTrimLeft(UNICODE_CI, "i", "i", ""); + assertStringTrimLeft(UNICODE_CI, "iii", "I", ""); + assertStringTrimLeft(UNICODE_CI, "I", "iii", ""); + assertStringTrimLeft(UNICODE_CI, "ixi", "i", "xi"); + assertStringTrimLeft(UNICODE_CI, "i", "İ", "i"); + assertStringTrimLeft(UNICODE_CI, "i\u0307", "İ", ""); + assertStringTrimLeft(UNICODE_CI, "ii\u0307", "İi", ""); + assertStringTrimLeft(UNICODE_CI, "iii\u0307", "İi", ""); + assertStringTrimLeft(UNICODE_CI, "iiii\u0307", "iİ", ""); + assertStringTrimLeft(UNICODE_CI, "ii\u0307ii\u0307", "iİ", ""); + assertStringTrimLeft(UNICODE_CI, "i\u0307", "i", "i\u0307"); + assertStringTrimLeft(UNICODE_CI, "i\u0307", "\u0307", "i\u0307"); + assertStringTrimLeft(UNICODE_CI, "i\u0307", "i\u0307", "i\u0307"); + assertStringTrimLeft(UNICODE_CI, "i\u0307i\u0307", "i\u0307", "i\u0307i\u0307"); + assertStringTrimLeft(UNICODE_CI, "i\u0307\u0307", "i\u0307", "i\u0307\u0307"); + assertStringTrimLeft(UNICODE_CI, "i\u0307i", "i\u0307", "i\u0307i"); + 
assertStringTrimLeft(UNICODE_CI, "i\u0307i", "İ", "i"); + assertStringTrimLeft(UNICODE_CI, "i\u0307İ", "i\u0307", "i\u0307İ"); + assertStringTrimLeft(UNICODE_CI, "i\u0307İ", "İ", ""); + assertStringTrimLeft(UNICODE_CI, "İ", "İ", ""); + assertStringTrimLeft(UNICODE_CI, "IXi", "İ", "IXi"); + assertStringTrimLeft(UNICODE_CI, "ix\u0307", "Ixİ", "x\u0307"); + assertStringTrimLeft(UNICODE_CI, "i\u0307x", "IXİ", ""); + assertStringTrimLeft(UNICODE_CI, "i\u0307x", "I\u0307xİ", ""); + assertStringTrimLeft(UNICODE_CI, "İ", "i", "İ"); + assertStringTrimLeft(UNICODE_CI, "İ", "\u0307", "İ"); + assertStringTrimLeft(UNICODE_CI, "i\u0307", "i\u0307", "i\u0307"); + assertStringTrimLeft(UNICODE_CI, "Ixİ", "i\u0307", "xİ"); + assertStringTrimLeft(UNICODE_CI, "IXİ", "ix\u0307", "İ"); + assertStringTrimLeft(UNICODE_CI, "xi\u0307", "\u0307IX", "i\u0307"); // Conditional case mapping (e.g. Greek sigmas). - assertStringTrimLeft("UTF8_BINARY", "ςxς", "σ", "ςxς"); - assertStringTrimLeft("UTF8_BINARY", "ςxς", "ς", "xς"); - assertStringTrimLeft("UTF8_BINARY", "ςxς", "Σ", "ςxς"); - assertStringTrimLeft("UTF8_BINARY", "σxσ", "σ", "xσ"); - assertStringTrimLeft("UTF8_BINARY", "σxσ", "ς", "σxσ"); - assertStringTrimLeft("UTF8_BINARY", "σxσ", "Σ", "σxσ"); - assertStringTrimLeft("UTF8_BINARY", "ΣxΣ", "σ", "ΣxΣ"); - assertStringTrimLeft("UTF8_BINARY", "ΣxΣ", "ς", "ΣxΣ"); - assertStringTrimLeft("UTF8_BINARY", "ΣxΣ", "Σ", "xΣ"); - assertStringTrimLeft("UTF8_LCASE", "ςxς", "σ", "xς"); - assertStringTrimLeft("UTF8_LCASE", "ςxς", "ς", "xς"); - assertStringTrimLeft("UTF8_LCASE", "ςxς", "Σ", "xς"); - assertStringTrimLeft("UTF8_LCASE", "σxσ", "σ", "xσ"); - assertStringTrimLeft("UTF8_LCASE", "σxσ", "ς", "xσ"); - assertStringTrimLeft("UTF8_LCASE", "σxσ", "Σ", "xσ"); - assertStringTrimLeft("UTF8_LCASE", "ΣxΣ", "σ", "xΣ"); - assertStringTrimLeft("UTF8_LCASE", "ΣxΣ", "ς", "xΣ"); - assertStringTrimLeft("UTF8_LCASE", "ΣxΣ", "Σ", "xΣ"); - assertStringTrimLeft("UNICODE", "ςxς", "σ", "ςxς"); - 
assertStringTrimLeft("UNICODE", "ςxς", "ς", "xς"); - assertStringTrimLeft("UNICODE", "ςxς", "Σ", "ςxς"); - assertStringTrimLeft("UNICODE", "σxσ", "σ", "xσ"); - assertStringTrimLeft("UNICODE", "σxσ", "ς", "σxσ"); - assertStringTrimLeft("UNICODE", "σxσ", "Σ", "σxσ"); - assertStringTrimLeft("UNICODE", "ΣxΣ", "σ", "ΣxΣ"); - assertStringTrimLeft("UNICODE", "ΣxΣ", "ς", "ΣxΣ"); - assertStringTrimLeft("UNICODE", "ΣxΣ", "Σ", "xΣ"); - assertStringTrimLeft("UNICODE_CI", "ςxς", "σ", "xς"); - assertStringTrimLeft("UNICODE_CI", "ςxς", "ς", "xς"); - assertStringTrimLeft("UNICODE_CI", "ςxς", "Σ", "xς"); - assertStringTrimLeft("UNICODE_CI", "σxσ", "σ", "xσ"); - assertStringTrimLeft("UNICODE_CI", "σxσ", "ς", "xσ"); - assertStringTrimLeft("UNICODE_CI", "σxσ", "Σ", "xσ"); - assertStringTrimLeft("UNICODE_CI", "ΣxΣ", "σ", "xΣ"); - assertStringTrimLeft("UNICODE_CI", "ΣxΣ", "ς", "xΣ"); - assertStringTrimLeft("UNICODE_CI", "ΣxΣ", "Σ", "xΣ"); + assertStringTrimLeft(UTF8_BINARY, "ςxς", "σ", "ςxς"); + assertStringTrimLeft(UTF8_BINARY, "ςxς", "ς", "xς"); + assertStringTrimLeft(UTF8_BINARY, "ςxς", "Σ", "ςxς"); + assertStringTrimLeft(UTF8_BINARY, "σxσ", "σ", "xσ"); + assertStringTrimLeft(UTF8_BINARY, "σxσ", "ς", "σxσ"); + assertStringTrimLeft(UTF8_BINARY, "σxσ", "Σ", "σxσ"); + assertStringTrimLeft(UTF8_BINARY, "ΣxΣ", "σ", "ΣxΣ"); + assertStringTrimLeft(UTF8_BINARY, "ΣxΣ", "ς", "ΣxΣ"); + assertStringTrimLeft(UTF8_BINARY, "ΣxΣ", "Σ", "xΣ"); + assertStringTrimLeft(UTF8_LCASE, "ςxς", "σ", "xς"); + assertStringTrimLeft(UTF8_LCASE, "ςxς", "ς", "xς"); + assertStringTrimLeft(UTF8_LCASE, "ςxς", "Σ", "xς"); + assertStringTrimLeft(UTF8_LCASE, "σxσ", "σ", "xσ"); + assertStringTrimLeft(UTF8_LCASE, "σxσ", "ς", "xσ"); + assertStringTrimLeft(UTF8_LCASE, "σxσ", "Σ", "xσ"); + assertStringTrimLeft(UTF8_LCASE, "ΣxΣ", "σ", "xΣ"); + assertStringTrimLeft(UTF8_LCASE, "ΣxΣ", "ς", "xΣ"); + assertStringTrimLeft(UTF8_LCASE, "ΣxΣ", "Σ", "xΣ"); + assertStringTrimLeft(UNICODE, "ςxς", "σ", "ςxς"); + 
assertStringTrimLeft(UNICODE, "ςxς", "ς", "xς"); + assertStringTrimLeft(UNICODE, "ςxς", "Σ", "ςxς"); + assertStringTrimLeft(UNICODE, "σxσ", "σ", "xσ"); + assertStringTrimLeft(UNICODE, "σxσ", "ς", "σxσ"); + assertStringTrimLeft(UNICODE, "σxσ", "Σ", "σxσ"); + assertStringTrimLeft(UNICODE, "ΣxΣ", "σ", "ΣxΣ"); + assertStringTrimLeft(UNICODE, "ΣxΣ", "ς", "ΣxΣ"); + assertStringTrimLeft(UNICODE, "ΣxΣ", "Σ", "xΣ"); + assertStringTrimLeft(UNICODE_CI, "ςxς", "σ", "xς"); + assertStringTrimLeft(UNICODE_CI, "ςxς", "ς", "xς"); + assertStringTrimLeft(UNICODE_CI, "ςxς", "Σ", "xς"); + assertStringTrimLeft(UNICODE_CI, "σxσ", "σ", "xσ"); + assertStringTrimLeft(UNICODE_CI, "σxσ", "ς", "xσ"); + assertStringTrimLeft(UNICODE_CI, "σxσ", "Σ", "xσ"); + assertStringTrimLeft(UNICODE_CI, "ΣxΣ", "σ", "xΣ"); + assertStringTrimLeft(UNICODE_CI, "ΣxΣ", "ς", "xΣ"); + assertStringTrimLeft(UNICODE_CI, "ΣxΣ", "Σ", "xΣ"); // Unicode normalization. - assertStringTrimLeft("UTF8_BINARY", "åβγδa\u030A", "å", "βγδa\u030A"); - assertStringTrimLeft("UTF8_LCASE", "åβγδa\u030A", "Å", "βγδa\u030A"); - assertStringTrimLeft("UNICODE", "åβγδa\u030A", "å", "βγδa\u030A"); - assertStringTrimLeft("UNICODE_CI", "åβγδa\u030A", "Å", "βγδa\u030A"); + assertStringTrimLeft(UTF8_BINARY, "åβγδa\u030A", "å", "βγδa\u030A"); + assertStringTrimLeft(UTF8_LCASE, "åβγδa\u030A", "Å", "βγδa\u030A"); + assertStringTrimLeft(UNICODE, "åβγδa\u030A", "å", "βγδa\u030A"); + assertStringTrimLeft(UNICODE_CI, "åβγδa\u030A", "Å", "βγδa\u030A"); // Surrogate pairs. 
- assertStringTrimLeft("UTF8_BINARY", "a🙃b🙃c", "🙃", "a🙃b🙃c"); - assertStringTrimLeft("UTF8_LCASE", "a🙃b🙃c", "🙃", "a🙃b🙃c"); - assertStringTrimLeft("UNICODE", "a🙃b🙃c", "🙃", "a🙃b🙃c"); - assertStringTrimLeft("UNICODE_CI", "a🙃b🙃c", "🙃", "a🙃b🙃c"); - assertStringTrimLeft("UTF8_BINARY", "a🙃b🙃c", "a", "🙃b🙃c"); - assertStringTrimLeft("UTF8_LCASE", "a🙃b🙃c", "a", "🙃b🙃c"); - assertStringTrimLeft("UNICODE", "a🙃b🙃c", "a", "🙃b🙃c"); - assertStringTrimLeft("UNICODE_CI", "a🙃b🙃c", "a", "🙃b🙃c"); - assertStringTrimLeft("UTF8_BINARY", "a🙃b🙃c", "a🙃", "b🙃c"); - assertStringTrimLeft("UTF8_LCASE", "a🙃b🙃c", "a🙃", "b🙃c"); - assertStringTrimLeft("UNICODE", "a🙃b🙃c", "a🙃", "b🙃c"); - assertStringTrimLeft("UNICODE_CI", "a🙃b🙃c", "a🙃", "b🙃c"); - assertStringTrimLeft("UTF8_BINARY", "a🙃b🙃c", "a🙃b", "c"); - assertStringTrimLeft("UTF8_LCASE", "a🙃b🙃c", "a🙃b", "c"); - assertStringTrimLeft("UNICODE", "a🙃b🙃c", "a🙃b", "c"); - assertStringTrimLeft("UNICODE_CI", "a🙃b🙃c", "a🙃b", "c"); - assertStringTrimLeft("UTF8_BINARY", "a🙃b🙃c", "abc🙃", ""); - assertStringTrimLeft("UTF8_LCASE", "a🙃b🙃c", "abc🙃", ""); - assertStringTrimLeft("UNICODE", "a🙃b🙃c", "abc🙃", ""); - assertStringTrimLeft("UNICODE_CI", "a🙃b🙃c", "abc🙃", ""); - assertStringTrimLeft("UTF8_BINARY", "😀😆😃😄", "😆😃", "😀😆😃😄"); - assertStringTrimLeft("UTF8_LCASE", "😀😆😃😄", "😆😃", "😀😆😃😄"); - assertStringTrimLeft("UNICODE", "😀😆😃😄", "😆😃", "😀😆😃😄"); - assertStringTrimLeft("UNICODE_CI", "😀😆😃😄", "😆😃", "😀😆😃😄"); - assertStringTrimLeft("UTF8_BINARY", "😀😆😃😄", "😀😆", "😃😄"); - assertStringTrimLeft("UTF8_LCASE", "😀😆😃😄", "😀😆", "😃😄"); - assertStringTrimLeft("UNICODE", "😀😆😃😄", "😀😆", "😃😄"); - assertStringTrimLeft("UNICODE_CI", "😀😆😃😄", "😀😆", "😃😄"); - assertStringTrimLeft("UTF8_BINARY", "😀😆😃😄", "😀😆😃😄", ""); - assertStringTrimLeft("UTF8_LCASE", "😀😆😃😄", "😀😆😃😄", ""); - assertStringTrimLeft("UNICODE", "😀😆😃😄", "😀😆😃😄", ""); - assertStringTrimLeft("UNICODE_CI", "😀😆😃😄", "😀😆😃😄", ""); - assertStringTrimLeft("UTF8_BINARY", "𐐅", "𐐅", ""); - assertStringTrimLeft("UTF8_LCASE", "𐐅", "𐐅", ""); - 
assertStringTrimLeft("UNICODE", "𐐅", "𐐅", ""); - assertStringTrimLeft("UNICODE_CI", "𐐅", "𐐅", ""); - assertStringTrimLeft("UTF8_BINARY", "𐐅", "𐐭", "𐐅"); - assertStringTrimLeft("UTF8_LCASE", "𐐅", "𐐭", ""); - assertStringTrimLeft("UNICODE", "𐐅", "𐐭", "𐐅"); - assertStringTrimLeft("UNICODE_CI", "𐐅", "𐐭", ""); - assertStringTrimLeft("UTF8_BINARY", "𝔸", "𝔸", ""); - assertStringTrimLeft("UTF8_LCASE", "𝔸", "𝔸", ""); - assertStringTrimLeft("UNICODE", "𝔸", "𝔸", ""); - assertStringTrimLeft("UNICODE_CI", "𝔸", "𝔸", ""); - assertStringTrimLeft("UTF8_BINARY", "𝔸", "A", "𝔸"); - assertStringTrimLeft("UTF8_LCASE", "𝔸", "A", "𝔸"); - assertStringTrimLeft("UNICODE", "𝔸", "A", "𝔸"); - assertStringTrimLeft("UNICODE_CI", "𝔸", "A", ""); - assertStringTrimLeft("UTF8_BINARY", "𝔸", "a", "𝔸"); - assertStringTrimLeft("UTF8_LCASE", "𝔸", "a", "𝔸"); - assertStringTrimLeft("UNICODE", "𝔸", "a", "𝔸"); - assertStringTrimLeft("UNICODE_CI", "𝔸", "a", ""); + assertStringTrimLeft(UTF8_BINARY, "a🙃b🙃c", "🙃", "a🙃b🙃c"); + assertStringTrimLeft(UTF8_LCASE, "a🙃b🙃c", "🙃", "a🙃b🙃c"); + assertStringTrimLeft(UNICODE, "a🙃b🙃c", "🙃", "a🙃b🙃c"); + assertStringTrimLeft(UNICODE_CI, "a🙃b🙃c", "🙃", "a🙃b🙃c"); + assertStringTrimLeft(UTF8_BINARY, "a🙃b🙃c", "a", "🙃b🙃c"); + assertStringTrimLeft(UTF8_LCASE, "a🙃b🙃c", "a", "🙃b🙃c"); + assertStringTrimLeft(UNICODE, "a🙃b🙃c", "a", "🙃b🙃c"); + assertStringTrimLeft(UNICODE_CI, "a🙃b🙃c", "a", "🙃b🙃c"); + assertStringTrimLeft(UTF8_BINARY, "a🙃b🙃c", "a🙃", "b🙃c"); + assertStringTrimLeft(UTF8_LCASE, "a🙃b🙃c", "a🙃", "b🙃c"); + assertStringTrimLeft(UNICODE, "a🙃b🙃c", "a🙃", "b🙃c"); + assertStringTrimLeft(UNICODE_CI, "a🙃b🙃c", "a🙃", "b🙃c"); + assertStringTrimLeft(UTF8_BINARY, "a🙃b🙃c", "a🙃b", "c"); + assertStringTrimLeft(UTF8_LCASE, "a🙃b🙃c", "a🙃b", "c"); + assertStringTrimLeft(UNICODE, "a🙃b🙃c", "a🙃b", "c"); + assertStringTrimLeft(UNICODE_CI, "a🙃b🙃c", "a🙃b", "c"); + assertStringTrimLeft(UTF8_BINARY, "a🙃b🙃c", "abc🙃", ""); + assertStringTrimLeft(UTF8_LCASE, "a🙃b🙃c", "abc🙃", ""); + assertStringTrimLeft(UNICODE, 
"a🙃b🙃c", "abc🙃", ""); + assertStringTrimLeft(UNICODE_CI, "a🙃b🙃c", "abc🙃", ""); + assertStringTrimLeft(UTF8_BINARY, "😀😆😃😄", "😆😃", "😀😆😃😄"); + assertStringTrimLeft(UTF8_LCASE, "😀😆😃😄", "😆😃", "😀😆😃😄"); + assertStringTrimLeft(UNICODE, "😀😆😃😄", "😆😃", "😀😆😃😄"); + assertStringTrimLeft(UNICODE_CI, "😀😆😃😄", "😆😃", "😀😆😃😄"); + assertStringTrimLeft(UTF8_BINARY, "😀😆😃😄", "😀😆", "😃😄"); + assertStringTrimLeft(UTF8_LCASE, "😀😆😃😄", "😀😆", "😃😄"); + assertStringTrimLeft(UNICODE, "😀😆😃😄", "😀😆", "😃😄"); + assertStringTrimLeft(UNICODE_CI, "😀😆😃😄", "😀😆", "😃😄"); + assertStringTrimLeft(UTF8_BINARY, "😀😆😃😄", "😀😆😃😄", ""); + assertStringTrimLeft(UTF8_LCASE, "😀😆😃😄", "😀😆😃😄", ""); + assertStringTrimLeft(UNICODE, "😀😆😃😄", "😀😆😃😄", ""); + assertStringTrimLeft(UNICODE_CI, "😀😆😃😄", "😀😆😃😄", ""); + assertStringTrimLeft(UTF8_BINARY, "𐐅", "𐐅", ""); + assertStringTrimLeft(UTF8_LCASE, "𐐅", "𐐅", ""); + assertStringTrimLeft(UNICODE, "𐐅", "𐐅", ""); + assertStringTrimLeft(UNICODE_CI, "𐐅", "𐐅", ""); + assertStringTrimLeft(UTF8_BINARY, "𐐅", "𐐭", "𐐅"); + assertStringTrimLeft(UTF8_LCASE, "𐐅", "𐐭", ""); + assertStringTrimLeft(UNICODE, "𐐅", "𐐭", "𐐅"); + assertStringTrimLeft(UNICODE_CI, "𐐅", "𐐭", ""); + assertStringTrimLeft(UTF8_BINARY, "𝔸", "𝔸", ""); + assertStringTrimLeft(UTF8_LCASE, "𝔸", "𝔸", ""); + assertStringTrimLeft(UNICODE, "𝔸", "𝔸", ""); + assertStringTrimLeft(UNICODE_CI, "𝔸", "𝔸", ""); + assertStringTrimLeft(UTF8_BINARY, "𝔸", "A", "𝔸"); + assertStringTrimLeft(UTF8_LCASE, "𝔸", "A", "𝔸"); + assertStringTrimLeft(UNICODE, "𝔸", "A", "𝔸"); + assertStringTrimLeft(UNICODE_CI, "𝔸", "A", ""); + assertStringTrimLeft(UTF8_BINARY, "𝔸", "a", "𝔸"); + assertStringTrimLeft(UTF8_LCASE, "𝔸", "a", "𝔸"); + assertStringTrimLeft(UNICODE, "𝔸", "a", "𝔸"); + assertStringTrimLeft(UNICODE_CI, "𝔸", "a", ""); } /** @@ -3378,274 +3379,274 @@ private void assertStringTrimRight(String collationName, String sourceString, St @Test public void testStringTrimRight() throws SparkException { // Basic tests. 
- assertStringTrimRight("UTF8_BINARY", "", "", ""); - assertStringTrimRight("UTF8_BINARY", "", "xyz", ""); - assertStringTrimRight("UTF8_BINARY", "asd", "", "asd"); - assertStringTrimRight("UTF8_BINARY", "asd", null, "asd"); - assertStringTrimRight("UTF8_BINARY", " asd ", null, " asd"); - assertStringTrimRight("UTF8_BINARY", " a世a ", null, " a世a"); - assertStringTrimRight("UTF8_BINARY", "asd", "x", "asd"); - assertStringTrimRight("UTF8_BINARY", "xxasdxx", "x", "xxasd"); - assertStringTrimRight("UTF8_BINARY", "xa世ax", "x", "xa世a"); - assertStringTrimRight("UTF8_LCASE", "", "", ""); - assertStringTrimRight("UTF8_LCASE", "", "xyz", ""); - assertStringTrimRight("UTF8_LCASE", "asd", "", "asd"); - assertStringTrimRight("UTF8_LCASE", "asd", null, "asd"); - assertStringTrimRight("UTF8_LCASE", " asd ", null, " asd"); - assertStringTrimRight("UTF8_LCASE", " a世a ", null, " a世a"); - assertStringTrimRight("UTF8_LCASE", "asd", "x", "asd"); - assertStringTrimRight("UTF8_LCASE", "xxasdxx", "x", "xxasd"); - assertStringTrimRight("UTF8_LCASE", "xa世ax", "x", "xa世a"); - assertStringTrimRight("UNICODE", "", "", ""); - assertStringTrimRight("UNICODE", "", "xyz", ""); - assertStringTrimRight("UNICODE", "asd", "", "asd"); - assertStringTrimRight("UNICODE", "asd", null, "asd"); - assertStringTrimRight("UNICODE", " asd ", null, " asd"); - assertStringTrimRight("UNICODE", " a世a ", null, " a世a"); - assertStringTrimRight("UNICODE", "asd", "x", "asd"); - assertStringTrimRight("UNICODE", "xxasdxx", "x", "xxasd"); - assertStringTrimRight("UNICODE", "xa世ax", "x", "xa世a"); - assertStringTrimRight("UNICODE_CI", "", "", ""); - assertStringTrimRight("UNICODE_CI", "", "xyz", ""); - assertStringTrimRight("UNICODE_CI", "asd", "", "asd"); - assertStringTrimRight("UNICODE_CI", "asd", null, "asd"); - assertStringTrimRight("UNICODE_CI", " asd ", null, " asd"); - assertStringTrimRight("UNICODE_CI", " a世a ", null, " a世a"); - assertStringTrimRight("UNICODE_CI", "asd", "x", "asd"); - 
assertStringTrimRight("UNICODE_CI", "xxasdxx", "x", "xxasd"); - assertStringTrimRight("UNICODE_CI", "xa世ax", "x", "xa世a"); + assertStringTrimRight(UTF8_BINARY, "", "", ""); + assertStringTrimRight(UTF8_BINARY, "", "xyz", ""); + assertStringTrimRight(UTF8_BINARY, "asd", "", "asd"); + assertStringTrimRight(UTF8_BINARY, "asd", null, "asd"); + assertStringTrimRight(UTF8_BINARY, " asd ", null, " asd"); + assertStringTrimRight(UTF8_BINARY, " a世a ", null, " a世a"); + assertStringTrimRight(UTF8_BINARY, "asd", "x", "asd"); + assertStringTrimRight(UTF8_BINARY, "xxasdxx", "x", "xxasd"); + assertStringTrimRight(UTF8_BINARY, "xa世ax", "x", "xa世a"); + assertStringTrimRight(UTF8_LCASE, "", "", ""); + assertStringTrimRight(UTF8_LCASE, "", "xyz", ""); + assertStringTrimRight(UTF8_LCASE, "asd", "", "asd"); + assertStringTrimRight(UTF8_LCASE, "asd", null, "asd"); + assertStringTrimRight(UTF8_LCASE, " asd ", null, " asd"); + assertStringTrimRight(UTF8_LCASE, " a世a ", null, " a世a"); + assertStringTrimRight(UTF8_LCASE, "asd", "x", "asd"); + assertStringTrimRight(UTF8_LCASE, "xxasdxx", "x", "xxasd"); + assertStringTrimRight(UTF8_LCASE, "xa世ax", "x", "xa世a"); + assertStringTrimRight(UNICODE, "", "", ""); + assertStringTrimRight(UNICODE, "", "xyz", ""); + assertStringTrimRight(UNICODE, "asd", "", "asd"); + assertStringTrimRight(UNICODE, "asd", null, "asd"); + assertStringTrimRight(UNICODE, " asd ", null, " asd"); + assertStringTrimRight(UNICODE, " a世a ", null, " a世a"); + assertStringTrimRight(UNICODE, "asd", "x", "asd"); + assertStringTrimRight(UNICODE, "xxasdxx", "x", "xxasd"); + assertStringTrimRight(UNICODE, "xa世ax", "x", "xa世a"); + assertStringTrimRight(UNICODE_CI, "", "", ""); + assertStringTrimRight(UNICODE_CI, "", "xyz", ""); + assertStringTrimRight(UNICODE_CI, "asd", "", "asd"); + assertStringTrimRight(UNICODE_CI, "asd", null, "asd"); + assertStringTrimRight(UNICODE_CI, " asd ", null, " asd"); + assertStringTrimRight(UNICODE_CI, " a世a ", null, " a世a"); + 
assertStringTrimRight(UNICODE_CI, "asd", "x", "asd"); + assertStringTrimRight(UNICODE_CI, "xxasdxx", "x", "xxasd"); + assertStringTrimRight(UNICODE_CI, "xa世ax", "x", "xa世a"); // Case variation. - assertStringTrimRight("UTF8_BINARY", "ddsXXXaa", "asd", "ddsXXX"); - assertStringTrimRight("UTF8_LCASE", "ddsXXXaa", "AsD", "ddsXXX"); - assertStringTrimRight("UNICODE", "ddsXXXaa", "asd", "ddsXXX"); - assertStringTrimRight("UNICODE_CI", "ddsXXXaa", "AsD", "ddsXXX"); + assertStringTrimRight(UTF8_BINARY, "ddsXXXaa", "asd", "ddsXXX"); + assertStringTrimRight(UTF8_LCASE, "ddsXXXaa", "AsD", "ddsXXX"); + assertStringTrimRight(UNICODE, "ddsXXXaa", "asd", "ddsXXX"); + assertStringTrimRight(UNICODE_CI, "ddsXXXaa", "AsD", "ddsXXX"); // One-to-many case mapping (e.g. Turkish dotted I).. - assertStringTrimRight("UTF8_BINARY", "ẞaaaẞ", "ß", "ẞaaaẞ"); - assertStringTrimRight("UTF8_BINARY", "ßaaaß", "ẞ", "ßaaaß"); - assertStringTrimRight("UTF8_BINARY", "Ëaaaẞ", "Ëẞ", "Ëaaa"); - assertStringTrimRight("UTF8_LCASE", "ẞaaaẞ", "ß", "ẞaaa"); - assertStringTrimRight("UTF8_LCASE", "ßaaaß", "ẞ", "ßaaa"); - assertStringTrimRight("UTF8_LCASE", "Ëaaaẞ", "Ëẞ", "Ëaaa"); - assertStringTrimRight("UNICODE", "ẞaaaẞ", "ß", "ẞaaaẞ"); - assertStringTrimRight("UNICODE", "ßaaaß", "ẞ", "ßaaaß"); - assertStringTrimRight("UNICODE", "Ëaaaẞ", "Ëẞ", "Ëaaa"); - assertStringTrimRight("UNICODE_CI", "ẞaaaẞ", "ß", "ẞaaa"); - assertStringTrimRight("UNICODE_CI", "ßaaaß", "ẞ", "ßaaa"); - assertStringTrimRight("UNICODE_CI", "Ëaaaẞ", "Ëẞ", "Ëaaa"); + assertStringTrimRight(UTF8_BINARY, "ẞaaaẞ", "ß", "ẞaaaẞ"); + assertStringTrimRight(UTF8_BINARY, "ßaaaß", "ẞ", "ßaaaß"); + assertStringTrimRight(UTF8_BINARY, "Ëaaaẞ", "Ëẞ", "Ëaaa"); + assertStringTrimRight(UTF8_LCASE, "ẞaaaẞ", "ß", "ẞaaa"); + assertStringTrimRight(UTF8_LCASE, "ßaaaß", "ẞ", "ßaaa"); + assertStringTrimRight(UTF8_LCASE, "Ëaaaẞ", "Ëẞ", "Ëaaa"); + assertStringTrimRight(UNICODE, "ẞaaaẞ", "ß", "ẞaaaẞ"); + assertStringTrimRight(UNICODE, "ßaaaß", "ẞ", "ßaaaß"); + 
assertStringTrimRight(UNICODE, "Ëaaaẞ", "Ëẞ", "Ëaaa"); + assertStringTrimRight(UNICODE_CI, "ẞaaaẞ", "ß", "ẞaaa"); + assertStringTrimRight(UNICODE_CI, "ßaaaß", "ẞ", "ßaaa"); + assertStringTrimRight(UNICODE_CI, "Ëaaaẞ", "Ëẞ", "Ëaaa"); // One-to-many case mapping (e.g. Turkish dotted I). - assertStringTrimRight("UTF8_BINARY", "i", "i", ""); - assertStringTrimRight("UTF8_BINARY", "iii", "I", "iii"); - assertStringTrimRight("UTF8_BINARY", "I", "iii", "I"); - assertStringTrimRight("UTF8_BINARY", "ixi", "i", "ix"); - assertStringTrimRight("UTF8_BINARY", "i", "İ", "i"); - assertStringTrimRight("UTF8_BINARY", "i\u0307", "İ", "i\u0307"); - assertStringTrimRight("UTF8_BINARY", "ii\u0307", "İi", "ii\u0307"); - assertStringTrimRight("UTF8_BINARY", "iii\u0307", "İi", "iii\u0307"); - assertStringTrimRight("UTF8_BINARY", "iiii\u0307", "iİ", "iiii\u0307"); - assertStringTrimRight("UTF8_BINARY", "ii\u0307ii\u0307", "iİ", "ii\u0307ii\u0307"); - assertStringTrimRight("UTF8_BINARY", "i\u0307", "i", "i\u0307"); - assertStringTrimRight("UTF8_BINARY", "i\u0307", "\u0307", "i"); - assertStringTrimRight("UTF8_BINARY", "i\u0307", "i\u0307", ""); - assertStringTrimRight("UTF8_BINARY", "i\u0307i\u0307", "i\u0307", ""); - assertStringTrimRight("UTF8_BINARY", "i\u0307\u0307", "i\u0307", ""); - assertStringTrimRight("UTF8_BINARY", "i\u0307i", "i\u0307", ""); - assertStringTrimRight("UTF8_BINARY", "i\u0307i", "İ", "i\u0307i"); - assertStringTrimRight("UTF8_BINARY", "i\u0307İ", "i\u0307", "i\u0307İ"); - assertStringTrimRight("UTF8_BINARY", "i\u0307İ", "İ", "i\u0307"); - assertStringTrimRight("UTF8_BINARY", "İ", "İ", ""); - assertStringTrimRight("UTF8_BINARY", "IXi", "İ", "IXi"); - assertStringTrimRight("UTF8_BINARY", "ix\u0307", "Ixİ", "ix\u0307"); - assertStringTrimRight("UTF8_BINARY", "i\u0307x", "IXİ", "i\u0307x"); - assertStringTrimRight("UTF8_BINARY", "i\u0307x", "ix\u0307İ", ""); - assertStringTrimRight("UTF8_BINARY", "İ", "i", "İ"); - assertStringTrimRight("UTF8_BINARY", "İ", "\u0307", "İ"); 
- assertStringTrimRight("UTF8_BINARY", "Ixİ", "i\u0307", "Ixİ"); - assertStringTrimRight("UTF8_BINARY", "IXİ", "ix\u0307", "IXİ"); - assertStringTrimRight("UTF8_BINARY", "xi\u0307", "\u0307IX", "xi"); - assertStringTrimRight("UTF8_LCASE", "i", "i", ""); - assertStringTrimRight("UTF8_LCASE", "iii", "I", ""); - assertStringTrimRight("UTF8_LCASE", "I", "iii", ""); - assertStringTrimRight("UTF8_LCASE", "ixi", "i", "ix"); - assertStringTrimRight("UTF8_LCASE", "i", "İ", "i"); - assertStringTrimRight("UTF8_LCASE", "i\u0307", "İ", ""); - assertStringTrimRight("UTF8_LCASE", "ii\u0307", "İi", ""); - assertStringTrimRight("UTF8_LCASE", "iii\u0307", "İi", ""); - assertStringTrimRight("UTF8_LCASE", "iiii\u0307", "iİ", ""); - assertStringTrimRight("UTF8_LCASE", "ii\u0307ii\u0307", "iİ", ""); - assertStringTrimRight("UTF8_LCASE", "i\u0307", "i", "i\u0307"); - assertStringTrimRight("UTF8_LCASE", "i\u0307", "\u0307", "i"); - assertStringTrimRight("UTF8_LCASE", "i\u0307", "i\u0307", ""); - assertStringTrimRight("UTF8_LCASE", "i\u0307i\u0307", "i\u0307", ""); - assertStringTrimRight("UTF8_LCASE", "i\u0307\u0307", "i\u0307", ""); - assertStringTrimRight("UTF8_LCASE", "i\u0307i", "i\u0307", ""); - assertStringTrimRight("UTF8_LCASE", "i\u0307i", "İ", "i\u0307i"); - assertStringTrimRight("UTF8_LCASE", "i\u0307İ", "i\u0307", "i\u0307İ"); - assertStringTrimRight("UTF8_LCASE", "i\u0307İ", "İ", ""); - assertStringTrimRight("UTF8_LCASE", "İ", "İ", ""); - assertStringTrimRight("UTF8_LCASE", "IXi", "İ", "IXi"); - assertStringTrimRight("UTF8_LCASE", "ix\u0307", "Ixİ", "ix\u0307"); - assertStringTrimRight("UTF8_LCASE", "i\u0307x", "IXİ", ""); - assertStringTrimRight("UTF8_LCASE", "i\u0307x", "I\u0307xİ", ""); - assertStringTrimRight("UTF8_LCASE", "İ", "i", "İ"); - assertStringTrimRight("UTF8_LCASE", "İ", "\u0307", "İ"); - assertStringTrimRight("UTF8_LCASE", "Ixİ", "i\u0307", "Ixİ"); - assertStringTrimRight("UTF8_LCASE", "IXİ", "ix\u0307", "IXİ"); - assertStringTrimRight("UTF8_LCASE", "xi\u0307", 
"\u0307IX", ""); - assertStringTrimRight("UNICODE", "i", "i", ""); - assertStringTrimRight("UNICODE", "iii", "I", "iii"); - assertStringTrimRight("UNICODE", "I", "iii", "I"); - assertStringTrimRight("UNICODE", "ixi", "i", "ix"); - assertStringTrimRight("UNICODE", "i", "İ", "i"); - assertStringTrimRight("UNICODE", "i\u0307", "İ", "i\u0307"); - assertStringTrimRight("UTF8_BINARY", "ii\u0307", "İi", "ii\u0307"); - assertStringTrimRight("UTF8_BINARY", "iii\u0307", "İi", "iii\u0307"); - assertStringTrimRight("UTF8_BINARY", "iiii\u0307", "iİ", "iiii\u0307"); - assertStringTrimRight("UTF8_BINARY", "ii\u0307ii\u0307", "iİ", "ii\u0307ii\u0307"); - assertStringTrimRight("UNICODE", "i\u0307", "i", "i\u0307"); - assertStringTrimRight("UNICODE", "i\u0307", "\u0307", "i\u0307"); - assertStringTrimRight("UNICODE", "i\u0307", "i\u0307", "i\u0307"); - assertStringTrimRight("UNICODE", "i\u0307i\u0307", "i\u0307", "i\u0307i\u0307"); - assertStringTrimRight("UNICODE", "i\u0307\u0307", "i\u0307", "i\u0307\u0307"); - assertStringTrimRight("UNICODE", "i\u0307i", "i\u0307", "i\u0307"); - assertStringTrimRight("UNICODE", "i\u0307i", "İ", "i\u0307i"); - assertStringTrimRight("UNICODE", "i\u0307İ", "i\u0307", "i\u0307İ"); - assertStringTrimRight("UNICODE", "i\u0307İ", "İ", "i\u0307"); - assertStringTrimRight("UNICODE", "İ", "İ", ""); - assertStringTrimRight("UNICODE", "IXi", "İ", "IXi"); - assertStringTrimRight("UNICODE", "ix\u0307", "Ixİ", "ix\u0307"); - assertStringTrimRight("UNICODE", "i\u0307x", "IXİ", "i\u0307x"); - assertStringTrimRight("UNICODE", "i\u0307x", "ix\u0307İ", "i\u0307"); - assertStringTrimRight("UNICODE", "İ", "i", "İ"); - assertStringTrimRight("UNICODE", "İ", "\u0307", "İ"); - assertStringTrimRight("UNICODE", "i\u0307", "i\u0307", "i\u0307"); - assertStringTrimRight("UNICODE", "Ixİ", "i\u0307", "Ixİ"); - assertStringTrimRight("UNICODE", "IXİ", "ix\u0307", "IXİ"); - assertStringTrimRight("UNICODE", "xi\u0307", "\u0307IX", "xi\u0307"); - assertStringTrimRight("UNICODE_CI", 
"i", "i", ""); - assertStringTrimRight("UNICODE_CI", "iii", "I", ""); - assertStringTrimRight("UNICODE_CI", "I", "iii", ""); - assertStringTrimRight("UNICODE_CI", "ixi", "i", "ix"); - assertStringTrimRight("UNICODE_CI", "i", "İ", "i"); - assertStringTrimRight("UNICODE_CI", "i\u0307", "İ", ""); - assertStringTrimRight("UNICODE_CI", "ii\u0307", "İi", ""); - assertStringTrimRight("UNICODE_CI", "iii\u0307", "İi", ""); - assertStringTrimRight("UNICODE_CI", "iiii\u0307", "iİ", ""); - assertStringTrimRight("UNICODE_CI", "ii\u0307ii\u0307", "iİ", ""); - assertStringTrimRight("UNICODE_CI", "i\u0307", "i", "i\u0307"); - assertStringTrimRight("UNICODE_CI", "i\u0307", "\u0307", "i\u0307"); - assertStringTrimRight("UNICODE_CI", "i\u0307", "i\u0307", "i\u0307"); - assertStringTrimRight("UNICODE_CI", "i\u0307i\u0307", "i\u0307", "i\u0307i\u0307"); - assertStringTrimRight("UNICODE_CI", "i\u0307\u0307", "i\u0307", "i\u0307\u0307"); - assertStringTrimRight("UNICODE_CI", "i\u0307i", "i\u0307", "i\u0307"); - assertStringTrimRight("UNICODE_CI", "i\u0307i", "İ", "i\u0307i"); - assertStringTrimRight("UNICODE_CI", "i\u0307İ", "i\u0307", "i\u0307İ"); - assertStringTrimRight("UNICODE_CI", "i\u0307İ", "İ", ""); - assertStringTrimRight("UNICODE_CI", "İ", "İ", ""); - assertStringTrimRight("UNICODE_CI", "IXi", "İ", "IXi"); - assertStringTrimRight("UNICODE_CI", "ix\u0307", "Ixİ", "ix\u0307"); - assertStringTrimRight("UNICODE_CI", "i\u0307x", "IXİ", ""); - assertStringTrimRight("UNICODE_CI", "i\u0307x", "I\u0307xİ", ""); - assertStringTrimRight("UNICODE_CI", "İ", "i", "İ"); - assertStringTrimRight("UNICODE_CI", "İ", "\u0307", "İ"); - assertStringTrimRight("UNICODE_CI", "i\u0307", "i\u0307", "i\u0307"); - assertStringTrimRight("UNICODE_CI", "Ixİ", "i\u0307", "Ixİ"); - assertStringTrimRight("UNICODE_CI", "IXİ", "ix\u0307", "IXİ"); - assertStringTrimRight("UNICODE_CI", "xi\u0307", "\u0307IX", "xi\u0307"); + assertStringTrimRight(UTF8_BINARY, "i", "i", ""); + assertStringTrimRight(UTF8_BINARY, "iii", 
"I", "iii"); + assertStringTrimRight(UTF8_BINARY, "I", "iii", "I"); + assertStringTrimRight(UTF8_BINARY, "ixi", "i", "ix"); + assertStringTrimRight(UTF8_BINARY, "i", "İ", "i"); + assertStringTrimRight(UTF8_BINARY, "i\u0307", "İ", "i\u0307"); + assertStringTrimRight(UTF8_BINARY, "ii\u0307", "İi", "ii\u0307"); + assertStringTrimRight(UTF8_BINARY, "iii\u0307", "İi", "iii\u0307"); + assertStringTrimRight(UTF8_BINARY, "iiii\u0307", "iİ", "iiii\u0307"); + assertStringTrimRight(UTF8_BINARY, "ii\u0307ii\u0307", "iİ", "ii\u0307ii\u0307"); + assertStringTrimRight(UTF8_BINARY, "i\u0307", "i", "i\u0307"); + assertStringTrimRight(UTF8_BINARY, "i\u0307", "\u0307", "i"); + assertStringTrimRight(UTF8_BINARY, "i\u0307", "i\u0307", ""); + assertStringTrimRight(UTF8_BINARY, "i\u0307i\u0307", "i\u0307", ""); + assertStringTrimRight(UTF8_BINARY, "i\u0307\u0307", "i\u0307", ""); + assertStringTrimRight(UTF8_BINARY, "i\u0307i", "i\u0307", ""); + assertStringTrimRight(UTF8_BINARY, "i\u0307i", "İ", "i\u0307i"); + assertStringTrimRight(UTF8_BINARY, "i\u0307İ", "i\u0307", "i\u0307İ"); + assertStringTrimRight(UTF8_BINARY, "i\u0307İ", "İ", "i\u0307"); + assertStringTrimRight(UTF8_BINARY, "İ", "İ", ""); + assertStringTrimRight(UTF8_BINARY, "IXi", "İ", "IXi"); + assertStringTrimRight(UTF8_BINARY, "ix\u0307", "Ixİ", "ix\u0307"); + assertStringTrimRight(UTF8_BINARY, "i\u0307x", "IXİ", "i\u0307x"); + assertStringTrimRight(UTF8_BINARY, "i\u0307x", "ix\u0307İ", ""); + assertStringTrimRight(UTF8_BINARY, "İ", "i", "İ"); + assertStringTrimRight(UTF8_BINARY, "İ", "\u0307", "İ"); + assertStringTrimRight(UTF8_BINARY, "Ixİ", "i\u0307", "Ixİ"); + assertStringTrimRight(UTF8_BINARY, "IXİ", "ix\u0307", "IXİ"); + assertStringTrimRight(UTF8_BINARY, "xi\u0307", "\u0307IX", "xi"); + assertStringTrimRight(UTF8_LCASE, "i", "i", ""); + assertStringTrimRight(UTF8_LCASE, "iii", "I", ""); + assertStringTrimRight(UTF8_LCASE, "I", "iii", ""); + assertStringTrimRight(UTF8_LCASE, "ixi", "i", "ix"); + 
assertStringTrimRight(UTF8_LCASE, "i", "İ", "i"); + assertStringTrimRight(UTF8_LCASE, "i\u0307", "İ", ""); + assertStringTrimRight(UTF8_LCASE, "ii\u0307", "İi", ""); + assertStringTrimRight(UTF8_LCASE, "iii\u0307", "İi", ""); + assertStringTrimRight(UTF8_LCASE, "iiii\u0307", "iİ", ""); + assertStringTrimRight(UTF8_LCASE, "ii\u0307ii\u0307", "iİ", ""); + assertStringTrimRight(UTF8_LCASE, "i\u0307", "i", "i\u0307"); + assertStringTrimRight(UTF8_LCASE, "i\u0307", "\u0307", "i"); + assertStringTrimRight(UTF8_LCASE, "i\u0307", "i\u0307", ""); + assertStringTrimRight(UTF8_LCASE, "i\u0307i\u0307", "i\u0307", ""); + assertStringTrimRight(UTF8_LCASE, "i\u0307\u0307", "i\u0307", ""); + assertStringTrimRight(UTF8_LCASE, "i\u0307i", "i\u0307", ""); + assertStringTrimRight(UTF8_LCASE, "i\u0307i", "İ", "i\u0307i"); + assertStringTrimRight(UTF8_LCASE, "i\u0307İ", "i\u0307", "i\u0307İ"); + assertStringTrimRight(UTF8_LCASE, "i\u0307İ", "İ", ""); + assertStringTrimRight(UTF8_LCASE, "İ", "İ", ""); + assertStringTrimRight(UTF8_LCASE, "IXi", "İ", "IXi"); + assertStringTrimRight(UTF8_LCASE, "ix\u0307", "Ixİ", "ix\u0307"); + assertStringTrimRight(UTF8_LCASE, "i\u0307x", "IXİ", ""); + assertStringTrimRight(UTF8_LCASE, "i\u0307x", "I\u0307xİ", ""); + assertStringTrimRight(UTF8_LCASE, "İ", "i", "İ"); + assertStringTrimRight(UTF8_LCASE, "İ", "\u0307", "İ"); + assertStringTrimRight(UTF8_LCASE, "Ixİ", "i\u0307", "Ixİ"); + assertStringTrimRight(UTF8_LCASE, "IXİ", "ix\u0307", "IXİ"); + assertStringTrimRight(UTF8_LCASE, "xi\u0307", "\u0307IX", ""); + assertStringTrimRight(UNICODE, "i", "i", ""); + assertStringTrimRight(UNICODE, "iii", "I", "iii"); + assertStringTrimRight(UNICODE, "I", "iii", "I"); + assertStringTrimRight(UNICODE, "ixi", "i", "ix"); + assertStringTrimRight(UNICODE, "i", "İ", "i"); + assertStringTrimRight(UNICODE, "i\u0307", "İ", "i\u0307"); + assertStringTrimRight(UNICODE, "ii\u0307", "İi", "ii\u0307"); + assertStringTrimRight(UNICODE, "iii\u0307", "İi", "iii\u0307"); + 
assertStringTrimRight(UNICODE, "iiii\u0307", "iİ", "iiii\u0307"); + assertStringTrimRight(UNICODE, "ii\u0307ii\u0307", "iİ", "ii\u0307ii\u0307"); + assertStringTrimRight(UNICODE, "i\u0307", "i", "i\u0307"); + assertStringTrimRight(UNICODE, "i\u0307", "\u0307", "i\u0307"); + assertStringTrimRight(UNICODE, "i\u0307", "i\u0307", "i\u0307"); + assertStringTrimRight(UNICODE, "i\u0307i\u0307", "i\u0307", "i\u0307i\u0307"); + assertStringTrimRight(UNICODE, "i\u0307\u0307", "i\u0307", "i\u0307\u0307"); + assertStringTrimRight(UNICODE, "i\u0307i", "i\u0307", "i\u0307"); + assertStringTrimRight(UNICODE, "i\u0307i", "İ", "i\u0307i"); + assertStringTrimRight(UNICODE, "i\u0307İ", "i\u0307", "i\u0307İ"); + assertStringTrimRight(UNICODE, "i\u0307İ", "İ", "i\u0307"); + assertStringTrimRight(UNICODE, "İ", "İ", ""); + assertStringTrimRight(UNICODE, "IXi", "İ", "IXi"); + assertStringTrimRight(UNICODE, "ix\u0307", "Ixİ", "ix\u0307"); + assertStringTrimRight(UNICODE, "i\u0307x", "IXİ", "i\u0307x"); + assertStringTrimRight(UNICODE, "i\u0307x", "ix\u0307İ", "i\u0307"); + assertStringTrimRight(UNICODE, "İ", "i", "İ"); + assertStringTrimRight(UNICODE, "İ", "\u0307", "İ"); + assertStringTrimRight(UNICODE, "i\u0307", "i\u0307", "i\u0307"); + assertStringTrimRight(UNICODE, "Ixİ", "i\u0307", "Ixİ"); + assertStringTrimRight(UNICODE, "IXİ", "ix\u0307", "IXİ"); + assertStringTrimRight(UNICODE, "xi\u0307", "\u0307IX", "xi\u0307"); + assertStringTrimRight(UNICODE_CI, "i", "i", ""); + assertStringTrimRight(UNICODE_CI, "iii", "I", ""); + assertStringTrimRight(UNICODE_CI, "I", "iii", ""); + assertStringTrimRight(UNICODE_CI, "ixi", "i", "ix"); + assertStringTrimRight(UNICODE_CI, "i", "İ", "i"); + assertStringTrimRight(UNICODE_CI, "i\u0307", "İ", ""); + assertStringTrimRight(UNICODE_CI, "ii\u0307", "İi", ""); + assertStringTrimRight(UNICODE_CI, "iii\u0307", "İi", ""); + assertStringTrimRight(UNICODE_CI, "iiii\u0307", "iİ", ""); + assertStringTrimRight(UNICODE_CI, "ii\u0307ii\u0307", "iİ", ""); + 
assertStringTrimRight(UNICODE_CI, "i\u0307", "i", "i\u0307"); + assertStringTrimRight(UNICODE_CI, "i\u0307", "\u0307", "i\u0307"); + assertStringTrimRight(UNICODE_CI, "i\u0307", "i\u0307", "i\u0307"); + assertStringTrimRight(UNICODE_CI, "i\u0307i\u0307", "i\u0307", "i\u0307i\u0307"); + assertStringTrimRight(UNICODE_CI, "i\u0307\u0307", "i\u0307", "i\u0307\u0307"); + assertStringTrimRight(UNICODE_CI, "i\u0307i", "i\u0307", "i\u0307"); + assertStringTrimRight(UNICODE_CI, "i\u0307i", "İ", "i\u0307i"); + assertStringTrimRight(UNICODE_CI, "i\u0307İ", "i\u0307", "i\u0307İ"); + assertStringTrimRight(UNICODE_CI, "i\u0307İ", "İ", ""); + assertStringTrimRight(UNICODE_CI, "İ", "İ", ""); + assertStringTrimRight(UNICODE_CI, "IXi", "İ", "IXi"); + assertStringTrimRight(UNICODE_CI, "ix\u0307", "Ixİ", "ix\u0307"); + assertStringTrimRight(UNICODE_CI, "i\u0307x", "IXİ", ""); + assertStringTrimRight(UNICODE_CI, "i\u0307x", "I\u0307xİ", ""); + assertStringTrimRight(UNICODE_CI, "İ", "i", "İ"); + assertStringTrimRight(UNICODE_CI, "İ", "\u0307", "İ"); + assertStringTrimRight(UNICODE_CI, "i\u0307", "i\u0307", "i\u0307"); + assertStringTrimRight(UNICODE_CI, "Ixİ", "i\u0307", "Ixİ"); + assertStringTrimRight(UNICODE_CI, "IXİ", "ix\u0307", "IXİ"); + assertStringTrimRight(UNICODE_CI, "xi\u0307", "\u0307IX", "xi\u0307"); // Conditional case mapping (e.g. Greek sigmas). 
- assertStringTrimRight("UTF8_BINARY", "ςxς", "σ", "ςxς"); - assertStringTrimRight("UTF8_BINARY", "ςxς", "ς", "ςx"); - assertStringTrimRight("UTF8_BINARY", "ςxς", "Σ", "ςxς"); - assertStringTrimRight("UTF8_BINARY", "σxσ", "σ", "σx"); - assertStringTrimRight("UTF8_BINARY", "σxσ", "ς", "σxσ"); - assertStringTrimRight("UTF8_BINARY", "σxσ", "Σ", "σxσ"); - assertStringTrimRight("UTF8_BINARY", "ΣxΣ", "σ", "ΣxΣ"); - assertStringTrimRight("UTF8_BINARY", "ΣxΣ", "ς", "ΣxΣ"); - assertStringTrimRight("UTF8_BINARY", "ΣxΣ", "Σ", "Σx"); - assertStringTrimRight("UTF8_LCASE", "ςxς", "σ", "ςx"); - assertStringTrimRight("UTF8_LCASE", "ςxς", "ς", "ςx"); - assertStringTrimRight("UTF8_LCASE", "ςxς", "Σ", "ςx"); - assertStringTrimRight("UTF8_LCASE", "σxσ", "σ", "σx"); - assertStringTrimRight("UTF8_LCASE", "σxσ", "ς", "σx"); - assertStringTrimRight("UTF8_LCASE", "σxσ", "Σ", "σx"); - assertStringTrimRight("UTF8_LCASE", "ΣxΣ", "σ", "Σx"); - assertStringTrimRight("UTF8_LCASE", "ΣxΣ", "ς", "Σx"); - assertStringTrimRight("UTF8_LCASE", "ΣxΣ", "Σ", "Σx"); - assertStringTrimRight("UNICODE", "ςxς", "σ", "ςxς"); - assertStringTrimRight("UNICODE", "ςxς", "ς", "ςx"); - assertStringTrimRight("UNICODE", "ςxς", "Σ", "ςxς"); - assertStringTrimRight("UNICODE", "σxσ", "σ", "σx"); - assertStringTrimRight("UNICODE", "σxσ", "ς", "σxσ"); - assertStringTrimRight("UNICODE", "σxσ", "Σ", "σxσ"); - assertStringTrimRight("UNICODE", "ΣxΣ", "σ", "ΣxΣ"); - assertStringTrimRight("UNICODE", "ΣxΣ", "ς", "ΣxΣ"); - assertStringTrimRight("UNICODE", "ΣxΣ", "Σ", "Σx"); - assertStringTrimRight("UNICODE_CI", "ςxς", "σ", "ςx"); - assertStringTrimRight("UNICODE_CI", "ςxς", "ς", "ςx"); - assertStringTrimRight("UNICODE_CI", "ςxς", "Σ", "ςx"); - assertStringTrimRight("UNICODE_CI", "σxσ", "σ", "σx"); - assertStringTrimRight("UNICODE_CI", "σxσ", "ς", "σx"); - assertStringTrimRight("UNICODE_CI", "σxσ", "Σ", "σx"); - assertStringTrimRight("UNICODE_CI", "ΣxΣ", "σ", "Σx"); - assertStringTrimRight("UNICODE_CI", "ΣxΣ", "ς", "Σx"); - 
assertStringTrimRight("UNICODE_CI", "ΣxΣ", "Σ", "Σx"); + assertStringTrimRight(UTF8_BINARY, "ςxς", "σ", "ςxς"); + assertStringTrimRight(UTF8_BINARY, "ςxς", "ς", "ςx"); + assertStringTrimRight(UTF8_BINARY, "ςxς", "Σ", "ςxς"); + assertStringTrimRight(UTF8_BINARY, "σxσ", "σ", "σx"); + assertStringTrimRight(UTF8_BINARY, "σxσ", "ς", "σxσ"); + assertStringTrimRight(UTF8_BINARY, "σxσ", "Σ", "σxσ"); + assertStringTrimRight(UTF8_BINARY, "ΣxΣ", "σ", "ΣxΣ"); + assertStringTrimRight(UTF8_BINARY, "ΣxΣ", "ς", "ΣxΣ"); + assertStringTrimRight(UTF8_BINARY, "ΣxΣ", "Σ", "Σx"); + assertStringTrimRight(UTF8_LCASE, "ςxς", "σ", "ςx"); + assertStringTrimRight(UTF8_LCASE, "ςxς", "ς", "ςx"); + assertStringTrimRight(UTF8_LCASE, "ςxς", "Σ", "ςx"); + assertStringTrimRight(UTF8_LCASE, "σxσ", "σ", "σx"); + assertStringTrimRight(UTF8_LCASE, "σxσ", "ς", "σx"); + assertStringTrimRight(UTF8_LCASE, "σxσ", "Σ", "σx"); + assertStringTrimRight(UTF8_LCASE, "ΣxΣ", "σ", "Σx"); + assertStringTrimRight(UTF8_LCASE, "ΣxΣ", "ς", "Σx"); + assertStringTrimRight(UTF8_LCASE, "ΣxΣ", "Σ", "Σx"); + assertStringTrimRight(UNICODE, "ςxς", "σ", "ςxς"); + assertStringTrimRight(UNICODE, "ςxς", "ς", "ςx"); + assertStringTrimRight(UNICODE, "ςxς", "Σ", "ςxς"); + assertStringTrimRight(UNICODE, "σxσ", "σ", "σx"); + assertStringTrimRight(UNICODE, "σxσ", "ς", "σxσ"); + assertStringTrimRight(UNICODE, "σxσ", "Σ", "σxσ"); + assertStringTrimRight(UNICODE, "ΣxΣ", "σ", "ΣxΣ"); + assertStringTrimRight(UNICODE, "ΣxΣ", "ς", "ΣxΣ"); + assertStringTrimRight(UNICODE, "ΣxΣ", "Σ", "Σx"); + assertStringTrimRight(UNICODE_CI, "ςxς", "σ", "ςx"); + assertStringTrimRight(UNICODE_CI, "ςxς", "ς", "ςx"); + assertStringTrimRight(UNICODE_CI, "ςxς", "Σ", "ςx"); + assertStringTrimRight(UNICODE_CI, "σxσ", "σ", "σx"); + assertStringTrimRight(UNICODE_CI, "σxσ", "ς", "σx"); + assertStringTrimRight(UNICODE_CI, "σxσ", "Σ", "σx"); + assertStringTrimRight(UNICODE_CI, "ΣxΣ", "σ", "Σx"); + assertStringTrimRight(UNICODE_CI, "ΣxΣ", "ς", "Σx"); + 
assertStringTrimRight(UNICODE_CI, "ΣxΣ", "Σ", "Σx"); // Unicode normalization. - assertStringTrimRight("UTF8_BINARY", "åβγδa\u030A", "å", "åβγδa\u030A"); - assertStringTrimRight("UTF8_LCASE", "åβγδa\u030A", "Å", "åβγδa\u030A"); - assertStringTrimRight("UNICODE", "åβγδa\u030A", "å", "åβγδ"); - assertStringTrimRight("UNICODE_CI", "åβγδa\u030A", "Å", "åβγδ"); + assertStringTrimRight(UTF8_BINARY, "åβγδa\u030A", "å", "åβγδa\u030A"); + assertStringTrimRight(UTF8_LCASE, "åβγδa\u030A", "Å", "åβγδa\u030A"); + assertStringTrimRight(UNICODE, "åβγδa\u030A", "å", "åβγδ"); + assertStringTrimRight(UNICODE_CI, "åβγδa\u030A", "Å", "åβγδ"); // Surrogate pairs. - assertStringTrimRight("UTF8_BINARY", "a🙃b🙃c", "🙃", "a🙃b🙃c"); - assertStringTrimRight("UTF8_LCASE", "a🙃b🙃c", "🙃", "a🙃b🙃c"); - assertStringTrimRight("UNICODE", "a🙃b🙃c", "🙃", "a🙃b🙃c"); - assertStringTrimRight("UNICODE_CI", "a🙃b🙃c", "🙃", "a🙃b🙃c"); - assertStringTrimRight("UTF8_BINARY", "a🙃b🙃c", "c", "a🙃b🙃"); - assertStringTrimRight("UTF8_LCASE", "a🙃b🙃c", "c", "a🙃b🙃"); - assertStringTrimRight("UNICODE", "a🙃b🙃c", "c", "a🙃b🙃"); - assertStringTrimRight("UNICODE_CI", "a🙃b🙃c", "c", "a🙃b🙃"); - assertStringTrimRight("UTF8_BINARY", "a🙃b🙃c", "c🙃", "a🙃b"); - assertStringTrimRight("UTF8_LCASE", "a🙃b🙃c", "c🙃", "a🙃b"); - assertStringTrimRight("UNICODE", "a🙃b🙃c", "c🙃", "a🙃b"); - assertStringTrimRight("UNICODE_CI", "a🙃b🙃c", "c🙃", "a🙃b"); - assertStringTrimRight("UTF8_BINARY", "a🙃b🙃c", "c🙃b", "a"); - assertStringTrimRight("UTF8_LCASE", "a🙃b🙃c", "c🙃b", "a"); - assertStringTrimRight("UNICODE", "a🙃b🙃c", "c🙃b", "a"); - assertStringTrimRight("UNICODE_CI", "a🙃b🙃c", "c🙃b", "a"); - assertStringTrimRight("UTF8_BINARY", "a🙃b🙃c", "abc🙃", ""); - assertStringTrimRight("UTF8_LCASE", "a🙃b🙃c", "abc🙃", ""); - assertStringTrimRight("UNICODE", "a🙃b🙃c", "abc🙃", ""); - assertStringTrimRight("UNICODE_CI", "a🙃b🙃c", "abc🙃", ""); - assertStringTrimRight("UTF8_BINARY", "😀😆😃😄", "😆😃", "😀😆😃😄"); - assertStringTrimRight("UTF8_LCASE", "😀😆😃😄", "😆😃", "😀😆😃😄"); - 
assertStringTrimRight("UNICODE", "😀😆😃😄", "😆😃", "😀😆😃😄"); - assertStringTrimRight("UNICODE_CI", "😀😆😃😄", "😆😃", "😀😆😃😄"); - assertStringTrimRight("UTF8_BINARY", "😀😆😃😄", "😃😄", "😀😆"); - assertStringTrimRight("UTF8_LCASE", "😀😆😃😄", "😃😄", "😀😆"); - assertStringTrimRight("UNICODE", "😀😆😃😄", "😃😄", "😀😆"); - assertStringTrimRight("UNICODE_CI", "😀😆😃😄", "😃😄", "😀😆"); - assertStringTrimRight("UTF8_BINARY", "😀😆😃😄", "😀😆😃😄", ""); - assertStringTrimRight("UTF8_LCASE", "😀😆😃😄", "😀😆😃😄", ""); - assertStringTrimRight("UNICODE", "😀😆😃😄", "😀😆😃😄", ""); - assertStringTrimRight("UNICODE_CI", "😀😆😃😄", "😀😆😃😄", ""); - assertStringTrimRight("UTF8_BINARY", "𐐅", "𐐅", ""); - assertStringTrimRight("UTF8_LCASE", "𐐅", "𐐅", ""); - assertStringTrimRight("UNICODE", "𐐅", "𐐅", ""); - assertStringTrimRight("UNICODE_CI", "𐐅", "𐐅", ""); - assertStringTrimRight("UTF8_BINARY", "𐐅", "𐐭", "𐐅"); - assertStringTrimRight("UTF8_LCASE", "𐐅", "𐐭", ""); - assertStringTrimRight("UNICODE", "𐐅", "𐐭", "𐐅"); - assertStringTrimRight("UNICODE_CI", "𐐅", "𐐭", ""); - assertStringTrimRight("UTF8_BINARY", "𝔸", "𝔸", ""); - assertStringTrimRight("UTF8_LCASE", "𝔸", "𝔸", ""); - assertStringTrimRight("UNICODE", "𝔸", "𝔸", ""); - assertStringTrimRight("UNICODE_CI", "𝔸", "𝔸", ""); - assertStringTrimRight("UTF8_BINARY", "𝔸", "A", "𝔸"); - assertStringTrimRight("UTF8_LCASE", "𝔸", "A", "𝔸"); - assertStringTrimRight("UNICODE", "𝔸", "A", "𝔸"); - assertStringTrimRight("UNICODE_CI", "𝔸", "A", ""); - assertStringTrimRight("UTF8_BINARY", "𝔸", "a", "𝔸"); - assertStringTrimRight("UTF8_LCASE", "𝔸", "a", "𝔸"); - assertStringTrimRight("UNICODE", "𝔸", "a", "𝔸"); - assertStringTrimRight("UNICODE_CI", "𝔸", "a", ""); + assertStringTrimRight(UTF8_BINARY, "a🙃b🙃c", "🙃", "a🙃b🙃c"); + assertStringTrimRight(UTF8_LCASE, "a🙃b🙃c", "🙃", "a🙃b🙃c"); + assertStringTrimRight(UNICODE, "a🙃b🙃c", "🙃", "a🙃b🙃c"); + assertStringTrimRight(UNICODE_CI, "a🙃b🙃c", "🙃", "a🙃b🙃c"); + assertStringTrimRight(UTF8_BINARY, "a🙃b🙃c", "c", "a🙃b🙃"); + assertStringTrimRight(UTF8_LCASE, "a🙃b🙃c", "c", "a🙃b🙃"); 
+ assertStringTrimRight(UNICODE, "a🙃b🙃c", "c", "a🙃b🙃"); + assertStringTrimRight(UNICODE_CI, "a🙃b🙃c", "c", "a🙃b🙃"); + assertStringTrimRight(UTF8_BINARY, "a🙃b🙃c", "c🙃", "a🙃b"); + assertStringTrimRight(UTF8_LCASE, "a🙃b🙃c", "c🙃", "a🙃b"); + assertStringTrimRight(UNICODE, "a🙃b🙃c", "c🙃", "a🙃b"); + assertStringTrimRight(UNICODE_CI, "a🙃b🙃c", "c🙃", "a🙃b"); + assertStringTrimRight(UTF8_BINARY, "a🙃b🙃c", "c🙃b", "a"); + assertStringTrimRight(UTF8_LCASE, "a🙃b🙃c", "c🙃b", "a"); + assertStringTrimRight(UNICODE, "a🙃b🙃c", "c🙃b", "a"); + assertStringTrimRight(UNICODE_CI, "a🙃b🙃c", "c🙃b", "a"); + assertStringTrimRight(UTF8_BINARY, "a🙃b🙃c", "abc🙃", ""); + assertStringTrimRight(UTF8_LCASE, "a🙃b🙃c", "abc🙃", ""); + assertStringTrimRight(UNICODE, "a🙃b🙃c", "abc🙃", ""); + assertStringTrimRight(UNICODE_CI, "a🙃b🙃c", "abc🙃", ""); + assertStringTrimRight(UTF8_BINARY, "😀😆😃😄", "😆😃", "😀😆😃😄"); + assertStringTrimRight(UTF8_LCASE, "😀😆😃😄", "😆😃", "😀😆😃😄"); + assertStringTrimRight(UNICODE, "😀😆😃😄", "😆😃", "😀😆😃😄"); + assertStringTrimRight(UNICODE_CI, "😀😆😃😄", "😆😃", "😀😆😃😄"); + assertStringTrimRight(UTF8_BINARY, "😀😆😃😄", "😃😄", "😀😆"); + assertStringTrimRight(UTF8_LCASE, "😀😆😃😄", "😃😄", "😀😆"); + assertStringTrimRight(UNICODE, "😀😆😃😄", "😃😄", "😀😆"); + assertStringTrimRight(UNICODE_CI, "😀😆😃😄", "😃😄", "😀😆"); + assertStringTrimRight(UTF8_BINARY, "😀😆😃😄", "😀😆😃😄", ""); + assertStringTrimRight(UTF8_LCASE, "😀😆😃😄", "😀😆😃😄", ""); + assertStringTrimRight(UNICODE, "😀😆😃😄", "😀😆😃😄", ""); + assertStringTrimRight(UNICODE_CI, "😀😆😃😄", "😀😆😃😄", ""); + assertStringTrimRight(UTF8_BINARY, "𐐅", "𐐅", ""); + assertStringTrimRight(UTF8_LCASE, "𐐅", "𐐅", ""); + assertStringTrimRight(UNICODE, "𐐅", "𐐅", ""); + assertStringTrimRight(UNICODE_CI, "𐐅", "𐐅", ""); + assertStringTrimRight(UTF8_BINARY, "𐐅", "𐐭", "𐐅"); + assertStringTrimRight(UTF8_LCASE, "𐐅", "𐐭", ""); + assertStringTrimRight(UNICODE, "𐐅", "𐐭", "𐐅"); + assertStringTrimRight(UNICODE_CI, "𐐅", "𐐭", ""); + assertStringTrimRight(UTF8_BINARY, "𝔸", "𝔸", ""); + assertStringTrimRight(UTF8_LCASE, "𝔸", "𝔸", 
""); + assertStringTrimRight(UNICODE, "𝔸", "𝔸", ""); + assertStringTrimRight(UNICODE_CI, "𝔸", "𝔸", ""); + assertStringTrimRight(UTF8_BINARY, "𝔸", "A", "𝔸"); + assertStringTrimRight(UTF8_LCASE, "𝔸", "A", "𝔸"); + assertStringTrimRight(UNICODE, "𝔸", "A", "𝔸"); + assertStringTrimRight(UNICODE_CI, "𝔸", "A", ""); + assertStringTrimRight(UTF8_BINARY, "𝔸", "a", "𝔸"); + assertStringTrimRight(UTF8_LCASE, "𝔸", "a", "𝔸"); + assertStringTrimRight(UNICODE, "𝔸", "a", "𝔸"); + assertStringTrimRight(UNICODE_CI, "𝔸", "a", ""); } /** @@ -3664,211 +3665,211 @@ private void assertStringTranslate(String inputString, String matchingString, @Test public void testStringTranslate() throws SparkException { // Empty strings. - assertStringTranslate("", "", "", "UTF8_BINARY", ""); - assertStringTranslate("", "", "", "UTF8_LCASE", ""); - assertStringTranslate("", "", "", "UNICODE", ""); - assertStringTranslate("", "", "", "UNICODE_CI", ""); - assertStringTranslate("abc", "", "", "UTF8_BINARY", "abc"); - assertStringTranslate("abc", "", "", "UTF8_LCASE", "abc"); - assertStringTranslate("abc", "", "", "UNICODE", "abc"); - assertStringTranslate("abc", "", "", "UNICODE_CI", "abc"); - assertStringTranslate("", "b", "", "UTF8_BINARY", ""); - assertStringTranslate("", "b", "", "UTF8_LCASE", ""); - assertStringTranslate("", "b", "", "UNICODE", ""); - assertStringTranslate("", "b", "", "UNICODE_CI", ""); - assertStringTranslate("", "", "x", "UTF8_BINARY", ""); - assertStringTranslate("", "", "x", "UTF8_LCASE", ""); - assertStringTranslate("", "", "x", "UNICODE", ""); - assertStringTranslate("", "", "x", "UNICODE_CI", ""); - assertStringTranslate("abc", "b", "", "UTF8_BINARY", "ac"); - assertStringTranslate("abc", "b", "", "UTF8_LCASE", "ac"); - assertStringTranslate("abc", "b", "", "UNICODE", "ac"); - assertStringTranslate("abc", "b", "", "UNICODE_CI", "ac"); - assertStringTranslate("abc", "", "x", "UTF8_BINARY", "abc"); - assertStringTranslate("abc", "", "x", "UTF8_LCASE", "abc"); - 
assertStringTranslate("abc", "", "x", "UNICODE", "abc"); - assertStringTranslate("abc", "", "x", "UNICODE_CI", "abc"); - assertStringTranslate("", "b", "x", "UTF8_BINARY", ""); - assertStringTranslate("", "b", "x", "UTF8_LCASE", ""); - assertStringTranslate("", "b", "x", "UNICODE", ""); - assertStringTranslate("", "b", "x", "UNICODE_CI", ""); + assertStringTranslate("", "", "", UTF8_BINARY, ""); + assertStringTranslate("", "", "", UTF8_LCASE, ""); + assertStringTranslate("", "", "", UNICODE, ""); + assertStringTranslate("", "", "", UNICODE_CI, ""); + assertStringTranslate("abc", "", "", UTF8_BINARY, "abc"); + assertStringTranslate("abc", "", "", UTF8_LCASE, "abc"); + assertStringTranslate("abc", "", "", UNICODE, "abc"); + assertStringTranslate("abc", "", "", UNICODE_CI, "abc"); + assertStringTranslate("", "b", "", UTF8_BINARY, ""); + assertStringTranslate("", "b", "", UTF8_LCASE, ""); + assertStringTranslate("", "b", "", UNICODE, ""); + assertStringTranslate("", "b", "", UNICODE_CI, ""); + assertStringTranslate("", "", "x", UTF8_BINARY, ""); + assertStringTranslate("", "", "x", UTF8_LCASE, ""); + assertStringTranslate("", "", "x", UNICODE, ""); + assertStringTranslate("", "", "x", UNICODE_CI, ""); + assertStringTranslate("abc", "b", "", UTF8_BINARY, "ac"); + assertStringTranslate("abc", "b", "", UTF8_LCASE, "ac"); + assertStringTranslate("abc", "b", "", UNICODE, "ac"); + assertStringTranslate("abc", "b", "", UNICODE_CI, "ac"); + assertStringTranslate("abc", "", "x", UTF8_BINARY, "abc"); + assertStringTranslate("abc", "", "x", UTF8_LCASE, "abc"); + assertStringTranslate("abc", "", "x", UNICODE, "abc"); + assertStringTranslate("abc", "", "x", UNICODE_CI, "abc"); + assertStringTranslate("", "b", "x", UTF8_BINARY, ""); + assertStringTranslate("", "b", "x", UTF8_LCASE, ""); + assertStringTranslate("", "b", "x", UNICODE, ""); + assertStringTranslate("", "b", "x", UNICODE_CI, ""); // Basic tests. 
- assertStringTranslate("abc", "b", "x", "UTF8_BINARY", "axc"); - assertStringTranslate("abc", "b", "x", "UTF8_LCASE", "axc"); - assertStringTranslate("abc", "b", "x", "UNICODE", "axc"); - assertStringTranslate("abc", "b", "x", "UNICODE_CI", "axc"); - assertStringTranslate("Translate", "Rnlt", "12", "UTF8_BINARY", "Tra2sae"); - assertStringTranslate("Translate", "Rnlt", "12", "UTF8_LCASE", "1a2sae"); - assertStringTranslate("Translate", "Rnlt", "12", "UNICODE", "Tra2sae"); - assertStringTranslate("Translate", "Rnlt", "12", "UNICODE_CI", "1a2sae"); - assertStringTranslate("Translate", "Rn", "1234", "UTF8_BINARY", "Tra2slate"); - assertStringTranslate("Translate", "Rn", "1234", "UTF8_LCASE", "T1a2slate"); - assertStringTranslate("Translate", "Rn", "1234", "UNICODE", "Tra2slate"); - assertStringTranslate("Translate", "Rn", "1234", "UNICODE_CI", "T1a2slate"); - assertStringTranslate("Translate", "Rnlt", "1234", "UTF8_BINARY", "Tra2s3a4e"); - assertStringTranslate("Translate", "Rnlt", "1234", "UTF8_LCASE", "41a2s3a4e"); - assertStringTranslate("Translate", "Rnlt", "1234", "UNICODE", "Tra2s3a4e"); - assertStringTranslate("Translate", "Rnlt", "1234", "UNICODE_CI", "41a2s3a4e"); - assertStringTranslate("TRanslate", "rnlt", "XxXx", "UTF8_BINARY", "TRaxsXaxe"); - assertStringTranslate("TRanslate", "rnlt", "XxXx", "UTF8_LCASE", "xXaxsXaxe"); - assertStringTranslate("TRanslate", "rnlt", "XxXx", "UNICODE", "TRaxsXaxe"); - assertStringTranslate("TRanslate", "rnlt", "XxXx", "UNICODE_CI", "xXaxsXaxe"); - assertStringTranslate("TRanslater", "Rrnlt", "xXxXx", "UTF8_BINARY", "TxaxsXaxeX"); - assertStringTranslate("TRanslater", "Rrnlt", "xXxXx", "UTF8_LCASE", "xxaxsXaxex"); - assertStringTranslate("TRanslater", "Rrnlt", "xXxXx", "UNICODE", "TxaxsXaxeX"); - assertStringTranslate("TRanslater", "Rrnlt", "xXxXx", "UNICODE_CI", "xxaxsXaxex"); - assertStringTranslate("TRanslater", "Rrnlt", "XxxXx", "UTF8_BINARY", "TXaxsXaxex"); - assertStringTranslate("TRanslater", "Rrnlt", "XxxXx", 
"UTF8_LCASE", "xXaxsXaxeX"); - assertStringTranslate("TRanslater", "Rrnlt", "XxxXx", "UNICODE", "TXaxsXaxex"); - assertStringTranslate("TRanslater", "Rrnlt", "XxxXx", "UNICODE_CI", "xXaxsXaxeX"); - assertStringTranslate("test大千世界X大千世界", "界x", "AB", "UTF8_BINARY", "test大千世AX大千世A"); - assertStringTranslate("test大千世界X大千世界", "界x", "AB", "UTF8_LCASE", "test大千世AB大千世A"); - assertStringTranslate("test大千世界X大千世界", "界x", "AB", "UNICODE", "test大千世AX大千世A"); - assertStringTranslate("test大千世界X大千世界", "界x", "AB", "UNICODE_CI", "test大千世AB大千世A"); - assertStringTranslate("大千世界test大千世界", "TEST", "abcd", "UTF8_BINARY", "大千世界test大千世界"); - assertStringTranslate("大千世界test大千世界", "TEST", "abcd", "UTF8_LCASE", "大千世界abca大千世界"); - assertStringTranslate("大千世界test大千世界", "TEST", "abcd", "UNICODE", "大千世界test大千世界"); - assertStringTranslate("大千世界test大千世界", "TEST", "abcd", "UNICODE_CI", "大千世界abca大千世界"); - assertStringTranslate("Test大千世界大千世界", "tT", "oO", "UTF8_BINARY", "Oeso大千世界大千世界"); - assertStringTranslate("Test大千世界大千世界", "tT", "oO", "UTF8_LCASE", "oeso大千世界大千世界"); - assertStringTranslate("Test大千世界大千世界", "tT", "oO", "UNICODE", "Oeso大千世界大千世界"); - assertStringTranslate("Test大千世界大千世界", "tT", "oO", "UNICODE_CI", "oeso大千世界大千世界"); - assertStringTranslate("大千世界大千世界tesT", "Tt", "Oo", "UTF8_BINARY", "大千世界大千世界oesO"); - assertStringTranslate("大千世界大千世界tesT", "Tt", "Oo", "UTF8_LCASE", "大千世界大千世界OesO"); - assertStringTranslate("大千世界大千世界tesT", "Tt", "Oo", "UNICODE", "大千世界大千世界oesO"); - assertStringTranslate("大千世界大千世界tesT", "Tt", "Oo", "UNICODE_CI", "大千世界大千世界OesO"); - assertStringTranslate("大千世界大千世界tesT", "大千", "世世", "UTF8_BINARY", "世世世界世世世界tesT"); - assertStringTranslate("大千世界大千世界tesT", "大千", "世世", "UTF8_LCASE", "世世世界世世世界tesT"); - assertStringTranslate("大千世界大千世界tesT", "大千", "世世", "UNICODE", "世世世界世世世界tesT"); - assertStringTranslate("大千世界大千世界tesT", "大千", "世世", "UNICODE_CI", "世世世界世世世界tesT"); - assertStringTranslate("Translate", "Rnlasdfjhgadt", "1234", "UTF8_BINARY", "Tr4234e"); - assertStringTranslate("Translate", 
"Rnlasdfjhgadt", "1234", "UTF8_LCASE", "14234e"); - assertStringTranslate("Translate", "Rnlasdfjhgadt", "1234", "UNICODE", "Tr4234e"); - assertStringTranslate("Translate", "Rnlasdfjhgadt", "1234", "UNICODE_CI", "14234e"); - assertStringTranslate("Translate", "Rnlt", "123495834634", "UTF8_BINARY", "Tra2s3a4e"); - assertStringTranslate("Translate", "Rnlt", "123495834634", "UTF8_LCASE", "41a2s3a4e"); - assertStringTranslate("Translate", "Rnlt", "123495834634", "UNICODE", "Tra2s3a4e"); - assertStringTranslate("Translate", "Rnlt", "123495834634", "UNICODE_CI", "41a2s3a4e"); - assertStringTranslate("abcdef", "abcde", "123", "UTF8_BINARY", "123f"); - assertStringTranslate("abcdef", "abcde", "123", "UTF8_LCASE", "123f"); - assertStringTranslate("abcdef", "abcde", "123", "UNICODE", "123f"); - assertStringTranslate("abcdef", "abcde", "123", "UNICODE_CI", "123f"); + assertStringTranslate("abc", "b", "x", UTF8_BINARY, "axc"); + assertStringTranslate("abc", "b", "x", UTF8_LCASE, "axc"); + assertStringTranslate("abc", "b", "x", UNICODE, "axc"); + assertStringTranslate("abc", "b", "x", UNICODE_CI, "axc"); + assertStringTranslate("Translate", "Rnlt", "12", UTF8_BINARY, "Tra2sae"); + assertStringTranslate("Translate", "Rnlt", "12", UTF8_LCASE, "1a2sae"); + assertStringTranslate("Translate", "Rnlt", "12", UNICODE, "Tra2sae"); + assertStringTranslate("Translate", "Rnlt", "12", UNICODE_CI, "1a2sae"); + assertStringTranslate("Translate", "Rn", "1234", UTF8_BINARY, "Tra2slate"); + assertStringTranslate("Translate", "Rn", "1234", UTF8_LCASE, "T1a2slate"); + assertStringTranslate("Translate", "Rn", "1234", UNICODE, "Tra2slate"); + assertStringTranslate("Translate", "Rn", "1234", UNICODE_CI, "T1a2slate"); + assertStringTranslate("Translate", "Rnlt", "1234", UTF8_BINARY, "Tra2s3a4e"); + assertStringTranslate("Translate", "Rnlt", "1234", UTF8_LCASE, "41a2s3a4e"); + assertStringTranslate("Translate", "Rnlt", "1234", UNICODE, "Tra2s3a4e"); + assertStringTranslate("Translate", "Rnlt", "1234", 
UNICODE_CI, "41a2s3a4e"); + assertStringTranslate("TRanslate", "rnlt", "XxXx", UTF8_BINARY, "TRaxsXaxe"); + assertStringTranslate("TRanslate", "rnlt", "XxXx", UTF8_LCASE, "xXaxsXaxe"); + assertStringTranslate("TRanslate", "rnlt", "XxXx", UNICODE, "TRaxsXaxe"); + assertStringTranslate("TRanslate", "rnlt", "XxXx", UNICODE_CI, "xXaxsXaxe"); + assertStringTranslate("TRanslater", "Rrnlt", "xXxXx", UTF8_BINARY, "TxaxsXaxeX"); + assertStringTranslate("TRanslater", "Rrnlt", "xXxXx", UTF8_LCASE, "xxaxsXaxex"); + assertStringTranslate("TRanslater", "Rrnlt", "xXxXx", UNICODE, "TxaxsXaxeX"); + assertStringTranslate("TRanslater", "Rrnlt", "xXxXx", UNICODE_CI, "xxaxsXaxex"); + assertStringTranslate("TRanslater", "Rrnlt", "XxxXx", UTF8_BINARY, "TXaxsXaxex"); + assertStringTranslate("TRanslater", "Rrnlt", "XxxXx", UTF8_LCASE, "xXaxsXaxeX"); + assertStringTranslate("TRanslater", "Rrnlt", "XxxXx", UNICODE, "TXaxsXaxex"); + assertStringTranslate("TRanslater", "Rrnlt", "XxxXx", UNICODE_CI, "xXaxsXaxeX"); + assertStringTranslate("test大千世界X大千世界", "界x", "AB", UTF8_BINARY, "test大千世AX大千世A"); + assertStringTranslate("test大千世界X大千世界", "界x", "AB", UTF8_LCASE, "test大千世AB大千世A"); + assertStringTranslate("test大千世界X大千世界", "界x", "AB", UNICODE, "test大千世AX大千世A"); + assertStringTranslate("test大千世界X大千世界", "界x", "AB", UNICODE_CI, "test大千世AB大千世A"); + assertStringTranslate("大千世界test大千世界", "TEST", "abcd", UTF8_BINARY, "大千世界test大千世界"); + assertStringTranslate("大千世界test大千世界", "TEST", "abcd", UTF8_LCASE, "大千世界abca大千世界"); + assertStringTranslate("大千世界test大千世界", "TEST", "abcd", UNICODE, "大千世界test大千世界"); + assertStringTranslate("大千世界test大千世界", "TEST", "abcd", UNICODE_CI, "大千世界abca大千世界"); + assertStringTranslate("Test大千世界大千世界", "tT", "oO", UTF8_BINARY, "Oeso大千世界大千世界"); + assertStringTranslate("Test大千世界大千世界", "tT", "oO", UTF8_LCASE, "oeso大千世界大千世界"); + assertStringTranslate("Test大千世界大千世界", "tT", "oO", UNICODE, "Oeso大千世界大千世界"); + assertStringTranslate("Test大千世界大千世界", "tT", "oO", UNICODE_CI, "oeso大千世界大千世界"); + 
assertStringTranslate("大千世界大千世界tesT", "Tt", "Oo", UTF8_BINARY, "大千世界大千世界oesO"); + assertStringTranslate("大千世界大千世界tesT", "Tt", "Oo", UTF8_LCASE, "大千世界大千世界OesO"); + assertStringTranslate("大千世界大千世界tesT", "Tt", "Oo", UNICODE, "大千世界大千世界oesO"); + assertStringTranslate("大千世界大千世界tesT", "Tt", "Oo", UNICODE_CI, "大千世界大千世界OesO"); + assertStringTranslate("大千世界大千世界tesT", "大千", "世世", UTF8_BINARY, "世世世界世世世界tesT"); + assertStringTranslate("大千世界大千世界tesT", "大千", "世世", UTF8_LCASE, "世世世界世世世界tesT"); + assertStringTranslate("大千世界大千世界tesT", "大千", "世世", UNICODE, "世世世界世世世界tesT"); + assertStringTranslate("大千世界大千世界tesT", "大千", "世世", UNICODE_CI, "世世世界世世世界tesT"); + assertStringTranslate("Translate", "Rnlasdfjhgadt", "1234", UTF8_BINARY, "Tr4234e"); + assertStringTranslate("Translate", "Rnlasdfjhgadt", "1234", UTF8_LCASE, "14234e"); + assertStringTranslate("Translate", "Rnlasdfjhgadt", "1234", UNICODE, "Tr4234e"); + assertStringTranslate("Translate", "Rnlasdfjhgadt", "1234", UNICODE_CI, "14234e"); + assertStringTranslate("Translate", "Rnlt", "123495834634", UTF8_BINARY, "Tra2s3a4e"); + assertStringTranslate("Translate", "Rnlt", "123495834634", UTF8_LCASE, "41a2s3a4e"); + assertStringTranslate("Translate", "Rnlt", "123495834634", UNICODE, "Tra2s3a4e"); + assertStringTranslate("Translate", "Rnlt", "123495834634", UNICODE_CI, "41a2s3a4e"); + assertStringTranslate("abcdef", "abcde", "123", UTF8_BINARY, "123f"); + assertStringTranslate("abcdef", "abcde", "123", UTF8_LCASE, "123f"); + assertStringTranslate("abcdef", "abcde", "123", UNICODE, "123f"); + assertStringTranslate("abcdef", "abcde", "123", UNICODE_CI, "123f"); assertStringTranslate("abcdëÈêf", "ÊèË", "123", "AF_CI", "abcd321f"); // One-to-many case mapping (e.g. Turkish dotted I). 
- assertStringTranslate("İ", "i\u0307", "xy", "UTF8_BINARY", "İ"); - assertStringTranslate("İ", "i\u0307", "xy", "UTF8_LCASE", "İ"); - assertStringTranslate("İ", "i\u0307", "xy", "UNICODE", "İ"); - assertStringTranslate("İ", "i\u0307", "xy", "UNICODE_CI", "İ"); - assertStringTranslate("i\u0307", "İ", "xy", "UTF8_BINARY", "i\u0307"); - assertStringTranslate("i\u0307", "İ", "xy", "UTF8_LCASE", "x"); - assertStringTranslate("i\u0307", "İ", "xy", "UNICODE", "i\u0307"); - assertStringTranslate("i\u0307", "İ", "xy", "UNICODE_CI", "x"); - assertStringTranslate("i\u030A", "İ", "x", "UTF8_BINARY", "i\u030A"); - assertStringTranslate("i\u030A", "İ", "x", "UTF8_LCASE", "i\u030A"); - assertStringTranslate("i\u030A", "İ", "x", "UNICODE", "i\u030A"); - assertStringTranslate("i\u030A", "İ", "x", "UNICODE_CI", "i\u030A"); - assertStringTranslate("i\u030A", "İi", "xy", "UTF8_BINARY", "y\u030A"); - assertStringTranslate("i\u030A", "İi", "xy", "UTF8_LCASE", "y\u030A"); - assertStringTranslate("i\u030A", "İi", "xy", "UNICODE", "i\u030A"); - assertStringTranslate("i\u030A", "İi", "xy", "UNICODE_CI", "i\u030A"); - assertStringTranslate("İi\u0307", "İi\u0307", "123", "UTF8_BINARY", "123"); - assertStringTranslate("İi\u0307", "İi\u0307", "123", "UTF8_LCASE", "11"); - assertStringTranslate("İi\u0307", "İi\u0307", "123", "UNICODE", "1i\u0307"); - assertStringTranslate("İi\u0307", "İi\u0307", "123", "UNICODE_CI", "11"); - assertStringTranslate("İi\u0307", "İyz", "123", "UTF8_BINARY", "1i\u0307"); - assertStringTranslate("İi\u0307", "İyz", "123", "UTF8_LCASE", "11"); - assertStringTranslate("İi\u0307", "İyz", "123", "UNICODE", "1i\u0307"); - assertStringTranslate("İi\u0307", "İyz", "123", "UNICODE_CI", "11"); - assertStringTranslate("İi\u0307", "xi\u0307", "123", "UTF8_BINARY", "İ23"); - assertStringTranslate("İi\u0307", "xi\u0307", "123", "UTF8_LCASE", "İ23"); - assertStringTranslate("İi\u0307", "xi\u0307", "123", "UNICODE", "İi\u0307"); - assertStringTranslate("İi\u0307", "xi\u0307", "123", 
"UNICODE_CI", "İi\u0307"); - assertStringTranslate("a\u030Abcå", "a\u030Aå", "123", "UTF8_BINARY", "12bc3"); - assertStringTranslate("a\u030Abcå", "a\u030Aå", "123", "UTF8_LCASE", "12bc3"); - assertStringTranslate("a\u030Abcå", "a\u030Aå", "123", "UNICODE", "3bc3"); - assertStringTranslate("a\u030Abcå", "a\u030Aå", "123", "UNICODE_CI", "3bc3"); - assertStringTranslate("a\u030Abcå", "A\u030AÅ", "123", "UTF8_BINARY", "a2bcå"); - assertStringTranslate("a\u030Abcå", "A\u030AÅ", "123", "UTF8_LCASE", "12bc3"); - assertStringTranslate("a\u030Abcå", "A\u030AÅ", "123", "UNICODE", "a\u030Abcå"); - assertStringTranslate("a\u030Abcå", "A\u030AÅ", "123", "UNICODE_CI", "3bc3"); - assertStringTranslate("a\u030AβφδI\u0307", "Iİaå", "1234", "UTF8_BINARY", "3\u030Aβφδ1\u0307"); - assertStringTranslate("A\u030Aβφδi\u0307", "Iİaå", "1234", "UTF8_LCASE", "3\u030Aβφδ2"); - assertStringTranslate("a\u030AβφδI\u0307", "Iİaå", "1234", "UNICODE", "4βφδ2"); - assertStringTranslate("A\u030Aβφδi\u0307", "Iİaå", "1234", "UNICODE_CI", "4βφδ2"); + assertStringTranslate("İ", "i\u0307", "xy", UTF8_BINARY, "İ"); + assertStringTranslate("İ", "i\u0307", "xy", UTF8_LCASE, "İ"); + assertStringTranslate("İ", "i\u0307", "xy", UNICODE, "İ"); + assertStringTranslate("İ", "i\u0307", "xy", UNICODE_CI, "İ"); + assertStringTranslate("i\u0307", "İ", "xy", UTF8_BINARY, "i\u0307"); + assertStringTranslate("i\u0307", "İ", "xy", UTF8_LCASE, "x"); + assertStringTranslate("i\u0307", "İ", "xy", UNICODE, "i\u0307"); + assertStringTranslate("i\u0307", "İ", "xy", UNICODE_CI, "x"); + assertStringTranslate("i\u030A", "İ", "x", UTF8_BINARY, "i\u030A"); + assertStringTranslate("i\u030A", "İ", "x", UTF8_LCASE, "i\u030A"); + assertStringTranslate("i\u030A", "İ", "x", UNICODE, "i\u030A"); + assertStringTranslate("i\u030A", "İ", "x", UNICODE_CI, "i\u030A"); + assertStringTranslate("i\u030A", "İi", "xy", UTF8_BINARY, "y\u030A"); + assertStringTranslate("i\u030A", "İi", "xy", UTF8_LCASE, "y\u030A"); + 
assertStringTranslate("i\u030A", "İi", "xy", UNICODE, "i\u030A"); + assertStringTranslate("i\u030A", "İi", "xy", UNICODE_CI, "i\u030A"); + assertStringTranslate("İi\u0307", "İi\u0307", "123", UTF8_BINARY, "123"); + assertStringTranslate("İi\u0307", "İi\u0307", "123", UTF8_LCASE, "11"); + assertStringTranslate("İi\u0307", "İi\u0307", "123", UNICODE, "1i\u0307"); + assertStringTranslate("İi\u0307", "İi\u0307", "123", UNICODE_CI, "11"); + assertStringTranslate("İi\u0307", "İyz", "123", UTF8_BINARY, "1i\u0307"); + assertStringTranslate("İi\u0307", "İyz", "123", UTF8_LCASE, "11"); + assertStringTranslate("İi\u0307", "İyz", "123", UNICODE, "1i\u0307"); + assertStringTranslate("İi\u0307", "İyz", "123", UNICODE_CI, "11"); + assertStringTranslate("İi\u0307", "xi\u0307", "123", UTF8_BINARY, "İ23"); + assertStringTranslate("İi\u0307", "xi\u0307", "123", UTF8_LCASE, "İ23"); + assertStringTranslate("İi\u0307", "xi\u0307", "123", UNICODE, "İi\u0307"); + assertStringTranslate("İi\u0307", "xi\u0307", "123", UNICODE_CI, "İi\u0307"); + assertStringTranslate("a\u030Abcå", "a\u030Aå", "123", UTF8_BINARY, "12bc3"); + assertStringTranslate("a\u030Abcå", "a\u030Aå", "123", UTF8_LCASE, "12bc3"); + assertStringTranslate("a\u030Abcå", "a\u030Aå", "123", UNICODE, "3bc3"); + assertStringTranslate("a\u030Abcå", "a\u030Aå", "123", UNICODE_CI, "3bc3"); + assertStringTranslate("a\u030Abcå", "A\u030AÅ", "123", UTF8_BINARY, "a2bcå"); + assertStringTranslate("a\u030Abcå", "A\u030AÅ", "123", UTF8_LCASE, "12bc3"); + assertStringTranslate("a\u030Abcå", "A\u030AÅ", "123", UNICODE, "a\u030Abcå"); + assertStringTranslate("a\u030Abcå", "A\u030AÅ", "123", UNICODE_CI, "3bc3"); + assertStringTranslate("a\u030AβφδI\u0307", "Iİaå", "1234", UTF8_BINARY, "3\u030Aβφδ1\u0307"); + assertStringTranslate("A\u030Aβφδi\u0307", "Iİaå", "1234", UTF8_LCASE, "3\u030Aβφδ2"); + assertStringTranslate("a\u030AβφδI\u0307", "Iİaå", "1234", UNICODE, "4βφδ2"); + assertStringTranslate("A\u030Aβφδi\u0307", "Iİaå", "1234", UNICODE_CI, 
"4βφδ2"); // Conditional case mapping (e.g. Greek sigmas). - assertStringTranslate("ΣΥΣΤΗΜΑΤΙΚΟΣ", "Συη", "σιι", "UTF8_BINARY", "σΥσΤΗΜΑΤΙΚΟσ"); - assertStringTranslate("ΣΥΣΤΗΜΑΤΙΚΟΣ", "Συη", "σιι", "UTF8_LCASE", "σισΤιΜΑΤΙΚΟσ"); - assertStringTranslate("ΣΥΣΤΗΜΑΤΙΚΟΣ", "Συη", "σιι", "UNICODE", "σΥσΤΗΜΑΤΙΚΟσ"); - assertStringTranslate("ΣΥΣΤΗΜΑΤΙΚΟΣ", "Συη", "σιι", "UNICODE_CI", "σισΤιΜΑΤΙΚΟσ"); - assertStringTranslate("ΣΥΣΤΗΜΑΤΙΚΟΣ", "συη", "σιι", "UTF8_BINARY", "ΣΥΣΤΗΜΑΤΙΚΟΣ"); - assertStringTranslate("ΣΥΣΤΗΜΑΤΙΚΟΣ", "συη", "σιι", "UTF8_LCASE", "σισΤιΜΑΤΙΚΟσ"); - assertStringTranslate("ΣΥΣΤΗΜΑΤΙΚΟΣ", "συη", "σιι", "UNICODE", "ΣΥΣΤΗΜΑΤΙΚΟΣ"); - assertStringTranslate("ΣΥΣΤΗΜΑΤΙΚΟΣ", "συη", "σιι", "UNICODE_CI", "σισΤιΜΑΤΙΚΟσ"); - assertStringTranslate("ΣΥΣΤΗΜΑΤΙΚΟΣ", "ςυη", "σιι", "UTF8_BINARY", "ΣΥΣΤΗΜΑΤΙΚΟΣ"); - assertStringTranslate("ΣΥΣΤΗΜΑΤΙΚΟΣ", "ςυη", "σιι", "UTF8_LCASE", "σισΤιΜΑΤΙΚΟσ"); - assertStringTranslate("ΣΥΣΤΗΜΑΤΙΚΟΣ", "ςυη", "σιι", "UNICODE", "ΣΥΣΤΗΜΑΤΙΚΟΣ"); - assertStringTranslate("ΣΥΣΤΗΜΑΤΙΚΟΣ", "ςυη", "σιι", "UNICODE_CI", "σισΤιΜΑΤΙΚΟσ"); - assertStringTranslate("ΣΥΣΤΗΜΑΤΙΚΟΣ", "συη", "ςιι", "UTF8_BINARY", "ΣΥΣΤΗΜΑΤΙΚΟΣ"); - assertStringTranslate("ΣΥΣΤΗΜΑΤΙΚΟΣ", "συη", "ςιι", "UTF8_LCASE", "ςιςΤιΜΑΤΙΚΟς"); - assertStringTranslate("ΣΥΣΤΗΜΑΤΙΚΟΣ", "συη", "ςιι", "UNICODE", "ΣΥΣΤΗΜΑΤΙΚΟΣ"); - assertStringTranslate("ΣΥΣΤΗΜΑΤΙΚΟΣ", "συη", "ςιι", "UNICODE_CI", "ςιςΤιΜΑΤΙΚΟς"); - assertStringTranslate("ΣΥΣΤΗΜΑΤΙΚΟΣ", "Συη", "ςιι", "UTF8_BINARY", "ςΥςΤΗΜΑΤΙΚΟς"); - assertStringTranslate("ΣΥΣΤΗΜΑΤΙΚΟΣ", "Συη", "ςιι", "UTF8_LCASE", "ςιςΤιΜΑΤΙΚΟς"); - assertStringTranslate("ΣΥΣΤΗΜΑΤΙΚΟΣ", "Συη", "ςιι", "UNICODE", "ςΥςΤΗΜΑΤΙΚΟς"); - assertStringTranslate("ΣΥΣΤΗΜΑΤΙΚΟΣ", "Συη", "ςιι", "UNICODE_CI", "ςιςΤιΜΑΤΙΚΟς"); - assertStringTranslate("ΣΥΣΤΗΜΑΤΙΚΟΣ", "ςυη", "ςιι", "UTF8_BINARY", "ΣΥΣΤΗΜΑΤΙΚΟΣ"); - assertStringTranslate("ΣΥΣΤΗΜΑΤΙΚΟΣ", "ςυη", "ςιι", "UTF8_LCASE", "ςιςΤιΜΑΤΙΚΟς"); - assertStringTranslate("ΣΥΣΤΗΜΑΤΙΚΟΣ", "ςυη", "ςιι", "UNICODE", 
"ΣΥΣΤΗΜΑΤΙΚΟΣ"); - assertStringTranslate("ΣΥΣΤΗΜΑΤΙΚΟΣ", "ςυη", "ςιι", "UNICODE_CI", "ςιςΤιΜΑΤΙΚΟς"); - assertStringTranslate("συστηματικος", "Συη", "σιι", "UTF8_BINARY", "σιστιματικος"); - assertStringTranslate("συστηματικος", "Συη", "σιι", "UTF8_LCASE", "σιστιματικοσ"); - assertStringTranslate("συστηματικος", "Συη", "σιι", "UNICODE", "σιστιματικος"); - assertStringTranslate("συστηματικος", "Συη", "σιι", "UNICODE_CI", "σιστιματικοσ"); - assertStringTranslate("συστηματικος", "συη", "σιι", "UTF8_BINARY", "σιστιματικος"); - assertStringTranslate("συστηματικος", "συη", "σιι", "UTF8_LCASE", "σιστιματικοσ"); - assertStringTranslate("συστηματικος", "συη", "σιι", "UNICODE", "σιστιματικος"); - assertStringTranslate("συστηματικος", "συη", "σιι", "UNICODE_CI", "σιστιματικοσ"); - assertStringTranslate("συστηματικος", "ςυη", "σιι", "UTF8_BINARY", "σιστιματικοσ"); - assertStringTranslate("συστηματικος", "ςυη", "σιι", "UTF8_LCASE", "σιστιματικοσ"); - assertStringTranslate("συστηματικος", "ςυη", "σιι", "UNICODE", "σιστιματικοσ"); - assertStringTranslate("συστηματικος", "ςυη", "σιι", "UNICODE_CI", "σιστιματικοσ"); + assertStringTranslate("ΣΥΣΤΗΜΑΤΙΚΟΣ", "Συη", "σιι", UTF8_BINARY, "σΥσΤΗΜΑΤΙΚΟσ"); + assertStringTranslate("ΣΥΣΤΗΜΑΤΙΚΟΣ", "Συη", "σιι", UTF8_LCASE, "σισΤιΜΑΤΙΚΟσ"); + assertStringTranslate("ΣΥΣΤΗΜΑΤΙΚΟΣ", "Συη", "σιι", UNICODE, "σΥσΤΗΜΑΤΙΚΟσ"); + assertStringTranslate("ΣΥΣΤΗΜΑΤΙΚΟΣ", "Συη", "σιι", UNICODE_CI, "σισΤιΜΑΤΙΚΟσ"); + assertStringTranslate("ΣΥΣΤΗΜΑΤΙΚΟΣ", "συη", "σιι", UTF8_BINARY, "ΣΥΣΤΗΜΑΤΙΚΟΣ"); + assertStringTranslate("ΣΥΣΤΗΜΑΤΙΚΟΣ", "συη", "σιι", UTF8_LCASE, "σισΤιΜΑΤΙΚΟσ"); + assertStringTranslate("ΣΥΣΤΗΜΑΤΙΚΟΣ", "συη", "σιι", UNICODE, "ΣΥΣΤΗΜΑΤΙΚΟΣ"); + assertStringTranslate("ΣΥΣΤΗΜΑΤΙΚΟΣ", "συη", "σιι", UNICODE_CI, "σισΤιΜΑΤΙΚΟσ"); + assertStringTranslate("ΣΥΣΤΗΜΑΤΙΚΟΣ", "ςυη", "σιι", UTF8_BINARY, "ΣΥΣΤΗΜΑΤΙΚΟΣ"); + assertStringTranslate("ΣΥΣΤΗΜΑΤΙΚΟΣ", "ςυη", "σιι", UTF8_LCASE, "σισΤιΜΑΤΙΚΟσ"); + assertStringTranslate("ΣΥΣΤΗΜΑΤΙΚΟΣ", "ςυη", "σιι", 
UNICODE, "ΣΥΣΤΗΜΑΤΙΚΟΣ"); + assertStringTranslate("ΣΥΣΤΗΜΑΤΙΚΟΣ", "ςυη", "σιι", UNICODE_CI, "σισΤιΜΑΤΙΚΟσ"); + assertStringTranslate("ΣΥΣΤΗΜΑΤΙΚΟΣ", "συη", "ςιι", UTF8_BINARY, "ΣΥΣΤΗΜΑΤΙΚΟΣ"); + assertStringTranslate("ΣΥΣΤΗΜΑΤΙΚΟΣ", "συη", "ςιι", UTF8_LCASE, "ςιςΤιΜΑΤΙΚΟς"); + assertStringTranslate("ΣΥΣΤΗΜΑΤΙΚΟΣ", "συη", "ςιι", UNICODE, "ΣΥΣΤΗΜΑΤΙΚΟΣ"); + assertStringTranslate("ΣΥΣΤΗΜΑΤΙΚΟΣ", "συη", "ςιι", UNICODE_CI, "ςιςΤιΜΑΤΙΚΟς"); + assertStringTranslate("ΣΥΣΤΗΜΑΤΙΚΟΣ", "Συη", "ςιι", UTF8_BINARY, "ςΥςΤΗΜΑΤΙΚΟς"); + assertStringTranslate("ΣΥΣΤΗΜΑΤΙΚΟΣ", "Συη", "ςιι", UTF8_LCASE, "ςιςΤιΜΑΤΙΚΟς"); + assertStringTranslate("ΣΥΣΤΗΜΑΤΙΚΟΣ", "Συη", "ςιι", UNICODE, "ςΥςΤΗΜΑΤΙΚΟς"); + assertStringTranslate("ΣΥΣΤΗΜΑΤΙΚΟΣ", "Συη", "ςιι", UNICODE_CI, "ςιςΤιΜΑΤΙΚΟς"); + assertStringTranslate("ΣΥΣΤΗΜΑΤΙΚΟΣ", "ςυη", "ςιι", UTF8_BINARY, "ΣΥΣΤΗΜΑΤΙΚΟΣ"); + assertStringTranslate("ΣΥΣΤΗΜΑΤΙΚΟΣ", "ςυη", "ςιι", UTF8_LCASE, "ςιςΤιΜΑΤΙΚΟς"); + assertStringTranslate("ΣΥΣΤΗΜΑΤΙΚΟΣ", "ςυη", "ςιι", UNICODE, "ΣΥΣΤΗΜΑΤΙΚΟΣ"); + assertStringTranslate("ΣΥΣΤΗΜΑΤΙΚΟΣ", "ςυη", "ςιι", UNICODE_CI, "ςιςΤιΜΑΤΙΚΟς"); + assertStringTranslate("συστηματικος", "Συη", "σιι", UTF8_BINARY, "σιστιματικος"); + assertStringTranslate("συστηματικος", "Συη", "σιι", UTF8_LCASE, "σιστιματικοσ"); + assertStringTranslate("συστηματικος", "Συη", "σιι", UNICODE, "σιστιματικος"); + assertStringTranslate("συστηματικος", "Συη", "σιι", UNICODE_CI, "σιστιματικοσ"); + assertStringTranslate("συστηματικος", "συη", "σιι", UTF8_BINARY, "σιστιματικος"); + assertStringTranslate("συστηματικος", "συη", "σιι", UTF8_LCASE, "σιστιματικοσ"); + assertStringTranslate("συστηματικος", "συη", "σιι", UNICODE, "σιστιματικος"); + assertStringTranslate("συστηματικος", "συη", "σιι", UNICODE_CI, "σιστιματικοσ"); + assertStringTranslate("συστηματικος", "ςυη", "σιι", UTF8_BINARY, "σιστιματικοσ"); + assertStringTranslate("συστηματικος", "ςυη", "σιι", UTF8_LCASE, "σιστιματικοσ"); + assertStringTranslate("συστηματικος", "ςυη", "σιι", UNICODE, 
"σιστιματικοσ"); + assertStringTranslate("συστηματικος", "ςυη", "σιι", UNICODE_CI, "σιστιματικοσ"); // Surrogate pairs. - assertStringTranslate("a🙃b🙃c", "a", "x", "UTF8_BINARY", "x🙃b🙃c"); - assertStringTranslate("a🙃b🙃c", "a🙃", "xy", "UTF8_BINARY", "xybyc"); - assertStringTranslate("a🙃b🙃c", "a🙃b", "xyz", "UTF8_BINARY", "xyzyc"); - assertStringTranslate("a🙃b🙃c", "a🙃bc", "xyzw", "UTF8_BINARY", "xyzyw"); - assertStringTranslate("😀😆😃😄", "😄😆", "😅😂", "UTF8_BINARY", "😀😂😃😅"); - assertStringTranslate("😀😆😃😄", "😄😆", "😅😂", "UTF8_LCASE", "😀😂😃😅"); - assertStringTranslate("😀😆😃😄", "😄😆", "😅😂", "UNICODE", "😀😂😃😅"); - assertStringTranslate("😀😆😃😄", "😄😆", "😅😂", "UNICODE_CI", "😀😂😃😅"); - assertStringTranslate("𐐅", "𐐅", "x", "UTF8_BINARY", "x"); - assertStringTranslate("𐐅", "𐐅", "x", "UTF8_LCASE", "x"); - assertStringTranslate("𐐅", "𐐅", "x", "UNICODE", "x"); - assertStringTranslate("𐐅", "𐐅", "x", "UNICODE_CI", "x"); - assertStringTranslate("𐐅", "𐐭", "x", "UTF8_BINARY", "𐐅"); - assertStringTranslate("𐐅", "𐐭", "x", "UTF8_LCASE", "x"); - assertStringTranslate("𐐅", "𐐭", "x", "UNICODE", "𐐅"); - assertStringTranslate("𐐅", "𐐭", "x", "UNICODE_CI", "x"); - assertStringTranslate("A", "A", "𐐅", "UTF8_BINARY", "𐐅"); - assertStringTranslate("A", "A", "𐐅", "UTF8_LCASE", "𐐅"); - assertStringTranslate("A", "A", "𐐅", "UNICODE", "𐐅"); - assertStringTranslate("A", "A", "𐐅", "UNICODE_CI", "𐐅"); - assertStringTranslate("A", "a", "𐐅", "UTF8_BINARY", "A"); - assertStringTranslate("A", "a", "𐐅", "UTF8_LCASE", "𐐅"); - assertStringTranslate("A", "a", "𐐅", "UNICODE", "A"); - assertStringTranslate("A", "a", "𐐅", "UNICODE_CI", "𐐅"); - assertStringTranslate("a", "A", "𐐅", "UTF8_BINARY", "a"); - assertStringTranslate("a", "A", "𐐅", "UTF8_LCASE", "𐐅"); - assertStringTranslate("a", "A", "𐐅", "UNICODE", "a"); - assertStringTranslate("a", "A", "𐐅", "UNICODE_CI", "𐐅"); - assertStringTranslate("𝔸", "𝔸", "x", "UTF8_BINARY", "x"); - assertStringTranslate("𝔸", "𝔸", "x", "UTF8_LCASE", "x"); - assertStringTranslate("𝔸", "𝔸", "x", 
"UNICODE", "x"); - assertStringTranslate("𝔸", "𝔸", "x", "UNICODE_CI", "x"); - assertStringTranslate("𝔸", "𝕒", "x", "UTF8_BINARY", "𝔸"); - assertStringTranslate("𝔸", "𝕒", "x", "UTF8_LCASE", "𝔸"); - assertStringTranslate("𝔸", "𝕒", "x", "UNICODE", "𝔸"); - assertStringTranslate("𝔸", "𝕒", "x", "UNICODE_CI", "x"); + assertStringTranslate("a🙃b🙃c", "a", "x", UTF8_BINARY, "x🙃b🙃c"); + assertStringTranslate("a🙃b🙃c", "a🙃", "xy", UTF8_BINARY, "xybyc"); + assertStringTranslate("a🙃b🙃c", "a🙃b", "xyz", UTF8_BINARY, "xyzyc"); + assertStringTranslate("a🙃b🙃c", "a🙃bc", "xyzw", UTF8_BINARY, "xyzyw"); + assertStringTranslate("😀😆😃😄", "😄😆", "😅😂", UTF8_BINARY, "😀😂😃😅"); + assertStringTranslate("😀😆😃😄", "😄😆", "😅😂", UTF8_LCASE, "😀😂😃😅"); + assertStringTranslate("😀😆😃😄", "😄😆", "😅😂", UNICODE, "😀😂😃😅"); + assertStringTranslate("😀😆😃😄", "😄😆", "😅😂", UNICODE_CI, "😀😂😃😅"); + assertStringTranslate("𐐅", "𐐅", "x", UTF8_BINARY, "x"); + assertStringTranslate("𐐅", "𐐅", "x", UTF8_LCASE, "x"); + assertStringTranslate("𐐅", "𐐅", "x", UNICODE, "x"); + assertStringTranslate("𐐅", "𐐅", "x", UNICODE_CI, "x"); + assertStringTranslate("𐐅", "𐐭", "x", UTF8_BINARY, "𐐅"); + assertStringTranslate("𐐅", "𐐭", "x", UTF8_LCASE, "x"); + assertStringTranslate("𐐅", "𐐭", "x", UNICODE, "𐐅"); + assertStringTranslate("𐐅", "𐐭", "x", UNICODE_CI, "x"); + assertStringTranslate("A", "A", "𐐅", UTF8_BINARY, "𐐅"); + assertStringTranslate("A", "A", "𐐅", UTF8_LCASE, "𐐅"); + assertStringTranslate("A", "A", "𐐅", UNICODE, "𐐅"); + assertStringTranslate("A", "A", "𐐅", UNICODE_CI, "𐐅"); + assertStringTranslate("A", "a", "𐐅", UTF8_BINARY, "A"); + assertStringTranslate("A", "a", "𐐅", UTF8_LCASE, "𐐅"); + assertStringTranslate("A", "a", "𐐅", UNICODE, "A"); + assertStringTranslate("A", "a", "𐐅", UNICODE_CI, "𐐅"); + assertStringTranslate("a", "A", "𐐅", UTF8_BINARY, "a"); + assertStringTranslate("a", "A", "𐐅", UTF8_LCASE, "𐐅"); + assertStringTranslate("a", "A", "𐐅", UNICODE, "a"); + assertStringTranslate("a", "A", "𐐅", UNICODE_CI, "𐐅"); + 
assertStringTranslate("𝔸", "𝔸", "x", UTF8_BINARY, "x"); + assertStringTranslate("𝔸", "𝔸", "x", UTF8_LCASE, "x"); + assertStringTranslate("𝔸", "𝔸", "x", UNICODE, "x"); + assertStringTranslate("𝔸", "𝔸", "x", UNICODE_CI, "x"); + assertStringTranslate("𝔸", "𝕒", "x", UTF8_BINARY, "𝔸"); + assertStringTranslate("𝔸", "𝕒", "x", UTF8_LCASE, "𝔸"); + assertStringTranslate("𝔸", "𝕒", "x", UNICODE, "𝔸"); + assertStringTranslate("𝔸", "𝕒", "x", UNICODE_CI, "x"); } private Map buildDict(String matching, String replace) { diff --git a/common/utils/src/main/java/org/apache/spark/network/util/JavaUtils.java b/common/utils/src/main/java/org/apache/spark/network/util/JavaUtils.java index 90dddc2cb08c1..3482c6addfee3 100644 --- a/common/utils/src/main/java/org/apache/spark/network/util/JavaUtils.java +++ b/common/utils/src/main/java/org/apache/spark/network/util/JavaUtils.java @@ -22,6 +22,7 @@ import java.nio.channels.ReadableByteChannel; import java.nio.charset.StandardCharsets; import java.nio.file.Files; +import java.nio.file.LinkOption; import java.nio.file.attribute.BasicFileAttributes; import java.util.*; import java.util.concurrent.TimeUnit; @@ -125,10 +126,11 @@ public static void deleteRecursively(File file, FilenameFilter filter) throws IO private static void deleteRecursivelyUsingJavaIO( File file, FilenameFilter filter) throws IOException { - if (!file.exists()) return; - BasicFileAttributes fileAttributes = - Files.readAttributes(file.toPath(), BasicFileAttributes.class); - if (fileAttributes.isDirectory() && !isSymlink(file)) { + BasicFileAttributes fileAttributes = readFileAttributes(file); + // SPARK-50716: If the file attributes are null, that is, the file attributes cannot be read, + // or if the file does not exist and is not a broken symbolic link, then return directly. 
+ if (fileAttributes == null || (!file.exists() && !fileAttributes.isSymbolicLink())) return; + if (fileAttributes.isDirectory()) { IOException savedIOException = null; for (File child : listFilesSafely(file, filter)) { try { @@ -143,8 +145,8 @@ private static void deleteRecursivelyUsingJavaIO( } } - // Delete file only when it's a normal file or an empty directory. - if (fileAttributes.isRegularFile() || + // Delete file only when it's a normal file, a symbolic link, or an empty directory. + if (fileAttributes.isRegularFile() || fileAttributes.isSymbolicLink() || (fileAttributes.isDirectory() && listFilesSafely(file, null).length == 0)) { boolean deleted = file.delete(); // Delete can also fail if the file simply did not exist. @@ -154,6 +156,18 @@ private static void deleteRecursivelyUsingJavaIO( } } + /** + * Reads basic attributes of a given file, of return null if an I/O error occurs. + */ + private static BasicFileAttributes readFileAttributes(File file) { + try { + return Files.readAttributes( + file.toPath(), BasicFileAttributes.class, LinkOption.NOFOLLOW_LINKS); + } catch (IOException e) { + return null; + } + } + private static void deleteRecursivelyUsingUnixNative(File file) throws IOException { ProcessBuilder builder = new ProcessBuilder("rm", "-rf", file.getAbsolutePath()); Process process = null; @@ -192,17 +206,6 @@ private static File[] listFilesSafely(File file, FilenameFilter filter) throws I } } - private static boolean isSymlink(File file) throws IOException { - Objects.requireNonNull(file); - File fileInCanonicalDir = null; - if (file.getParent() == null) { - fileInCanonicalDir = file; - } else { - fileInCanonicalDir = new File(file.getParentFile().getCanonicalFile(), file.getName()); - } - return !fileInCanonicalDir.getCanonicalFile().equals(fileInCanonicalDir.getAbsoluteFile()); - } - private static final Map timeSuffixes; private static final Map byteSuffixes; diff --git a/common/utils/src/main/resources/error/error-conditions.json 
b/common/utils/src/main/resources/error/error-conditions.json index 94513cca1023f..44d69b6675937 100644 --- a/common/utils/src/main/resources/error/error-conditions.json +++ b/common/utils/src/main/resources/error/error-conditions.json @@ -70,6 +70,12 @@ ], "sqlState" : "42000" }, + "AMBIGUOUS_RESOLVER_EXTENSION" : { + "message" : [ + "The single-pass analyzer cannot process this query or command because the extension choice for is ambiguous: ." + ], + "sqlState" : "XX000" + }, "ARITHMETIC_OVERFLOW" : { "message" : [ ". If necessary set to \"false\" to bypass this error." @@ -233,6 +239,11 @@ "An error occurred during loading state." ], "subClass" : { + "CANNOT_FIND_BASE_SNAPSHOT_CHECKPOINT" : { + "message" : [ + "Cannot find a base snapshot checkpoint with lineage: ." + ] + }, "CANNOT_READ_CHECKPOINT" : { "message" : [ "Cannot read RocksDB checkpoint metadata. Expected , but found ." @@ -275,7 +286,7 @@ }, "INVALID_CHANGE_LOG_READER_VERSION" : { "message" : [ - "The change log reader version cannot be ." + "The change log reader version cannot be . The checkpoint probably is from a future Spark version, please upgrade your Spark." ] }, "INVALID_CHANGE_LOG_WRITER_VERSION" : { @@ -366,7 +377,7 @@ }, "CANNOT_PARSE_TIMESTAMP" : { "message" : [ - ". If necessary set to \"false\" to bypass this error." + ". Use to tolerate invalid input string and return NULL instead." ], "sqlState" : "22007" }, @@ -741,12 +752,36 @@ }, "sqlState" : "56K00" }, + "CONNECT_ML" : { + "message" : [ + "Generic Spark Connect ML error." + ], + "subClass" : { + "ATTRIBUTE_NOT_ALLOWED" : { + "message" : [ + " is not allowed to be accessed." + ] + }, + "UNSUPPORTED_EXCEPTION" : { + "message" : [ + "" + ] + } + }, + "sqlState" : "XX000" + }, "CONVERSION_INVALID_INPUT" : { "message" : [ "The value () cannot be converted to because it is malformed. Correct the value as per the syntax, or change its format. Use to tolerate malformed input and return NULL instead." 
], "sqlState" : "22018" }, + "CORRUPTED_CATALOG_FUNCTION" : { + "message" : [ + "Cannot convert the catalog function '' into a SQL function due to corrupted function information in catalog. If the function is not a SQL function, please make sure the class name '' is loadable." + ], + "sqlState" : "0A000" + }, "CREATE_PERMANENT_VIEW_WITHOUT_ALIAS" : { "message" : [ "Not allowed to create the permanent view without explicitly assigning an alias for the expression ." @@ -1144,6 +1179,13 @@ ], "sqlState" : "42623" }, + "DESCRIBE_JSON_NOT_EXTENDED" : { + "message" : [ + "DESCRIBE TABLE ... AS JSON only supported when [EXTENDED|FORMATTED] is specified.", + "For example: DESCRIBE EXTENDED AS JSON is supported but DESCRIBE AS JSON is not." + ], + "sqlState" : "0A000" + }, "DISTINCT_WINDOW_FUNCTION_UNSUPPORTED" : { "message" : [ "Distinct window functions are not supported: ." @@ -1218,6 +1260,18 @@ }, "sqlState" : "4274K" }, + "DUPLICATE_ROUTINE_PARAMETER_NAMES" : { + "message" : [ + "Found duplicate name(s) in the parameter list of the user-defined routine : ." + ], + "sqlState" : "42734" + }, + "DUPLICATE_ROUTINE_RETURNS_COLUMNS" : { + "message" : [ + "Found duplicate column(s) in the RETURNS clause column list of the user-defined routine : ." + ], + "sqlState" : "42711" + }, "EMITTING_ROWS_OLDER_THAN_WATERMARK_NOT_ALLOWED" : { "message" : [ "Previous node emitted a row with eventTime= which is older than current_watermark_value=", @@ -1476,6 +1530,11 @@ "message" : [ "Data type mismatches when reading Parquet column . Expected Spark type , actual Parquet type ." ] + }, + "UNSUPPORTED_FILE_SYSTEM" : { + "message" : [ + "The file system hasn't implemented ." + ] } }, "sqlState" : "KD001" @@ -1649,6 +1708,39 @@ ], "sqlState" : "22000" }, + "HYBRID_ANALYZER_EXCEPTION" : { + "message" : [ + "An failure occurred when attempting to resolve a query or command with both the legacy fixed-point analyzer as well as the single-pass resolver." 
+ ], + "subClass" : { + "FIXED_POINT_FAILED_SINGLE_PASS_SUCCEEDED" : { + "message" : [ + "Fixed-point resolution failed, but single-pass resolution succeeded.", + "Single-pass analyzer output:", + "" + ] + }, + "LOGICAL_PLAN_COMPARISON_MISMATCH" : { + "message" : [ + "Outputs of fixed-point and single-pass analyzers do not match.", + "Fixed-point analyzer output:", + "", + "Single-pass analyzer output:", + "" + ] + }, + "OUTPUT_SCHEMA_COMPARISON_MISMATCH" : { + "message" : [ + "Output schemas of fixed-point and single-pass analyzers do not match.", + "Fixed-point analyzer output schema:", + "", + "Single-pass analyzer output schema:", + "" + ] + } + }, + "sqlState" : "XX000" + }, "IDENTIFIER_TOO_MANY_NAME_PARTS" : { "message" : [ " is not a valid identifier as it has more than 2 name parts." @@ -2082,13 +2174,13 @@ }, "INVALID_ARRAY_INDEX" : { "message" : [ - "The index is out of bounds. The array has elements. Use the SQL function `get()` to tolerate accessing element at invalid index and return NULL instead. If necessary set to \"false\" to bypass this error." + "The index is out of bounds. The array has elements. Use the SQL function `get()` to tolerate accessing element at invalid index and return NULL instead." ], "sqlState" : "22003" }, "INVALID_ARRAY_INDEX_IN_ELEMENT_AT" : { "message" : [ - "The index is out of bounds. The array has elements. Use `try_element_at` to tolerate accessing element at invalid index and return NULL instead. If necessary set to \"false\" to bypass this error." + "The index is out of bounds. The array has elements. Use `try_element_at` to tolerate accessing element at invalid index and return NULL instead." ], "sqlState" : "22003" }, @@ -2627,29 +2719,6 @@ ], "sqlState" : "22006" }, - "INVALID_INVERSE_DISTRIBUTION_FUNCTION" : { - "message" : [ - "Invalid inverse distribution function ." - ], - "subClass" : { - "DISTINCT_UNSUPPORTED" : { - "message" : [ - "Cannot use DISTINCT with WITHIN GROUP." 
- ] - }, - "WITHIN_GROUP_MISSING" : { - "message" : [ - "WITHIN GROUP is required for inverse distribution function." - ] - }, - "WRONG_NUM_ORDERINGS" : { - "message" : [ - "Requires orderings in WITHIN GROUP but got ." - ] - } - }, - "sqlState" : "42K0K" - }, "INVALID_JAVA_IDENTIFIER_AS_FIELD_NAME" : { "message" : [ " is not a valid identifier of Java and cannot be used as field name", @@ -2713,6 +2782,11 @@ "message" : [ "ITERATE statement cannot be used with a label that belongs to a compound (BEGIN...END) body." ] + }, + "QUALIFIED_LABEL_NAME" : { + "message" : [ + "Label cannot be qualified." + ] } }, "sqlState" : "42K0L" @@ -2908,6 +2982,11 @@ "Unsupported dtype: . Valid values: float64, float32." ] }, + "EXTENSION" : { + "message" : [ + "Invalid extension: . Extension is limited to exactly 3 letters (e.g. csv, tsv, etc...)" + ] + }, "INTEGER" : { "message" : [ "expects an integer literal, but got ." @@ -3070,6 +3149,13 @@ ], "sqlState" : "42K08" }, + "INVALID_SQL_FUNCTION_PLAN_STRUCTURE" : { + "message" : [ + "Invalid SQL function plan structure", + "" + ], + "sqlState" : "XXKD0" + }, "INVALID_SQL_SYNTAX" : { "message" : [ "Invalid SQL syntax:" @@ -3364,6 +3450,34 @@ ], "sqlState" : "42601" }, + "INVALID_WITHIN_GROUP_EXPRESSION" : { + "message" : [ + "Invalid function with WITHIN GROUP." + ], + "subClass" : { + "DISTINCT_UNSUPPORTED" : { + "message" : [ + "The function does not support DISTINCT with WITHIN GROUP." + ] + }, + "MISMATCH_WITH_DISTINCT_INPUT" : { + "message" : [ + "The function is invoked with DISTINCT and WITHIN GROUP but expressions and do not match. The WITHIN GROUP ordering expression must be picked from the function inputs." + ] + }, + "WITHIN_GROUP_MISSING" : { + "message" : [ + "WITHIN GROUP is required for the function." + ] + }, + "WRONG_NUM_ORDERINGS" : { + "message" : [ + "The function requires orderings in WITHIN GROUP but got ." 
+ ] + } + }, + "sqlState" : "42K0K" + }, "INVALID_WRITER_COMMIT_MESSAGE" : { "message" : [ "The data source writer has generated an invalid number of commit messages. Expected exactly one writer commit message from each task, but received ." @@ -3564,6 +3678,12 @@ ], "sqlState" : "42710" }, + "MULTI_ALIAS_WITHOUT_GENERATOR" : { + "message" : [ + "Multi part aliasing () is not supported with as it is not a generator function." + ], + "sqlState" : "42K0E" + }, "MULTI_SOURCES_UNSUPPORTED_FOR_EXPRESSION" : { "message" : [ "The expression does not support more than one source." @@ -4086,6 +4206,18 @@ ], "sqlState" : "38000" }, + "RECURSIVE_CTE_IN_LEGACY_MODE" : { + "message" : [ + "Recursive definitions cannot be used in legacy CTE precedence mode (spark.sql.legacy.ctePrecedencePolicy=LEGACY)." + ], + "sqlState" : "42836" + }, + "RECURSIVE_CTE_WHEN_INLINING_IS_FORCED" : { + "message" : [ + "Recursive definitions cannot be used when CTE inlining is forced." + ], + "sqlState" : "42836" + }, "RECURSIVE_PROTOBUF_SCHEMA" : { "message" : [ "Found recursive reference in Protobuf schema, which can not be processed by Spark by default: . try setting the option `recursive.fields.max.depth` 1 to 10. Going beyond 10 levels of recursion is not allowed." @@ -4623,6 +4755,12 @@ ], "sqlState" : "42P01" }, + "TABLE_VALUED_ARGUMENTS_NOT_YET_IMPLEMENTED_FOR_SQL_FUNCTIONS" : { + "message" : [ + "Cannot SQL user-defined function with TABLE arguments because this functionality is not yet implemented." + ], + "sqlState" : "0A000" + }, "TABLE_VALUED_FUNCTION_FAILED_TO_ANALYZE_IN_PYTHON" : { "message" : [ "Failed to analyze the Python user defined table function: " @@ -4748,12 +4886,6 @@ ], "sqlState" : "42KD9" }, - "UNANALYZABLE_EXPRESSION" : { - "message" : [ - "The plan contains an unanalyzable expression that holds the analysis." - ], - "sqlState" : "03000" - }, "UNBOUND_SQL_PARAMETER" : { "message" : [ "Found the unbound parameter: . 
Please, fix `args` and provide a mapping of the parameter to either a SQL literal or collection constructor functions such as `map()`, `array()`, `struct()`." @@ -5036,11 +5168,6 @@ "message" : [ "Access to the SparkContext." ] - }, - "SESSION_SQL_CONTEXT" : { - "message" : [ - "Access to the SparkSession SQL Context." - ] } }, "sqlState" : "0A000" @@ -5189,6 +5316,11 @@ "The SQL pipe operator syntax using |> does not support ." ] }, + "COLLATIONS_IN_MAP_KEYS" : { + "message" : [ + "Collated strings for keys of maps" + ] + }, "COMBINATION_QUERY_RESULT_CLAUSES" : { "message" : [ "Combination of ORDER BY/SORT BY/DISTRIBUTE BY/CLUSTER BY." @@ -5199,6 +5331,11 @@ "Attach a comment to the namespace ." ] }, + "DESC_TABLE_COLUMN_JSON" : { + "message" : [ + "DESC TABLE COLUMN AS JSON not supported for individual columns." + ] + }, "DESC_TABLE_COLUMN_PARTITION" : { "message" : [ "DESC TABLE COLUMN for a specific partition." @@ -5244,6 +5381,11 @@ "Referencing lateral column alias in the aggregate query both with window expressions and with having clause. Please rewrite the aggregate query by removing the having clause or removing lateral alias reference in the SELECT list." ] }, + "LATERAL_COLUMN_ALIAS_IN_GENERATOR" : { + "message" : [ + "Referencing a lateral column alias in generator expression ." + ] + }, "LATERAL_COLUMN_ALIAS_IN_GROUP_BY" : { "message" : [ "Referencing a lateral column alias via GROUP BY alias/ALL is not supported yet." @@ -5274,6 +5416,11 @@ "The target JDBC server hosting table does not support ALTER TABLE with multiple actions. Split the ALTER TABLE up into individual actions to avoid this error." ] }, + "OBJECT_LEVEL_COLLATIONS" : { + "message" : [ + "Default collation for the specified object." + ] + }, "ORC_TYPE_CAST" : { "message" : [ "Unable to convert of Orc to data type ." @@ -5294,6 +5441,11 @@ "Parameter markers are not allowed in ." 
] }, + "PARTITION_BY_VARIANT" : { + "message" : [ + "Cannot use VARIANT producing expressions to partition a DataFrame, but the type of expression is ." + ] + }, "PARTITION_WITH_NESTED_COLUMN_IS_UNSUPPORTED" : { "message" : [ "Invalid partitioning: is missing or is in a map or array." @@ -5333,12 +5485,12 @@ "message" : [ "Queries from raw JSON/CSV/XML files are disallowed when the", "referenced columns only include the internal corrupt record column", - "(named _corrupt_record by default). For example:", - "spark.read.schema(schema).json(file).filter($\"_corrupt_record\".isNotNull).count()", - "and spark.read.schema(schema).json(file).select(\"_corrupt_record\").show().", + "(named `_corrupt_record` by default). For example:", + "`spark.read.schema(schema).json(file).filter($\"_corrupt_record\".isNotNull).count()`", + "and `spark.read.schema(schema).json(file).select(\"_corrupt_record\").show()`.", "Instead, you can cache or save the parsed results and then send the same query.", - "For example, val df = spark.read.schema(schema).json(file).cache() and then", - "df.filter($\"_corrupt_record\".isNotNull).count()." + "For example, `val df = spark.read.schema(schema).json(file).cache()` and then", + "`df.filter($\"_corrupt_record\".isNotNull).count()`." ] }, "REMOVE_NAMESPACE_COMMENT" : { @@ -5361,6 +5513,11 @@ "Cannot have MAP type columns in DataFrame which calls set operations (INTERSECT, EXCEPT, etc.), but the type of column is ." ] }, + "SET_OPERATION_ON_VARIANT_TYPE" : { + "message" : [ + "Cannot have VARIANT type columns in DataFrame which calls set operations (INTERSECT, EXCEPT, etc.), but the type of column is ." + ] + }, "SET_PROPERTIES_AND_DBPROPERTIES" : { "message" : [ "set PROPERTIES and DBPROPERTIES at the same time." @@ -5381,6 +5538,11 @@ "SQL Scripting is under development and not all features are supported. SQL Scripting enables users to write procedural SQL including control flow and error handling. To enable existing features set to `true`." 
] }, + "SQL_SCRIPTING_WITH_POSITIONAL_PARAMETERS" : { + "message" : [ + "Positional parameters are not supported with SQL Scripting." + ] + }, "STATE_STORE_MULTIPLE_COLUMN_FAMILIES" : { "message" : [ "Creating multiple column families with is not supported." @@ -5619,6 +5781,18 @@ }, "sqlState" : "0A000" }, + "UNSUPPORTED_SINGLE_PASS_ANALYZER_FEATURE" : { + "message" : [ + "The single-pass analyzer cannot process this query or command because it does not yet support ." + ], + "sqlState" : "0A000" + }, + "UNSUPPORTED_SQL_UDF_USAGE" : { + "message" : [ + "Using SQL function in is not supported." + ], + "sqlState" : "0A000" + }, "UNSUPPORTED_STREAMING_OPERATOR_WITHOUT_WATERMARK" : { "message" : [ " output mode not supported for on streaming DataFrames/DataSets without watermark." @@ -5735,6 +5909,59 @@ ], "sqlState" : "42K0E" }, + "USER_DEFINED_FUNCTIONS" : { + "message" : [ + "User defined function is invalid:" + ], + "subClass" : { + "CANNOT_CONTAIN_COMPLEX_FUNCTIONS" : { + "message" : [ + "SQL scalar function cannot contain aggregate/window/generate functions: " + ] + }, + "CANNOT_REPLACE_NON_SQL_UDF_WITH_SQL_UDF" : { + "message" : [ + "Cannot replace the non-SQL function with a SQL function." + ] + }, + "NOT_A_VALID_DEFAULT_EXPRESSION" : { + "message" : [ + "The DEFAULT expression of ``.`` is not supported because it contains a subquery." + ] + }, + "NOT_A_VALID_DEFAULT_PARAMETER_POSITION" : { + "message" : [ + "In routine `` parameter `` with DEFAULT must not be followed by parameter `` without DEFAULT." + ] + }, + "NOT_NULL_ON_FUNCTION_PARAMETERS" : { + "message" : [ + "Cannot specify NOT NULL on function parameters: " + ] + }, + "RETURN_COLUMN_COUNT_MISMATCH" : { + "message" : [ + "The number of columns produced by the RETURN clause (num: ``) does not match the number of column names specified by the RETURNS clause (num: ``) of ." 
+ ] + }, + "ROUTINE_PROPERTY_TOO_LARGE" : { + "message" : [ + "Cannot convert user defined routine to catalog function: routine properties are too large." + ] + }, + "SQL_TABLE_UDF_BODY_MUST_BE_A_QUERY" : { + "message" : [ + "SQL table function body must be a query." + ] + }, + "SQL_TABLE_UDF_MISSING_COLUMN_NAMES" : { + "message" : [ + "The relation returned by the query in the CREATE FUNCTION statement for with RETURNS TABLE clause lacks explicit names for one or more output columns; please rewrite the function body to provide explicit column names or add column names to the RETURNS TABLE clause, and re-run the command." + ] + } + }, + "sqlState" : "42601" + }, "USER_RAISED_EXCEPTION" : { "message" : [ "" @@ -5855,7 +6082,7 @@ }, "XML_ROW_TAG_MISSING" : { "message" : [ - " option is required for reading files in XML format." + " option is required for reading/writing files in XML format." ], "sqlState" : "42KDF" }, @@ -6655,11 +6882,6 @@ "Decimal scale () cannot be greater than precision ()." ] }, - "_LEGACY_ERROR_TEMP_1231" : { - "message" : [ - " is not a valid partition column in table ." - ] - }, "_LEGACY_ERROR_TEMP_1232" : { "message" : [ "Partition spec is invalid. The spec () must match the partition spec () defined in table ''." 
diff --git a/common/utils/src/main/resources/org/apache/spark/log4j2-defaults.properties b/common/utils/src/main/resources/org/apache/spark/log4j2-defaults.properties index 9be86b650d091..777c5f2b25915 100644 --- a/common/utils/src/main/resources/org/apache/spark/log4j2-defaults.properties +++ b/common/utils/src/main/resources/org/apache/spark/log4j2-defaults.properties @@ -22,8 +22,8 @@ rootLogger.appenderRef.stdout.ref = console appender.console.type = Console appender.console.name = console appender.console.target = SYSTEM_ERR -appender.console.layout.type = JsonTemplateLayout -appender.console.layout.eventTemplateUri = classpath:org/apache/spark/SparkLayout.json +appender.console.layout.type = PatternLayout +appender.console.layout.pattern = %d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n%ex # Settings to quiet third party logs that are too verbose logger.jetty.name = org.sparkproject.jetty diff --git a/common/utils/src/main/resources/org/apache/spark/log4j2-pattern-layout-defaults.properties b/common/utils/src/main/resources/org/apache/spark/log4j2-json-layout.properties similarity index 94% rename from common/utils/src/main/resources/org/apache/spark/log4j2-pattern-layout-defaults.properties rename to common/utils/src/main/resources/org/apache/spark/log4j2-json-layout.properties index 777c5f2b25915..9be86b650d091 100644 --- a/common/utils/src/main/resources/org/apache/spark/log4j2-pattern-layout-defaults.properties +++ b/common/utils/src/main/resources/org/apache/spark/log4j2-json-layout.properties @@ -22,8 +22,8 @@ rootLogger.appenderRef.stdout.ref = console appender.console.type = Console appender.console.name = console appender.console.target = SYSTEM_ERR -appender.console.layout.type = PatternLayout -appender.console.layout.pattern = %d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n%ex +appender.console.layout.type = JsonTemplateLayout +appender.console.layout.eventTemplateUri = classpath:org/apache/spark/SparkLayout.json # Settings to quiet third party logs that are too verbose 
logger.jetty.name = org.sparkproject.jetty diff --git a/common/utils/src/main/scala/org/apache/spark/internal/LogKey.scala b/common/utils/src/main/scala/org/apache/spark/internal/LogKey.scala index c365797cec690..c3a1af68d1c82 100644 --- a/common/utils/src/main/scala/org/apache/spark/internal/LogKey.scala +++ b/common/utils/src/main/scala/org/apache/spark/internal/LogKey.scala @@ -94,6 +94,7 @@ private[spark] object LogKeys { case object BATCH_TIMESTAMP extends LogKey case object BATCH_WRITE extends LogKey case object BIND_ADDRESS extends LogKey + case object BLOCK_GENERATOR_STATUS extends LogKey case object BLOCK_ID extends LogKey case object BLOCK_IDS extends LogKey case object BLOCK_MANAGER_ID extends LogKey @@ -241,6 +242,8 @@ private[spark] object LogKeys { case object EXECUTOR_ID extends LogKey case object EXECUTOR_IDS extends LogKey case object EXECUTOR_LAUNCH_COMMANDS extends LogKey + case object EXECUTOR_MEMORY_OFFHEAP extends LogKey + case object EXECUTOR_MEMORY_OVERHEAD_SIZE extends LogKey case object EXECUTOR_MEMORY_SIZE extends LogKey case object EXECUTOR_RESOURCES extends LogKey case object EXECUTOR_SHUFFLE_INFO extends LogKey @@ -348,9 +351,12 @@ private[spark] object LogKeys { case object KEYTAB extends LogKey case object KEYTAB_FILE extends LogKey case object KILL_EXECUTORS extends LogKey + case object KINESIS_REASON extends LogKey case object LABEL_COLUMN extends LogKey case object LARGEST_CLUSTER_INDEX extends LogKey case object LAST_ACCESS_TIME extends LogKey + case object LAST_COMMITTED_CHECKPOINT_ID extends LogKey + case object LAST_COMMIT_BASED_CHECKPOINT_ID extends LogKey case object LAST_VALID_TIME extends LogKey case object LATEST_BATCH_ID extends LogKey case object LATEST_COMMITTED_BATCH_ID extends LogKey @@ -359,8 +365,10 @@ private[spark] object LogKeys { case object LEFT_EXPR extends LogKey case object LEFT_LOGICAL_PLAN_STATS_SIZE_IN_BYTES extends LogKey case object LINE extends LogKey + case object LINEAGE extends LogKey case object 
LINE_NUM extends LogKey case object LISTENER extends LogKey + case object LOADED_CHECKPOINT_ID extends LogKey case object LOADED_VERSION extends LogKey case object LOAD_FACTOR extends LogKey case object LOAD_TIME extends LogKey @@ -542,7 +550,7 @@ private[spark] object LogKeys { case object NUM_RULE_OF_RUNS extends LogKey case object NUM_SEQUENCES extends LogKey case object NUM_SLOTS extends LogKey - case object NUM_SPILL_INFOS extends LogKey + case object NUM_SPILLS extends LogKey case object NUM_SPILL_WRITERS extends LogKey case object NUM_SUB_DIRS extends LogKey case object NUM_SUCCESSFUL_TASKS extends LogKey @@ -695,6 +703,7 @@ private[spark] object LogKeys { case object RULE_EXECUTOR_NAME extends LogKey case object RULE_NAME extends LogKey case object RUN_ID extends LogKey + case object RUN_ID_STRING extends LogKey case object SCALA_VERSION extends LogKey case object SCALING_DOWN_RATIO extends LogKey case object SCALING_UP_RATIO extends LogKey @@ -717,6 +726,7 @@ private[spark] object LogKeys { case object SHUFFLE_DB_BACKEND_KEY extends LogKey case object SHUFFLE_DB_BACKEND_NAME extends LogKey case object SHUFFLE_ID extends LogKey + case object SHUFFLE_IDS extends LogKey case object SHUFFLE_MERGE_ID extends LogKey case object SHUFFLE_MERGE_RECOVERY_FILE extends LogKey case object SHUFFLE_SERVICE_CONF_OVERLAY_URL extends LogKey @@ -747,14 +757,18 @@ private[spark] object LogKeys { case object STAGE extends LogKey case object STAGES extends LogKey case object STAGE_ATTEMPT extends LogKey + case object STAGE_ATTEMPT_ID extends LogKey case object STAGE_ID extends LogKey case object STAGE_NAME extends LogKey case object START_INDEX extends LogKey case object START_TIME extends LogKey case object STATEMENT_ID extends LogKey case object STATE_NAME extends LogKey + case object STATE_STORE_COORDINATOR extends LogKey case object STATE_STORE_ID extends LogKey case object STATE_STORE_PROVIDER extends LogKey + case object STATE_STORE_PROVIDER_ID extends LogKey + case 
object STATE_STORE_PROVIDER_IDS extends LogKey case object STATE_STORE_VERSION extends LogKey case object STATS extends LogKey case object STATUS extends LogKey @@ -878,6 +892,7 @@ private[spark] object LogKeys { case object WRITE_JOB_UUID extends LogKey case object XML_SCHEDULING_MODE extends LogKey case object XSD_PATH extends LogKey + case object YARN_RESOURCE extends LogKey case object YOUNG_GENERATION_GC extends LogKey case object ZERO_TIME extends LogKey } diff --git a/common/utils/src/main/scala/org/apache/spark/internal/Logging.scala b/common/utils/src/main/scala/org/apache/spark/internal/Logging.scala index 7471b764bd2b3..4b60cb20f0732 100644 --- a/common/utils/src/main/scala/org/apache/spark/internal/Logging.scala +++ b/common/utils/src/main/scala/org/apache/spark/internal/Logging.scala @@ -337,9 +337,9 @@ trait Logging { if (Logging.defaultSparkLog4jConfig || Logging.islog4j2DefaultConfigured()) { Logging.defaultSparkLog4jConfig = true val defaultLogProps = if (Logging.isStructuredLoggingEnabled) { - "org/apache/spark/log4j2-defaults.properties" + "org/apache/spark/log4j2-json-layout.properties" } else { - "org/apache/spark/log4j2-pattern-layout-defaults.properties" + "org/apache/spark/log4j2-defaults.properties" } Option(SparkClassUtils.getSparkClassLoader.getResource(defaultLogProps)) match { case Some(url) => @@ -398,7 +398,7 @@ private[spark] object Logging { @volatile private var initialized = false @volatile private var defaultRootLevel: Level = null @volatile private var defaultSparkLog4jConfig = false - @volatile private var structuredLoggingEnabled = true + @volatile private var structuredLoggingEnabled = false @volatile private[spark] var sparkShellThresholdLevel: Level = null @volatile private[spark] var setLogLevelPrinted: Boolean = false diff --git a/core/src/main/scala/org/apache/spark/internal/config/ConfigBuilder.scala b/common/utils/src/main/scala/org/apache/spark/internal/config/ConfigBuilder.scala similarity index 99% rename from 
core/src/main/scala/org/apache/spark/internal/config/ConfigBuilder.scala rename to common/utils/src/main/scala/org/apache/spark/internal/config/ConfigBuilder.scala index f50cc0f88842a..d3e975d1782f0 100644 --- a/core/src/main/scala/org/apache/spark/internal/config/ConfigBuilder.scala +++ b/common/utils/src/main/scala/org/apache/spark/internal/config/ConfigBuilder.scala @@ -24,7 +24,7 @@ import scala.util.matching.Regex import org.apache.spark.SparkIllegalArgumentException import org.apache.spark.network.util.{ByteUnit, JavaUtils} -import org.apache.spark.util.Utils +import org.apache.spark.util.SparkStringUtils private object ConfigHelpers { @@ -47,7 +47,7 @@ private object ConfigHelpers { } def stringToSeq[T](str: String, converter: String => T): Seq[T] = { - Utils.stringToSeq(str).map(converter) + SparkStringUtils.stringToSeq(str).map(converter) } def seqToString[T](v: Seq[T], stringConverter: T => String): String = { diff --git a/core/src/main/scala/org/apache/spark/internal/config/ConfigEntry.scala b/common/utils/src/main/scala/org/apache/spark/internal/config/ConfigEntry.scala similarity index 100% rename from core/src/main/scala/org/apache/spark/internal/config/ConfigEntry.scala rename to common/utils/src/main/scala/org/apache/spark/internal/config/ConfigEntry.scala diff --git a/core/src/main/scala/org/apache/spark/internal/config/ConfigProvider.scala b/common/utils/src/main/scala/org/apache/spark/internal/config/ConfigProvider.scala similarity index 78% rename from core/src/main/scala/org/apache/spark/internal/config/ConfigProvider.scala rename to common/utils/src/main/scala/org/apache/spark/internal/config/ConfigProvider.scala index 392f9d56e7f51..fef019ef1f560 100644 --- a/core/src/main/scala/org/apache/spark/internal/config/ConfigProvider.scala +++ b/common/utils/src/main/scala/org/apache/spark/internal/config/ConfigProvider.scala @@ -19,8 +19,6 @@ package org.apache.spark.internal.config import java.util.{Map => JMap} -import org.apache.spark.SparkConf - 
/** * A source of configuration values. */ @@ -47,18 +45,3 @@ private[spark] class MapProvider(conf: JMap[String, String]) extends ConfigProvi override def get(key: String): Option[String] = Option(conf.get(key)) } - -/** - * A config provider that only reads Spark config keys. - */ -private[spark] class SparkConfigProvider(conf: JMap[String, String]) extends ConfigProvider { - - override def get(key: String): Option[String] = { - if (key.startsWith("spark.")) { - Option(conf.get(key)).orElse(SparkConf.getDeprecatedConfig(key, conf)) - } else { - None - } - } - -} diff --git a/core/src/main/scala/org/apache/spark/internal/config/ConfigReader.scala b/common/utils/src/main/scala/org/apache/spark/internal/config/ConfigReader.scala similarity index 100% rename from core/src/main/scala/org/apache/spark/internal/config/ConfigReader.scala rename to common/utils/src/main/scala/org/apache/spark/internal/config/ConfigReader.scala diff --git a/common/utils/src/main/scala/org/apache/spark/util/SparkStringUtils.scala b/common/utils/src/main/scala/org/apache/spark/util/SparkStringUtils.scala new file mode 100644 index 0000000000000..6915f373b84e5 --- /dev/null +++ b/common/utils/src/main/scala/org/apache/spark/util/SparkStringUtils.scala @@ -0,0 +1,26 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.util + +trait SparkStringUtils { + def stringToSeq(str: String): Seq[String] = { + import org.apache.spark.util.ArrayImplicits._ + str.split(",").map(_.trim()).filter(_.nonEmpty).toImmutableArraySeq + } +} + +object SparkStringUtils extends SparkStringUtils diff --git a/common/utils/src/test/java/org/apache/spark/util/StructuredSparkLoggerSuite.java b/common/utils/src/test/java/org/apache/spark/util/StructuredSparkLoggerSuite.java index 6959fe11820ff..1fab167adfeb0 100644 --- a/common/utils/src/test/java/org/apache/spark/util/StructuredSparkLoggerSuite.java +++ b/common/utils/src/test/java/org/apache/spark/util/StructuredSparkLoggerSuite.java @@ -21,11 +21,27 @@ import com.fasterxml.jackson.databind.ObjectMapper; import org.apache.logging.log4j.Level; +import org.junit.jupiter.api.AfterAll; +import org.junit.jupiter.api.BeforeAll; + +import org.apache.spark.internal.Logging$; import org.apache.spark.internal.SparkLogger; import org.apache.spark.internal.SparkLoggerFactory; public class StructuredSparkLoggerSuite extends SparkLoggerSuiteBase { + // Enable Structured Logging before running the tests + @BeforeAll + public static void setup() { + Logging$.MODULE$.enableStructuredLogging(); + } + + // Disable Structured Logging after running the tests + @AfterAll + public static void teardown() { + Logging$.MODULE$.disableStructuredLogging(); + } + private static final SparkLogger LOGGER = SparkLoggerFactory.getLogger(StructuredSparkLoggerSuite.class); diff --git a/common/utils/src/test/scala/org/apache/spark/util/MDCSuite.scala b/common/utils/src/test/scala/org/apache/spark/util/MDCSuite.scala index 7631c25662219..9615eb2263636 100644 --- a/common/utils/src/test/scala/org/apache/spark/util/MDCSuite.scala +++ b/common/utils/src/test/scala/org/apache/spark/util/MDCSuite.scala @@ -19,6 +19,7 @@ package org.apache.spark.util import 
scala.jdk.CollectionConverters._ +import org.scalatest.BeforeAndAfterAll import org.scalatest.funsuite.AnyFunSuite // scalastyle:ignore funsuite import org.apache.spark.internal.{Logging, MDC} @@ -26,7 +27,16 @@ import org.apache.spark.internal.LogKeys.{EXIT_CODE, OFFSET, RANGE} class MDCSuite extends AnyFunSuite // scalastyle:ignore funsuite - with Logging { + with Logging + with BeforeAndAfterAll { + + override def beforeAll(): Unit = { + Logging.enableStructuredLogging() + } + + override def afterAll(): Unit = { + Logging.disableStructuredLogging() + } test("check MDC message") { val log = log"This is a log, exitcode ${MDC(EXIT_CODE, 10086)}" diff --git a/common/utils/src/test/scala/org/apache/spark/util/PatternLoggingSuite.scala b/common/utils/src/test/scala/org/apache/spark/util/PatternLoggingSuite.scala index 2ba2b15c49f33..248136798b362 100644 --- a/common/utils/src/test/scala/org/apache/spark/util/PatternLoggingSuite.scala +++ b/common/utils/src/test/scala/org/apache/spark/util/PatternLoggingSuite.scala @@ -17,19 +17,16 @@ package org.apache.spark.util import org.apache.logging.log4j.Level -import org.scalatest.BeforeAndAfterAll import org.apache.spark.internal.Logging -class PatternLoggingSuite extends LoggingSuiteBase with BeforeAndAfterAll { +class PatternLoggingSuite extends LoggingSuiteBase { override def className: String = classOf[PatternLoggingSuite].getSimpleName override def logFilePath: String = "target/pattern.log" override def beforeAll(): Unit = Logging.disableStructuredLogging() - override def afterAll(): Unit = Logging.enableStructuredLogging() - override def expectedPatternForBasicMsg(level: Level): String = { s""".*$level $className: This is a log message\n""" } diff --git a/common/utils/src/test/scala/org/apache/spark/util/StructuredLoggingSuite.scala b/common/utils/src/test/scala/org/apache/spark/util/StructuredLoggingSuite.scala index 48951c2084f17..0026b696f0695 100644 --- 
a/common/utils/src/test/scala/org/apache/spark/util/StructuredLoggingSuite.scala +++ b/common/utils/src/test/scala/org/apache/spark/util/StructuredLoggingSuite.scala @@ -23,14 +23,21 @@ import java.nio.file.Files import com.fasterxml.jackson.databind.ObjectMapper import com.fasterxml.jackson.module.scala.DefaultScalaModule import org.apache.logging.log4j.Level +import org.scalatest.BeforeAndAfterAll import org.scalatest.funsuite.AnyFunSuite // scalastyle:ignore funsuite import org.apache.spark.internal.{LogEntry, Logging, LogKey, LogKeys, MDC, MessageWithContext} trait LoggingSuiteBase extends AnyFunSuite // scalastyle:ignore funsuite + with BeforeAndAfterAll with Logging { + override def afterAll(): Unit = { + super.afterAll() + Logging.disableStructuredLogging() + } + def className: String def logFilePath: String @@ -202,7 +209,7 @@ trait LoggingSuiteBase } } - private val customLog = log"${MDC(CustomLogKeys.CUSTOM_LOG_KEY, "Custom log message.")}" + private lazy val customLog = log"${MDC(CustomLogKeys.CUSTOM_LOG_KEY, "Custom log message.")}" test("Logging with custom LogKey") { Seq( (Level.ERROR, () => logError(customLog)), @@ -265,6 +272,13 @@ class StructuredLoggingSuite extends LoggingSuiteBase { override def className: String = classOf[StructuredLoggingSuite].getSimpleName override def logFilePath: String = "target/structured.log" + override def beforeAll(): Unit = { + super.beforeAll() + Logging.enableStructuredLogging() + } + + override def afterAll(): Unit = super.afterAll() + private val jsonMapper = new ObjectMapper().registerModule(DefaultScalaModule) private def compactAndToRegexPattern(json: String): String = { jsonMapper.readTree(json).toString. 
diff --git a/common/variant/src/main/java/org/apache/spark/types/variant/ShreddingUtils.java b/common/variant/src/main/java/org/apache/spark/types/variant/ShreddingUtils.java new file mode 100644 index 0000000000000..6a04bf9a2b259 --- /dev/null +++ b/common/variant/src/main/java/org/apache/spark/types/variant/ShreddingUtils.java @@ -0,0 +1,169 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.types.variant; + +import java.math.BigDecimal; +import java.util.ArrayList; + +import static org.apache.spark.types.variant.VariantUtil.*; + +public class ShreddingUtils { + // Interface to read from a shredded result. It essentially has the same interface and semantics + // as Spark's `SpecializedGetters`, but we need a new interface to avoid the dependency. 
+ public interface ShreddedRow { + boolean isNullAt(int ordinal); + boolean getBoolean(int ordinal); + byte getByte(int ordinal); + short getShort(int ordinal); + int getInt(int ordinal); + long getLong(int ordinal); + float getFloat(int ordinal); + double getDouble(int ordinal); + BigDecimal getDecimal(int ordinal, int precision, int scale); + String getString(int ordinal); + byte[] getBinary(int ordinal); + ShreddedRow getStruct(int ordinal, int numFields); + ShreddedRow getArray(int ordinal); + int numElements(); + } + + // This `rebuild` function should only be called on the top-level schema, and that other private + // implementation will be called on any recursively shredded sub-schema. + public static Variant rebuild(ShreddedRow row, VariantSchema schema) { + if (schema.topLevelMetadataIdx < 0 || row.isNullAt(schema.topLevelMetadataIdx)) { + throw malformedVariant(); + } + byte[] metadata = row.getBinary(schema.topLevelMetadataIdx); + if (schema.isUnshredded()) { + // `rebuild` is unnecessary for unshredded variant. + if (row.isNullAt(schema.variantIdx)) { + throw malformedVariant(); + } + return new Variant(row.getBinary(schema.variantIdx), metadata); + } + VariantBuilder builder = new VariantBuilder(false); + rebuild(row, metadata, schema, builder); + return builder.result(); + } + + // Rebuild a variant value from the shredded data according to the reconstruction algorithm in + // https://github.com/apache/parquet-format/blob/master/VariantShredding.md. + // Append the result to `builder`. 
+ public static void rebuild(ShreddedRow row, byte[] metadata, VariantSchema schema, + VariantBuilder builder) { + int typedIdx = schema.typedIdx; + int variantIdx = schema.variantIdx; + if (typedIdx >= 0 && !row.isNullAt(typedIdx)) { + if (schema.scalarSchema != null) { + VariantSchema.ScalarType scalar = schema.scalarSchema; + if (scalar instanceof VariantSchema.StringType) { + builder.appendString(row.getString(typedIdx)); + } else if (scalar instanceof VariantSchema.IntegralType) { + VariantSchema.IntegralType it = (VariantSchema.IntegralType) scalar; + long value = 0; + switch (it.size) { + case BYTE: + value = row.getByte(typedIdx); + break; + case SHORT: + value = row.getShort(typedIdx); + break; + case INT: + value = row.getInt(typedIdx); + break; + case LONG: + value = row.getLong(typedIdx); + break; + } + builder.appendLong(value); + } else if (scalar instanceof VariantSchema.FloatType) { + builder.appendFloat(row.getFloat(typedIdx)); + } else if (scalar instanceof VariantSchema.DoubleType) { + builder.appendDouble(row.getDouble(typedIdx)); + } else if (scalar instanceof VariantSchema.BooleanType) { + builder.appendBoolean(row.getBoolean(typedIdx)); + } else if (scalar instanceof VariantSchema.BinaryType) { + builder.appendBinary(row.getBinary(typedIdx)); + } else if (scalar instanceof VariantSchema.DecimalType) { + VariantSchema.DecimalType dt = (VariantSchema.DecimalType) scalar; + builder.appendDecimal(row.getDecimal(typedIdx, dt.precision, dt.scale)); + } else if (scalar instanceof VariantSchema.DateType) { + builder.appendDate(row.getInt(typedIdx)); + } else if (scalar instanceof VariantSchema.TimestampType) { + builder.appendTimestamp(row.getLong(typedIdx)); + } else { + assert scalar instanceof VariantSchema.TimestampNTZType; + builder.appendTimestampNtz(row.getLong(typedIdx)); + } + } else if (schema.arraySchema != null) { + VariantSchema elementSchema = schema.arraySchema; + ShreddedRow array = row.getArray(typedIdx); + int start = 
builder.getWritePos(); + ArrayList offsets = new ArrayList<>(array.numElements()); + for (int i = 0; i < array.numElements(); i++) { + offsets.add(builder.getWritePos() - start); + rebuild(array.getStruct(i, elementSchema.numFields), metadata, elementSchema, builder); + } + builder.finishWritingArray(start, offsets); + } else { + ShreddedRow object = row.getStruct(typedIdx, schema.objectSchema.length); + ArrayList fields = new ArrayList<>(); + int start = builder.getWritePos(); + for (int fieldIdx = 0; fieldIdx < schema.objectSchema.length; ++fieldIdx) { + // Shredded field must not be null. + if (object.isNullAt(fieldIdx)) { + throw malformedVariant(); + } + String fieldName = schema.objectSchema[fieldIdx].fieldName; + VariantSchema fieldSchema = schema.objectSchema[fieldIdx].schema; + ShreddedRow fieldValue = object.getStruct(fieldIdx, fieldSchema.numFields); + // If the field doesn't have non-null `typed_value` or `value`, it is missing. + if ((fieldSchema.typedIdx >= 0 && !fieldValue.isNullAt(fieldSchema.typedIdx)) || + (fieldSchema.variantIdx >= 0 && !fieldValue.isNullAt(fieldSchema.variantIdx))) { + int id = builder.addKey(fieldName); + fields.add(new VariantBuilder.FieldEntry(fieldName, id, builder.getWritePos() - start)); + rebuild(fieldValue, metadata, fieldSchema, builder); + } + } + if (variantIdx >= 0 && !row.isNullAt(variantIdx)) { + // Add the leftover fields in the variant binary. + Variant v = new Variant(row.getBinary(variantIdx), metadata); + if (v.getType() != VariantUtil.Type.OBJECT) throw malformedVariant(); + for (int i = 0; i < v.objectSize(); ++i) { + Variant.ObjectField field = v.getFieldAtIndex(i); + // `value` must not contain any shredded field. 
+ if (schema.objectSchemaMap.containsKey(field.key)) { + throw malformedVariant(); + } + int id = builder.addKey(field.key); + fields.add(new VariantBuilder.FieldEntry(field.key, id, builder.getWritePos() - start)); + builder.appendVariant(field.value); + } + } + builder.finishWritingObject(start, fields); + } + } else if (variantIdx >= 0 && !row.isNullAt(variantIdx)) { + // `typed_value` doesn't exist or is null. Read from `value`. + builder.appendVariant(new Variant(row.getBinary(variantIdx), metadata)); + } else { + // This means the variant is missing in a context where it must be present, so the input data + // is invalid. + throw malformedVariant(); + } + } +} diff --git a/common/variant/src/main/java/org/apache/spark/types/variant/VariantSchema.java b/common/variant/src/main/java/org/apache/spark/types/variant/VariantSchema.java index 551e46214859a..d1e6cc3a727fa 100644 --- a/common/variant/src/main/java/org/apache/spark/types/variant/VariantSchema.java +++ b/common/variant/src/main/java/org/apache/spark/types/variant/VariantSchema.java @@ -138,6 +138,12 @@ public VariantSchema(int typedIdx, int variantIdx, int topLevelMetadataIdx, int this.arraySchema = arraySchema; } + // Return whether the variant column is unshredded. The user is not required to do anything + // special, but can have certain optimizations for unshredded variant. 
+ public boolean isUnshredded() { + return topLevelMetadataIdx >= 0 && variantIdx >= 0 && typedIdx < 0; + } + @Override public String toString() { return "VariantSchema{" + diff --git a/common/variant/src/main/java/org/apache/spark/types/variant/VariantShreddingWriter.java b/common/variant/src/main/java/org/apache/spark/types/variant/VariantShreddingWriter.java index b5f8ea0a1484b..bbee7ee0dca38 100644 --- a/common/variant/src/main/java/org/apache/spark/types/variant/VariantShreddingWriter.java +++ b/common/variant/src/main/java/org/apache/spark/types/variant/VariantShreddingWriter.java @@ -101,7 +101,9 @@ public static ShreddedResult castShredded( int id = v.getDictionaryIdAtIndex(i); fieldEntries.add(new VariantBuilder.FieldEntry( field.key, id, variantBuilder.getWritePos() - start)); - variantBuilder.appendVariant(field.value); + // shallowAppendVariant is needed for correctness, since we're relying on the metadata IDs + // being unchanged. + variantBuilder.shallowAppendVariant(field.value); } } if (numFieldsMatched < objectSchema.length) { @@ -133,8 +135,6 @@ public static ShreddedResult castShredded( // Store the typed value. result.addScalar(typedValue); } else { - VariantBuilder variantBuilder = new VariantBuilder(false); - variantBuilder.appendVariant(v); result.addVariantValue(v.getValue()); } } else { diff --git a/conf/log4j2.properties.pattern-layout-template b/conf/log4j2-json-layout.properties.template similarity index 72% rename from conf/log4j2.properties.pattern-layout-template rename to conf/log4j2-json-layout.properties.template index ab96e03baed20..76499bb6691e7 100644 --- a/conf/log4j2.properties.pattern-layout-template +++ b/conf/log4j2-json-layout.properties.template @@ -19,17 +19,11 @@ rootLogger.level = info rootLogger.appenderRef.stdout.ref = console -# In the pattern layout configuration below, we specify an explicit `%ex` conversion -# pattern for logging Throwables. 
If this was omitted, then (by default) Log4J would -# implicitly add an `%xEx` conversion pattern which logs stacktraces with additional -# class packaging information. That extra information can sometimes add a substantial -# performance overhead, so we disable it in our default logging config. -# For more information, see SPARK-39361. appender.console.type = Console appender.console.name = console appender.console.target = SYSTEM_ERR -appender.console.layout.type = PatternLayout -appender.console.layout.pattern = %d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n%ex +appender.console.layout.type = JsonTemplateLayout +appender.console.layout.eventTemplateUri = classpath:org/apache/spark/SparkLayout.json # Set the default spark-shell/spark-sql log level to WARN. When running the # spark-shell/spark-sql, the log level for these classes is used to overwrite @@ -60,10 +54,3 @@ logger.RetryingHMSHandler.name = org.apache.hadoop.hive.metastore.RetryingHMSHan logger.RetryingHMSHandler.level = fatal logger.FunctionRegistry.name = org.apache.hadoop.hive.ql.exec.FunctionRegistry logger.FunctionRegistry.level = error - -# For deploying Spark ThriftServer -# SPARK-34128: Suppress undesirable TTransportException warnings involved in THRIFT-4805 -appender.console.filter.1.type = RegexFilter -appender.console.filter.1.regex = .*Thrift error occurred during processing of message.* -appender.console.filter.1.onMatch = deny -appender.console.filter.1.onMismatch = neutral diff --git a/conf/log4j2.properties.template b/conf/log4j2.properties.template index 8767245314449..011fca58c9b2a 100644 --- a/conf/log4j2.properties.template +++ b/conf/log4j2.properties.template @@ -19,11 +19,17 @@ rootLogger.level = info rootLogger.appenderRef.stdout.ref = console +# In the pattern layout configuration below, we specify an explicit `%ex` conversion +# pattern for logging Throwables. 
If this was omitted, then (by default) Log4J would +# implicitly add an `%xEx` conversion pattern which logs stacktraces with additional +# class packaging information. That extra information can sometimes add a substantial +# performance overhead, so we disable it in our default logging config. +# For more information, see SPARK-39361. appender.console.type = Console appender.console.name = console appender.console.target = SYSTEM_ERR -appender.console.layout.type = JsonTemplateLayout -appender.console.layout.eventTemplateUri = classpath:org/apache/spark/SparkLayout.json +appender.console.layout.type = PatternLayout +appender.console.layout.pattern = %d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n%ex # Set the default spark-shell/spark-sql log level to WARN. When running the # spark-shell/spark-sql, the log level for these classes is used to overwrite @@ -54,10 +60,3 @@ logger.RetryingHMSHandler.name = org.apache.hadoop.hive.metastore.RetryingHMSHan logger.RetryingHMSHandler.level = fatal logger.FunctionRegistry.name = org.apache.hadoop.hive.ql.exec.FunctionRegistry logger.FunctionRegistry.level = error - -# For deploying Spark ThriftServer -# SPARK-34128: Suppress undesirable TTransportException warnings involved in THRIFT-4805 -appender.console.filter.1.type = RegexFilter -appender.console.filter.1.regex = .*Thrift error occurred during processing of message.* -appender.console.filter.1.onMatch = deny -appender.console.filter.1.onMismatch = neutral diff --git a/connector/avro/benchmarks/AvroReadBenchmark-jdk21-results.txt b/connector/avro/benchmarks/AvroReadBenchmark-jdk21-results.txt index 0f4579f5da24f..c41782457cd9e 100644 --- a/connector/avro/benchmarks/AvroReadBenchmark-jdk21-results.txt +++ b/connector/avro/benchmarks/AvroReadBenchmark-jdk21-results.txt @@ -2,140 +2,140 @@ SQL Single Numeric Column Scan ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure 
+OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor SQL Single TINYINT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Sum 2061 2066 7 7.6 131.0 1.0X +Sum 1971 1989 26 8.0 125.3 1.0X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor SQL Single SMALLINT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Sum 2032 2033 2 7.7 129.2 1.0X +Sum 1958 2014 80 8.0 124.5 1.0X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor SQL Single INT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Sum 1992 2005 19 7.9 126.6 1.0X +Sum 1956 1987 44 8.0 124.3 1.0X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor SQL Single BIGINT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Sum 1992 2017 35 7.9 126.6 1.0X +Sum 1953 1962 12 8.1 124.2 1.0X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor SQL Single FLOAT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
------------------------------------------------------------------------------------------------------------------------ -Sum 1981 1981 0 7.9 125.9 1.0X +Sum 1948 1950 3 8.1 123.9 1.0X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor SQL Single DOUBLE Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Sum 1981 1984 4 7.9 126.0 1.0X +Sum 1933 1938 6 8.1 122.9 1.0X ================================================================================================ Int and String Scan ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Int and String Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Sum of columns 3863 3867 5 2.7 368.4 1.0X +Sum of columns 3570 3574 6 2.9 340.4 1.0X ================================================================================================ Partitioned Table Scan ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Partitioned Table: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Data column 2129 2143 20 7.4 135.4 1.0X -Partition column 1984 1986 2 7.9 126.1 1.1X -Both columns 2209 
2231 31 7.1 140.4 1.0X +Data column 2062 2083 30 7.6 131.1 1.0X +Partition column 1869 1873 5 8.4 118.9 1.1X +Both columns 2057 2093 51 7.6 130.8 1.0X ================================================================================================ Repeated String Scan ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Repeated String: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Sum of string length 2511 2564 75 4.2 239.5 1.0X +Sum of string length 2010 2012 3 5.2 191.7 1.0X ================================================================================================ String with Nulls Scan ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor String with Nulls Scan (0.0%): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Sum of string length 3303 3317 20 3.2 315.0 1.0X +Sum of string length 3082 3094 16 3.4 293.9 1.0X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor String with Nulls Scan (50.0%): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Sum of string length 2221 2252 44 4.7 211.8 1.0X +Sum of string length 2220 2245 
36 4.7 211.7 1.0X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor String with Nulls Scan (95.0%): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Sum of string length 1170 1174 6 9.0 111.6 1.0X +Sum of string length 1152 1159 10 9.1 109.9 1.0X ================================================================================================ Select All From Wide Columns ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Wide Column Scan from 1000 columns: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Select of all columns 19095 19150 78 0.0 38190.4 1.0X +Select of all columns 20941 20946 6 0.0 41882.8 1.0X ================================================================================================ Single Column Scan From Wide Columns ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Single Column Scan from 100 columns: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Sum of single column 3188 3192 4 0.3 3040.7 1.0X +Sum of single column 3425 3440 21 0.3 3266.1 1.0X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 
6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Single Column Scan from 200 columns: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Sum of single column 6229 6254 35 0.2 5940.6 1.0X +Sum of single column 6740 6770 43 0.2 6427.4 1.0X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Single Column Scan from 300 columns: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Sum of single column 9279 9318 56 0.1 8848.8 1.0X +Sum of single column 9988 10056 96 0.1 9525.4 1.0X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Filters pushdown: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -w/o filters 5538 5544 7 0.2 5537.5 1.0X -pushdown disabled 5546 5571 24 0.2 5546.5 1.0X -w/ filters 2312 2324 18 0.4 2312.4 2.4X +w/o filters 5802 5844 44 0.2 5801.7 1.0X +pushdown disabled 5544 5616 97 0.2 5543.9 1.0X +w/ filters 2605 2609 5 0.4 2605.4 2.2X diff --git a/connector/avro/benchmarks/AvroReadBenchmark-results.txt b/connector/avro/benchmarks/AvroReadBenchmark-results.txt index db6193e67ac39..117cb0b05e8a3 100644 --- a/connector/avro/benchmarks/AvroReadBenchmark-results.txt +++ b/connector/avro/benchmarks/AvroReadBenchmark-results.txt @@ -2,140 +2,140 @@ SQL Single Numeric Column Scan 
================================================================================================ -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor SQL Single TINYINT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Sum 1959 1993 47 8.0 124.6 1.0X +Sum 1945 1948 4 8.1 123.7 1.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor SQL Single SMALLINT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Sum 1929 1949 28 8.2 122.6 1.0X +Sum 1941 1965 34 8.1 123.4 1.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor SQL Single INT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Sum 1965 1974 13 8.0 124.9 1.0X +Sum 1910 1921 15 8.2 121.4 1.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor SQL Single BIGINT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Sum 1904 1918 20 8.3 121.0 1.0X +Sum 1923 1927 6 8.2 122.3 1.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 
17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor SQL Single FLOAT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Sum 1916 1934 26 8.2 121.8 1.0X +Sum 1893 1898 7 8.3 120.4 1.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor SQL Single DOUBLE Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Sum 1911 1917 8 8.2 121.5 1.0X +Sum 1890 1894 6 8.3 120.2 1.0X ================================================================================================ Int and String Scan ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Int and String Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Sum of columns 3426 3450 34 3.1 326.8 1.0X +Sum of columns 3614 3616 2 2.9 344.7 1.0X ================================================================================================ Partitioned Table Scan ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Partitioned Table: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
------------------------------------------------------------------------------------------------------------------------ -Data column 1871 1888 24 8.4 118.9 1.0X -Partition column 1713 1720 9 9.2 108.9 1.1X -Both columns 1962 1970 12 8.0 124.7 1.0X +Data column 2106 2108 2 7.5 133.9 1.0X +Partition column 1862 1864 3 8.4 118.4 1.1X +Both columns 2359 2382 32 6.7 150.0 0.9X ================================================================================================ Repeated String Scan ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Repeated String: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Sum of string length 2042 2055 18 5.1 194.7 1.0X +Sum of string length 2147 2151 6 4.9 204.7 1.0X ================================================================================================ String with Nulls Scan ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor String with Nulls Scan (0.0%): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Sum of string length 3089 3109 28 3.4 294.6 1.0X +Sum of string length 3410 3421 16 3.1 325.2 1.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor String with Nulls Scan (50.0%): Best Time(ms) Avg Time(ms) Stdev(ms) 
Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Sum of string length 2103 2104 2 5.0 200.5 1.0X +Sum of string length 2133 2157 34 4.9 203.4 1.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor String with Nulls Scan (95.0%): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Sum of string length 1073 1079 10 9.8 102.3 1.0X +Sum of string length 1107 1110 4 9.5 105.6 1.0X ================================================================================================ Select All From Wide Columns ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Wide Column Scan from 1000 columns: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Select of all columns 18300 18346 64 0.0 36600.5 1.0X +Select of all columns 19318 19384 94 0.0 38635.4 1.0X ================================================================================================ Single Column Scan From Wide Columns ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Single Column Scan from 100 columns: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
------------------------------------------------------------------------------------------------------------------------ -Sum of single column 3144 3148 5 0.3 2998.8 1.0X +Sum of single column 3159 3183 34 0.3 3012.2 1.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Single Column Scan from 200 columns: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Sum of single column 6216 6229 19 0.2 5927.8 1.0X +Sum of single column 6352 6387 49 0.2 6058.1 1.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Single Column Scan from 300 columns: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Sum of single column 9286 9585 423 0.1 8855.6 1.0X +Sum of single column 9512 9539 39 0.1 9070.9 1.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Filters pushdown: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -w/o filters 5308 5326 23 0.2 5307.5 1.0X -pushdown disabled 5253 5288 33 0.2 5252.7 1.0X -w/ filters 2036 2061 24 0.5 2036.3 2.6X +w/o filters 5474 5481 12 0.2 5474.4 1.0X +pushdown disabled 5453 5490 44 0.2 5452.7 1.0X +w/ filters 2210 2223 18 0.5 2209.7 2.5X diff --git a/connector/avro/benchmarks/AvroWriteBenchmark-jdk21-results.txt b/connector/avro/benchmarks/AvroWriteBenchmark-jdk21-results.txt index 
a071bc767cfaa..b8c0d3b95e360 100644 --- a/connector/avro/benchmarks/AvroWriteBenchmark-jdk21-results.txt +++ b/connector/avro/benchmarks/AvroWriteBenchmark-jdk21-results.txt @@ -1,56 +1,56 @@ -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Avro writer benchmark: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Output Single Int Column 1566 1588 30 10.0 99.6 1.0X -Output Single Double Column 1655 1668 18 9.5 105.3 0.9X -Output Int and String Column 3493 3496 5 4.5 222.1 0.4X -Output Partitions 3062 3112 71 5.1 194.7 0.5X -Output Buckets 3937 3952 20 4.0 250.3 0.4X +Output Single Int Column 1562 1586 33 10.1 99.3 1.0X +Output Single Double Column 1658 1695 52 9.5 105.4 0.9X +Output Int and String Column 3516 3524 11 4.5 223.6 0.4X +Output Partitions 2936 3033 138 5.4 186.6 0.5X +Output Buckets 3856 3882 36 4.1 245.1 0.4X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Avro compression with different codec: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -BZIP2: 115765 115975 297 0.0 1157649.1 1.0X -DEFLATE: 6345 6370 35 0.0 63448.5 18.2X -UNCOMPRESSED: 5183 5184 1 0.0 51827.4 22.3X -SNAPPY: 4611 4614 3 0.0 46112.5 25.1X -XZ: 54096 57854 5315 0.0 540956.3 2.1X -ZSTANDARD: 4877 4888 15 0.0 48770.9 23.7X +BZIP2: 117457 117471 19 0.0 1174572.2 1.0X +DEFLATE: 6340 6364 34 0.0 63404.0 18.5X +UNCOMPRESSED: 4990 4998 12 0.0 49898.9 23.5X +SNAPPY: 4561 4564 4 0.0 45610.2 25.8X +XZ: 43883 49072 7337 0.0 438832.8 2.7X +ZSTANDARD: 4774 4777 4 0.0 47741.1 24.6X -OpenJDK 64-Bit 
Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Avro deflate with different levels: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -DEFLATE: deflate.level=1 4807 4847 57 0.0 48065.8 1.0X -DEFLATE: deflate.level=3 4803 4809 7 0.0 48033.5 1.0X -DEFLATE: deflate.level=5 6373 6389 22 0.0 63728.7 0.8X -DEFLATE: deflate.level=7 6427 6460 47 0.0 64266.6 0.7X -DEFLATE: deflate.level=9 6628 6634 10 0.0 66277.2 0.7X +DEFLATE: deflate.level=1 4752 4764 17 0.0 47519.5 1.0X +DEFLATE: deflate.level=3 4682 4687 8 0.0 46819.6 1.0X +DEFLATE: deflate.level=5 6382 6392 13 0.0 63820.4 0.7X +DEFLATE: deflate.level=7 6477 6532 77 0.0 64774.8 0.7X +DEFLATE: deflate.level=9 6773 6783 15 0.0 67729.6 0.7X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Avro xz with different levels: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -XZ: xz.level=1 11974 12000 37 0.0 119736.9 1.0X -XZ: xz.level=3 21671 21813 201 0.0 216709.0 0.6X -XZ: xz.level=5 47055 47335 397 0.0 470545.5 0.3X -XZ: xz.level=7 74766 75069 428 0.0 747658.3 0.2X -XZ: xz.level=9 146478 146490 16 0.0 1464783.7 0.1X +XZ: xz.level=1 11571 11577 9 0.0 115710.1 1.0X +XZ: xz.level=3 21469 21642 245 0.0 214687.1 0.5X +XZ: xz.level=5 40907 40912 7 0.0 409072.6 0.3X +XZ: xz.level=7 60545 61371 1167 0.0 605453.1 0.2X +XZ: xz.level=9 136882 137479 845 0.0 1368819.9 0.1X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Avro zstandard with 
different levels: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ----------------------------------------------------------------------------------------------------------------------------------------------- -ZSTANDARD: zstandard.level=1 4760 4797 53 0.0 47598.3 1.0X -ZSTANDARD: zstandard.level=1, zstandard.bufferPool.enabled=true 4666 4696 43 0.0 46656.0 1.0X -ZSTANDARD: zstandard.level=3 4845 4869 33 0.0 48452.8 1.0X -ZSTANDARD: zstandard.level=3, zstandard.bufferPool.enabled=true 4790 4801 16 0.0 47896.5 1.0X -ZSTANDARD: zstandard.level=5 5125 5164 55 0.0 51248.6 0.9X -ZSTANDARD: zstandard.level=5, zstandard.bufferPool.enabled=true 4912 4928 22 0.0 49122.5 1.0X -ZSTANDARD: zstandard.level=7 5319 5333 19 0.0 53192.1 0.9X -ZSTANDARD: zstandard.level=7, zstandard.bufferPool.enabled=true 5250 5284 48 0.0 52501.2 0.9X -ZSTANDARD: zstandard.level=9 6087 6087 0 0.0 60869.7 0.8X -ZSTANDARD: zstandard.level=9, zstandard.bufferPool.enabled=true 6219 6234 21 0.0 62191.3 0.8X +ZSTANDARD: zstandard.level=1 4722 4763 58 0.0 47221.7 1.0X +ZSTANDARD: zstandard.level=1, zstandard.bufferPool.enabled=true 4734 5114 536 0.0 47341.8 1.0X +ZSTANDARD: zstandard.level=3 4816 4837 29 0.0 48162.0 1.0X +ZSTANDARD: zstandard.level=3, zstandard.bufferPool.enabled=true 4741 4766 35 0.0 47414.8 1.0X +ZSTANDARD: zstandard.level=5 5054 5155 143 0.0 50536.2 0.9X +ZSTANDARD: zstandard.level=5, zstandard.bufferPool.enabled=true 4869 4874 7 0.0 48690.8 1.0X +ZSTANDARD: zstandard.level=7 5325 5350 35 0.0 53251.2 0.9X +ZSTANDARD: zstandard.level=7, zstandard.bufferPool.enabled=true 5283 5308 35 0.0 52828.8 0.9X +ZSTANDARD: zstandard.level=9 6092 6116 35 0.0 60917.5 0.8X +ZSTANDARD: zstandard.level=9, zstandard.bufferPool.enabled=true 5925 5935 15 0.0 59246.3 0.8X diff --git a/connector/avro/benchmarks/AvroWriteBenchmark-results.txt b/connector/avro/benchmarks/AvroWriteBenchmark-results.txt index 1a605c0ea0e90..03fea3f0379f4 100644 --- 
a/connector/avro/benchmarks/AvroWriteBenchmark-results.txt +++ b/connector/avro/benchmarks/AvroWriteBenchmark-results.txt @@ -1,56 +1,56 @@ -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Avro writer benchmark: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Output Single Int Column 1544 1567 34 10.2 98.1 1.0X -Output Single Double Column 1635 1647 17 9.6 104.0 0.9X -Output Int and String Column 3324 3334 15 4.7 211.3 0.5X -Output Partitions 2961 3047 122 5.3 188.2 0.5X -Output Buckets 3776 3778 3 4.2 240.1 0.4X +Output Single Int Column 1562 1564 3 10.1 99.3 1.0X +Output Single Double Column 1658 1677 27 9.5 105.4 0.9X +Output Int and String Column 3417 3456 55 4.6 217.2 0.5X +Output Partitions 2923 3064 199 5.4 185.8 0.5X +Output Buckets 3769 3772 4 4.2 239.6 0.4X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Avro compression with different codec: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -BZIP2: 130388 131379 1402 0.0 1303881.3 1.0X -DEFLATE: 6523 6538 21 0.0 65227.6 20.0X -UNCOMPRESSED: 5394 5425 43 0.0 53944.9 24.2X -SNAPPY: 4813 4816 3 0.0 48134.6 27.1X -XZ: 54364 54382 26 0.0 543640.7 2.4X -ZSTANDARD: 4864 4873 13 0.0 48635.9 26.8X +BZIP2: 132067 132334 377 0.0 1320668.2 1.0X +DEFLATE: 6456 6466 14 0.0 64562.5 20.5X +UNCOMPRESSED: 5188 5189 2 0.0 51879.6 25.5X +SNAPPY: 4678 4679 2 0.0 46777.8 28.2X +XZ: 42468 42597 183 0.0 424677.8 3.1X +ZSTANDARD: 4796 4801 7 0.0 47963.8 27.5X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure 
+OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Avro deflate with different levels: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -DEFLATE: deflate.level=1 4909 4916 9 0.0 49091.1 1.0X -DEFLATE: deflate.level=3 4874 4903 42 0.0 48735.8 1.0X -DEFLATE: deflate.level=5 6460 6473 19 0.0 64601.7 0.8X -DEFLATE: deflate.level=7 6450 6482 46 0.0 64497.5 0.8X -DEFLATE: deflate.level=9 6875 6878 5 0.0 68745.4 0.7X +DEFLATE: deflate.level=1 4736 4751 21 0.0 47356.5 1.0X +DEFLATE: deflate.level=3 4795 4797 4 0.0 47945.1 1.0X +DEFLATE: deflate.level=5 6489 6492 4 0.0 64885.9 0.7X +DEFLATE: deflate.level=7 6464 6484 29 0.0 64640.7 0.7X +DEFLATE: deflate.level=9 6740 6761 30 0.0 67404.6 0.7X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Avro xz with different levels: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -XZ: xz.level=1 12210 12226 22 0.0 122101.7 1.0X -XZ: xz.level=3 22235 22235 0 0.0 222346.3 0.5X -XZ: xz.level=5 47597 47659 88 0.0 475969.7 0.3X -XZ: xz.level=7 69231 69482 356 0.0 692308.3 0.2X -XZ: xz.level=9 147042 148998 2766 0.0 1470415.9 0.1X +XZ: xz.level=1 12053 12062 13 0.0 120526.1 1.0X +XZ: xz.level=3 22766 22771 7 0.0 227656.6 0.5X +XZ: xz.level=5 40993 42080 1538 0.0 409927.7 0.3X +XZ: xz.level=7 64226 64623 562 0.0 642261.7 0.2X +XZ: xz.level=9 143378 145508 3013 0.0 1433775.6 0.1X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Avro zstandard with different levels: Best Time(ms) Avg Time(ms) 
Stdev(ms) Rate(M/s) Per Row(ns) Relative ----------------------------------------------------------------------------------------------------------------------------------------------- -ZSTANDARD: zstandard.level=1 4750 4817 94 0.0 47504.2 1.0X -ZSTANDARD: zstandard.level=1, zstandard.bufferPool.enabled=true 4753 4802 69 0.0 47532.8 1.0X -ZSTANDARD: zstandard.level=3 4920 4924 6 0.0 49198.5 1.0X -ZSTANDARD: zstandard.level=3, zstandard.bufferPool.enabled=true 4792 4799 9 0.0 47921.8 1.0X -ZSTANDARD: zstandard.level=5 5240 5276 51 0.0 52404.0 0.9X -ZSTANDARD: zstandard.level=5, zstandard.bufferPool.enabled=true 5072 5101 41 0.0 50722.5 0.9X -ZSTANDARD: zstandard.level=7 5542 5591 69 0.0 55416.5 0.9X -ZSTANDARD: zstandard.level=7, zstandard.bufferPool.enabled=true 5605 5617 17 0.0 56050.4 0.8X -ZSTANDARD: zstandard.level=9 6311 6403 130 0.0 63109.5 0.8X -ZSTANDARD: zstandard.level=9, zstandard.bufferPool.enabled=true 6324 6331 10 0.0 63236.4 0.8X +ZSTANDARD: zstandard.level=1 4816 4828 16 0.0 48164.5 1.0X +ZSTANDARD: zstandard.level=1, zstandard.bufferPool.enabled=true 4669 4875 292 0.0 46692.3 1.0X +ZSTANDARD: zstandard.level=3 4849 4883 48 0.0 48492.9 1.0X +ZSTANDARD: zstandard.level=3, zstandard.bufferPool.enabled=true 4793 4796 4 0.0 47929.4 1.0X +ZSTANDARD: zstandard.level=5 5098 5145 66 0.0 50982.7 0.9X +ZSTANDARD: zstandard.level=5, zstandard.bufferPool.enabled=true 5011 5024 19 0.0 50106.6 1.0X +ZSTANDARD: zstandard.level=7 5480 5502 31 0.0 54796.9 0.9X +ZSTANDARD: zstandard.level=7, zstandard.bufferPool.enabled=true 5459 5469 14 0.0 54591.1 0.9X +ZSTANDARD: zstandard.level=9 6319 6343 34 0.0 63188.9 0.8X +ZSTANDARD: zstandard.level=9, zstandard.bufferPool.enabled=true 6146 6171 35 0.0 61460.8 0.8X diff --git a/connector/avro/src/main/scala/org/apache/spark/sql/avro/AvroExpressionEvalUtils.scala b/connector/avro/src/main/scala/org/apache/spark/sql/avro/AvroExpressionEvalUtils.scala new file mode 100644 index 0000000000000..1a9a3609c8a5e --- /dev/null +++ 
b/connector/avro/src/main/scala/org/apache/spark/sql/avro/AvroExpressionEvalUtils.scala @@ -0,0 +1,45 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.avro + +import org.apache.avro.Schema + +import org.apache.spark.sql.catalyst.util.{ParseMode, PermissiveMode} +import org.apache.spark.unsafe.types.UTF8String + +object AvroExpressionEvalUtils { + + def schemaOfAvro( + avroOptions: AvroOptions, + parseMode: ParseMode, + expectedSchema: Schema): UTF8String = { + val dt = SchemaConverters.toSqlType( + expectedSchema, + avroOptions.useStableIdForUnionType, + avroOptions.stableIdPrefixForUnionType, + avroOptions.recursiveFieldMaxDepth).dataType + val schema = parseMode match { + // With PermissiveMode, the output Catalyst row might contain columns of null values for + // corrupt records, even if some of the columns are not nullable in the user-provided schema. + // Therefore we force the schema to be all nullable here. 
+ case PermissiveMode => dt.asNullable + case _ => dt + } + UTF8String.fromString(schema.sql) + } +} diff --git a/connector/avro/src/main/scala/org/apache/spark/sql/avro/SchemaOfAvro.scala b/connector/avro/src/main/scala/org/apache/spark/sql/avro/SchemaOfAvro.scala new file mode 100644 index 0000000000000..094fd4254e16a --- /dev/null +++ b/connector/avro/src/main/scala/org/apache/spark/sql/avro/SchemaOfAvro.scala @@ -0,0 +1,71 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.avro + +import org.apache.avro.Schema + +import org.apache.spark.sql.catalyst.expressions.{Expression, LeafExpression, Literal, RuntimeReplaceable} +import org.apache.spark.sql.catalyst.expressions.objects.StaticInvoke +import org.apache.spark.sql.catalyst.util.{FailFastMode, ParseMode, PermissiveMode} +import org.apache.spark.sql.errors.QueryCompilationErrors +import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.types.{DataType, ObjectType} + +private[sql] case class SchemaOfAvro( + jsonFormatSchema: String, + options: Map[String, String]) + extends LeafExpression with RuntimeReplaceable { + + override def dataType: DataType = SQLConf.get.defaultStringType + + override def nullable: Boolean = false + + @transient private lazy val avroOptions = AvroOptions(options) + + @transient private lazy val actualSchema = + new Schema.Parser().setValidateDefaults(false).parse(jsonFormatSchema) + + @transient private lazy val expectedSchema = avroOptions.schema.getOrElse(actualSchema) + + @transient private lazy val parseMode: ParseMode = { + val mode = avroOptions.parseMode + if (mode != PermissiveMode && mode != FailFastMode) { + throw QueryCompilationErrors.parseModeUnsupportedError( + prettyName, mode + ) + } + mode + } + + override def prettyName: String = "schema_of_avro" + + @transient private lazy val avroOptionsObjectType = ObjectType(classOf[AvroOptions]) + @transient private lazy val parseModeObjectType = ObjectType(classOf[ParseMode]) + @transient private lazy val schemaObjectType = ObjectType(classOf[Schema]) + + override def replacement: Expression = StaticInvoke( + AvroExpressionEvalUtils.getClass, + dataType, + "schemaOfAvro", + Seq( + Literal(avroOptions, avroOptionsObjectType), + Literal(parseMode, parseModeObjectType), + Literal(expectedSchema, schemaObjectType)), + Seq(avroOptionsObjectType, parseModeObjectType, schemaObjectType) + ) +} diff --git 
a/connector/avro/src/main/scala/org/apache/spark/sql/v2/avro/AvroTable.scala b/connector/avro/src/main/scala/org/apache/spark/sql/v2/avro/AvroTable.scala index 8ec711b2757f5..e898253be1168 100644 --- a/connector/avro/src/main/scala/org/apache/spark/sql/v2/avro/AvroTable.scala +++ b/connector/avro/src/main/scala/org/apache/spark/sql/v2/avro/AvroTable.scala @@ -42,10 +42,12 @@ case class AvroTable( override def inferSchema(files: Seq[FileStatus]): Option[StructType] = AvroUtils.inferSchema(sparkSession, options.asScala.toMap, files) - override def newWriteBuilder(info: LogicalWriteInfo): WriteBuilder = + override def newWriteBuilder(info: LogicalWriteInfo): WriteBuilder = { new WriteBuilder { - override def build(): Write = AvroWrite(paths, formatName, supportsDataType, info) + override def build(): Write = + AvroWrite(paths, formatName, supportsDataType, mergedWriteInfo(info)) } + } override def supportsDataType(dataType: DataType): Boolean = AvroUtils.supportsDataType(dataType) diff --git a/connector/avro/src/test/scala/org/apache/spark/sql/avro/AvroFunctionsSuite.scala b/connector/avro/src/test/scala/org/apache/spark/sql/avro/AvroFunctionsSuite.scala index 096cdfe0b9ee4..8c128d4c7ea65 100644 --- a/connector/avro/src/test/scala/org/apache/spark/sql/avro/AvroFunctionsSuite.scala +++ b/connector/avro/src/test/scala/org/apache/spark/sql/avro/AvroFunctionsSuite.scala @@ -629,4 +629,40 @@ class AvroFunctionsSuite extends QueryTest with SharedSparkSession { assert(readbackPerson2.get(2).toString === person2.get(2)) } } + + test("schema_of_avro") { + val df = spark.range(1) + val avroIntType = s""" + |{ + | "type": "int", + | "name": "id" + |}""".stripMargin + checkAnswer(df.select(functions.schema_of_avro(avroIntType)), Row("INT")) + + val avroStructType = + """ + |{ + | "type": "record", + | "name": "person", + | "fields": [ + | {"name": "name", "type": "string"}, + | {"name": "age", "type": "int"}, + | {"name": "country", "type": "string"} + | ] + |}""".stripMargin + 
checkAnswer(df.select(functions.schema_of_avro(avroStructType)), + Row("STRUCT")) + + val avroMultiType = + """ + |{ + | "type": "record", + | "name": "person", + | "fields": [ + | {"name": "u", "type": ["int", "string"]} + | ] + |}""".stripMargin + checkAnswer(df.select(functions.schema_of_avro(avroMultiType)), + Row("STRUCT NOT NULL>")) + } } diff --git a/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/Dataset.scala b/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/Dataset.scala index 631e9057f8d15..75df538678a3d 100644 --- a/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/Dataset.scala +++ b/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/Dataset.scala @@ -42,7 +42,7 @@ import org.apache.spark.sql.errors.DataTypeErrors.toSQLId import org.apache.spark.sql.execution.QueryExecution import org.apache.spark.sql.expressions.SparkUserDefinedFunction import org.apache.spark.sql.functions.{struct, to_json} -import org.apache.spark.sql.internal.{ColumnNodeToProtoConverter, DataFrameWriterImpl, DataFrameWriterV2Impl, MergeIntoWriterImpl, ToScalaUDF, UDFAdaptors, UnresolvedAttribute, UnresolvedRegex} +import org.apache.spark.sql.internal.{ColumnNodeToProtoConverter, DataFrameWriterImpl, DataFrameWriterV2Impl, MergeIntoWriterImpl, SubqueryExpressionNode, SubqueryType, ToScalaUDF, UDFAdaptors, UnresolvedAttribute, UnresolvedRegex} import org.apache.spark.sql.streaming.DataStreamWriter import org.apache.spark.sql.types.{Metadata, StructType} import org.apache.spark.storage.StorageLevel @@ -288,9 +288,10 @@ class Dataset[T] private[sql] ( /** @inheritdoc */ def stat: DataFrameStatFunctions = new DataFrameStatFunctions(toDF()) - private def buildJoin(right: Dataset[_])(f: proto.Join.Builder => Unit): DataFrame = { + private def buildJoin(right: Dataset[_], cols: Seq[Column] = Seq.empty)( + f: proto.Join.Builder => Unit): DataFrame = { checkSameSparkSession(right) - sparkSession.newDataFrame { builder => + 
sparkSession.newDataFrame(cols) { builder => val joinBuilder = builder.getJoinBuilder joinBuilder.setLeft(plan.getRoot).setRight(right.plan.getRoot) f(joinBuilder) @@ -334,7 +335,7 @@ class Dataset[T] private[sql] ( /** @inheritdoc */ def join(right: Dataset[_], joinExprs: Column, joinType: String): DataFrame = { - buildJoin(right) { builder => + buildJoin(right, Seq(joinExprs)) { builder => builder .setJoinType(toJoinType(joinType)) .setJoinCondition(joinExprs.expr) @@ -383,11 +384,50 @@ class Dataset[T] private[sql] ( } } + private def lateralJoin( + right: DS[_], + joinExprs: Option[Column], + joinType: String): DataFrame = { + val joinTypeValue = toJoinType(joinType) + joinTypeValue match { + case proto.Join.JoinType.JOIN_TYPE_INNER | proto.Join.JoinType.JOIN_TYPE_LEFT_OUTER | + proto.Join.JoinType.JOIN_TYPE_CROSS => + case _ => + throw new IllegalArgumentException(s"Unsupported lateral join type $joinType") + } + sparkSession.newDataFrame(joinExprs.toSeq) { builder => + val lateralJoinBuilder = builder.getLateralJoinBuilder + lateralJoinBuilder.setLeft(plan.getRoot).setRight(right.plan.getRoot) + joinExprs.foreach(c => lateralJoinBuilder.setJoinCondition(c.expr)) + lateralJoinBuilder.setJoinType(joinTypeValue) + } + } + + /** @inheritdoc */ + def lateralJoin(right: DS[_]): DataFrame = { + lateralJoin(right, None, "inner") + } + + /** @inheritdoc */ + def lateralJoin(right: DS[_], joinExprs: Column): DataFrame = { + lateralJoin(right, Some(joinExprs), "inner") + } + + /** @inheritdoc */ + def lateralJoin(right: DS[_], joinType: String): DataFrame = { + lateralJoin(right, None, joinType) + } + + /** @inheritdoc */ + def lateralJoin(right: DS[_], joinExprs: Column, joinType: String): DataFrame = { + lateralJoin(right, Some(joinExprs), joinType) + } + override protected def sortInternal(global: Boolean, sortCols: Seq[Column]): Dataset[T] = { val sortExprs = sortCols.map { c => ColumnNodeToProtoConverter(c.sortOrder).getSortOrder } - 
sparkSession.newDataset(agnosticEncoder) { builder => + sparkSession.newDataset(agnosticEncoder, sortCols) { builder => builder.getSortBuilder .setInput(plan.getRoot) .setIsGlobal(global) @@ -463,7 +503,7 @@ class Dataset[T] private[sql] ( * methods and typed select methods is the encoder used to build the return dataset. */ private def selectUntyped(encoder: AgnosticEncoder[_], cols: Seq[Column]): Dataset[_] = { - sparkSession.newDataset(encoder) { builder => + sparkSession.newDataset(encoder, cols) { builder => builder.getProjectBuilder .setInput(plan.getRoot) .addAllExpressions(cols.map(_.typedExpr(this.encoder)).asJava) @@ -471,29 +511,32 @@ class Dataset[T] private[sql] ( } /** @inheritdoc */ - def filter(condition: Column): Dataset[T] = sparkSession.newDataset(agnosticEncoder) { - builder => + def filter(condition: Column): Dataset[T] = { + sparkSession.newDataset(agnosticEncoder, Seq(condition)) { builder => builder.getFilterBuilder.setInput(plan.getRoot).setCondition(condition.expr) + } } private def buildUnpivot( ids: Array[Column], valuesOption: Option[Array[Column]], variableColumnName: String, - valueColumnName: String): DataFrame = sparkSession.newDataFrame { builder => - val unpivot = builder.getUnpivotBuilder - .setInput(plan.getRoot) - .addAllIds(ids.toImmutableArraySeq.map(_.expr).asJava) - .setVariableColumnName(variableColumnName) - .setValueColumnName(valueColumnName) - valuesOption.foreach { values => - unpivot.getValuesBuilder - .addAllValues(values.toImmutableArraySeq.map(_.expr).asJava) + valueColumnName: String): DataFrame = { + sparkSession.newDataFrame(ids.toSeq ++ valuesOption.toSeq.flatten) { builder => + val unpivot = builder.getUnpivotBuilder + .setInput(plan.getRoot) + .addAllIds(ids.toImmutableArraySeq.map(_.expr).asJava) + .setVariableColumnName(variableColumnName) + .setValueColumnName(valueColumnName) + valuesOption.foreach { values => + unpivot.getValuesBuilder + .addAllValues(values.toImmutableArraySeq.map(_.expr).asJava) + } } 
} private def buildTranspose(indices: Seq[Column]): DataFrame = - sparkSession.newDataFrame { builder => + sparkSession.newDataFrame(indices) { builder => val transpose = builder.getTransposeBuilder.setInput(plan.getRoot) indices.foreach { indexColumn => transpose.addIndexColumns(indexColumn.expr) @@ -585,18 +628,15 @@ class Dataset[T] private[sql] ( def transpose(): DataFrame = buildTranspose(Seq.empty) - // TODO(SPARK-50134): Support scalar Subquery API in Spark Connect - // scalastyle:off not.implemented.error.usage /** @inheritdoc */ def scalar(): Column = { - ??? + Column(SubqueryExpressionNode(plan.getRoot, SubqueryType.SCALAR)) } /** @inheritdoc */ def exists(): Column = { - ??? + Column(SubqueryExpressionNode(plan.getRoot, SubqueryType.EXISTS)) } - // scalastyle:on not.implemented.error.usage /** @inheritdoc */ def limit(n: Int): Dataset[T] = sparkSession.newDataset(agnosticEncoder) { builder => @@ -743,7 +783,7 @@ class Dataset[T] private[sql] ( val aliases = values.zip(names).map { case (value, name) => value.name(name).expr.getAlias } - sparkSession.newDataFrame { builder => + sparkSession.newDataFrame(values) { builder => builder.getWithColumnsBuilder .setInput(plan.getRoot) .addAllAliases(aliases.asJava) @@ -803,10 +843,12 @@ class Dataset[T] private[sql] ( @scala.annotation.varargs def drop(col: Column, cols: Column*): DataFrame = buildDrop(col +: cols) - private def buildDrop(cols: Seq[Column]): DataFrame = sparkSession.newDataFrame { builder => - builder.getDropBuilder - .setInput(plan.getRoot) - .addAllColumns(cols.map(_.expr).asJava) + private def buildDrop(cols: Seq[Column]): DataFrame = { + sparkSession.newDataFrame(cols) { builder => + builder.getDropBuilder + .setInput(plan.getRoot) + .addAllColumns(cols.map(_.expr).asJava) + } } private def buildDropByNames(cols: Seq[String]): DataFrame = sparkSession.newDataFrame { @@ -976,12 +1018,13 @@ class Dataset[T] private[sql] ( private def buildRepartitionByExpression( numPartitions: Option[Int], - 
partitionExprs: Seq[Column]): Dataset[T] = sparkSession.newDataset(agnosticEncoder) { - builder => + partitionExprs: Seq[Column]): Dataset[T] = { + sparkSession.newDataset(agnosticEncoder, partitionExprs) { builder => val repartitionBuilder = builder.getRepartitionByExpressionBuilder .setInput(plan.getRoot) .addAllPartitionExprs(partitionExprs.map(_.expr).asJava) numPartitions.foreach(repartitionBuilder.setNumPartitions) + } } /** @inheritdoc */ @@ -1113,7 +1156,7 @@ class Dataset[T] private[sql] ( /** @inheritdoc */ @scala.annotation.varargs def observe(name: String, expr: Column, exprs: Column*): Dataset[T] = { - sparkSession.newDataset(agnosticEncoder) { builder => + sparkSession.newDataset(agnosticEncoder, expr +: exprs) { builder => builder.getCollectMetricsBuilder .setInput(plan.getRoot) .setName(name) diff --git a/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/KeyValueGroupedDataset.scala b/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/KeyValueGroupedDataset.scala index 63b5f27c4745e..d5505d2222c4f 100644 --- a/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/KeyValueGroupedDataset.scala +++ b/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/KeyValueGroupedDataset.scala @@ -27,7 +27,7 @@ import org.apache.spark.connect.proto import org.apache.spark.sql.catalyst.encoders.AgnosticEncoder import org.apache.spark.sql.catalyst.encoders.AgnosticEncoders.{agnosticEncoderFor, ProductEncoder} import org.apache.spark.sql.connect.ConnectConversions._ -import org.apache.spark.sql.connect.common.UdfUtils +import org.apache.spark.sql.connect.common.{DataTypeProtoConverter, UdfUtils} import org.apache.spark.sql.expressions.SparkUserDefinedFunction import org.apache.spark.sql.functions.col import org.apache.spark.sql.internal.ColumnNodeToProtoConverter.toExpr @@ -502,6 +502,7 @@ private class KeyValueGroupedDatasetImpl[K, V, IK, IV]( } val outputEncoder = agnosticEncoderFor[U] + val stateEncoder = 
agnosticEncoderFor[S] val nf = UDFAdaptors.flatMapGroupsWithStateWithMappedValues(func, valueMapFunc) sparkSession.newDataset[U](outputEncoder) { builder => @@ -509,11 +510,12 @@ private class KeyValueGroupedDatasetImpl[K, V, IK, IV]( groupMapBuilder .setInput(plan.getRoot) .addAllGroupingExpressions(groupingExprs) - .setFunc(getUdf(nf, outputEncoder)(ivEncoder)) + .setFunc(getUdf(nf, outputEncoder, stateEncoder)(ivEncoder)) .setIsMapGroupsWithState(isMapGroupWithState) .setOutputMode(if (outputMode.isEmpty) OutputMode.Update.toString else outputMode.get.toString) .setTimeoutConf(timeoutConf.toString) + .setStateSchema(DataTypeProtoConverter.toConnectProtoType(stateEncoder.schema)) if (initialStateImpl != null) { groupMapBuilder @@ -533,6 +535,21 @@ private class KeyValueGroupedDatasetImpl[K, V, IK, IV]( udf.apply(inputEncoders.map(_ => col("*")): _*).expr.getCommonInlineUserDefinedFunction } + private def getUdf[U: Encoder, S: Encoder]( + nf: AnyRef, + outputEncoder: AgnosticEncoder[U], + stateEncoder: AgnosticEncoder[S])( + inEncoders: AgnosticEncoder[_]*): proto.CommonInlineUserDefinedFunction = { + // Apply keyAs changes by setting kEncoder + // Add the state encoder to the inputEncoders. + val inputEncoders = kEncoder +: stateEncoder +: inEncoders + val udf = SparkUserDefinedFunction( + function = nf, + inputEncoders = inputEncoders, + outputEncoder = outputEncoder) + udf.apply(inputEncoders.map(_ => col("*")): _*).expr.getCommonInlineUserDefinedFunction + } + /** * We cannot deserialize a connect [[KeyValueGroupedDataset]] because of a class clash on the * server side. We null out the instance for now. 
diff --git a/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/RelationalGroupedDataset.scala b/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/RelationalGroupedDataset.scala index 5bded40b0d132..0944c88a67906 100644 --- a/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/RelationalGroupedDataset.scala +++ b/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/RelationalGroupedDataset.scala @@ -45,7 +45,7 @@ class RelationalGroupedDataset private[sql] ( import df.sparkSession.RichColumn protected def toDF(aggExprs: Seq[Column]): DataFrame = { - df.sparkSession.newDataFrame { builder => + df.sparkSession.newDataFrame(groupingExprs ++ aggExprs) { builder => val aggBuilder = builder.getAggregateBuilder .setInput(df.plan.getRoot) groupingExprs.foreach(c => aggBuilder.addGroupingExpressions(c.expr)) diff --git a/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/SQLContext.scala b/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/SQLContext.scala new file mode 100644 index 0000000000000..3603eb6ea508d --- /dev/null +++ b/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/SQLContext.scala @@ -0,0 +1,336 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql + +import java.util.{List => JList, Map => JMap, Properties} + +import scala.jdk.CollectionConverters.PropertiesHasAsScala +import scala.reflect.runtime.universe.TypeTag + +import org.apache.spark.SparkContext +import org.apache.spark.annotation.Stable +import org.apache.spark.api.java.JavaRDD +import org.apache.spark.rdd.RDD +import org.apache.spark.sql.connect.ConnectClientUnsupportedErrors +import org.apache.spark.sql.connect.ConnectConversions._ +import org.apache.spark.sql.sources.BaseRelation +import org.apache.spark.sql.streaming.{DataStreamReader, StreamingQueryManager} +import org.apache.spark.sql.types.StructType +import org.apache.spark.sql.util.ExecutionListenerManager + +@Stable +class SQLContext private[sql] (override val sparkSession: SparkSession) + extends api.SQLContext(sparkSession) { + + /** @inheritdoc */ + def newSession(): SQLContext = sparkSession.newSession().sqlContext + + /** @inheritdoc */ + def listenerManager: ExecutionListenerManager = sparkSession.listenerManager + + /** @inheritdoc */ + def setConf(props: Properties): Unit = sparkSession.conf.synchronized { + props.asScala.foreach { case (k, v) => sparkSession.conf.set(k, v) } + } + + /** @inheritdoc */ + def experimental: ExperimentalMethods = sparkSession.experimental + + /** @inheritdoc */ + def udf: UDFRegistration = sparkSession.udf + + // scalastyle:off + // Disable style checker so "implicits" object can start with lowercase i + + /** @inheritdoc */ + object implicits extends SQLImplicits { + + /** @inheritdoc */ + override protected def session: SparkSession = sparkSession + } + + // scalastyle:on + + /** @inheritdoc */ + def read: DataFrameReader = sparkSession.read + + /** @inheritdoc */ + def readStream: DataStreamReader = sparkSession.readStream + + /** + * Returns a `StreamingQueryManager` that allows managing all the + * 
[[org.apache.spark.sql.streaming.StreamingQuery StreamingQueries]] active on `this` context. + * + * @since 4.0.0 + */ + def streams: StreamingQueryManager = sparkSession.streams + + /** @inheritdoc */ + override def sparkContext: SparkContext = { + throw ConnectClientUnsupportedErrors.sparkContext() + } + + /** @inheritdoc */ + override def emptyDataFrame: Dataset[Row] = super.emptyDataFrame + + /** @inheritdoc */ + override def createDataFrame[A <: Product: TypeTag](rdd: RDD[A]): Dataset[Row] = + super.createDataFrame(rdd) + + /** @inheritdoc */ + override def createDataFrame[A <: Product: TypeTag](data: Seq[A]): Dataset[Row] = + super.createDataFrame(data) + + /** @inheritdoc */ + override def baseRelationToDataFrame(baseRelation: BaseRelation): Dataset[Row] = + super.baseRelationToDataFrame(baseRelation) + + /** @inheritdoc */ + override def createDataFrame(rowRDD: RDD[Row], schema: StructType): Dataset[Row] = + super.createDataFrame(rowRDD, schema) + + /** @inheritdoc */ + override def createDataset[T: Encoder](data: Seq[T]): Dataset[T] = super.createDataset(data) + + /** @inheritdoc */ + override def createDataset[T: Encoder](data: RDD[T]): Dataset[T] = super.createDataset(data) + + /** @inheritdoc */ + override def createDataset[T: Encoder](data: JList[T]): Dataset[T] = + super.createDataset(data) + + /** @inheritdoc */ + override def createDataFrame(rowRDD: JavaRDD[Row], schema: StructType): Dataset[Row] = + super.createDataFrame(rowRDD, schema) + + /** @inheritdoc */ + override def createDataFrame(rows: JList[Row], schema: StructType): Dataset[Row] = + super.createDataFrame(rows, schema) + + /** @inheritdoc */ + override def createDataFrame(rdd: RDD[_], beanClass: Class[_]): Dataset[Row] = + super.createDataFrame(rdd, beanClass) + + /** @inheritdoc */ + override def createDataFrame(rdd: JavaRDD[_], beanClass: Class[_]): Dataset[Row] = + super.createDataFrame(rdd, beanClass) + + /** @inheritdoc */ + override def createDataFrame(data: JList[_], beanClass: 
Class[_]): Dataset[Row] = + super.createDataFrame(data, beanClass) + + /** @inheritdoc */ + override def createExternalTable(tableName: String, path: String): Dataset[Row] = + super.createExternalTable(tableName, path) + + /** @inheritdoc */ + override def createExternalTable( + tableName: String, + path: String, + source: String): Dataset[Row] = { + super.createExternalTable(tableName, path, source) + } + + /** @inheritdoc */ + override def createExternalTable( + tableName: String, + source: String, + options: JMap[String, String]): Dataset[Row] = { + super.createExternalTable(tableName, source, options) + } + + /** @inheritdoc */ + override def createExternalTable( + tableName: String, + source: String, + options: Map[String, String]): Dataset[Row] = { + super.createExternalTable(tableName, source, options) + } + + /** @inheritdoc */ + override def createExternalTable( + tableName: String, + source: String, + schema: StructType, + options: JMap[String, String]): Dataset[Row] = { + super.createExternalTable(tableName, source, schema, options) + } + + /** @inheritdoc */ + override def createExternalTable( + tableName: String, + source: String, + schema: StructType, + options: Map[String, String]): Dataset[Row] = { + super.createExternalTable(tableName, source, schema, options) + } + + /** @inheritdoc */ + override def range(end: Long): Dataset[Row] = super.range(end) + + /** @inheritdoc */ + override def range(start: Long, end: Long): Dataset[Row] = super.range(start, end) + + /** @inheritdoc */ + override def range(start: Long, end: Long, step: Long): Dataset[Row] = + super.range(start, end, step) + + /** @inheritdoc */ + override def range(start: Long, end: Long, step: Long, numPartitions: Int): Dataset[Row] = + super.range(start, end, step, numPartitions) + + /** @inheritdoc */ + override def sql(sqlText: String): Dataset[Row] = super.sql(sqlText) + + /** @inheritdoc */ + override def table(tableName: String): Dataset[Row] = super.table(tableName) + + /** 
@inheritdoc */ + override def tables(): Dataset[Row] = super.tables() + + /** @inheritdoc */ + override def tables(databaseName: String): Dataset[Row] = super.tables(databaseName) + + /** @inheritdoc */ + override def applySchema(rowRDD: RDD[Row], schema: StructType): Dataset[Row] = + super.applySchema(rowRDD, schema) + + /** @inheritdoc */ + override def applySchema(rowRDD: JavaRDD[Row], schema: StructType): Dataset[Row] = + super.applySchema(rowRDD, schema) + + /** @inheritdoc */ + override def applySchema(rdd: RDD[_], beanClass: Class[_]): Dataset[Row] = + super.applySchema(rdd, beanClass) + + /** @inheritdoc */ + override def applySchema(rdd: JavaRDD[_], beanClass: Class[_]): Dataset[Row] = + super.applySchema(rdd, beanClass) + + /** @inheritdoc */ + @scala.annotation.varargs + override def parquetFile(paths: String*): Dataset[Row] = super.parquetFile(paths: _*) + + /** @inheritdoc */ + override def jsonFile(path: String): Dataset[Row] = super.jsonFile(path) + + /** @inheritdoc */ + override def jsonFile(path: String, schema: StructType): Dataset[Row] = + super.jsonFile(path, schema) + + /** @inheritdoc */ + override def jsonFile(path: String, samplingRatio: Double): Dataset[Row] = + super.jsonFile(path, samplingRatio) + + /** @inheritdoc */ + override def jsonRDD(json: RDD[String]): Dataset[Row] = super.jsonRDD(json) + + /** @inheritdoc */ + override def jsonRDD(json: JavaRDD[String]): Dataset[Row] = super.jsonRDD(json) + + /** @inheritdoc */ + override def jsonRDD(json: RDD[String], schema: StructType): Dataset[Row] = + super.jsonRDD(json, schema) + + /** @inheritdoc */ + override def jsonRDD(json: JavaRDD[String], schema: StructType): Dataset[Row] = + super.jsonRDD(json, schema) + + /** @inheritdoc */ + override def jsonRDD(json: RDD[String], samplingRatio: Double): Dataset[Row] = + super.jsonRDD(json, samplingRatio) + + /** @inheritdoc */ + override def jsonRDD(json: JavaRDD[String], samplingRatio: Double): Dataset[Row] = + super.jsonRDD(json, 
samplingRatio) + + /** @inheritdoc */ + override def load(path: String): Dataset[Row] = super.load(path) + + /** @inheritdoc */ + override def load(path: String, source: String): Dataset[Row] = super.load(path, source) + + /** @inheritdoc */ + override def load(source: String, options: JMap[String, String]): Dataset[Row] = + super.load(source, options) + + /** @inheritdoc */ + override def load(source: String, options: Map[String, String]): Dataset[Row] = + super.load(source, options) + + /** @inheritdoc */ + override def load( + source: String, + schema: StructType, + options: JMap[String, String]): Dataset[Row] = { + super.load(source, schema, options) + } + + /** @inheritdoc */ + override def load( + source: String, + schema: StructType, + options: Map[String, String]): Dataset[Row] = { + super.load(source, schema, options) + } + + /** @inheritdoc */ + override def jdbc(url: String, table: String): Dataset[Row] = super.jdbc(url, table) + + /** @inheritdoc */ + override def jdbc( + url: String, + table: String, + columnName: String, + lowerBound: Long, + upperBound: Long, + numPartitions: Int): Dataset[Row] = { + super.jdbc(url, table, columnName, lowerBound, upperBound, numPartitions) + } + + /** @inheritdoc */ + override def jdbc(url: String, table: String, theParts: Array[String]): Dataset[Row] = { + super.jdbc(url, table, theParts) + } +} +object SQLContext extends api.SQLContextCompanion { + + override private[sql] type SQLContextImpl = SQLContext + override private[sql] type SparkContextImpl = SparkContext + + /** + * Get the singleton SQLContext if it exists or create a new one. + * + * This function can be used to create a singleton SQLContext object that can be shared across + * the JVM. + * + * If there is an active SQLContext for current thread, it will be returned instead of the + * global one. + * + * @param sparkContext + * The SparkContext. This parameter is not used in Spark Connect. 
+ * + * @since 4.0.0 + */ + def getOrCreate(sparkContext: SparkContext): SQLContext = { + SparkSession.builder().getOrCreate().sqlContext + } + + /** @inheritdoc */ + override def setActive(sqlContext: SQLContext): Unit = super.setActive(sqlContext) +} diff --git a/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/SQLImplicits.scala b/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/SQLImplicits.scala index 4690253da808b..993b09ace9139 100644 --- a/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/SQLImplicits.scala +++ b/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/SQLImplicits.scala @@ -17,7 +17,8 @@ package org.apache.spark.sql /** @inheritdoc */ -abstract class SQLImplicits private[sql] (override val session: SparkSession) - extends api.SQLImplicits { +abstract class SQLImplicits extends api.SQLImplicits { type DS[U] = Dataset[U] + + protected def session: SparkSession } diff --git a/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/SparkSession.scala b/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/SparkSession.scala index 7edb1f51f11b1..89519034d07cc 100644 --- a/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/SparkSession.scala +++ b/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/SparkSession.scala @@ -34,6 +34,7 @@ import org.apache.spark.annotation.{DeveloperApi, Experimental, Since} import org.apache.spark.api.java.JavaRDD import org.apache.spark.connect.proto import org.apache.spark.connect.proto.ExecutePlanResponse +import org.apache.spark.connect.proto.ExecutePlanResponse.ObservedMetrics import org.apache.spark.internal.Logging import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalog.Catalog @@ -45,7 +46,7 @@ import org.apache.spark.sql.connect.client.{ClassFinder, CloseableIterator, Spar import org.apache.spark.sql.connect.client.SparkConnectClient.Configuration import 
org.apache.spark.sql.connect.client.arrow.ArrowSerializer import org.apache.spark.sql.functions.lit -import org.apache.spark.sql.internal.{CatalogImpl, ConnectRuntimeConfig, SessionCleaner, SessionState, SharedState, SqlApiConf} +import org.apache.spark.sql.internal.{CatalogImpl, ConnectRuntimeConfig, SessionCleaner, SessionState, SharedState, SqlApiConf, SubqueryExpressionNode} import org.apache.spark.sql.internal.ColumnNodeToProtoConverter.{toExpr, toTypedExpr} import org.apache.spark.sql.sources.BaseRelation import org.apache.spark.sql.streaming.DataStreamReader @@ -187,8 +188,7 @@ class SparkSession private[sql] ( throw ConnectClientUnsupportedErrors.sessionState() /** @inheritdoc */ - override def sqlContext: SQLContext = - throw ConnectClientUnsupportedErrors.sqlContext() + override val sqlContext: SQLContext = new SQLContext(this) /** @inheritdoc */ override def listenerManager: ExecutionListenerManager = @@ -210,15 +210,38 @@ class SparkSession private[sql] ( throw ConnectClientUnsupportedErrors.executeCommand() /** @inheritdoc */ - @Experimental - def sql(sqlText: String, args: Array[_]): DataFrame = newDataFrame { builder => + def sql(sqlText: String, args: Array[_]): DataFrame = { + val sqlCommand = proto.SqlCommand + .newBuilder() + .setSql(sqlText) + .addAllPosArguments(args.map(lit(_).expr).toImmutableArraySeq.asJava) + .build() + sql(sqlCommand) + } + + /** @inheritdoc */ + def sql(sqlText: String, args: Map[String, Any]): DataFrame = { + sql(sqlText, args.asJava) + } + + /** @inheritdoc */ + override def sql(sqlText: String, args: java.util.Map[String, Any]): DataFrame = { + val sqlCommand = proto.SqlCommand + .newBuilder() + .setSql(sqlText) + .putAllNamedArguments(args.asScala.map { case (k, v) => (k, lit(v).expr) }.asJava) + .build() + sql(sqlCommand) + } + + /** @inheritdoc */ + override def sql(query: String): DataFrame = { + sql(query, Array.empty) + } + + private def sql(sqlCommand: proto.SqlCommand): DataFrame = newDataFrame { builder => // 
Send the SQL once to the server and then check the output. - val cmd = newCommand(b => - b.setSqlCommand( - proto.SqlCommand - .newBuilder() - .setSql(sqlText) - .addAllPosArguments(args.map(lit(_).expr).toImmutableArraySeq.asJava))) + val cmd = newCommand(b => b.setSqlCommand(sqlCommand)) val plan = proto.Plan.newBuilder().setCommand(cmd) val responseIter = client.execute(plan.build()) @@ -234,43 +257,6 @@ class SparkSession private[sql] ( } } - /** @inheritdoc */ - @Experimental - def sql(sqlText: String, args: Map[String, Any]): DataFrame = { - sql(sqlText, args.asJava) - } - - /** @inheritdoc */ - @Experimental - override def sql(sqlText: String, args: java.util.Map[String, Any]): DataFrame = newDataFrame { - builder => - // Send the SQL once to the server and then check the output. - val cmd = newCommand(b => - b.setSqlCommand( - proto.SqlCommand - .newBuilder() - .setSql(sqlText) - .putAllNamedArguments(args.asScala.map { case (k, v) => (k, lit(v).expr) }.asJava))) - val plan = proto.Plan.newBuilder().setCommand(cmd) - val responseIter = client.execute(plan.build()) - - try { - val response = responseIter - .find(_.hasSqlCommandResult) - .getOrElse(throw new RuntimeException("SQLCommandResult must be present")) - // Update the builder with the values from the result. 
- builder.mergeFrom(response.getSqlCommandResult.getRelation) - } finally { - // consume the rest of the iterator - responseIter.foreach(_ => ()) - } - } - - /** @inheritdoc */ - override def sql(query: String): DataFrame = { - sql(query, Array.empty) - } - /** @inheritdoc */ def read: DataFrameReader = new DataFrameReader(this) @@ -314,7 +300,9 @@ class SparkSession private[sql] ( // scalastyle:off /** @inheritdoc */ - object implicits extends SQLImplicits(this) + object implicits extends SQLImplicits { + override protected def session: SparkSession = SparkSession.this + } // scalastyle:on /** @inheritdoc */ @@ -336,20 +324,111 @@ class SparkSession private[sql] ( } } + /** + * Create a DataFrame including the proto plan built by the given function. + * + * @param f + * The function to build the proto plan. + * @return + * The DataFrame created from the proto plan. + */ @Since("4.0.0") @DeveloperApi def newDataFrame(f: proto.Relation.Builder => Unit): DataFrame = { newDataset(UnboundRowEncoder)(f) } + /** + * Create a DataFrame including the proto plan built by the given function. + * + * Use this method when columns are used to create a new DataFrame. When there are columns + * referring to other Dataset or DataFrame, the plan will be wrapped with a `WithRelation`. + * + * {{{ + * with_relations [id 10] + * root: plan [id 9] using columns referring to other Dataset or DataFrame, holding plan ids + * reference: + * refs#1: [id 8] plan for the reference 1 + * refs#2: [id 5] plan for the reference 2 + * }}} + * + * @param cols + * The columns to be used in the DataFrame. + * @param f + * The function to build the proto plan. + * @return + * The DataFrame created from the proto plan. + */ + @Since("4.0.0") + @DeveloperApi + def newDataFrame(cols: Seq[Column])(f: proto.Relation.Builder => Unit): DataFrame = { + newDataset(UnboundRowEncoder, cols)(f) + } + + /** + * Create a Dataset including the proto plan built by the given function. 
+ * + * @param encoder + * The encoder for the Dataset. + * @param f + * The function to build the proto plan. + * @return + * The Dataset created from the proto plan. + */ @Since("4.0.0") @DeveloperApi def newDataset[T](encoder: AgnosticEncoder[T])( f: proto.Relation.Builder => Unit): Dataset[T] = { + newDataset[T](encoder, Seq.empty)(f) + } + + /** + * Create a Dataset including the proto plan built by the given function. + * + * Use this method when columns are used to create a new Dataset. When there are columns + * referring to other Dataset or DataFrame, the plan will be wrapped with a `WithRelation`. + * + * {{{ + * with_relations [id 10] + * root: plan [id 9] using columns referring to other Dataset or DataFrame, holding plan ids + * reference: + * refs#1: [id 8] plan for the reference 1 + * refs#2: [id 5] plan for the reference 2 + * }}} + * + * @param encoder + * The encoder for the Dataset. + * @param cols + * The columns to be used in the DataFrame. + * @param f + * The function to build the proto plan. + * @return + * The Dataset created from the proto plan. 
+ */ + @Since("4.0.0") + @DeveloperApi + def newDataset[T](encoder: AgnosticEncoder[T], cols: Seq[Column])( + f: proto.Relation.Builder => Unit): Dataset[T] = { + val references = cols.flatMap(_.node.collect { case n: SubqueryExpressionNode => + n.relation + }) + val builder = proto.Relation.newBuilder() f(builder) builder.getCommonBuilder.setPlanId(planIdGenerator.getAndIncrement()) - val plan = proto.Plan.newBuilder().setRoot(builder).build() + + val rootBuilder = if (references.length == 0) { + builder + } else { + val rootBuilder = proto.Relation.newBuilder() + rootBuilder.getWithRelationsBuilder + .setRoot(builder) + .addAllReferences(references.asJava) + rootBuilder.getCommonBuilder.setPlanId(planIdGenerator.getAndIncrement()) + rootBuilder + } + + val plan = proto.Plan.newBuilder().setRoot(rootBuilder).build() new Dataset[T](this, plan, encoder) } @@ -385,13 +464,8 @@ class SparkSession private[sql] ( private[sql] def timeZoneId: String = conf.get(SqlApiConf.SESSION_LOCAL_TIMEZONE_KEY) private[sql] def execute[T](plan: proto.Plan, encoder: AgnosticEncoder[T]): SparkResult[T] = { - val value = client.execute(plan) - new SparkResult( - value, - allocator, - encoder, - timeZoneId, - Some(setMetricsAndUnregisterObservation)) + val value = executeInternal(plan) + new SparkResult(value, allocator, encoder, timeZoneId) } private[sql] def execute(f: proto.Relation.Builder => Unit): Unit = { @@ -400,7 +474,7 @@ class SparkSession private[sql] ( builder.getCommonBuilder.setPlanId(planIdGenerator.getAndIncrement()) val plan = proto.Plan.newBuilder().setRoot(builder).build() // .foreach forces that the iterator is consumed and closed - client.execute(plan).foreach(_ => ()) + executeInternal(plan).foreach(_ => ()) } @Since("4.0.0") @@ -409,11 +483,26 @@ class SparkSession private[sql] ( val plan = proto.Plan.newBuilder().setCommand(command).build() // .toSeq forces that the iterator is consumed and closed. On top, ignore all // progress messages. 
- client.execute(plan).filter(!_.hasExecutionProgress).toSeq + executeInternal(plan).filter(!_.hasExecutionProgress).toSeq } - private[sql] def execute(plan: proto.Plan): CloseableIterator[ExecutePlanResponse] = - client.execute(plan) + /** + * The real `execute` method that calls into `SparkConnectClient`. + * + * Here we inject a lazy map to process registered observed metrics, so consumers of the + * returned iterator does not need to worry about it. + * + * Please make sure all `execute` methods call this method. + */ + private[sql] def executeInternal(plan: proto.Plan): CloseableIterator[ExecutePlanResponse] = { + client + .execute(plan) + .map { response => + // Note, this map() is lazy. + processRegisteredObservedMetrics(response.getObservedMetricsList) + response + } + } private[sql] def registerUdf(udf: proto.CommonInlineUserDefinedFunction): Unit = { val command = proto.Command.newBuilder().setRegisterFunction(udf).build() @@ -555,10 +644,14 @@ class SparkSession private[sql] ( observationRegistry.putIfAbsent(planId, observation) } - private[sql] def setMetricsAndUnregisterObservation(planId: Long, metrics: Row): Unit = { - val observationOrNull = observationRegistry.remove(planId) - if (observationOrNull != null) { - observationOrNull.setMetricsAndNotify(metrics) + private def processRegisteredObservedMetrics(metrics: java.util.List[ObservedMetrics]): Unit = { + metrics.asScala.map { metric => + // Here we only process metrics that belong to a registered Observation object. + // All metrics, whether registered or not, will be collected by `SparkResult`. 
+ val observationOrNull = observationRegistry.remove(metric.getPlanId) + if (observationOrNull != null) { + observationOrNull.setMetricsAndNotify(SparkResult.transformObservedMetrics(metric)) + } } } diff --git a/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/TableValuedFunction.scala b/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/TableValuedFunction.scala index 4f2687b537862..2a5afd1d58717 100644 --- a/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/TableValuedFunction.scala +++ b/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/TableValuedFunction.scala @@ -47,7 +47,7 @@ class TableValuedFunction(sparkSession: SparkSession) extends api.TableValuedFun } private def fn(name: String, args: Seq[Column]): Dataset[Row] = { - sparkSession.newDataFrame { builder => + sparkSession.newDataFrame(args) { builder => builder.getUnresolvedTableValuedFunctionBuilder .setFunctionName(name) .addAllArguments(args.map(toExpr).asJava) diff --git a/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/connect/ConnectClientUnsupportedErrors.scala b/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/connect/ConnectClientUnsupportedErrors.scala index e73bcb8a0059d..5783a20348d75 100644 --- a/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/connect/ConnectClientUnsupportedErrors.scala +++ b/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/connect/ConnectClientUnsupportedErrors.scala @@ -53,7 +53,4 @@ private[sql] object ConnectClientUnsupportedErrors { def sparkContext(): SparkUnsupportedOperationException = unsupportedFeatureException("SESSION_SPARK_CONTEXT") - - def sqlContext(): SparkUnsupportedOperationException = - unsupportedFeatureException("SESSION_SQL_CONTEXT") } diff --git a/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/internal/ConnectRuntimeConfig.scala 
b/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/internal/ConnectRuntimeConfig.scala index be1a13cb2fed2..74348e8e015e2 100644 --- a/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/internal/ConnectRuntimeConfig.scala +++ b/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/internal/ConnectRuntimeConfig.scala @@ -18,6 +18,7 @@ package org.apache.spark.sql.internal import org.apache.spark.connect.proto.{ConfigRequest, ConfigResponse, KeyValue} import org.apache.spark.internal.Logging +import org.apache.spark.internal.config.{ConfigEntry, ConfigReader, OptionalConfigEntry} import org.apache.spark.sql.RuntimeConfig import org.apache.spark.sql.connect.client.SparkConnectClient @@ -28,7 +29,7 @@ import org.apache.spark.sql.connect.client.SparkConnectClient */ class ConnectRuntimeConfig private[sql] (client: SparkConnectClient) extends RuntimeConfig - with Logging { + with Logging { self => /** @inheritdoc */ def set(key: String, value: String): Unit = { @@ -37,6 +38,13 @@ class ConnectRuntimeConfig private[sql] (client: SparkConnectClient) } } + /** @inheritdoc */ + override private[sql] def set[T](entry: ConfigEntry[T], value: T): Unit = { + require(entry != null, "entry cannot be null") + require(value != null, s"value cannot be null for key: ${entry.key}") + set(entry.key, entry.stringConverter(value)) + } + /** @inheritdoc */ @throws[NoSuchElementException]("if the key is not set and there is no default value") def get(key: String): String = getOption(key).getOrElse { @@ -45,11 +53,39 @@ class ConnectRuntimeConfig private[sql] (client: SparkConnectClient) /** @inheritdoc */ def get(key: String, default: String): String = { - executeConfigRequestSingleValue { builder => - builder.getGetWithDefaultBuilder.addPairsBuilder().setKey(key).setValue(default) + val kv = executeConfigRequestSinglePair { builder => + val pairsBuilder = builder.getGetWithDefaultBuilder + .addPairsBuilder() + .setKey(key) + if (default != null) { + 
pairsBuilder.setValue(default) + } + } + if (kv.hasValue) { + kv.getValue + } else { + default } } + /** @inheritdoc */ + override private[sql] def get[T](entry: ConfigEntry[T]): T = { + require(entry != null, "entry cannot be null") + entry.readFrom(reader) + } + + /** @inheritdoc */ + override private[sql] def get[T](entry: OptionalConfigEntry[T]): Option[T] = { + require(entry != null, "entry cannot be null") + entry.readFrom(reader) + } + + /** @inheritdoc */ + override private[sql] def get[T](entry: ConfigEntry[T], default: T): T = { + require(entry != null, "entry cannot be null") + Option(get(entry.key, null)).map(entry.valueConverter).getOrElse(default) + } + /** @inheritdoc */ def getAll: Map[String, String] = { val response = executeConfigRequest { builder => @@ -65,11 +101,11 @@ class ConnectRuntimeConfig private[sql] (client: SparkConnectClient) /** @inheritdoc */ def getOption(key: String): Option[String] = { - val pair = executeConfigRequestSinglePair { builder => + val kv = executeConfigRequestSinglePair { builder => builder.getGetOptionBuilder.addKeys(key) } - if (pair.hasValue) { - Option(pair.getValue) + if (kv.hasValue) { + Option(kv.getValue) } else { None } @@ -84,17 +120,11 @@ class ConnectRuntimeConfig private[sql] (client: SparkConnectClient) /** @inheritdoc */ def isModifiable(key: String): Boolean = { - val modifiable = executeConfigRequestSingleValue { builder => + val kv = executeConfigRequestSinglePair { builder => builder.getIsModifiableBuilder.addKeys(key) } - java.lang.Boolean.valueOf(modifiable) - } - - private def executeConfigRequestSingleValue( - f: ConfigRequest.Operation.Builder => Unit): String = { - val pair = executeConfigRequestSinglePair(f) - require(pair.hasValue, "The returned pair does not have a value set") - pair.getValue + require(kv.hasValue, "The returned pair does not have a value set") + java.lang.Boolean.valueOf(kv.getValue) } private def executeConfigRequestSinglePair( @@ -113,4 +143,6 @@ class 
ConnectRuntimeConfig private[sql] (client: SparkConnectClient) } response } + + private val reader = new ConfigReader((key: String) => Option(self.get(key, null))) } diff --git a/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/internal/columnNodeSupport.scala b/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/internal/columnNodeSupport.scala index 34a8a91a0ddf8..7802d9750bbc3 100644 --- a/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/internal/columnNodeSupport.scala +++ b/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/internal/columnNodeSupport.scala @@ -73,13 +73,19 @@ object ColumnNodeToProtoConverter extends (ColumnNode => proto.Expression) { .setColName(regex) planId.foreach(b.setPlanId) - case UnresolvedFunction(functionName, arguments, isDistinct, isUserDefinedFunction, _, _) => - // TODO(SPARK-49087) use internal namespace. + case UnresolvedFunction( + functionName, + arguments, + isDistinct, + isUserDefinedFunction, + isInternal, + _) => builder.getUnresolvedFunctionBuilder .setFunctionName(functionName) .setIsUserDefinedFunction(isUserDefinedFunction) .setIsDistinct(isDistinct) .addAllArguments(arguments.map(apply(_, e)).asJava) + .setIsInternal(isInternal) case Alias(child, name, metadata, _) => val b = builder.getAliasBuilder.setExpr(apply(child, e)) @@ -156,6 +162,7 @@ object ColumnNodeToProtoConverter extends (ColumnNode => proto.Expression) { case CaseWhenOtherwise(branches, otherwise, _) => val b = builder.getUnresolvedFunctionBuilder .setFunctionName("when") + .setIsInternal(false) branches.foreach { case (condition, value) => b.addArguments(apply(condition, e)) b.addArguments(apply(value, e)) @@ -164,6 +171,18 @@ object ColumnNodeToProtoConverter extends (ColumnNode => proto.Expression) { b.addArguments(apply(value, e)) } + case LazyExpression(child, _) => + builder.getLazyExpressionBuilder.setChild(apply(child, e)) + + case SubqueryExpressionNode(relation, subqueryType, _) => + 
val b = builder.getSubqueryExpressionBuilder + b.setSubqueryType(subqueryType match { + case SubqueryType.SCALAR => proto.SubqueryExpression.SubqueryType.SUBQUERY_TYPE_SCALAR + case SubqueryType.EXISTS => proto.SubqueryExpression.SubqueryType.SUBQUERY_TYPE_EXISTS + }) + assert(relation.hasCommon && relation.getCommon.hasPlanId) + b.setPlanId(relation.getCommon.getPlanId) + case ProtoColumnNode(e, _) => return e @@ -214,4 +233,24 @@ case class ProtoColumnNode( override val origin: Origin = CurrentOrigin.get) extends ColumnNode { override def sql: String = expr.toString + override private[internal] def children: Seq[ColumnNodeLike] = Seq.empty +} + +sealed trait SubqueryType + +object SubqueryType { + case object SCALAR extends SubqueryType + case object EXISTS extends SubqueryType +} + +case class SubqueryExpressionNode( + relation: proto.Relation, + subqueryType: SubqueryType, + override val origin: Origin = CurrentOrigin.get) + extends ColumnNode { + override def sql: String = subqueryType match { + case SubqueryType.SCALAR => s"($relation)" + case _ => s"$subqueryType ($relation)" + } + override private[internal] def children: Seq[ColumnNodeLike] = Seq.empty } diff --git a/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/streaming/DataStreamWriter.scala b/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/streaming/DataStreamWriter.scala index 9fcc31e562682..b2c4fcf64e70f 100644 --- a/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/streaming/DataStreamWriter.scala +++ b/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/streaming/DataStreamWriter.scala @@ -135,7 +135,10 @@ final class DataStreamWriter[T] private[sql] (ds: Dataset[T]) extends api.DataSt /** @inheritdoc */ @Evolving def foreachBatch(function: (Dataset[T], Long) => Unit): this.type = { - val serializedFn = SparkSerDeUtils.serialize(function) + // SPARK-50661: the client should send the encoder for the input dataset together with the + // 
function to the server. + val serializedFn = + SparkSerDeUtils.serialize(ForeachWriterPacket(function, ds.agnosticEncoder)) sinkBuilder.getForeachBatchBuilder.getScalaFunctionBuilder .setPayload(ByteString.copyFrom(serializedFn)) .setOutputType(DataTypeProtoConverter.toConnectProtoType(NullType)) // Unused. diff --git a/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/ClientE2ETestSuite.scala b/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/ClientE2ETestSuite.scala index 0371981b728d1..c7979b8e033ea 100644 --- a/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/ClientE2ETestSuite.scala +++ b/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/ClientE2ETestSuite.scala @@ -33,6 +33,7 @@ import org.scalatest.PrivateMethodTester import org.apache.spark.{SparkArithmeticException, SparkException, SparkUpgradeException} import org.apache.spark.SparkBuildInfo.{spark_version => SPARK_VERSION} +import org.apache.spark.internal.config.ConfigBuilder import org.apache.spark.sql.catalyst.analysis.{NamespaceAlreadyExistsException, NoSuchNamespaceException, TableAlreadyExistsException, TempTableAlreadyExistsException} import org.apache.spark.sql.catalyst.encoders.AgnosticEncoders.StringEncoder import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema @@ -1006,8 +1007,12 @@ class ClientE2ETestSuite test("RuntimeConfig") { intercept[NoSuchElementException](spark.conf.get("foo.bar")) assert(spark.conf.getOption("foo.bar").isEmpty) + assert(spark.conf.get("foo.bar", "nope") == "nope") + assert(spark.conf.get("foo.bar", null) == null) spark.conf.set("foo.bar", value = true) assert(spark.conf.getOption("foo.bar") === Option("true")) + assert(spark.conf.get("foo.bar", "nope") === "true") + assert(spark.conf.get("foo.bar", null) === "true") spark.conf.set("foo.bar.numBaz", 100L) assert(spark.conf.get("foo.bar.numBaz") === "100") spark.conf.set("foo.bar.name", "donkey") @@ -1020,6 +1025,24 @@ class ClientE2ETestSuite 
assert(spark.conf.isModifiable("spark.sql.ansi.enabled")) assert(!spark.conf.isModifiable("spark.sql.globalTempDatabase")) intercept[Exception](spark.conf.set("spark.sql.globalTempDatabase", "/dev/null")) + + val entry = ConfigBuilder("my.simple.conf").intConf.createOptional + intercept[NoSuchElementException](spark.conf.get(entry.key)) + assert(spark.conf.get(entry).isEmpty) + assert(spark.conf.get(entry, Option(55)) === Option(55)) + spark.conf.set(entry, Option(33)) + assert(spark.conf.get(entry.key) === "33") + assert(spark.conf.get(entry) === Option(33)) + assert(spark.conf.get(entry, Option(55)) === Option(33)) + + val entryWithDefault = ConfigBuilder("my.important.conf").intConf.createWithDefault(10) + intercept[NoSuchElementException](spark.conf.get(entryWithDefault.key)) + assert(spark.conf.get(entryWithDefault) === 10) + assert(spark.conf.get(entryWithDefault, 11) === 11) + spark.conf.set(entryWithDefault, 12) + assert(spark.conf.get(entryWithDefault.key) === "12") + assert(spark.conf.get(entryWithDefault) === 12) + assert(spark.conf.get(entryWithDefault, 11) === 12) } test("SparkVersion") { @@ -1536,28 +1559,49 @@ class ClientE2ETestSuite val ob1Metrics = Map("ob1" -> new GenericRowWithSchema(Array(0, 49, 98), ob1Schema)) val ob2Metrics = Map("ob2" -> new GenericRowWithSchema(Array(-1, 48, 97), ob2Schema)) + val obMetrics = observedDf.collectResult().getObservedMetrics assert(df.collectResult().getObservedMetrics === Map.empty) assert(observedDf.collectResult().getObservedMetrics === ob1Metrics) - assert(observedObservedDf.collectResult().getObservedMetrics === ob1Metrics ++ ob2Metrics) - } - - test("Observation.get is blocked until the query is finished") { - val df = spark.range(99).withColumn("extra", col("id") - 1) - val observation = new Observation("ob1") - val observedDf = df.observe(observation, min("id"), avg("id"), max("id")) - - // Start a new thread to get the observation - val future = Future(observation.get)(ExecutionContext.global) - // 
make sure the thread is blocked right now - val e = intercept[java.util.concurrent.TimeoutException] { - SparkThreadUtils.awaitResult(future, 2.seconds) + assert(obMetrics.map(_._2.schema) === Seq(ob1Schema)) + + val obObMetrics = observedObservedDf.collectResult().getObservedMetrics + assert(obObMetrics === ob1Metrics ++ ob2Metrics) + assert(obObMetrics.map(_._2.schema).exists(_.equals(ob1Schema))) + assert(obObMetrics.map(_._2.schema).exists(_.equals(ob2Schema))) + } + + for (collectFunc <- Seq( + ("collect", (df: DataFrame) => df.collect()), + ("collectAsList", (df: DataFrame) => df.collectAsList()), + ("collectResult", (df: DataFrame) => df.collectResult().length), + ("write", (df: DataFrame) => df.write.format("noop").mode("append").save()))) + test( + "Observation.get is blocked until the query is finished, " + + s"collect using method ${collectFunc._1}") { + val df = spark.range(99).withColumn("extra", col("id") - 1) + val ob1 = new Observation("ob1") + val ob2 = new Observation("ob2") + val observedDf = df.observe(ob1, min("id"), avg("id"), max("id")) + val observedObservedDf = observedDf.observe(ob2, min("extra"), avg("extra"), max("extra")) + // Start new threads to get observations + val future1 = Future(ob1.get)(ExecutionContext.global) + val future2 = Future(ob2.get)(ExecutionContext.global) + // make sure the threads are blocked right now + val e1 = intercept[java.util.concurrent.TimeoutException] { + SparkThreadUtils.awaitResult(future1, 2.seconds) + } + assert(e1.getMessage.contains("timed out after")) + val e2 = intercept[java.util.concurrent.TimeoutException] { + SparkThreadUtils.awaitResult(future2, 2.seconds) + } + assert(e2.getMessage.contains("timed out after")) + collectFunc._2(observedObservedDf) + // make sure the threads are unblocked after the query is finished + val metrics1 = SparkThreadUtils.awaitResult(future1, 5.seconds) + assert(metrics1 === Map("min(id)" -> 0, "avg(id)" -> 49, "max(id)" -> 98)) + val metrics2 = 
SparkThreadUtils.awaitResult(future2, 5.seconds) + assert(metrics2 === Map("min(extra)" -> -1, "avg(extra)" -> 48, "max(extra)" -> 97)) } - assert(e.getMessage.contains("Future timed out")) - observedDf.collect() - // make sure the thread is unblocked after the query is finished - val metrics = SparkThreadUtils.awaitResult(future, 2.seconds) - assert(metrics === Map("min(id)" -> 0, "avg(id)" -> 49, "max(id)" -> 98)) - } test("SPARK-48852: trim function on a string column returns correct results") { val session: SparkSession = spark diff --git a/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/DataFrameSubquerySuite.scala b/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/DataFrameSubquerySuite.scala new file mode 100644 index 0000000000000..1d2165b668f61 --- /dev/null +++ b/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/DataFrameSubquerySuite.scala @@ -0,0 +1,732 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql + +import org.apache.spark.SparkRuntimeException +import org.apache.spark.sql.functions._ +import org.apache.spark.sql.test.{QueryTest, RemoteSparkSession} + +class DataFrameSubquerySuite extends QueryTest with RemoteSparkSession { + import testImplicits._ + + val row = identity[(java.lang.Integer, java.lang.Double)](_) + + lazy val l = Seq( + row((1, 2.0)), + row((1, 2.0)), + row((2, 1.0)), + row((2, 1.0)), + row((3, 3.0)), + row((null, null)), + row((null, 5.0)), + row((6, null))).toDF("a", "b") + + lazy val r = Seq( + row((2, 3.0)), + row((2, 3.0)), + row((3, 2.0)), + row((4, 1.0)), + row((null, null)), + row((null, 5.0)), + row((6, null))).toDF("c", "d") + + override def beforeAll(): Unit = { + super.beforeAll() + l.createOrReplaceTempView("l") + r.createOrReplaceTempView("r") + } + + test("noop outer()") { + checkAnswer(spark.range(1).select($"id".outer()), Row(0)) + checkError( + intercept[AnalysisException](spark.range(1).select($"outer_col".outer()).collect()), + "UNRESOLVED_COLUMN.WITH_SUGGESTION", + parameters = Map("objectName" -> "`outer_col`", "proposal" -> "`id`")) + } + + test("simple uncorrelated scalar subquery") { + checkAnswer( + spark.range(1).select(spark.range(1).select(lit(1)).scalar().as("b")), + sql("select (select 1 as b) as b")) + + checkAnswer( + spark + .range(1) + .select( + spark.range(1).select(spark.range(1).select(lit(1)).scalar() + 1).scalar() + lit(1)), + sql("select (select (select 1) + 1) + 1")) + + // string type + checkAnswer( + spark.range(1).select(spark.range(1).select(lit("s")).scalar().as("b")), + sql("select (select 's' as s) as b")) + } + + test("uncorrelated scalar subquery should return null if there is 0 rows") { + checkAnswer( + spark.range(1).select(spark.range(1).select(lit("s")).limit(0).scalar().as("b")), + sql("select (select 's' as s limit 0) as b")) + } + + test("uncorrelated scalar subquery on a DataFrame generated query") { + withTempView("subqueryData") { + val df = 
Seq((1, "one"), (2, "two"), (3, "three")).toDF("key", "value") + df.createOrReplaceTempView("subqueryData") + + checkAnswer( + spark + .range(1) + .select( + spark + .table("subqueryData") + .select($"key") + .where($"key" > 2) + .orderBy($"key") + .limit(1) + .scalar() + lit(1)), + sql("select (select key from subqueryData where key > 2 order by key limit 1) + 1")) + + checkAnswer( + spark.range(1).select(-spark.table("subqueryData").select(max($"key")).scalar()), + sql("select -(select max(key) from subqueryData)")) + + checkAnswer( + spark.range(1).select(spark.table("subqueryData").select($"value").limit(0).scalar()), + sql("select (select value from subqueryData limit 0)")) + + checkAnswer( + spark + .range(1) + .select( + spark + .table("subqueryData") + .where($"key" === spark.table("subqueryData").select(max($"key")).scalar() - lit(1)) + .select(min($"value")) + .scalar()), + sql( + "select (select min(value) from subqueryData" + + " where key = (select max(key) from subqueryData) - 1)")) + } + } + + test("correlated scalar subquery in SELECT with outer() function") { + val df1 = spark.table("l").as("t1") + val df2 = spark.table("l").as("t2") + // We can use the `.outer()` function to wrap either the outer column, or the entire condition, + // or the SQL string of the condition. + Seq($"t1.a" === $"t2.a".outer(), ($"t1.a" === $"t2.a").outer(), expr("t1.a = t2.a").outer()) + .foreach { cond => + checkAnswer( + df1.select($"a", df2.where(cond).select(sum($"b")).scalar().as("sum_b")), + sql("select a, (select sum(b) from l t1 where t1.a = t2.a) sum_b from l t2")) + } + } + + test("correlated scalar subquery in WHERE with outer() function") { + // We can use the `.outer()` function to wrap either the outer column, or the entire condition, + // or the SQL string of the condition. 
+ Seq($"a".outer() === $"c", ($"a" === $"c").outer(), expr("a = c").outer()).foreach { cond => + checkAnswer( + spark.table("l").where($"b" < spark.table("r").where(cond).select(max($"d")).scalar()), + sql("select * from l where b < (select max(d) from r where a = c)")) + } + } + + test("EXISTS predicate subquery with outer() function") { + // We can use the `.outer()` function to wrap either the outer column, or the entire condition, + // or the SQL string of the condition. + Seq($"a".outer() === $"c", ($"a" === $"c").outer(), expr("a = c").outer()).foreach { cond => + checkAnswer( + spark.table("l").where(spark.table("r").where(cond).exists()), + sql("select * from l where exists (select * from r where l.a = r.c)")) + + checkAnswer( + spark.table("l").where(spark.table("r").where(cond).exists() && $"a" <= lit(2)), + sql("select * from l where exists (select * from r where l.a = r.c) and l.a <= 2")) + } + } + + test("SPARK-15677: Queries against local relations with scalar subquery in Select list") { + withTempView("t1", "t2") { + Seq((1, 1), (2, 2)).toDF("c1", "c2").createOrReplaceTempView("t1") + Seq((1, 1), (2, 2)).toDF("c1", "c2").createOrReplaceTempView("t2") + + checkAnswer( + spark.table("t1").select(spark.range(1).select(lit(1).as("col")).scalar()), + sql("SELECT (select 1 as col) from t1")) + + checkAnswer( + spark.table("t1").select(spark.table("t2").select(max($"c1")).scalar()), + sql("SELECT (select max(c1) from t2) from t1")) + + checkAnswer( + spark.table("t1").select(lit(1) + spark.range(1).select(lit(1).as("col")).scalar()), + sql("SELECT 1 + (select 1 as col) from t1")) + + checkAnswer( + spark.table("t1").select($"c1", spark.table("t2").select(max($"c1")).scalar() + $"c2"), + sql("SELECT c1, (select max(c1) from t2) + c2 from t1")) + + checkAnswer( + spark + .table("t1") + .select( + $"c1", + spark.table("t2").where($"t1.c2".outer() === $"t2.c2").select(max($"c1")).scalar()), + sql("SELECT c1, (select max(c1) from t2 where t1.c2 = t2.c2) from 
t1")) + } + } + + test("NOT EXISTS predicate subquery") { + checkAnswer( + spark.table("l").where(!spark.table("r").where($"a".outer() === $"c").exists()), + sql("select * from l where not exists (select * from r where l.a = r.c)")) + + checkAnswer( + spark + .table("l") + .where(!spark.table("r").where($"a".outer() === $"c" && $"b".outer() < $"d").exists()), + sql("select * from l where not exists (select * from r where l.a = r.c and l.b < r.d)")) + } + + test("EXISTS predicate subquery within OR") { + checkAnswer( + spark + .table("l") + .where(spark.table("r").where($"a".outer() === $"c").exists() || + spark.table("r").where($"a".outer() === $"c").exists()), + sql( + "select * from l where exists (select * from r where l.a = r.c)" + + " or exists (select * from r where l.a = r.c)")) + + checkAnswer( + spark + .table("l") + .where(!spark.table("r").where($"a".outer() === $"c" && $"b".outer() < $"d").exists() || + !spark.table("r").where($"a".outer() === $"c").exists()), + sql( + "select * from l where not exists (select * from r where l.a = r.c and l.b < r.d)" + + " or not exists (select * from r where l.a = r.c)")) + } + + test("correlated scalar subquery in select (null safe equal)") { + val df1 = spark.table("l").as("t1") + val df2 = spark.table("l").as("t2") + checkAnswer( + df1.select( + $"a", + df2.where($"t2.a" <=> $"t1.a".outer()).select(sum($"b")).scalar().as("sum_b")), + sql("select a, (select sum(b) from l t2 where t2.a <=> t1.a) sum_b from l t1")) + } + + test("correlated scalar subquery in aggregate") { + checkAnswer( + spark + .table("l") + .groupBy( + $"a", + spark.table("r").where($"a".outer() === $"c").select(sum($"d")).scalar().as("sum_d")) + .agg(Map.empty[String, String]), + sql("select a, (select sum(d) from r where a = c) sum_d from l l1 group by 1, 2")) + } + + test("SPARK-34269: correlated subquery with view in aggregate's grouping expression") { + withTable("tr") { + withView("vr") { + r.write.saveAsTable("tr") + sql("create view vr as 
select * from tr") + checkAnswer( + spark + .table("l") + .groupBy( + $"a", + spark + .table("vr") + .where($"a".outer() === $"c") + .select(sum($"d")) + .scalar() + .as("sum_d")) + .agg(Map.empty[String, String]), + sql("select a, (select sum(d) from vr where a = c) sum_d from l l1 group by 1, 2")) + } + } + } + + test("non-aggregated correlated scalar subquery") { + val df1 = spark.table("l").as("t1") + val df2 = spark.table("l").as("t2") + val exception1 = intercept[SparkRuntimeException] { + df1 + .select($"a", df2.where($"t1.a" === $"t2.a".outer()).select($"b").scalar().as("sum_b")) + .collect() + } + checkError(exception1, condition = "SCALAR_SUBQUERY_TOO_MANY_ROWS") + } + + test("non-equal correlated scalar subquery") { + val df1 = spark.table("l").as("t1") + val df2 = spark.table("l").as("t2") + checkAnswer( + df1.select( + $"a", + df2.where($"t2.a" < $"t1.a".outer()).select(sum($"b")).scalar().as("sum_b")), + sql("select a, (select sum(b) from l t2 where t2.a < t1.a) sum_b from l t1")) + } + + test("disjunctive correlated scalar subquery") { + checkAnswer( + spark + .table("l") + .where( + spark + .table("r") + .where(($"a".outer() === $"c" && $"d" === 2.0) || + ($"a".outer() === $"c" && $"d" === 1.0)) + .select(count(lit(1))) + .scalar() > 0) + .select($"a"), + sql(""" + |select a + |from l + |where (select count(*) + | from r + | where (a = c and d = 2.0) or (a = c and d = 1.0)) > 0 + """.stripMargin)) + } + + test("correlated scalar subquery with missing outer reference") { + checkAnswer( + spark + .table("l") + .select($"a", spark.table("r").where($"c" === $"a").select(sum($"d")).scalar()), + sql("select a, (select sum(d) from r where c = a) from l")) + } + + private def table1() = { + sql("CREATE VIEW t1(c1, c2) AS VALUES (0, 1), (1, 2)") + spark.table("t1") + } + + private def table2() = { + sql("CREATE VIEW t2(c1, c2) AS VALUES (0, 2), (0, 3)") + spark.table("t2") + } + + private def table3() = { + sql( + "CREATE VIEW t3(c1, c2) AS " + + "VALUES (0, 
ARRAY(0, 1)), (1, ARRAY(2)), (2, ARRAY()), (null, ARRAY(4))") + spark.table("t3") + } + + test("lateral join with single column select") { + withView("t1", "t2") { + val t1 = table1() + val t2 = table2() + + checkAnswer( + t1.lateralJoin(spark.range(1).select($"c1".outer())).toDF("c1", "c2", "c3"), + sql("SELECT * FROM t1, LATERAL (SELECT c1)").toDF("c1", "c2", "c3")) + checkAnswer( + t1.lateralJoin(t2.select($"c1")).toDF("c1", "c2", "c3"), + sql("SELECT * FROM t1, LATERAL (SELECT c1 FROM t2)").toDF("c1", "c2", "c3")) + checkAnswer( + t1.lateralJoin(t2.select($"t1.c1".outer())).toDF("c1", "c2", "c3"), + sql("SELECT * FROM t1, LATERAL (SELECT t1.c1 FROM t2)").toDF("c1", "c2", "c3")) + checkAnswer( + t1.lateralJoin(t2.select($"t1.c1".outer() + $"t2.c1")).toDF("c1", "c2", "c3"), + sql("SELECT * FROM t1, LATERAL (SELECT t1.c1 + t2.c1 FROM t2)").toDF("c1", "c2", "c3")) + } + } + + test("lateral join with star expansion") { + withView("t1", "t2") { + val t1 = table1() + val t2 = table2() + + checkAnswer( + t1.lateralJoin(spark.range(1).select().select($"*")), + sql("SELECT * FROM t1, LATERAL (SELECT *)")) + checkAnswer( + t1.lateralJoin(t2.select($"*")).toDF("c1", "c2", "c3", "c4"), + sql("SELECT * FROM t1, LATERAL (SELECT * FROM t2)").toDF("c1", "c2", "c3", "c4")) + checkAnswer( + t1.lateralJoin(t2.select($"t1.*".outer(), $"t2.*")) + .toDF("c1", "c2", "c3", "c4", "c5", "c6"), + sql("SELECT * FROM t1, LATERAL (SELECT t1.*, t2.* FROM t2)") + .toDF("c1", "c2", "c3", "c4", "c5", "c6")) + checkAnswer( + t1.lateralJoin(t2.alias("t1").select($"t1.*")).toDF("c1", "c2", "c3", "c4"), + sql("SELECT * FROM t1, LATERAL (SELECT t1.* FROM t2 AS t1)").toDF("c1", "c2", "c3", "c4")) + } + } + + test("lateral join with different join types") { + withView("t1") { + val t1 = table1() + + checkAnswer( + t1.lateralJoin( + spark.range(1).select(($"c1".outer() + $"c2".outer()).as("c3")), + $"c2" === $"c3"), + sql("SELECT * FROM t1 JOIN LATERAL (SELECT c1 + c2 AS c3) ON c2 = c3")) + checkAnswer( 
+ t1.lateralJoin( + spark.range(1).select(($"c1".outer() + $"c2".outer()).as("c3")), + $"c2" === $"c3", + "left"), + sql("SELECT * FROM t1 LEFT JOIN LATERAL (SELECT c1 + c2 AS c3) ON c2 = c3")) + checkAnswer( + t1.lateralJoin(spark.range(1).select(($"c1".outer() + $"c2".outer()).as("c3")), "cross"), + sql("SELECT * FROM t1 CROSS JOIN LATERAL (SELECT c1 + c2 AS c3)")) + } + } + + test("lateral join with subquery alias") { + withView("t1") { + val t1 = table1() + + checkAnswer( + t1.lateralJoin(spark.range(1).select($"c1".outer(), $"c2".outer()).toDF("a", "b").as("s")) + .select("a", "b"), + sql("SELECT a, b FROM t1, LATERAL (SELECT c1, c2) s(a, b)")) + } + } + + test("lateral join with correlated equality / non-equality predicates") { + withView("t1", "t2") { + val t1 = table1() + val t2 = table2() + + checkAnswer( + t1.lateralJoin(t2.where($"t1.c1".outer() === $"t2.c1").select($"c2")) + .toDF("c1", "c2", "c3"), + sql("SELECT * FROM t1, LATERAL (SELECT c2 FROM t2 WHERE t1.c1 = t2.c1)") + .toDF("c1", "c2", "c3")) + checkAnswer( + t1.lateralJoin(t2.where($"t1.c1".outer() < $"t2.c1").select($"c2")) + .toDF("c1", "c2", "c3"), + sql("SELECT * FROM t1, LATERAL (SELECT c2 FROM t2 WHERE t1.c1 < t2.c1)") + .toDF("c1", "c2", "c3")) + } + } + + test("lateral join with aggregation and correlated non-equality predicates") { + withView("t1", "t2") { + val t1 = table1() + val t2 = table2() + + checkAnswer( + t1.lateralJoin(t2.where($"t1.c2".outer() < $"t2.c2").select(max($"c2").as("m"))), + sql("SELECT * FROM t1, LATERAL (SELECT max(c2) AS m FROM t2 WHERE t1.c2 < t2.c2)")) + } + } + + test("lateral join can reference preceding FROM clause items") { + withView("t1", "t2") { + val t1 = table1() + val t2 = table2() + + checkAnswer( + t1.join(t2) + .lateralJoin(spark.range(1).select($"t1.c2".outer() + $"t2.c2".outer())) + .toDF("c1", "c2", "c3", "c4", "c5"), + sql("SELECT * FROM t1 JOIN t2 JOIN LATERAL (SELECT t1.c2 + t2.c2)") + .toDF("c1", "c2", "c3", "c4", "c5")) + } + } + + 
test("multiple lateral joins") { + withView("t1") { + val t1 = table1() + + checkAnswer( + t1.lateralJoin(spark.range(1).select(($"c1".outer() + $"c2".outer()).as("a"))) + .lateralJoin(spark.range(1).select(($"c1".outer() - $"c2".outer()).as("b"))) + .lateralJoin(spark.range(1).select(($"a".outer() * $"b".outer()).as("c"))), + sql(""" + |SELECT * FROM t1, + |LATERAL (SELECT c1 + c2 AS a), + |LATERAL (SELECT c1 - c2 AS b), + |LATERAL (SELECT a * b AS c) + |""".stripMargin)) + } + } + + test("lateral join in between regular joins") { + withView("t1", "t2") { + val t1 = table1() + val t2 = table2() + + checkAnswer( + t1.lateralJoin(t2.where($"t1.c1".outer() === $"t2.c1").select($"c2").as("s"), "left") + .join(t1.as("t3"), $"s.c2" === $"t3.c2", "left") + .toDF("c1", "c2", "c3", "c4", "c5"), + sql(""" + |SELECT * FROM t1 + |LEFT OUTER JOIN LATERAL (SELECT c2 FROM t2 WHERE t1.c1 = t2.c1) s + |LEFT OUTER JOIN t1 t3 ON s.c2 = t3.c2 + |""".stripMargin) + .toDF("c1", "c2", "c3", "c4", "c5")) + } + } + + test("nested lateral joins") { + withView("t1", "t2") { + val t1 = table1() + val t2 = table2() + + checkAnswer( + t1.lateralJoin(t2.lateralJoin(spark.range(1).select($"c1".outer()))) + .toDF("c1", "c2", "c3", "c4", "c5"), + sql("SELECT * FROM t1, LATERAL (SELECT * FROM t2, LATERAL (SELECT c1))") + .toDF("c1", "c2", "c3", "c4", "c5")) + checkAnswer( + t1.lateralJoin( + spark + .range(1) + .select(($"c1".outer() + lit(1)).as("c1")) + .lateralJoin(spark.range(1).select($"c1".outer()))) + .toDF("c1", "c2", "c3", "c4"), + sql( + "SELECT * FROM t1, LATERAL (SELECT * FROM (SELECT c1 + 1 AS c1), LATERAL (SELECT c1))") + .toDF("c1", "c2", "c3", "c4")) + } + } + + test("scalar subquery inside lateral join") { + withView("t1", "t2") { + val t1 = table1() + val t2 = table2() + + // uncorrelated + checkAnswer( + t1.lateralJoin(spark.range(1).select($"c2".outer(), t2.select(min($"c2")).scalar())) + .toDF("c1", "c2", "c3", "c4"), + sql("SELECT * FROM t1, LATERAL (SELECT c2, (SELECT MIN(c2) 
FROM t2))") + .toDF("c1", "c2", "c3", "c4")) + + // correlated + checkAnswer( + t1.lateralJoin( + spark + .range(1) + .select($"c1".outer().as("a")) + .select(t2.where($"c1" === $"a".outer()).select(sum($"c2")).scalar())), + sql(""" + |SELECT * FROM t1, LATERAL ( + | SELECT (SELECT SUM(c2) FROM t2 WHERE c1 = a) FROM (SELECT c1 AS a) + |) + |""".stripMargin)) + } + } + + test("lateral join inside subquery") { + withView("t1", "t2") { + val t1 = table1() + val t2 = table2() + + // uncorrelated + checkAnswer( + t1.where( + $"c1" === t2 + .lateralJoin(spark.range(1).select($"c1".outer().as("a"))) + .select(min($"a")) + .scalar()), + sql("SELECT * FROM t1 WHERE c1 = (SELECT MIN(a) FROM t2, LATERAL (SELECT c1 AS a))")) + // correlated + checkAnswer( + t1.where( + $"c1" === t2 + .lateralJoin(spark.range(1).select($"c1".outer().as("a"))) + .where($"c1" === $"t1.c1".outer()) + .select(min($"a")) + .scalar()), + sql( + "SELECT * FROM t1 " + + "WHERE c1 = (SELECT MIN(a) FROM t2, LATERAL (SELECT c1 AS a) WHERE c1 = t1.c1)")) + } + } + + test("lateral join with table-valued functions") { + withView("t1", "t3") { + val t1 = table1() + val t3 = table3() + + checkAnswer(t1.lateralJoin(spark.tvf.range(3)), sql("SELECT * FROM t1, LATERAL RANGE(3)")) + checkAnswer( + t1.lateralJoin(spark.tvf.explode(array($"c1".outer(), $"c2".outer()))), + sql("SELECT * FROM t1, LATERAL EXPLODE(ARRAY(c1, c2)) t2(c3)")) + checkAnswer( + t3.lateralJoin(spark.tvf.explode_outer($"c2".outer())), + sql("SELECT * FROM t3, LATERAL EXPLODE_OUTER(c2) t2(v)")) + checkAnswer( + spark.tvf + .explode(array(lit(1), lit(2))) + .toDF("v") + .lateralJoin(spark.range(1).select($"v".outer() + 1)), + sql("SELECT * FROM EXPLODE(ARRAY(1, 2)) t(v), LATERAL (SELECT v + 1)")) + } + } + + test("lateral join with table-valued functions and join conditions") { + withView("t1", "t3") { + val t1 = table1() + val t3 = table3() + + checkAnswer( + t1.lateralJoin(spark.tvf.explode(array($"c1".outer(), $"c2".outer())), $"c1" === 
$"col"), + sql("SELECT * FROM t1 JOIN LATERAL EXPLODE(ARRAY(c1, c2)) t(c3) ON t1.c1 = c3")) + checkAnswer( + t3.lateralJoin(spark.tvf.explode($"c2".outer()), $"c1" === $"col"), + sql("SELECT * FROM t3 JOIN LATERAL EXPLODE(c2) t(c3) ON t3.c1 = c3")) + checkAnswer( + t3.lateralJoin(spark.tvf.explode($"c2".outer()), $"c1" === $"col", "left"), + sql("SELECT * FROM t3 LEFT JOIN LATERAL EXPLODE(c2) t(c3) ON t3.c1 = c3")) + } + } + + test("subquery with generator / table-valued functions") { + withView("t1") { + val t1 = table1() + + checkAnswer( + spark.range(1).select(explode(t1.select(collect_list("c2")).scalar())), + sql("SELECT EXPLODE((SELECT COLLECT_LIST(c2) FROM t1))")) + checkAnswer( + spark.tvf.explode(t1.select(collect_list("c2")).scalar()), + sql("SELECT * FROM EXPLODE((SELECT COLLECT_LIST(c2) FROM t1))")) + } + } + + test("subquery in join condition") { + withView("t1", "t2") { + val t1 = table1() + val t2 = table2() + + checkAnswer( + t1.join(t2, $"t1.c1" === t1.select(max("c1")).scalar()).toDF("c1", "c2", "c3", "c4"), + sql("SELECT * FROM t1 JOIN t2 ON t1.c1 = (SELECT MAX(c1) FROM t1)") + .toDF("c1", "c2", "c3", "c4")) + } + } + + test("subquery in unpivot") { + withView("t1", "t2") { + val t1 = table1() + val t2 = table2() + + checkError( + intercept[AnalysisException] { + t1.unpivot(Array(t2.exists()), "c1", "c2").collect() + }, + "UNSUPPORTED_SUBQUERY_EXPRESSION_CATEGORY.UNSUPPORTED_IN_EXISTS_SUBQUERY", + parameters = Map("treeNode" -> "(?s)'Unpivot.*"), + matchPVals = true) + checkError( + intercept[AnalysisException] { + t1.unpivot(Array($"c1"), Array(t2.exists()), "c1", "c2").collect() + }, + "UNSUPPORTED_SUBQUERY_EXPRESSION_CATEGORY.UNSUPPORTED_IN_EXISTS_SUBQUERY", + parameters = Map("treeNode" -> "(?s)Expand.*"), + matchPVals = true) + } + } + + test("subquery in transpose") { + withView("t1") { + val t1 = table1() + + checkError( + intercept[AnalysisException] { + t1.transpose(t1.select(max("c1")).scalar()).collect() + }, + 
"TRANSPOSE_INVALID_INDEX_COLUMN", + parameters = Map("reason" -> "Index column must be an atomic attribute")) + } + } + + test("subquery in withColumns") { + withView("t1") { + val t1 = table1() + + checkAnswer( + t1.withColumn( + "scalar", + spark + .range(1) + .select($"c1".outer() + $"c2".outer()) + .scalar()), + t1.select($"*", ($"c1" + $"c2").as("scalar"))) + + checkAnswer( + t1.withColumn( + "scalar", + spark + .range(1) + .withColumn("c1", $"c1".outer()) + .select($"c1" + $"c2".outer()) + .scalar()), + t1.select($"*", ($"c1" + $"c2").as("scalar"))) + + checkAnswer( + t1.withColumn( + "scalar", + spark + .range(1) + .select($"c1".outer().as("c1")) + .withColumn("c2", $"c2".outer()) + .select($"c1" + $"c2") + .scalar()), + t1.select($"*", ($"c1" + $"c2").as("scalar"))) + } + } + + test("subquery in withColumnsRenamed") { + withView("t1") { + val t1 = table1() + + checkAnswer( + t1.withColumn( + "scalar", + spark + .range(1) + .select($"c1".outer().as("c1"), $"c2".outer().as("c2")) + .withColumnsRenamed(Map("c1" -> "x", "c2" -> "y")) + .select($"x" + $"y") + .scalar()), + t1.select($"*", ($"c1".as("x") + $"c2".as("y")).as("scalar"))) + } + } + + test("subquery in drop") { + withView("t1") { + val t1 = table1() + + checkAnswer(t1.drop(spark.range(1).select(lit("c1")).scalar()), t1) + } + } + + test("subquery in repartition") { + withView("t1") { + val t1 = table1() + + checkAnswer(t1.repartition(spark.range(1).select(lit(1)).scalar()), t1) + } + } +} diff --git a/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/DataFrameTableValuedFunctionsSuite.scala b/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/DataFrameTableValuedFunctionsSuite.scala index 4c0357a3ed984..12a49ad21676e 100644 --- a/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/DataFrameTableValuedFunctionsSuite.scala +++ b/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/DataFrameTableValuedFunctionsSuite.scala @@ -21,6 +21,7 @@ import 
org.apache.spark.sql.functions._ import org.apache.spark.sql.test.{QueryTest, RemoteSparkSession} class DataFrameTableValuedFunctionsSuite extends QueryTest with RemoteSparkSession { + import testImplicits._ test("explode") { val actual1 = spark.tvf.explode(array(lit(1), lit(2))) @@ -50,6 +51,31 @@ class DataFrameTableValuedFunctionsSuite extends QueryTest with RemoteSparkSessi checkAnswer(actual6, expected6) } + test("explode - lateral join") { + withView("t1", "t3") { + sql("CREATE VIEW t1(c1, c2) AS VALUES (0, 1), (1, 2)") + sql( + "CREATE VIEW t3(c1, c2) AS " + + "VALUES (0, ARRAY(0, 1)), (1, ARRAY(2)), (2, ARRAY()), (null, ARRAY(4))") + val t1 = spark.table("t1") + val t3 = spark.table("t3") + + checkAnswer( + t1.lateralJoin( + spark.tvf.explode(array($"c1".outer(), $"c2".outer())).toDF("c3").as("t2")), + sql("SELECT * FROM t1, LATERAL EXPLODE(ARRAY(c1, c2)) t2(c3)")) + checkAnswer( + t3.lateralJoin(spark.tvf.explode($"c2".outer()).toDF("v").as("t2")), + sql("SELECT * FROM t3, LATERAL EXPLODE(c2) t2(v)")) + checkAnswer( + spark.tvf + .explode(array(lit(1), lit(2))) + .toDF("v") + .lateralJoin(spark.range(1).select($"v".outer() + lit(1))), + sql("SELECT * FROM EXPLODE(ARRAY(1, 2)) t(v), LATERAL (SELECT v + 1)")) + } + } + test("explode_outer") { val actual1 = spark.tvf.explode_outer(array(lit(1), lit(2))) val expected1 = spark.sql("SELECT * FROM explode_outer(array(1, 2))") @@ -78,6 +104,31 @@ class DataFrameTableValuedFunctionsSuite extends QueryTest with RemoteSparkSessi checkAnswer(actual6, expected6) } + test("explode_outer - lateral join") { + withView("t1", "t3") { + sql("CREATE VIEW t1(c1, c2) AS VALUES (0, 1), (1, 2)") + sql( + "CREATE VIEW t3(c1, c2) AS " + + "VALUES (0, ARRAY(0, 1)), (1, ARRAY(2)), (2, ARRAY()), (null, ARRAY(4))") + val t1 = spark.table("t1") + val t3 = spark.table("t3") + + checkAnswer( + t1.lateralJoin( + spark.tvf.explode_outer(array($"c1".outer(), $"c2".outer())).toDF("c3").as("t2")), + sql("SELECT * FROM t1, LATERAL 
EXPLODE_OUTER(ARRAY(c1, c2)) t2(c3)")) + checkAnswer( + t3.lateralJoin(spark.tvf.explode_outer($"c2".outer()).toDF("v").as("t2")), + sql("SELECT * FROM t3, LATERAL EXPLODE_OUTER(c2) t2(v)")) + checkAnswer( + spark.tvf + .explode_outer(array(lit(1), lit(2))) + .toDF("v") + .lateralJoin(spark.range(1).select($"v".outer() + lit(1))), + sql("SELECT * FROM EXPLODE_OUTER(ARRAY(1, 2)) t(v), LATERAL (SELECT v + 1)")) + } + } + test("inline") { val actual1 = spark.tvf.inline(array(struct(lit(1), lit("a")), struct(lit(2), lit("b")))) val expected1 = spark.sql("SELECT * FROM inline(array(struct(1, 'a'), struct(2, 'b')))") @@ -98,6 +149,28 @@ class DataFrameTableValuedFunctionsSuite extends QueryTest with RemoteSparkSessi checkAnswer(actual3, expected3) } + test("inline - lateral join") { + withView("array_struct") { + sql(""" + |CREATE VIEW array_struct(id, arr) AS VALUES + | (1, ARRAY(STRUCT(1, 'a'), STRUCT(2, 'b'))), + | (2, ARRAY()), + | (3, ARRAY(STRUCT(3, 'c'))) + |""".stripMargin) + val arrayStruct = spark.table("array_struct") + + checkAnswer( + arrayStruct.lateralJoin(spark.tvf.inline($"arr".outer())), + sql("SELECT * FROM array_struct JOIN LATERAL INLINE(arr)")) + checkAnswer( + arrayStruct.lateralJoin( + spark.tvf.inline($"arr".outer()).toDF("k", "v").as("t"), + $"id" === $"k", + "left"), + sql("SELECT * FROM array_struct LEFT JOIN LATERAL INLINE(arr) t(k, v) ON id = k")) + } + } + test("inline_outer") { val actual1 = spark.tvf.inline_outer(array(struct(lit(1), lit("a")), struct(lit(2), lit("b")))) @@ -119,6 +192,28 @@ class DataFrameTableValuedFunctionsSuite extends QueryTest with RemoteSparkSessi checkAnswer(actual3, expected3) } + test("inline_outer - lateral join") { + withView("array_struct") { + sql(""" + |CREATE VIEW array_struct(id, arr) AS VALUES + | (1, ARRAY(STRUCT(1, 'a'), STRUCT(2, 'b'))), + | (2, ARRAY()), + | (3, ARRAY(STRUCT(3, 'c'))) + |""".stripMargin) + val arrayStruct = spark.table("array_struct") + + checkAnswer( + 
arrayStruct.lateralJoin(spark.tvf.inline_outer($"arr".outer())), + sql("SELECT * FROM array_struct JOIN LATERAL INLINE_OUTER(arr)")) + checkAnswer( + arrayStruct.lateralJoin( + spark.tvf.inline_outer($"arr".outer()).toDF("k", "v").as("t"), + $"id" === $"k", + "left"), + sql("SELECT * FROM array_struct LEFT JOIN LATERAL INLINE_OUTER(arr) t(k, v) ON id = k")) + } + } + test("json_tuple") { val actual = spark.tvf.json_tuple(lit("""{"a":1,"b":2}"""), lit("a"), lit("b")) val expected = spark.sql("""SELECT * FROM json_tuple('{"a":1,"b":2}', 'a', 'b')""") @@ -131,6 +226,51 @@ class DataFrameTableValuedFunctionsSuite extends QueryTest with RemoteSparkSessi assert(ex.messageParameters("functionName") == "`json_tuple`") } + test("json_tuple - lateral join") { + withView("json_table") { + sql(""" + |CREATE OR REPLACE TEMP VIEW json_table(key, jstring) AS VALUES + | ('1', '{"f1": "1", "f2": "2", "f3": 3, "f5": 5.23}'), + | ('2', '{"f1": "1", "f3": "3", "f2": 2, "f4": 4.01}'), + | ('3', '{"f1": 3, "f4": "4", "f3": "3", "f2": 2, "f5": 5.01}'), + | ('4', cast(null as string)), + | ('5', '{"f1": null, "f5": ""}'), + | ('6', '[invalid JSON string]') + |""".stripMargin) + val jsonTable = spark.table("json_table") + + checkAnswer( + jsonTable + .as("t1") + .lateralJoin( + spark.tvf + .json_tuple( + $"t1.jstring".outer(), + lit("f1"), + lit("f2"), + lit("f3"), + lit("f4"), + lit("f5")) + .as("t2")) + .select($"t1.key", $"t2.*"), + sql( + "SELECT t1.key, t2.* FROM json_table t1, " + + "LATERAL json_tuple(t1.jstring, 'f1', 'f2', 'f3', 'f4', 'f5') t2")) + checkAnswer( + jsonTable + .as("t1") + .lateralJoin(spark.tvf + .json_tuple($"jstring".outer(), lit("f1"), lit("f2"), lit("f3"), lit("f4"), lit("f5")) + .as("t2")) + .where($"t2.c0".isNotNull) + .select($"t1.key", $"t2.*"), + sql( + "SELECT t1.key, t2.* FROM json_table t1, " + + "LATERAL json_tuple(t1.jstring, 'f1', 'f2', 'f3', 'f4', 'f5') t2 " + + "WHERE t2.c0 IS NOT NULL")) + } + } + test("posexplode") { val actual1 = 
spark.tvf.posexplode(array(lit(1), lit(2))) val expected1 = spark.sql("SELECT * FROM posexplode(array(1, 2))") @@ -159,6 +299,30 @@ class DataFrameTableValuedFunctionsSuite extends QueryTest with RemoteSparkSessi checkAnswer(actual6, expected6) } + test("posexplode - lateral join") { + withView("t1", "t3") { + sql("CREATE VIEW t1(c1, c2) AS VALUES (0, 1), (1, 2)") + sql( + "CREATE VIEW t3(c1, c2) AS " + + "VALUES (0, ARRAY(0, 1)), (1, ARRAY(2)), (2, ARRAY()), (null, ARRAY(4))") + val t1 = spark.table("t1") + val t3 = spark.table("t3") + + checkAnswer( + t1.lateralJoin(spark.tvf.posexplode(array($"c1".outer(), $"c2".outer()))), + sql("SELECT * FROM t1, LATERAL POSEXPLODE(ARRAY(c1, c2))")) + checkAnswer( + t3.lateralJoin(spark.tvf.posexplode($"c2".outer())), + sql("SELECT * FROM t3, LATERAL POSEXPLODE(c2)")) + checkAnswer( + spark.tvf + .posexplode(array(lit(1), lit(2))) + .toDF("p", "v") + .lateralJoin(spark.range(1).select($"v".outer() + lit(1))), + sql("SELECT * FROM POSEXPLODE(ARRAY(1, 2)) t(p, v), LATERAL (SELECT v + 1)")) + } + } + test("posexplode_outer") { val actual1 = spark.tvf.posexplode_outer(array(lit(1), lit(2))) val expected1 = spark.sql("SELECT * FROM posexplode_outer(array(1, 2))") @@ -187,12 +351,63 @@ class DataFrameTableValuedFunctionsSuite extends QueryTest with RemoteSparkSessi checkAnswer(actual6, expected6) } + test("posexplode_outer - lateral join") { + withView("t1", "t3") { + sql("CREATE VIEW t1(c1, c2) AS VALUES (0, 1), (1, 2)") + sql( + "CREATE VIEW t3(c1, c2) AS " + + "VALUES (0, ARRAY(0, 1)), (1, ARRAY(2)), (2, ARRAY()), (null, ARRAY(4))") + val t1 = spark.table("t1") + val t3 = spark.table("t3") + + checkAnswer( + t1.lateralJoin(spark.tvf.posexplode_outer(array($"c1".outer(), $"c2".outer()))), + sql("SELECT * FROM t1, LATERAL POSEXPLODE_OUTER(ARRAY(c1, c2))")) + checkAnswer( + t3.lateralJoin(spark.tvf.posexplode_outer($"c2".outer())), + sql("SELECT * FROM t3, LATERAL POSEXPLODE_OUTER(c2)")) + checkAnswer( + spark.tvf + 
.posexplode_outer(array(lit(1), lit(2))) + .toDF("p", "v") + .lateralJoin(spark.range(1).select($"v".outer() + lit(1))), + sql("SELECT * FROM POSEXPLODE_OUTER(ARRAY(1, 2)) t(p, v), LATERAL (SELECT v + 1)")) + } + } + test("stack") { val actual = spark.tvf.stack(lit(2), lit(1), lit(2), lit(3)) val expected = spark.sql("SELECT * FROM stack(2, 1, 2, 3)") checkAnswer(actual, expected) } + test("stack - lateral join") { + withView("t1", "t3") { + sql("CREATE VIEW t1(c1, c2) AS VALUES (0, 1), (1, 2)") + sql( + "CREATE VIEW t3(c1, c2) AS " + + "VALUES (0, ARRAY(0, 1)), (1, ARRAY(2)), (2, ARRAY()), (null, ARRAY(4))") + val t1 = spark.table("t1") + val t3 = spark.table("t3") + + checkAnswer( + t1.lateralJoin( + spark.tvf.stack(lit(2), lit("Key"), $"c1".outer(), lit("Value"), $"c2".outer()).as("t")) + .select($"t.*"), + sql("SELECT t.* FROM t1, LATERAL stack(2, 'Key', c1, 'Value', c2) t")) + checkAnswer( + t1.lateralJoin( + spark.tvf.stack(lit(1), $"c1".outer(), $"c2".outer()).toDF("x", "y").as("t")) + .select($"t.*"), + sql("SELECT t.* FROM t1 JOIN LATERAL stack(1, c1, c2) t(x, y)")) + checkAnswer( + t1.join(t3, $"t1.c1" === $"t3.c1") + .lateralJoin(spark.tvf.stack(lit(1), $"t1.c2".outer(), $"t3.c2".outer()).as("t")) + .select($"t.*"), + sql("SELECT t.* FROM t1 JOIN t3 ON t1.c1 = t3.c1 JOIN LATERAL stack(1, t1.c2, t3.c2) t")) + } + } + test("collations") { val actual = spark.tvf.collations() val expected = spark.sql("SELECT * FROM collations()") @@ -205,8 +420,7 @@ class DataFrameTableValuedFunctionsSuite extends QueryTest with RemoteSparkSessi checkAnswer(actual, expected) } - // TODO(SPARK-50063): Support VARIANT in Spark Connect Scala client - ignore("variant_explode") { + test("variant_explode") { val actual1 = spark.tvf.variant_explode(parse_json(lit("""["hello", "world"]"""))) val expected1 = spark.sql("""SELECT * FROM variant_explode(parse_json('["hello", "world"]'))""") @@ -237,8 +451,28 @@ class DataFrameTableValuedFunctionsSuite extends QueryTest with 
RemoteSparkSessi checkAnswer(actual6, expected6) } - // TODO(SPARK-50063): Support VARIANT in Spark Connect Scala client - ignore("variant_explode_outer") { + test("variant_explode - lateral join") { + withView("variant_table") { + sql(""" + |CREATE VIEW variant_table(id, v) AS + |SELECT id, parse_json(v) AS v FROM VALUES + |(0, '["hello", "world"]'), (1, '{"a": true, "b": 3.14}'), + |(2, '[]'), (3, '{}'), + |(4, NULL), (5, '1') + |AS t(id, v) + |""".stripMargin) + val variantTable = spark.table("variant_table") + + checkAnswer( + variantTable + .as("t1") + .lateralJoin(spark.tvf.variant_explode($"v".outer()).as("t")) + .select($"t1.id", $"t.*"), + sql("SELECT t1.id, t.* FROM variant_table AS t1, LATERAL variant_explode(v) AS t")) + } + } + + test("variant_explode_outer") { val actual1 = spark.tvf.variant_explode_outer(parse_json(lit("""["hello", "world"]"""))) val expected1 = spark.sql("""SELECT * FROM variant_explode_outer(parse_json('["hello", "world"]'))""") @@ -268,4 +502,25 @@ class DataFrameTableValuedFunctionsSuite extends QueryTest with RemoteSparkSessi val expected6 = spark.sql("SELECT * FROM variant_explode_outer(parse_json('1'))") checkAnswer(actual6, expected6) } + + test("variant_explode_outer - lateral join") { + withView("variant_table") { + sql(""" + |CREATE VIEW variant_table(id, v) AS + |SELECT id, parse_json(v) AS v FROM VALUES + |(0, '["hello", "world"]'), (1, '{"a": true, "b": 3.14}'), + |(2, '[]'), (3, '{}'), + |(4, NULL), (5, '1') + |AS t(id, v) + |""".stripMargin) + val variantTable = spark.table("variant_table") + + checkAnswer( + variantTable + .as("t1") + .lateralJoin(spark.tvf.variant_explode_outer($"v".outer()).as("t")) + .select($"t1.id", $"t.*"), + sql("SELECT t1.id, t.* FROM variant_table AS t1, LATERAL variant_explode_outer(v) AS t")) + } + } } diff --git a/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/KeyValueGroupedDatasetE2ETestSuite.scala 
b/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/KeyValueGroupedDatasetE2ETestSuite.scala index 988774d5eec94..021b4fea26e2a 100644 --- a/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/KeyValueGroupedDatasetE2ETestSuite.scala +++ b/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/KeyValueGroupedDatasetE2ETestSuite.scala @@ -460,6 +460,14 @@ class KeyValueGroupedDatasetE2ETestSuite extends QueryTest with RemoteSparkSessi (5, "hello")) } + test("SPARK-50789: reduceGroups on unresolved plan") { + val ds = Seq("abc", "xyz", "hello").toDS().select("*").as[String] + checkDatasetUnorderly( + ds.groupByKey(_.length).reduceGroups(_ + _), + (3, "abcxyz"), + (5, "hello")) + } + test("groupby") { val ds = Seq(("a", 1, 10), ("a", 2, 20), ("b", 2, 1), ("b", 1, 2), ("c", 1, 1)) .toDF("key", "seq", "value") @@ -479,6 +487,25 @@ class KeyValueGroupedDatasetE2ETestSuite extends QueryTest with RemoteSparkSessi "(c,1,1)") } + test("SPARK-50693: groupby on unresolved plan") { + val ds = Seq(("a", 1, 10), ("a", 2, 20), ("b", 2, 1), ("b", 1, 2), ("c", 1, 1)) + .toDF("key", "seq", "value") + val grouped = ds.select("*").groupBy($"key").as[String, (String, Int, Int)] + val aggregated = grouped + .flatMapSortedGroups($"seq", expr("length(key)"), $"value") { (g, iter) => + Iterator(g, iter.mkString(", ")) + } + + checkDatasetUnorderly( + aggregated, + "a", + "(a,1,10), (a,2,20)", + "b", + "(b,1,2), (b,2,1)", + "c", + "(c,1,1)") + } + test("groupby - keyAs, keys") { val ds = Seq(("a", 1, 10), ("a", 2, 20), ("b", 2, 1), ("b", 1, 2), ("c", 1, 1)) .toDF("key", "seq", "value") @@ -597,6 +624,16 @@ class KeyValueGroupedDatasetE2ETestSuite extends QueryTest with RemoteSparkSessi ("c", 1L)) } + test("SPARK-50693: RowEncoder in udf on unresolved plan") { + val ds = Seq(("a", 10), ("a", 20), ("b", 1), ("b", 2), ("c", 1)).toDF("c1", "c2") + + checkDatasetUnorderly( + ds.select("*").groupByKey(k => k.getAs[String](0)).agg(sum("c2").as[Long]), + ("a", 
30L), + ("b", 3L), + ("c", 1L)) + } + test("mapGroups with row encoder") { val df = Seq(("a", 10), ("a", 20), ("b", 1), ("b", 2), ("c", 1)).toDF("c1", "c2") @@ -611,6 +648,21 @@ class KeyValueGroupedDatasetE2ETestSuite extends QueryTest with RemoteSparkSessi 1) } + test("SPARK-50693: mapGroups with row encoder on unresolved plan") { + val df = Seq(("a", 10), ("a", 20), ("b", 1), ("b", 2), ("c", 1)).toDF("c1", "c2") + + checkDataset( + df.select("*") + .groupByKey(r => r.getAs[String]("c1")) + .mapGroups((_, it) => + it.map(r => { + r.getAs[Int]("c2") + }).sum), + 30, + 3, + 1) + } + test("coGroup with row encoder") { val df1 = Seq(("a", 10), ("a", 20), ("b", 1), ("b", 2), ("c", 1)).toDF("c1", "c2") val df2 = Seq(("x", 10), ("x", 20), ("y", 1), ("y", 2), ("a", 1)).toDF("c1", "c2") @@ -632,6 +684,30 @@ class KeyValueGroupedDatasetE2ETestSuite extends QueryTest with RemoteSparkSessi 3) } + test("SPARK-50693: coGroup with row encoder on unresolved plan") { + val df1 = Seq(("a", 10), ("a", 20), ("b", 1), ("b", 2), ("c", 1)).toDF("c1", "c2") + val df2 = Seq(("x", 10), ("x", 20), ("y", 1), ("y", 2), ("a", 1)).toDF("c1", "c2") + + Seq((df1.select("*"), df2), (df1, df2.select("*")), (df1.select("*"), df2.select("*"))) + .foreach { case (df1, df2) => + val ds1: KeyValueGroupedDataset[String, Row] = + df1.groupByKey(r => r.getAs[String]("c1")) + val ds2: KeyValueGroupedDataset[String, Row] = + df2.groupByKey(r => r.getAs[String]("c1")) + checkDataset( + ds1.cogroup(ds2)((_, it, it2) => { + val sum1 = it.map(r => r.getAs[Int]("c2")).sum + val sum2 = it2.map(r => r.getAs[Int]("c2")).sum + Iterator(sum1 + sum2) + }), + 31, + 3, + 1, + 30, + 3) + } + } + test("serialize as null") { val kvgds = session.range(10).groupByKey(_ % 2) val bytes = SparkSerDeUtils.serialize(kvgds) diff --git a/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/SQLExpressionsSuite.scala b/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/SQLExpressionsSuite.scala new file mode 
100644 index 0000000000000..fcd2b3a388042 --- /dev/null +++ b/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/SQLExpressionsSuite.scala @@ -0,0 +1,47 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql + +import org.apache.spark.sql.test.{QueryTest, RemoteSparkSession} +import org.apache.spark.unsafe.types.VariantVal + +class SQLExpressionsSuite extends QueryTest with RemoteSparkSession { + + test("variants") { + val topLevelVariants = spark.sql("select parse_json(id::string) from range(10)") + checkAnswer( + topLevelVariants, + (0 until 10) + .map(i => Row(new VariantVal(Array[Byte](12, i.toByte), Array[Byte](1, 0, 0))))) + val structsOfVariants = spark.sql("select struct(parse_json(id::string)) from range(10)") + checkAnswer( + structsOfVariants, + (0 until 10) + .map(i => Row(Row(new VariantVal(Array[Byte](12, i.toByte), Array[Byte](1, 0, 0)))))) + val arraysOfVariants = spark.sql("select array(parse_json(id::string)) from range(10)") + checkAnswer( + arraysOfVariants, + (0 until 10) + .map(i => Row(Seq(new VariantVal(Array[Byte](12, i.toByte), Array[Byte](1, 0, 0)))))) + val mapsOfVariants = spark.sql("select map(id, parse_json(id::string)) from range(10)") + 
checkAnswer( + mapsOfVariants, + (0 until 10) + .map(i => Row(Map((i, new VariantVal(Array[Byte](12, i.toByte), Array[Byte](1, 0, 0))))))) + } +} diff --git a/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/UnsupportedFeaturesSuite.scala b/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/UnsupportedFeaturesSuite.scala index 6a26cf581751d..42ae6987c9f36 100644 --- a/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/UnsupportedFeaturesSuite.scala +++ b/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/UnsupportedFeaturesSuite.scala @@ -79,10 +79,6 @@ class UnsupportedFeaturesSuite extends ConnectFunSuite { _.listenerManager } - testUnsupportedFeature("SparkSession.sqlContext", "SESSION_SQL_CONTEXT") { - _.sqlContext - } - testUnsupportedFeature( "SparkSession.baseRelationToDataFrame", "SESSION_BASE_RELATION_TO_DATAFRAME") { diff --git a/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/UserDefinedFunctionE2ETestSuite.scala b/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/UserDefinedFunctionE2ETestSuite.scala index ca754c7b542f7..19275326d6421 100644 --- a/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/UserDefinedFunctionE2ETestSuite.scala +++ b/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/UserDefinedFunctionE2ETestSuite.scala @@ -301,6 +301,14 @@ class UserDefinedFunctionE2ETestSuite extends QueryTest with RemoteSparkSession checkDataset(df.filter(r => r.getInt(1) > 5), Row("a", 10), Row("a", 20)) } + test("SPARK-50693: Filter with row input encoder on unresolved plan") { + val session: SparkSession = spark + import session.implicits._ + val df = Seq(("a", 10), ("a", 20), ("b", 1), ("b", 2), ("c", 1)).toDF("c1", "c2") + + checkDataset(df.select("*").filter(r => r.getInt(1) > 5), Row("a", 10), Row("a", 20)) + } + test("mapPartitions with row input encoder") { val session: SparkSession = spark import session.implicits._ @@ -393,6 +401,13 
@@ class UserDefinedFunctionE2ETestSuite extends QueryTest with RemoteSparkSession assert(ds.select(aggCol).head() == 135) // 45 + 90 } + test("SPARK-50789: UDAF custom Aggregator - toColumn on unresolved plan") { + val encoder = Encoders.product[UdafTestInput] + val aggCol = new CompleteUdafTestInputAggregator().toColumn + val ds = spark.range(10).withColumn("extra", col("id") * 2).select("*").as(encoder) + assert(ds.select(aggCol).head() == 135) // 45 + 90 + } + test("UDAF custom Aggregator - multiple extends - toColumn") { val encoder = Encoders.product[UdafTestInput] val aggCol = new CompleteGrandChildUdafTestInputAggregator().toColumn @@ -400,11 +415,24 @@ class UserDefinedFunctionE2ETestSuite extends QueryTest with RemoteSparkSession assert(ds.select(aggCol).head() == 540) // (45 + 90) * 4 } - test("UDAF custom aggregator - with rows - toColumn") { + test("SPARK-50789: UDAF custom Aggregator - multiple extends - toColumn on unresolved plan") { + val encoder = Encoders.product[UdafTestInput] + val aggCol = new CompleteGrandChildUdafTestInputAggregator().toColumn + val ds = spark.range(10).withColumn("extra", col("id") * 2).select("*").as(encoder) + assert(ds.select(aggCol).head() == 540) // (45 + 90) * 4 + } + + test("UDAF custom Aggregator - with rows - toColumn") { val ds = spark.range(10).withColumn("extra", col("id") * 2) assert(ds.select(RowAggregator.toColumn).head() == 405) assert(ds.agg(RowAggregator.toColumn).head().getLong(0) == 405) } + + test("SPARK-50789: UDAF custom Aggregator - with rows - toColumn on unresolved plan") { + val ds = spark.range(10).withColumn("extra", col("id") * 2).select("*") + assert(ds.select(RowAggregator.toColumn).head() == 405) + assert(ds.agg(RowAggregator.toColumn).head().getLong(0) == 405) + } } case class UdafTestInput(id: Long, extra: Long) diff --git a/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/connect/client/CheckConnectJvmClientCompatibility.scala 
b/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/connect/client/CheckConnectJvmClientCompatibility.scala index d9ff8d9122ead..7bac10e79d0b4 100644 --- a/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/connect/client/CheckConnectJvmClientCompatibility.scala +++ b/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/connect/client/CheckConnectJvmClientCompatibility.scala @@ -176,8 +176,6 @@ object CheckConnectJvmClientCompatibility { // Skip unsupported classes ProblemFilters.exclude[MissingClassProblem]("org.apache.spark.sql.ExperimentalMethods"), - ProblemFilters.exclude[MissingClassProblem]("org.apache.spark.sql.SQLContext"), - ProblemFilters.exclude[MissingClassProblem]("org.apache.spark.sql.SQLContext$*"), ProblemFilters.exclude[MissingClassProblem]("org.apache.spark.sql.SparkSessionExtensions"), ProblemFilters.exclude[MissingClassProblem]( "org.apache.spark.sql.SparkSessionExtensionsProvider"), @@ -185,6 +183,11 @@ object CheckConnectJvmClientCompatibility { "org.apache.spark.sql.ExtendedExplainGenerator"), ProblemFilters.exclude[MissingClassProblem]("org.apache.spark.sql.UDTFRegistration"), ProblemFilters.exclude[MissingClassProblem]("org.apache.spark.sql.DataSourceRegistration"), + ProblemFilters.exclude[MissingClassProblem]("org.apache.spark.sql.TableArg"), + ProblemFilters.exclude[MissingClassProblem]( + "org.apache.spark.sql.artifact.ArtifactStateForCleanup"), + ProblemFilters.exclude[MissingClassProblem]( + "org.apache.spark.sql.artifact.ArtifactStateForCleanup$"), // DataFrameNaFunctions ProblemFilters.exclude[Problem]("org.apache.spark.sql.DataFrameNaFunctions.fillValue"), @@ -233,9 +236,11 @@ object CheckConnectJvmClientCompatibility { "org.apache.spark.sql.artifact.ArtifactManager$SparkContextResourceType$"), // ColumnNode conversions + ProblemFilters.exclude[MissingTypesProblem]("org.apache.spark.sql.SparkSession"), + ProblemFilters.exclude[DirectMissingMethodProblem]( + 
"org.apache.spark.sql.SparkSession.expression"), ProblemFilters.exclude[DirectMissingMethodProblem]( - "org.apache.spark.sql.SparkSession.Converter"), - ProblemFilters.exclude[MissingClassProblem]("org.apache.spark.sql.SparkSession$Converter$"), + "org.apache.spark.sql.SparkSession.toRichColumn"), // UDFRegistration ProblemFilters.exclude[DirectMissingMethodProblem]( @@ -295,10 +300,9 @@ object CheckConnectJvmClientCompatibility { "org.apache.spark.sql.KeyValueGroupedDatasetImpl$"), // ColumnNode conversions - ProblemFilters.exclude[IncompatibleResultTypeProblem]( - "org.apache.spark.sql.SparkSession#RichColumn.expr"), ProblemFilters.exclude[DirectMissingMethodProblem]( - "org.apache.spark.sql.SparkSession#RichColumn.typedExpr"), + "org.apache.spark.sql.SparkSession.RichColumn"), + ProblemFilters.exclude[MissingClassProblem]("org.apache.spark.sql.SparkSession$RichColumn"), // New public APIs added in the client // Dataset @@ -330,6 +334,11 @@ object CheckConnectJvmClientCompatibility { ProblemFilters.exclude[DirectMissingMethodProblem]( "org.apache.spark.sql.SparkSession#Builder.interceptor"), + // Private case class in SQLContext + ProblemFilters.exclude[MissingClassProblem]("org.apache.spark.sql.SQLContext$ListTableRow"), + ProblemFilters.exclude[MissingClassProblem]( + "org.apache.spark.sql.SQLContext$ListTableRow$"), + // SQLImplicits ProblemFilters.exclude[Problem]("org.apache.spark.sql.SQLImplicits.session"), diff --git a/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/connect/client/arrow/ArrowEncoderSuite.scala b/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/connect/client/arrow/ArrowEncoderSuite.scala index 10e4c11c406fe..d0468c8d57b58 100644 --- a/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/connect/client/arrow/ArrowEncoderSuite.scala +++ b/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/connect/client/arrow/ArrowEncoderSuite.scala @@ -45,6 +45,7 @@ import 
org.apache.spark.sql.connect.client.CloseableIterator import org.apache.spark.sql.connect.client.arrow.FooEnum.FooEnum import org.apache.spark.sql.test.ConnectFunSuite import org.apache.spark.sql.types.{ArrayType, DataType, DayTimeIntervalType, Decimal, DecimalType, IntegerType, Metadata, SQLUserDefinedType, StringType, StructType, UserDefinedType, YearMonthIntervalType} +import org.apache.spark.unsafe.types.VariantVal /** * Tests for encoding external data to and from arrow. @@ -264,6 +265,52 @@ class ArrowEncoderSuite extends ConnectFunSuite with BeforeAndAfterAll { assert(inspector.numBatches == 1) } + test("variant round trip") { + val variantEncoder = toRowEncoder(new StructType().add("v", "variant")) + roundTripAndCheckIdentical(variantEncoder) { () => + val maybeNull = MaybeNull(7) + Iterator.tabulate(101)(i => + Row(maybeNull(new VariantVal(Array[Byte](12, i.toByte), Array[Byte](1, 0, 0))))) + } + + val nestedVariantEncoder = toRowEncoder( + new StructType() + .add( + "s", + new StructType() + .add("i1", "int") + .add("v1", "variant") + .add("i2", "int") + .add("v2", "variant")) + .add("a", "array") + .add("m", "map")) + + roundTripAndCheckIdentical(nestedVariantEncoder) { () => + val maybeNull5 = MaybeNull(5) + val maybeNull7 = MaybeNull(7) + val maybeNull11 = MaybeNull(11) + val maybeNull13 = MaybeNull(13) + val maybeNull17 = MaybeNull(17) + Iterator.tabulate(100)(i => + Row( + maybeNull5( + Row( + i, + maybeNull7(new VariantVal(Array[Byte](12, i.toByte), Array[Byte](1, 0, 0))), + i + 1, + maybeNull11( + new VariantVal(Array[Byte](12, (i + 1).toByte), Array[Byte](1, 0, 0))))), + maybeNull7((0 until 10).map(j => + new VariantVal(Array[Byte](12, (i + j).toByte), Array[Byte](1, 0, 0)))), + maybeNull13( + Map( + ( + i.toString, + maybeNull17( + new VariantVal(Array[Byte](12, (i + 2).toByte), Array[Byte](1, 0, 0)))))))) + } + } + test("multiple batches - split by record count") { val inspector = new CountingBatchInspector roundTripAndCheckIdentical( diff --git 
a/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/internal/ColumnNodeToProtoConverterSuite.scala b/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/internal/ColumnNodeToProtoConverterSuite.scala index 2efd396735191..94729d34f37b5 100644 --- a/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/internal/ColumnNodeToProtoConverterSuite.scala +++ b/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/internal/ColumnNodeToProtoConverterSuite.scala @@ -128,19 +128,22 @@ class ColumnNodeToProtoConverterSuite extends ConnectFunSuite { .setFunctionName("+") .setIsDistinct(false) .addArguments(attribute("a")) - .addArguments(expr(_.getLiteralBuilder.setInteger(1))))) + .addArguments(expr(_.getLiteralBuilder.setInteger(1))) + .setIsInternal(false))) testConversion( UnresolvedFunction( "db1.myAgg", Seq(UnresolvedAttribute("a")), isDistinct = true, - isUserDefinedFunction = true), + isUserDefinedFunction = true, + isInternal = true), expr( _.getUnresolvedFunctionBuilder .setFunctionName("db1.myAgg") .setIsDistinct(true) .setIsUserDefinedFunction(true) - .addArguments(attribute("a")))) + .addArguments(attribute("a")) + .setIsInternal(true))) } test("alias") { @@ -247,10 +250,12 @@ class ColumnNodeToProtoConverterSuite extends ConnectFunSuite { expr( _.getWindowBuilder .setWindowFunction( - expr(_.getUnresolvedFunctionBuilder - .setFunctionName("sum") - .setIsDistinct(false) - .addArguments(attribute("a")))) + expr( + _.getUnresolvedFunctionBuilder + .setFunctionName("sum") + .setIsDistinct(false) + .addArguments(attribute("a")) + .setIsInternal(false))) .addPartitionSpec(attribute("b")) .addPartitionSpec(attribute("c")) .addOrderSpec(proto.Expression.SortOrder @@ -276,7 +281,8 @@ class ColumnNodeToProtoConverterSuite extends ConnectFunSuite { _.getUnresolvedFunctionBuilder .setFunctionName("sum") .setIsDistinct(false) - .addArguments(attribute("a")))) + .addArguments(attribute("a")) + .setIsInternal(false))) 
.addPartitionSpec(attribute("b")) .addPartitionSpec(attribute("c")))) testWindowFrame( @@ -310,7 +316,8 @@ class ColumnNodeToProtoConverterSuite extends ConnectFunSuite { _.getUnresolvedFunctionBuilder .setFunctionName("+") .addArguments(expr(_.setUnresolvedNamedLambdaVariable(catX))) - .addArguments(attribute("y")))) + .addArguments(attribute("y")) + .setIsInternal(false))) .addArguments(catX))) } @@ -330,7 +337,8 @@ class ColumnNodeToProtoConverterSuite extends ConnectFunSuite { .setFunctionName("when") .addArguments(attribute("c1")) .addArguments(expr(_.getLiteralBuilder.setString("r1"))) - .addArguments(expr(_.getLiteralBuilder.setString("fallback"))))) + .addArguments(expr(_.getLiteralBuilder.setString("fallback"))) + .setIsInternal(false))) } test("extract field") { @@ -431,4 +439,5 @@ class ColumnNodeToProtoConverterSuite extends ConnectFunSuite { private[internal] case class Nope(override val origin: Origin = CurrentOrigin.get) extends ColumnNode { override def sql: String = "nope" + override private[internal] def children: Seq[ColumnNodeLike] = Seq.empty } diff --git a/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/streaming/ClientStreamingQuerySuite.scala b/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/streaming/ClientStreamingQuerySuite.scala index b1a7d81916e92..199a1507a3b19 100644 --- a/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/streaming/ClientStreamingQuerySuite.scala +++ b/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/streaming/ClientStreamingQuerySuite.scala @@ -28,9 +28,8 @@ import org.scalatest.concurrent.Futures.timeout import org.scalatest.time.SpanSugar._ import org.apache.spark.SparkException -import org.apache.spark.api.java.function.VoidFunction2 import org.apache.spark.internal.Logging -import org.apache.spark.sql.{DataFrame, ForeachWriter, Row, SparkSession} +import org.apache.spark.sql.{DataFrame, Dataset, ForeachWriter, Row, SparkSession} import 
org.apache.spark.sql.functions.{col, lit, udf, window} import org.apache.spark.sql.streaming.StreamingQueryListener.{QueryIdleEvent, QueryProgressEvent, QueryStartedEvent, QueryTerminatedEvent} import org.apache.spark.sql.test.{IntegrationTestUtils, QueryTest, RemoteSparkSession} @@ -567,7 +566,7 @@ class ClientStreamingQuerySuite extends QueryTest with RemoteSparkSession with L } } - test("foreachBatch") { + test("foreachBatch with DataFrame") { // Starts a streaming query with a foreachBatch function, which writes batchId and row count // to a temp view. The test verifies that the view is populated with data. @@ -581,7 +580,12 @@ class ClientStreamingQuerySuite extends QueryTest with RemoteSparkSession with L .option("numPartitions", "1") .load() .writeStream - .foreachBatch(new ForeachBatchFn(viewName)) + .foreachBatch((df: DataFrame, batchId: Long) => { + val count = df.collect().map(row => row.getLong(1)).sum + df.sparkSession + .createDataFrame(Seq((batchId, count))) + .createOrReplaceGlobalTempView(viewName) + }) .start() eventually(timeout(30.seconds)) { // Wait for first progress. 
@@ -596,6 +600,7 @@ class ClientStreamingQuerySuite extends QueryTest with RemoteSparkSession with L .collect() .toSeq assert(rows.size > 0) + assert(rows.map(_.getLong(1)).sum > 0) logInfo(s"Rows in $tableName: $rows") } @@ -603,6 +608,75 @@ class ClientStreamingQuerySuite extends QueryTest with RemoteSparkSession with L } } + test("foreachBatch with Dataset[java.lang.Long]") { + val viewName = "test_view" + val tableName = s"global_temp.$viewName" + + withTable(tableName) { + val session = spark + import session.implicits._ + val q = spark.readStream + .format("rate") + .option("rowsPerSecond", "10") + .option("numPartitions", "1") + .load() + .select($"value") + .as[java.lang.Long] + .writeStream + .foreachBatch((ds: Dataset[java.lang.Long], batchId: Long) => { + val count = ds.collect().map(v => v.asInstanceOf[Long]).sum + ds.sparkSession + .createDataFrame(Seq((batchId, count))) + .createOrReplaceGlobalTempView(viewName) + }) + .start() + + eventually(timeout(30.seconds)) { // Wait for first progress. + assert(q.lastProgress != null, "Failed to make progress") + assert(q.lastProgress.numInputRows > 0) + } + + eventually(timeout(30.seconds)) { + // There should be row(s) in temporary view created by foreachBatch. 
+ val rows = spark + .sql(s"select * from $tableName") + .collect() + .toSeq + assert(rows.size > 0) + assert(rows.map(_.getLong(1)).sum > 0) + logInfo(s"Rows in $tableName: $rows") + } + + q.stop() + } + } + + test("foreachBatch with Dataset[TestClass]") { + val session: SparkSession = spark + import session.implicits._ + val viewName = "test_view" + val tableName = s"global_temp.$viewName" + + val df = spark.readStream + .format("rate") + .option("rowsPerSecond", "10") + .load() + + val q = df + .selectExpr("CAST(value AS INT)") + .as[TestClass] + .writeStream + .foreachBatch((ds: Dataset[TestClass], batchId: Long) => { + val count = ds.collect().map(_.value).sum + }) + .start() + eventually(timeout(30.seconds)) { + assert(q.isActive) + assert(q.exception.isEmpty) + } + q.stop() + } + abstract class EventCollector extends StreamingQueryListener { protected def tablePostfix: String @@ -700,14 +774,3 @@ class TestForeachWriter[T] extends ForeachWriter[T] { case class TestClass(value: Int) { override def toString: String = value.toString } - -class ForeachBatchFn(val viewName: String) - extends VoidFunction2[DataFrame, java.lang.Long] - with Serializable { - override def call(df: DataFrame, batchId: java.lang.Long): Unit = { - val count = df.count() - df.sparkSession - .createDataFrame(Seq((batchId.toLong, count))) - .createOrReplaceGlobalTempView(viewName) - } -} diff --git a/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/streaming/FlatMapGroupsWithStateStreamingSuite.scala b/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/streaming/FlatMapGroupsWithStateStreamingSuite.scala index dc74463f1a25b..9bd6614028cbf 100644 --- a/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/streaming/FlatMapGroupsWithStateStreamingSuite.scala +++ b/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/streaming/FlatMapGroupsWithStateStreamingSuite.scala @@ -55,7 +55,9 @@ class FlatMapGroupsWithStateStreamingSuite extends 
QueryTest with RemoteSparkSes val stateFunc = (key: String, values: Iterator[ClickEvent], state: GroupState[ClickState]) => { if (state.exists) throw new IllegalArgumentException("state.exists should be false") - Iterator(ClickState(key, values.size)) + val newState = ClickState(key, values.size) + state.update(newState) + Iterator(newState) } spark.sql("DROP TABLE IF EXISTS my_sink") @@ -96,7 +98,9 @@ class FlatMapGroupsWithStateStreamingSuite extends QueryTest with RemoteSparkSes val stateFunc = (key: String, values: Iterator[ClickEvent], state: GroupState[ClickState]) => { val currState = state.getOption.getOrElse(ClickState(key, 0)) - Iterator(ClickState(key, currState.count + values.size)) + val newState = ClickState(key, currState.count + values.size) + state.update(newState) + Iterator(newState) } val initialState = flatMapGroupsWithStateInitialStateData .toDS() @@ -141,7 +145,9 @@ class FlatMapGroupsWithStateStreamingSuite extends QueryTest with RemoteSparkSes val stateFunc = (key: String, values: Iterator[ClickEvent], state: GroupState[ClickState]) => { if (state.exists) throw new IllegalArgumentException("state.exists should be false") - ClickState(key, values.size) + val newState = ClickState(key, values.size) + state.update(newState) + newState } spark.sql("DROP TABLE IF EXISTS my_sink") @@ -183,7 +189,9 @@ class FlatMapGroupsWithStateStreamingSuite extends QueryTest with RemoteSparkSes val stateFunc = (key: String, values: Iterator[ClickEvent], state: GroupState[ClickState]) => { val currState = state.getOption.getOrElse(ClickState(key, 0)) - ClickState(key, currState.count + values.size) + val newState = ClickState(key, currState.count + values.size) + state.update(newState) + newState } val initialState = flatMapGroupsWithStateInitialStateData .toDS() diff --git a/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/test/IntegrationTestUtils.scala 
b/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/test/IntegrationTestUtils.scala index 61d08912aec23..3ae9b9fc73b48 100644 --- a/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/test/IntegrationTestUtils.scala +++ b/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/test/IntegrationTestUtils.scala @@ -74,7 +74,7 @@ object IntegrationTestUtils { // Redirect server log into console "--conf", - s"spark.driver.extraJavaOptions=-Dlog4j.configuration=$log4j2") + s"spark.driver.extraJavaOptions=-Dlog4j.configurationFile=$log4j2") } else Seq.empty } diff --git a/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/test/QueryTest.scala b/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/test/QueryTest.scala index 8837c76b76aeb..f22644074324c 100644 --- a/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/test/QueryTest.scala +++ b/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/test/QueryTest.scala @@ -19,8 +19,11 @@ package org.apache.spark.sql.test import java.util.TimeZone +import scala.jdk.CollectionConverters._ + import org.scalatest.Assertions +import org.apache.spark.{QueryContextType, SparkThrowable} import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession} import org.apache.spark.sql.catalyst.util.SparkStringUtils.sideBySide import org.apache.spark.util.ArrayImplicits._ @@ -53,6 +56,158 @@ abstract class QueryTest extends ConnectFunSuite with SQLHelper { checkAnswer(df, expectedAnswer.toImmutableArraySeq) } + case class ExpectedContext( + contextType: QueryContextType, + objectType: String, + objectName: String, + startIndex: Int, + stopIndex: Int, + fragment: String, + callSitePattern: String) + + object ExpectedContext { + def apply(fragment: String, start: Int, stop: Int): ExpectedContext = { + ExpectedContext("", "", start, stop, fragment) + } + + def apply( + objectType: String, + objectName: String, + startIndex: Int, + stopIndex: Int, + 
fragment: String): ExpectedContext = { + new ExpectedContext( + QueryContextType.SQL, + objectType, + objectName, + startIndex, + stopIndex, + fragment, + "") + } + + def apply(fragment: String, callSitePattern: String): ExpectedContext = { + new ExpectedContext(QueryContextType.DataFrame, "", "", -1, -1, fragment, callSitePattern) + } + } + + /** + * Checks an exception with an error condition against expected results. + * @param exception + * The exception to check + * @param condition + * The expected error condition identifying the error + * @param sqlState + * Optional the expected SQLSTATE, not verified if not supplied + * @param parameters + * A map of parameter names and values. The names are as defined in the error-classes file. + * @param matchPVals + * Optionally treat the parameters value as regular expression pattern. false if not supplied. + */ + protected def checkError( + exception: SparkThrowable, + condition: String, + sqlState: Option[String] = None, + parameters: Map[String, String] = Map.empty, + matchPVals: Boolean = false, + queryContext: Array[ExpectedContext] = Array.empty): Unit = { + assert(exception.getCondition === condition) + sqlState.foreach(state => assert(exception.getSqlState === state)) + val expectedParameters = exception.getMessageParameters.asScala + if (matchPVals) { + assert(expectedParameters.size === parameters.size) + expectedParameters.foreach(exp => { + val parm = parameters.getOrElse( + exp._1, + throw new IllegalArgumentException("Missing parameter" + exp._1)) + if (!exp._2.matches(parm)) { + throw new IllegalArgumentException( + "For parameter '" + exp._1 + "' value '" + exp._2 + + "' does not match: " + parm) + } + }) + } else { + assert(expectedParameters === parameters) + } + val actualQueryContext = exception.getQueryContext() + assert( + actualQueryContext.length === queryContext.length, + "Invalid length of the query context") + actualQueryContext.zip(queryContext).foreach { case (actual, expected) => + assert( 
+ actual.contextType() === expected.contextType, + "Invalid contextType of a query context Actual:" + actual.toString) + if (actual.contextType() == QueryContextType.SQL) { + assert( + actual.objectType() === expected.objectType, + "Invalid objectType of a query context Actual:" + actual.toString) + assert( + actual.objectName() === expected.objectName, + "Invalid objectName of a query context. Actual:" + actual.toString) + assert( + actual.startIndex() === expected.startIndex, + "Invalid startIndex of a query context. Actual:" + actual.toString) + assert( + actual.stopIndex() === expected.stopIndex, + "Invalid stopIndex of a query context. Actual:" + actual.toString) + assert( + actual.fragment() === expected.fragment, + "Invalid fragment of a query context. Actual:" + actual.toString) + } else if (actual.contextType() == QueryContextType.DataFrame) { + assert( + actual.fragment() === expected.fragment, + "Invalid code fragment of a query context. Actual:" + actual.toString) + if (expected.callSitePattern.nonEmpty) { + assert( + actual.callSite().matches(expected.callSitePattern), + "Invalid callSite of a query context. 
Actual:" + actual.toString) + } + } + } + } + + protected def checkError( + exception: SparkThrowable, + condition: String, + sqlState: String, + parameters: Map[String, String]): Unit = + checkError(exception, condition, Some(sqlState), parameters) + + protected def checkError( + exception: SparkThrowable, + condition: String, + sqlState: String, + parameters: Map[String, String], + context: ExpectedContext): Unit = + checkError(exception, condition, Some(sqlState), parameters, false, Array(context)) + + protected def checkError( + exception: SparkThrowable, + condition: String, + parameters: Map[String, String], + context: ExpectedContext): Unit = + checkError(exception, condition, None, parameters, false, Array(context)) + + protected def checkError( + exception: SparkThrowable, + condition: String, + sqlState: String, + context: ExpectedContext): Unit = + checkError(exception, condition, Some(sqlState), Map.empty, false, Array(context)) + + protected def checkError( + exception: SparkThrowable, + condition: String, + sqlState: Option[String], + parameters: Map[String, String], + context: ExpectedContext): Unit = + checkError(exception, condition, sqlState, parameters, false, Array(context)) + + protected def getCurrentClassCallSitePattern: String = { + val cs = Thread.currentThread().getStackTrace()(2) + s"${cs.getClassName}\\..*\\(${cs.getFileName}:\\d+\\)" + } + /** * Evaluates a dataset to make sure that the result of calling collect matches the given * expected answer. 
diff --git a/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/test/SQLHelper.scala b/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/test/SQLHelper.scala index 4a574a15f7ab8..d9828ae92267b 100644 --- a/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/test/SQLHelper.scala +++ b/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/test/SQLHelper.scala @@ -21,13 +21,28 @@ import java.util.UUID import org.scalatest.Assertions.fail -import org.apache.spark.sql.{AnalysisException, SparkSession} +import org.apache.spark.sql.{AnalysisException, DataFrame, SparkSession, SQLImplicits} +import org.apache.spark.sql.catalyst.analysis.NoSuchTableException import org.apache.spark.util.{SparkErrorUtils, SparkFileUtils} trait SQLHelper { def spark: SparkSession + // Shorthand for running a query using our SparkSession + protected lazy val sql: String => DataFrame = spark.sql _ + + /** + * A helper object for importing SQL implicits. + * + * Note that the alternative of importing `spark.implicits._` is not possible here. This is + * because we create the `SparkSession` immediately before the first test is run, but the + * implicits import is needed in the constructor. + */ + protected object testImplicits extends SQLImplicits { + override protected def session: SparkSession = spark + } + /** * Sets all SQL configurations specified in `pairs`, calls `f`, and then restores all SQL * configurations. @@ -96,6 +111,22 @@ trait SQLHelper { finally SparkFileUtils.deleteRecursively(path) } + /** + * Drops temporary view `viewNames` after calling `f`. + */ + protected def withTempView(viewNames: String*)(f: => Unit): Unit = { + SparkErrorUtils.tryWithSafeFinally(f) { + viewNames.foreach { viewName => + try spark.catalog.dropTempView(viewName) + catch { + // If the test failed part way, we don't want to mask the failure by failing to remove + // temp views that never got created. 
+ case _: NoSuchTableException => + } + } + } + } + /** * Drops table `tableName` after calling `f`. */ @@ -106,4 +137,13 @@ trait SQLHelper { } } } + + /** + * Drops view `viewName` after calling `f`. + */ + protected def withView(viewNames: String*)(f: => Unit): Unit = { + SparkErrorUtils.tryWithSafeFinally(f)(viewNames.foreach { name => + spark.sql(s"DROP VIEW IF EXISTS $name") + }) + } } diff --git a/connector/connect/docs/client-connection-string.md b/connector/connect/docs/client-connection-string.md index 37b2956a5c44a..df371c5beaaac 100644 --- a/connector/connect/docs/client-connection-string.md +++ b/connector/connect/docs/client-connection-string.md @@ -2,7 +2,7 @@ From the client perspective, Spark Connect mostly behaves as any other GRPC client and can be configured as such. However, to make it easy to use from -different programming languages and to have a homogenous connection surface +different programming languages and to have a homogeneous connection surface this document proposes what the user surface is for connecting to a Spark Connect endpoint. @@ -136,7 +136,7 @@ server_url = "sc://myhost.com:443/;use_ssl=true;token=ABCDEFG" As mentioned above, Spark Connect uses a regular GRPC client and the server path cannot be configured to remain compatible with the GRPC standard and HTTP. For -example the following examles are invalid. +example the following examples are invalid. 
```python server_url = "sc://myhost.com:443/mypathprefix/;token=AAAAAAA" diff --git a/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/DockerIntegrationFunSuite.scala b/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/DockerIntegrationFunSuite.scala index 9fbbc8ed2e0ff..b560f86ade38c 100644 --- a/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/DockerIntegrationFunSuite.scala +++ b/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/DockerIntegrationFunSuite.scala @@ -43,7 +43,7 @@ trait DockerIntegrationFunSuite extends SparkFunSuite { } } - /** Run the give body of code only if Kinesis tests are enabled */ + /** Run the given body of code only if ENABLE_DOCKER_INTEGRATION_TESTS is 1. */ def runIfTestsEnabled(message: String)(body: => Unit): Unit = { if (shouldRunTests) { body diff --git a/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/MariaDBDatabaseOnDocker.scala b/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/MariaDBDatabaseOnDocker.scala new file mode 100644 index 0000000000000..61930268eb2ab --- /dev/null +++ b/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/MariaDBDatabaseOnDocker.scala @@ -0,0 +1,33 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.jdbc + +import org.apache.spark.internal.Logging + +abstract class MariaDBDatabaseOnDocker extends DatabaseOnDocker with Logging { + override val imageName: String = + sys.env.getOrElse("MARIADB_DOCKER_IMAGE_NAME", "mariadb:10.11.10") + override val env: Map[String, String] = Map( + "MYSQL_ROOT_PASSWORD" -> "rootpass" + ) + override val usesIpc = false + override val jdbcPort = 3306 + + override def getEntryPoint: Option[String] = + Some("/docker-entrypoint/mariadb-docker-entrypoint.sh") +} diff --git a/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/MariaDBKrbIntegrationSuite.scala b/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/MariaDBKrbIntegrationSuite.scala index 32c552eb8c7eb..962c70510b5bd 100644 --- a/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/MariaDBKrbIntegrationSuite.scala +++ b/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/MariaDBKrbIntegrationSuite.scala @@ -37,20 +37,11 @@ class MariaDBKrbIntegrationSuite extends DockerKrbJDBCIntegrationSuite { override protected val userName = s"mariadb/$dockerIp" override protected val keytabFileName = "mariadb.keytab" - override val db = new DatabaseOnDocker { - override val imageName = sys.env.getOrElse("MARIADB_DOCKER_IMAGE_NAME", "mariadb:10.6.19") - override val env = Map( - "MYSQL_ROOT_PASSWORD" -> "rootpass" - ) - override val usesIpc = false - override val jdbcPort = 3306 + override val db = new MariaDBDatabaseOnDocker() { override def 
getJdbcUrl(ip: String, port: Int): String = s"jdbc:mysql://$ip:$port/mysql?user=$principal" - override def getEntryPoint: Option[String] = - Some("/docker-entrypoint/mariadb-docker-entrypoint.sh") - override def beforeContainerStart( hostConfigBuilder: HostConfig, containerConfigBuilder: ContainerConfig): Unit = { diff --git a/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/PostgresDatabaseOnDocker.scala b/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/PostgresDatabaseOnDocker.scala new file mode 100644 index 0000000000000..db2495ad3c698 --- /dev/null +++ b/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/PostgresDatabaseOnDocker.scala @@ -0,0 +1,36 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.jdbc + +import org.apache.spark.internal.Logging + +class PostgresDatabaseOnDocker extends DatabaseOnDocker with Logging { + lazy override val imageName: String = + sys.env.getOrElse("POSTGRES_DOCKER_IMAGE_NAME", "postgres:17.2-alpine") + private val postgres_user = "postgres" + private val postgres_password = "rootpass" + override val env: Map[String, String] = Map( + "POSTGRES_PASSWORD" -> postgres_password + ) + override val usesIpc = false + override val jdbcPort: Int = 5432 + + override def getJdbcUrl(ip: String, port: Int): String = { + s"jdbc:postgresql://$ip:$port/postgres?user=$postgres_user&password=$postgres_password" + } +} diff --git a/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/PostgresIntegrationSuite.scala b/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/PostgresIntegrationSuite.scala index 92a3e99586b5f..5c985da226b06 100644 --- a/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/PostgresIntegrationSuite.scala +++ b/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/PostgresIntegrationSuite.scala @@ -32,25 +32,16 @@ import org.apache.spark.sql.types._ import org.apache.spark.tags.DockerTest /** - * To run this test suite for a specific version (e.g., postgres:17.1-alpine): + * To run this test suite for a specific version (e.g., postgres:17.2-alpine): * {{{ - * ENABLE_DOCKER_INTEGRATION_TESTS=1 POSTGRES_DOCKER_IMAGE_NAME=postgres:17.1-alpine + * ENABLE_DOCKER_INTEGRATION_TESTS=1 POSTGRES_DOCKER_IMAGE_NAME=postgres:17.2-alpine * ./build/sbt -Pdocker-integration-tests * "docker-integration-tests/testOnly org.apache.spark.sql.jdbc.PostgresIntegrationSuite" * }}} */ @DockerTest class PostgresIntegrationSuite extends DockerJDBCIntegrationSuite { - override val db = new DatabaseOnDocker { - override val imageName = sys.env.getOrElse("POSTGRES_DOCKER_IMAGE_NAME", "postgres:17.1-alpine") - override val env = 
Map( - "POSTGRES_PASSWORD" -> "rootpass" - ) - override val usesIpc = false - override val jdbcPort = 5432 - override def getJdbcUrl(ip: String, port: Int): String = - s"jdbc:postgresql://$ip:$port/postgres?user=postgres&password=rootpass" - } + override val db = new PostgresDatabaseOnDocker override def dataPreparation(conn: Connection): Unit = { conn.prepareStatement("CREATE DATABASE foo").executeUpdate() diff --git a/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/PostgresKrbIntegrationSuite.scala b/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/PostgresKrbIntegrationSuite.scala index 7c9fc477dbb78..b3cfe8bd77e2b 100644 --- a/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/PostgresKrbIntegrationSuite.scala +++ b/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/PostgresKrbIntegrationSuite.scala @@ -25,9 +25,9 @@ import org.apache.spark.sql.execution.datasources.jdbc.connection.SecureConnecti import org.apache.spark.tags.DockerTest /** - * To run this test suite for a specific version (e.g., postgres:17.1-alpine): + * To run this test suite for a specific version (e.g., postgres:17.2-alpine): * {{{ - * ENABLE_DOCKER_INTEGRATION_TESTS=1 POSTGRES_DOCKER_IMAGE_NAME=postgres:17.1-alpine + * ENABLE_DOCKER_INTEGRATION_TESTS=1 POSTGRES_DOCKER_IMAGE_NAME=postgres:17.2-alpine * ./build/sbt -Pdocker-integration-tests * "docker-integration-tests/testOnly *PostgresKrbIntegrationSuite" * }}} @@ -37,14 +37,7 @@ class PostgresKrbIntegrationSuite extends DockerKrbJDBCIntegrationSuite { override protected val userName = s"postgres/$dockerIp" override protected val keytabFileName = "postgres.keytab" - override val db = new DatabaseOnDocker { - override val imageName = sys.env.getOrElse("POSTGRES_DOCKER_IMAGE_NAME", "postgres:17.1-alpine") - override val env = Map( - "POSTGRES_PASSWORD" -> "rootpass" - ) - override val usesIpc = false - override val jdbcPort = 5432 - + 
override val db = new PostgresDatabaseOnDocker { override def getJdbcUrl(ip: String, port: Int): String = s"jdbc:postgresql://$ip:$port/postgres?user=$principal&gsslib=gssapi" diff --git a/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/querytest/GeneratedSubquerySuite.scala b/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/querytest/GeneratedSubquerySuite.scala index b6917df2d428a..3a1d5e18b7e5a 100644 --- a/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/querytest/GeneratedSubquerySuite.scala +++ b/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/querytest/GeneratedSubquerySuite.scala @@ -28,9 +28,9 @@ import org.apache.spark.tags.DockerTest /** * This suite is used to generate subqueries, and test Spark against Postgres. - * To run this test suite for a specific version (e.g., postgres:17.1-alpine): + * To run this test suite for a specific version (e.g., postgres:17.2-alpine): * {{{ - * ENABLE_DOCKER_INTEGRATION_TESTS=1 POSTGRES_DOCKER_IMAGE_NAME=postgres:17.1-alpine + * ENABLE_DOCKER_INTEGRATION_TESTS=1 POSTGRES_DOCKER_IMAGE_NAME=postgres:17.2-alpine * ./build/sbt -Pdocker-integration-tests * "docker-integration-tests/testOnly org.apache.spark.sql.jdbc.GeneratedSubquerySuite" * }}} @@ -38,16 +38,7 @@ import org.apache.spark.tags.DockerTest @DockerTest class GeneratedSubquerySuite extends DockerJDBCIntegrationSuite with QueryGeneratorHelper { - override val db = new DatabaseOnDocker { - override val imageName = sys.env.getOrElse("POSTGRES_DOCKER_IMAGE_NAME", "postgres:17.1-alpine") - override val env = Map( - "POSTGRES_PASSWORD" -> "rootpass" - ) - override val usesIpc = false - override val jdbcPort = 5432 - override def getJdbcUrl(ip: String, port: Int): String = - s"jdbc:postgresql://$ip:$port/postgres?user=postgres&password=rootpass" - } + override val db = new PostgresDatabaseOnDocker private val FIRST_COLUMN = "a" private val SECOND_COLUMN = "b" 
diff --git a/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/querytest/PostgreSQLQueryTestSuite.scala b/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/querytest/PostgresSQLQueryTestSuite.scala similarity index 82% rename from connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/querytest/PostgreSQLQueryTestSuite.scala rename to connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/querytest/PostgresSQLQueryTestSuite.scala index 56a83cc0a34d6..28320a9e0a949 100644 --- a/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/querytest/PostgreSQLQueryTestSuite.scala +++ b/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/querytest/PostgresSQLQueryTestSuite.scala @@ -30,31 +30,21 @@ import org.apache.spark.tags.DockerTest * confidence, and you won't have to manually verify the golden files generated with your test. * 2. Add this line to your .sql file: --ONLY_IF spark * - * Note: To run this test suite for a specific version (e.g., postgres:17.1-alpine): + * Note: To run this test suite for a specific version (e.g., postgres:17.2-alpine): * {{{ - * ENABLE_DOCKER_INTEGRATION_TESTS=1 POSTGRES_DOCKER_IMAGE_NAME=postgres:17.1-alpine + * ENABLE_DOCKER_INTEGRATION_TESTS=1 POSTGRES_DOCKER_IMAGE_NAME=postgres:17.2-alpine * ./build/sbt -Pdocker-integration-tests * "testOnly org.apache.spark.sql.jdbc.PostgreSQLQueryTestSuite" * }}} */ @DockerTest -class PostgreSQLQueryTestSuite extends CrossDbmsQueryTestSuite { +class PostgresSQLQueryTestSuite extends CrossDbmsQueryTestSuite { val DATABASE_NAME = CrossDbmsQueryTestSuite.POSTGRES // Scope to only subquery directory for now. 
protected val customInputFilePath: String = new File(inputFilePath, "subquery").getAbsolutePath - override val db = new DatabaseOnDocker { - override val imageName = sys.env.getOrElse("POSTGRES_DOCKER_IMAGE_NAME", "postgres:17.1-alpine") - override val env = Map( - "POSTGRES_PASSWORD" -> "rootpass" - ) - override val usesIpc = false - override val jdbcPort = 5432 - - override def getJdbcUrl(ip: String, port: Int): String = - s"jdbc:postgresql://$ip:$port/postgres?user=postgres&password=rootpass" - } + override val db = new PostgresDatabaseOnDocker override def dataPreparation(conn: Connection): Unit = { conn.prepareStatement( diff --git a/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/v2/PostgresIntegrationSuite.scala b/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/v2/PostgresIntegrationSuite.scala index 95465cc6e40c5..eaf2a07ed4594 100644 --- a/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/v2/PostgresIntegrationSuite.scala +++ b/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/v2/PostgresIntegrationSuite.scala @@ -22,31 +22,23 @@ import java.sql.Connection import org.apache.spark.{SparkConf, SparkSQLException} import org.apache.spark.sql.AnalysisException import org.apache.spark.sql.catalyst.analysis.TableAlreadyExistsException +import org.apache.spark.sql.execution.FilterExec import org.apache.spark.sql.execution.datasources.v2.jdbc.JDBCTableCatalog -import org.apache.spark.sql.jdbc.DatabaseOnDocker +import org.apache.spark.sql.jdbc.PostgresDatabaseOnDocker import org.apache.spark.sql.types._ import org.apache.spark.tags.DockerTest /** - * To run this test suite for a specific version (e.g., postgres:17.1-alpine) + * To run this test suite for a specific version (e.g., postgres:17.2-alpine) * {{{ - * ENABLE_DOCKER_INTEGRATION_TESTS=1 POSTGRES_DOCKER_IMAGE_NAME=postgres:17.1-alpine + * ENABLE_DOCKER_INTEGRATION_TESTS=1 
POSTGRES_DOCKER_IMAGE_NAME=postgres:17.2-alpine * ./build/sbt -Pdocker-integration-tests "testOnly *v2.PostgresIntegrationSuite" * }}} */ @DockerTest class PostgresIntegrationSuite extends DockerJDBCIntegrationV2Suite with V2JDBCTest { override val catalogName: String = "postgresql" - override val db = new DatabaseOnDocker { - override val imageName = sys.env.getOrElse("POSTGRES_DOCKER_IMAGE_NAME", "postgres:17.1-alpine") - override val env = Map( - "POSTGRES_PASSWORD" -> "rootpass" - ) - override val usesIpc = false - override val jdbcPort = 5432 - override def getJdbcUrl(ip: String, port: Int): String = - s"jdbc:postgresql://$ip:$port/postgres?user=postgres&password=rootpass" - } + override val db = new PostgresDatabaseOnDocker override def sparkConf: SparkConf = super.sparkConf .set("spark.sql.catalog.postgresql", classOf[JDBCTableCatalog].getName) .set("spark.sql.catalog.postgresql.url", db.getJdbcUrl(dockerIp, externalPort)) @@ -252,6 +244,15 @@ class PostgresIntegrationSuite extends DockerJDBCIntegrationV2Suite with V2JDBCT } } + test("SPARK-49695: Postgres fix xor push-down") { + val df = spark.sql(s"select dept, name from $catalogName.employee where dept ^ 6 = 0") + val rows = df.collect() + assert(!df.queryExecution.sparkPlan.exists(_.isInstanceOf[FilterExec])) + assert(rows.length == 1) + assert(rows(0).getInt(0) === 6) + assert(rows(0).getString(1) === "jen") + } + override def testDatetime(tbl: String): Unit = { val df1 = sql(s"SELECT name FROM $tbl WHERE " + "dayofyear(date1) > 100 AND dayofmonth(date1) > 10 ") diff --git a/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/v2/PostgresNamespaceSuite.scala b/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/v2/PostgresNamespaceSuite.scala index 75f7ede5bc733..f84bdb46850f2 100644 --- a/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/v2/PostgresNamespaceSuite.scala +++ 
b/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/v2/PostgresNamespaceSuite.scala @@ -21,29 +21,20 @@ import java.sql.Connection import scala.jdk.CollectionConverters._ -import org.apache.spark.sql.jdbc.{DatabaseOnDocker, DockerJDBCIntegrationSuite} +import org.apache.spark.sql.jdbc.{DockerJDBCIntegrationSuite, PostgresDatabaseOnDocker} import org.apache.spark.sql.util.CaseInsensitiveStringMap import org.apache.spark.tags.DockerTest /** - * To run this test suite for a specific version (e.g., postgres:17.1-alpine): + * To run this test suite for a specific version (e.g., postgres:17.2-alpine): * {{{ - * ENABLE_DOCKER_INTEGRATION_TESTS=1 POSTGRES_DOCKER_IMAGE_NAME=postgres:17.1-alpine + * ENABLE_DOCKER_INTEGRATION_TESTS=1 POSTGRES_DOCKER_IMAGE_NAME=postgres:17.2-alpine * ./build/sbt -Pdocker-integration-tests "testOnly *v2.PostgresNamespaceSuite" * }}} */ @DockerTest class PostgresNamespaceSuite extends DockerJDBCIntegrationSuite with V2JDBCNamespaceTest { - override val db = new DatabaseOnDocker { - override val imageName = sys.env.getOrElse("POSTGRES_DOCKER_IMAGE_NAME", "postgres:17.1-alpine") - override val env = Map( - "POSTGRES_PASSWORD" -> "rootpass" - ) - override val usesIpc = false - override val jdbcPort = 5432 - override def getJdbcUrl(ip: String, port: Int): String = - s"jdbc:postgresql://$ip:$port/postgres?user=postgres&password=rootpass" - } + override val db = new PostgresDatabaseOnDocker val map = new CaseInsensitiveStringMap( Map("url" -> db.getJdbcUrl(dockerIp, externalPort), diff --git a/connector/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaMicroBatchSourceSuite.scala b/connector/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaMicroBatchSourceSuite.scala index 1d119de43970f..22eeae97874b1 100644 --- a/connector/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaMicroBatchSourceSuite.scala +++ 
b/connector/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaMicroBatchSourceSuite.scala @@ -1591,22 +1591,7 @@ abstract class KafkaMicroBatchSourceSuiteBase extends KafkaSourceSuiteBase with } } - -class KafkaMicroBatchV1SourceWithAdminSuite extends KafkaMicroBatchV1SourceSuite { - override def beforeAll(): Unit = { - super.beforeAll() - spark.conf.set(SQLConf.USE_DEPRECATED_KAFKA_OFFSET_FETCHING.key, "false") - } -} - -class KafkaMicroBatchV2SourceWithAdminSuite extends KafkaMicroBatchV2SourceSuite { - override def beforeAll(): Unit = { - super.beforeAll() - spark.conf.set(SQLConf.USE_DEPRECATED_KAFKA_OFFSET_FETCHING.key, "false") - } -} - -class KafkaMicroBatchV1SourceSuite extends KafkaMicroBatchSourceSuiteBase { +abstract class KafkaMicroBatchV1SourceSuite extends KafkaMicroBatchSourceSuiteBase { override def beforeAll(): Unit = { super.beforeAll() spark.conf.set( @@ -1637,7 +1622,7 @@ class KafkaMicroBatchV1SourceSuite extends KafkaMicroBatchSourceSuiteBase { } } -class KafkaMicroBatchV2SourceSuite extends KafkaMicroBatchSourceSuiteBase { +abstract class KafkaMicroBatchV2SourceSuite extends KafkaMicroBatchSourceSuiteBase { test("V2 Source is used by default") { val topic = newTopic() @@ -1870,6 +1855,35 @@ class KafkaMicroBatchV2SourceSuite extends KafkaMicroBatchSourceSuiteBase { } } +class KafkaMicroBatchV1SourceWithAdminSuite extends KafkaMicroBatchV1SourceSuite { + override def beforeAll(): Unit = { + super.beforeAll() + spark.conf.set(SQLConf.USE_DEPRECATED_KAFKA_OFFSET_FETCHING.key, "false") + } +} + +class KafkaMicroBatchV1SourceWithConsumerSuite extends KafkaMicroBatchV1SourceSuite { + override def beforeAll(): Unit = { + super.beforeAll() + spark.conf.set(SQLConf.USE_DEPRECATED_KAFKA_OFFSET_FETCHING.key, "true") + } +} + +class KafkaMicroBatchV2SourceWithAdminSuite extends KafkaMicroBatchV2SourceSuite { + override def beforeAll(): Unit = { + super.beforeAll() + spark.conf.set(SQLConf.USE_DEPRECATED_KAFKA_OFFSET_FETCHING.key, "false") 
+ } +} + +class KafkaMicroBatchV2SourceWithConsumerSuite extends KafkaMicroBatchV2SourceSuite { + override def beforeAll(): Unit = { + super.beforeAll() + spark.conf.set(SQLConf.USE_DEPRECATED_KAFKA_OFFSET_FETCHING.key, "true") + } +} + + abstract class KafkaSourceSuiteBase extends KafkaSourceTest { import testImplicits._ diff --git a/connector/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaTestUtils.scala b/connector/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaTestUtils.scala index 9e06b6c6ff4a2..60de3705636ec 100644 --- a/connector/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaTestUtils.scala +++ b/connector/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaTestUtils.scala @@ -504,9 +504,7 @@ class KafkaTestUtils( props.put("sasl.enabled.mechanisms", "GSSAPI,SCRAM-SHA-512") } - // Can not use properties.putAll(propsMap.asJava) in scala-2.12 - // See https://github.com/scala/bug/issues/10418 - withBrokerProps.foreach { case (k, v) => props.put(k, v) } + props.putAll(withBrokerProps.asJava) props } diff --git a/connector/kafka-0-10/src/main/scala/org/apache/spark/streaming/kafka010/DirectKafkaInputDStream.scala b/connector/kafka-0-10/src/main/scala/org/apache/spark/streaming/kafka010/DirectKafkaInputDStream.scala index cefaa3de182a5..f7bea064d2d6c 100644 --- a/connector/kafka-0-10/src/main/scala/org/apache/spark/streaming/kafka010/DirectKafkaInputDStream.scala +++ b/connector/kafka-0-10/src/main/scala/org/apache/spark/streaming/kafka010/DirectKafkaInputDStream.scala @@ -57,7 +57,7 @@ private[spark] class DirectKafkaInputDStream[K, V]( ppc: PerPartitionConfig ) extends InputDStream[ConsumerRecord[K, V]](_ssc) with Logging with CanCommitOffsets { - private val initialRate = context.sparkContext.getConf.getLong( + private val initialRate = context.sparkContext.getReadOnlyConf.getLong( "spark.streaming.backpressure.initialRate", 0) val executorKafkaParams = { diff --git 
a/connector/kinesis-asl/src/main/scala/org/apache/spark/examples/streaming/KinesisWordCountASL.scala b/connector/kinesis-asl/src/main/scala/org/apache/spark/examples/streaming/KinesisWordCountASL.scala index 4835e9de086c4..cc24c378f4cbf 100644 --- a/connector/kinesis-asl/src/main/scala/org/apache/spark/examples/streaming/KinesisWordCountASL.scala +++ b/connector/kinesis-asl/src/main/scala/org/apache/spark/examples/streaming/KinesisWordCountASL.scala @@ -275,7 +275,7 @@ private[streaming] object StreamingExamples extends Logging { // We first log something to initialize Spark's default logging, then we override the // logging level. logInfo("Setting log level to [WARN] for streaming example." + - " To override add a custom log4j.properties to the classpath.") + " To override add a custom log4j2.properties to the classpath.") Configurator.setRootLevel(Level.WARN) } } diff --git a/connector/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisRecordProcessor.scala b/connector/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisRecordProcessor.scala index aaafb3215d031..cd740f971e484 100644 --- a/connector/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisRecordProcessor.scala +++ b/connector/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisRecordProcessor.scala @@ -27,7 +27,7 @@ import com.amazonaws.services.kinesis.clientlibrary.lib.worker.ShutdownReason import com.amazonaws.services.kinesis.model.Record import org.apache.spark.internal.{Logging, MDC} -import org.apache.spark.internal.LogKeys.{REASON, RETRY_INTERVAL, SHARD_ID, WORKER_URL} +import org.apache.spark.internal.LogKeys.{KINESIS_REASON, RETRY_INTERVAL, SHARD_ID, WORKER_URL} /** * Kinesis-specific implementation of the Kinesis Client Library (KCL) IRecordProcessor. 
@@ -119,7 +119,7 @@ private[kinesis] class KinesisRecordProcessor[T](receiver: KinesisReceiver[T], w checkpointer: IRecordProcessorCheckpointer, reason: ShutdownReason): Unit = { logInfo(log"Shutdown: Shutting down workerId ${MDC(WORKER_URL, workerId)} " + - log"with reason ${MDC(REASON, reason)}") + log"with reason ${MDC(KINESIS_REASON, reason)}") // null if not initialized before shutdown: if (shardId == null) { logWarning(log"No shardId for workerId ${MDC(WORKER_URL, workerId)}?") diff --git a/connector/kinesis-asl/src/test/scala/org/apache/spark/streaming/kinesis/KinesisFunSuite.scala b/connector/kinesis-asl/src/test/scala/org/apache/spark/streaming/kinesis/KinesisFunSuite.scala index 8dc4de1aa3609..7098840d62f91 100644 --- a/connector/kinesis-asl/src/test/scala/org/apache/spark/streaming/kinesis/KinesisFunSuite.scala +++ b/connector/kinesis-asl/src/test/scala/org/apache/spark/streaming/kinesis/KinesisFunSuite.scala @@ -35,7 +35,7 @@ trait KinesisFunSuite extends SparkFunSuite { } } - /** Run the give body of code only if Kinesis tests are enabled */ + /** Run the given body of code only if ENABLE_KINESIS_TESTS is 1. */ def runIfTestsEnabled(message: String)(body: => Unit): Unit = { if (shouldRunTests) { body diff --git a/connector/profiler/README.md b/connector/profiler/README.md index 1326fd55df097..30d897f21b065 100644 --- a/connector/profiler/README.md +++ b/connector/profiler/README.md @@ -3,8 +3,15 @@ ## Build To build + +``` +./build/mvn clean package -DskipTests -Pjvm-profiler -pl :spark-profiler_2.13 -am +``` + +or + ``` - ./build/mvn clean package -DskipTests -Pjvm-profiler +./build/sbt -Pjvm-profiler clean "profiler/package" ``` ## Executor Code Profiling @@ -16,7 +23,7 @@ The profiler writes the jfr files to the executor's working directory in the exe Code profiling is currently only supported for * Linux (x64) -* Linux (arm 64) +* Linux (arm64) * Linux (musl, x64) * MacOS @@ -54,7 +61,7 @@ Then enable the profiling in the configuration. 
spark.executor.profiling.dfsDir (none) - An HDFS compatible path to which the profiler's output files are copied. The output files will be written as dfsDir/application_id/profile-appname-exec-executor_id.jfr
+ An HDFS compatible path to which the profiler's output files are copied. The output files will be written as dfsDir/{{APP_ID}}/profile-exec-{{EXECUTOR_ID}}.jfr
If no dfsDir is specified then the files are not copied over. Users should ensure there is sufficient disk space available otherwise it may lead to corrupt jfr files. 4.0.0 @@ -72,7 +79,7 @@ Then enable the profiling in the configuration. event=wall,interval=10ms,alloc=2m,lock=10ms,chunktime=300s Options to pass to the profiler. Detailed options are documented in the comments here: - Profiler arguments. + Profiler arguments. Note that the options to start, stop, specify output format, and output file do not have to be specified. 4.0.0 diff --git a/connector/profiler/src/main/scala/org/apache/spark/executor/profiler/ExecutorJVMProfiler.scala b/connector/profiler/src/main/scala/org/apache/spark/executor/profiler/ExecutorJVMProfiler.scala index 20b6db5221fa9..94e5b46c65881 100644 --- a/connector/profiler/src/main/scala/org/apache/spark/executor/profiler/ExecutorJVMProfiler.scala +++ b/connector/profiler/src/main/scala/org/apache/spark/executor/profiler/ExecutorJVMProfiler.scala @@ -17,17 +17,17 @@ package org.apache.spark.executor.profiler import java.io.{BufferedInputStream, FileInputStream, InputStream, IOException} -import java.net.URI import java.util.concurrent.{ScheduledExecutorService, TimeUnit} import one.profiler.{AsyncProfiler, AsyncProfilerLoader} import org.apache.hadoop.fs.{FileSystem, FSDataOutputStream, Path} +import org.apache.hadoop.fs.permission.FsPermission import org.apache.spark.SparkConf import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.internal.{Logging, MDC} import org.apache.spark.internal.LogKeys.PATH -import org.apache.spark.util.ThreadUtils +import org.apache.spark.util.{ThreadUtils, Utils} /** @@ -38,15 +38,26 @@ private[spark] class ExecutorJVMProfiler(conf: SparkConf, executorId: String) ex private var running = false private val enableProfiler = conf.get(EXECUTOR_PROFILING_ENABLED) private val profilerOptions = conf.get(EXECUTOR_PROFILING_OPTIONS) - private val profilerDfsDir = conf.get(EXECUTOR_PROFILING_DFS_DIR) + 
private val profilerDfsDirOpt = conf.get(EXECUTOR_PROFILING_DFS_DIR) private val profilerLocalDir = conf.get(EXECUTOR_PROFILING_LOCAL_DIR) private val writeInterval = conf.get(EXECUTOR_PROFILING_WRITE_INTERVAL) - private val startcmd = s"start,$profilerOptions,file=$profilerLocalDir/profile.jfr" - private val stopcmd = s"stop,$profilerOptions,file=$profilerLocalDir/profile.jfr" - private val dumpcmd = s"dump,$profilerOptions,file=$profilerLocalDir/profile.jfr" - private val resumecmd = s"resume,$profilerOptions,file=$profilerLocalDir/profile.jfr" + private val appId = try { + conf.getAppId + } catch { + case _: NoSuchElementException => "local-" + System.currentTimeMillis + } + private val appAttemptId = conf.getOption("spark.app.attempt.id") + private val baseName = Utils.nameForAppAndAttempt(appId, appAttemptId) + private val profileFile = s"profile-exec-$executorId.jfr" + + private val startcmd = s"start,$profilerOptions,file=$profilerLocalDir/$profileFile" + private val stopcmd = s"stop,$profilerOptions,file=$profilerLocalDir/$profileFile" + private val dumpcmd = s"dump,$profilerOptions,file=$profilerLocalDir/$profileFile" + private val resumecmd = s"resume,$profilerOptions,file=$profilerLocalDir/$profileFile" + private val PROFILER_FOLDER_PERMISSIONS = new FsPermission(Integer.parseInt("770", 8).toShort) + private val PROFILER_FILE_PERMISSIONS = new FsPermission(Integer.parseInt("660", 8).toShort) private val UPLOAD_SIZE = 8 * 1024 * 1024 // 8 MB private var outputStream: FSDataOutputStream = _ private var inputStream: InputStream = _ @@ -89,28 +100,34 @@ private[spark] class ExecutorJVMProfiler(conf: SparkConf, executorId: String) ex } } + private def requireProfilerBaseDirAsDirectory(fs: FileSystem, profilerDfsDir: String): Unit = { + if (!fs.getFileStatus(new Path(profilerDfsDir)).isDirectory) { + throw new IllegalArgumentException( + s"Profiler DFS base directory $profilerDfsDir is not a directory.") + } + } + private def startWriting(): Unit = { - if 
(profilerDfsDir.isDefined) { - val applicationId = try { - conf.getAppId - } catch { - case _: NoSuchElementException => "local-" + System.currentTimeMillis + profilerDfsDirOpt.foreach { profilerDfsDir => + val profilerDirForApp = s"$profilerDfsDir/$baseName" + val profileOutputFile = s"$profilerDirForApp/$profileFile" + + val hadoopConf = SparkHadoopUtil.get.newConfiguration(conf) + val fs = Utils.getHadoopFileSystem(profilerDfsDir, hadoopConf) + + requireProfilerBaseDirAsDirectory(fs, profilerDfsDir) + + val profilerDirForAppPath = new Path(profilerDirForApp) + if (!fs.exists(profilerDirForAppPath)) { + // SPARK-30860: use the class method to avoid the umask causing permission issues + FileSystem.mkdirs(fs, profilerDirForAppPath, PROFILER_FOLDER_PERMISSIONS) } - val config = SparkHadoopUtil.get.newConfiguration(conf) - val appName = conf.get("spark.app.name").replace(" ", "-") - val profilerOutputDirname = profilerDfsDir.get - - val profileOutputFile = - s"$profilerOutputDirname/$applicationId/profile-$appName-exec-$executorId.jfr" - val fs = FileSystem.get(new URI(profileOutputFile), config); - val filenamePath = new Path(profileOutputFile) - outputStream = fs.create(filenamePath) + + outputStream = FileSystem.create(fs, new Path(profileOutputFile), PROFILER_FILE_PERMISSIONS) try { - if (fs.exists(filenamePath)) { - fs.delete(filenamePath, true) - } logInfo(log"Copying executor profiling file to ${MDC(PATH, profileOutputFile)}") - inputStream = new BufferedInputStream(new FileInputStream(s"$profilerLocalDir/profile.jfr")) + inputStream = new BufferedInputStream( + new FileInputStream(s"$profilerLocalDir/$profileFile")) threadpool = ThreadUtils.newDaemonSingleThreadScheduledExecutor("profilerOutputThread") threadpool.scheduleWithFixedDelay( new Runnable() { @@ -158,14 +175,14 @@ private[spark] class ExecutorJVMProfiler(conf: SparkConf, executorId: String) ex } catch { case e: IOException => logError("Exception occurred while writing some profiler output: ", e) 
case e @ (_: IllegalArgumentException | _: IllegalStateException) => - logError("Some profiler output not written." + - " Exception occurred in profiler native code: ", e) + logError("Some profiler output not written. " + + "Exception occurred in profiler native code: ", e) case e: Exception => logError("Some profiler output not written. Unexpected exception: ", e) } } private def finishWriting(): Unit = { - if (profilerDfsDir.isDefined && writing) { + if (profilerDfsDirOpt.isDefined && writing) { try { // shutdown background writer threadpool.shutdown() @@ -177,8 +194,8 @@ private[spark] class ExecutorJVMProfiler(conf: SparkConf, executorId: String) ex } catch { case _: InterruptedException => Thread.currentThread().interrupt() case e: IOException => - logWarning("Some profiling output not written." + - "Exception occurred while completing profiler output", e) + logWarning("Some profiling output not written. " + + "Exception occurred while completing profiler output: ", e) } writing = false } diff --git a/connector/protobuf/pom.xml b/connector/protobuf/pom.xml index e85481ef9e1c8..22d24a7cdb62d 100644 --- a/connector/protobuf/pom.xml +++ b/connector/protobuf/pom.xml @@ -189,7 +189,7 @@ src/test/resources/protobuf - true + direct java diff --git a/core/benchmarks/ChecksumBenchmark-jdk21-results.txt b/core/benchmarks/ChecksumBenchmark-jdk21-results.txt index 85370450f355c..9e20379abe1f5 100644 --- a/core/benchmarks/ChecksumBenchmark-jdk21-results.txt +++ b/core/benchmarks/ChecksumBenchmark-jdk21-results.txt @@ -2,13 +2,12 @@ Benchmark Checksum Algorithms ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Checksum Algorithms: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
------------------------------------------------------------------------------------------------------------------------ -CRC32 2743 2746 3 0.0 2678409.9 1.0X -CRC32C 1974 2055 70 0.0 1928129.2 1.4X -Adler32 12689 12709 17 0.0 12391425.9 0.2X -hadoop PureJavaCrc32C 23027 23041 13 0.0 22487098.9 0.1X +Adler32 11109 11110 1 0.0 10848227.5 1.0X +CRC32 2740 2748 7 0.0 2676147.3 4.1X +CRC32C 1824 1837 22 0.0 1781283.4 6.1X diff --git a/core/benchmarks/ChecksumBenchmark-results.txt b/core/benchmarks/ChecksumBenchmark-results.txt index cce5a61abf637..5422cabf4b2b7 100644 --- a/core/benchmarks/ChecksumBenchmark-results.txt +++ b/core/benchmarks/ChecksumBenchmark-results.txt @@ -2,13 +2,12 @@ Benchmark Checksum Algorithms ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Checksum Algorithms: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -CRC32 2757 2758 1 0.0 2692250.2 1.0X -CRC32C 2142 2244 116 0.0 2091901.8 1.3X -Adler32 12699 12712 15 0.0 12401205.6 0.2X -hadoop PureJavaCrc32C 23049 23066 15 0.0 22508320.3 0.1X +Adler32 11113 11117 4 0.0 10852521.2 1.0X +CRC32 2765 2766 1 0.0 2699768.2 4.0X +CRC32C 2003 2033 45 0.0 1955654.6 5.5X diff --git a/core/benchmarks/CoalescedRDDBenchmark-jdk21-results.txt b/core/benchmarks/CoalescedRDDBenchmark-jdk21-results.txt index 1daac7b710bbf..07e8f05a3d185 100644 --- a/core/benchmarks/CoalescedRDDBenchmark-jdk21-results.txt +++ b/core/benchmarks/CoalescedRDDBenchmark-jdk21-results.txt @@ -2,39 +2,39 @@ Coalesced RDD , large scale ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure 
+OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Coalesced RDD: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ---------------------------------------------------------------------------------------------------------------------------- -Coalesce Num Partitions: 100 Num Hosts: 1 359 371 12 0.3 3586.9 1.0X -Coalesce Num Partitions: 100 Num Hosts: 5 181 190 14 0.6 1812.1 2.0X -Coalesce Num Partitions: 100 Num Hosts: 10 178 185 6 0.6 1779.9 2.0X -Coalesce Num Partitions: 100 Num Hosts: 20 153 156 4 0.7 1531.2 2.3X -Coalesce Num Partitions: 100 Num Hosts: 40 148 149 1 0.7 1479.1 2.4X -Coalesce Num Partitions: 100 Num Hosts: 80 166 170 5 0.6 1657.8 2.2X -Coalesce Num Partitions: 500 Num Hosts: 1 1054 1064 14 0.1 10543.7 0.3X -Coalesce Num Partitions: 500 Num Hosts: 5 331 339 13 0.3 3311.1 1.1X -Coalesce Num Partitions: 500 Num Hosts: 10 230 235 8 0.4 2295.7 1.6X -Coalesce Num Partitions: 500 Num Hosts: 20 218 220 1 0.5 2182.0 1.6X -Coalesce Num Partitions: 500 Num Hosts: 40 161 164 2 0.6 1614.8 2.2X -Coalesce Num Partitions: 500 Num Hosts: 80 137 142 7 0.7 1371.6 2.6X -Coalesce Num Partitions: 1000 Num Hosts: 1 1926 1929 3 0.1 19264.6 0.2X -Coalesce Num Partitions: 1000 Num Hosts: 5 501 507 10 0.2 5011.1 0.7X -Coalesce Num Partitions: 1000 Num Hosts: 10 327 331 4 0.3 3268.5 1.1X -Coalesce Num Partitions: 1000 Num Hosts: 20 256 264 8 0.4 2556.1 1.4X -Coalesce Num Partitions: 1000 Num Hosts: 40 185 191 7 0.5 1853.2 1.9X -Coalesce Num Partitions: 1000 Num Hosts: 80 160 166 5 0.6 1603.5 2.2X -Coalesce Num Partitions: 5000 Num Hosts: 1 8672 9054 615 0.0 86716.9 0.0X -Coalesce Num Partitions: 5000 Num Hosts: 5 2016 2020 6 0.0 20159.9 0.2X -Coalesce Num Partitions: 5000 Num Hosts: 10 1084 1096 10 0.1 10844.7 0.3X -Coalesce Num Partitions: 5000 Num Hosts: 20 625 636 11 0.2 6245.6 0.6X -Coalesce Num Partitions: 5000 Num Hosts: 40 418 425 6 0.2 4182.3 0.9X -Coalesce Num Partitions: 5000 Num Hosts: 80 270 276 8 
0.4 2704.6 1.3X -Coalesce Num Partitions: 10000 Num Hosts: 1 16208 16391 226 0.0 162076.8 0.0X -Coalesce Num Partitions: 10000 Num Hosts: 5 3930 3949 23 0.0 39300.4 0.1X -Coalesce Num Partitions: 10000 Num Hosts: 10 2021 2031 11 0.0 20213.1 0.2X -Coalesce Num Partitions: 10000 Num Hosts: 20 1114 1115 1 0.1 11139.0 0.3X -Coalesce Num Partitions: 10000 Num Hosts: 40 628 639 17 0.2 6275.3 0.6X -Coalesce Num Partitions: 10000 Num Hosts: 80 402 408 10 0.2 4016.4 0.9X +Coalesce Num Partitions: 100 Num Hosts: 1 268 309 37 0.4 2678.2 1.0X +Coalesce Num Partitions: 100 Num Hosts: 5 124 130 5 0.8 1244.1 2.2X +Coalesce Num Partitions: 100 Num Hosts: 10 107 115 7 0.9 1068.7 2.5X +Coalesce Num Partitions: 100 Num Hosts: 20 104 112 6 1.0 1044.6 2.6X +Coalesce Num Partitions: 100 Num Hosts: 40 106 114 10 0.9 1062.8 2.5X +Coalesce Num Partitions: 100 Num Hosts: 80 101 106 5 1.0 1009.3 2.7X +Coalesce Num Partitions: 500 Num Hosts: 1 885 915 34 0.1 8854.5 0.3X +Coalesce Num Partitions: 500 Num Hosts: 5 263 268 5 0.4 2630.1 1.0X +Coalesce Num Partitions: 500 Num Hosts: 10 181 184 3 0.6 1806.0 1.5X +Coalesce Num Partitions: 500 Num Hosts: 20 139 144 8 0.7 1387.0 1.9X +Coalesce Num Partitions: 500 Num Hosts: 40 116 118 2 0.9 1163.7 2.3X +Coalesce Num Partitions: 500 Num Hosts: 80 108 115 7 0.9 1078.8 2.5X +Coalesce Num Partitions: 1000 Num Hosts: 1 1683 1735 79 0.1 16828.2 0.2X +Coalesce Num Partitions: 1000 Num Hosts: 5 446 449 2 0.2 4461.4 0.6X +Coalesce Num Partitions: 1000 Num Hosts: 10 256 262 7 0.4 2562.3 1.0X +Coalesce Num Partitions: 1000 Num Hosts: 20 182 189 5 0.5 1824.3 1.5X +Coalesce Num Partitions: 1000 Num Hosts: 40 141 145 3 0.7 1413.2 1.9X +Coalesce Num Partitions: 1000 Num Hosts: 80 120 126 8 0.8 1203.3 2.2X +Coalesce Num Partitions: 5000 Num Hosts: 1 7913 8247 291 0.0 79127.6 0.0X +Coalesce Num Partitions: 5000 Num Hosts: 5 1818 1846 24 0.1 18177.8 0.1X +Coalesce Num Partitions: 5000 Num Hosts: 10 990 992 2 0.1 9902.6 0.3X +Coalesce Num Partitions: 5000 Num Hosts: 20 
543 545 1 0.2 5432.5 0.5X +Coalesce Num Partitions: 5000 Num Hosts: 40 327 337 11 0.3 3272.6 0.8X +Coalesce Num Partitions: 5000 Num Hosts: 80 211 218 6 0.5 2112.1 1.3X +Coalesce Num Partitions: 10000 Num Hosts: 1 14709 15246 580 0.0 147087.6 0.0X +Coalesce Num Partitions: 10000 Num Hosts: 5 3485 3511 27 0.0 34849.5 0.1X +Coalesce Num Partitions: 10000 Num Hosts: 10 1777 1801 22 0.1 17773.8 0.2X +Coalesce Num Partitions: 10000 Num Hosts: 20 952 953 1 0.1 9517.2 0.3X +Coalesce Num Partitions: 10000 Num Hosts: 40 523 533 9 0.2 5229.6 0.5X +Coalesce Num Partitions: 10000 Num Hosts: 80 316 319 5 0.3 3158.4 0.8X diff --git a/core/benchmarks/CoalescedRDDBenchmark-results.txt b/core/benchmarks/CoalescedRDDBenchmark-results.txt index d370e6956116d..520cb5661a276 100644 --- a/core/benchmarks/CoalescedRDDBenchmark-results.txt +++ b/core/benchmarks/CoalescedRDDBenchmark-results.txt @@ -2,39 +2,39 @@ Coalesced RDD , large scale ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Coalesced RDD: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ---------------------------------------------------------------------------------------------------------------------------- -Coalesce Num Partitions: 100 Num Hosts: 1 126 145 24 0.8 1257.7 1.0X -Coalesce Num Partitions: 100 Num Hosts: 5 104 105 1 1.0 1039.8 1.2X -Coalesce Num Partitions: 100 Num Hosts: 10 87 89 2 1.1 873.7 1.4X -Coalesce Num Partitions: 100 Num Hosts: 20 89 90 1 1.1 893.4 1.4X -Coalesce Num Partitions: 100 Num Hosts: 40 88 91 6 1.1 880.4 1.4X -Coalesce Num Partitions: 100 Num Hosts: 80 88 94 10 1.1 875.9 1.4X -Coalesce Num Partitions: 500 Num Hosts: 1 308 314 8 0.3 3078.0 0.4X -Coalesce Num Partitions: 500 Num Hosts: 5 133 136 2 0.7 1334.6 0.9X -Coalesce Num Partitions: 500 Num Hosts: 10 112 114 4 
0.9 1118.8 1.1X -Coalesce Num Partitions: 500 Num Hosts: 20 100 111 16 1.0 1004.0 1.3X -Coalesce Num Partitions: 500 Num Hosts: 40 100 106 6 1.0 999.7 1.3X -Coalesce Num Partitions: 500 Num Hosts: 80 95 100 6 1.0 954.0 1.3X -Coalesce Num Partitions: 1000 Num Hosts: 1 531 539 14 0.2 5311.0 0.2X -Coalesce Num Partitions: 1000 Num Hosts: 5 197 201 4 0.5 1970.3 0.6X -Coalesce Num Partitions: 1000 Num Hosts: 10 139 141 2 0.7 1392.4 0.9X -Coalesce Num Partitions: 1000 Num Hosts: 20 114 115 1 0.9 1137.9 1.1X -Coalesce Num Partitions: 1000 Num Hosts: 40 105 108 3 0.9 1054.9 1.2X -Coalesce Num Partitions: 1000 Num Hosts: 80 105 109 4 1.0 1047.4 1.2X -Coalesce Num Partitions: 5000 Num Hosts: 1 2336 2354 16 0.0 23362.8 0.1X -Coalesce Num Partitions: 5000 Num Hosts: 5 680 684 4 0.1 6798.7 0.2X -Coalesce Num Partitions: 5000 Num Hosts: 10 381 390 8 0.3 3810.5 0.3X -Coalesce Num Partitions: 5000 Num Hosts: 20 253 255 2 0.4 2529.6 0.5X -Coalesce Num Partitions: 5000 Num Hosts: 40 171 174 3 0.6 1706.1 0.7X -Coalesce Num Partitions: 5000 Num Hosts: 80 137 139 3 0.7 1365.5 0.9X -Coalesce Num Partitions: 10000 Num Hosts: 1 4220 4253 30 0.0 42203.4 0.0X -Coalesce Num Partitions: 10000 Num Hosts: 5 1377 1394 16 0.1 13769.0 0.1X -Coalesce Num Partitions: 10000 Num Hosts: 10 704 717 12 0.1 7036.4 0.2X -Coalesce Num Partitions: 10000 Num Hosts: 20 420 422 1 0.2 4201.7 0.3X -Coalesce Num Partitions: 10000 Num Hosts: 40 267 271 4 0.4 2669.3 0.5X -Coalesce Num Partitions: 10000 Num Hosts: 80 184 191 6 0.5 1842.2 0.7X +Coalesce Num Partitions: 100 Num Hosts: 1 128 134 9 0.8 1278.9 1.0X +Coalesce Num Partitions: 100 Num Hosts: 5 102 103 2 1.0 1016.5 1.3X +Coalesce Num Partitions: 100 Num Hosts: 10 86 87 1 1.2 860.3 1.5X +Coalesce Num Partitions: 100 Num Hosts: 20 87 92 7 1.1 872.0 1.5X +Coalesce Num Partitions: 100 Num Hosts: 40 83 86 3 1.2 829.7 1.5X +Coalesce Num Partitions: 100 Num Hosts: 80 83 84 1 1.2 832.5 1.5X +Coalesce Num Partitions: 500 Num Hosts: 1 306 306 1 0.3 3055.9 0.4X 
+Coalesce Num Partitions: 500 Num Hosts: 5 128 130 2 0.8 1277.2 1.0X +Coalesce Num Partitions: 500 Num Hosts: 10 106 110 6 0.9 1061.9 1.2X +Coalesce Num Partitions: 500 Num Hosts: 20 95 96 1 1.1 950.4 1.3X +Coalesce Num Partitions: 500 Num Hosts: 40 92 94 4 1.1 918.5 1.4X +Coalesce Num Partitions: 500 Num Hosts: 80 87 88 1 1.1 871.4 1.5X +Coalesce Num Partitions: 1000 Num Hosts: 1 523 529 9 0.2 5229.5 0.2X +Coalesce Num Partitions: 1000 Num Hosts: 5 185 189 3 0.5 1853.1 0.7X +Coalesce Num Partitions: 1000 Num Hosts: 10 128 131 3 0.8 1278.9 1.0X +Coalesce Num Partitions: 1000 Num Hosts: 20 106 108 3 0.9 1057.8 1.2X +Coalesce Num Partitions: 1000 Num Hosts: 40 97 97 1 1.0 968.2 1.3X +Coalesce Num Partitions: 1000 Num Hosts: 80 93 98 8 1.1 931.4 1.4X +Coalesce Num Partitions: 5000 Num Hosts: 1 2321 2328 11 0.0 23205.2 0.1X +Coalesce Num Partitions: 5000 Num Hosts: 5 674 680 5 0.1 6741.0 0.2X +Coalesce Num Partitions: 5000 Num Hosts: 10 374 378 7 0.3 3738.4 0.3X +Coalesce Num Partitions: 5000 Num Hosts: 20 232 238 6 0.4 2316.6 0.6X +Coalesce Num Partitions: 5000 Num Hosts: 40 163 166 3 0.6 1630.1 0.8X +Coalesce Num Partitions: 5000 Num Hosts: 80 127 129 2 0.8 1274.8 1.0X +Coalesce Num Partitions: 10000 Num Hosts: 1 4228 4243 18 0.0 42280.5 0.0X +Coalesce Num Partitions: 10000 Num Hosts: 5 1387 1400 13 0.1 13870.3 0.1X +Coalesce Num Partitions: 10000 Num Hosts: 10 711 714 4 0.1 7105.0 0.2X +Coalesce Num Partitions: 10000 Num Hosts: 20 401 408 7 0.2 4010.4 0.3X +Coalesce Num Partitions: 10000 Num Hosts: 40 251 253 2 0.4 2513.0 0.5X +Coalesce Num Partitions: 10000 Num Hosts: 80 175 182 8 0.6 1754.8 0.7X diff --git a/core/benchmarks/KryoBenchmark-jdk21-results.txt b/core/benchmarks/KryoBenchmark-jdk21-results.txt index aee420e8ca26a..4cc1e4dd2ba76 100644 --- a/core/benchmarks/KryoBenchmark-jdk21-results.txt +++ b/core/benchmarks/KryoBenchmark-jdk21-results.txt @@ -2,27 +2,27 @@ Benchmark Kryo Unsafe vs safe Serialization 
================================================================================================ -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Benchmark Kryo Unsafe vs safe Serialization: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------- -basicTypes: Int with unsafe:true 174 176 1 5.7 174.3 1.0X -basicTypes: Long with unsafe:true 178 184 5 5.6 178.1 1.0X -basicTypes: Float with unsafe:true 185 187 1 5.4 185.3 0.9X -basicTypes: Double with unsafe:true 187 189 1 5.3 187.0 0.9X -Array: Int with unsafe:true 1 1 0 752.1 1.3 131.1X -Array: Long with unsafe:true 2 2 0 490.6 2.0 85.5X -Array: Float with unsafe:true 1 1 0 757.7 1.3 132.1X -Array: Double with unsafe:true 2 2 0 483.9 2.1 84.4X -Map of string->Double with unsafe:true 26 26 2 38.5 26.0 6.7X -basicTypes: Int with unsafe:false 206 207 1 4.9 205.7 0.8X -basicTypes: Long with unsafe:false 222 223 1 4.5 221.7 0.8X -basicTypes: Float with unsafe:false 217 218 1 4.6 216.5 0.8X -basicTypes: Double with unsafe:false 217 218 2 4.6 216.6 0.8X -Array: Int with unsafe:false 13 13 0 79.5 12.6 13.9X -Array: Long with unsafe:false 21 22 0 46.6 21.4 8.1X -Array: Float with unsafe:false 6 6 0 167.8 6.0 29.3X -Array: Double with unsafe:false 16 16 0 64.2 15.6 11.2X -Map of string->Double with unsafe:false 28 28 1 36.3 27.5 6.3X +basicTypes: Int with unsafe:true 167 168 1 6.0 167.0 1.0X +basicTypes: Long with unsafe:true 174 178 2 5.7 174.1 1.0X +basicTypes: Float with unsafe:true 203 204 1 4.9 202.9 0.8X +basicTypes: Double with unsafe:true 206 207 1 4.9 206.1 0.8X +Array: Int with unsafe:true 1 1 0 768.6 1.3 128.4X +Array: Long with unsafe:true 2 2 0 502.0 2.0 83.9X +Array: Float with unsafe:true 1 1 0 773.6 1.3 129.2X +Array: Double with unsafe:true 2 2 0 492.6 2.0 82.3X 
+Map of string->Double with unsafe:true 27 27 1 37.5 26.6 6.3X +basicTypes: Int with unsafe:false 198 199 1 5.1 197.9 0.8X +basicTypes: Long with unsafe:false 217 219 2 4.6 216.8 0.8X +basicTypes: Float with unsafe:false 201 203 2 5.0 201.0 0.8X +basicTypes: Double with unsafe:false 202 204 1 5.0 201.9 0.8X +Array: Int with unsafe:false 13 13 0 79.7 12.5 13.3X +Array: Long with unsafe:false 20 21 0 49.1 20.4 8.2X +Array: Float with unsafe:false 7 8 0 134.4 7.4 22.5X +Array: Double with unsafe:false 11 12 0 87.2 11.5 14.6X +Map of string->Double with unsafe:false 28 28 1 36.3 27.5 6.1X diff --git a/core/benchmarks/KryoBenchmark-results.txt b/core/benchmarks/KryoBenchmark-results.txt index ca80b13a5346d..6c46724fbd2e8 100644 --- a/core/benchmarks/KryoBenchmark-results.txt +++ b/core/benchmarks/KryoBenchmark-results.txt @@ -2,27 +2,27 @@ Benchmark Kryo Unsafe vs safe Serialization ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Benchmark Kryo Unsafe vs safe Serialization: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------- -basicTypes: Int with unsafe:true 171 173 2 5.9 170.8 1.0X -basicTypes: Long with unsafe:true 190 193 3 5.3 189.6 0.9X -basicTypes: Float with unsafe:true 186 188 2 5.4 186.1 0.9X -basicTypes: Double with unsafe:true 189 190 1 5.3 188.5 0.9X -Array: Int with unsafe:true 1 2 0 720.0 1.4 123.0X -Array: Long with unsafe:true 2 3 0 462.4 2.2 79.0X -Array: Float with unsafe:true 1 2 0 719.2 1.4 122.9X -Array: Double with unsafe:true 2 3 0 459.8 2.2 78.5X -Map of string->Double with unsafe:true 27 28 1 37.2 26.9 6.3X -basicTypes: Int with unsafe:false 219 220 1 4.6 218.7 0.8X -basicTypes: Long with 
unsafe:false 242 244 2 4.1 242.1 0.7X -basicTypes: Float with unsafe:false 215 220 10 4.7 214.8 0.8X -basicTypes: Double with unsafe:false 222 224 2 4.5 221.7 0.8X -Array: Int with unsafe:false 15 15 0 66.9 15.0 11.4X -Array: Long with unsafe:false 22 22 0 45.9 21.8 7.8X -Array: Float with unsafe:false 6 6 1 170.1 5.9 29.1X -Array: Double with unsafe:false 10 10 0 103.0 9.7 17.6X -Map of string->Double with unsafe:false 31 32 2 32.4 30.9 5.5X +basicTypes: Int with unsafe:true 167 169 1 6.0 167.5 1.0X +basicTypes: Long with unsafe:true 189 195 3 5.3 188.7 0.9X +basicTypes: Float with unsafe:true 188 192 5 5.3 187.8 0.9X +basicTypes: Double with unsafe:true 190 192 3 5.3 189.7 0.9X +Array: Int with unsafe:true 1 1 0 734.5 1.4 123.0X +Array: Long with unsafe:true 2 2 0 478.3 2.1 80.1X +Array: Float with unsafe:true 1 1 0 736.7 1.4 123.4X +Array: Double with unsafe:true 2 2 0 475.5 2.1 79.6X +Map of string->Double with unsafe:true 27 27 0 37.5 26.7 6.3X +basicTypes: Int with unsafe:false 210 211 2 4.8 210.0 0.8X +basicTypes: Long with unsafe:false 224 225 1 4.5 224.4 0.7X +basicTypes: Float with unsafe:false 203 204 1 4.9 203.4 0.8X +basicTypes: Double with unsafe:false 210 212 1 4.8 210.0 0.8X +Array: Int with unsafe:false 15 15 0 68.2 14.7 11.4X +Array: Long with unsafe:false 20 21 0 49.2 20.3 8.2X +Array: Float with unsafe:false 6 6 0 167.7 6.0 28.1X +Array: Double with unsafe:false 10 10 2 99.3 10.1 16.6X +Map of string->Double with unsafe:false 28 29 1 35.1 28.4 5.9X diff --git a/core/benchmarks/KryoIteratorBenchmark-jdk21-results.txt b/core/benchmarks/KryoIteratorBenchmark-jdk21-results.txt index e3922382068dd..36124a13c29d1 100644 --- a/core/benchmarks/KryoIteratorBenchmark-jdk21-results.txt +++ b/core/benchmarks/KryoIteratorBenchmark-jdk21-results.txt @@ -2,27 +2,27 @@ Benchmark of kryo asIterator on deserialization stream ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on 
Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Benchmark of kryo asIterator on deserialization stream: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------------------------- -Colletion of int with 1 elements, useIterator: true 6 6 0 1.7 590.7 1.0X -Colletion of int with 10 elements, useIterator: true 14 15 1 0.7 1431.1 0.4X -Colletion of int with 100 elements, useIterator: true 94 96 1 0.1 9429.2 0.1X -Colletion of string with 1 elements, useIterator: true 8 8 0 1.3 760.8 0.8X -Colletion of string with 10 elements, useIterator: true 22 23 0 0.4 2246.1 0.3X -Colletion of string with 100 elements, useIterator: true 167 167 1 0.1 16659.4 0.0X -Colletion of Array[int] with 1 elements, useIterator: true 7 8 0 1.4 735.4 0.8X -Colletion of Array[int] with 10 elements, useIterator: true 20 20 1 0.5 1976.5 0.3X -Colletion of Array[int] with 100 elements, useIterator: true 154 154 1 0.1 15356.7 0.0X -Colletion of int with 1 elements, useIterator: false 6 7 0 1.6 623.6 0.9X -Colletion of int with 10 elements, useIterator: false 13 14 0 0.7 1334.3 0.4X -Colletion of int with 100 elements, useIterator: false 82 82 0 0.1 8164.4 0.1X -Colletion of string with 1 elements, useIterator: false 7 8 0 1.4 727.0 0.8X -Colletion of string with 10 elements, useIterator: false 22 22 0 0.5 2166.5 0.3X -Colletion of string with 100 elements, useIterator: false 159 160 0 0.1 15925.0 0.0X -Colletion of Array[int] with 1 elements, useIterator: false 7 7 0 1.4 712.1 0.8X -Colletion of Array[int] with 10 elements, useIterator: false 19 20 0 0.5 1932.3 0.3X -Colletion of Array[int] with 100 elements, useIterator: false 142 143 1 0.1 14220.2 0.0X +Colletion of int with 1 elements, useIterator: true 6 6 0 1.6 625.1 1.0X +Colletion of int with 10 elements, useIterator: true 15 15 0 0.7 
1466.8 0.4X +Colletion of int with 100 elements, useIterator: true 95 96 1 0.1 9536.2 0.1X +Colletion of string with 1 elements, useIterator: true 8 8 0 1.3 771.9 0.8X +Colletion of string with 10 elements, useIterator: true 23 23 0 0.4 2260.7 0.3X +Colletion of string with 100 elements, useIterator: true 162 162 0 0.1 16179.1 0.0X +Colletion of Array[int] with 1 elements, useIterator: true 7 8 1 1.4 730.4 0.9X +Colletion of Array[int] with 10 elements, useIterator: true 20 20 1 0.5 1966.8 0.3X +Colletion of Array[int] with 100 elements, useIterator: true 146 147 1 0.1 14593.4 0.0X +Colletion of int with 1 elements, useIterator: false 6 7 0 1.6 636.6 1.0X +Colletion of int with 10 elements, useIterator: false 14 14 0 0.7 1366.4 0.5X +Colletion of int with 100 elements, useIterator: false 84 85 1 0.1 8439.6 0.1X +Colletion of string with 1 elements, useIterator: false 7 7 0 1.4 725.5 0.9X +Colletion of string with 10 elements, useIterator: false 21 22 0 0.5 2115.1 0.3X +Colletion of string with 100 elements, useIterator: false 173 174 3 0.1 17316.2 0.0X +Colletion of Array[int] with 1 elements, useIterator: false 7 7 0 1.4 698.9 0.9X +Colletion of Array[int] with 10 elements, useIterator: false 19 19 0 0.5 1894.5 0.3X +Colletion of Array[int] with 100 elements, useIterator: false 141 142 1 0.1 14108.1 0.0X diff --git a/core/benchmarks/KryoIteratorBenchmark-results.txt b/core/benchmarks/KryoIteratorBenchmark-results.txt index 77452144ac01d..6c3496909c6b7 100644 --- a/core/benchmarks/KryoIteratorBenchmark-results.txt +++ b/core/benchmarks/KryoIteratorBenchmark-results.txt @@ -2,27 +2,27 @@ Benchmark of kryo asIterator on deserialization stream ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Benchmark of kryo asIterator on deserialization stream: Best 
Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------------------------- -Colletion of int with 1 elements, useIterator: true 6 6 0 1.6 621.6 1.0X -Colletion of int with 10 elements, useIterator: true 14 14 0 0.7 1422.5 0.4X -Colletion of int with 100 elements, useIterator: true 90 92 1 0.1 9030.9 0.1X -Colletion of string with 1 elements, useIterator: true 7 8 0 1.4 726.3 0.9X -Colletion of string with 10 elements, useIterator: true 23 23 1 0.4 2251.0 0.3X -Colletion of string with 100 elements, useIterator: true 172 172 0 0.1 17183.3 0.0X -Colletion of Array[int] with 1 elements, useIterator: true 7 7 0 1.4 718.6 0.9X -Colletion of Array[int] with 10 elements, useIterator: true 21 21 1 0.5 2078.4 0.3X -Colletion of Array[int] with 100 elements, useIterator: true 162 162 0 0.1 16189.6 0.0X -Colletion of int with 1 elements, useIterator: false 6 6 0 1.7 599.6 1.0X -Colletion of int with 10 elements, useIterator: false 13 13 0 0.8 1313.8 0.5X -Colletion of int with 100 elements, useIterator: false 81 82 0 0.1 8132.1 0.1X -Colletion of string with 1 elements, useIterator: false 7 7 0 1.4 705.6 0.9X -Colletion of string with 10 elements, useIterator: false 22 23 0 0.4 2240.1 0.3X -Colletion of string with 100 elements, useIterator: false 170 170 0 0.1 16995.3 0.0X -Colletion of Array[int] with 1 elements, useIterator: false 7 7 0 1.5 675.7 0.9X -Colletion of Array[int] with 10 elements, useIterator: false 18 19 0 0.5 1842.7 0.3X -Colletion of Array[int] with 100 elements, useIterator: false 138 139 0 0.1 13801.7 0.0X +Colletion of int with 1 elements, useIterator: true 6 7 0 1.5 646.6 1.0X +Colletion of int with 10 elements, useIterator: true 14 14 0 0.7 1354.6 0.5X +Colletion of int with 100 elements, useIterator: true 82 82 0 0.1 8169.3 0.1X +Colletion of string with 1 elements, useIterator: true 8 8 0 1.3 777.8 0.8X +Colletion of 
string with 10 elements, useIterator: true 22 23 1 0.4 2237.2 0.3X +Colletion of string with 100 elements, useIterator: true 161 161 1 0.1 16071.4 0.0X +Colletion of Array[int] with 1 elements, useIterator: true 7 8 0 1.4 726.7 0.9X +Colletion of Array[int] with 10 elements, useIterator: true 20 20 0 0.5 1984.8 0.3X +Colletion of Array[int] with 100 elements, useIterator: true 151 151 0 0.1 15059.2 0.0X +Colletion of int with 1 elements, useIterator: false 6 6 0 1.6 609.0 1.1X +Colletion of int with 10 elements, useIterator: false 13 14 0 0.8 1322.7 0.5X +Colletion of int with 100 elements, useIterator: false 81 82 1 0.1 8138.4 0.1X +Colletion of string with 1 elements, useIterator: false 7 8 0 1.4 732.7 0.9X +Colletion of string with 10 elements, useIterator: false 23 23 0 0.4 2254.4 0.3X +Colletion of string with 100 elements, useIterator: false 171 173 4 0.1 17050.7 0.0X +Colletion of Array[int] with 1 elements, useIterator: false 7 7 0 1.4 705.9 0.9X +Colletion of Array[int] with 10 elements, useIterator: false 20 20 0 0.5 1974.9 0.3X +Colletion of Array[int] with 100 elements, useIterator: false 147 148 1 0.1 14730.9 0.0X diff --git a/core/benchmarks/KryoSerializerBenchmark-jdk21-results.txt b/core/benchmarks/KryoSerializerBenchmark-jdk21-results.txt index c00cd9152b278..14649cd560327 100644 --- a/core/benchmarks/KryoSerializerBenchmark-jdk21-results.txt +++ b/core/benchmarks/KryoSerializerBenchmark-jdk21-results.txt @@ -2,11 +2,11 @@ Benchmark KryoPool vs old"pool of 1" implementation ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Benchmark KryoPool vs old"pool of 1" implementation: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
----------------------------------------------------------------------------------------------------------------------------------- -KryoPool:true 4166 5737 1977 0.0 8331992.4 1.0X -KryoPool:false 6201 7778 1281 0.0 12402118.8 0.7X +KryoPool:true 3445 5067 1740 0.0 6889852.4 1.0X +KryoPool:false 5594 7457 1439 0.0 11188845.8 0.6X diff --git a/core/benchmarks/KryoSerializerBenchmark-results.txt b/core/benchmarks/KryoSerializerBenchmark-results.txt index a86338957cc37..c08bbfebe993d 100644 --- a/core/benchmarks/KryoSerializerBenchmark-results.txt +++ b/core/benchmarks/KryoSerializerBenchmark-results.txt @@ -2,11 +2,11 @@ Benchmark KryoPool vs old"pool of 1" implementation ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Benchmark KryoPool vs old"pool of 1" implementation: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ----------------------------------------------------------------------------------------------------------------------------------- -KryoPool:true 3973 5797 1879 0.0 7945107.3 1.0X -KryoPool:false 6041 7623 1484 0.0 12082153.5 0.7X +KryoPool:true 3409 5129 1620 0.0 6817249.8 1.0X +KryoPool:false 5506 7416 1256 0.0 11011835.6 0.6X diff --git a/core/benchmarks/LZFBenchmark-jdk21-results.txt b/core/benchmarks/LZFBenchmark-jdk21-results.txt index 7104879c5c753..1f39e58139e65 100644 --- a/core/benchmarks/LZFBenchmark-jdk21-results.txt +++ b/core/benchmarks/LZFBenchmark-jdk21-results.txt @@ -2,18 +2,18 @@ Benchmark LZFCompressionCodec ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Compress small objects: Best Time(ms) 
Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------------------------------------------- -Compression 256000000 int values in parallel 599 601 4 427.4 2.3 1.0X -Compression 256000000 int values single-threaded 608 615 7 420.9 2.4 1.0X +Compression 256000000 int values in parallel 605 611 5 423.4 2.4 1.0X +Compression 256000000 int values single-threaded 612 619 5 418.5 2.4 1.0X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Compress large objects: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ----------------------------------------------------------------------------------------------------------------------------- -Compression 1024 array values in 1 threads 37 45 5 0.0 35857.3 1.0X -Compression 1024 array values single-threaded 31 31 0 0.0 30334.5 1.2X +Compression 1024 array values in 1 threads 44 48 3 0.0 43323.6 1.0X +Compression 1024 array values single-threaded 32 32 0 0.0 30772.9 1.4X diff --git a/core/benchmarks/LZFBenchmark-results.txt b/core/benchmarks/LZFBenchmark-results.txt index 142d3aad2f1ba..92d8ba52412df 100644 --- a/core/benchmarks/LZFBenchmark-results.txt +++ b/core/benchmarks/LZFBenchmark-results.txt @@ -2,18 +2,18 @@ Benchmark LZFCompressionCodec ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Compress small objects: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------------------------------------------- -Compression 256000000 int values in parallel 601 610 9 426.3 2.3 1.0X -Compression 256000000 int 
values single-threaded 610 619 7 419.3 2.4 1.0X +Compression 256000000 int values in parallel 598 608 7 428.2 2.3 1.0X +Compression 256000000 int values single-threaded 615 623 6 416.1 2.4 1.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Compress large objects: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ----------------------------------------------------------------------------------------------------------------------------- -Compression 1024 array values in 1 threads 35 44 5 0.0 34512.1 1.0X -Compression 1024 array values single-threaded 31 32 1 0.0 30396.7 1.1X +Compression 1024 array values in 1 threads 39 46 5 0.0 37798.8 1.0X +Compression 1024 array values single-threaded 31 32 0 0.0 29960.9 1.3X diff --git a/core/benchmarks/MapStatusesConvertBenchmark-jdk21-results.txt b/core/benchmarks/MapStatusesConvertBenchmark-jdk21-results.txt index 7c1b1eb4ac803..123a40fad3e62 100644 --- a/core/benchmarks/MapStatusesConvertBenchmark-jdk21-results.txt +++ b/core/benchmarks/MapStatusesConvertBenchmark-jdk21-results.txt @@ -2,12 +2,12 @@ MapStatuses Convert Benchmark ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor MapStatuses Convert: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Num Maps: 50000 Fetch partitions:500 696 705 13 0.0 696430567.0 1.0X -Num Maps: 50000 Fetch partitions:1000 1620 1628 7 0.0 1620094001.0 0.4X -Num Maps: 50000 Fetch partitions:1500 2507 2522 13 0.0 2507485825.0 0.3X +Num Maps: 50000 Fetch partitions:500 716 730 14 0.0 715747604.0 1.0X +Num Maps: 50000 
Fetch partitions:1000 1592 1619 29 0.0 1591519021.0 0.4X +Num Maps: 50000 Fetch partitions:1500 2500 2507 7 0.0 2499934291.0 0.3X diff --git a/core/benchmarks/MapStatusesConvertBenchmark-results.txt b/core/benchmarks/MapStatusesConvertBenchmark-results.txt index 4ca2e502b9404..ef390204bb0af 100644 --- a/core/benchmarks/MapStatusesConvertBenchmark-results.txt +++ b/core/benchmarks/MapStatusesConvertBenchmark-results.txt @@ -2,12 +2,12 @@ MapStatuses Convert Benchmark ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor MapStatuses Convert: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Num Maps: 50000 Fetch partitions:500 775 782 8 0.0 774584162.0 1.0X -Num Maps: 50000 Fetch partitions:1000 1605 1634 29 0.0 1604801022.0 0.5X -Num Maps: 50000 Fetch partitions:1500 2568 2585 22 0.0 2568404459.0 0.3X +Num Maps: 50000 Fetch partitions:500 612 614 3 0.0 611543498.0 1.0X +Num Maps: 50000 Fetch partitions:1000 1389 1398 15 0.0 1388971632.0 0.4X +Num Maps: 50000 Fetch partitions:1500 2178 2222 39 0.0 2177711722.0 0.3X diff --git a/core/benchmarks/MapStatusesSerDeserBenchmark-jdk21-results.txt b/core/benchmarks/MapStatusesSerDeserBenchmark-jdk21-results.txt index 1ffe7594c22cd..708bf8f770d40 100644 --- a/core/benchmarks/MapStatusesSerDeserBenchmark-jdk21-results.txt +++ b/core/benchmarks/MapStatusesSerDeserBenchmark-jdk21-results.txt @@ -1,64 +1,64 @@ -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor 200000 MapOutputs, 10 blocks w/ broadcast: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
------------------------------------------------------------------------------------------------------------------------- -Serialization 81 85 4 2.5 406.5 1.0X -Deserialization 147 155 9 1.4 734.2 0.6X +Serialization 84 87 3 2.4 422.4 1.0X +Deserialization 143 151 6 1.4 712.6 0.6X Compressed Serialized MapStatus sizes: 426.0 B Compressed Serialized Broadcast MapStatus sizes: 2.5 MiB -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor 200000 MapOutputs, 10 blocks w/o broadcast: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------------------------------------- -Serialization 73 75 3 2.7 365.9 1.0X -Deserialization 146 153 10 1.4 732.1 0.5X +Serialization 83 85 2 2.4 414.3 1.0X +Deserialization 141 145 7 1.4 703.0 0.6X Compressed Serialized MapStatus sizes: 2.5 MiB Compressed Serialized Broadcast MapStatus sizes: 0.0 B -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor 200000 MapOutputs, 100 blocks w/ broadcast: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------------------------------------- -Serialization 152 157 6 1.3 759.6 1.0X -Deserialization 162 166 4 1.2 811.8 0.9X +Serialization 154 160 9 1.3 770.3 1.0X +Deserialization 158 164 10 1.3 788.4 1.0X Compressed Serialized MapStatus sizes: 442.0 B Compressed Serialized Broadcast MapStatus sizes: 13.6 MiB -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor 200000 MapOutputs, 100 blocks w/o broadcast: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
--------------------------------------------------------------------------------------------------------------------------- -Serialization 141 142 1 1.4 703.4 1.0X -Deserialization 161 165 5 1.2 807.0 0.9X +Serialization 145 146 1 1.4 724.1 1.0X +Deserialization 158 162 7 1.3 790.2 0.9X Compressed Serialized MapStatus sizes: 13.6 MiB Compressed Serialized Broadcast MapStatus sizes: 0.0 B -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor 200000 MapOutputs, 1000 blocks w/ broadcast: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------- -Serialization 686 714 31 0.3 3431.6 1.0X -Deserialization 329 346 18 0.6 1645.6 2.1X +Serialization 693 722 42 0.3 3463.5 1.0X +Deserialization 330 357 25 0.6 1648.3 2.1X -Compressed Serialized MapStatus sizes: 569.0 B +Compressed Serialized MapStatus sizes: 568.0 B Compressed Serialized Broadcast MapStatus sizes: 122.3 MiB -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor 200000 MapOutputs, 1000 blocks w/o broadcast: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ---------------------------------------------------------------------------------------------------------------------------- -Serialization 560 562 3 0.4 2797.7 1.0X -Deserialization 317 334 20 0.6 1587.0 1.8X +Serialization 569 573 3 0.4 2845.7 1.0X +Deserialization 330 350 17 0.6 1647.8 1.7X Compressed Serialized MapStatus sizes: 122.3 MiB Compressed Serialized Broadcast MapStatus sizes: 0.0 B diff --git a/core/benchmarks/MapStatusesSerDeserBenchmark-results.txt b/core/benchmarks/MapStatusesSerDeserBenchmark-results.txt index edd6207a12f8b..6e69a91cbafdb 100644 --- 
a/core/benchmarks/MapStatusesSerDeserBenchmark-results.txt +++ b/core/benchmarks/MapStatusesSerDeserBenchmark-results.txt @@ -1,64 +1,64 @@ -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor 200000 MapOutputs, 10 blocks w/ broadcast: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------- -Serialization 93 99 8 2.1 466.1 1.0X -Deserialization 140 151 12 1.4 698.8 0.7X +Serialization 88 92 3 2.3 442.0 1.0X +Deserialization 138 143 10 1.5 688.2 0.6X Compressed Serialized MapStatus sizes: 426.0 B Compressed Serialized Broadcast MapStatus sizes: 2.5 MiB -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor 200000 MapOutputs, 10 blocks w/o broadcast: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------------------------------------- -Serialization 82 83 1 2.4 409.0 1.0X -Deserialization 139 142 8 1.4 692.8 0.6X +Serialization 77 79 1 2.6 386.9 1.0X +Deserialization 137 140 5 1.5 685.8 0.6X Compressed Serialized MapStatus sizes: 2.5 MiB Compressed Serialized Broadcast MapStatus sizes: 0.0 B -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor 200000 MapOutputs, 100 blocks w/ broadcast: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------------------------------------- -Serialization 161 168 7 1.2 802.6 1.0X -Deserialization 155 169 13 1.3 777.3 1.0X +Serialization 159 160 1 1.3 793.9 1.0X 
+Deserialization 154 160 9 1.3 770.9 1.0X Compressed Serialized MapStatus sizes: 442.0 B Compressed Serialized Broadcast MapStatus sizes: 13.6 MiB -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor 200000 MapOutputs, 100 blocks w/o broadcast: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------- -Serialization 145 147 1 1.4 726.2 1.0X -Deserialization 155 160 10 1.3 772.7 0.9X +Serialization 146 147 1 1.4 730.8 1.0X +Deserialization 154 157 3 1.3 772.4 0.9X Compressed Serialized MapStatus sizes: 13.6 MiB Compressed Serialized Broadcast MapStatus sizes: 0.0 B -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor 200000 MapOutputs, 1000 blocks w/ broadcast: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------- -Serialization 693 714 23 0.3 3465.9 1.0X -Deserialization 326 351 13 0.6 1628.7 2.1X +Serialization 697 702 9 0.3 3483.3 1.0X +Deserialization 317 323 7 0.6 1583.0 2.2X -Compressed Serialized MapStatus sizes: 568.0 B +Compressed Serialized MapStatus sizes: 569.0 B Compressed Serialized Broadcast MapStatus sizes: 122.3 MiB -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor 200000 MapOutputs, 1000 blocks w/o broadcast: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ---------------------------------------------------------------------------------------------------------------------------- -Serialization 564 576 13 0.4 2817.6 1.0X 
-Deserialization 339 354 9 0.6 1694.4 1.7X +Serialization 568 577 7 0.4 2842.2 1.0X +Deserialization 308 316 7 0.6 1540.4 1.8X Compressed Serialized MapStatus sizes: 122.3 MiB Compressed Serialized Broadcast MapStatus sizes: 0.0 B diff --git a/core/benchmarks/PersistenceEngineBenchmark-jdk21-results.txt b/core/benchmarks/PersistenceEngineBenchmark-jdk21-results.txt index 7262ea63a6ef9..c91af8730b49c 100644 --- a/core/benchmarks/PersistenceEngineBenchmark-jdk21-results.txt +++ b/core/benchmarks/PersistenceEngineBenchmark-jdk21-results.txt @@ -2,17 +2,17 @@ PersistenceEngineBenchmark ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor 1000 Workers: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ---------------------------------------------------------------------------------------------------------------------------------------- -ZooKeeperPersistenceEngine with JavaSerializer 5620 5811 236 0.0 5619587.2 1.0X -FileSystemPersistenceEngine with JavaSerializer 2876 2924 42 0.0 2876068.8 2.0X -FileSystemPersistenceEngine with JavaSerializer (lz4) 825 829 4 0.0 824880.6 6.8X -FileSystemPersistenceEngine with JavaSerializer (lzf) 742 774 39 0.0 742492.3 7.6X -FileSystemPersistenceEngine with JavaSerializer (snappy) 785 832 42 0.0 784738.0 7.2X -FileSystemPersistenceEngine with JavaSerializer (zstd) 966 982 14 0.0 965925.8 5.8X -RocksDBPersistenceEngine with JavaSerializer 299 301 2 0.0 299470.1 18.8X -BlackHolePersistenceEngine 0 0 0 6.0 166.6 33740.5X +ZooKeeperPersistenceEngine with JavaSerializer 7133 7390 257 0.0 7132665.6 1.0X +FileSystemPersistenceEngine with JavaSerializer 2449 2470 22 0.0 2448714.2 2.9X +FileSystemPersistenceEngine with JavaSerializer (lz4) 784 805 19 0.0 783603.9 9.1X +FileSystemPersistenceEngine with JavaSerializer (lzf) 
719 763 52 0.0 719310.0 9.9X +FileSystemPersistenceEngine with JavaSerializer (snappy) 731 765 42 0.0 731346.6 9.8X +FileSystemPersistenceEngine with JavaSerializer (zstd) 920 971 52 0.0 919508.6 7.8X +RocksDBPersistenceEngine with JavaSerializer 283 284 1 0.0 282641.0 25.2X +BlackHolePersistenceEngine 0 0 0 6.0 167.4 42612.8X diff --git a/core/benchmarks/PersistenceEngineBenchmark-results.txt b/core/benchmarks/PersistenceEngineBenchmark-results.txt index c373d88842d2e..14ca05abad071 100644 --- a/core/benchmarks/PersistenceEngineBenchmark-results.txt +++ b/core/benchmarks/PersistenceEngineBenchmark-results.txt @@ -2,17 +2,17 @@ PersistenceEngineBenchmark ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor 1000 Workers: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ---------------------------------------------------------------------------------------------------------------------------------------- -ZooKeeperPersistenceEngine with JavaSerializer 6146 6314 215 0.0 6146007.1 1.0X -FileSystemPersistenceEngine with JavaSerializer 2944 2957 17 0.0 2944099.7 2.1X -FileSystemPersistenceEngine with JavaSerializer (lz4) 827 869 37 0.0 827379.0 7.4X -FileSystemPersistenceEngine with JavaSerializer (lzf) 799 826 25 0.0 799318.1 7.7X -FileSystemPersistenceEngine with JavaSerializer (snappy) 775 805 50 0.0 774802.8 7.9X -FileSystemPersistenceEngine with JavaSerializer (zstd) 972 1002 28 0.0 971773.9 6.3X -RocksDBPersistenceEngine with JavaSerializer 310 312 3 0.0 310401.2 19.8X -BlackHolePersistenceEngine 0 0 0 6.0 165.7 37097.7X +ZooKeeperPersistenceEngine with JavaSerializer 6582 6738 184 0.0 6581975.7 1.0X +FileSystemPersistenceEngine with JavaSerializer 2493 2507 12 0.0 2492854.1 2.6X +FileSystemPersistenceEngine with JavaSerializer (lz4) 784 827 
40 0.0 783848.3 8.4X +FileSystemPersistenceEngine with JavaSerializer (lzf) 755 774 17 0.0 755155.3 8.7X +FileSystemPersistenceEngine with JavaSerializer (snappy) 739 786 49 0.0 739163.8 8.9X +FileSystemPersistenceEngine with JavaSerializer (zstd) 956 988 33 0.0 955958.8 6.9X +RocksDBPersistenceEngine with JavaSerializer 290 295 7 0.0 289554.4 22.7X +BlackHolePersistenceEngine 0 0 0 6.2 161.8 40674.2X diff --git a/core/benchmarks/PropertiesCloneBenchmark-jdk21-results.txt b/core/benchmarks/PropertiesCloneBenchmark-jdk21-results.txt index ccae104413f6a..cfff77298896a 100644 --- a/core/benchmarks/PropertiesCloneBenchmark-jdk21-results.txt +++ b/core/benchmarks/PropertiesCloneBenchmark-jdk21-results.txt @@ -2,39 +2,39 @@ Properties Cloning ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Empty Properties: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SerializationUtils.clone 0 0 0 0.3 3296.0 1.0X -Utils.cloneProperties 0 0 0 34.5 29.0 113.7X +SerializationUtils.clone 0 0 0 0.3 3146.0 1.0X +Utils.cloneProperties 0 0 0 11.2 89.0 35.3X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor System Properties: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SerializationUtils.clone 0 0 0 0.0 156331.0 1.0X -Utils.cloneProperties 0 0 0 0.4 2595.0 60.2X +SerializationUtils.clone 0 0 0 0.0 158717.0 1.0X +Utils.cloneProperties 0 0 0 0.2 4819.0 32.9X -OpenJDK 64-Bit Server VM 
21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Small Properties: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SerializationUtils.clone 0 0 0 0.0 255154.0 1.0X -Utils.cloneProperties 0 0 0 0.4 2624.0 97.2X +SerializationUtils.clone 0 0 0 0.0 241783.0 1.0X +Utils.cloneProperties 0 0 0 0.2 6051.0 40.0X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Medium Properties: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SerializationUtils.clone 1 1 0 0.0 887239.0 1.0X -Utils.cloneProperties 0 0 0 0.1 14908.0 59.5X +SerializationUtils.clone 1 1 0 0.0 853297.0 1.0X +Utils.cloneProperties 0 0 0 0.0 30927.0 27.6X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Large Properties: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SerializationUtils.clone 2 2 0 0.0 1655264.0 1.0X -Utils.cloneProperties 0 0 0 0.0 30837.0 53.7X +SerializationUtils.clone 2 2 0 0.0 1598481.0 1.0X +Utils.cloneProperties 0 0 0 0.0 63448.0 25.2X diff --git a/core/benchmarks/PropertiesCloneBenchmark-results.txt b/core/benchmarks/PropertiesCloneBenchmark-results.txt index f6c6c8781dc25..cceb3e8710dee 100644 --- a/core/benchmarks/PropertiesCloneBenchmark-results.txt +++ b/core/benchmarks/PropertiesCloneBenchmark-results.txt @@ -2,39 +2,39 @@ Properties Cloning 
================================================================================================ -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Empty Properties: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SerializationUtils.clone 0 0 0 0.3 3466.0 1.0X -Utils.cloneProperties 0 0 0 34.5 29.0 119.5X +SerializationUtils.clone 0 0 0 0.3 3186.0 1.0X +Utils.cloneProperties 0 0 0 11.1 90.0 35.4X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor System Properties: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SerializationUtils.clone 0 0 0 0.0 156422.0 1.0X -Utils.cloneProperties 0 0 0 0.4 2685.0 58.3X +SerializationUtils.clone 0 0 0 0.0 175435.0 1.0X +Utils.cloneProperties 0 0 0 0.2 4247.0 41.3X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Small Properties: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SerializationUtils.clone 0 0 0 0.0 277017.0 1.0X -Utils.cloneProperties 0 0 0 0.3 3666.0 75.6X +SerializationUtils.clone 0 0 0 0.0 255744.0 1.0X +Utils.cloneProperties 0 0 0 0.1 7273.0 35.2X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Medium Properties: Best Time(ms) Avg Time(ms) 
Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SerializationUtils.clone 1 1 0 0.0 920141.0 1.0X -Utils.cloneProperties 0 0 0 0.0 20097.0 45.8X +SerializationUtils.clone 1 1 0 0.0 863683.0 1.0X +Utils.cloneProperties 0 0 0 0.0 36508.0 23.7X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Large Properties: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SerializationUtils.clone 2 2 0 0.0 1714798.0 1.0X -Utils.cloneProperties 0 0 0 0.0 40385.0 42.5X +SerializationUtils.clone 2 2 0 0.0 1612893.0 1.0X +Utils.cloneProperties 0 0 0 0.0 73617.0 21.9X diff --git a/core/benchmarks/XORShiftRandomBenchmark-jdk21-results.txt b/core/benchmarks/XORShiftRandomBenchmark-jdk21-results.txt index 9f2baa5d9bf80..4b892b7ea2c85 100644 --- a/core/benchmarks/XORShiftRandomBenchmark-jdk21-results.txt +++ b/core/benchmarks/XORShiftRandomBenchmark-jdk21-results.txt @@ -2,43 +2,43 @@ Pseudo random ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor nextInt: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -java.util.Random 451 451 0 222.0 4.5 1.0X -XORShiftRandom 185 185 0 539.4 1.9 2.4X +java.util.Random 453 453 0 220.7 4.5 1.0X +XORShiftRandom 186 186 0 536.6 1.9 2.4X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 
6.8.0-1017-azure AMD EPYC 7763 64-Core Processor nextLong: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -java.util.Random 897 897 0 111.5 9.0 1.0X -XORShiftRandom 371 371 0 269.5 3.7 2.4X +java.util.Random 900 901 1 111.1 9.0 1.0X +XORShiftRandom 373 373 1 268.1 3.7 2.4X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor nextDouble: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -java.util.Random 914 914 0 109.4 9.1 1.0X -XORShiftRandom 371 371 1 269.5 3.7 2.5X +java.util.Random 905 905 0 110.5 9.0 1.0X +XORShiftRandom 373 373 0 268.2 3.7 2.4X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor nextGaussian: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -java.util.Random 3381 3384 3 29.6 33.8 1.0X -XORShiftRandom 2480 2498 29 40.3 24.8 1.4X +java.util.Random 3412 3427 13 29.3 34.1 1.0X +XORShiftRandom 2469 2472 4 40.5 24.7 1.4X ================================================================================================ hash seed ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Hash seed: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
------------------------------------------------------------------------------------------------------------------------ -XORShiftRandom.hashSeed 1 1 0 12973.9 0.1 1.0X +XORShiftRandom.hashSeed 1 1 0 12522.5 0.1 1.0X diff --git a/core/benchmarks/XORShiftRandomBenchmark-results.txt b/core/benchmarks/XORShiftRandomBenchmark-results.txt index de5f7c04fddfc..c45a3c66afafa 100644 --- a/core/benchmarks/XORShiftRandomBenchmark-results.txt +++ b/core/benchmarks/XORShiftRandomBenchmark-results.txt @@ -2,43 +2,43 @@ Pseudo random ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor nextInt: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -java.util.Random 439 439 0 227.7 4.4 1.0X -XORShiftRandom 185 185 0 539.5 1.9 2.4X +java.util.Random 441 441 0 226.6 4.4 1.0X +XORShiftRandom 186 186 0 536.7 1.9 2.4X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor nextLong: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -java.util.Random 911 912 1 109.8 9.1 1.0X -XORShiftRandom 371 371 1 269.7 3.7 2.5X +java.util.Random 918 918 1 109.0 9.2 1.0X +XORShiftRandom 373 373 0 268.2 3.7 2.5X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor nextDouble: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
------------------------------------------------------------------------------------------------------------------------ -java.util.Random 904 904 0 110.6 9.0 1.0X -XORShiftRandom 371 371 0 269.7 3.7 2.4X +java.util.Random 904 905 1 110.7 9.0 1.0X +XORShiftRandom 373 374 1 268.2 3.7 2.4X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor nextGaussian: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -java.util.Random 3997 3998 1 25.0 40.0 1.0X -XORShiftRandom 2926 2929 2 34.2 29.3 1.4X +java.util.Random 3590 3600 10 27.9 35.9 1.0X +XORShiftRandom 2941 2942 1 34.0 29.4 1.2X ================================================================================================ hash seed ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Hash seed: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -XORShiftRandom.hashSeed 2 2 0 6487.7 0.2 1.0X +XORShiftRandom.hashSeed 2 2 0 6458.4 0.2 1.0X diff --git a/core/benchmarks/ZStandardBenchmark-jdk21-results.txt b/core/benchmarks/ZStandardBenchmark-jdk21-results.txt index f6bd681451d5e..b2a325942cd88 100644 --- a/core/benchmarks/ZStandardBenchmark-jdk21-results.txt +++ b/core/benchmarks/ZStandardBenchmark-jdk21-results.txt @@ -2,48 +2,48 @@ Benchmark ZStandardCompressionCodec ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.8.0-1014-azure 
+OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Benchmark ZStandardCompressionCodec: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------------------------------------------------- -Compression 10000 times at level 1 without buffer pool 659 676 16 0.0 65860.7 1.0X -Compression 10000 times at level 2 without buffer pool 721 723 2 0.0 72135.5 0.9X -Compression 10000 times at level 3 without buffer pool 815 816 1 0.0 81500.6 0.8X -Compression 10000 times at level 1 with buffer pool 608 609 0 0.0 60846.6 1.1X -Compression 10000 times at level 2 with buffer pool 645 647 3 0.0 64476.3 1.0X -Compression 10000 times at level 3 with buffer pool 746 746 1 0.0 74584.0 0.9X +Compression 10000 times at level 1 without buffer pool 656 668 13 0.0 65555.1 1.0X +Compression 10000 times at level 2 without buffer pool 711 713 2 0.0 71147.9 0.9X +Compression 10000 times at level 3 without buffer pool 827 830 2 0.0 82718.7 0.8X +Compression 10000 times at level 1 with buffer pool 598 599 2 0.0 59789.9 1.1X +Compression 10000 times at level 2 with buffer pool 628 630 2 0.0 62774.0 1.0X +Compression 10000 times at level 3 with buffer pool 735 736 1 0.0 73517.1 0.9X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.8.0-1014-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Benchmark ZStandardCompressionCodec: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------------------ -Decompression 10000 times from level 1 without buffer pool 828 829 1 0.0 82822.6 1.0X -Decompression 10000 times from level 2 without buffer pool 829 829 1 0.0 82900.7 1.0X -Decompression 10000 times from level 3 without buffer pool 828 833 8 0.0 82784.4 1.0X 
-Decompression 10000 times from level 1 with buffer pool 758 760 2 0.0 75756.5 1.1X -Decompression 10000 times from level 2 with buffer pool 758 758 1 0.0 75772.3 1.1X -Decompression 10000 times from level 3 with buffer pool 759 759 0 0.0 75852.7 1.1X +Decompression 10000 times from level 1 without buffer pool 823 824 1 0.0 82271.8 1.0X +Decompression 10000 times from level 2 without buffer pool 823 825 2 0.0 82313.0 1.0X +Decompression 10000 times from level 3 without buffer pool 825 832 10 0.0 82532.5 1.0X +Decompression 10000 times from level 1 with buffer pool 756 757 2 0.0 75593.4 1.1X +Decompression 10000 times from level 2 with buffer pool 757 759 2 0.0 75728.2 1.1X +Decompression 10000 times from level 3 with buffer pool 760 760 0 0.0 75986.2 1.1X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.8.0-1014-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Parallel Compression at level 3: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Parallel Compression with 0 workers 58 59 1 0.0 452489.9 1.0X -Parallel Compression with 1 workers 42 45 4 0.0 330066.0 1.4X -Parallel Compression with 2 workers 40 42 1 0.0 312560.3 1.4X -Parallel Compression with 4 workers 40 42 2 0.0 308802.7 1.5X -Parallel Compression with 8 workers 41 45 3 0.0 321331.3 1.4X -Parallel Compression with 16 workers 44 45 1 0.0 343311.5 1.3X +Parallel Compression with 0 workers 58 60 4 0.0 456002.7 1.0X +Parallel Compression with 1 workers 43 45 3 0.0 332797.0 1.4X +Parallel Compression with 2 workers 41 42 1 0.0 317101.0 1.4X +Parallel Compression with 4 workers 39 41 1 0.0 306350.9 1.5X +Parallel Compression with 8 workers 42 44 1 0.0 326335.3 1.4X +Parallel Compression with 16 workers 46 47 1 0.0 356789.8 1.3X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.8.0-1014-azure +OpenJDK 64-Bit Server VM 
21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Parallel Compression at level 9: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Parallel Compression with 0 workers 158 160 2 0.0 1234257.6 1.0X -Parallel Compression with 1 workers 193 194 1 0.0 1507686.4 0.8X -Parallel Compression with 2 workers 113 127 11 0.0 881068.0 1.4X -Parallel Compression with 4 workers 109 111 2 0.0 849241.3 1.5X -Parallel Compression with 8 workers 111 115 3 0.0 869455.2 1.4X -Parallel Compression with 16 workers 113 116 2 0.0 881832.5 1.4X +Parallel Compression with 0 workers 158 160 1 0.0 1237762.1 1.0X +Parallel Compression with 1 workers 189 190 3 0.0 1473899.5 0.8X +Parallel Compression with 2 workers 112 120 9 0.0 874992.3 1.4X +Parallel Compression with 4 workers 108 112 3 0.0 846156.6 1.5X +Parallel Compression with 8 workers 113 117 3 0.0 886576.8 1.4X +Parallel Compression with 16 workers 113 116 2 0.0 881278.0 1.4X diff --git a/core/benchmarks/ZStandardBenchmark-results.txt b/core/benchmarks/ZStandardBenchmark-results.txt index 136f0333590cc..0cd02cc48963a 100644 --- a/core/benchmarks/ZStandardBenchmark-results.txt +++ b/core/benchmarks/ZStandardBenchmark-results.txt @@ -2,48 +2,48 @@ Benchmark ZStandardCompressionCodec ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.8.0-1014-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Benchmark ZStandardCompressionCodec: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------------------------------------------------- -Compression 10000 times at level 1 without buffer pool 257 259 2 0.0 25704.2 1.0X -Compression 10000 
times at level 2 without buffer pool 674 676 2 0.0 67396.3 0.4X -Compression 10000 times at level 3 without buffer pool 775 787 11 0.0 77497.9 0.3X -Compression 10000 times at level 1 with buffer pool 573 574 0 0.0 57347.3 0.4X -Compression 10000 times at level 2 with buffer pool 602 603 2 0.0 60162.8 0.4X -Compression 10000 times at level 3 with buffer pool 722 725 3 0.0 72247.3 0.4X +Compression 10000 times at level 1 without buffer pool 263 405 194 0.0 26293.9 1.0X +Compression 10000 times at level 2 without buffer pool 693 694 1 0.0 69337.7 0.4X +Compression 10000 times at level 3 without buffer pool 805 809 4 0.0 80511.1 0.3X +Compression 10000 times at level 1 with buffer pool 576 577 2 0.0 57572.6 0.5X +Compression 10000 times at level 2 with buffer pool 611 612 1 0.0 61149.9 0.4X +Compression 10000 times at level 3 with buffer pool 730 731 1 0.0 73001.9 0.4X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.8.0-1014-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Benchmark ZStandardCompressionCodec: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------------------ -Decompression 10000 times from level 1 without buffer pool 176 177 1 0.1 17641.2 1.0X -Decompression 10000 times from level 2 without buffer pool 176 178 1 0.1 17628.9 1.0X -Decompression 10000 times from level 3 without buffer pool 175 176 0 0.1 17506.1 1.0X -Decompression 10000 times from level 1 with buffer pool 151 152 1 0.1 15051.5 1.2X -Decompression 10000 times from level 2 with buffer pool 150 151 1 0.1 14998.0 1.2X -Decompression 10000 times from level 3 with buffer pool 150 151 0 0.1 15019.4 1.2X +Decompression 10000 times from level 1 without buffer pool 616 616 1 0.0 61555.7 1.0X +Decompression 10000 times from level 2 without buffer pool 617 618 1 0.0 61746.1 1.0X +Decompression 10000 
times from level 3 without buffer pool 614 615 1 0.0 61402.4 1.0X +Decompression 10000 times from level 1 with buffer pool 541 542 1 0.0 54078.9 1.1X +Decompression 10000 times from level 2 with buffer pool 541 542 1 0.0 54094.5 1.1X +Decompression 10000 times from level 3 with buffer pool 540 541 1 0.0 54049.5 1.1X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.8.0-1014-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Parallel Compression at level 3: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Parallel Compression with 0 workers 57 57 0 0.0 444425.2 1.0X -Parallel Compression with 1 workers 42 44 3 0.0 325107.6 1.4X -Parallel Compression with 2 workers 38 39 2 0.0 294840.0 1.5X -Parallel Compression with 4 workers 36 37 1 0.0 282143.1 1.6X -Parallel Compression with 8 workers 39 40 1 0.0 303793.6 1.5X -Parallel Compression with 16 workers 41 43 1 0.0 324165.5 1.4X +Parallel Compression with 0 workers 57 58 1 0.0 442501.6 1.0X +Parallel Compression with 1 workers 42 44 3 0.0 325787.4 1.4X +Parallel Compression with 2 workers 38 40 2 0.0 295047.1 1.5X +Parallel Compression with 4 workers 37 38 1 0.0 285755.4 1.5X +Parallel Compression with 8 workers 39 40 1 0.0 301689.5 1.5X +Parallel Compression with 16 workers 42 44 1 0.0 327951.9 1.3X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.8.0-1014-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Parallel Compression at level 9: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Parallel Compression with 0 workers 156 158 1 0.0 1220298.8 1.0X -Parallel Compression with 1 workers 188 189 1 0.0 1467911.4 0.8X -Parallel 
Compression with 2 workers 111 118 7 0.0 866985.2 1.4X -Parallel Compression with 4 workers 106 109 2 0.0 827592.1 1.5X -Parallel Compression with 8 workers 114 116 2 0.0 888419.5 1.4X -Parallel Compression with 16 workers 111 115 2 0.0 868463.5 1.4X +Parallel Compression with 0 workers 155 158 1 0.0 1213931.6 1.0X +Parallel Compression with 1 workers 189 191 2 0.0 1475730.7 0.8X +Parallel Compression with 2 workers 112 117 5 0.0 878455.7 1.4X +Parallel Compression with 4 workers 107 110 3 0.0 834762.2 1.5X +Parallel Compression with 8 workers 113 116 2 0.0 886435.5 1.4X +Parallel Compression with 16 workers 110 115 3 0.0 859182.0 1.4X diff --git a/core/pom.xml b/core/pom.xml index 7805a3f37ae53..79563c246ec4b 100644 --- a/core/pom.xml +++ b/core/pom.xml @@ -426,7 +426,7 @@ net.sf.py4j py4j - 0.10.9.7 + 0.10.9.9 org.apache.spark diff --git a/core/src/main/java/org/apache/spark/shuffle/sort/ShuffleExternalSorter.java b/core/src/main/java/org/apache/spark/shuffle/sort/ShuffleExternalSorter.java index f96513f1b1097..de3c41a4b526b 100644 --- a/core/src/main/java/org/apache/spark/shuffle/sort/ShuffleExternalSorter.java +++ b/core/src/main/java/org/apache/spark/shuffle/sort/ShuffleExternalSorter.java @@ -165,7 +165,7 @@ private void writeSortedFile(boolean isFinalFile) { MDC.of(LogKeys.TASK_ATTEMPT_ID$.MODULE$, taskContext.taskAttemptId()), MDC.of(LogKeys.THREAD_ID$.MODULE$, Thread.currentThread().getId()), MDC.of(LogKeys.MEMORY_SIZE$.MODULE$, Utils.bytesToString(getMemoryUsage())), - MDC.of(LogKeys.NUM_SPILL_INFOS$.MODULE$, spills.size()), + MDC.of(LogKeys.NUM_SPILLS$.MODULE$, spills.size()), MDC.of(LogKeys.SPILL_TIMES$.MODULE$, spills.size() != 1 ? 
"times" : "time")); } diff --git a/core/src/main/resources/org/apache/spark/ui/static/webui.css b/core/src/main/resources/org/apache/spark/ui/static/webui.css index ca7c1f8ba65e2..bf9b230446b26 100755 --- a/core/src/main/resources/org/apache/spark/ui/static/webui.css +++ b/core/src/main/resources/org/apache/spark/ui/static/webui.css @@ -361,6 +361,10 @@ a.downloadbutton { width: 170px; } +.shuffle-write-time-checkbox-div { + width: 155px; +} + .result-serialization-time-checkbox-div { width: 185px; } diff --git a/core/src/main/scala/org/apache/spark/BarrierTaskContext.scala b/core/src/main/scala/org/apache/spark/BarrierTaskContext.scala index c8d6000cd6282..b5c6033bd9da4 100644 --- a/core/src/main/scala/org/apache/spark/BarrierTaskContext.scala +++ b/core/src/main/scala/org/apache/spark/BarrierTaskContext.scala @@ -17,6 +17,7 @@ package org.apache.spark +import java.io.Closeable import java.util.{Properties, TimerTask} import java.util.concurrent.{ScheduledThreadPoolExecutor, TimeUnit} @@ -62,7 +63,7 @@ class BarrierTaskContext private[spark] ( log"for ${MDC(TOTAL_TIME, System.currentTimeMillis() - st)} ms,") logInfo(log"Task ${MDC(TASK_ATTEMPT_ID, taskAttemptId())}" + log" from Stage ${MDC(STAGE_ID, stageId())}" + - log"(Attempt ${MDC(STAGE_ATTEMPT, stageAttemptNumber())}) " + + log"(Attempt ${MDC(STAGE_ATTEMPT_ID, stageAttemptNumber())}) " + msg + waitMsg + log" current barrier epoch is ${MDC(BARRIER_EPOCH, barrierEpoch)}.") } @@ -273,6 +274,18 @@ class BarrierTaskContext private[spark] ( } override private[spark] def getLocalProperties: Properties = taskContext.getLocalProperties + + override private[spark] def interruptible(): Boolean = taskContext.interruptible() + + override private[spark] def pendingInterrupt(threadToInterrupt: Option[Thread], reason: String) + : Unit = { + taskContext.pendingInterrupt(threadToInterrupt, reason) + } + + override private[spark] def createResourceUninterruptibly[T <: Closeable](resourceBuilder: => T) + : T = { + 
taskContext.createResourceUninterruptibly(resourceBuilder) + } } @Experimental diff --git a/core/src/main/scala/org/apache/spark/ExecutorAllocationManager.scala b/core/src/main/scala/org/apache/spark/ExecutorAllocationManager.scala index 1fe02eec3a072..dd131e443135f 100644 --- a/core/src/main/scala/org/apache/spark/ExecutorAllocationManager.scala +++ b/core/src/main/scala/org/apache/spark/ExecutorAllocationManager.scala @@ -885,7 +885,7 @@ private[spark] class ExecutorAllocationManager( } else { logWarning(log"Should have exactly one resource profile for stage " + log"${MDC(STAGE_ATTEMPT, stageAttempt)}, but have " + - log"${MDC(RESOURCE_PROFILE_ID, rpForStage)}") + log"${MDC(RESOURCE_PROFILE_IDS, rpForStage)}") } } } diff --git a/core/src/main/scala/org/apache/spark/SparkConf.scala b/core/src/main/scala/org/apache/spark/SparkConf.scala index cfb514913694b..ae6ef1ee55608 100644 --- a/core/src/main/scala/org/apache/spark/SparkConf.scala +++ b/core/src/main/scala/org/apache/spark/SparkConf.scala @@ -35,6 +35,228 @@ import org.apache.spark.serializer.KryoSerializer import org.apache.spark.util.ArrayImplicits._ import org.apache.spark.util.Utils +trait ReadOnlySparkConf { + /** Get a parameter; throws a NoSuchElementException if it's not set */ + def get(key: String): String = { + getOption(key).getOrElse(throw new NoSuchElementException(key)) + } + + /** Get a parameter, falling back to a default if not set */ + def get(key: String, defaultValue: String): String = { + getOption(key).getOrElse(defaultValue) + } + + /** + * Retrieves the value of a pre-defined configuration entry. + * + * - This is an internal Spark API. + * - The return type if defined by the configuration entry. + * - This will throw an exception is the config is not optional and the value is not set. + */ + private[spark] def get[T](entry: ConfigEntry[T]): T + + /** + * Get a time parameter as seconds; throws a NoSuchElementException if it's not set. 
If no + * suffix is provided then seconds are assumed. + * + * @throws java.util.NoSuchElementException If the time parameter is not set + * @throws NumberFormatException If the value cannot be interpreted as seconds + */ + def getTimeAsSeconds(key: String): Long = catchIllegalValue(key) { + Utils.timeStringAsSeconds(get(key)) + } + + /** + * Get a time parameter as seconds, falling back to a default if not set. If no + * suffix is provided then seconds are assumed. + * + * @throws NumberFormatException If the value cannot be interpreted as seconds + */ + def getTimeAsSeconds(key: String, defaultValue: String): Long = catchIllegalValue(key) { + Utils.timeStringAsSeconds(get(key, defaultValue)) + } + + /** + * Get a time parameter as milliseconds; throws a NoSuchElementException if it's not set. If no + * suffix is provided then milliseconds are assumed. + * + * @throws java.util.NoSuchElementException If the time parameter is not set + * @throws NumberFormatException If the value cannot be interpreted as milliseconds + */ + def getTimeAsMs(key: String): Long = catchIllegalValue(key) { + Utils.timeStringAsMs(get(key)) + } + + /** + * Get a time parameter as milliseconds, falling back to a default if not set. If no + * suffix is provided then milliseconds are assumed. + * + * @throws NumberFormatException If the value cannot be interpreted as milliseconds + */ + def getTimeAsMs(key: String, defaultValue: String): Long = catchIllegalValue(key) { + Utils.timeStringAsMs(get(key, defaultValue)) + } + + /** + * Get a size parameter as bytes; throws a NoSuchElementException if it's not set. If no + * suffix is provided then bytes are assumed. 
+ * + * @throws java.util.NoSuchElementException If the size parameter is not set + * @throws NumberFormatException If the value cannot be interpreted as bytes + */ + def getSizeAsBytes(key: String): Long = catchIllegalValue(key) { + Utils.byteStringAsBytes(get(key)) + } + + /** + * Get a size parameter as bytes, falling back to a default if not set. If no + * suffix is provided then bytes are assumed. + * + * @throws NumberFormatException If the value cannot be interpreted as bytes + */ + def getSizeAsBytes(key: String, defaultValue: String): Long = catchIllegalValue(key) { + Utils.byteStringAsBytes(get(key, defaultValue)) + } + + /** + * Get a size parameter as bytes, falling back to a default if not set. + * + * @throws NumberFormatException If the value cannot be interpreted as bytes + */ + def getSizeAsBytes(key: String, defaultValue: Long): Long = catchIllegalValue(key) { + Utils.byteStringAsBytes(get(key, s"${defaultValue}B")) + } + + /** + * Get a size parameter as Kibibytes; throws a NoSuchElementException if it's not set. If no + * suffix is provided then Kibibytes are assumed. + * + * @throws java.util.NoSuchElementException If the size parameter is not set + * @throws NumberFormatException If the value cannot be interpreted as Kibibytes + */ + def getSizeAsKb(key: String): Long = catchIllegalValue(key) { + Utils.byteStringAsKb(get(key)) + } + + /** + * Get a size parameter as Kibibytes, falling back to a default if not set. If no + * suffix is provided then Kibibytes are assumed. + * + * @throws NumberFormatException If the value cannot be interpreted as Kibibytes + */ + def getSizeAsKb(key: String, defaultValue: String): Long = catchIllegalValue(key) { + Utils.byteStringAsKb(get(key, defaultValue)) + } + + /** + * Get a size parameter as Mebibytes; throws a NoSuchElementException if it's not set. If no + * suffix is provided then Mebibytes are assumed. 
+ * + * @throws java.util.NoSuchElementException If the size parameter is not set + * @throws NumberFormatException If the value cannot be interpreted as Mebibytes + */ + def getSizeAsMb(key: String): Long = catchIllegalValue(key) { + Utils.byteStringAsMb(get(key)) + } + + /** + * Get a size parameter as Mebibytes, falling back to a default if not set. If no + * suffix is provided then Mebibytes are assumed. + * + * @throws NumberFormatException If the value cannot be interpreted as Mebibytes + */ + def getSizeAsMb(key: String, defaultValue: String): Long = catchIllegalValue(key) { + Utils.byteStringAsMb(get(key, defaultValue)) + } + + /** + * Get a size parameter as Gibibytes; throws a NoSuchElementException if it's not set. If no + * suffix is provided then Gibibytes are assumed. + * + * @throws java.util.NoSuchElementException If the size parameter is not set + * @throws NumberFormatException If the value cannot be interpreted as Gibibytes + */ + def getSizeAsGb(key: String): Long = catchIllegalValue(key) { + Utils.byteStringAsGb(get(key)) + } + + /** + * Get a size parameter as Gibibytes, falling back to a default if not set. If no + * suffix is provided then Gibibytes are assumed. 
+ * + * @throws NumberFormatException If the value cannot be interpreted as Gibibytes + */ + def getSizeAsGb(key: String, defaultValue: String): Long = catchIllegalValue(key) { + Utils.byteStringAsGb(get(key, defaultValue)) + } + + /** Get a parameter as an Option */ + def getOption(key: String): Option[String] + + /** Get all parameters as a list of pairs */ + def getAll: Array[(String, String)] + + /** + * Get a parameter as an integer, falling back to a default if not set + * + * @throws NumberFormatException If the value cannot be interpreted as an integer + */ + def getInt(key: String, defaultValue: Int): Int = catchIllegalValue(key) { + getOption(key).map(_.toInt).getOrElse(defaultValue) + } + + /** + * Get a parameter as a long, falling back to a default if not set + * + * @throws NumberFormatException If the value cannot be interpreted as a long + */ + def getLong(key: String, defaultValue: Long): Long = catchIllegalValue(key) { + getOption(key).map(_.toLong).getOrElse(defaultValue) + } + + /** + * Get a parameter as a double, falling back to a default if not ste + * + * @throws NumberFormatException If the value cannot be interpreted as a double + */ + def getDouble(key: String, defaultValue: Double): Double = catchIllegalValue(key) { + getOption(key).map(_.toDouble).getOrElse(defaultValue) + } + + /** + * Get a parameter as a boolean, falling back to a default if not set + * + * @throws IllegalArgumentException If the value cannot be interpreted as a boolean + */ + def getBoolean(key: String, defaultValue: Boolean): Boolean = catchIllegalValue(key) { + getOption(key).map(_.toBoolean).getOrElse(defaultValue) + } + + /** Does the configuration contain a given parameter? */ + def contains(key: String): Boolean + + /** Does the configuration have the typed config entry? */ + def contains(entry: ConfigEntry[_]): Boolean = contains(entry.key) + + /** + * Wrapper method for get() methods which require some specific value format. 
This catches + * any [[NumberFormatException]] or [[IllegalArgumentException]] and re-raises it with the + * incorrectly configured key in the exception message. + */ + protected def catchIllegalValue[T](key: String)(getValue: => T): T = { + try { + getValue + } catch { + case e: NumberFormatException => + // NumberFormatException doesn't have a constructor that takes a cause for some reason. + throw new NumberFormatException(s"Illegal value for config key $key: ${e.getMessage}") + .initCause(e) + case e: IllegalArgumentException => + throw new IllegalArgumentException(s"Illegal value for config key $key: ${e.getMessage}", e) + } + } +} + /** * Configuration for a Spark application. Used to set various Spark parameters as key-value pairs. * @@ -53,7 +275,11 @@ import org.apache.spark.util.Utils * @note Once a SparkConf object is passed to Spark, it is cloned and can no longer be modified * by the user. Spark does not support modifying the configuration at runtime. */ -class SparkConf(loadDefaults: Boolean) extends Cloneable with Logging with Serializable { +class SparkConf(loadDefaults: Boolean) + extends ReadOnlySparkConf + with Cloneable + with Logging + with Serializable { import SparkConf._ @@ -242,16 +468,6 @@ class SparkConf(loadDefaults: Boolean) extends Cloneable with Logging with Seria remove(entry.key) } - /** Get a parameter; throws a NoSuchElementException if it's not set */ - def get(key: String): String = { - getOption(key).getOrElse(throw new NoSuchElementException(key)) - } - - /** Get a parameter, falling back to a default if not set */ - def get(key: String, defaultValue: String): String = { - getOption(key).getOrElse(defaultValue) - } - /** * Retrieves the value of a pre-defined configuration entry. * @@ -263,128 +479,6 @@ class SparkConf(loadDefaults: Boolean) extends Cloneable with Logging with Seria entry.readFrom(reader) } - /** - * Get a time parameter as seconds; throws a NoSuchElementException if it's not set. 
If no - * suffix is provided then seconds are assumed. - * @throws java.util.NoSuchElementException If the time parameter is not set - * @throws NumberFormatException If the value cannot be interpreted as seconds - */ - def getTimeAsSeconds(key: String): Long = catchIllegalValue(key) { - Utils.timeStringAsSeconds(get(key)) - } - - /** - * Get a time parameter as seconds, falling back to a default if not set. If no - * suffix is provided then seconds are assumed. - * @throws NumberFormatException If the value cannot be interpreted as seconds - */ - def getTimeAsSeconds(key: String, defaultValue: String): Long = catchIllegalValue(key) { - Utils.timeStringAsSeconds(get(key, defaultValue)) - } - - /** - * Get a time parameter as milliseconds; throws a NoSuchElementException if it's not set. If no - * suffix is provided then milliseconds are assumed. - * @throws java.util.NoSuchElementException If the time parameter is not set - * @throws NumberFormatException If the value cannot be interpreted as milliseconds - */ - def getTimeAsMs(key: String): Long = catchIllegalValue(key) { - Utils.timeStringAsMs(get(key)) - } - - /** - * Get a time parameter as milliseconds, falling back to a default if not set. If no - * suffix is provided then milliseconds are assumed. - * @throws NumberFormatException If the value cannot be interpreted as milliseconds - */ - def getTimeAsMs(key: String, defaultValue: String): Long = catchIllegalValue(key) { - Utils.timeStringAsMs(get(key, defaultValue)) - } - - /** - * Get a size parameter as bytes; throws a NoSuchElementException if it's not set. If no - * suffix is provided then bytes are assumed. - * @throws java.util.NoSuchElementException If the size parameter is not set - * @throws NumberFormatException If the value cannot be interpreted as bytes - */ - def getSizeAsBytes(key: String): Long = catchIllegalValue(key) { - Utils.byteStringAsBytes(get(key)) - } - - /** - * Get a size parameter as bytes, falling back to a default if not set. 
If no - * suffix is provided then bytes are assumed. - * @throws NumberFormatException If the value cannot be interpreted as bytes - */ - def getSizeAsBytes(key: String, defaultValue: String): Long = catchIllegalValue(key) { - Utils.byteStringAsBytes(get(key, defaultValue)) - } - - /** - * Get a size parameter as bytes, falling back to a default if not set. - * @throws NumberFormatException If the value cannot be interpreted as bytes - */ - def getSizeAsBytes(key: String, defaultValue: Long): Long = catchIllegalValue(key) { - Utils.byteStringAsBytes(get(key, s"${defaultValue}B")) - } - - /** - * Get a size parameter as Kibibytes; throws a NoSuchElementException if it's not set. If no - * suffix is provided then Kibibytes are assumed. - * @throws java.util.NoSuchElementException If the size parameter is not set - * @throws NumberFormatException If the value cannot be interpreted as Kibibytes - */ - def getSizeAsKb(key: String): Long = catchIllegalValue(key) { - Utils.byteStringAsKb(get(key)) - } - - /** - * Get a size parameter as Kibibytes, falling back to a default if not set. If no - * suffix is provided then Kibibytes are assumed. - * @throws NumberFormatException If the value cannot be interpreted as Kibibytes - */ - def getSizeAsKb(key: String, defaultValue: String): Long = catchIllegalValue(key) { - Utils.byteStringAsKb(get(key, defaultValue)) - } - - /** - * Get a size parameter as Mebibytes; throws a NoSuchElementException if it's not set. If no - * suffix is provided then Mebibytes are assumed. - * @throws java.util.NoSuchElementException If the size parameter is not set - * @throws NumberFormatException If the value cannot be interpreted as Mebibytes - */ - def getSizeAsMb(key: String): Long = catchIllegalValue(key) { - Utils.byteStringAsMb(get(key)) - } - - /** - * Get a size parameter as Mebibytes, falling back to a default if not set. If no - * suffix is provided then Mebibytes are assumed. 
- * @throws NumberFormatException If the value cannot be interpreted as Mebibytes - */ - def getSizeAsMb(key: String, defaultValue: String): Long = catchIllegalValue(key) { - Utils.byteStringAsMb(get(key, defaultValue)) - } - - /** - * Get a size parameter as Gibibytes; throws a NoSuchElementException if it's not set. If no - * suffix is provided then Gibibytes are assumed. - * @throws java.util.NoSuchElementException If the size parameter is not set - * @throws NumberFormatException If the value cannot be interpreted as Gibibytes - */ - def getSizeAsGb(key: String): Long = catchIllegalValue(key) { - Utils.byteStringAsGb(get(key)) - } - - /** - * Get a size parameter as Gibibytes, falling back to a default if not set. If no - * suffix is provided then Gibibytes are assumed. - * @throws NumberFormatException If the value cannot be interpreted as Gibibytes - */ - def getSizeAsGb(key: String, defaultValue: String): Long = catchIllegalValue(key) { - Utils.byteStringAsGb(get(key, defaultValue)) - } - /** Get a parameter as an Option */ def getOption(key: String): Option[String] = { Option(settings.get(key)).orElse(getDeprecatedConfig(key, settings)) @@ -408,38 +502,6 @@ class SparkConf(loadDefaults: Boolean) extends Cloneable with Logging with Seria .map { case (k, v) => (k.substring(prefix.length), v) } } - /** - * Get a parameter as an integer, falling back to a default if not set - * @throws NumberFormatException If the value cannot be interpreted as an integer - */ - def getInt(key: String, defaultValue: Int): Int = catchIllegalValue(key) { - getOption(key).map(_.toInt).getOrElse(defaultValue) - } - - /** - * Get a parameter as a long, falling back to a default if not set - * @throws NumberFormatException If the value cannot be interpreted as a long - */ - def getLong(key: String, defaultValue: Long): Long = catchIllegalValue(key) { - getOption(key).map(_.toLong).getOrElse(defaultValue) - } - - /** - * Get a parameter as a double, falling back to a default if not 
ste - * @throws NumberFormatException If the value cannot be interpreted as a double - */ - def getDouble(key: String, defaultValue: Double): Double = catchIllegalValue(key) { - getOption(key).map(_.toDouble).getOrElse(defaultValue) - } - - /** - * Get a parameter as a boolean, falling back to a default if not set - * @throws IllegalArgumentException If the value cannot be interpreted as a boolean - */ - def getBoolean(key: String, defaultValue: Boolean): Boolean = catchIllegalValue(key) { - getOption(key).map(_.toBoolean).getOrElse(defaultValue) - } - /** Get all executor environment variables set on this SparkConf */ def getExecutorEnv: Seq[(String, String)] = { getAllWithPrefix("spark.executorEnv.").toImmutableArraySeq @@ -457,8 +519,6 @@ class SparkConf(loadDefaults: Boolean) extends Cloneable with Logging with Seria configsWithAlternatives.get(key).toSeq.flatten.exists { alt => contains(alt.key) } } - private[spark] def contains(entry: ConfigEntry[_]): Boolean = contains(entry.key) - /** Copy this object */ override def clone: SparkConf = { val cloned = new SparkConf(false) @@ -474,25 +534,6 @@ class SparkConf(loadDefaults: Boolean) extends Cloneable with Logging with Seria */ private[spark] def getenv(name: String): String = System.getenv(name) - /** - * Wrapper method for get() methods which require some specific value format. This catches - * any [[NumberFormatException]] or [[IllegalArgumentException]] and re-raises it with the - * incorrectly configured key in the exception message. - */ - private def catchIllegalValue[T](key: String)(getValue: => T): T = { - try { - getValue - } catch { - case e: NumberFormatException => - // NumberFormatException doesn't have a constructor that takes a cause for some reason. 
- throw new NumberFormatException(s"Illegal value for config key $key: ${e.getMessage}") - .initCause(e) - case e: IllegalArgumentException => - throw new IllegalArgumentException(s"Illegal value for config key $key: ${e.getMessage}", e) - } - } - - /** * Checks for illegal or deprecated config settings. Throws an exception for the former. Not * idempotent - may mutate this conf object to convert deprecated settings to supported ones. @@ -608,6 +649,7 @@ private[spark] object SparkConf extends Logging { "Please use spark.kryoserializer.buffer instead. The default value for " + "spark.kryoserializer.buffer.mb was previously specified as '0.064'. Fractional values " + "are no longer accepted. To specify the equivalent now, one may use '64k'."), + DeprecatedConfig("spark.shuffle.spill", "1.6", "Not used anymore."), DeprecatedConfig("spark.rpc", "2.0", "Not used anymore."), DeprecatedConfig("spark.scheduler.executorTaskBlacklistTime", "2.1.0", "Please use the new excludedOnFailure options, spark.excludeOnFailure.*"), diff --git a/core/src/main/scala/org/apache/spark/SparkContext.scala b/core/src/main/scala/org/apache/spark/SparkContext.scala index 042179d86c31a..30d772bd62d77 100644 --- a/core/src/main/scala/org/apache/spark/SparkContext.scala +++ b/core/src/main/scala/org/apache/spark/SparkContext.scala @@ -252,6 +252,9 @@ class SparkContext(config: SparkConf) extends Logging { private[spark] def conf: SparkConf = _conf + /** Get a read-only reference to the spark conf. This is preferred version over [[getConf]]. */ + def getReadOnlyConf: ReadOnlySparkConf = _conf + /** * Return a copy of this SparkContext's configuration. The configuration ''cannot'' be * changed at runtime. 
@@ -420,9 +423,6 @@ class SparkContext(config: SparkConf) extends Logging { if (!_conf.contains("spark.app.name")) { throw new SparkException("An application name must be set in your configuration") } - // HADOOP-19097 Set fs.s3a.connection.establish.timeout to 30s - // We can remove this after Apache Hadoop 3.4.1 releases - conf.setIfMissing("spark.hadoop.fs.s3a.connection.establish.timeout", "30000") // This should be set as early as possible. SparkContext.fillMissingMagicCommitterConfsIfNeeded(_conf) @@ -1878,6 +1878,7 @@ class SparkContext(config: SparkConf) extends Logging { if (uri.getFragment != null) uri.getFragment else source.getName) logInfo( log"Unpacking an archive ${MDC(LogKeys.PATH, path)}" + + log" (${MDC(LogKeys.BYTE_SIZE, source.length)} bytes)" + log" from ${MDC(LogKeys.SOURCE_PATH, source.getAbsolutePath)}" + log" to ${MDC(LogKeys.DESTINATION_PATH, dest.getAbsolutePath)}") Utils.deleteRecursively(dest) diff --git a/core/src/main/scala/org/apache/spark/TaskContext.scala b/core/src/main/scala/org/apache/spark/TaskContext.scala index 15ddd08fb4aef..5384fd86a8f19 100644 --- a/core/src/main/scala/org/apache/spark/TaskContext.scala +++ b/core/src/main/scala/org/apache/spark/TaskContext.scala @@ -17,7 +17,7 @@ package org.apache.spark -import java.io.Serializable +import java.io.Closeable import java.util.Properties import org.apache.spark.annotation.{DeveloperApi, Evolving, Since} @@ -305,4 +305,24 @@ abstract class TaskContext extends Serializable { /** Gets local properties set upstream in the driver. */ private[spark] def getLocalProperties: Properties + + /** Whether the current task is allowed to interrupt. */ + private[spark] def interruptible(): Boolean + + /** + * Pending the interruption request until the task is able to + * interrupt after creating the resource uninterruptibly. + */ + private[spark] def pendingInterrupt(threadToInterrupt: Option[Thread], reason: String): Unit + + /** + * Creating a closeable resource uninterruptibly. 
A task is not allowed to interrupt in this + * state until the resource creation finishes. E.g., + * {{{ + * val linesReader = TaskContext.get().createResourceUninterruptibly { + * new HadoopFileLinesReader(file, parser.options.lineSeparatorInRead, conf) + * } + * }}} + */ + private[spark] def createResourceUninterruptibly[T <: Closeable](resourceBuilder: => T): T } diff --git a/core/src/main/scala/org/apache/spark/TaskContextImpl.scala b/core/src/main/scala/org/apache/spark/TaskContextImpl.scala index 8167952d6b87f..f0e844289b9db 100644 --- a/core/src/main/scala/org/apache/spark/TaskContextImpl.scala +++ b/core/src/main/scala/org/apache/spark/TaskContextImpl.scala @@ -17,6 +17,7 @@ package org.apache.spark +import java.io.Closeable import java.util.{Properties, Stack} import javax.annotation.concurrent.GuardedBy @@ -82,6 +83,13 @@ private[spark] class TaskContextImpl( // If defined, the corresponding task has been killed and this option contains the reason. @volatile private var reasonIfKilled: Option[String] = None + // The pending interruption request, which is blocked by uninterruptible resource creation. + // Should be protected by `TaskContext.synchronized`. + private var pendingInterruptRequest: Option[(Option[Thread], String)] = None + + // Whether this task is able to be interrupted. Should be protected by `TaskContext.synchronized`. + private var _interruptible = true + // Whether the task has completed. 
private var completed: Boolean = false @@ -296,4 +304,39 @@ private[spark] class TaskContextImpl( private[spark] override def fetchFailed: Option[FetchFailedException] = _fetchFailedException private[spark] override def getLocalProperties: Properties = localProperties + + + override def interruptible(): Boolean = TaskContext.synchronized(_interruptible) + + override def pendingInterrupt(threadToInterrupt: Option[Thread], reason: String): Unit = { + TaskContext.synchronized { + pendingInterruptRequest = Some((threadToInterrupt, reason)) + } + } + + def createResourceUninterruptibly[T <: Closeable](resourceBuilder: => T): T = { + + @inline def interruptIfRequired(): Unit = { + pendingInterruptRequest.foreach { case (threadToInterrupt, reason) => + markInterrupted(reason) + threadToInterrupt.foreach(_.interrupt()) + } + killTaskIfInterrupted() + } + + TaskContext.synchronized { + interruptIfRequired() + _interruptible = false + } + try { + val resource = resourceBuilder + addTaskCompletionListener[Unit](_ => resource.close()) + resource + } finally { + TaskContext.synchronized { + _interruptible = true + interruptIfRequired() + } + } + } } diff --git a/core/src/main/scala/org/apache/spark/api/java/JavaSparkContext.scala b/core/src/main/scala/org/apache/spark/api/java/JavaSparkContext.scala index 9de350bc3130f..7311ef296363d 100644 --- a/core/src/main/scala/org/apache/spark/api/java/JavaSparkContext.scala +++ b/core/src/main/scala/org/apache/spark/api/java/JavaSparkContext.scala @@ -630,6 +630,9 @@ class JavaSparkContext(val sc: SparkContext) extends Closeable { */ def getConf: SparkConf = sc.getConf + /** Return a read-only version of the spark conf. */ + def getReadOnlyConf: ReadOnlySparkConf = sc.getReadOnlyConf + /** * Pass-through to SparkContext.setCallSite. For API support only. 
*/ diff --git a/core/src/main/scala/org/apache/spark/api/python/PythonRunner.scala b/core/src/main/scala/org/apache/spark/api/python/PythonRunner.scala index b7fb22bab844a..e3d10574419b3 100644 --- a/core/src/main/scala/org/apache/spark/api/python/PythonRunner.scala +++ b/core/src/main/scala/org/apache/spark/api/python/PythonRunner.scala @@ -109,7 +109,8 @@ private[spark] abstract class BasePythonRunner[IN, OUT]( protected val funcs: Seq[ChainedPythonFunctions], protected val evalType: Int, protected val argOffsets: Array[Array[Int]], - protected val jobArtifactUUID: Option[String]) + protected val jobArtifactUUID: Option[String], + protected val metrics: Map[String, AccumulatorV2[Long, Long]]) extends Logging { require(funcs.length == argOffsets.length, "argOffsets should have the same length as funcs") @@ -128,6 +129,8 @@ private[spark] abstract class BasePythonRunner[IN, OUT]( protected val pythonExec: String = funcs.head.funcs.head.pythonExec protected val pythonVer: String = funcs.head.funcs.head.pythonVer + protected val batchSizeForPythonUDF: Int = 100 + // WARN: Both configurations, 'spark.python.daemon.module' and 'spark.python.worker.module' are // for very advanced users and they are experimental. This should be considered // as expert-only option, and shouldn't be used before knowing what it means exactly. 
@@ -211,6 +214,8 @@ private[spark] abstract class BasePythonRunner[IN, OUT]( if (faultHandlerEnabled) { envVars.put("PYTHON_FAULTHANDLER_DIR", BasePythonRunner.faultHandlerLogDir.toString) } + // allow the user to set the batch size for the BatchedSerializer on UDFs + envVars.put("PYTHON_UDF_BATCH_SIZE", batchSizeForPythonUDF.toString) envVars.put("SPARK_JOB_ARTIFACT_UUID", jobArtifactUUID.getOrElse("default")) @@ -522,6 +527,9 @@ private[spark] abstract class BasePythonRunner[IN, OUT]( log"boot = ${MDC(LogKeys.BOOT_TIME, boot)}, " + log"init = ${MDC(LogKeys.INIT_TIME, init)}, " + log"finish = ${MDC(LogKeys.FINISH_TIME, finish)}") + metrics.get("pythonBootTime").foreach(_.add(boot)) + metrics.get("pythonInitTime").foreach(_.add(init)) + metrics.get("pythonTotalTime").foreach(_.add(total)) val memoryBytesSpilled = stream.readLong() val diskBytesSpilled = stream.readLong() context.taskMetrics().incMemoryBytesSpilled(memoryBytesSpilled) @@ -824,7 +832,7 @@ private[spark] object PythonRunner { private[spark] class PythonRunner( funcs: Seq[ChainedPythonFunctions], jobArtifactUUID: Option[String]) extends BasePythonRunner[Array[Byte], Array[Byte]]( - funcs, PythonEvalType.NON_UDF, Array(Array(0)), jobArtifactUUID) { + funcs, PythonEvalType.NON_UDF, Array(Array(0)), jobArtifactUUID, Map.empty) { protected override def newWriter( env: SparkEnv, diff --git a/core/src/main/scala/org/apache/spark/api/python/PythonUtils.scala b/core/src/main/scala/org/apache/spark/api/python/PythonUtils.scala index 045ed0e4c01cb..816ceea327aae 100644 --- a/core/src/main/scala/org/apache/spark/api/python/PythonUtils.scala +++ b/core/src/main/scala/org/apache/spark/api/python/PythonUtils.scala @@ -34,7 +34,7 @@ import org.apache.spark.util.ArrayImplicits.SparkArrayOps import org.apache.spark.util.Utils private[spark] object PythonUtils extends Logging { - val PY4J_ZIP_NAME = "py4j-0.10.9.7-src.zip" + val PY4J_ZIP_NAME = "py4j-0.10.9.9-src.zip" /** Get the PYTHONPATH for PySpark, either from 
SPARK_HOME, if it is set, or from our JAR */ def sparkPythonPath: String = { diff --git a/core/src/main/scala/org/apache/spark/api/python/StreamingPythonRunner.scala b/core/src/main/scala/org/apache/spark/api/python/StreamingPythonRunner.scala index 0ff2b79ab6623..ce933337afc35 100644 --- a/core/src/main/scala/org/apache/spark/api/python/StreamingPythonRunner.scala +++ b/core/src/main/scala/org/apache/spark/api/python/StreamingPythonRunner.scala @@ -48,10 +48,10 @@ private[spark] class StreamingPythonRunner( protected val bufferSize: Int = conf.get(BUFFER_SIZE) protected val authSocketTimeout = conf.get(PYTHON_AUTH_SOCKET_TIMEOUT) - private val envVars: java.util.Map[String, String] = func.envVars - private val pythonExec: String = func.pythonExec - private var pythonWorker: Option[PythonWorker] = None - private var pythonWorkerFactory: Option[PythonWorkerFactory] = None + protected val envVars: java.util.Map[String, String] = func.envVars + protected val pythonExec: String = func.pythonExec + protected var pythonWorker: Option[PythonWorker] = None + protected var pythonWorkerFactory: Option[PythonWorkerFactory] = None protected val pythonVer: String = func.pythonVer /** @@ -68,7 +68,9 @@ private[spark] class StreamingPythonRunner( envVars.put("SPARK_AUTH_SOCKET_TIMEOUT", authSocketTimeout.toString) envVars.put("SPARK_BUFFER_SIZE", bufferSize.toString) - envVars.put("SPARK_CONNECT_LOCAL_URL", connectUrl) + if (!connectUrl.isEmpty) { + envVars.put("SPARK_CONNECT_LOCAL_URL", connectUrl) + } val workerFactory = new PythonWorkerFactory(pythonExec, workerModule, envVars.asScala.toMap, false) @@ -83,7 +85,9 @@ private[spark] class StreamingPythonRunner( PythonWorkerUtils.writePythonVersion(pythonVer, dataOut) // Send sessionId - PythonRDD.writeUTF(sessionId, dataOut) + if (!sessionId.isEmpty) { + PythonRDD.writeUTF(sessionId, dataOut) + } // Send the user function to python process PythonWorkerUtils.writePythonFunction(func, dataOut) diff --git 
a/core/src/main/scala/org/apache/spark/deploy/DeployMessage.scala b/core/src/main/scala/org/apache/spark/deploy/DeployMessage.scala index cb5996a5097d2..12e031711aa2a 100644 --- a/core/src/main/scala/org/apache/spark/deploy/DeployMessage.scala +++ b/core/src/main/scala/org/apache/spark/deploy/DeployMessage.scala @@ -205,7 +205,6 @@ private[deploy] object DeployMessages { case class RegisteredApplication(appId: String, master: RpcEndpointRef) extends DeployMessage - // TODO(matei): replace hostPort with host case class ExecutorAdded(id: Int, workerId: String, hostPort: String, cores: Int, memory: Int) { Utils.checkHostPort(hostPort) } diff --git a/core/src/main/scala/org/apache/spark/deploy/history/ApplicationCache.scala b/core/src/main/scala/org/apache/spark/deploy/history/ApplicationCache.scala index 6e0fe69f3bfb6..8caf67ff4680b 100644 --- a/core/src/main/scala/org/apache/spark/deploy/history/ApplicationCache.scala +++ b/core/src/main/scala/org/apache/spark/deploy/history/ApplicationCache.scala @@ -24,7 +24,7 @@ import scala.jdk.CollectionConverters._ import com.codahale.metrics.{Counter, MetricRegistry, Timer} import com.google.common.cache.{CacheBuilder, CacheLoader, LoadingCache, RemovalListener, RemovalNotification} import com.google.common.util.concurrent.UncheckedExecutionException -import jakarta.servlet.{DispatcherType, Filter, FilterChain, FilterConfig, ServletException, ServletRequest, ServletResponse} +import jakarta.servlet.{DispatcherType, Filter, FilterChain, ServletException, ServletRequest, ServletResponse} import jakarta.servlet.http.{HttpServletRequest, HttpServletResponse} import org.eclipse.jetty.servlet.FilterHolder @@ -428,9 +428,4 @@ private[history] class ApplicationCacheCheckFilter( httpResponse.sendRedirect(redirectUrl) } } - - override def init(config: FilterConfig): Unit = { } - - override def destroy(): Unit = { } - } diff --git a/core/src/main/scala/org/apache/spark/deploy/history/EventLogFileWriters.scala 
b/core/src/main/scala/org/apache/spark/deploy/history/EventLogFileWriters.scala index f3bb6d5af3358..990ab680f3aaf 100644 --- a/core/src/main/scala/org/apache/spark/deploy/history/EventLogFileWriters.scala +++ b/core/src/main/scala/org/apache/spark/deploy/history/EventLogFileWriters.scala @@ -187,12 +187,7 @@ object EventLogFileWriter { } def nameForAppAndAttempt(appId: String, appAttemptId: Option[String]): String = { - val base = Utils.sanitizeDirName(appId) - if (appAttemptId.isDefined) { - base + "_" + Utils.sanitizeDirName(appAttemptId.get) - } else { - base - } + Utils.nameForAppAndAttempt(appId, appAttemptId) } def codecName(log: Path): Option[String] = { diff --git a/core/src/main/scala/org/apache/spark/deploy/master/MasterArguments.scala b/core/src/main/scala/org/apache/spark/deploy/master/MasterArguments.scala index 6647b11874d72..0904581d72367 100644 --- a/core/src/main/scala/org/apache/spark/deploy/master/MasterArguments.scala +++ b/core/src/main/scala/org/apache/spark/deploy/master/MasterArguments.scala @@ -33,12 +33,6 @@ private[master] class MasterArguments(args: Array[String], conf: SparkConf) exte var webUiPort = 8080 var propertiesFile: String = null - // Check for settings in environment variables - if (System.getenv("SPARK_MASTER_IP") != null) { - logWarning("SPARK_MASTER_IP is deprecated, please use SPARK_MASTER_HOST") - host = System.getenv("SPARK_MASTER_IP") - } - if (System.getenv("SPARK_MASTER_HOST") != null) { host = System.getenv("SPARK_MASTER_HOST") } @@ -63,11 +57,6 @@ private[master] class MasterArguments(args: Array[String], conf: SparkConf) exte @tailrec private def parse(args: List[String]): Unit = args match { - case ("--ip" | "-i") :: value :: tail => - Utils.checkHost(value) - host = value - parse(tail) - case ("--host" | "-h") :: value :: tail => Utils.checkHost(value) host = value @@ -103,7 +92,6 @@ private[master] class MasterArguments(args: Array[String], conf: SparkConf) exte "Usage: Master [options]\n" + "\n" + "Options:\n" 
+ - " -i HOST, --ip HOST Hostname to listen on (deprecated, please use --host or -h) \n" + " -h HOST, --host HOST Hostname to listen on\n" + " -p PORT, --port PORT Port to listen on (default: 7077)\n" + " --webui-port PORT Port for web UI (default: 8080)\n" + diff --git a/core/src/main/scala/org/apache/spark/deploy/worker/WorkerArguments.scala b/core/src/main/scala/org/apache/spark/deploy/worker/WorkerArguments.scala index f24cd59418300..87ca01fe82a97 100644 --- a/core/src/main/scala/org/apache/spark/deploy/worker/WorkerArguments.scala +++ b/core/src/main/scala/org/apache/spark/deploy/worker/WorkerArguments.scala @@ -70,11 +70,6 @@ private[worker] class WorkerArguments(args: Array[String], conf: SparkConf) { @tailrec private def parse(args: List[String]): Unit = args match { - case ("--ip" | "-i") :: value :: tail => - Utils.checkHost(value) - host = value - parse(tail) - case ("--host" | "-h") :: value :: tail => Utils.checkHost(value) host = value @@ -137,7 +132,6 @@ private[worker] class WorkerArguments(args: Array[String], conf: SparkConf) { " -c CORES, --cores CORES Number of cores to use\n" + " -m MEM, --memory MEM Amount of memory to use (e.g. 
1000M, 2G)\n" + " -d DIR, --work-dir DIR Directory to run apps in (default: SPARK_HOME/work)\n" + - " -i HOST, --ip IP Hostname to listen on (deprecated, please use --host or -h)\n" + " -h HOST, --host HOST Hostname to listen on\n" + " -p PORT, --port PORT Port to listen on (default: random)\n" + " --webui-port PORT Port for web UI (default: 8081)\n" + diff --git a/core/src/main/scala/org/apache/spark/executor/CoarseGrainedExecutorBackend.scala b/core/src/main/scala/org/apache/spark/executor/CoarseGrainedExecutorBackend.scala index e880cf8da9ec2..a73380cab690e 100644 --- a/core/src/main/scala/org/apache/spark/executor/CoarseGrainedExecutorBackend.scala +++ b/core/src/main/scala/org/apache/spark/executor/CoarseGrainedExecutorBackend.scala @@ -479,6 +479,29 @@ private[spark] object CoarseGrainedExecutorBackend extends Logging { driverConf.set(EXECUTOR_ID, arguments.executorId) cfg.logLevel.foreach(logLevel => Utils.setLogLevelIfNeeded(logLevel)) + // Set executor memory related config here according to resource profile + if (cfg.resourceProfile.id != ResourceProfile.DEFAULT_RESOURCE_PROFILE_ID) { + cfg.resourceProfile + .executorResources + .foreach { + case (ResourceProfile.OFFHEAP_MEM, request) => + driverConf.set(MEMORY_OFFHEAP_SIZE.key, request.amount.toString + "m") + logInfo(log"Set executor off-heap memory to " + + log"${MDC(LogKeys.EXECUTOR_MEMORY_OFFHEAP, request)}") + case (ResourceProfile.MEMORY, request) => + driverConf.set(EXECUTOR_MEMORY.key, request.amount.toString + "m") + logInfo(log"Set executor memory to ${MDC(LogKeys.EXECUTOR_MEMORY_SIZE, request)}") + case (ResourceProfile.OVERHEAD_MEM, request) => + // Maybe don't need to set this since it's nearly used by tasks. 
+ driverConf.set(EXECUTOR_MEMORY_OVERHEAD.key, request.amount.toString + "m") + logInfo(log"Set executor memory_overhead to " + + log"${MDC(LogKeys.EXECUTOR_MEMORY_OVERHEAD_SIZE, request)}") + case (ResourceProfile.CORES, request) => + driverConf.set(EXECUTOR_CORES.key, request.amount.toString) + logInfo(log"Set executor cores to ${MDC(LogKeys.NUM_EXECUTOR_CORES, request)}") + case _ => + } + } val env = SparkEnv.createExecutorEnv(driverConf, arguments.executorId, arguments.bindAddress, arguments.hostname, arguments.cores, cfg.ioEncryptionKey, isLocal = false) // Set the application attemptId in the BlockStoreClient if available. diff --git a/core/src/main/scala/org/apache/spark/executor/Executor.scala b/core/src/main/scala/org/apache/spark/executor/Executor.scala index c299f38526aeb..f1087b695a7da 100644 --- a/core/src/main/scala/org/apache/spark/executor/Executor.scala +++ b/core/src/main/scala/org/apache/spark/executor/Executor.scala @@ -1210,6 +1210,7 @@ private[spark] class Executor( if (sourceURI.getFragment != null) sourceURI.getFragment else source.getName) logInfo( log"Unpacking an archive ${LogMDC(ARCHIVE_NAME, name)}" + + log" (${LogMDC(BYTE_SIZE, source.length)} bytes)" + log" from ${LogMDC(SOURCE_PATH, source.getAbsolutePath)}" + log" to ${LogMDC(DESTINATION_PATH, dest.getAbsolutePath)}") Utils.deleteRecursively(dest) diff --git a/core/src/main/scala/org/apache/spark/internal/config/Deploy.scala b/core/src/main/scala/org/apache/spark/internal/config/Deploy.scala index 0c2db21905d1f..1ed53868992ac 100644 --- a/core/src/main/scala/org/apache/spark/internal/config/Deploy.scala +++ b/core/src/main/scala/org/apache/spark/internal/config/Deploy.scala @@ -45,7 +45,7 @@ private[spark] object Deploy { val RECOVERY_TIMEOUT = ConfigBuilder("spark.deploy.recoveryTimeout") .doc("Configures the timeout for recovery process. 
The default value is the same " + - "with ${WORKER_TIMEOUT.key}.") + s"with ${Worker.WORKER_TIMEOUT.key}.") .version("4.0.0") .timeConf(TimeUnit.SECONDS) .checkValue(_ > 0, "spark.deploy.recoveryTimeout must be positive.") diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/util/UtilSuite.scala b/core/src/main/scala/org/apache/spark/internal/config/SparkConfigProvider.scala similarity index 58% rename from sql/catalyst/src/test/scala/org/apache/spark/sql/util/UtilSuite.scala rename to core/src/main/scala/org/apache/spark/internal/config/SparkConfigProvider.scala index d95de71e897a2..8739c87a65877 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/util/UtilSuite.scala +++ b/core/src/main/scala/org/apache/spark/internal/config/SparkConfigProvider.scala @@ -14,18 +14,22 @@ * See the License for the specific language governing permissions and * limitations under the License. */ +package org.apache.spark.internal.config -package org.apache.spark.sql.util +import java.util.{Map => JMap} -import org.apache.spark.SparkFunSuite -import org.apache.spark.sql.catalyst.util.truncatedString +import org.apache.spark.SparkConf -class UtilSuite extends SparkFunSuite { - test("truncatedString") { - assert(truncatedString(Nil, "[", ", ", "]", 2) == "[]") - assert(truncatedString(Seq(1, 2), "[", ", ", "]", 2) == "[1, 2]") - assert(truncatedString(Seq(1, 2, 3), "[", ", ", "]", 2) == "[1, ... 2 more fields]") - assert(truncatedString(Seq(1, 2, 3), "[", ", ", "]", -5) == "[, ... 3 more fields]") - assert(truncatedString(Seq(1, 2, 3), ", ", 10) == "1, 2, 3") +/** + * A config provider that only reads Spark config keys. 
+ */ +private[spark] class SparkConfigProvider(conf: JMap[String, String]) extends ConfigProvider { + + override def get(key: String): Option[String] = { + if (key.startsWith("spark.")) { + Option(conf.get(key)).orElse(SparkConf.getDeprecatedConfig(key, conf)) + } else { + None + } } } diff --git a/core/src/main/scala/org/apache/spark/internal/config/package.scala b/core/src/main/scala/org/apache/spark/internal/config/package.scala index 324ef701c4266..5dda7afc3ebcb 100644 --- a/core/src/main/scala/org/apache/spark/internal/config/package.scala +++ b/core/src/main/scala/org/apache/spark/internal/config/package.scala @@ -162,7 +162,7 @@ package object config { "PySpark shell.") .version("4.0.0") .booleanConf - .createWithDefault(true) + .createWithDefault(false) private[spark] val LEGACY_TASK_NAME_MDC_ENABLED = ConfigBuilder("spark.log.legacyTaskNameMdc.enabled") @@ -1023,8 +1023,7 @@ package object config { private[spark] val MAX_EXECUTOR_FAILURES = ConfigBuilder("spark.executor.maxNumFailures") .doc("The maximum number of executor failures before failing the application. " + - "This configuration only takes effect on YARN, or Kubernetes when " + - "`spark.kubernetes.allocation.pods.allocator` is set to 'direct'.") + "This configuration only takes effect on YARN and Kubernetes.") .version("3.5.0") .intConf .createOptional @@ -1032,8 +1031,8 @@ package object config { private[spark] val EXECUTOR_ATTEMPT_FAILURE_VALIDITY_INTERVAL_MS = ConfigBuilder("spark.executor.failuresValidityInterval") .doc("Interval after which executor failures will be considered independent and not " + - "accumulate towards the attempt count. This configuration only takes effect on YARN, " + - "or Kubernetes when `spark.kubernetes.allocation.pods.allocator` is set to 'direct'.") + "accumulate towards the attempt count. 
This configuration only takes effect on YARN " + + "and Kubernetes.") .version("3.5.0") .timeConf(TimeUnit.MILLISECONDS) .createOptional diff --git a/core/src/main/scala/org/apache/spark/rdd/HadoopRDD.scala b/core/src/main/scala/org/apache/spark/rdd/HadoopRDD.scala index 545eafe7a4449..01bc46fc0b623 100644 --- a/core/src/main/scala/org/apache/spark/rdd/HadoopRDD.scala +++ b/core/src/main/scala/org/apache/spark/rdd/HadoopRDD.scala @@ -25,11 +25,13 @@ import java.util.{Date, Locale} import scala.reflect.ClassTag import org.apache.hadoop.conf.{Configurable, Configuration} +import org.apache.hadoop.hdfs.BlockMissingException import org.apache.hadoop.io.compress.CompressionCodecFactory import org.apache.hadoop.mapred._ import org.apache.hadoop.mapred.lib.CombineFileSplit import org.apache.hadoop.mapreduce.TaskType import org.apache.hadoop.mapreduce.lib.input.FileInputFormat +import org.apache.hadoop.security.AccessControlException import org.apache.hadoop.util.ReflectionUtils import org.apache.spark._ @@ -319,6 +321,7 @@ class HadoopRDD[K, V]( null // Throw FileNotFoundException even if `ignoreCorruptFiles` is true case e: FileNotFoundException if !ignoreMissingFiles => throw e + case e @ (_ : AccessControlException | _ : BlockMissingException) => throw e case e: IOException if ignoreCorruptFiles => logWarning(log"Skipped the rest content in the corrupted file: " + log"${MDC(PATH, split.inputSplit)}", e) @@ -345,6 +348,7 @@ class HadoopRDD[K, V]( finished = true // Throw FileNotFoundException even if `ignoreCorruptFiles` is true case e: FileNotFoundException if !ignoreMissingFiles => throw e + case e @ (_ : AccessControlException | _ : BlockMissingException) => throw e case e: IOException if ignoreCorruptFiles => logWarning(log"Skipped the rest content in the corrupted file: " + log"${MDC(PATH, split.inputSplit)}", e) diff --git a/core/src/main/scala/org/apache/spark/rdd/NewHadoopRDD.scala b/core/src/main/scala/org/apache/spark/rdd/NewHadoopRDD.scala index 
2b6f322d1805d..d619602305890 100644 --- a/core/src/main/scala/org/apache/spark/rdd/NewHadoopRDD.scala +++ b/core/src/main/scala/org/apache/spark/rdd/NewHadoopRDD.scala @@ -26,12 +26,14 @@ import scala.jdk.CollectionConverters._ import scala.reflect.ClassTag import org.apache.hadoop.conf.{Configurable, Configuration} +import org.apache.hadoop.hdfs.BlockMissingException import org.apache.hadoop.io.Writable import org.apache.hadoop.io.compress.CompressionCodecFactory import org.apache.hadoop.mapred.JobConf import org.apache.hadoop.mapreduce._ import org.apache.hadoop.mapreduce.lib.input.{CombineFileSplit, FileInputFormat, FileSplit, InvalidInputException} import org.apache.hadoop.mapreduce.task.{JobContextImpl, TaskAttemptContextImpl} +import org.apache.hadoop.security.AccessControlException import org.apache.spark._ import org.apache.spark.annotation.DeveloperApi @@ -255,6 +257,7 @@ class NewHadoopRDD[K, V]( null // Throw FileNotFoundException even if `ignoreCorruptFiles` is true case e: FileNotFoundException if !ignoreMissingFiles => throw e + case e @ (_ : AccessControlException | _ : BlockMissingException) => throw e case e: IOException if ignoreCorruptFiles => logWarning( log"Skipped the rest content in the corrupted file: " + @@ -284,6 +287,7 @@ class NewHadoopRDD[K, V]( finished = true // Throw FileNotFoundException even if `ignoreCorruptFiles` is true case e: FileNotFoundException if !ignoreMissingFiles => throw e + case e @ (_ : AccessControlException | _ : BlockMissingException) => throw e case e: IOException if ignoreCorruptFiles => logWarning( log"Skipped the rest content in the corrupted file: " + diff --git a/core/src/main/scala/org/apache/spark/rdd/RDD.scala b/core/src/main/scala/org/apache/spark/rdd/RDD.scala index 0db0133f632bf..80db818b77e42 100644 --- a/core/src/main/scala/org/apache/spark/rdd/RDD.scala +++ b/core/src/main/scala/org/apache/spark/rdd/RDD.scala @@ -1834,8 +1834,9 @@ abstract class RDD[T: ClassTag]( * Please read the linked SPIP and 
design docs to understand the limitations and future plans. * @return an [[RDDBarrier]] instance that provides actions within a barrier stage * @see [[org.apache.spark.BarrierTaskContext]] - * @see SPIP: Barrier Execution Mode - * @see Design Doc + * @see + * SPIP: Barrier Execution Mode + * @see Design Doc */ @Experimental @Since("2.4.0") diff --git a/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala b/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala index 4f7338f74e298..aee92ba928b4a 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala @@ -1382,9 +1382,9 @@ private[spark] class DAGScheduler( logInfo( log"Got job ${MDC(JOB_ID, job.jobId)} (${MDC(CALL_SITE_SHORT_FORM, callSite.shortForm)}) " + log"with ${MDC(NUM_PARTITIONS, partitions.length)} output partitions") - logInfo(log"Final stage: ${MDC(STAGE_ID, finalStage)} " + + logInfo(log"Final stage: ${MDC(STAGE, finalStage)} " + log"(${MDC(STAGE_NAME, finalStage.name)})") - logInfo(log"Parents of final stage: ${MDC(STAGE_ID, finalStage.parents)}") + logInfo(log"Parents of final stage: ${MDC(STAGES, finalStage.parents)}") logInfo(log"Missing parents: ${MDC(MISSING_PARENT_STAGES, getMissingParentStages(finalStage))}") val jobSubmissionTime = clock.getTimeMillis() @@ -1465,7 +1465,7 @@ private[spark] class DAGScheduler( val missing = getMissingParentStages(stage).sortBy(_.id) logDebug("missing: " + missing) if (missing.isEmpty) { - logInfo(log"Submitting ${MDC(STAGE_ID, stage)} (${MDC(RDD_ID, stage.rdd)}), " + + logInfo(log"Submitting ${MDC(STAGE, stage)} (${MDC(RDD_ID, stage.rdd)}), " + log"which has no missing parents") submitMissingTasks(stage, jobId.get) } else { @@ -1517,12 +1517,12 @@ private[spark] class DAGScheduler( val shuffleId = stage.shuffleDep.shuffleId val shuffleMergeId = stage.shuffleDep.shuffleMergeId if (stage.shuffleDep.shuffleMergeEnabled) { - logInfo(log"Shuffle merge 
enabled before starting the stage for ${MDC(STAGE_ID, stage)}" + + logInfo(log"Shuffle merge enabled before starting the stage for ${MDC(STAGE, stage)}" + log" with shuffle ${MDC(SHUFFLE_ID, shuffleId)} and shuffle merge" + log" ${MDC(SHUFFLE_MERGE_ID, shuffleMergeId)} with" + log" ${MDC(NUM_MERGER_LOCATIONS, stage.shuffleDep.getMergerLocs.size.toString)} merger locations") } else { - logInfo(log"Shuffle merge disabled for ${MDC(STAGE_ID, stage)} with " + + logInfo(log"Shuffle merge disabled for ${MDC(STAGE, stage)} with " + log"shuffle ${MDC(SHUFFLE_ID, shuffleId)} and " + log"shuffle merge ${MDC(SHUFFLE_MERGE_ID, shuffleMergeId)}, " + log"but can get enabled later adaptively once enough " + @@ -1583,7 +1583,7 @@ private[spark] class DAGScheduler( // merger locations but the corresponding shuffle map stage did not complete // successfully, we would still enable push for its retry. s.shuffleDep.setShuffleMergeAllowed(false) - logInfo(log"Push-based shuffle disabled for ${MDC(STAGE_ID, stage)} " + + logInfo(log"Push-based shuffle disabled for ${MDC(STAGE, stage)} " + log"(${MDC(STAGE_NAME, stage.name)}) since it is already shuffle merge finalized") } } @@ -1707,7 +1707,7 @@ private[spark] class DAGScheduler( if (tasks.nonEmpty) { logInfo(log"Submitting ${MDC(NUM_TASKS, tasks.size)} missing tasks from " + - log"${MDC(STAGE_ID, stage)} (${MDC(RDD_ID, stage.rdd)}) (first 15 tasks are " + + log"${MDC(STAGE, stage)} (${MDC(RDD_ID, stage.rdd)}) (first 15 tasks are " + log"for partitions ${MDC(PARTITION_IDS, tasks.take(15).map(_.partitionId))})") val shuffleId = stage match { case s: ShuffleMapStage => Some(s.shuffleDep.shuffleId) @@ -1964,7 +1964,7 @@ private[spark] class DAGScheduler( } catch { case e: UnsupportedOperationException => logWarning(log"Could not cancel tasks " + - log"for stage ${MDC(STAGE_ID, stageId)}", e) + log"for stage ${MDC(STAGE, stageId)}", e) } listenerBus.post( SparkListenerJobEnd(job.jobId, clock.getTimeMillis(), JobSucceeded)) @@ -1996,7 +1996,7 
@@ private[spark] class DAGScheduler( logDebug("ShuffleMapTask finished on " + execId) if (executorFailureEpoch.contains(execId) && smt.epoch <= executorFailureEpoch(execId)) { - logInfo(log"Ignoring possibly bogus ${MDC(STAGE_ID, smt)} completion from " + + logInfo(log"Ignoring possibly bogus ${MDC(STAGE, smt)} completion from " + log"executor ${MDC(EXECUTOR_ID, execId)}") } else { // The epoch of the task is acceptable (i.e., the task was launched after the most @@ -2026,8 +2026,8 @@ private[spark] class DAGScheduler( if (failedStage.latestInfo.attemptNumber() != task.stageAttemptId) { logInfo(log"Ignoring fetch failure from " + log"${MDC(TASK_ID, task)} as it's from " + - log"${MDC(STAGE_ID, failedStage)} attempt " + - log"${MDC(STAGE_ATTEMPT, task.stageAttemptId)} and there is a more recent attempt for " + + log"${MDC(FAILED_STAGE, failedStage)} attempt " + + log"${MDC(STAGE_ATTEMPT_ID, task.stageAttemptId)} and there is a more recent attempt for " + log"that stage (attempt " + log"${MDC(NUM_ATTEMPT, failedStage.latestInfo.attemptNumber())}) running") } else { @@ -2035,8 +2035,8 @@ private[spark] class DAGScheduler( isExecutorDecommissioningOrDecommissioned(taskScheduler, bmAddress) if (ignoreStageFailure) { logInfo(log"Ignoring fetch failure from ${MDC(TASK_NAME, task)} of " + - log"${MDC(STAGE, failedStage)} attempt " + - log"${MDC(STAGE_ATTEMPT, task.stageAttemptId)} when count " + + log"${MDC(FAILED_STAGE, failedStage)} attempt " + + log"${MDC(STAGE_ATTEMPT_ID, task.stageAttemptId)} when count " + log"${MDC(MAX_ATTEMPTS, config.STAGE_MAX_CONSECUTIVE_ATTEMPTS.key)} " + log"as executor ${MDC(EXECUTOR_ID, bmAddress.executorId)} is decommissioned and " + log"${MDC(CONFIG, config.STAGE_IGNORE_DECOMMISSION_FETCH_FAILURE.key)}=true") @@ -2937,7 +2937,8 @@ private[spark] class DAGScheduler( } else { // This stage is only used by the job, so finish the stage if it is running. 
val stage = stageIdToStage(stageId) - if (runningStages.contains(stage)) { + // Stages with failedAttemptIds may have tasks that are running + if (runningStages.contains(stage) || stage.failedAttemptIds.nonEmpty) { try { // killAllTaskAttempts will fail if a SchedulerBackend does not implement killTask taskScheduler.killAllTaskAttempts(stageId, shouldInterruptTaskThread(job), reason) if (legacyAbortStageAfterKillTasks) { diff --git a/core/src/main/scala/org/apache/spark/scheduler/OutputCommitCoordinator.scala b/core/src/main/scala/org/apache/spark/scheduler/OutputCommitCoordinator.scala index df28a97a349ea..a769c3fa14b62 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/OutputCommitCoordinator.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/OutputCommitCoordinator.scala @@ -149,7 +149,7 @@ private[spark] class OutputCommitCoordinator(conf: SparkConf, isDriver: Boolean) // The task output has been committed successfully case _: TaskCommitDenied => logInfo(log"Task was denied committing, stage: ${MDC(LogKeys.STAGE_ID, stage)}." + - log"${MDC(LogKeys.STAGE_ATTEMPT, stageAttempt)}, " + + log"${MDC(LogKeys.STAGE_ATTEMPT_ID, stageAttempt)}, " + log"partition: ${MDC(LogKeys.PARTITION_ID, partition)}, " + log"attempt: ${MDC(LogKeys.NUM_ATTEMPT, attemptNumber)}") case _ => @@ -181,7 +181,7 @@ private[spark] class OutputCommitCoordinator(conf: SparkConf, isDriver: Boolean) stageStates.get(stage) match { case Some(state) if attemptFailed(state, stageAttempt, partition, attemptNumber) => logInfo(log"Commit denied for stage=${MDC(LogKeys.STAGE_ID, stage)}." 
+ - log"${MDC(LogKeys.STAGE_ATTEMPT, stageAttempt)}, partition=" + + log"${MDC(LogKeys.STAGE_ATTEMPT_ID, stageAttempt)}, partition=" + log"${MDC(LogKeys.PARTITION_ID, partition)}: task attempt " + log"${MDC(LogKeys.NUM_ATTEMPT, attemptNumber)} already marked as failed.") false diff --git a/core/src/main/scala/org/apache/spark/scheduler/Task.scala b/core/src/main/scala/org/apache/spark/scheduler/Task.scala index f511aed6d2166..e21ec77ce69ec 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/Task.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/Task.scala @@ -22,6 +22,7 @@ import java.util.Properties import org.apache.spark._ import org.apache.spark.executor.TaskMetrics +import org.apache.spark.internal.{Logging, LogKeys, MDC} import org.apache.spark.internal.config.APP_CALLER_CONTEXT import org.apache.spark.internal.plugin.PluginContainer import org.apache.spark.memory.{MemoryMode, TaskMemoryManager} @@ -70,7 +71,7 @@ private[spark] abstract class Task[T]( val jobId: Option[Int] = None, val appId: Option[String] = None, val appAttemptId: Option[String] = None, - val isBarrier: Boolean = false) extends Serializable { + val isBarrier: Boolean = false) extends Serializable with Logging { @transient lazy val metrics: TaskMetrics = SparkEnv.get.closureSerializer.newInstance().deserialize(ByteBuffer.wrap(serializedTaskMetrics)) @@ -231,10 +232,19 @@ private[spark] abstract class Task[T]( require(reason != null) _reasonIfKilled = reason if (context != null) { - context.markInterrupted(reason) - } - if (interruptThread && taskThread != null) { - taskThread.interrupt() + TaskContext.synchronized { + if (context.interruptible()) { + context.markInterrupted(reason) + if (interruptThread && taskThread != null) { + taskThread.interrupt() + } + } else { + logInfo(log"Task ${MDC(LogKeys.TASK_ID, context.taskAttemptId())} " + + log"is currently not interruptible. 
") + val threadToInterrupt = if (interruptThread) Option(taskThread) else None + context.pendingInterrupt(threadToInterrupt, reason) + } + } } } } diff --git a/core/src/main/scala/org/apache/spark/scheduler/TaskSchedulerImpl.scala b/core/src/main/scala/org/apache/spark/scheduler/TaskSchedulerImpl.scala index 8e3cb1379339d..43193dc5366a4 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/TaskSchedulerImpl.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/TaskSchedulerImpl.scala @@ -325,7 +325,7 @@ private[spark] class TaskSchedulerImpl( } tsm.suspend() logInfo(log"Stage ${MDC(LogKeys.STAGE_ID, stageId)}." + - log"${MDC(LogKeys.STAGE_ATTEMPT, tsm.taskSet.stageAttemptId)} was cancelled") + log"${MDC(LogKeys.STAGE_ATTEMPT_ID, tsm.taskSet.stageAttemptId)} was cancelled") } } } diff --git a/core/src/main/scala/org/apache/spark/scheduler/TaskSet.scala b/core/src/main/scala/org/apache/spark/scheduler/TaskSet.scala index 2474a1342eb2e..3513cb1f93764 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/TaskSet.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/TaskSet.scala @@ -19,7 +19,7 @@ package org.apache.spark.scheduler import java.util.Properties -import org.apache.spark.internal.LogKeys.{STAGE_ATTEMPT, STAGE_ID} +import org.apache.spark.internal.LogKeys.{STAGE_ATTEMPT_ID, STAGE_ID} import org.apache.spark.internal.MessageWithContext /** @@ -42,7 +42,7 @@ private[spark] class TaskSet( lazy val logId: MessageWithContext = { val hashMap = new java.util.HashMap[String, String]() hashMap.put(STAGE_ID.name, stageId.toString) - hashMap.put(STAGE_ATTEMPT.name, stageAttemptId.toString) + hashMap.put(STAGE_ATTEMPT_ID.name, stageAttemptId.toString) MessageWithContext(id, hashMap) } } diff --git a/core/src/main/scala/org/apache/spark/scheduler/TaskSetManager.scala b/core/src/main/scala/org/apache/spark/scheduler/TaskSetManager.scala index fdc82285b76bb..0eaf138d3eb8d 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/TaskSetManager.scala 
+++ b/core/src/main/scala/org/apache/spark/scheduler/TaskSetManager.scala @@ -1001,7 +1001,7 @@ private[spark] class TaskSetManager( logError( log"Task ${MDC(TASK_INDEX, info.index)}.${MDC(TASK_ATTEMPT_ID, info.attemptNumber)} " + log"in stage ${MDC(STAGE_ID, taskSet.stageId)}." + - log"${MDC(STAGE_ATTEMPT, taskSet.stageAttemptId)} (TID ${MDC(TASK_ID, tid)}) " + + log"${MDC(STAGE_ATTEMPT_ID, taskSet.stageAttemptId)} (TID ${MDC(TASK_ID, tid)}) " + log"can not write to output file: ${MDC(ERROR, ef.description)}; not retrying") emptyTaskInfoAccumulablesAndNotifyDagScheduler(tid, tasks(index), reason, null, accumUpdates, metricPeaks) diff --git a/core/src/main/scala/org/apache/spark/shuffle/IndexShuffleBlockResolver.scala b/core/src/main/scala/org/apache/spark/shuffle/IndexShuffleBlockResolver.scala index 30bc1382fb021..bf3117a9a9b12 100644 --- a/core/src/main/scala/org/apache/spark/shuffle/IndexShuffleBlockResolver.scala +++ b/core/src/main/scala/org/apache/spark/shuffle/IndexShuffleBlockResolver.scala @@ -85,6 +85,9 @@ private[spark] class IndexShuffleBlockResolver( private val remoteShuffleMaxDisk: Option[Long] = conf.get(config.STORAGE_DECOMMISSION_SHUFFLE_MAX_DISK_SIZE) + private val checksumEnabled = conf.get(config.SHUFFLE_CHECKSUM_ENABLED) + private lazy val algorithm = conf.get(config.SHUFFLE_CHECKSUM_ALGORITHM) + def getDataFile(shuffleId: Int, mapId: Long): File = getDataFile(shuffleId, mapId, None) /** @@ -195,9 +198,11 @@ private[spark] class IndexShuffleBlockResolver( logWarning(log"Error deleting index ${MDC(PATH, file.getPath())}") } - file = getChecksumFile(shuffleId, mapId, conf.get(config.SHUFFLE_CHECKSUM_ALGORITHM)) - if (file.exists() && !file.delete()) { - logWarning(log"Error deleting checksum ${MDC(PATH, file.getPath())}") + if (checksumEnabled) { + file = getChecksumFile(shuffleId, mapId, algorithm) + if (file.exists() && !file.delete()) { + logWarning(log"Error deleting checksum ${MDC(PATH, file.getPath())}") + } } } @@ -396,8 +401,7 @@ 
private[spark] class IndexShuffleBlockResolver( val (checksumFileOpt, checksumTmpOpt) = if (checksumEnabled) { assert(lengths.length == checksums.length, "The size of partition lengths and checksums should be equal") - val checksumFile = - getChecksumFile(shuffleId, mapId, conf.get(config.SHUFFLE_CHECKSUM_ALGORITHM)) + val checksumFile = getChecksumFile(shuffleId, mapId, algorithm) (Some(checksumFile), Some(createTempFile(checksumFile))) } else { (None, None) diff --git a/core/src/main/scala/org/apache/spark/shuffle/sort/SortShuffleManager.scala b/core/src/main/scala/org/apache/spark/shuffle/sort/SortShuffleManager.scala index efffda43695cc..6902fb6d236de 100644 --- a/core/src/main/scala/org/apache/spark/shuffle/sort/SortShuffleManager.scala +++ b/core/src/main/scala/org/apache/spark/shuffle/sort/SortShuffleManager.scala @@ -74,12 +74,6 @@ private[spark] class SortShuffleManager(conf: SparkConf) extends ShuffleManager import SortShuffleManager._ - if (!conf.getBoolean("spark.shuffle.spill", true)) { - logWarning( - "spark.shuffle.spill was set to false, but this configuration is ignored as of Spark 1.6+." + - " Shuffle will continue to spill to disk when necessary.") - } - /** * A mapping from shuffle ids to the task ids of mappers producing output for those shuffles. 
*/ diff --git a/core/src/main/scala/org/apache/spark/ui/HttpSecurityFilter.scala b/core/src/main/scala/org/apache/spark/ui/HttpSecurityFilter.scala index 551f0eb98cb87..cf881b6ea9900 100644 --- a/core/src/main/scala/org/apache/spark/ui/HttpSecurityFilter.scala +++ b/core/src/main/scala/org/apache/spark/ui/HttpSecurityFilter.scala @@ -44,10 +44,6 @@ private class HttpSecurityFilter( conf: SparkConf, securityMgr: SecurityManager) extends Filter { - override def destroy(): Unit = { } - - override def init(config: FilterConfig): Unit = { } - override def doFilter(req: ServletRequest, res: ServletResponse, chain: FilterChain): Unit = { val hreq = req.asInstanceOf[HttpServletRequest] val hres = res.asInstanceOf[HttpServletResponse] diff --git a/core/src/main/scala/org/apache/spark/ui/jobs/TaskThreadDumpPage.scala b/core/src/main/scala/org/apache/spark/ui/jobs/TaskThreadDumpPage.scala index 49b919ce0de97..5fe542d4fa21e 100644 --- a/core/src/main/scala/org/apache/spark/ui/jobs/TaskThreadDumpPage.scala +++ b/core/src/main/scala/org/apache/spark/ui/jobs/TaskThreadDumpPage.scala @@ -71,7 +71,7 @@ private[spark] class TaskThreadDumpPage(
-

Updated at{UIUtils.formatDate(time)}

+

Updated at {UIUtils.formatDate(time)}

diff --git a/core/src/main/scala/org/apache/spark/util/TransientLazy.scala b/core/src/main/scala/org/apache/spark/util/BestEffortLazyVal.scala similarity index 50% rename from core/src/main/scala/org/apache/spark/util/TransientLazy.scala rename to core/src/main/scala/org/apache/spark/util/BestEffortLazyVal.scala index 2833ef93669a6..83044055fe404 100644 --- a/core/src/main/scala/org/apache/spark/util/TransientLazy.scala +++ b/core/src/main/scala/org/apache/spark/util/BestEffortLazyVal.scala @@ -16,15 +16,21 @@ */ package org.apache.spark.util +import java.util.concurrent.atomic.AtomicReference + /** - * Construct to lazily initialize a variable. - * This may be helpful for avoiding deadlocks in certain scenarios. For example, - * a) Thread 1 entered a synchronized method, grabbing a coarse lock on the parent object. - * b) Thread 2 gets spawned off, and tries to initialize a lazy value on the same parent object - * (in our case, this was the logger). This causes scala to also try to grab a coarse lock on - * the parent object. - * c) If thread 1 waits for thread 2 to join, a deadlock occurs. - * The main difference between this and [[LazyTry]] is that this does not cache failures. + * A lock-free implementation of a lazily-initialized variable. + * If there are concurrent initializations then the `compute()` function may be invoked + * multiple times. However, only a single `compute()` result will be stored and all readers + * will receive the same result object instance. + * + * This may be helpful for avoiding deadlocks in certain scenarios where exactly-once + * value computation is not a hard requirement. + * + * @note + * This helper class has additional requirements on the compute function: + * 1) The compute function MUST not return null; + * 2) The computation failure is not cached. * * @note * Scala 3 uses a different implementation of lazy vals which doesn't have this problem. 
@@ -32,12 +38,24 @@ package org.apache.spark.util * href="https://docs.scala-lang.org/scala3/reference/changed-features/lazy-vals-init.html">Lazy * Vals Initialization for more details. */ -private[spark] class TransientLazy[T](initializer: => T) extends Serializable { +private[spark] class BestEffortLazyVal[T <: AnyRef]( + @volatile private[this] var compute: () => T) extends Serializable { - @transient - private[this] lazy val value: T = initializer + private[this] val cached: AtomicReference[T] = new AtomicReference(null.asInstanceOf[T]) def apply(): T = { - value + val value = cached.get() + if (value != null) { + value + } else { + val f = compute + if (f != null) { + val newValue = f() + assert(newValue != null, "compute function cannot return null.") + cached.compareAndSet(null.asInstanceOf[T], newValue) + compute = null // allow closure to be GC'd + } + cached.get() + } } } diff --git a/core/src/main/scala/org/apache/spark/util/HadoopFSUtils.scala b/core/src/main/scala/org/apache/spark/util/HadoopFSUtils.scala index f8f5bb4f72a40..5e50361b278aa 100644 --- a/core/src/main/scala/org/apache/spark/util/HadoopFSUtils.scala +++ b/core/src/main/scala/org/apache/spark/util/HadoopFSUtils.scala @@ -241,6 +241,13 @@ private[spark] object HadoopFSUtils extends Logging { logWarning(log"The directory ${MDC(PATH, path)} " + log"was not found. 
Was it deleted very recently?") Array.empty[FileStatus] + case u: UnsupportedOperationException => + throw new SparkUnsupportedOperationException( + errorClass = "FAILED_READ_FILE.UNSUPPORTED_FILE_SYSTEM", + messageParameters = Map( + "path" -> path.toString, + "fileSystemClass" -> fs.getClass.getName, + "method" -> u.getStackTrace.head.getMethodName)) } val filteredStatuses = diff --git a/core/src/main/scala/org/apache/spark/util/NonFateSharingCache.scala b/core/src/main/scala/org/apache/spark/util/NonFateSharingCache.scala index 21184d70b386a..7d01facc1e421 100644 --- a/core/src/main/scala/org/apache/spark/util/NonFateSharingCache.scala +++ b/core/src/main/scala/org/apache/spark/util/NonFateSharingCache.scala @@ -17,7 +17,7 @@ package org.apache.spark.util -import java.util.concurrent.Callable +import java.util.concurrent.{Callable, TimeUnit} import com.google.common.cache.{Cache, CacheBuilder, CacheLoader, LoadingCache} @@ -68,6 +68,20 @@ private[spark] object NonFateSharingCache { override def load(k: K): V = loadingFunc.apply(k) })) } + + def apply[K, V]( + maximumSize: Long, + expireAfterAccessTime: Long, + expireAfterAccessTimeUnit: TimeUnit): NonFateSharingCache[K, V] = { + val builder = CacheBuilder.newBuilder().asInstanceOf[CacheBuilder[K, V]] + if (maximumSize > 0L) { + builder.maximumSize(maximumSize) + } + if(expireAfterAccessTime > 0) { + builder.expireAfterAccess(expireAfterAccessTime, expireAfterAccessTimeUnit) + } + new NonFateSharingCache(builder.build[K, V]()) + } } private[spark] class NonFateSharingCache[K, V](protected val cache: Cache[K, V]) { diff --git a/core/src/main/scala/org/apache/spark/util/TransientBestEffortLazyVal.scala b/core/src/main/scala/org/apache/spark/util/TransientBestEffortLazyVal.scala new file mode 100644 index 0000000000000..033b783ede40b --- /dev/null +++ b/core/src/main/scala/org/apache/spark/util/TransientBestEffortLazyVal.scala @@ -0,0 +1,69 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or 
more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.util + +import java.io.{IOException, ObjectInputStream} +import java.util.concurrent.atomic.AtomicReference + +/** + * A lock-free implementation of a lazily-initialized variable. + * If there are concurrent initializations then the `compute()` function may be invoked + * multiple times. However, only a single `compute()` result will be stored and all readers + * will receive the same result object instance. + * + * This may be helpful for avoiding deadlocks in certain scenarios where exactly-once + * value computation is not a hard requirement. + * + * The main difference between this and [[BestEffortLazyVal]] is that: + * [[BestEffortLazyVal]] serializes the cached value after computation, while + * [[TransientBestEffortLazyVal]] always serializes the compute function. + * + * @note + * This helper class has additional requirements on the compute function: + * 1) The compute function MUST not return null; + * 2) The computation failure is not cached. + * + * @note + * Scala 3 uses a different implementation of lazy vals which doesn't have this problem. + * Please refer to Lazy + * Vals Initialization for more details. 
+ */ +private[spark] class TransientBestEffortLazyVal[T <: AnyRef]( + private[this] val compute: () => T) extends Serializable { + + @transient + private[this] var cached: AtomicReference[T] = new AtomicReference(null.asInstanceOf[T]) + + def apply(): T = { + val value = cached.get() + if (value != null) { + value + } else { + val newValue = compute() + assert(newValue != null, "compute function cannot return null.") + cached.compareAndSet(null.asInstanceOf[T], newValue) + cached.get() + } + } + + @throws(classOf[IOException]) + private def readObject(ois: ObjectInputStream): Unit = Utils.tryOrIOException { + ois.defaultReadObject() + cached = new AtomicReference(null.asInstanceOf[T]) + } +} diff --git a/core/src/main/scala/org/apache/spark/util/Utils.scala b/core/src/main/scala/org/apache/spark/util/Utils.scala index 5703128aacbb9..1efe181a8c38a 100644 --- a/core/src/main/scala/org/apache/spark/util/Utils.scala +++ b/core/src/main/scala/org/apache/spark/util/Utils.scala @@ -103,7 +103,8 @@ private[spark] object Utils with SparkErrorUtils with SparkFileUtils with SparkSerDeUtils - with SparkStreamUtils { + with SparkStreamUtils + with SparkStringUtils { private val sparkUncaughtExceptionHandler = new SparkUncaughtExceptionHandler @volatile private var cachedLocalDir: String = "" @@ -1354,8 +1355,10 @@ private[spark] object Utils val TRY_WITH_CALLER_STACKTRACE_FULL_STACKTRACE = "Full stacktrace of original doTryWithCallerStacktrace caller" - val TRY_WITH_CALLER_STACKTRACE_TRY_STACKTRACE = - "Stacktrace under doTryWithCallerStacktrace" + class OriginalTryStackTraceException() + extends Exception(TRY_WITH_CALLER_STACKTRACE_FULL_STACKTRACE) { + var doTryWithCallerStacktraceDepth: Int = 0 + } /** * Use Try with stacktrace substitution for the caller retrieving the error. 
@@ -1383,14 +1386,19 @@ private[spark] object Utils val commonSuffixLen = origStackTrace.reverse.zip(currentStackTrace.reverse).takeWhile { case (exElem, currentElem) => exElem == currentElem }.length - val belowEx = new Exception(TRY_WITH_CALLER_STACKTRACE_TRY_STACKTRACE) - belowEx.setStackTrace(origStackTrace.dropRight(commonSuffixLen)) - ex.addSuppressed(belowEx) - - // keep the full original stack trace in a suppressed exception. - val fullEx = new Exception(TRY_WITH_CALLER_STACKTRACE_FULL_STACKTRACE) - fullEx.setStackTrace(origStackTrace) - ex.addSuppressed(fullEx) + // Add the full stack trace of the original caller as the suppressed exception. + // It may already be there if it's a nested call to doTryWithCallerStacktrace. + val origEx = ex.getSuppressed.find { e => + e.isInstanceOf[OriginalTryStackTraceException] + }.getOrElse { + val fullEx = new OriginalTryStackTraceException() + fullEx.setStackTrace(origStackTrace) + ex.addSuppressed(fullEx) + fullEx + }.asInstanceOf[OriginalTryStackTraceException] + // Update the depth of the stack of the current doTryWithCallerStacktrace, for stitching + // it with the stack of getTryWithCallerStacktrace. 
+ origEx.doTryWithCallerStacktraceDepth = origStackTrace.size - commonSuffixLen case Success(_) => // nothing } t @@ -1406,7 +1414,7 @@ private[spark] object Utils * Full stack trace of the original doTryWithCallerStacktrace caller can be retrieved with * ``` * ex.getSuppressed.find { e => - * e.getMessage == Utils.TRY_WITH_CALLER_STACKTRACE_FULL_STACKTRACE + * e.isInstanceOf[Utils.OriginalTryStackTraceException] * } * ``` * @@ -1416,13 +1424,15 @@ private[spark] object Utils */ def getTryWithCallerStacktrace[T](t: Try[T]): T = t match { case Failure(ex) => - val belowStacktrace = ex.getSuppressed.find { e => + val originalStacktraceEx = ex.getSuppressed.find { e => // added in doTryWithCallerStacktrace - e.getMessage == TRY_WITH_CALLER_STACKTRACE_TRY_STACKTRACE + e.isInstanceOf[OriginalTryStackTraceException] }.getOrElse { // If we don't have the expected stacktrace information, just rethrow throw ex - }.getStackTrace + }.asInstanceOf[OriginalTryStackTraceException] + val belowStacktrace = originalStacktraceEx.getStackTrace + .take(originalStacktraceEx.doTryWithCallerStacktraceDepth) // We are modifying and throwing the original exception. It would be better if we could // return a copy, but we can't easily clone it and preserve. If this is accessed from // multiple threads that then look at the stack trace, this could break. @@ -2512,7 +2522,7 @@ private[spark] object Utils * * @return whether it is local mode */ - def isLocalMaster(conf: SparkConf): Boolean = { + def isLocalMaster(conf: ReadOnlySparkConf): Boolean = { val master = conf.get("spark.master", "") master == "local" || master.startsWith("local[") } @@ -2596,7 +2606,7 @@ private[spark] object Utils /** * Return whether dynamic allocation is enabled in the given conf. 
*/ - def isDynamicAllocationEnabled(conf: SparkConf): Boolean = { + def isDynamicAllocationEnabled(conf: ReadOnlySparkConf): Boolean = { val dynamicAllocationEnabled = conf.get(DYN_ALLOCATION_ENABLED) dynamicAllocationEnabled && (!isLocalMaster(conf) || conf.get(DYN_ALLOCATION_TESTING)) @@ -2679,7 +2689,7 @@ private[spark] object Utils * loading SparkConf. */ def resetStructuredLogging(sparkConf: SparkConf): Unit = { - if (sparkConf.getBoolean(STRUCTURED_LOGGING_ENABLED.key, defaultValue = true)) { + if (sparkConf.get(STRUCTURED_LOGGING_ENABLED)) { Logging.enableStructuredLogging() } else { Logging.disableStructuredLogging() @@ -2799,10 +2809,6 @@ private[spark] object Utils } } - def stringToSeq(str: String): Seq[String] = { - str.split(",").map(_.trim()).filter(_.nonEmpty).toImmutableArraySeq - } - /** * Create instances of extension classes. * @@ -2948,6 +2954,15 @@ private[spark] object Utils str.replaceAll("[ :/]", "-").replaceAll("[.${}'\"]", "_").toLowerCase(Locale.ROOT) } + def nameForAppAndAttempt(appId: String, appAttemptId: Option[String]): String = { + val base = sanitizeDirName(appId) + if (appAttemptId.isDefined) { + base + "_" + sanitizeDirName(appAttemptId.get) + } else { + base + } + } + def isClientMode(conf: SparkConf): Boolean = { "client".equals(conf.get(SparkLauncher.DEPLOY_MODE, "client")) } @@ -2983,7 +2998,7 @@ private[spark] object Utils return props } val resultProps = new Properties() - props.forEach((k, v) => resultProps.put(k, v)) + resultProps.putAll(props.clone().asInstanceOf[Properties]) resultProps } diff --git a/core/src/main/scala/org/apache/spark/util/collection/Spillable.scala b/core/src/main/scala/org/apache/spark/util/collection/Spillable.scala index c3d648dccea73..7f2a1a8419a71 100644 --- a/core/src/main/scala/org/apache/spark/util/collection/Spillable.scala +++ b/core/src/main/scala/org/apache/spark/util/collection/Spillable.scala @@ -146,6 +146,6 @@ private[spark] abstract class Spillable[C](taskMemoryManager: 
TaskMemoryManager) logInfo(log"Thread ${MDC(LogKeys.THREAD_ID, threadId)} " + log"spilling in-memory map of ${MDC(LogKeys.BYTE_SIZE, org.apache.spark.util.Utils.bytesToString(size))} to disk " + - log"(${MDC(LogKeys.SPILL_TIMES, _spillCount)} times so far)") + log"(${MDC(LogKeys.NUM_SPILLS, _spillCount)} times so far)") } } diff --git a/core/src/test/scala/org/apache/spark/JobCancellationSuite.scala b/core/src/test/scala/org/apache/spark/JobCancellationSuite.scala index ca51e61f5ed44..65ed2684a5b00 100644 --- a/core/src/test/scala/org/apache/spark/JobCancellationSuite.scala +++ b/core/src/test/scala/org/apache/spark/JobCancellationSuite.scala @@ -17,6 +17,7 @@ package org.apache.spark +import java.io.{File, FileOutputStream, InputStream, ObjectOutputStream} import java.util.concurrent.{Semaphore, TimeUnit} import java.util.concurrent.atomic.AtomicInteger @@ -35,7 +36,7 @@ import org.apache.spark.executor.ExecutorExitCode import org.apache.spark.internal.config._ import org.apache.spark.internal.config.Deploy._ import org.apache.spark.scheduler.{JobFailed, SparkListener, SparkListenerExecutorRemoved, SparkListenerJobEnd, SparkListenerJobStart, SparkListenerStageCompleted, SparkListenerTaskEnd, SparkListenerTaskStart} -import org.apache.spark.util.ThreadUtils +import org.apache.spark.util.{ThreadUtils, Utils} /** * Test suite for cancelling running jobs. 
We run the cancellation tasks for single job action @@ -712,6 +713,142 @@ class JobCancellationSuite extends SparkFunSuite with Matchers with BeforeAndAft assert(executionOfInterruptibleCounter.get() < numElements) } + Seq(true, false).foreach { interruptible => + + val (hint1, hint2) = if (interruptible) { + (" not", "") + } else { + ("", " not") + } + + val testName = s"SPARK-50768:$hint1 use TaskContext.createResourceUninterruptibly " + + s"would$hint2 cause stream leak on task interruption" + + test(testName) { + import org.apache.spark.JobCancellationSuite._ + withTempDir { dir => + + // `InterruptionSensitiveInputStream` is designed to easily leak the underlying + // stream when task thread interruption happens during its initialization, as + // the reference to the underlying stream is intentionally not available to + // `InterruptionSensitiveInputStream` at that point. + class InterruptionSensitiveInputStream(fileHint: String) extends InputStream { + private var underlying: InputStream = _ + + def initialize(): InputStream = { + val in: InputStream = new InputStream { + + open() + + private def dumpFile(typeName: String): Unit = { + var fileOut: FileOutputStream = null + var objOut: ObjectOutputStream = null + try { + val file = new File(dir, s"$typeName.$fileHint") + fileOut = new FileOutputStream(file) + objOut = new ObjectOutputStream(fileOut) + objOut.writeBoolean(true) + objOut.flush() + } finally { + if (fileOut != null) { + fileOut.close() + } + if (objOut != null) { + objOut.close() + } + } + + } + + private def open(): Unit = { + dumpFile("open") + } + + override def close(): Unit = { + dumpFile("close") + } + + override def read(): Int = -1 + } + + // Leave some time for the task to be interrupted during the + // creation of `InterruptionSensitiveInputStream`. 
+ Thread.sleep(10000) + + underlying = in + underlying + } + + override def read(): Int = -1 + + override def close(): Unit = { + if (underlying != null) { + underlying.close() + } + } + } + + def createStream(fileHint: String): Unit = { + if (interruptible) { + Utils.tryInitializeResource { + new InterruptionSensitiveInputStream(fileHint) + } { + _.initialize() + } + } else { + TaskContext.get().createResourceUninterruptibly[java.io.InputStream] { + Utils.tryInitializeResource { + new InterruptionSensitiveInputStream(fileHint) + } { + _.initialize() + } + } + } + } + + sc = new SparkContext("local[2]", "test interrupt streams") + + sc.addSparkListener(new SparkListener { + override def onTaskStart(taskStart: SparkListenerTaskStart): Unit = { + // Sleep some time to ensure task has started + Thread.sleep(2000) + taskStartedSemaphore.release() + } + + override def onTaskEnd(taskEnd: SparkListenerTaskEnd): Unit = { + if (taskEnd.reason.isInstanceOf[TaskKilled]) { + taskCancelledSemaphore.release() + } + } + }) + + sc.setLocalProperty(SparkContext.SPARK_JOB_INTERRUPT_ON_CANCEL, "true") + + val fileHint = if (interruptible) "interruptible" else "uninterruptible" + val future = sc.parallelize(1 to 100, 1).mapPartitions { _ => + createStream(fileHint) + Iterator.single(1) + }.collectAsync() + + taskStartedSemaphore.acquire() + future.cancel() + taskCancelledSemaphore.acquire() + + val fileOpen = new File(dir, s"open.$fileHint") + val fileClose = new File(dir, s"close.$fileHint") + assert(fileOpen.exists()) + + if (interruptible) { + // The underlying stream leaks when the stream creation is interruptible. + assert(!fileClose.exists()) + } else { + // The underlying stream won't leak when the stream creation is uninterruptible. 
+ assert(fileClose.exists()) + } + } + } + } + def testCount(): Unit = { // Cancel before launching any tasks { diff --git a/core/src/test/scala/org/apache/spark/SerializerTestUtils.scala b/core/src/test/scala/org/apache/spark/SerializerTestUtils.scala new file mode 100644 index 0000000000000..bd81003777317 --- /dev/null +++ b/core/src/test/scala/org/apache/spark/SerializerTestUtils.scala @@ -0,0 +1,44 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark + +import java.io.{ByteArrayInputStream, ByteArrayOutputStream, ObjectInputStream, ObjectOutputStream} + +trait SerializerTestUtils { + + protected def roundtripSerialize[T](obj: T): T = { + deserializeFromBytes(serializeToBytes(obj)) + } + + protected def serializeToBytes[T](o: T): Array[Byte] = { + val baos = new ByteArrayOutputStream + val oos = new ObjectOutputStream(baos) + try { + oos.writeObject(o) + baos.toByteArray + } finally { + oos.close() + } + } + + protected def deserializeFromBytes[T](bytes: Array[Byte]): T = { + val bais = new ByteArrayInputStream(bytes) + val ois = new ObjectInputStream(bais) + ois.readObject().asInstanceOf[T] + } +} diff --git a/core/src/test/scala/org/apache/spark/deploy/history/HistoryServerSuite.scala b/core/src/test/scala/org/apache/spark/deploy/history/HistoryServerSuite.scala index 6b2bd90cd4314..10092f416f9e1 100644 --- a/core/src/test/scala/org/apache/spark/deploy/history/HistoryServerSuite.scala +++ b/core/src/test/scala/org/apache/spark/deploy/history/HistoryServerSuite.scala @@ -794,11 +794,6 @@ object HistoryServerSuite { * A filter used for auth tests; sets the request's user to the value of the "HTTP_USER" header. 
*/ class FakeAuthFilter extends Filter { - - override def destroy(): Unit = { } - - override def init(config: FilterConfig): Unit = { } - override def doFilter(req: ServletRequest, res: ServletResponse, chain: FilterChain): Unit = { val hreq = req.asInstanceOf[HttpServletRequest] val wrapped = new HttpServletRequestWrapper(hreq) { diff --git a/core/src/test/scala/org/apache/spark/scheduler/DAGSchedulerSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/DAGSchedulerSuite.scala index 243d33fe55a79..3e507df706ba5 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/DAGSchedulerSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/DAGSchedulerSuite.scala @@ -185,6 +185,8 @@ class DAGSchedulerSuite extends SparkFunSuite with TempLocalSparkContext with Ti private var firstInit: Boolean = _ /** Set of TaskSets the DAGScheduler has requested executed. */ val taskSets = scala.collection.mutable.Buffer[TaskSet]() + /** Track running tasks: the key is the stageId, the value is the set of running tasks' partitionIds */ + var runningTaskInfos = new HashMap[Int, HashSet[Int]]() /** Stages for which the DAGScheduler has called TaskScheduler.killAllTaskAttempts(). 
*/ val cancelledStages = new HashSet[Int]() @@ -206,12 +208,14 @@ class DAGSchedulerSuite extends SparkFunSuite with TempLocalSparkContext with Ti // normally done by TaskSetManager taskSet.tasks.foreach(_.epoch = mapOutputTracker.getEpoch) taskSets += taskSet + runningTaskInfos.put(taskSet.stageId, new HashSet[Int]() ++ taskSet.tasks.map(_.partitionId)) } override def killTaskAttempt( taskId: Long, interruptThread: Boolean, reason: String): Boolean = false override def killAllTaskAttempts( stageId: Int, interruptThread: Boolean, reason: String): Unit = { cancelledStages += stageId + runningTaskInfos.remove(stageId) } override def notifyPartitionCompletion(stageId: Int, partitionId: Int): Unit = { taskSets.filter(_.stageId == stageId).lastOption.foreach { ts => @@ -393,6 +397,14 @@ class DAGSchedulerSuite extends SparkFunSuite with TempLocalSparkContext with Ti handleShuffleMergeFinalized(shuffleMapStage, shuffleMapStage.shuffleDep.shuffleMergeId) } } + + override private[scheduler] def handleTaskCompletion(event: CompletionEvent): Unit = { + super.handleTaskCompletion(event) + runningTaskInfos.get(event.task.stageId).foreach{ partitions => + partitions -= event.task.partitionId + if (partitions.isEmpty) runningTaskInfos.remove(event.task.stageId) + } + } } override def beforeEach(): Unit = { @@ -2252,6 +2264,46 @@ class DAGSchedulerSuite extends SparkFunSuite with TempLocalSparkContext with Ti assert(scheduler.activeJobs.isEmpty) } + test("SPARK-50648: when job is cancelled during shuffle retry in parent stage, " + + "should kill all running tasks") { + val shuffleMapRdd = new MyRDD(sc, 2, Nil) + val shuffleDep = new ShuffleDependency(shuffleMapRdd, new HashPartitioner(2)) + val reduceRdd = new MyRDD(sc, 2, List(shuffleDep)) + submit(reduceRdd, Array(0, 1)) + completeShuffleMapStageSuccessfully(0, 0, 2) + sc.listenerBus.waitUntilEmpty() + + val info = new TaskInfo( + 3, index = 1, attemptNumber = 1, + partitionId = taskSets(1).tasks(0).partitionId, 0L, "", "", 
TaskLocality.ANY, true) + // result task 0.0 fetch failed, but result task 1.0 is still running + runEvent(makeCompletionEvent(taskSets(1).tasks(0), + FetchFailed(makeBlockManagerId("hostA"), shuffleDep.shuffleId, 0L, 0, 1, "ignored"), + null, + Seq.empty, + Array.empty, + info)) + sc.listenerBus.waitUntilEmpty() + + Thread.sleep(DAGScheduler.RESUBMIT_TIMEOUT * 2) + // map stage is running by resubmitted, result stage is waiting + // map tasks and the origin result task 1.0 are running + assert(scheduler.runningStages.size == 1, "Map stage should be running") + val mapStage = scheduler.runningStages.head + assert(mapStage.id === 0) + assert(mapStage.latestInfo.failureReason.isEmpty) + assert(scheduler.waitingStages.size == 1, "Result stage should be waiting") + assert(runningTaskInfos.size == 2) + assert(runningTaskInfos(taskSets(1).stageId).size == 1, + "origin result task 1.0 should be running") + + scheduler.doCancelAllJobs() + // all tasks should be killed + assert(runningTaskInfos.isEmpty) + assert(scheduler.runningStages.isEmpty) + assert(scheduler.waitingStages.isEmpty) + } + test("misbehaved accumulator should not crash DAGScheduler and SparkContext") { val acc = new LongAccumulator { override def add(v: java.lang.Long): Unit = throw new DAGSchedulerSuiteDummyException diff --git a/core/src/test/scala/org/apache/spark/shuffle/ChecksumBenchmark.scala b/core/src/test/scala/org/apache/spark/shuffle/ChecksumBenchmark.scala index 16a50fabb7ffd..4eb167930b0dd 100644 --- a/core/src/test/scala/org/apache/spark/shuffle/ChecksumBenchmark.scala +++ b/core/src/test/scala/org/apache/spark/shuffle/ChecksumBenchmark.scala @@ -19,8 +19,6 @@ package org.apache.spark.shuffle import java.util.zip.{Adler32, CRC32, CRC32C} -import org.apache.hadoop.util.PureJavaCrc32C - import org.apache.spark.benchmark.{Benchmark, BenchmarkBase} /** @@ -41,18 +39,15 @@ object ChecksumBenchmark extends BenchmarkBase { runBenchmark("Benchmark Checksum Algorithms") { val data: Array[Byte] = (1 
until 32 * 1024 * 1024).map(_.toByte).toArray val benchmark = new Benchmark("Checksum Algorithms", N, 3, output = output) + benchmark.addCase(s"Adler32") { _ => + (1 to N).foreach(_ => new Adler32().update(data)) + } benchmark.addCase("CRC32") { _ => (1 to N).foreach(_ => new CRC32().update(data)) } benchmark.addCase(s"CRC32C") { _ => (1 to N).foreach(_ => new CRC32C().update(data)) } - benchmark.addCase(s"Adler32") { _ => - (1 to N).foreach(_ => new Adler32().update(data)) - } - benchmark.addCase(s"hadoop PureJavaCrc32C") { _ => - (1 to N).foreach(_ => new PureJavaCrc32C().update(data)) - } benchmark.run() } } diff --git a/core/src/test/scala/org/apache/spark/ui/UISuite.scala b/core/src/test/scala/org/apache/spark/ui/UISuite.scala index 1b68ed301fb92..6d12e88e8efac 100644 --- a/core/src/test/scala/org/apache/spark/ui/UISuite.scala +++ b/core/src/test/scala/org/apache/spark/ui/UISuite.scala @@ -504,8 +504,6 @@ private[spark] class TestFilter extends Filter { private var rc: Int = HttpServletResponse.SC_OK - override def destroy(): Unit = { } - override def init(config: FilterConfig): Unit = { if (config.getInitParameter("responseCode") != null) { rc = config.getInitParameter("responseCode").toInt diff --git a/core/src/test/scala/org/apache/spark/util/BestEffortLazyValSuite.scala b/core/src/test/scala/org/apache/spark/util/BestEffortLazyValSuite.scala new file mode 100644 index 0000000000000..a6555eca8b859 --- /dev/null +++ b/core/src/test/scala/org/apache/spark/util/BestEffortLazyValSuite.scala @@ -0,0 +1,120 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.util + +import java.io.NotSerializableException +import java.util.concurrent.CountDownLatch +import java.util.concurrent.atomic.AtomicInteger + +import scala.concurrent.{ExecutionContext, Future} +import scala.concurrent.duration._ + +import org.apache.spark.{SerializerTestUtils, SparkFunSuite} + +class BestEffortLazyValSuite extends SparkFunSuite with SerializerTestUtils { + + test("BestEffortLazy works") { + val numInitializerCalls = new AtomicInteger(0) + // Simulate a race condition where two threads concurrently + // initialize the lazy value: + val latch = new CountDownLatch(2) + val lazyval = new BestEffortLazyVal(() => { + numInitializerCalls.incrementAndGet() + latch.countDown() + latch.await() + new Object() + }) + + // Ensure no initialization happened before the lazy value was invoked + assert(numInitializerCalls.get() === 0) + + // Two threads concurrently invoke the lazy value + implicit val ec: ExecutionContext = ExecutionContext.global + val future1 = Future { lazyval() } + val future2 = Future { lazyval() } + val value1 = ThreadUtils.awaitResult(future1, 10.seconds) + val value2 = ThreadUtils.awaitResult(future2, 10.seconds) + + // The initializer should have been invoked twice (due to how we set up the + // race condition via the latch): + assert(numInitializerCalls.get() === 2) + + // But the value should only have been computed once: + assert(value1 eq value2) + + // Ensure the subsequent invocation serves the same object + assert(lazyval() eq value1) + assert(numInitializerCalls.get() === 2) + } + + 
test("BestEffortLazyVal is serializable") { + val lazyval = new BestEffortLazyVal(() => "test") + + // serialize and deserialize before first invocation + val lazyval2 = roundtripSerialize(lazyval) + assert(lazyval2() === "test") + + // first invocation + assert(lazyval() === "test") + + // serialize and deserialize after first invocation + val lazyval3 = roundtripSerialize(lazyval) + assert(lazyval3() === "test") + } + + test("BestEffortLazyVal is serializable: unserializable value") { + val lazyval = new BestEffortLazyVal(() => new Object()) + + // serialize and deserialize before first invocation + val lazyval2 = roundtripSerialize(lazyval) + assert(lazyval2() != null) + + // first invocation + assert(lazyval() != null) + + // serialize and deserialize after first invocation + // try to serialize the cached value and cause NotSerializableException + val e = intercept[NotSerializableException] { + val lazyval3 = roundtripSerialize(lazyval) + } + assert(e.getMessage.contains("java.lang.Object")) + } + + test("BestEffortLazyVal is serializable: initialization failure") { + val lazyval = new BestEffortLazyVal[String](() => throw new RuntimeException("test")) + + // serialize and deserialize before first invocation + val lazyval2 = roundtripSerialize(lazyval) + val e2 = intercept[RuntimeException] { + val v = lazyval2() + } + assert(e2.getMessage.contains("test")) + + // initialization failure + val e = intercept[RuntimeException] { + val v = lazyval() + } + assert(e.getMessage.contains("test")) + + // serialize and deserialize after initialization failure + val lazyval3 = roundtripSerialize(lazyval) + val e3 = intercept[RuntimeException] { + val v = lazyval3() + } + assert(e3.getMessage.contains("test")) + } +} diff --git a/core/src/test/scala/org/apache/spark/util/TransientBestEffortLazyValSuite.scala b/core/src/test/scala/org/apache/spark/util/TransientBestEffortLazyValSuite.scala new file mode 100644 index 0000000000000..3ed9f2958fd9c --- /dev/null +++ 
b/core/src/test/scala/org/apache/spark/util/TransientBestEffortLazyValSuite.scala @@ -0,0 +1,116 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.util + +import java.util.concurrent.CountDownLatch +import java.util.concurrent.atomic.AtomicInteger + +import scala.concurrent.{ExecutionContext, Future} +import scala.concurrent.duration._ + +import org.apache.spark.{SerializerTestUtils, SparkFunSuite} + +class TransientBestEffortLazyValSuite extends SparkFunSuite with SerializerTestUtils { + + test("TransientBestEffortLazyVal works") { + val numInitializerCalls = new AtomicInteger(0) + // Simulate a race condition where two threads concurrently + // initialize the lazy value: + val latch = new CountDownLatch(2) + val lazyval = new TransientBestEffortLazyVal(() => { + numInitializerCalls.incrementAndGet() + latch.countDown() + latch.await() + new Object() + }) + + // Ensure no initialization happened before the lazy value was invoked + assert(numInitializerCalls.get() === 0) + + // Two threads concurrently invoke the lazy value + implicit val ec: ExecutionContext = ExecutionContext.global + val future1 = Future { lazyval() } + val future2 = Future { lazyval() } + val value1 = 
ThreadUtils.awaitResult(future1, 10.seconds) + val value2 = ThreadUtils.awaitResult(future2, 10.seconds) + + // The initializer should have been invoked twice (due to how we set up the + // race condition via the latch): + assert(numInitializerCalls.get() === 2) + + // But the value should only have been computed once: + assert(value1 eq value2) + + // Ensure the subsequent invocation serves the same object + assert(lazyval() eq value1) + assert(numInitializerCalls.get() === 2) + } + + test("TransientBestEffortLazyVal is serializable") { + val lazyval = new TransientBestEffortLazyVal(() => "test") + + // serialize and deserialize before first invocation + val lazyval2 = roundtripSerialize(lazyval) + assert(lazyval2() === "test") + + // first invocation + assert(lazyval() === "test") + + // serialize and deserialize after first invocation + val lazyval3 = roundtripSerialize(lazyval) + assert(lazyval3() === "test") + } + + test("TransientBestEffortLazyVal is serializable: unserializable value") { + val lazyval = new TransientBestEffortLazyVal(() => new Object()) + + // serialize and deserialize before first invocation + val lazyval2 = roundtripSerialize(lazyval) + assert(lazyval2() != null) + + // first invocation + assert(lazyval() != null) + + // serialize and deserialize after first invocation + val lazyval3 = roundtripSerialize(lazyval) + assert(lazyval3() != null) + } + + test("TransientBestEffortLazyVal is serializable: failure in compute function") { + val lazyval = new TransientBestEffortLazyVal[String](() => throw new RuntimeException("test")) + + // serialize and deserialize before first invocation + val lazyval2 = roundtripSerialize(lazyval) + val e2 = intercept[RuntimeException] { + val v = lazyval2() + } + assert(e2.getMessage.contains("test")) + + // initialization failure + val e = intercept[RuntimeException] { + val v = lazyval() + } + assert(e.getMessage.contains("test")) + + // serialize and deserialize after initialization failure + val lazyval3 = 
roundtripSerialize(lazyval) + val e3 = intercept[RuntimeException] { + val v = lazyval3() + } + assert(e3.getMessage.contains("test")) + } +} diff --git a/core/src/test/scala/org/apache/spark/util/TransientLazySuite.scala b/core/src/test/scala/org/apache/spark/util/TransientLazySuite.scala deleted file mode 100644 index c0754ee063d67..0000000000000 --- a/core/src/test/scala/org/apache/spark/util/TransientLazySuite.scala +++ /dev/null @@ -1,58 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.spark.util - -import java.io.{ByteArrayOutputStream, ObjectOutputStream} - -import org.apache.spark.SparkFunSuite - -class TransientLazySuite extends SparkFunSuite { - - test("TransientLazy val works") { - var test: Option[Object] = None - - val lazyval = new TransientLazy({ - test = Some(new Object()) - test - }) - - // Ensure no initialization happened before the lazy value was dereferenced - assert(test.isEmpty) - - // Ensure the first invocation creates a new object - assert(lazyval() == test && test.isDefined) - - // Ensure the subsequent invocation serves the same object - assert(lazyval() == test && test.isDefined) - } - - test("TransientLazy val is serializable") { - val lazyval = new TransientLazy({ - new Object() - }) - - // Ensure serializable before the dereference - val oos = new ObjectOutputStream(new ByteArrayOutputStream()) - oos.writeObject(lazyval) - - val dereferenced = lazyval() - - // Ensure serializable after the dereference - val oos2 = new ObjectOutputStream(new ByteArrayOutputStream()) - oos2.writeObject(lazyval) - } -} diff --git a/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala b/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala index a6e3345fc600c..baa748573d75b 100644 --- a/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala +++ b/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala @@ -22,6 +22,7 @@ import java.lang.reflect.Field import java.net.{BindException, ServerSocket, URI} import java.nio.{ByteBuffer, ByteOrder} import java.nio.charset.StandardCharsets.UTF_8 +import java.nio.file.{Files => JFiles} import java.text.DecimalFormatSymbols import java.util.Locale import java.util.concurrent.TimeUnit @@ -731,6 +732,43 @@ class UtilsSuite extends SparkFunSuite with ResetSystemProperties { assert(!sourceFile2.exists()) } + test("SPARK-50716: deleteRecursively - SymbolicLink To File") { + val tempDir = Utils.createTempDir() + val sourceFile = new File(tempDir, "foo.txt") + 
JFiles.write(sourceFile.toPath, "Some content".getBytes) + assert(sourceFile.exists()) + + val symlinkFile = new File(tempDir, "bar.txt") + JFiles.createSymbolicLink(symlinkFile.toPath, sourceFile.toPath) + + // Check that the symlink was created successfully + assert(JFiles.isSymbolicLink(symlinkFile.toPath)) + Utils.deleteRecursively(tempDir) + + // Verify that everything is deleted + assert(!tempDir.exists) + } + + test("SPARK-50716: deleteRecursively - SymbolicLink To Dir") { + val tempDir = Utils.createTempDir() + val sourceDir = new File(tempDir, "sourceDir") + assert(sourceDir.mkdir()) + val sourceFile = new File(sourceDir, "file.txt") + JFiles.write(sourceFile.toPath, "Some content".getBytes) + + val symlinkDir = new File(tempDir, "targetDir") + JFiles.createSymbolicLink(symlinkDir.toPath, sourceDir.toPath) + + // Check that the symlink was created successfully + assert(JFiles.isSymbolicLink(symlinkDir.toPath)) + + // Now delete recursively + Utils.deleteRecursively(tempDir) + + // Verify that everything is deleted + assert(!tempDir.exists) + } + test("loading properties from file") { withTempDir { tmpDir => val outFile = File.createTempFile("test-load-spark-properties", "test", tmpDir) @@ -1581,26 +1619,14 @@ class UtilsSuite extends SparkFunSuite with ResetSystemProperties { // at org.apache.spark.util.UtilsSuite.$anonfun$new$165(UtilsSuite.scala:1658) // ... 56 more // scalastyle:on line.size.limit - val origSt = e1.getSuppressed.find( - _.getMessage == Utils.TRY_WITH_CALLER_STACKTRACE_FULL_STACKTRACE) + val origSt = e1.getSuppressed.find(_.isInstanceOf[Utils.OriginalTryStackTraceException]) assert(origSt.isDefined) assert(origSt.get.getStackTrace.exists(_.getMethodName == "throwException")) assert(origSt.get.getStackTrace.exists(_.getMethodName == "callDoTry")) - // The stack trace under Try should be in the suppressed exceptions. 
- // Example: - // Suppressed: java.lang.Exception: Stacktrace under doTryWithCallerStacktrace - // at org.apache.spark.util.UtilsSuite.throwException(UtilsSuite.scala: 1640) - // at org.apache.spark.util.UtilsSuite.$anonfun$callDoTry$1(UtilsSuite.scala: 1645) - // at scala.util.Try$.apply(Try.scala: 213) - // at org.apache.spark.util.Utils$.doTryWithCallerStacktrace(Utils.scala: 1586) - val trySt = e1.getSuppressed.find( - _.getMessage == Utils.TRY_WITH_CALLER_STACKTRACE_TRY_STACKTRACE) - assert(trySt.isDefined) - // calls under callDoTry should be present. - assert(trySt.get.getStackTrace.exists(_.getMethodName == "throwException")) - // callDoTry should be removed. - assert(!trySt.get.getStackTrace.exists(_.getMethodName == "callDoTry")) + // Should save the depth of the stack trace under doTryWithCallerStacktrace. + assert(origSt.get.asInstanceOf[Utils.OriginalTryStackTraceException] + .doTryWithCallerStacktraceDepth == 4) val e2 = intercept[Exception] { callGetTryAgain(t) @@ -1633,6 +1659,152 @@ class UtilsSuite extends SparkFunSuite with ResetSystemProperties { assert(st1Again.exists(_.getMethodName == "callGetTryAgain")) assert(!st1Again.exists(_.getMethodName == "callGetTry")) } + + private def callGetTryFromNested(t: Try[String]): String = { + Utils.getTryWithCallerStacktrace(t) + } + + private def callDoTryNested(): Try[String] = { + Utils.doTryWithCallerStacktrace { + val t = callDoTry() + val e = intercept[Exception] { + callGetTryFromNested(t) + } + + // Uncomment for manual inspection + // + // println("\nIntercepted in callDoTryNested:") + // e.printStackTrace() + // + // scalastyle:off line.size.limit + // java.lang.Exception: test + // at org.apache.spark.util.UtilsSuite.throwException(UtilsSuite.scala:1529) + // at org.apache.spark.util.UtilsSuite.$anonfun$callDoTry$1(UtilsSuite.scala:1534) + // at scala.util.Try$.apply(Try.scala:217) + // at org.apache.spark.util.Utils$.doTryWithCallerStacktrace(Utils.scala:1377) + // at 
org.apache.spark.util.Utils$.getTryWithCallerStacktrace(Utils.scala:1438) + // ----> at org.apache.spark.util.UtilsSuite.callGetTryFromNested(UtilsSuite.scala:1626) <---- STITCHED. + // at org.apache.spark.util.UtilsSuite.$anonfun$callDoTryNested$2(UtilsSuite.scala:1633) + // at org.scalatest.Assertions.intercept(Assertions.scala:749) + // at org.scalatest.Assertions.intercept$(Assertions.scala:746) + // at org.scalatest.funsuite.AnyFunSuite.intercept(AnyFunSuite.scala:1564) + // at org.apache.spark.util.UtilsSuite.$anonfun$callDoTryNested$1(UtilsSuite.scala:1632) + // at scala.util.Try$.apply(Try.scala:217) + // at org.apache.spark.util.Utils$.doTryWithCallerStacktrace(Utils.scala:1377) + // at org.apache.spark.util.UtilsSuite.callDoTryNested(UtilsSuite.scala:1630) + // at org.apache.spark.util.UtilsSuite.$anonfun$callDoTryNestedNested$1(UtilsSuite.scala:1655) + // at scala.util.Try$.apply(Try.scala:217) + // at org.apache.spark.util.Utils$.doTryWithCallerStacktrace(Utils.scala:1377) + // at org.apache.spark.util.UtilsSuite.callDoTryNestedNested(UtilsSuite.scala:1654) + // at org.apache.spark.util.UtilsSuite.$anonfun$new$172(UtilsSuite.scala:1674) + // ... + // Suppressed: org.apache.spark.util.Utils$OriginalTryStackTraceException: Full stacktrace of original doTryWithCallerStacktrace caller + // at org.apache.spark.util.UtilsSuite.throwException(UtilsSuite.scala:1529) + // at org.apache.spark.util.UtilsSuite.$anonfun$callDoTry$1(UtilsSuite.scala:1534) + // at scala.util.Try$.apply(Try.scala:217) + // at org.apache.spark.util.Utils$.doTryWithCallerStacktrace(Utils.scala:1377) + // at org.apache.spark.util.UtilsSuite.callDoTry(UtilsSuite.scala:1534) + // at org.apache.spark.util.UtilsSuite.$anonfun$callDoTryNested$1(UtilsSuite.scala:1631) + // ... 
+ // scalastyle:on line.size.limit + + assert(e.getStackTrace.exists(_.getMethodName == "callGetTryFromNested")) + assert(!e.getStackTrace.exists(_.getMethodName == "callGetTryFromNestedNested")) + assert(!e.getStackTrace.exists(_.getMethodName == "callGetTry")) + assert(e.getSuppressed.length == 1) + + Utils.getTryWithCallerStacktrace(t) + } + } + + private def callGetTryFromNestedNested(t: Try[String]): String = { + Utils.getTryWithCallerStacktrace(t) + } + + private def callDoTryNestedNested(): Try[String] = { + Utils.doTryWithCallerStacktrace { + val t = callDoTryNested() + val e = intercept[Exception] { + callGetTryFromNestedNested(t) + } + + // Uncomment for manual inspection + // + // println("\nIntercepted in callDoTryNestedNested:") + // e.printStackTrace() + // + // scalastyle:off line.size.limit + // java.lang.Exception: test + // at org.apache.spark.util.UtilsSuite.throwException(UtilsSuite.scala:1529) + // at org.apache.spark.util.UtilsSuite.$anonfun$callDoTry$1(UtilsSuite.scala:1534) + // at scala.util.Try$.apply(Try.scala:217) + // at org.apache.spark.util.Utils$.doTryWithCallerStacktrace(Utils.scala:1377) + // at org.apache.spark.util.UtilsSuite.callDoTry(UtilsSuite.scala:1534) + // at org.apache.spark.util.UtilsSuite.$anonfun$callDoTryNested$1(UtilsSuite.scala:1631) + // at scala.util.Try$.apply(Try.scala:217) + // at org.apache.spark.util.Utils$.doTryWithCallerStacktrace(Utils.scala:1377) + // at org.apache.spark.util.Utils$.getTryWithCallerStacktrace(Utils.scala:1438) + // ----> at org.apache.spark.util.UtilsSuite.callGetTryFromNestedNested(UtilsSuite.scala:1650) <---- STITCHED. 
+ // at org.apache.spark.util.UtilsSuite.$anonfun$callDoTryNestedNested$2(UtilsSuite.scala:1657) + // at org.scalatest.Assertions.intercept(Assertions.scala:749) + // at org.scalatest.Assertions.intercept$(Assertions.scala:746) + // at org.scalatest.funsuite.AnyFunSuite.intercept(AnyFunSuite.scala:1564) + // at org.apache.spark.util.UtilsSuite.$anonfun$callDoTryNestedNested$1(UtilsSuite.scala:1656) + // at scala.util.Try$.apply(Try.scala:217) + // at org.apache.spark.util.Utils$.doTryWithCallerStacktrace(Utils.scala:1377) + // at org.apache.spark.util.UtilsSuite.callDoTryNestedNested(UtilsSuite.scala:1654) + // at org.apache.spark.util.UtilsSuite.$anonfun$new$172(UtilsSuite.scala:1674) + // scalastyle:on line.size.limit + + assert(e.getStackTrace.exists(_.getMethodName == "callGetTryFromNestedNested")) + assert(!e.getStackTrace.exists(_.getMethodName == "callGetTryFromNested")) + assert(!e.getStackTrace.exists(_.getMethodName == "callGetTry")) + assert(e.getSuppressed.length == 1) + + Utils.getTryWithCallerStacktrace(t) + } + } + + test("nested doTryWithCallerStacktrace and getTryWithCallerStacktrace") { + val t = callDoTryNestedNested() + + val e = intercept[Exception] { + callGetTry(t) + } + + // Uncomment for manual inspection + // + // println("\nIntercepted in test:") + // e.printStackTrace() + // + // scalastyle:off line.size.limit + // java.lang.Exception: test + // at org.apache.spark.util.UtilsSuite.throwException(UtilsSuite.scala:1529) + // at org.apache.spark.util.UtilsSuite.$anonfun$callDoTry$1(UtilsSuite.scala:1534) + // at scala.util.Try$.apply(Try.scala:217) + // at org.apache.spark.util.Utils$.doTryWithCallerStacktrace(Utils.scala:1377) + // at org.apache.spark.util.UtilsSuite.callDoTry(UtilsSuite.scala:1534) + // at org.apache.spark.util.UtilsSuite.$anonfun$callDoTryNested$1(UtilsSuite.scala:1631) + // at scala.util.Try$.apply(Try.scala:217) + // at org.apache.spark.util.Utils$.doTryWithCallerStacktrace(Utils.scala:1377) + // at 
org.apache.spark.util.UtilsSuite.callDoTryNested(UtilsSuite.scala:1630) + // at org.apache.spark.util.UtilsSuite.$anonfun$callDoTryNestedNested$1(UtilsSuite.scala:1655) + // at scala.util.Try$.apply(Try.scala:217) + // at org.apache.spark.util.Utils$.doTryWithCallerStacktrace(Utils.scala:1377) + // at org.apache.spark.util.Utils$.getTryWithCallerStacktrace(Utils.scala:1438) + // ----> at org.apache.spark.util.UtilsSuite.callGetTry(UtilsSuite.scala:1539) <---- STITCHED. + // at org.apache.spark.util.UtilsSuite.$anonfun$new$173(UtilsSuite.scala:1677) + // at org.scalatest.Assertions.intercept(Assertions.scala:749) + // at org.scalatest.Assertions.intercept$(Assertions.scala:746) + // at org.scalatest.funsuite.AnyFunSuite.intercept(AnyFunSuite.scala:1564) + // at org.apache.spark.util.UtilsSuite.$anonfun$new$172(UtilsSuite.scala:1676) + // scalastyle:on line.size.limit + + assert(e.getStackTrace.exists(_.getMethodName == "callGetTry")) + assert(!e.getStackTrace.exists(_.getMethodName == "callGetTryFromNested")) + assert(!e.getStackTrace.exists(_.getMethodName == "callGetTryFromNestedNested")) + assert(e.getSuppressed.length == 1) + } } private class SimpleExtension diff --git a/dev/check-protos.py b/dev/check-protos.py index bfca8b27be21c..4ddd1f1058820 100755 --- a/dev/check-protos.py +++ b/dev/check-protos.py @@ -44,8 +44,8 @@ def run_cmd(cmd): def check_protos(module_name, cmp_path, proto_path): - print(f"Start checking the generated codes in pyspark-${module_name}.") - with tempfile.TemporaryDirectory(prefix=f"check_${module_name}__protos") as tmp: + print(f"Start checking the generated codes in pyspark-{module_name}.") + with tempfile.TemporaryDirectory(prefix=f"check_{module_name}__protos") as tmp: run_cmd(f"{SPARK_HOME}/dev/gen-protos.sh {module_name} {tmp}") result = filecmp.dircmp( f"{SPARK_HOME}/{cmp_path}", @@ -71,12 +71,12 @@ def check_protos(module_name, cmp_path, proto_path): success = False if success: - print(f"Finish checking the generated codes in 
pyspark-${module_name}: SUCCESS") + print(f"Finish checking the generated codes in pyspark-{module_name}: SUCCESS") else: fail( - "Generated files for pyspark-connect are out of sync! " - f"If you have touched files under ${proto_path}, " - f"please run ./dev/${module_name}-gen-protos.sh. " + f"Generated files for pyspark-{module_name} are out of sync! " + f"If you have touched files under {proto_path}, " + f"please run ./dev/{module_name}-gen-protos.sh. " "If you haven't touched any file above, please rebase your PR against main branch." ) diff --git a/dev/checkstyle.xml b/dev/checkstyle.xml index 4285028109419..6c50718e27fe5 100644 --- a/dev/checkstyle.xml +++ b/dev/checkstyle.xml @@ -172,32 +172,10 @@ - - - - - - - - - - - - - - - - - - - - diff --git a/dev/create-release/spark-rm/Dockerfile b/dev/create-release/spark-rm/Dockerfile index 8a9890bf68dde..7b97aba21205b 100644 --- a/dev/create-release/spark-rm/Dockerfile +++ b/dev/create-release/spark-rm/Dockerfile @@ -23,10 +23,10 @@ LABEL org.opencontainers.image.ref.name="Apache Spark Release Manager Image" # Overwrite this label to avoid exposing the underlying Ubuntu OS version label LABEL org.opencontainers.image.version="" -ENV FULL_REFRESH_DATE 20240318 +ENV FULL_REFRESH_DATE=20240318 -ENV DEBIAN_FRONTEND noninteractive -ENV DEBCONF_NONINTERACTIVE_SEEN true +ENV DEBIAN_FRONTEND=noninteractive +ENV DEBCONF_NONINTERACTIVE_SEEN=true RUN apt-get update && apt-get install -y \ build-essential \ @@ -88,7 +88,7 @@ RUN Rscript -e "install.packages(c('devtools', 'knitr', 'markdown', \ Rscript -e "devtools::install_version('preferably', version='0.4', repos='https://cloud.r-project.org')" # See more in SPARK-39735 -ENV R_LIBS_SITE "/usr/local/lib/R/site-library:${R_LIBS_SITE}:/usr/lib/R/library" +ENV R_LIBS_SITE="/usr/local/lib/R/site-library:${R_LIBS_SITE}:/usr/lib/R/library" RUN add-apt-repository ppa:pypy/ppa @@ -102,7 +102,7 @@ RUN pypy3 -m pip install numpy 'six==1.16.0' 'pandas==2.2.3' scipy coverage matp ARG 
BASIC_PIP_PKGS="numpy pyarrow>=15.0.0 six==1.16.0 pandas==2.2.3 scipy plotly>=4.8 mlflow>=2.8.1 coverage matplotlib openpyxl memory-profiler>=0.61.0 scikit-learn>=1.3.2 twine==3.4.1" # Python deps for Spark Connect -ARG CONNECT_PIP_PKGS="grpcio==1.67.0 grpcio-status==1.67.0 protobuf==5.28.3 googleapis-common-protos==1.65.0" +ARG CONNECT_PIP_PKGS="grpcio==1.67.0 grpcio-status==1.67.0 protobuf==5.29.1 googleapis-common-protos==1.65.0" # Install Python 3.10 packages RUN curl -sS https://bootstrap.pypa.io/get-pip.py | python3.10 @@ -130,7 +130,7 @@ RUN python3.9 -m pip install --force $BASIC_PIP_PKGS unittest-xml-reporting $CON # See 'docutils<0.18.0' in SPARK-39421 RUN python3.9 -m pip install 'sphinx==4.5.0' mkdocs 'pydata_sphinx_theme>=0.13' sphinx-copybutton nbsphinx numpydoc jinja2 markupsafe 'pyzmq<24.0.0' \ ipython ipython_genutils sphinx_plotly_directive 'numpy>=1.20.0' pyarrow pandas 'plotly>=4.8' 'docutils<0.18.0' \ -'flake8==3.9.0' 'mypy==1.8.0' 'pytest==7.1.3' 'pytest-mypy-plugins==1.9.3' 'black==23.9.1' \ +'flake8==3.9.0' 'mypy==1.8.0' 'pytest==7.1.3' 'pytest-mypy-plugins==1.9.3' 'black==23.12.1' \ 'pandas-stubs==1.2.0.53' 'grpcio==1.67.0' 'grpc-stubs==1.24.11' 'googleapis-common-protos-stubs==2.2.0' \ 'sphinxcontrib-applehelp==1.0.4' 'sphinxcontrib-devhelp==1.0.2' 'sphinxcontrib-htmlhelp==2.0.1' 'sphinxcontrib-qthelp==1.0.3' 'sphinxcontrib-serializinghtml==1.1.5' RUN python3.9 -m pip list diff --git a/dev/deps/spark-deps-hadoop-3-hive-2.3 b/dev/deps/spark-deps-hadoop-3-hive-2.3 index 4603ae2fc5548..59aabdf5fff19 100644 --- a/dev/deps/spark-deps-hadoop-3-hive-2.3 +++ b/dev/deps/spark-deps-hadoop-3-hive-2.3 @@ -16,11 +16,11 @@ antlr4-runtime/4.13.1//antlr4-runtime-4.13.1.jar aopalliance-repackaged/3.0.6//aopalliance-repackaged-3.0.6.jar arpack/3.0.3//arpack-3.0.3.jar arpack_combined_all/0.1//arpack_combined_all-0.1.jar -arrow-format/18.0.0//arrow-format-18.0.0.jar -arrow-memory-core/18.0.0//arrow-memory-core-18.0.0.jar 
-arrow-memory-netty-buffer-patch/18.0.0//arrow-memory-netty-buffer-patch-18.0.0.jar -arrow-memory-netty/18.0.0//arrow-memory-netty-18.0.0.jar -arrow-vector/18.0.0//arrow-vector-18.0.0.jar +arrow-format/18.1.0//arrow-format-18.1.0.jar +arrow-memory-core/18.1.0//arrow-memory-core-18.1.0.jar +arrow-memory-netty-buffer-patch/18.1.0//arrow-memory-netty-buffer-patch-18.1.0.jar +arrow-memory-netty/18.1.0//arrow-memory-netty-18.1.0.jar +arrow-vector/18.1.0//arrow-vector-18.1.0.jar audience-annotations/0.12.0//audience-annotations-0.12.0.jar avro-ipc/1.12.0//avro-ipc-1.12.0.jar avro-mapred/1.12.0//avro-mapred-1.12.0.jar @@ -33,11 +33,11 @@ breeze-macros_2.13/2.1.0//breeze-macros_2.13-2.1.0.jar breeze_2.13/2.1.0//breeze_2.13-2.1.0.jar bundle/2.24.6//bundle-2.24.6.jar cats-kernel_2.13/2.8.0//cats-kernel_2.13-2.8.0.jar -checker-qual/3.42.0//checker-qual-3.42.0.jar +checker-qual/3.43.0//checker-qual-3.43.0.jar chill-java/0.10.0//chill-java-0.10.0.jar chill_2.13/0.10.0//chill_2.13-0.10.0.jar commons-cli/1.9.0//commons-cli-1.9.0.jar -commons-codec/1.17.1//commons-codec-1.17.1.jar +commons-codec/1.17.2//commons-codec-1.17.2.jar commons-collections/3.2.2//commons-collections-3.2.2.jar commons-collections4/4.4//commons-collections4-4.4.jar commons-compiler/3.1.9//commons-compiler-3.1.9.jar @@ -49,7 +49,7 @@ commons-lang/2.6//commons-lang-2.6.jar commons-lang3/3.17.0//commons-lang3-3.17.0.jar commons-math3/3.6.1//commons-math3-3.6.1.jar commons-pool/1.5.4//commons-pool-1.5.4.jar -commons-text/1.12.0//commons-text-1.12.0.jar +commons-text/1.13.0//commons-text-1.13.0.jar compress-lzf/1.1.2//compress-lzf-1.1.2.jar curator-client/5.7.1//curator-client-5.7.1.jar curator-framework/5.7.1//curator-framework-5.7.1.jar @@ -63,14 +63,14 @@ derby/10.16.1.1//derby-10.16.1.1.jar derbyshared/10.16.1.1//derbyshared-10.16.1.1.jar derbytools/10.16.1.1//derbytools-10.16.1.1.jar dropwizard-metrics-hadoop-metrics2-reporter/0.1.2//dropwizard-metrics-hadoop-metrics2-reporter-0.1.2.jar 
-error_prone_annotations/2.26.1//error_prone_annotations-2.26.1.jar +error_prone_annotations/2.28.0//error_prone_annotations-2.28.0.jar esdk-obs-java/3.20.4.2//esdk-obs-java-3.20.4.2.jar failureaccess/1.0.2//failureaccess-1.0.2.jar flatbuffers-java/24.3.25//flatbuffers-java-24.3.25.jar gcs-connector/hadoop3-2.2.25/shaded/gcs-connector-hadoop3-2.2.25-shaded.jar gmetric4j/1.0.10//gmetric4j-1.0.10.jar gson/2.11.0//gson-2.11.0.jar -guava/33.2.1-jre//guava-33.2.1-jre.jar +guava/33.3.1-jre//guava-33.3.1-jre.jar hadoop-aliyun/3.4.1//hadoop-aliyun-3.4.1.jar hadoop-annotations/3.4.1//hadoop-annotations-3.4.1.jar hadoop-aws/3.4.1//hadoop-aws-3.4.1.jar @@ -103,18 +103,17 @@ httpcore/4.4.16//httpcore-4.4.16.jar icu4j/76.1//icu4j-76.1.jar ini4j/0.5.4//ini4j-0.5.4.jar istack-commons-runtime/3.0.8//istack-commons-runtime-3.0.8.jar -ivy/2.5.2//ivy-2.5.2.jar +ivy/2.5.3//ivy-2.5.3.jar j2objc-annotations/3.0.0//j2objc-annotations-3.0.0.jar -jackson-annotations/2.18.1//jackson-annotations-2.18.1.jar +jackson-annotations/2.18.2//jackson-annotations-2.18.2.jar jackson-core-asl/1.9.13//jackson-core-asl-1.9.13.jar -jackson-core/2.18.1//jackson-core-2.18.1.jar -jackson-databind/2.18.1//jackson-databind-2.18.1.jar -jackson-dataformat-cbor/2.18.1//jackson-dataformat-cbor-2.18.1.jar -jackson-dataformat-yaml/2.18.1//jackson-dataformat-yaml-2.18.1.jar -jackson-datatype-jdk8/2.17.0//jackson-datatype-jdk8-2.17.0.jar -jackson-datatype-jsr310/2.18.1//jackson-datatype-jsr310-2.18.1.jar +jackson-core/2.18.2//jackson-core-2.18.2.jar +jackson-databind/2.18.2//jackson-databind-2.18.2.jar +jackson-dataformat-cbor/2.18.2//jackson-dataformat-cbor-2.18.2.jar +jackson-dataformat-yaml/2.18.2//jackson-dataformat-yaml-2.18.2.jar +jackson-datatype-jsr310/2.18.2//jackson-datatype-jsr310-2.18.2.jar jackson-mapper-asl/1.9.13//jackson-mapper-asl-1.9.13.jar -jackson-module-scala_2.13/2.18.1//jackson-module-scala_2.13-2.18.1.jar +jackson-module-scala_2.13/2.18.2//jackson-module-scala_2.13-2.18.2.jar 
jakarta.annotation-api/2.1.1//jakarta.annotation-api-2.1.1.jar jakarta.inject-api/2.0.1//jakarta.inject-api-2.0.1.jar jakarta.servlet-api/5.0.0//jakarta.servlet-api-5.0.0.jar @@ -159,74 +158,75 @@ jsr305/3.0.0//jsr305-3.0.0.jar jta/1.1//jta-1.1.jar jul-to-slf4j/2.0.16//jul-to-slf4j-2.0.16.jar kryo-shaded/4.0.2//kryo-shaded-4.0.2.jar -kubernetes-client-api/6.13.4//kubernetes-client-api-6.13.4.jar -kubernetes-client/6.13.4//kubernetes-client-6.13.4.jar -kubernetes-httpclient-okhttp/6.13.4//kubernetes-httpclient-okhttp-6.13.4.jar -kubernetes-model-admissionregistration/6.13.4//kubernetes-model-admissionregistration-6.13.4.jar -kubernetes-model-apiextensions/6.13.4//kubernetes-model-apiextensions-6.13.4.jar -kubernetes-model-apps/6.13.4//kubernetes-model-apps-6.13.4.jar -kubernetes-model-autoscaling/6.13.4//kubernetes-model-autoscaling-6.13.4.jar -kubernetes-model-batch/6.13.4//kubernetes-model-batch-6.13.4.jar -kubernetes-model-certificates/6.13.4//kubernetes-model-certificates-6.13.4.jar -kubernetes-model-common/6.13.4//kubernetes-model-common-6.13.4.jar -kubernetes-model-coordination/6.13.4//kubernetes-model-coordination-6.13.4.jar -kubernetes-model-core/6.13.4//kubernetes-model-core-6.13.4.jar -kubernetes-model-discovery/6.13.4//kubernetes-model-discovery-6.13.4.jar -kubernetes-model-events/6.13.4//kubernetes-model-events-6.13.4.jar -kubernetes-model-extensions/6.13.4//kubernetes-model-extensions-6.13.4.jar -kubernetes-model-flowcontrol/6.13.4//kubernetes-model-flowcontrol-6.13.4.jar -kubernetes-model-gatewayapi/6.13.4//kubernetes-model-gatewayapi-6.13.4.jar -kubernetes-model-metrics/6.13.4//kubernetes-model-metrics-6.13.4.jar -kubernetes-model-networking/6.13.4//kubernetes-model-networking-6.13.4.jar -kubernetes-model-node/6.13.4//kubernetes-model-node-6.13.4.jar -kubernetes-model-policy/6.13.4//kubernetes-model-policy-6.13.4.jar -kubernetes-model-rbac/6.13.4//kubernetes-model-rbac-6.13.4.jar -kubernetes-model-resource/6.13.4//kubernetes-model-resource-6.13.4.jar 
-kubernetes-model-scheduling/6.13.4//kubernetes-model-scheduling-6.13.4.jar -kubernetes-model-storageclass/6.13.4//kubernetes-model-storageclass-6.13.4.jar +kubernetes-client-api/7.0.1//kubernetes-client-api-7.0.1.jar +kubernetes-client/7.0.1//kubernetes-client-7.0.1.jar +kubernetes-httpclient-vertx/7.0.1//kubernetes-httpclient-vertx-7.0.1.jar +kubernetes-model-admissionregistration/7.0.1//kubernetes-model-admissionregistration-7.0.1.jar +kubernetes-model-apiextensions/7.0.1//kubernetes-model-apiextensions-7.0.1.jar +kubernetes-model-apps/7.0.1//kubernetes-model-apps-7.0.1.jar +kubernetes-model-autoscaling/7.0.1//kubernetes-model-autoscaling-7.0.1.jar +kubernetes-model-batch/7.0.1//kubernetes-model-batch-7.0.1.jar +kubernetes-model-certificates/7.0.1//kubernetes-model-certificates-7.0.1.jar +kubernetes-model-common/7.0.1//kubernetes-model-common-7.0.1.jar +kubernetes-model-coordination/7.0.1//kubernetes-model-coordination-7.0.1.jar +kubernetes-model-core/7.0.1//kubernetes-model-core-7.0.1.jar +kubernetes-model-discovery/7.0.1//kubernetes-model-discovery-7.0.1.jar +kubernetes-model-events/7.0.1//kubernetes-model-events-7.0.1.jar +kubernetes-model-extensions/7.0.1//kubernetes-model-extensions-7.0.1.jar +kubernetes-model-flowcontrol/7.0.1//kubernetes-model-flowcontrol-7.0.1.jar +kubernetes-model-gatewayapi/7.0.1//kubernetes-model-gatewayapi-7.0.1.jar +kubernetes-model-metrics/7.0.1//kubernetes-model-metrics-7.0.1.jar +kubernetes-model-networking/7.0.1//kubernetes-model-networking-7.0.1.jar +kubernetes-model-node/7.0.1//kubernetes-model-node-7.0.1.jar +kubernetes-model-policy/7.0.1//kubernetes-model-policy-7.0.1.jar +kubernetes-model-rbac/7.0.1//kubernetes-model-rbac-7.0.1.jar +kubernetes-model-resource/7.0.1//kubernetes-model-resource-7.0.1.jar +kubernetes-model-scheduling/7.0.1//kubernetes-model-scheduling-7.0.1.jar +kubernetes-model-storageclass/7.0.1//kubernetes-model-storageclass-7.0.1.jar lapack/3.0.3//lapack-3.0.3.jar leveldbjni-all/1.8//leveldbjni-all-1.8.jar 
libfb303/0.9.3//libfb303-0.9.3.jar libthrift/0.16.0//libthrift-0.16.0.jar listenablefuture/9999.0-empty-to-avoid-conflict-with-guava//listenablefuture-9999.0-empty-to-avoid-conflict-with-guava.jar -log4j-1.2-api/2.24.1//log4j-1.2-api-2.24.1.jar -log4j-api/2.24.1//log4j-api-2.24.1.jar -log4j-core/2.24.1//log4j-core-2.24.1.jar -log4j-layout-template-json/2.24.1//log4j-layout-template-json-2.24.1.jar -log4j-slf4j2-impl/2.24.1//log4j-slf4j2-impl-2.24.1.jar -logging-interceptor/3.12.12//logging-interceptor-3.12.12.jar +log4j-1.2-api/2.24.3//log4j-1.2-api-2.24.3.jar +log4j-api/2.24.3//log4j-api-2.24.3.jar +log4j-core/2.24.3//log4j-core-2.24.3.jar +log4j-layout-template-json/2.24.3//log4j-layout-template-json-2.24.3.jar +log4j-slf4j2-impl/2.24.3//log4j-slf4j2-impl-2.24.3.jar lz4-java/1.8.0//lz4-java-1.8.0.jar -metrics-core/4.2.28//metrics-core-4.2.28.jar -metrics-graphite/4.2.28//metrics-graphite-4.2.28.jar -metrics-jmx/4.2.28//metrics-jmx-4.2.28.jar -metrics-json/4.2.28//metrics-json-4.2.28.jar -metrics-jvm/4.2.28//metrics-jvm-4.2.28.jar +metrics-core/4.2.29//metrics-core-4.2.29.jar +metrics-graphite/4.2.29//metrics-graphite-4.2.29.jar +metrics-jmx/4.2.29//metrics-jmx-4.2.29.jar +metrics-json/4.2.29//metrics-json-4.2.29.jar +metrics-jvm/4.2.29//metrics-jvm-4.2.29.jar minlog/1.3.0//minlog-1.3.0.jar -netty-all/4.1.114.Final//netty-all-4.1.114.Final.jar -netty-buffer/4.1.114.Final//netty-buffer-4.1.114.Final.jar -netty-codec-http/4.1.114.Final//netty-codec-http-4.1.114.Final.jar -netty-codec-http2/4.1.114.Final//netty-codec-http2-4.1.114.Final.jar -netty-codec-socks/4.1.114.Final//netty-codec-socks-4.1.114.Final.jar -netty-codec/4.1.114.Final//netty-codec-4.1.114.Final.jar -netty-common/4.1.114.Final//netty-common-4.1.114.Final.jar -netty-handler-proxy/4.1.114.Final//netty-handler-proxy-4.1.114.Final.jar -netty-handler/4.1.114.Final//netty-handler-4.1.114.Final.jar -netty-resolver/4.1.114.Final//netty-resolver-4.1.114.Final.jar 
+netty-all/4.1.115.Final//netty-all-4.1.115.Final.jar +netty-buffer/4.1.115.Final//netty-buffer-4.1.115.Final.jar +netty-codec-dns/4.1.115.Final//netty-codec-dns-4.1.115.Final.jar +netty-codec-http/4.1.115.Final//netty-codec-http-4.1.115.Final.jar +netty-codec-http2/4.1.115.Final//netty-codec-http2-4.1.115.Final.jar +netty-codec-socks/4.1.115.Final//netty-codec-socks-4.1.115.Final.jar +netty-codec/4.1.115.Final//netty-codec-4.1.115.Final.jar +netty-common/4.1.115.Final//netty-common-4.1.115.Final.jar +netty-handler-proxy/4.1.115.Final//netty-handler-proxy-4.1.115.Final.jar +netty-handler/4.1.115.Final//netty-handler-4.1.115.Final.jar +netty-resolver-dns/4.1.115.Final//netty-resolver-dns-4.1.115.Final.jar +netty-resolver/4.1.115.Final//netty-resolver-4.1.115.Final.jar netty-tcnative-boringssl-static/2.0.69.Final/linux-aarch_64/netty-tcnative-boringssl-static-2.0.69.Final-linux-aarch_64.jar netty-tcnative-boringssl-static/2.0.69.Final/linux-x86_64/netty-tcnative-boringssl-static-2.0.69.Final-linux-x86_64.jar netty-tcnative-boringssl-static/2.0.69.Final/osx-aarch_64/netty-tcnative-boringssl-static-2.0.69.Final-osx-aarch_64.jar netty-tcnative-boringssl-static/2.0.69.Final/osx-x86_64/netty-tcnative-boringssl-static-2.0.69.Final-osx-x86_64.jar netty-tcnative-boringssl-static/2.0.69.Final/windows-x86_64/netty-tcnative-boringssl-static-2.0.69.Final-windows-x86_64.jar netty-tcnative-classes/2.0.69.Final//netty-tcnative-classes-2.0.69.Final.jar -netty-transport-classes-epoll/4.1.114.Final//netty-transport-classes-epoll-4.1.114.Final.jar -netty-transport-classes-kqueue/4.1.114.Final//netty-transport-classes-kqueue-4.1.114.Final.jar -netty-transport-native-epoll/4.1.114.Final/linux-aarch_64/netty-transport-native-epoll-4.1.114.Final-linux-aarch_64.jar -netty-transport-native-epoll/4.1.114.Final/linux-riscv64/netty-transport-native-epoll-4.1.114.Final-linux-riscv64.jar 
-netty-transport-native-epoll/4.1.114.Final/linux-x86_64/netty-transport-native-epoll-4.1.114.Final-linux-x86_64.jar -netty-transport-native-kqueue/4.1.114.Final/osx-aarch_64/netty-transport-native-kqueue-4.1.114.Final-osx-aarch_64.jar -netty-transport-native-kqueue/4.1.114.Final/osx-x86_64/netty-transport-native-kqueue-4.1.114.Final-osx-x86_64.jar -netty-transport-native-unix-common/4.1.114.Final//netty-transport-native-unix-common-4.1.114.Final.jar -netty-transport/4.1.114.Final//netty-transport-4.1.114.Final.jar +netty-transport-classes-epoll/4.1.115.Final//netty-transport-classes-epoll-4.1.115.Final.jar +netty-transport-classes-kqueue/4.1.115.Final//netty-transport-classes-kqueue-4.1.115.Final.jar +netty-transport-native-epoll/4.1.115.Final/linux-aarch_64/netty-transport-native-epoll-4.1.115.Final-linux-aarch_64.jar +netty-transport-native-epoll/4.1.115.Final/linux-riscv64/netty-transport-native-epoll-4.1.115.Final-linux-riscv64.jar +netty-transport-native-epoll/4.1.115.Final/linux-x86_64/netty-transport-native-epoll-4.1.115.Final-linux-x86_64.jar +netty-transport-native-kqueue/4.1.115.Final/osx-aarch_64/netty-transport-native-kqueue-4.1.115.Final-osx-aarch_64.jar +netty-transport-native-kqueue/4.1.115.Final/osx-x86_64/netty-transport-native-kqueue-4.1.115.Final-osx-x86_64.jar +netty-transport-native-unix-common/4.1.115.Final//netty-transport-native-unix-common-4.1.115.Final.jar +netty-transport/4.1.115.Final//netty-transport-4.1.115.Final.jar objenesis/3.3//objenesis-3.3.jar okhttp/3.12.12//okhttp-3.12.12.jar okio/1.17.6//okio-1.17.6.jar @@ -234,21 +234,21 @@ opencsv/2.3//opencsv-2.3.jar opentracing-api/0.33.0//opentracing-api-0.33.0.jar opentracing-noop/0.33.0//opentracing-noop-0.33.0.jar opentracing-util/0.33.0//opentracing-util-0.33.0.jar -orc-core/2.0.3/shaded-protobuf/orc-core-2.0.3-shaded-protobuf.jar +orc-core/2.1.0/shaded-protobuf/orc-core-2.1.0-shaded-protobuf.jar orc-format/1.0.0/shaded-protobuf/orc-format-1.0.0-shaded-protobuf.jar 
-orc-mapreduce/2.0.3/shaded-protobuf/orc-mapreduce-2.0.3-shaded-protobuf.jar -orc-shims/2.0.3//orc-shims-2.0.3.jar +orc-mapreduce/2.1.0/shaded-protobuf/orc-mapreduce-2.1.0-shaded-protobuf.jar +orc-shims/2.1.0//orc-shims-2.1.0.jar oro/2.0.8//oro-2.0.8.jar osgi-resource-locator/1.0.3//osgi-resource-locator-1.0.3.jar paranamer/2.8//paranamer-2.8.jar -parquet-column/1.14.4//parquet-column-1.14.4.jar -parquet-common/1.14.4//parquet-common-1.14.4.jar -parquet-encoding/1.14.4//parquet-encoding-1.14.4.jar -parquet-format-structures/1.14.4//parquet-format-structures-1.14.4.jar -parquet-hadoop/1.14.4//parquet-hadoop-1.14.4.jar -parquet-jackson/1.14.4//parquet-jackson-1.14.4.jar +parquet-column/1.15.0//parquet-column-1.15.0.jar +parquet-common/1.15.0//parquet-common-1.15.0.jar +parquet-encoding/1.15.0//parquet-encoding-1.15.0.jar +parquet-format-structures/1.15.0//parquet-format-structures-1.15.0.jar +parquet-hadoop/1.15.0//parquet-hadoop-1.15.0.jar +parquet-jackson/1.15.0//parquet-jackson-1.15.0.jar pickle/1.5//pickle-1.5.jar -py4j/0.10.9.7//py4j-0.10.9.7.jar +py4j/0.10.9.9//py4j-0.10.9.9.jar remotetea-oncrpc/1.1.2//remotetea-oncrpc-1.1.2.jar rocksdbjni/9.7.3//rocksdbjni-9.7.3.jar scala-collection-compat_2.13/2.7.0//scala-collection-compat_2.13-2.7.0.jar @@ -259,7 +259,7 @@ scala-parser-combinators_2.13/2.4.0//scala-parser-combinators_2.13-2.4.0.jar scala-reflect/2.13.15//scala-reflect-2.13.15.jar scala-xml_2.13/2.3.0//scala-xml_2.13-2.3.0.jar slf4j-api/2.0.16//slf4j-api-2.0.16.jar -snakeyaml-engine/2.7//snakeyaml-engine-2.7.jar +snakeyaml-engine/2.8//snakeyaml-engine-2.8.jar snakeyaml/2.3//snakeyaml-2.3.jar snappy-java/1.1.10.7//snappy-java-1.1.10.7.jar spire-macros_2.13/0.18.0//spire-macros_2.13-0.18.0.jar @@ -269,15 +269,19 @@ spire_2.13/0.18.0//spire_2.13-0.18.0.jar stax-api/1.0.1//stax-api-1.0.1.jar stream/2.9.8//stream-2.9.8.jar super-csv/2.2.0//super-csv-2.2.0.jar -threeten-extra/1.7.1//threeten-extra-1.7.1.jar -tink/1.15.0//tink-1.15.0.jar 
+threeten-extra/1.8.0//threeten-extra-1.8.0.jar +tink/1.16.0//tink-1.16.0.jar transaction-api/1.1//transaction-api-1.1.jar univocity-parsers/2.9.1//univocity-parsers-2.9.1.jar +vertx-auth-common/4.5.11//vertx-auth-common-4.5.11.jar +vertx-core/4.5.11//vertx-core-4.5.11.jar +vertx-web-client/4.5.11//vertx-web-client-4.5.11.jar +vertx-web-common/4.5.11//vertx-web-common-4.5.11.jar wildfly-openssl/1.1.3.Final//wildfly-openssl-1.1.3.Final.jar xbean-asm9-shaded/4.26//xbean-asm9-shaded-4.26.jar xmlschema-core/2.3.1//xmlschema-core-2.3.1.jar xz/1.10//xz-1.10.jar -zjsonpatch/0.3.0//zjsonpatch-0.3.0.jar +zjsonpatch/7.0.1//zjsonpatch-7.0.1.jar zookeeper-jute/3.9.3//zookeeper-jute-3.9.3.jar zookeeper/3.9.3//zookeeper-3.9.3.jar -zstd-jni/1.5.6-7//zstd-jni-1.5.6-7.jar +zstd-jni/1.5.6-9//zstd-jni-1.5.6-9.jar diff --git a/dev/eslint.js b/dev/eslint.js new file mode 100644 index 0000000000000..24b5170b436a9 --- /dev/null +++ b/dev/eslint.js @@ -0,0 +1,45 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +module.exports = { + "env": { + "browser": true, + "es6": true, + "jest": true + }, + "extends": "eslint:recommended", + "rules": { + "indent": [ + "error", + 2, + { + "SwitchCase": 1, + "MemberExpression": "off" + } + ], + "no-unused-vars": ["error", {"argsIgnorePattern": "^_ignored_.*"}] + }, + "ignorePatterns": [ + "*.min.js", + "sorttable.js", + "jquery.mustache.js", + "dataTables.rowsGroup.js" + ], + "parserOptions": { + "sourceType": "module" + } +} diff --git a/dev/eslint.json b/dev/eslint.json deleted file mode 100644 index 29692696a6df2..0000000000000 --- a/dev/eslint.json +++ /dev/null @@ -1,28 +0,0 @@ -{ - "env": { - "browser": true, - "es6": true, - "jest": true - }, - "extends": "eslint:recommended", - "rules": { - "indent": [ - "error", - 2, - { - "SwitchCase": 1, - "MemberExpression": "off" - } - ], - "no-unused-vars": ["error", {"argsIgnorePattern": "^_ignored_.*"}] - }, - "ignorePatterns": [ - "*.min.js", - "sorttable.js", - "jquery.mustache.js", - "dataTables.rowsGroup.js" - ], - "parserOptions": { - "sourceType": "module" - } -} diff --git a/dev/infra/Dockerfile b/dev/infra/Dockerfile index 2817818cbc4e3..9cd6031023ca5 100644 --- a/dev/infra/Dockerfile +++ b/dev/infra/Dockerfile @@ -24,10 +24,10 @@ LABEL org.opencontainers.image.ref.name="Apache Spark Infra Image" # Overwrite this label to avoid exposing the underlying Ubuntu OS version label LABEL org.opencontainers.image.version="" -ENV FULL_REFRESH_DATE 20241119 +ENV FULL_REFRESH_DATE=20241119 -ENV DEBIAN_FRONTEND noninteractive -ENV DEBCONF_NONINTERACTIVE_SEEN true +ENV DEBIAN_FRONTEND=noninteractive +ENV DEBCONF_NONINTERACTIVE_SEEN=true RUN apt-get update && apt-get install -y \ build-essential \ @@ -82,7 +82,7 @@ RUN Rscript -e "install.packages(c('devtools', 'knitr', 'markdown', \ Rscript -e "devtools::install_version('preferably', version='0.4', repos='https://cloud.r-project.org')" # See more in SPARK-39735 -ENV R_LIBS_SITE 
"/usr/local/lib/R/site-library:${R_LIBS_SITE}:/usr/lib/R/library" +ENV R_LIBS_SITE="/usr/local/lib/R/site-library:${R_LIBS_SITE}:/usr/lib/R/library" RUN add-apt-repository ppa:pypy/ppa @@ -96,7 +96,7 @@ RUN pypy3 -m pip install numpy 'six==1.16.0' 'pandas==2.2.3' scipy coverage matp ARG BASIC_PIP_PKGS="numpy pyarrow>=18.0.0 six==1.16.0 pandas==2.2.3 scipy plotly>=4.8 mlflow>=2.8.1 coverage matplotlib openpyxl memory-profiler>=0.61.0 scikit-learn>=1.3.2" # Python deps for Spark Connect -ARG CONNECT_PIP_PKGS="grpcio==1.67.0 grpcio-status==1.67.0 protobuf==5.28.3 googleapis-common-protos==1.65.0 graphviz==0.20.3" +ARG CONNECT_PIP_PKGS="grpcio==1.67.0 grpcio-status==1.67.0 protobuf==5.29.1 googleapis-common-protos==1.65.0 graphviz==0.20.3" # Install Python 3.10 packages RUN curl -sS https://bootstrap.pypa.io/get-pip.py | python3.10 diff --git a/dev/lint-js b/dev/lint-js index f3f7eac4f6b8e..1a94348b7430a 100755 --- a/dev/lint-js +++ b/dev/lint-js @@ -45,7 +45,7 @@ if ! npm ls eslint > /dev/null; then npm ci eslint fi -npx eslint -c "$SPARK_ROOT_DIR/dev/eslint.json" ${LINT_TARGET_FILES[@]} | tee "$LINT_JS_REPORT_FILE_NAME" +npx eslint -c "$SPARK_ROOT_DIR/dev/eslint.js" ${LINT_TARGET_FILES[@]} | tee "$LINT_JS_REPORT_FILE_NAME" lint_status=$? if [ "$lint_status" = "0" ] ; then diff --git a/dev/lint-scala b/dev/lint-scala index 23df146a8d1b4..30642a550401e 100755 --- a/dev/lint-scala +++ b/dev/lint-scala @@ -20,8 +20,10 @@ SCRIPT_DIR="$( cd "$( dirname "$0" )" && pwd )" SPARK_ROOT_DIR="$(dirname $SCRIPT_DIR)" +set -e "$SCRIPT_DIR/scalastyle" "$1" +set +e # For Spark Connect, we actively enforce scalafmt and check that the produced diff is empty. 
ERRORS=$(./build/mvn \ -Pscala-2.13 \ diff --git a/dev/merge_spark_pr.py b/dev/merge_spark_pr.py index 4ebd3e4b951f5..415f468a11577 100755 --- a/dev/merge_spark_pr.py +++ b/dev/merge_spark_pr.py @@ -419,7 +419,7 @@ def choose_jira_assignee(issue): annotations.append("Commentator") print("[%d] %s (%s)" % (idx, author.displayName, ",".join(annotations))) raw_assignee = bold_input( - "Enter number of user, or userid, to assign to (blank to leave unassigned):" + "Enter number of user, or userid, to assign to (blank to leave unassigned): " ) if raw_assignee == "": return None diff --git a/dev/package-lock.json b/dev/package-lock.json index f676b9cec0762..e6ec1406a7620 100644 --- a/dev/package-lock.json +++ b/dev/package-lock.json @@ -4,6 +4,7 @@ "requires": true, "packages": { "": { + "name": "dev", "devDependencies": { "ansi-regex": "^5.0.1", "eslint": "^7.25.0", @@ -316,10 +317,11 @@ "dev": true }, "node_modules/cross-spawn": { - "version": "7.0.3", - "resolved": "https://registry.npmjs.org/cross-spawn/-/cross-spawn-7.0.3.tgz", - "integrity": "sha512-iRDPJKUPVEND7dHPO8rkbOnPpyDygcDFtWjpeWNCgy8WP2rXcxXL8TskReQl6OrB2G7+UJrags1q15Fudc7G6w==", + "version": "7.0.6", + "resolved": "https://registry.npmjs.org/cross-spawn/-/cross-spawn-7.0.6.tgz", + "integrity": "sha512-uV2QOWP2nWzsy2aMp8aRibhi9dlzF5Hgh5SHaB9OiTGEyDTiJJyx0uy51QXdyWbtAHNua4XJzUKca3OzKUd3vA==", "dev": true, + "license": "MIT", "dependencies": { "path-key": "^3.1.0", "shebang-command": "^2.0.0", @@ -1469,9 +1471,9 @@ "dev": true }, "cross-spawn": { - "version": "7.0.3", - "resolved": "https://registry.npmjs.org/cross-spawn/-/cross-spawn-7.0.3.tgz", - "integrity": "sha512-iRDPJKUPVEND7dHPO8rkbOnPpyDygcDFtWjpeWNCgy8WP2rXcxXL8TskReQl6OrB2G7+UJrags1q15Fudc7G6w==", + "version": "7.0.6", + "resolved": "https://registry.npmjs.org/cross-spawn/-/cross-spawn-7.0.6.tgz", + "integrity": "sha512-uV2QOWP2nWzsy2aMp8aRibhi9dlzF5Hgh5SHaB9OiTGEyDTiJJyx0uy51QXdyWbtAHNua4XJzUKca3OzKUd3vA==", "dev": true, "requires": { "path-key": 
"^3.1.0", diff --git a/dev/pyproject.toml b/dev/pyproject.toml index f19107b3782a6..8b91943009555 100644 --- a/dev/pyproject.toml +++ b/dev/pyproject.toml @@ -27,7 +27,7 @@ testpaths = [ [tool.black] # When changing the version, we have to update # GitHub workflow version and dev/reformat-python -required-version = "23.9.1" +required-version = "23.12.1" line-length = 100 target-version = ['py39'] include = '\.pyi?$' diff --git a/dev/reformat-python b/dev/reformat-python index 46b7efc931aae..9a1199faa938e 100755 --- a/dev/reformat-python +++ b/dev/reformat-python @@ -22,7 +22,7 @@ FWDIR="$( cd "$DIR"/.. && pwd )" cd "$FWDIR" BLACK_BUILD="${PYTHON_EXECUTABLE} -m black" -BLACK_VERSION="23.9.1" +BLACK_VERSION="23.12.1" $PYTHON_EXECUTABLE -c 'import black' 2> /dev/null if [ $? -ne 0 ]; then echo "The Python library providing the 'black' module was not found. Please install Black, for example, via 'pip install black==$BLACK_VERSION'." diff --git a/dev/requirements.txt b/dev/requirements.txt index a9874f77113ab..36548c2eae408 100644 --- a/dev/requirements.txt +++ b/dev/requirements.txt @@ -1,11 +1,11 @@ # PySpark dependencies (required) -py4j>=0.10.9.7 +py4j>=0.10.9.9 # PySpark dependencies (optional) numpy>=1.21 -pyarrow>=10.0.0 +pyarrow>=11.0.0 six==1.16.0 -pandas>=2.0.0 +pandas>=2.2.0 scipy plotly>=4.8 mlflow>=2.3.1 @@ -54,14 +54,14 @@ jira>=3.5.2 PyGithub # pandas API on Spark Code formatter. 
-black==23.9.1 +black==23.12.1 py # Spark Connect (required) grpcio>=1.67.0 grpcio-status>=1.67.0 googleapis-common-protos>=1.65.0 -protobuf==5.28.3 +protobuf==5.29.1 # Spark Connect python proto generation plugin (optional) mypy-protobuf==3.3.0 diff --git a/dev/run-pip-tests b/dev/run-pip-tests index 91399ff1e25ea..f8a547b0c917c 100755 --- a/dev/run-pip-tests +++ b/dev/run-pip-tests @@ -87,10 +87,6 @@ for python in "${PYTHON_EXECS[@]}"; do VIRTUALENV_PATH="$VIRTUALENV_BASE"/$python rm -rf "$VIRTUALENV_PATH" if [ -n "$USE_CONDA" ]; then - if [ -f "$CONDA_PREFIX/etc/profile.d/conda.sh" ]; then - # See also https://github.com/conda/conda/issues/7980 - source "$CONDA_PREFIX/etc/profile.d/conda.sh" - fi conda create -y -p "$VIRTUALENV_PATH" python=$python numpy pandas pip setuptools source activate "$VIRTUALENV_PATH" || conda activate "$VIRTUALENV_PATH" else diff --git a/dev/spark-test-image-util/docs/build-docs b/dev/spark-test-image-util/docs/build-docs new file mode 100755 index 0000000000000..ca59769f24231 --- /dev/null +++ b/dev/spark-test-image-util/docs/build-docs @@ -0,0 +1,73 @@ +#!/usr/bin/env bash + +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +if ! 
[ -x "$(command -v docker)" ]; then + echo "Error: Docker is not installed." >&2 + exit 1 +fi + +DOCKER_CACHE_IMG="ghcr.io/apache/spark/apache-spark-github-action-image-docs-cache:master" +REPO_OWNER="apache/spark" +REPOSITORY="apache-spark-ci-image-docs" +IMG_TAG=$(date +%s) +IMG_NAME="${REPOSITORY}:${IMG_TAG}" +IMG_URL="$REPO_OWNER/$IMG_NAME" +DOCKER_MOUNT_SPARK_HOME="/__w/spark/spark" +BUILD_DOCS_SCRIPT_PATH="${DOCKER_MOUNT_SPARK_HOME}/dev/spark-test-image-util/docs/run-in-container" + +FWDIR="$(cd "`dirname "${BASH_SOURCE[0]}"`"; pwd)" +SPARK_HOME="$(cd "`dirname "${BASH_SOURCE[0]}"`"/../../..; pwd)" + +# 1.Compile spark outside the container to prepare for generating documents inside the container. +build/sbt -Phive -Pkinesis-asl clean unidoc package + +# 2.Build container image. +docker buildx build \ + --cache-from type=registry,ref="${DOCKER_CACHE_IMG}" \ + --tag "${IMG_URL}" "${FWDIR}" \ + --file "${SPARK_HOME}/dev/spark-test-image/docs/Dockerfile" + +# 3.Build docs on container: `error docs`, `scala doc`, `python doc`, `sql doc`. +docker run \ + --mount type=bind,source="${SPARK_HOME}",target="${DOCKER_MOUNT_SPARK_HOME}" \ + --interactive --tty "${IMG_URL}" \ + /bin/bash -c "sh ${BUILD_DOCS_SCRIPT_PATH}" + +if [[ "$SKIP_RDOC" != "1" ]]; then + # 4.Build docs on host: `r doc`. + # + # Why does `r` document need to be compiled outside the container? + # Because when compiling inside the container, the permission of the directory + # `/__w/spark/spark/R/pkg/docs` automatically generated by `RScript` is `dr-xr--r-x`, + # and when writing to subsequent files, will throw an error as: + # `! [EACCES] Failed to copy '/usr/local/lib/R/site-library/pkgdown/BS5/assets/katex-auto.js' + # to '/__w/spark/spark/R/pkg/docs/katex-auto.js': permission denied` + export SKIP_ERRORDOC=1 + export SKIP_SCALADOC=1 + export SKIP_PYTHONDOC=1 + export SKIP_SQLDOC=1 + cd docs + bundle exec jekyll build +fi + +# 5.Remove container image. 
+IMG_ID=$(docker images | grep "${IMG_TAG}" | awk '{print $3}') +docker image rm --force "${IMG_ID}" + +echo "Build doc done." diff --git a/dev/spark-test-image-util/docs/run-in-container b/dev/spark-test-image-util/docs/run-in-container new file mode 100644 index 0000000000000..1d43c602f7c72 --- /dev/null +++ b/dev/spark-test-image-util/docs/run-in-container @@ -0,0 +1,37 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# 1.Set env variable. +export JAVA_HOME=/usr/lib/jvm/java-17-openjdk-arm64 +export PATH=$JAVA_HOME/bin:$PATH +export SPARK_DOCS_IS_BUILT_ON_HOST=1 +# We expect to compile the R document on the host. +export SKIP_RDOC=1 + +# 2.Install bundler. +gem install bundler -v 2.4.22 +cd /__w/spark/spark/docs +bundle install + +# 3.Build docs, includes: `error docs`, `scala doc`, `python doc`, `sql doc`, excludes: `r doc`. +# We need this link to make sure `python3` points to `python3.9` which contains the prerequisite packages. +ln -s "$(which python3.9)" "/usr/local/bin/python3" + +# Build docs first with SKIP_API to ensure they are buildable without requiring any +# language docs to be built beforehand. 
+cd /__w/spark/spark/docs +bundle exec jekyll build diff --git a/dev/spark-test-image/docs/Dockerfile b/dev/spark-test-image/docs/Dockerfile index 2db7e0717cdfd..f1e33763df468 100644 --- a/dev/spark-test-image/docs/Dockerfile +++ b/dev/spark-test-image/docs/Dockerfile @@ -24,10 +24,10 @@ LABEL org.opencontainers.image.ref.name="Apache Spark Infra Image for Documentat # Overwrite this label to avoid exposing the underlying Ubuntu OS version label LABEL org.opencontainers.image.version="" -ENV FULL_REFRESH_DATE 20241029 +ENV FULL_REFRESH_DATE=20241029 -ENV DEBIAN_FRONTEND noninteractive -ENV DEBCONF_NONINTERACTIVE_SEEN true +ENV DEBIAN_FRONTEND=noninteractive +ENV DEBCONF_NONINTERACTIVE_SEEN=true RUN apt-get update && apt-get install -y \ build-essential \ @@ -72,7 +72,7 @@ RUN Rscript -e "install.packages(c('devtools', 'knitr', 'markdown', 'rmarkdown', Rscript -e "devtools::install_version('preferably', version='0.4', repos='https://cloud.r-project.org')" # See more in SPARK-39735 -ENV R_LIBS_SITE "/usr/local/lib/R/site-library:${R_LIBS_SITE}:/usr/lib/R/library" +ENV R_LIBS_SITE="/usr/local/lib/R/site-library:${R_LIBS_SITE}:/usr/lib/R/library" # Install Python 3.9 RUN add-apt-repository ppa:deadsnakes/ppa @@ -85,7 +85,7 @@ RUN curl -sS https://bootstrap.pypa.io/get-pip.py | python3.9 # See 'docutils<0.18.0' in SPARK-39421 RUN python3.9 -m pip install 'sphinx==4.5.0' mkdocs 'pydata_sphinx_theme>=0.13' sphinx-copybutton nbsphinx numpydoc jinja2 markupsafe 'pyzmq<24.0.0' \ ipython ipython_genutils sphinx_plotly_directive 'numpy>=1.20.0' pyarrow pandas 'plotly>=4.8' 'docutils<0.18.0' \ - 'flake8==3.9.0' 'mypy==1.8.0' 'pytest==7.1.3' 'pytest-mypy-plugins==1.9.3' 'black==23.9.1' \ - 'pandas-stubs==1.2.0.53' 'grpcio==1.62.0' 'grpcio-status==1.62.0' 'grpc-stubs==1.24.11' 'googleapis-common-protos-stubs==2.2.0' \ + 'flake8==3.9.0' 'mypy==1.8.0' 'pytest==7.1.3' 'pytest-mypy-plugins==1.9.3' 'black==23.12.1' \ + 'pandas-stubs==1.2.0.53' 'grpcio==1.67.0' 'grpcio-status==1.67.0' 
'protobuf==5.29.1' 'grpc-stubs==1.24.11' 'googleapis-common-protos-stubs==2.2.0' \ 'sphinxcontrib-applehelp==1.0.4' 'sphinxcontrib-devhelp==1.0.2' 'sphinxcontrib-htmlhelp==2.0.1' 'sphinxcontrib-qthelp==1.0.3' 'sphinxcontrib-serializinghtml==1.1.5' \ && python3.9 -m pip cache purge diff --git a/dev/spark-test-image/lint/Dockerfile b/dev/spark-test-image/lint/Dockerfile index f9ea3124291b1..c3ffd7ba4e4b2 100644 --- a/dev/spark-test-image/lint/Dockerfile +++ b/dev/spark-test-image/lint/Dockerfile @@ -24,10 +24,10 @@ LABEL org.opencontainers.image.ref.name="Apache Spark Infra Image for Linter" # Overwrite this label to avoid exposing the underlying Ubuntu OS version label LABEL org.opencontainers.image.version="" -ENV FULL_REFRESH_DATE 20241112 +ENV FULL_REFRESH_DATE=20241112 -ENV DEBIAN_FRONTEND noninteractive -ENV DEBCONF_NONINTERACTIVE_SEEN true +ENV DEBIAN_FRONTEND=noninteractive +ENV DEBCONF_NONINTERACTIVE_SEEN=true RUN apt-get update && apt-get install -y \ build-essential \ @@ -63,7 +63,7 @@ RUN Rscript -e "install.packages(c('devtools', 'knitr', 'markdown', 'rmarkdown', && Rscript -e "devtools::install_version('lintr', version='2.0.1', repos='https://cloud.r-project.org')" \ # See more in SPARK-39735 -ENV R_LIBS_SITE "/usr/local/lib/R/site-library:${R_LIBS_SITE}:/usr/lib/R/library" +ENV R_LIBS_SITE="/usr/local/lib/R/site-library:${R_LIBS_SITE}:/usr/lib/R/library" # Install Python 3.9 RUN add-apt-repository ppa:deadsnakes/ppa @@ -72,7 +72,7 @@ RUN apt-get update && apt-get install -y python3.9 python3.9-distutils \ RUN curl -sS https://bootstrap.pypa.io/get-pip.py | python3.9 RUN python3.9 -m pip install \ - 'black==23.9.1' \ + 'black==23.12.1' \ 'flake8==3.9.0' \ 'googleapis-common-protos-stubs==2.2.0' \ 'grpc-stubs==1.24.11' \ diff --git a/dev/spark-test-image/pypy-310/Dockerfile b/dev/spark-test-image/pypy-310/Dockerfile new file mode 100644 index 0000000000000..6a309d38f1d55 --- /dev/null +++ b/dev/spark-test-image/pypy-310/Dockerfile @@ -0,0 +1,71 @@ +# +# 
Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# Image for building and testing Spark branches. Based on Ubuntu 22.04. +# See also in https://hub.docker.com/_/ubuntu +FROM ubuntu:jammy-20240911.1 +LABEL org.opencontainers.image.authors="Apache Spark project " +LABEL org.opencontainers.image.licenses="Apache-2.0" +LABEL org.opencontainers.image.ref.name="Apache Spark Infra Image For PySpark with PyPy 3.10" +# Overwrite this label to avoid exposing the underlying Ubuntu OS version label +LABEL org.opencontainers.image.version="" + +ENV FULL_REFRESH_DATE=20241212 + +ENV DEBIAN_FRONTEND=noninteractive +ENV DEBCONF_NONINTERACTIVE_SEEN=true + +RUN apt-get update && apt-get install -y \ + build-essential \ + ca-certificates \ + curl \ + gfortran \ + git \ + gnupg \ + libcurl4-openssl-dev \ + libfontconfig1-dev \ + libfreetype6-dev \ + libfribidi-dev \ + libgit2-dev \ + libharfbuzz-dev \ + libjpeg-dev \ + liblapack-dev \ + libopenblas-dev \ + libpng-dev \ + libpython3-dev \ + libssl-dev \ + libtiff5-dev \ + libxml2-dev \ + openjdk-17-jdk-headless \ + pkg-config \ + qpdf \ + tzdata \ + software-properties-common \ + wget \ + zlib1g-dev \ + && apt-get autoremove --purge -y \ + && apt-get clean \ + && rm -rf /var/lib/apt/lists/* + + +RUN 
add-apt-repository ppa:pypy/ppa +RUN mkdir -p /usr/local/pypy/pypy3.10 && \ + curl -sqL https://downloads.python.org/pypy/pypy3.10-v7.3.17-linux64.tar.bz2 | tar xjf - -C /usr/local/pypy/pypy3.10 --strip-components=1 && \ + ln -sf /usr/local/pypy/pypy3.10/bin/pypy /usr/local/bin/pypy3.10 && \ + ln -sf /usr/local/pypy/pypy3.10/bin/pypy /usr/local/bin/pypy3 +RUN curl -sS https://bootstrap.pypa.io/get-pip.py | pypy3 +RUN pypy3 -m pip install numpy 'six==1.16.0' 'pandas==2.2.3' scipy coverage matplotlib lxml diff --git a/dev/spark-test-image/python-309/Dockerfile b/dev/spark-test-image/python-309/Dockerfile new file mode 100644 index 0000000000000..bfe23bf572add --- /dev/null +++ b/dev/spark-test-image/python-309/Dockerfile @@ -0,0 +1,80 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# Image for building and testing Spark branches. Based on Ubuntu 22.04. 
+# See also in https://hub.docker.com/_/ubuntu +FROM ubuntu:jammy-20240911.1 +LABEL org.opencontainers.image.authors="Apache Spark project " +LABEL org.opencontainers.image.licenses="Apache-2.0" +LABEL org.opencontainers.image.ref.name="Apache Spark Infra Image For PySpark with Python 3.09" +# Overwrite this label to avoid exposing the underlying Ubuntu OS version label +LABEL org.opencontainers.image.version="" + +ENV FULL_REFRESH_DATE=20241205 + +ENV DEBIAN_FRONTEND=noninteractive +ENV DEBCONF_NONINTERACTIVE_SEEN=true + +RUN apt-get update && apt-get install -y \ + build-essential \ + ca-certificates \ + curl \ + gfortran \ + git \ + gnupg \ + libcurl4-openssl-dev \ + libfontconfig1-dev \ + libfreetype6-dev \ + libfribidi-dev \ + libgit2-dev \ + libharfbuzz-dev \ + libjpeg-dev \ + liblapack-dev \ + libopenblas-dev \ + libpng-dev \ + libpython3-dev \ + libssl-dev \ + libtiff5-dev \ + libxml2-dev \ + openjdk-17-jdk-headless \ + pkg-config \ + qpdf \ + tzdata \ + software-properties-common \ + wget \ + zlib1g-dev + +# Install Python 3.9 +RUN add-apt-repository ppa:deadsnakes/ppa +RUN apt-get update && apt-get install -y \ + python3.9 \ + python3.9-distutils \ + && apt-get autoremove --purge -y \ + && apt-get clean \ + && rm -rf /var/lib/apt/lists/* + +ARG BASIC_PIP_PKGS="numpy pyarrow>=18.0.0 six==1.16.0 pandas==2.2.3 scipy plotly>=4.8 mlflow>=2.8.1 coverage matplotlib openpyxl memory-profiler>=0.61.0 scikit-learn>=1.3.2" +# Python deps for Spark Connect +ARG CONNECT_PIP_PKGS="grpcio==1.67.0 grpcio-status==1.67.0 protobuf==5.29.1 googleapis-common-protos==1.65.0 graphviz==0.20.3" + +# Install Python 3.9 packages +RUN curl -sS https://bootstrap.pypa.io/get-pip.py | python3.9 +RUN python3.9 -m pip install --ignore-installed blinker>=1.6.2 # mlflow needs this +RUN python3.9 -m pip install --force $BASIC_PIP_PKGS unittest-xml-reporting $CONNECT_PIP_PKGS && \ + python3.9 -m pip install torch torchvision --index-url https://download.pytorch.org/whl/cpu && \ + python3.9 -m 
pip install torcheval && \ + python3.9 -m pip cache purge diff --git a/dev/spark-test-image/python-310/Dockerfile b/dev/spark-test-image/python-310/Dockerfile new file mode 100644 index 0000000000000..b9875ba969f8d --- /dev/null +++ b/dev/spark-test-image/python-310/Dockerfile @@ -0,0 +1,77 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# Image for building and testing Spark branches. Based on Ubuntu 22.04. 
+# See also in https://hub.docker.com/_/ubuntu +FROM ubuntu:jammy-20240911.1 +LABEL org.opencontainers.image.authors="Apache Spark project " +LABEL org.opencontainers.image.licenses="Apache-2.0" +LABEL org.opencontainers.image.ref.name="Apache Spark Infra Image For PySpark with Python 3.10" +# Overwrite this label to avoid exposing the underlying Ubuntu OS version label +LABEL org.opencontainers.image.version="" + +ENV FULL_REFRESH_DATE=20241205 + +ENV DEBIAN_FRONTEND=noninteractive +ENV DEBCONF_NONINTERACTIVE_SEEN=true + +RUN apt-get update && apt-get install -y \ + build-essential \ + ca-certificates \ + curl \ + gfortran \ + git \ + gnupg \ + libcurl4-openssl-dev \ + libfontconfig1-dev \ + libfreetype6-dev \ + libfribidi-dev \ + libgit2-dev \ + libharfbuzz-dev \ + libjpeg-dev \ + liblapack-dev \ + libopenblas-dev \ + libpng-dev \ + libpython3-dev \ + libssl-dev \ + libtiff5-dev \ + libxml2-dev \ + openjdk-17-jdk-headless \ + pkg-config \ + python3.10 \ + python3-psutil \ + qpdf \ + tzdata \ + wget \ + zlib1g-dev \ + && apt-get autoremove --purge -y \ + && apt-get clean \ + && rm -rf /var/lib/apt/lists/* + + +ARG BASIC_PIP_PKGS="numpy pyarrow>=18.0.0 six==1.16.0 pandas==2.2.3 scipy plotly>=4.8 mlflow>=2.8.1 coverage matplotlib openpyxl memory-profiler>=0.61.0 scikit-learn>=1.3.2" +# Python deps for Spark Connect +ARG CONNECT_PIP_PKGS="grpcio==1.67.0 grpcio-status==1.67.0 protobuf==5.29.1 googleapis-common-protos==1.65.0 graphviz==0.20.3" + +# Install Python 3.10 packages +RUN curl -sS https://bootstrap.pypa.io/get-pip.py | python3.10 +RUN python3.10 -m pip install --ignore-installed blinker>=1.6.2 # mlflow needs this +RUN python3.10 -m pip install --ignore-installed 'six==1.16.0' # Avoid `python3-six` installation +RUN python3.10 -m pip install $BASIC_PIP_PKGS unittest-xml-reporting $CONNECT_PIP_PKGS && \ + python3.10 -m pip install torch torchvision --index-url https://download.pytorch.org/whl/cpu && \ + python3.10 -m pip install deepspeed torcheval && \ + 
python3.10 -m pip cache purge diff --git a/dev/spark-test-image/python-311/Dockerfile b/dev/spark-test-image/python-311/Dockerfile new file mode 100644 index 0000000000000..48f1fede03c05 --- /dev/null +++ b/dev/spark-test-image/python-311/Dockerfile @@ -0,0 +1,80 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# Image for building and testing Spark branches. Based on Ubuntu 22.04. 
+# See also in https://hub.docker.com/_/ubuntu +FROM ubuntu:jammy-20240911.1 +LABEL org.opencontainers.image.authors="Apache Spark project " +LABEL org.opencontainers.image.licenses="Apache-2.0" +LABEL org.opencontainers.image.ref.name="Apache Spark Infra Image For PySpark with Python 3.11" +# Overwrite this label to avoid exposing the underlying Ubuntu OS version label +LABEL org.opencontainers.image.version="" + +ENV FULL_REFRESH_DATE=20241212 + +ENV DEBIAN_FRONTEND=noninteractive +ENV DEBCONF_NONINTERACTIVE_SEEN=true + +RUN apt-get update && apt-get install -y \ + build-essential \ + ca-certificates \ + curl \ + gfortran \ + git \ + gnupg \ + libcurl4-openssl-dev \ + libfontconfig1-dev \ + libfreetype6-dev \ + libfribidi-dev \ + libgit2-dev \ + libharfbuzz-dev \ + libjpeg-dev \ + liblapack-dev \ + libopenblas-dev \ + libpng-dev \ + libpython3-dev \ + libssl-dev \ + libtiff5-dev \ + libxml2-dev \ + openjdk-17-jdk-headless \ + pkg-config \ + qpdf \ + tzdata \ + software-properties-common \ + wget \ + zlib1g-dev + +# Install Python 3.11 +RUN add-apt-repository ppa:deadsnakes/ppa +RUN apt-get update && apt-get install -y \ + python3.11 \ + && apt-get autoremove --purge -y \ + && apt-get clean \ + && rm -rf /var/lib/apt/lists/* + + +ARG BASIC_PIP_PKGS="numpy pyarrow>=18.0.0 six==1.16.0 pandas==2.2.3 scipy plotly>=4.8 mlflow>=2.8.1 coverage matplotlib openpyxl memory-profiler>=0.61.0 scikit-learn>=1.3.2" +# Python deps for Spark Connect +ARG CONNECT_PIP_PKGS="grpcio==1.67.0 grpcio-status==1.67.0 protobuf==5.29.1 googleapis-common-protos==1.65.0 graphviz==0.20.3" + +# Install Python 3.11 packages +RUN curl -sS https://bootstrap.pypa.io/get-pip.py | python3.11 +RUN python3.11 -m pip install --ignore-installed blinker>=1.6.2 # mlflow needs this +RUN python3.11 -m pip install $BASIC_PIP_PKGS unittest-xml-reporting $CONNECT_PIP_PKGS && \ + python3.11 -m pip install torch torchvision --index-url https://download.pytorch.org/whl/cpu && \ + python3.11 -m pip install deepspeed 
torcheval && \ + python3.11 -m pip cache purge diff --git a/dev/spark-test-image/python-312/Dockerfile b/dev/spark-test-image/python-312/Dockerfile new file mode 100644 index 0000000000000..090c20742e652 --- /dev/null +++ b/dev/spark-test-image/python-312/Dockerfile @@ -0,0 +1,80 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# Image for building and testing Spark branches. Based on Ubuntu 22.04. 
+# See also in https://hub.docker.com/_/ubuntu +FROM ubuntu:jammy-20240911.1 +LABEL org.opencontainers.image.authors="Apache Spark project " +LABEL org.opencontainers.image.licenses="Apache-2.0" +LABEL org.opencontainers.image.ref.name="Apache Spark Infra Image For PySpark with Python 3.12" +# Overwrite this label to avoid exposing the underlying Ubuntu OS version label +LABEL org.opencontainers.image.version="" + +ENV FULL_REFRESH_DATE=20241206 + +ENV DEBIAN_FRONTEND=noninteractive +ENV DEBCONF_NONINTERACTIVE_SEEN=true + +RUN apt-get update && apt-get install -y \ + build-essential \ + ca-certificates \ + curl \ + gfortran \ + git \ + gnupg \ + libcurl4-openssl-dev \ + libfontconfig1-dev \ + libfreetype6-dev \ + libfribidi-dev \ + libgit2-dev \ + libharfbuzz-dev \ + libjpeg-dev \ + liblapack-dev \ + libopenblas-dev \ + libpng-dev \ + libpython3-dev \ + libssl-dev \ + libtiff5-dev \ + libxml2-dev \ + openjdk-17-jdk-headless \ + pkg-config \ + qpdf \ + tzdata \ + software-properties-common \ + wget \ + zlib1g-dev + +# Install Python 3.12 +RUN add-apt-repository ppa:deadsnakes/ppa +RUN apt-get update && apt-get install -y \ + python3.12 \ + && apt-get autoremove --purge -y \ + && apt-get clean \ + && rm -rf /var/lib/apt/lists/* + + +ARG BASIC_PIP_PKGS="numpy pyarrow>=18.0.0 six==1.16.0 pandas==2.2.3 scipy plotly>=4.8 mlflow>=2.8.1 coverage matplotlib openpyxl memory-profiler>=0.61.0 scikit-learn>=1.3.2" +# Python deps for Spark Connect +ARG CONNECT_PIP_PKGS="grpcio==1.67.0 grpcio-status==1.67.0 protobuf==5.29.1 googleapis-common-protos==1.65.0 graphviz==0.20.3" + +# Install Python 3.12 packages +RUN curl -sS https://bootstrap.pypa.io/get-pip.py | python3.12 +RUN python3.12 -m pip install --ignore-installed blinker>=1.6.2 # mlflow needs this +RUN python3.12 -m pip install $BASIC_PIP_PKGS unittest-xml-reporting $CONNECT_PIP_PKGS lxml && \ + python3.12 -m pip install torch torchvision --index-url https://download.pytorch.org/whl/cpu && \ + python3.12 -m pip install 
torcheval && \ + python3.12 -m pip cache purge diff --git a/dev/spark-test-image/python-313/Dockerfile b/dev/spark-test-image/python-313/Dockerfile new file mode 100644 index 0000000000000..473f3df8fdb7c --- /dev/null +++ b/dev/spark-test-image/python-313/Dockerfile @@ -0,0 +1,79 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# Image for building and testing Spark branches. Based on Ubuntu 22.04. 
+# See also in https://hub.docker.com/_/ubuntu +FROM ubuntu:jammy-20240911.1 +LABEL org.opencontainers.image.authors="Apache Spark project " +LABEL org.opencontainers.image.licenses="Apache-2.0" +LABEL org.opencontainers.image.ref.name="Apache Spark Infra Image For PySpark with Python 3.13" +# Overwrite this label to avoid exposing the underlying Ubuntu OS version label +LABEL org.opencontainers.image.version="" + +ENV FULL_REFRESH_DATE=20241210 + +ENV DEBIAN_FRONTEND=noninteractive +ENV DEBCONF_NONINTERACTIVE_SEEN=true + +RUN apt-get update && apt-get install -y \ + build-essential \ + ca-certificates \ + curl \ + gfortran \ + git \ + gnupg \ + libcurl4-openssl-dev \ + libfontconfig1-dev \ + libfreetype6-dev \ + libfribidi-dev \ + libgit2-dev \ + libharfbuzz-dev \ + libjpeg-dev \ + liblapack-dev \ + libopenblas-dev \ + libpng-dev \ + libpython3-dev \ + libssl-dev \ + libtiff5-dev \ + libxml2-dev \ + openjdk-17-jdk-headless \ + pkg-config \ + qpdf \ + tzdata \ + software-properties-common \ + wget \ + zlib1g-dev + +# Install Python 3.13 +RUN add-apt-repository ppa:deadsnakes/ppa +RUN apt-get update && apt-get install -y \ + python3.13 \ + && apt-get autoremove --purge -y \ + && apt-get clean \ + && rm -rf /var/lib/apt/lists/* + + +ARG BASIC_PIP_PKGS="numpy pyarrow>=18.0.0 six==1.16.0 pandas==2.2.3 scipy plotly>=4.8 mlflow>=2.8.1 coverage matplotlib openpyxl memory-profiler>=0.61.0 scikit-learn>=1.3.2" +ARG CONNECT_PIP_PKGS="grpcio==1.67.0 grpcio-status==1.67.0 protobuf==5.29.1 googleapis-common-protos==1.65.0 graphviz==0.20.3" + + +# Install Python 3.13 packages +RUN curl -sS https://bootstrap.pypa.io/get-pip.py | python3.13 +# TODO(SPARK-49862) Add BASIC_PIP_PKGS and CONNECT_PIP_PKGS to Python 3.13 image when it supports Python 3.13 +RUN python3.13 -m pip install --ignore-installed blinker>=1.6.2 # mlflow needs this +RUN python3.13 -m pip install numpy>=2.1 pyarrow>=18.0.0 six==1.16.0 pandas==2.2.3 scipy coverage matplotlib openpyxl grpcio==1.67.0 
grpcio-status==1.67.0 lxml jinja2 && \ + python3.13 -m pip cache purge diff --git a/dev/spark-test-image/python-minimum/Dockerfile b/dev/spark-test-image/python-minimum/Dockerfile new file mode 100644 index 0000000000000..82e2508ec6e32 --- /dev/null +++ b/dev/spark-test-image/python-minimum/Dockerfile @@ -0,0 +1,81 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# Image for building and testing Spark branches. Based on Ubuntu 22.04. 
+# See also in https://hub.docker.com/_/ubuntu +FROM ubuntu:jammy-20240911.1 +LABEL org.opencontainers.image.authors="Apache Spark project " +LABEL org.opencontainers.image.licenses="Apache-2.0" +LABEL org.opencontainers.image.ref.name="Apache Spark Infra Image For PySpark with old dependencies" +# Overwrite this label to avoid exposing the underlying Ubuntu OS version label +LABEL org.opencontainers.image.version="" + +ENV FULL_REFRESH_DATE=20241223 + +ENV DEBIAN_FRONTEND=noninteractive +ENV DEBCONF_NONINTERACTIVE_SEEN=true + +RUN apt-get update && apt-get install -y \ + build-essential \ + ca-certificates \ + curl \ + gfortran \ + git \ + gnupg \ + libcurl4-openssl-dev \ + libfontconfig1-dev \ + libfreetype6-dev \ + libfribidi-dev \ + libgit2-dev \ + libharfbuzz-dev \ + libjpeg-dev \ + liblapack-dev \ + libopenblas-dev \ + libpng-dev \ + libpython3-dev \ + libssl-dev \ + libtiff5-dev \ + libxml2-dev \ + openjdk-17-jdk-headless \ + pkg-config \ + qpdf \ + tzdata \ + software-properties-common \ + wget \ + zlib1g-dev + + +# Should keep the installation consistent with https://apache.github.io/spark/api/python/getting_started/install.html + +# Install Python 3.9 +RUN add-apt-repository ppa:deadsnakes/ppa +RUN apt-get update && apt-get install -y \ + python3.9 \ + python3.9-distutils \ + && apt-get autoremove --purge -y \ + && apt-get clean \ + && rm -rf /var/lib/apt/lists/* + + +ARG BASIC_PIP_PKGS="numpy==1.21 pyarrow==11.0.0 pandas==2.0.0 six==1.16.0 scipy scikit-learn coverage unittest-xml-reporting" +# Python deps for Spark Connect +ARG CONNECT_PIP_PKGS="grpcio==1.67.0 grpcio-status==1.67.0 googleapis-common-protos==1.65.0 graphviz==0.20 protobuf" + +# Install Python 3.9 packages +RUN curl -sS https://bootstrap.pypa.io/get-pip.py | python3.9 +RUN python3.9 -m pip install --force $BASIC_PIP_PKGS $CONNECT_PIP_PKGS && \ + python3.9 -m pip cache purge diff --git a/dev/spark-test-image/python-ps-minimum/Dockerfile b/dev/spark-test-image/python-ps-minimum/Dockerfile 
new file mode 100644 index 0000000000000..913da06c551ca --- /dev/null +++ b/dev/spark-test-image/python-ps-minimum/Dockerfile @@ -0,0 +1,81 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# Image for building and testing Spark branches. Based on Ubuntu 22.04. 
+# See also in https://hub.docker.com/_/ubuntu +FROM ubuntu:jammy-20240911.1 +LABEL org.opencontainers.image.authors="Apache Spark project " +LABEL org.opencontainers.image.licenses="Apache-2.0" +LABEL org.opencontainers.image.ref.name="Apache Spark Infra Image For Pandas API on Spark with old dependencies" +# Overwrite this label to avoid exposing the underlying Ubuntu OS version label +LABEL org.opencontainers.image.version="" + +ENV FULL_REFRESH_DATE=20250102 + +ENV DEBIAN_FRONTEND=noninteractive +ENV DEBCONF_NONINTERACTIVE_SEEN=true + +RUN apt-get update && apt-get install -y \ + build-essential \ + ca-certificates \ + curl \ + gfortran \ + git \ + gnupg \ + libcurl4-openssl-dev \ + libfontconfig1-dev \ + libfreetype6-dev \ + libfribidi-dev \ + libgit2-dev \ + libharfbuzz-dev \ + libjpeg-dev \ + liblapack-dev \ + libopenblas-dev \ + libpng-dev \ + libpython3-dev \ + libssl-dev \ + libtiff5-dev \ + libxml2-dev \ + openjdk-17-jdk-headless \ + pkg-config \ + qpdf \ + tzdata \ + software-properties-common \ + wget \ + zlib1g-dev + + +# Should keep the installation consistent with https://apache.github.io/spark/api/python/getting_started/install.html + +# Install Python 3.9 +RUN add-apt-repository ppa:deadsnakes/ppa +RUN apt-get update && apt-get install -y \ + python3.9 \ + python3.9-distutils \ + && apt-get autoremove --purge -y \ + && apt-get clean \ + && rm -rf /var/lib/apt/lists/* + + +ARG BASIC_PIP_PKGS="pyarrow==11.0.0 pandas==2.2.0 six==1.16.0 numpy scipy coverage unittest-xml-reporting" +# Python deps for Spark Connect +ARG CONNECT_PIP_PKGS="grpcio==1.67.0 grpcio-status==1.67.0 googleapis-common-protos==1.65.0 graphviz==0.20 protobuf" + +# Install Python 3.9 packages +RUN curl -sS https://bootstrap.pypa.io/get-pip.py | python3.9 +RUN python3.9 -m pip install --force $BASIC_PIP_PKGS $CONNECT_PIP_PKGS && \ + python3.9 -m pip cache purge diff --git a/dev/spark-test-image/sparkr/Dockerfile b/dev/spark-test-image/sparkr/Dockerfile index 
43260c714a550..3312c0852bd77 100644 --- a/dev/spark-test-image/sparkr/Dockerfile +++ b/dev/spark-test-image/sparkr/Dockerfile @@ -24,10 +24,10 @@ LABEL org.opencontainers.image.ref.name="Apache Spark Infra Image for SparkR" # Overwrite this label to avoid exposing the underlying Ubuntu OS version label LABEL org.opencontainers.image.version="" -ENV FULL_REFRESH_DATE 20241114 +ENV FULL_REFRESH_DATE=20241114 -ENV DEBIAN_FRONTEND noninteractive -ENV DEBCONF_NONINTERACTIVE_SEEN true +ENV DEBIAN_FRONTEND=noninteractive +ENV DEBCONF_NONINTERACTIVE_SEEN=true RUN apt-get update && apt-get install -y \ build-essential \ @@ -74,4 +74,4 @@ RUN Rscript -e "install.packages(c('devtools', 'knitr', 'markdown', \ Rscript -e "devtools::install_version('preferably', version='0.4', repos='https://cloud.r-project.org')" # See more in SPARK-39735 -ENV R_LIBS_SITE "/usr/local/lib/R/site-library:${R_LIBS_SITE}:/usr/lib/R/library" +ENV R_LIBS_SITE="/usr/local/lib/R/site-library:${R_LIBS_SITE}:/usr/lib/R/library" diff --git a/dev/sparktestsupport/modules.py b/dev/sparktestsupport/modules.py index b8702113a26c7..f785a72e6a1fe 100644 --- a/dev/sparktestsupport/modules.py +++ b/dev/sparktestsupport/modules.py @@ -309,6 +309,15 @@ def __hash__(self): ], ) +profiler = Module( + name="profiler", + dependencies=[], + build_profile_flags=["-Pjvm-profiler"], + source_file_regexes=[ + "connector/profiler", + ], +) + protobuf = Module( name="protobuf", dependencies=[sql], @@ -502,10 +511,6 @@ def __hash__(self): "pyspark.sql.observation", "pyspark.sql.tvf", # unittests - "pyspark.sql.tests.test_arrow", - "pyspark.sql.tests.test_arrow_cogrouped_map", - "pyspark.sql.tests.test_arrow_grouped_map", - "pyspark.sql.tests.test_arrow_python_udf", "pyspark.sql.tests.test_catalog", "pyspark.sql.tests.test_column", "pyspark.sql.tests.test_conf", @@ -522,20 +527,24 @@ def __hash__(self): "pyspark.sql.tests.test_functions", "pyspark.sql.tests.test_group", "pyspark.sql.tests.test_sql", + 
"pyspark.sql.tests.arrow.test_arrow", + "pyspark.sql.tests.arrow.test_arrow_map", + "pyspark.sql.tests.arrow.test_arrow_cogrouped_map", + "pyspark.sql.tests.arrow.test_arrow_grouped_map", + "pyspark.sql.tests.arrow.test_arrow_python_udf", "pyspark.sql.tests.pandas.test_pandas_cogrouped_map", "pyspark.sql.tests.pandas.test_pandas_grouped_map", "pyspark.sql.tests.pandas.test_pandas_grouped_map_with_state", "pyspark.sql.tests.pandas.test_pandas_map", "pyspark.sql.tests.pandas.test_pandas_transform_with_state", - "pyspark.sql.tests.test_arrow_map", "pyspark.sql.tests.pandas.test_pandas_udf", "pyspark.sql.tests.pandas.test_pandas_udf_grouped_agg", "pyspark.sql.tests.pandas.test_pandas_udf_scalar", "pyspark.sql.tests.pandas.test_pandas_udf_typehints", "pyspark.sql.tests.pandas.test_pandas_udf_typehints_with_future_annotations", "pyspark.sql.tests.pandas.test_pandas_udf_window", + "pyspark.sql.tests.pandas.test_pandas_sqlmetrics", "pyspark.sql.tests.pandas.test_converter", - "pyspark.sql.tests.test_pandas_sqlmetrics", "pyspark.sql.tests.test_python_datasource", "pyspark.sql.tests.test_python_streaming_datasource", "pyspark.sql.tests.test_readwriter", @@ -686,6 +695,7 @@ def __hash__(self): "pyspark.ml.tests.connect.test_legacy_mode_classification", "pyspark.ml.tests.connect.test_legacy_mode_pipeline", "pyspark.ml.tests.connect.test_legacy_mode_tuning", + "pyspark.ml.tests.test_classification", ], excluded_python_implementations=[ "PyPy" # Skip these tests under PyPy since they require numpy and it isn't available there @@ -1029,8 +1039,6 @@ def __hash__(self): "pyspark.sql.tests.connect.test_connect_readwriter", "pyspark.sql.tests.connect.test_connect_session", "pyspark.sql.tests.connect.test_connect_stat", - "pyspark.sql.tests.connect.test_parity_arrow", - "pyspark.sql.tests.connect.test_parity_arrow_python_udf", "pyspark.sql.tests.connect.test_parity_datasources", "pyspark.sql.tests.connect.test_parity_errors", "pyspark.sql.tests.connect.test_parity_catalog", @@ 
-1054,13 +1062,6 @@ def __hash__(self): "pyspark.sql.tests.connect.test_parity_memory_profiler", "pyspark.sql.tests.connect.test_parity_udtf", "pyspark.sql.tests.connect.test_parity_tvf", - "pyspark.sql.tests.connect.test_parity_pandas_udf", - "pyspark.sql.tests.connect.test_parity_pandas_map", - "pyspark.sql.tests.connect.test_parity_arrow_map", - "pyspark.sql.tests.connect.test_parity_pandas_grouped_map", - "pyspark.sql.tests.connect.test_parity_pandas_cogrouped_map", - "pyspark.sql.tests.connect.test_parity_arrow_grouped_map", - "pyspark.sql.tests.connect.test_parity_arrow_cogrouped_map", "pyspark.sql.tests.connect.test_parity_python_datasource", "pyspark.sql.tests.connect.test_parity_python_streaming_datasource", "pyspark.sql.tests.connect.test_parity_frame_plot", @@ -1074,13 +1075,22 @@ def __hash__(self): "pyspark.sql.tests.connect.streaming.test_parity_listener", "pyspark.sql.tests.connect.streaming.test_parity_foreach", "pyspark.sql.tests.connect.streaming.test_parity_foreach_batch", - "pyspark.sql.tests.connect.test_parity_pandas_grouped_map_with_state", - "pyspark.sql.tests.connect.test_parity_pandas_udf_scalar", - "pyspark.sql.tests.connect.test_parity_pandas_udf_grouped_agg", - "pyspark.sql.tests.connect.test_parity_pandas_udf_window", "pyspark.sql.tests.connect.test_resources", "pyspark.sql.tests.connect.shell.test_progress", "pyspark.sql.tests.connect.test_df_debug", + "pyspark.sql.tests.connect.arrow.test_parity_arrow", + "pyspark.sql.tests.connect.arrow.test_parity_arrow_map", + "pyspark.sql.tests.connect.arrow.test_parity_arrow_grouped_map", + "pyspark.sql.tests.connect.arrow.test_parity_arrow_cogrouped_map", + "pyspark.sql.tests.connect.arrow.test_parity_arrow_python_udf", + "pyspark.sql.tests.connect.pandas.test_parity_pandas_map", + "pyspark.sql.tests.connect.pandas.test_parity_pandas_grouped_map", + "pyspark.sql.tests.connect.pandas.test_parity_pandas_grouped_map_with_state", + 
"pyspark.sql.tests.connect.pandas.test_parity_pandas_cogrouped_map", + "pyspark.sql.tests.connect.pandas.test_parity_pandas_udf", + "pyspark.sql.tests.connect.pandas.test_parity_pandas_udf_scalar", + "pyspark.sql.tests.connect.pandas.test_parity_pandas_udf_grouped_agg", + "pyspark.sql.tests.connect.pandas.test_parity_pandas_udf_window", ], excluded_python_implementations=[ "PyPy" # Skip these tests under PyPy since they require numpy, pandas, and pyarrow and @@ -1106,6 +1116,7 @@ def __hash__(self): "pyspark.ml.tests.connect.test_connect_classification", "pyspark.ml.tests.connect.test_connect_pipeline", "pyspark.ml.tests.connect.test_connect_tuning", + "pyspark.ml.tests.connect.test_connect_spark_ml_classification", ], excluded_python_implementations=[ "PyPy" # Skip these tests under PyPy since they require numpy, pandas, and pyarrow and @@ -1438,7 +1449,7 @@ def __hash__(self): ], ) -pyspark_logging = Module( +pyspark_logger = Module( name="pyspark-logger", dependencies=[], source_file_regexes=["python/pyspark/logger"], diff --git a/docs/Gemfile b/docs/Gemfile index 8177425cfb681..68727dee9e1fb 100644 --- a/docs/Gemfile +++ b/docs/Gemfile @@ -24,9 +24,7 @@ source "https://rubygems.org" gem "jekyll", "~> 4.3" gem "jekyll-redirect-from", "~> 0.16" -# Rouge 4.0 drops support for Ruby < 2.7, which is EOL. -# See: https://github.com/rouge-ruby/rouge/blob/61bdda18f204a661413daa93d9624bc65ad219a5/CHANGELOG.md#version-400-2022-09-04 -gem "rouge", "~> 3.26" # This resolves a build issue on Apple Silicon. 
# See: https://issues.apache.org/jira/browse/SPARK-38488 gem "ffi", "~> 1.15" +gem "rexml", "~> 3.3.9" diff --git a/docs/Gemfile.lock b/docs/Gemfile.lock index e137f0f039b97..7709f07a1ceea 100644 --- a/docs/Gemfile.lock +++ b/docs/Gemfile.lock @@ -53,8 +53,8 @@ GEM rb-fsevent (0.11.2) rb-inotify (0.10.1) ffi (~> 1.0) - rexml (3.2.6) - rouge (3.30.0) + rexml (3.3.9) + rouge (4.5.1) safe_yaml (1.0.5) sass-embedded (1.63.6) google-protobuf (~> 3.23) @@ -71,7 +71,7 @@ DEPENDENCIES ffi (~> 1.15) jekyll (~> 4.3) jekyll-redirect-from (~> 0.16) - rouge (~> 3.26) + rexml (~> 3.3.9) BUNDLED WITH 2.4.22 diff --git a/docs/README.md b/docs/README.md index 363f1c2076363..1235efe91812b 100644 --- a/docs/README.md +++ b/docs/README.md @@ -126,3 +126,16 @@ To control what API docs get built, you can set any combination of the following * `SKIP_RDOC=1`: Skip the R API docs. * `SKIP_SQLDOC=1`: Skip the SQL API docs. +## Build docs with docker image (Optional) + +As a Spark developer, you can generate all documents locally as follows: + +Note: Before running it, you need to have `docker` installed. + +```sh +$ dev/spark-test-image-util/docs/build-docs +``` + +It will generate all documents on the `container` and `host`. +Especially when there are conflicts between the libraries required by Python development environment +and the libraries required by generating Python docs environment, this is a good choice. 
diff --git a/docs/_data/menu-sql.yaml b/docs/_data/menu-sql.yaml index 5fc1f3bcf9b5a..b1688aec57f01 100644 --- a/docs/_data/menu-sql.yaml +++ b/docs/_data/menu-sql.yaml @@ -93,6 +93,8 @@ url: sql-ref-functions.html - text: Identifiers url: sql-ref-identifier.html + - text: IDENTIFIER clause + url: sql-ref-identifier-clause.html - text: Literals url: sql-ref-literals.html - text: Null Semantics diff --git a/docs/_plugins/build_api_docs.rb b/docs/_plugins/build_api_docs.rb index 79aad9695a3c7..e2ddcca6cdde5 100644 --- a/docs/_plugins/build_api_docs.rb +++ b/docs/_plugins/build_api_docs.rb @@ -34,6 +34,11 @@ def print_header(text) end def build_spark_if_necessary + # If spark has already been compiled on the host, skip here. + if ENV['SPARK_DOCS_IS_BUILT_ON_HOST'] == '1' + return + end + if $spark_package_is_built return end @@ -116,6 +121,16 @@ def copy_and_update_java_docs(source, dest, scala_source) File.open(css_file, 'a') { |f| f.write("\n" + css.join()) } end +def build_spark_scala_and_java_docs_if_necessary + # If spark's docs has already been compiled on the host, skip here. + if ENV['SPARK_DOCS_IS_BUILT_ON_HOST'] == '1' + return + end + + command = "build/sbt -Pkinesis-asl unidoc" + puts "Running '#{command}'..." + system(command) || raise("Unidoc generation failed") +end def build_scala_and_java_docs build_spark_if_necessary @@ -123,9 +138,7 @@ def build_scala_and_java_docs print_header "Building Scala and Java API docs." cd(SPARK_PROJECT_ROOT) - command = "build/sbt -Pkinesis-asl unidoc" - puts "Running '#{command}'..." - system(command) || raise("Unidoc generation failed") + build_spark_scala_and_java_docs_if_necessary puts "Moving back into docs dir." 
cd("docs") diff --git a/docs/_plugins/include_example.rb b/docs/_plugins/include_example.rb index 7d0e78738095e..6fd14ce31a68c 100644 --- a/docs/_plugins/include_example.rb +++ b/docs/_plugins/include_example.rb @@ -114,8 +114,8 @@ def select_lines(code) range = Range.new(start + 1, endline - 1) trimmed = trim_codeblock(lines[range]) # Filter out possible example tags of overlapped labels. - taggs_filtered = trimmed.select { |l| !l.include? '$example ' } - result += taggs_filtered.join + tags_filtered = trimmed.select { |l| !l.include? '$example ' } + result += tags_filtered.join result += "\n" end result diff --git a/docs/app-dev-spark-connect.md b/docs/app-dev-spark-connect.md new file mode 100644 index 0000000000000..218edd331aa94 --- /dev/null +++ b/docs/app-dev-spark-connect.md @@ -0,0 +1,243 @@ +--- +layout: global +title: Application Development with Spark Connect +license: | + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--- +**Spark Connect Overview** + +In Apache Spark 3.4, Spark Connect introduced a decoupled client-server +architecture that allows remote connectivity to Spark clusters using the +DataFrame API and unresolved logical plans as the protocol. 
The separation +between client and server allows Spark and its open ecosystem to be +leveraged from everywhere. It can be embedded in modern data applications, +in IDEs, Notebooks and programming languages. + +To learn more about Spark Connect, see [Spark Connect Overview](spark-connect-overview.html). + +# Redefining Spark Applications using Spark Connect + +With its decoupled client-server architecture, Spark Connect simplifies how Spark Applications are +developed. +The notion of Spark Client Applications and Spark Server Libraries are introduced as follows: +* _Spark Client Applications_ are regular Spark applications that use Spark and its rich ecosystem for +distributed data processing. Examples include ETL pipelines, data preparation, and model training +and inference. +* _Spark Server Libraries_ build on, extend, and complement Spark's functionality, e.g. +[MLlib](ml-guide.html) (distributed ML libraries that use Spark's powerful distributed processing). Spark Connect +can be extended to expose client-side interfaces for Spark Server Libraries. + +With Spark 3.4 and Spark Connect, the development of Spark Client Applications is simplified, and +clear extension points and guidelines are provided on how to build Spark Server Libraries, making +it easy for both types of applications to evolve alongside Spark. As illustrated in Fig.1, Spark +Client applications connect to Spark using the Spark Connect API, which is essentially the +DataFrame API and fully declarative. + +

+ Extending Spark
+Connect Diagram +

+Spark Server Libraries extend Spark. They typically provide additional server-side logic integrated +with Spark, which is exposed to client applications as part of the Spark Connect API, using Spark +Connect extension points. For example, the _Spark Server Library_ consists of custom +service-side logic (as indicated by the blue box labeled _Custom Library Plugin_), which is exposed +to the client via the blue box as part of the Spark Connect API. The client uses this API, e.g., +alongside PySpark or the Spark Scala client, making it easy for Spark client applications to work +with the custom logic/library. + +## Spark Client Applications + +Spark Client Applications are the _regular Spark applications_ that Spark users develop today, e.g., +ETL pipelines, data preparation, or model training or inference. These are typically built using +Spark's declarative DataFrame and DataSet APIs. With Spark Connect, the core behaviour remains the +same, but there are a few differences: +* Lower-level, non-declarative APIs (RDDs) can no longer be directly used from Spark Client +applications. Alternatives for missing RDD functionality are provided as part of the higher-level +DataFrame API. +* Client applications no longer have direct access to the Spark driver JVM; they are fully +separated from the server. + +Client applications based on Spark Connect can be submitted in the same way as any previous job. +In addition, Spark Client Applications based on Spark Connect have several benefits compared to +classic Spark applications using earlier Spark versions (3.4 and below): +* _Upgradability_: Upgrading to new Spark Server versions is seamless, as the Spark Connect API +abstracts any changes/improvements on the server side. Client- and server APIs are cleanly +separated. +* _Simplicity_: The number of APIs exposed to the user is reduced from 3 to 2. The Spark Connect API +is fully declarative and consequently easy to learn for new users familiar with SQL. 
+* _Stability_: When using Spark Connect, the client applications no longer run on the Spark driver +and, therefore don’t cause and are not affected by any instability on the server. +* _Remote connectivity_: The decoupled architecture allows remote connectivity to Spark beyond SQL +and JDBC: any application can now interactively use Spark “as a service”. +* _Backwards compatibility_: The Spark Connect API is code-compatible with earlier Spark versions, +except for the usage of RDDs, for which a list of alternative APIs is provided in Spark Connect. + +## Spark Server Libraries + +Until Spark 3.4, extensions to Spark (e.g., [Spark ML](ml-guide#:~:text=What%20is%20%E2%80%9CSpark%20ML%E2%80%9D%3F,to%20emphasize%20the%20pipeline%20concept.) +or [Spark-NLP](https://github.com/JohnSnowLabs/spark-nlp)) were built and deployed like Spark +Client Applications. With Spark 3.4 and Spark Connect, explicit extension points are offered to +extend Spark via Spark Server Libraries. These extension points provide functionality that can be +exposed to a client, which differs from existing extension points in Spark such as +[SparkSession extensions](api/java/org/apache/spark/sql/SparkSessionExtensions.html) or +[Spark Plugins](api/java/org/apache/spark/api/plugin/SparkPlugin.html). + +### Getting Started: Extending Spark with Spark Server Libraries + +Spark Connect is available and supports PySpark and Scala +applications. We will walk through how to run an Apache Spark server with Spark +Connect and connect to it from a client application using the Spark Connect client +library. + +A Spark Server Library consists of the following components, illustrated in Fig. 2: + +1. The Spark Connect protocol extension (blue box _Proto_ API) +2. A Spark Connect Plugin. +3. The application logic that extends Spark. +4. The client package that exposes the Spark Server Library application logic to the Spark Client +Application, alongside PySpark or the Scala Spark Client. +

+ Extending Spark
+Connect Diagram - Labelled Steps +

+ +#### (1) Spark Connect Protocol Extension + +To extend Spark with a new Spark Server Library, developers can extend the three main operation +types in the Spark Connect protocol: _Relation_, _Expression_, and _Command_. + +{% highlight protobuf %} +message Relation { + oneof rel_type { + Read read = 1; + // ... + google.protobuf.Any extension = 998; + } +} + +message Expression { + oneof expr_type { + Literal literal = 1; + // ... + google.protobuf.Any extension = 999; + } +} + +message Command { + oneof command_type { + WriteCommand write_command = 1; + // ... + google.protobuf.Any extension = 999; + } +} +{% endhighlight %} +Their extension fields allow serializing arbitrary protobuf messages as part of the Spark Connect +protocol. These messages represent the parameters or state of the extension implementation. +To build a custom expression type, the developer first defines the custom protobuf definition +of the expression. + +{% highlight protobuf %} +message ExamplePluginExpression { + Expression child = 1; + string custom_field = 2; +} +{% endhighlight %} + +#### (2) Spark Connect Plugin implementation with (3) custom application logic + +As a next step, the developer implements the _ExpressionPlugin_ class of Spark Connect with custom +application logic based on the input parameters of the protobuf message. +{% highlight protobuf %} +class ExampleExpressionPlugin extends ExpressionPlugin { + override def transform( + relation: protobuf.Any, + planner: SparkConnectPlanner): Option[Expression] = { + // Check if the serialized value of protobuf.Any matches the type + // of our example expression. 
+ if (!relation.is(classOf[proto.ExamplePluginExpression])) { + return None + } + val exp = relation.unpack(classOf[proto.ExamplePluginExpression]) + Some(Alias(planner.transformExpression( + exp.getChild), exp.getCustomField)(explicitMetadata = None)) + } +} +{% endhighlight %} + +Once the application logic is developed, the code must be packaged as a jar and Spark must be +configured to pick up the additional logic. The relevant Spark configuration options are: +* _spark.jars_ which define the location of the Jar file containing the application logic built for +the custom expression. +* _spark.connect.extensions.expression.classes_ specifying the full class name +of each expression extension loaded by Spark. Based on these configuration options, Spark will +load the values at startup and make them available for processing. + +#### (4) Spark Server Library Client Package + +Once the server component is deployed, any client can use it with the right protobuf messages. +In the example above, the following message payload sent to the Spark Connect endpoint would be +enough to trigger the extension mechanism. +{% highlight json %} +{ + "project": { + "input": { + "sql": { + "query": "select * from samples.nyctaxi.trips" + } + }, + "expressions": [ + { + "extension": { + "typeUrl": "type.googleapis.com/spark.connect.ExamplePluginExpression", + "value": "\n\006\022\004\n\002id\022\006testval" + } + } + ] + } +} +{% endhighlight %} +To make the example available in Python, the application developer provides a Python library that +wraps the new expression and embeds it into PySpark. The easiest way to provide a function for any +expression is to take a PySpark column instance as an argument and return a new Column instance +with the expression applied. 
+ +{% highlight python %} +from pyspark.sql.connect.column import Expression +import pyspark.sql.connect.proto as proto + +from myxample.proto import ExamplePluginExpression + +# Internal class that satisfies the interface by the Python client +# of Spark Connect to generate the protobuf representation from +# an instance of the expression. +class ExampleExpression(Expression): + def to_plan(self, session) -> proto.Expression: + fun = proto.Expression() + plugin = ExamplePluginExpression() + plugin.child.literal.long = 10 + plugin.custom_field = "example" + fun.extension.Pack(plugin) + return fun + +# Defining the function to be used from the consumers. +def example_expression(col: Column) -> Column: + return Column(ExampleExpression()) + + +# Using the expression in the Spark Connect client code. +df = spark.read.table("samples.nyctaxi.trips") +df.select(example_expression(df["fare_amount"])).collect() +{% endhighlight %} \ No newline at end of file diff --git a/docs/configuration.md b/docs/configuration.md index e095ae7a61b22..162165ffe68dd 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -565,8 +565,7 @@ of the most common options to set are:
@@ -576,8 +575,7 @@ of the most common options to set are: @@ -2116,7 +2114,7 @@ Apart from these, the following properties are also available, and may be useful - + @@ -3753,15 +3751,20 @@ Note: When running Spark on YARN in `cluster` mode, environment variables need t # Configuring Logging -Spark uses [log4j](http://logging.apache.org/log4j/) for logging. You can configure it by adding a -`log4j2.properties` file in the `conf` directory. One way to start is to copy the existing templates `log4j2.properties.template` or `log4j2.properties.pattern-layout-template` located there. +Spark uses [log4j](http://logging.apache.org/log4j/) for logging. You can configure it by adding a `log4j2.properties` file in the `conf` directory. To get started, copy one of the provided templates: `log4j2.properties.template` (for plain text logging) or `log4j2-json-layout.properties.template` (for structured logging). + +## Plain Text Logging +The default logging format is plain text, using Log4j's [Pattern Layout](https://logging.apache.org/log4j/2.x/manual/pattern-layout.html). + +MDC (Mapped Diagnostic Context) information is not included by default in plain text logs. To include it, update the `PatternLayout` configuration in the `log4j2.properties` file. For example, add `%X{task_name}` to include the task name in logs. Additionally, use `spark.sparkContext.setLocalProperty("key", "value")` to add custom data to the MDC. ## Structured Logging -Starting from version 4.0.0, `spark-submit` has adopted the [JSON Template Layout](https://logging.apache.org/log4j/2.x/manual/json-template-layout.html) for logging, which outputs logs in JSON format. This format facilitates querying logs using Spark SQL with the JSON data source. Additionally, the logs include all Mapped Diagnostic Context (MDC) information for search and debugging purposes. 
+Starting with version 4.0.0, `spark-submit` supports optional structured logging using the [JSON Template Layout](https://logging.apache.org/log4j/2.x/manual/json-template-layout.html). This format enables efficient querying of logs with Spark SQL using the JSON data source and includes all MDC information for improved searchability and debugging. -To configure the layout of structured logging, start with the `log4j2.properties.template` file. +To enable structured logging and include MDC information, set the configuration `spark.log.structuredLogging.enabled` to `true` (default is `false`). For additional customization, copy `log4j2-json-layout.properties.template` to `conf/log4j2.properties` and adjust as needed. -To query Spark logs using Spark SQL, you can use the following code snippets: +### Querying Structured Logs with Spark SQL +To query structured logs in JSON format, use the following code snippet: **Python:** ```python @@ -3777,14 +3780,6 @@ import org.apache.spark.util.LogUtils.SPARK_LOG_SCHEMA val logDf = spark.read.schema(SPARK_LOG_SCHEMA).json("path/to/logs") ``` **Note**: If you're using the interactive shell (pyspark shell or spark-shell), you can omit the import statement in the code because SPARK_LOG_SCHEMA is already available in the shell's context. -## Plain Text Logging -If you prefer plain text logging, you have two options: -- Disable structured JSON logging by setting the Spark configuration `spark.log.structuredLogging.enabled` to `false`. -- Use a custom log4j configuration file. Rename `conf/log4j2.properties.pattern-layout-template` to `conf/log4j2.properties`. This reverts to the default configuration prior to Spark 4.0, which utilizes [PatternLayout](https://logging.apache.org/log4j/2.x/manual/layouts.html#PatternLayout) for logging all messages in plain text. - -MDC information is not included by default when with plain text logging. In order to print it in the logs, you can update the patternLayout in the file. 
For example, you can add `%X{task_name}` to print the task name in the logs. -Moreover, you can use `spark.sparkContext.setLocalProperty(s"mdc.$name", "value")` to add user specific data into MDC. -The key in MDC will be the string of `mdc.$name`. # Overriding configuration directory diff --git a/docs/core-migration-guide.md b/docs/core-migration-guide.md index 88bad6c5d1b9f..9dcf4ad8a2984 100644 --- a/docs/core-migration-guide.md +++ b/docs/core-migration-guide.md @@ -44,16 +44,18 @@ license: | - Since Spark 4.0, Spark uses the external shuffle service for deleting shuffle blocks for deallocated executors when the shuffle is no longer needed. To restore the legacy behavior, you can set `spark.shuffle.service.removeShuffle` to `false`. -- Starting with Spark 4.0, the default logging format for `spark-submit` has changed from plain text to JSON lines to improve log analysis. If you prefer plain text logs, you have two options: - - Set the Spark configuration `spark.log.structuredLogging.enabled` to `false`. For example, you can use `JDK_JAVA_OPTIONS=-Dspark.log.structuredLogging.enabled=false`. - - Use a custom log4j configuration file, such as renaming the template file `conf/log4j2.properties.pattern-layout-template` to `conf/log4j2.properties`. - - Since Spark 4.0, the MDC (Mapped Diagnostic Context) key for Spark task names in Spark logs has been changed from `mdc.taskName` to `task_name`. To use the key `mdc.taskName`, you can set `spark.log.legacyTaskNameMdc.enabled` to `true`. - Since Spark 4.0, Spark performs speculative executions less aggressively with `spark.speculation.multiplier=3` and `spark.speculation.quantile=0.9`. To restore the legacy behavior, you can set `spark.speculation.multiplier=1.5` and `spark.speculation.quantile=0.75`. - Since Spark 4.0, `spark.shuffle.unsafe.file.output.buffer` is deprecated though still works. Use `spark.shuffle.localDisk.file.output.buffer` instead. 
+- Since Spark 4.0, when reading files hits `org.apache.hadoop.security.AccessControlException` and `org.apache.hadoop.hdfs.BlockMissingException`, the exception will be thrown and fail the task, even if `spark.files.ignoreCorruptFiles` is set to `true`. + +## Upgrading from Core 3.5.3 to 3.5.4 + +- Since Spark 3.5.4, when reading files hits `org.apache.hadoop.security.AccessControlException` and `org.apache.hadoop.hdfs.BlockMissingException`, the exception will be thrown and fail the task, even if `spark.files.ignoreCorruptFiles` is set to `true`. + ## Upgrading from Core 3.4 to 3.5 - Since Spark 3.5, `spark.yarn.executor.failuresValidityInterval` is deprecated. Use `spark.executor.failuresValidityInterval` instead. @@ -62,7 +64,7 @@ license: | ## Upgrading from Core 3.3 to 3.4 -- Since Spark 3.4, Spark driver will own `PersistentVolumnClaim`s and try to reuse if they are not assigned to live executors. To restore the behavior before Spark 3.4, you can set `spark.kubernetes.driver.ownPersistentVolumeClaim` to `false` and `spark.kubernetes.driver.reusePersistentVolumeClaim` to `false`. +- Since Spark 3.4, Spark driver will own `PersistentVolumeClaim`s and try to reuse if they are not assigned to live executors. To restore the behavior before Spark 3.4, you can set `spark.kubernetes.driver.ownPersistentVolumeClaim` to `false` and `spark.kubernetes.driver.reusePersistentVolumeClaim` to `false`. - Since Spark 3.4, Spark driver will track shuffle data when dynamic allocation is enabled without shuffle service. To restore the behavior before Spark 3.4, you can set `spark.dynamicAllocation.shuffleTracking.enabled` to `false`. 
diff --git a/docs/img/extending-spark-connect-labelled.png b/docs/img/extending-spark-connect-labelled.png new file mode 100644 index 0000000000000..94b8cfdc024cb Binary files /dev/null and b/docs/img/extending-spark-connect-labelled.png differ diff --git a/docs/img/extending-spark-connect.png b/docs/img/extending-spark-connect.png new file mode 100644 index 0000000000000..381d99bdda865 Binary files /dev/null and b/docs/img/extending-spark-connect.png differ diff --git a/docs/rdd-programming-guide.md b/docs/rdd-programming-guide.md index a1adcc2f6eb03..400f8a512e7a7 100644 --- a/docs/rdd-programming-guide.md +++ b/docs/rdd-programming-guide.md @@ -39,7 +39,7 @@ along with if you launch Spark's interactive shell -- either `bin/spark-shell` f
-Spark {{site.SPARK_VERSION}} works with Python 3.8+. It can use the standard CPython interpreter, +Spark {{site.SPARK_VERSION}} works with Python 3.9+. It can use the standard CPython interpreter, so C libraries like NumPy can be used. It also works with PyPy 7.3.6+. Spark applications in Python can either be run with the `bin/spark-submit` script which includes Spark at runtime, or by including it in your setup.py as: diff --git a/docs/running-on-kubernetes.md b/docs/running-on-kubernetes.md index a0c73813612d0..c7f5d67a6cd85 100644 --- a/docs/running-on-kubernetes.md +++ b/docs/running-on-kubernetes.md @@ -394,7 +394,7 @@ spark.kubernetes.executor.volumes.persistentVolumeClaim.spark-local-dir-1.mount. spark.kubernetes.executor.volumes.persistentVolumeClaim.spark-local-dir-1.mount.readOnly=false ``` -To enable shuffle data recovery feature via the built-in `KubernetesLocalDiskShuffleDataIO` plugin, we need to have the followings. You may want to enable `spark.kubernetes.driver.waitToReusePersistentVolumeClaim` additionally. +To enable shuffle data recovery feature via the built-in `KubernetesLocalDiskShuffleDataIO` plugin, we need to have the following. You may want to enable `spark.kubernetes.driver.waitToReusePersistentVolumeClaim` additionally. ``` spark.kubernetes.executor.volumes.persistentVolumeClaim.spark-local-dir-1.mount.path=/data/spark-x/executor-x diff --git a/docs/running-on-yarn.md b/docs/running-on-yarn.md index d149f9196b345..465f3a9d075a2 100644 --- a/docs/running-on-yarn.md +++ b/docs/running-on-yarn.md @@ -124,15 +124,15 @@ all environment variables used for launching each container. This process is use classpath problems in particular. (Note that enabling this requires admin privileges on cluster settings and a restart of all node managers. Thus, this is not applicable to hosted clusters). 
-To use a custom log4j configuration for the application master or executors, here are the options: +To use a custom log4j2 configuration for the application master or executors, here are the options: -- upload a custom `log4j.properties` using `spark-submit`, by adding it to the `--files` list of files +- upload a custom `log4j2.properties` using `spark-submit`, by adding it to the `--files` list of files to be uploaded with the application. -- add `-Dlog4j.configuration=` to `spark.driver.extraJavaOptions` +- add `-Dlog4j.configurationFile=` to `spark.driver.extraJavaOptions` (for the driver) or `spark.executor.extraJavaOptions` (for executors). Note that if using a file, the `file:` protocol should be explicitly provided, and the file needs to exist locally on all the nodes. -- update the `$SPARK_CONF_DIR/log4j.properties` file and it will be automatically uploaded along +- update the `$SPARK_CONF_DIR/log4j2.properties` file and it will be automatically uploaded along with the other configurations. Note that other 2 options has higher priority than this option if multiple options are specified. @@ -673,7 +673,7 @@ To use a custom metrics.properties for the application master and executors, upd
@@ -853,7 +853,7 @@ will include a list of all tokens obtained, and their expiry details To start the Spark Shuffle Service on each `NodeManager` in your YARN cluster, follow these instructions: -1. Build Spark with the [YARN profile](building-spark.html). Skip this step if you are using a +1. Build Spark with the [YARN profile](building-spark.html#specifying-the-hadoop-version-and-enabling-yarn). Skip this step if you are using a pre-packaged distribution. 1. Locate the `spark--yarn-shuffle.jar`. This should be under `$SPARK_HOME/common/network-yarn/target/scala-` if you are building Spark yourself, and under diff --git a/docs/security.md b/docs/security.md index c7d3fd5f8c36f..81173d5f01ce7 100644 --- a/docs/security.md +++ b/docs/security.md @@ -72,7 +72,7 @@ secrets to be secure. diff --git a/docs/spark-connect-overview.md b/docs/spark-connect-overview.md index 1cc409bfbc007..723bae9fd9be5 100644 --- a/docs/spark-connect-overview.md +++ b/docs/spark-connect-overview.md @@ -370,6 +370,8 @@ one may implement their own class extending `ClassFinder` for customized search +For more information on application development with Spark Connect as well as extending Spark Connect +with custom functionality, see [Application Development with Spark Connect](app-dev-spark-connect.html). 
# Client application authentication While Spark Connect does not have built-in authentication, it is designed to diff --git a/docs/spark-standalone.md b/docs/spark-standalone.md index 8bc7445d17c71..51f6ca977c991 100644 --- a/docs/spark-standalone.md +++ b/docs/spark-standalone.md @@ -59,10 +59,6 @@ Finally, the following configuration options can be passed to the master and wor - - - - @@ -355,28 +351,28 @@ SPARK_MASTER_OPTS supports the following system properties: - + - + @@ -457,7 +453,7 @@ SPARK_WORKER_OPTS supports the following system properties: @@ -497,8 +493,8 @@ SPARK_WORKER_OPTS supports the following system properties: @@ -508,8 +504,8 @@ SPARK_WORKER_OPTS supports the following system properties: @@ -527,11 +523,11 @@ SPARK_WORKER_OPTS supports the following system properties: - + @@ -615,37 +611,37 @@ via http://[host:port]/[version]/submissions/[action] where action is one of the following supported actions.
Thread IDnumExecutors * 2, with minimum of 3 The maximum number of executor failures before failing the application. - This configuration only takes effect on YARN, or Kubernetes when - spark.kubernetes.allocation.pods.allocator is set to 'direct'. + This configuration only takes effect on YARN and Kubernetes. 3.5.0
Interval after which executor failures will be considered independent and not accumulate towards the attempt count. - This configuration only takes effect on YARN, or Kubernetes when - spark.kubernetes.allocation.pods.allocator is set to 'direct'. + This configuration only takes effect on YARN and Kubernetes. 3.5.0
spark.storage.replication.proactivefalsetrue Enables proactive block replication for RDD blocks. Cached RDD block replicas lost due to executor failures are replenished if there are any existing available replicas. This tries @@ -2852,7 +2850,7 @@ Apart from these, the following properties are also available, and may be useful If set to "true", prevent Spark from scheduling tasks on executors that have been excluded due to too many task failures. The algorithm used to exclude executors and nodes can be further controlled by the other "spark.excludeOnFailure" configuration options. - This config will be overriden by "spark.excludeOnFailure.application.enabled" and + This config will be overridden by "spark.excludeOnFailure.application.enabled" and "spark.excludeOnFailure.taskAndStage.enabled" to specify exclusion enablement on individual levels. false Set to true for applications that have higher security requirements and prefer that their - secret is not saved in the db. The shuffle data of such applications wll not be recovered after + secret is not saved in the db. The shuffle data of such applications will not be recovered after the External Shuffle Service restarts. 3.5.0false Set to true for applications that have higher security requirements and prefer that their - secret is not saved in the db. The shuffle data of such applications wll not be recovered after + secret is not saved in the db. The shuffle data of such applications will not be recovered after the External Shuffle Service restarts. 3.5.0 -h HOST, --host HOST Hostname to listen on
-i HOST, --ip HOSTHostname to listen on (deprecated, use -h or --host)
-p PORT, --port PORT Port for service to listen on (default: 7077 for master, random for worker)spark.deploy.appNumberModulo (None)
- The modulo for app number. By default, the next of `app-yyyyMMddHHmmss-9999` is
- `app-yyyyMMddHHmmss-10000`. If we have 10000 as modulo, it will be `app-yyyyMMddHHmmss-0000`.
- In most cases, the prefix `app-yyyyMMddHHmmss` is increased already during creating 10000 applications.
+ The modulo for app number. By default, the next of app-yyyyMMddHHmmss-9999 is
+ app-yyyyMMddHHmmss-10000. If we have 10000 as modulo, it will be app-yyyyMMddHHmmss-0000.
+ In most cases, the prefix app-yyyyMMddHHmmss is already increased while creating 10000 applications.
4.0.0
spark.deploy.driverIdPatterndriver-%s-%04ddriver-%s-%04d - The pattern for driver ID generation based on Java `String.format` method. - The default value is `driver-%s-%04d` which represents the existing driver id string, e.g., `driver-20231031224459-0019`. Please be careful to generate unique IDs. + The pattern for driver ID generation based on Java String.format method. + The default value is driver-%s-%04d which represents the existing driver id string, e.g., driver-20231031224459-0019. Please be careful to generate unique IDs. 4.0.0
spark.deploy.appIdPatternapp-%s-%04dapp-%s-%04d - The pattern for app ID generation based on Java `String.format` method. - The default value is `app-%s-%04d` which represents the existing app id string, e.g., - `app-20231031224509-0008`. Plesae be careful to generate unique IDs. + The pattern for app ID generation based on Java String.format method. + The default value is app-%s-%04d which represents the existing app id string, e.g., + app-20231031224509-0008. Please be careful to generate unique IDs. 4.0.0
Enable periodic cleanup of worker / application directories. Note that this only affects standalone mode, as YARN works differently. Only the directories of stopped applications are cleaned up. - This should be enabled if spark.shuffle.service.db.enabled is "true" + This should be enabled if spark.shuffle.service.db.enabled is "true" 1.0.0
ROCKSDB When spark.shuffle.service.db.enabled is true, user can use this to specify the kind of disk-based
- store used in shuffle service state store. This supports `ROCKSDB` and `LEVELDB` (deprecated) now and `ROCKSDB` as default value.
- The original data store in `RocksDB/LevelDB` will not be automatically convert to another kind of storage now.
+ store used in shuffle service state store. This supports ROCKSDB and LEVELDB (deprecated) now and ROCKSDB as default value.
+ The original data store in RocksDB/LevelDB will not be automatically converted to another kind of storage now.
3.4.0
Enable cleanup non-shuffle files(such as temp. shuffle blocks, cached RDD/broadcast blocks, spill files, etc) of worker directories following executor exits. Note that this doesn't - overlap with `spark.worker.cleanup.enabled`, as this enables cleanup of non-shuffle files in - local directories of a dead executor, while `spark.worker.cleanup.enabled` enables cleanup of + overlap with spark.worker.cleanup.enabled, as this enables cleanup of non-shuffle files in + local directories of a dead executor, while spark.worker.cleanup.enabled enables cleanup of all files/subdirectories of a stopped and timeout application. This only affects Standalone mode, support of other cluster managers can be added in the future.
spark.worker.idPatternworker-%s-%s-%dworker-%s-%s-%d - The pattern for worker ID generation based on Java `String.format` method. - The default value is `worker-%s-%s-%d` which represents the existing worker id string, e.g., - `worker-20231109183042-[fe80::1%lo0]-39729`. Please be careful to generate unique IDs + The pattern for worker ID generation based on Java String.format method. + The default value is worker-%s-%s-%d which represents the existing worker id string, e.g., + worker-20231109183042-[fe80::1%lo0]-39729. Please be careful to generate unique IDs 4.0.0
- + + - - + - + - + - +
CommandDescriptionHTTP METHODSince Version
CommandHTTP METHODDescriptionSince Version
createPOST Create a Spark driver via cluster mode. Since 4.0.0, Spark master supports server-side variable replacements for the values of Spark properties and environment variables. POST 1.3.0
killKill a single Spark driver. POSTKill a single Spark driver. 1.3.0
killallKill all running Spark drivers. POSTKill all running Spark drivers. 4.0.0
statusCheck the status of a Spark job. GETCheck the status of a Spark job. 1.3.0
clearClear the completed drivers and applications. POSTClear the completed drivers and applications. 4.0.0
@@ -868,13 +864,13 @@ In order to enable this recovery mode, you can set SPARK_DAEMON_JAVA_OPTS in spa spark.deploy.zookeeper.url None - When `spark.deploy.recoveryMode` is set to ZOOKEEPER, this configuration is used to set the zookeeper URL to connect to. + When spark.deploy.recoveryMode is set to ZOOKEEPER, this configuration is used to set the zookeeper URL to connect to. 0.8.1 spark.deploy.zookeeper.dir None - When `spark.deploy.recoveryMode` is set to ZOOKEEPER, this configuration is used to set the zookeeper directory to store recovery state. + When spark.deploy.recoveryMode is set to ZOOKEEPER, this configuration is used to set the zookeeper directory to store recovery state. 0.8.1 diff --git a/docs/sql-data-sources-csv.md b/docs/sql-data-sources-csv.md index 97a7065e0598f..8008bc562082c 100644 --- a/docs/sql-data-sources-csv.md +++ b/docs/sql-data-sources-csv.md @@ -60,6 +60,12 @@ Data source options of CSV can be set via: Sets a separator for each field and value. This separator can be one or more characters. read/write + + extension + csv + Sets the file extension for the output files. Limited to letters. Length must equal 3. + write + encoding
charset UTF-8 diff --git a/docs/sql-data-sources-xml.md index 6168f570a81a9..949e6239e52ba 100644 --- a/docs/sql-data-sources-xml.md +++ b/docs/sql-data-sources-xml.md @@ -19,7 +19,7 @@ license: | limitations under the License. --- -Spark SQL provides `spark.read().xml("file_1_path","file_2_path")` to read a file or directory of files in XML format into a Spark DataFrame, and `dataframe.write().xml("path")` to write to a xml file. When reading a XML file, the `rowTag` option must be specified to indicate the XML element that maps to a `DataFrame row`. The option() function can be used to customize the behavior of reading or writing, such as controlling behavior of the XML attributes, XSD validation, compression, and so on. +Spark SQL provides `spark.read().xml("file_1_path","file_2_path")` to read a file or directory of files in XML format into a Spark DataFrame, and `dataframe.write().xml("path")` to write to an XML file. The `rowTag` option must be specified to indicate the XML element that maps to a `DataFrame row`. The option() function can be used to customize the behavior of reading or writing, such as controlling behavior of the XML attributes, XSD validation, compression, and so on.
@@ -61,7 +61,7 @@ Data source options of XML can be set via: <books><book></book>...</books> the appropriate value would be book. This is a required option for both read and write. - read + read/write diff --git a/docs/sql-error-conditions-codec-not-available-error-class.md b/docs/sql-error-conditions-codec-not-available-error-class.md deleted file mode 100644 index bb93f56206ba1..0000000000000 --- a/docs/sql-error-conditions-codec-not-available-error-class.md +++ /dev/null @@ -1,41 +0,0 @@ ---- -layout: global -title: CODEC_NOT_AVAILABLE error class -displayTitle: CODEC_NOT_AVAILABLE error class -license: | - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. ---- - - - -SQLSTATE: 56038 - -The codec `` is not available. - -This error class has the following derived error classes: - -## WITH_AVAILABLE_CODECS_SUGGESTION - -Available codecs are ``. - -## WITH_CONF_SUGGESTION - -Consider to set the config `` to ``. 
- - diff --git a/docs/sql-error-conditions-collation-mismatch-error-class.md b/docs/sql-error-conditions-collation-mismatch-error-class.md deleted file mode 100644 index 79aaaf00ee47c..0000000000000 --- a/docs/sql-error-conditions-collation-mismatch-error-class.md +++ /dev/null @@ -1,41 +0,0 @@ ---- -layout: global -title: COLLATION_MISMATCH error class -displayTitle: COLLATION_MISMATCH error class -license: | - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. ---- - - - -[SQLSTATE: 42P21](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -Could not determine which collation to use for string functions and operators. - -This error class has the following derived error classes: - -## EXPLICIT - -Error occurred due to the mismatch between explicit collations: ``. Decide on a single explicit collation and remove others. - -## IMPLICIT - -Error occurred due to the mismatch between implicit collations: ``. Use COLLATE function to set the collation explicitly. 
- - diff --git a/docs/sql-error-conditions-failed-read-file-error-class.md b/docs/sql-error-conditions-failed-read-file-error-class.md deleted file mode 100644 index a4344666c59c6..0000000000000 --- a/docs/sql-error-conditions-failed-read-file-error-class.md +++ /dev/null @@ -1,52 +0,0 @@ ---- -layout: global -title: FAILED_READ_FILE error class -displayTitle: FAILED_READ_FILE error class -license: | - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. ---- - - - -SQLSTATE: KD001 - -Encountered error while reading file ``. - -This error class has the following derived error classes: - -## CANNOT_READ_FILE_FOOTER - -Could not read footer. Please ensure that the file is in either ORC or Parquet format. -If not, please convert it to a valid format. If the file is in the valid format, please check if it is corrupt. -If it is, you can choose to either ignore it or fix the corruption. - -## FILE_NOT_EXIST - -File does not exist. It is possible the underlying files have been updated. -You can explicitly invalidate the cache in Spark by running 'REFRESH TABLE tableName' command in SQL or by recreating the Dataset/DataFrame involved. - -## NO_HINT - - - -## PARQUET_COLUMN_DATA_TYPE_MISMATCH - -Data type mismatches when reading Parquet column ``. 
Expected Spark type ``, actual Parquet type ``. - - diff --git a/docs/sql-error-conditions-illegal-state-store-value-error-class.md b/docs/sql-error-conditions-illegal-state-store-value-error-class.md deleted file mode 100644 index e6457e58b7b4d..0000000000000 --- a/docs/sql-error-conditions-illegal-state-store-value-error-class.md +++ /dev/null @@ -1,41 +0,0 @@ ---- -layout: global -title: ILLEGAL_STATE_STORE_VALUE error class -displayTitle: ILLEGAL_STATE_STORE_VALUE error class -license: | - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. ---- - - - -[SQLSTATE: 42601](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -Illegal value provided to the State Store - -This error class has the following derived error classes: - -## EMPTY_LIST_VALUE - -Cannot write empty list values to State Store for StateName ``. - -## NULL_VALUE - -Cannot write null values to State Store for StateName ``. 
- - diff --git a/docs/sql-error-conditions-invalid-aggregate-filter-error-class.md b/docs/sql-error-conditions-invalid-aggregate-filter-error-class.md deleted file mode 100644 index 8a3441ca133d4..0000000000000 --- a/docs/sql-error-conditions-invalid-aggregate-filter-error-class.md +++ /dev/null @@ -1,49 +0,0 @@ ---- -layout: global -title: INVALID_AGGREGATE_FILTER error class -displayTitle: INVALID_AGGREGATE_FILTER error class -license: | - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. ---- - - - -[SQLSTATE: 42903](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -The FILTER expression `` in an aggregate function is invalid. - -This error class has the following derived error classes: - -## CONTAINS_AGGREGATE - -Expected a FILTER expression without an aggregation, but found ``. - -## CONTAINS_WINDOW_FUNCTION - -Expected a FILTER expression without a window function, but found ``. - -## NON_DETERMINISTIC - -Expected a deterministic FILTER expression. - -## NOT_BOOLEAN - -Expected a FILTER expression of the BOOLEAN type. 
- - diff --git a/docs/sql-error-conditions-invalid-conf-value-error-class.md b/docs/sql-error-conditions-invalid-conf-value-error-class.md deleted file mode 100644 index ac430956340f8..0000000000000 --- a/docs/sql-error-conditions-invalid-conf-value-error-class.md +++ /dev/null @@ -1,41 +0,0 @@ ---- -layout: global -title: INVALID_CONF_VALUE error class -displayTitle: INVALID_CONF_VALUE error class -license: | - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. ---- - - - -[SQLSTATE: 22022](sql-error-conditions-sqlstates.html#class-22-data-exception) - -The value '``' in the config "``" is invalid. - -This error class has the following derived error classes: - -## DEFAULT_COLLATION - -Cannot resolve the given default collation. Did you mean '``'? - -## TIME_ZONE - -Cannot resolve the given timezone. 
- - diff --git a/docs/sql-error-conditions-invalid-datetime-pattern-error-class.md b/docs/sql-error-conditions-invalid-datetime-pattern-error-class.md deleted file mode 100644 index 10e9fc97027c0..0000000000000 --- a/docs/sql-error-conditions-invalid-datetime-pattern-error-class.md +++ /dev/null @@ -1,41 +0,0 @@ ---- -layout: global -title: INVALID_DATETIME_PATTERN error class -displayTitle: INVALID_DATETIME_PATTERN error class -license: | - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. ---- - - - -[SQLSTATE: 22007](sql-error-conditions-sqlstates.html#class-22-data-exception) - -Unrecognized datetime pattern: ``. - -This error class has the following derived error classes: - -## ILLEGAL_CHARACTER - -Illegal pattern character found in datetime pattern: ``. Please provide legal character. - -## LENGTH - -Too many letters in datetime pattern: ``. Please reduce pattern length. 
- - diff --git a/docs/sql-error-conditions-invalid-delimiter-value-error-class.md b/docs/sql-error-conditions-invalid-delimiter-value-error-class.md deleted file mode 100644 index 815fe78bce945..0000000000000 --- a/docs/sql-error-conditions-invalid-delimiter-value-error-class.md +++ /dev/null @@ -1,49 +0,0 @@ ---- -layout: global -title: INVALID_DELIMITER_VALUE error class -displayTitle: INVALID_DELIMITER_VALUE error class -license: | - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. ---- - - - -[SQLSTATE: 42602](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -Invalid value for delimiter. - -This error class has the following derived error classes: - -## DELIMITER_LONGER_THAN_EXPECTED - -Delimiter cannot be more than one character: ``. - -## EMPTY_STRING - -Delimiter cannot be empty string. - -## SINGLE_BACKSLASH - -Single backslash is prohibited. It has special meaning as beginning of an escape sequence. To get the backslash character, pass a string with two backslashes as the delimiter. - -## UNSUPPORTED_SPECIAL_CHARACTER - -Unsupported special character for delimiter: ``. 
- - diff --git a/docs/sql-error-conditions-invalid-interval-format-error-class.md b/docs/sql-error-conditions-invalid-interval-format-error-class.md deleted file mode 100644 index 28cccd5e12887..0000000000000 --- a/docs/sql-error-conditions-invalid-interval-format-error-class.md +++ /dev/null @@ -1,81 +0,0 @@ ---- -layout: global -title: INVALID_INTERVAL_FORMAT error class -displayTitle: INVALID_INTERVAL_FORMAT error class -license: | - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. ---- - - - -[SQLSTATE: 22006](sql-error-conditions-sqlstates.html#class-22-data-exception) - -Error parsing '``' to interval. Please ensure that the value provided is in a valid format for defining an interval. You can reference the documentation for the correct format. - -This error class has the following derived error classes: - -## ARITHMETIC_EXCEPTION - -Uncaught arithmetic exception while parsing '``'. - -## INPUT_IS_EMPTY - -Interval string cannot be empty. - -## INPUT_IS_NULL - -Interval string cannot be null. - -## INVALID_FRACTION - -`` cannot have fractional part. - -## INVALID_PRECISION - -Interval can only support nanosecond precision, `` is out of range. - -## INVALID_PREFIX - -Invalid interval prefix ``. - -## INVALID_UNIT - -Invalid unit ``. 
- -## INVALID_VALUE - -Invalid value ``. - -## MISSING_NUMBER - -Expect a number after `` but hit EOL. - -## MISSING_UNIT - -Expect a unit name after `` but hit EOL. - -## UNKNOWN_PARSING_ERROR - -Unknown error when parsing ``. - -## UNRECOGNIZED_NUMBER - -Unrecognized number ``. - - diff --git a/docs/sql-error-conditions-numeric-value-out-of-range-error-class.md b/docs/sql-error-conditions-numeric-value-out-of-range-error-class.md deleted file mode 100644 index 690bbeec07473..0000000000000 --- a/docs/sql-error-conditions-numeric-value-out-of-range-error-class.md +++ /dev/null @@ -1,41 +0,0 @@ ---- -layout: global -title: NUMERIC_VALUE_OUT_OF_RANGE error class -displayTitle: NUMERIC_VALUE_OUT_OF_RANGE error class -license: | - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. ---- - - - -[SQLSTATE: 22003](sql-error-conditions-sqlstates.html#class-22-data-exception) - - - -This error class has the following derived error classes: - -## WITHOUT_SUGGESTION - -The `` rounded half up from `` cannot be represented as Decimal(``, ``). - -## WITH_SUGGESTION - -`` cannot be represented as Decimal(``, ``). If necessary set `` to "false" to bypass this error, and return NULL instead. 
- - diff --git a/docs/sql-error-conditions-syntax-discontinued-error-class.md b/docs/sql-error-conditions-syntax-discontinued-error-class.md deleted file mode 100644 index 966e11004364e..0000000000000 --- a/docs/sql-error-conditions-syntax-discontinued-error-class.md +++ /dev/null @@ -1,39 +0,0 @@ ---- -layout: global -title: SYNTAX_DISCONTINUED error class -displayTitle: SYNTAX_DISCONTINUED error class -license: | - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. ---- - - - -[SQLSTATE: 42601](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -Support of the clause or keyword: `` has been discontinued in this context. - -This error class has the following derived error classes: - -## BANG_EQUALS_NOT - -The '!' keyword is supported as a prefix operator in a logical operation only. -Use the 'NOT' keyword instead for clauses such as `NOT LIKE`, `NOT IN`, `NOT BETWEEN`, etc. -To re-enable the '!' keyword, set "spark.sql.legacy.bangEqualsNot" to "true". 
- - diff --git a/docs/sql-error-conditions-unsupported-call-error-class.md b/docs/sql-error-conditions-unsupported-call-error-class.md deleted file mode 100644 index 38c7859e88fe6..0000000000000 --- a/docs/sql-error-conditions-unsupported-call-error-class.md +++ /dev/null @@ -1,41 +0,0 @@ ---- -layout: global -title: UNSUPPORTED_CALL error class -displayTitle: UNSUPPORTED_CALL error class -license: | - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. ---- - - - -[SQLSTATE: 0A000](sql-error-conditions-sqlstates.html#class-0A-feature-not-supported) - -Cannot call the method "``" of the class "``". - -This error class has the following derived error classes: - -## FIELD_INDEX - -The row shall have a schema to get an index of the field ``. 
- -## WITHOUT_SUGGESTION - - - - diff --git a/docs/sql-error-conditions-unsupported-collation-error-class.md b/docs/sql-error-conditions-unsupported-collation-error-class.md deleted file mode 100644 index ae410a30317a1..0000000000000 --- a/docs/sql-error-conditions-unsupported-collation-error-class.md +++ /dev/null @@ -1,37 +0,0 @@ ---- -layout: global -title: UNSUPPORTED_COLLATION error class -displayTitle: UNSUPPORTED_COLLATION error class -license: | - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. ---- - - - -[SQLSTATE: 0A000](sql-error-conditions-sqlstates.html#class-0A-feature-not-supported) - -Collation `` is not supported for: - -This error class has the following derived error classes: - -## FOR_FUNCTION - -function ``. Please try to use a different collation. - - diff --git a/docs/sql-migration-guide.md b/docs/sql-migration-guide.md index ea4dbe926d146..254c54a414a7e 100644 --- a/docs/sql-migration-guide.md +++ b/docs/sql-migration-guide.md @@ -29,10 +29,12 @@ license: | - Since Spark 4.0, the default behaviour when inserting elements in a map is changed to first normalize keys -0.0 to 0.0. The affected SQL functions are `create_map`, `map_from_arrays`, `map_from_entries`, and `map_concat`. 
To restore the previous behaviour, set `spark.sql.legacy.disableMapKeyNormalization` to `true`. - Since Spark 4.0, the default value of `spark.sql.maxSinglePartitionBytes` is changed from `Long.MaxValue` to `128m`. To restore the previous behavior, set `spark.sql.maxSinglePartitionBytes` to `9223372036854775807`(`Long.MaxValue`). - Since Spark 4.0, any read of SQL tables takes into consideration the SQL configs `spark.sql.files.ignoreCorruptFiles`/`spark.sql.files.ignoreMissingFiles` instead of the core config `spark.files.ignoreCorruptFiles`/`spark.files.ignoreMissingFiles`. +- Since Spark 4.0, when reading SQL tables hits `org.apache.hadoop.security.AccessControlException` and `org.apache.hadoop.hdfs.BlockMissingException`, the exception will be thrown and fail the task, even if `spark.sql.files.ignoreCorruptFiles` is set to `true`. - Since Spark 4.0, `spark.sql.hive.metastore` drops the support of Hive prior to 2.0.0 as they require JDK 8 that Spark does not support anymore. Users should migrate to higher versions. - Since Spark 4.0, `spark.sql.parquet.compression.codec` drops the support of codec name `lz4raw`, please use `lz4_raw` instead. - Since Spark 4.0, when overflowing during casting timestamp to byte/short/int under non-ansi mode, Spark will return null instead a wrapping value. - Since Spark 4.0, the `encode()` and `decode()` functions support only the following charsets 'US-ASCII', 'ISO-8859-1', 'UTF-8', 'UTF-16BE', 'UTF-16LE', 'UTF-16', 'UTF-32'. To restore the previous behavior when the function accepts charsets of the current JDK used by Spark, set `spark.sql.legacy.javaCharsets` to `true`. +- Since Spark 4.0, the `encode()` and `decode()` functions raise `MALFORMED_CHARACTER_CODING` error when handling unmappable characters, while in Spark 3.5 and earlier versions, these characters will be replaced with mojibakes. To restore the previous behavior, set `spark.sql.legacy.codingErrorAction` to `true`. 
For example, if you try to `decode` a string value `tést` / [116, -23, 115, 116] (encoded in latin1) with 'UTF-8', you get `t�st`. - Since Spark 4.0, the legacy datetime rebasing SQL configs with the prefix `spark.sql.legacy` are removed. To restore the previous behavior, use the following configs: - `spark.sql.parquet.int96RebaseModeInWrite` instead of `spark.sql.legacy.parquet.int96RebaseModeInWrite` - `spark.sql.parquet.datetimeRebaseModeInWrite` instead of `spark.sql.legacy.parquet.datetimeRebaseModeInWrite` @@ -62,6 +64,10 @@ license: | - Since Spark 4.0, The Storage-Partitioned Join feature flag `spark.sql.sources.v2.bucketing.pushPartValues.enabled` is set to `true`. To restore the previous behavior, set `spark.sql.sources.v2.bucketing.pushPartValues.enabled` to `false`. - Since Spark 4.0, the `sentences` function uses `Locale(language)` instead of `Locale.US` when `language` parameter is not `NULL` and `country` parameter is `NULL`. +## Upgrading from Spark SQL 3.5.3 to 3.5.4 + +- Since Spark 3.5.4, when reading SQL tables hits `org.apache.hadoop.security.AccessControlException` and `org.apache.hadoop.hdfs.BlockMissingException`, the exception will be thrown and fail the task, even if `spark.sql.files.ignoreCorruptFiles` is set to `true`. + ## Upgrading from Spark SQL 3.5.1 to 3.5.2 - Since 3.5.2, MySQL JDBC datasource will read TINYINT UNSIGNED as ShortType, while in 3.5.1, it was wrongly read as ByteType. diff --git a/docs/sql-pipe-syntax.md b/docs/sql-pipe-syntax.md index 2c7db9f456475..3d757db966239 100644 --- a/docs/sql-pipe-syntax.md +++ b/docs/sql-pipe-syntax.md @@ -179,7 +179,7 @@ Returns all the output rows from the source table unmodified. For example: ```sql -CREATE TABLE t(a INT, b INT) AS VALUES (1, 2), (3, 4); +CREATE TABLE t AS VALUES (1, 2), (3, 4) AS t(a, b); TABLE t; +---+---+ @@ -198,16 +198,26 @@ TABLE t; Evaluates the provided expressions over each of the rows of the input table. 
+In general, this operator is not always required with SQL pipe syntax. It is possible to use it at +or near the end of a query to evaluate expressions or specify a list of output columns. + +Since the final query result always comprises the columns returned from the last pipe operator, +when this `SELECT` operator does not appear, the output includes all columns from the full row. +This behavior is similar to `SELECT *` in standard SQL syntax. + It is possible to use `DISTINCT` and `*` as needed.
This works like the outermost `SELECT` in a table subquery in regular Spark SQL. Window functions are supported in the `SELECT` list as well. To use them, the `OVER` clause must be provided. You may provide the window specification in the `WINDOW` clause. +Aggregate functions are not supported in this operator. To perform aggregation, use the `AGGREGATE` +operator instead. + For example: ```sql -CREATE TABLE t(col INT) AS VALUES (0), (1); +CREATE TABLE t AS VALUES (0), (1) AS t(col); FROM t |> SELECT col * 2 AS result; @@ -226,7 +236,12 @@ FROM t |> EXTEND [[AS] alias], ... ``` -Appends new columns to the input table by evaluating the specified expressions over each of the input rows. +Appends new columns to the input table by evaluating the specified expressions over each of the +input rows. + +After an `EXTEND` operation, top-level column names are updated but table aliases still refer to the +original row values (such as an inner join between two tables `lhs` and `rhs` with a subsequent +`EXTEND` and then `SELECT lhs.col, rhs.col`). For example: @@ -248,7 +263,17 @@ VALUES (0), (1) tab(col) |> SET = , ... ``` -Updates columns of the input table by replacing them with the result of evaluating the provided expressions. +Updates columns of the input table by replacing them with the result of evaluating the provided +expressions. Each such column reference must appear in the input table exactly once. + +This is similar to `SELECT * EXCEPT (column), AS column` in regular Spark SQL. + +It is possible to perform multiple assignments in a single `SET` clause. Each assignment may refer +to the result of previous assignments. + +After an assignment, top-level column names are updated but table aliases still refer to the +original row values (such as an inner join between two tables `lhs` and `rhs` with a subsequent +`SET` and then `SELECT lhs.col, rhs.col`). 
For example: @@ -256,6 +281,16 @@ For example: VALUES (0), (1) tab(col) |> SET col = col * 2; ++---+ +|col| ++---+ +| 0| +| 2| ++---+ + +VALUES (0), (1) tab(col) +|> SET col = col * 2; + +---+ |col| +---+ @@ -270,7 +305,14 @@ VALUES (0), (1) tab(col) |> DROP , ... ``` -Drops columns of the input table by name. +Drops columns of the input table by name. Each such column reference must appear in the input table +exactly once. + +This is similar to `SELECT * EXCEPT (column)` in regular Spark SQL. + +After a `DROP` operation, top-level column names are updated but table aliases still refer to the +original row values (such as an inner join between two tables `lhs` and `rhs` with a subsequent +`DROP` and then `SELECT lhs.col, rhs.col`). For example: @@ -293,18 +335,25 @@ VALUES (0, 1) tab(col1, col2) Retains the same rows and column names of the input table but with a new table alias. +This operator is useful for introducing a new alias for the input table, which can then be referred +to in subsequent operators. Any existing alias for the table is replaced by the new alias. + +It is useful to use this operator after adding new columns with `SELECT` or `EXTEND` or after +performing aggregation with `AGGREGATE`. This simplifies the process of referring to the columns +from subsequent `JOIN` operators and allows for more readable queries. + For example: ```sql VALUES (0, 1) tab(col1, col2) -|> AS new_tab; -|> SELECT * FROM new_tab; +|> AS new_tab +|> SELECT col1 + col2 FROM new_tab; -+----+----+ -|col1|col2| -+----+----+ -| 0| 1| -+----+----+ ++-----------+ +|col1 + col2| ++-----------+ +| 1| ++-----------+ ``` #### WHERE @@ -357,22 +406,48 @@ VALUES (0), (0) tab(col) #### AGGREGATE ```sql +-- Full-table aggregation |> AGGREGATE [[AS] alias], ... -``` - -Performs full-table aggregation, returning one result row with a column for each aggregate expression. -```sql +-- Aggregation with grouping |> AGGREGATE [ [[AS] alias], ...] GROUP BY [AS alias], ... 
``` -Performs aggregation with grouping, returning one row per group. The column list includes the -grouping columns first and then the aggregate columns afterward. Aliases can be assigned directly -on grouping expressions. +Performs aggregation across grouped rows or across the entire input table. + +If no `GROUP BY` clause is present, this performs full-table aggregation, returning one result row +with a column for each aggregate expression. Otherwise, this performs aggregation with grouping, +returning one row per group. Aliases can be assigned directly on grouping expressions. + +The output column list of this operator includes the grouping columns first (if any), and then the +aggregate columns afterward. + +Each `` expression can include standard aggregate function(s) like `COUNT`, `SUM`, `AVG`, +`MIN`, or any other aggregate function(s) that Spark SQL supports. Additional expressions may appear +below or above the aggregate function(s), such as `MIN(FLOOR(col)) + 1`. Each `` +expression must contain at least one aggregate function (or otherwise the query returns an error). +Each `` expression may include a column alias with `AS `, and may also +include a `DISTINCT` keyword to remove duplicate values before applying the aggregate function (for +example, `COUNT(DISTINCT col)`). + +If present, the `GROUP BY` clause can include any number of grouping expressions, and each +`` expression will evaluate over each unique combination of values of the grouping +expressions. The output table contains the evaluated grouping expressions followed by the evaluated +aggregate functions. The `GROUP BY` expressions may include one-based ordinals. Unlike regular SQL +in which such ordinals refer to the expressions in the accompanying `SELECT` clause, in SQL pipe +syntax, they refer to the columns of the relation produced by the preceding operator instead. For +example, in `TABLE t |> AGGREGATE COUNT(*) GROUP BY 2`, we refer to the second column of the input +table `t`. 
+ +There is no need to repeat entire expressions between `GROUP BY` and `SELECT`, since the `AGGREGATE` +operator automatically includes the evaluated grouping expressions in its output. By the same token, +after an `AGGREGATE` operator, it is often unnecessary to issue a following `SELECT` operator, since +`AGGREGATE` returns both the grouping columns and the aggregate columns in a single step. For example: ```sql +-- Full-table aggregation VALUES (0), (1) tab(col) |> AGGREGATE COUNT(col) AS count; @@ -382,6 +457,7 @@ VALUES (0), (1) tab(col) | 2| +-----+ +-- Aggregation with grouping VALUES (0, 1), (0, 2) tab(col1, col2) |> AGGREGATE COUNT(col2) AS count GROUP BY col1; @@ -398,19 +474,45 @@ VALUES (0, 1), (0, 2) tab(col1, col2) |> [LEFT | RIGHT | FULL | CROSS | SEMI | ANTI | NATURAL | LATERAL] JOIN [ON | USING(col, ...)] ``` -Joins rows from both inputs, returning a filtered cross-product of the pipe input table and the table expression following the JOIN keyword. +Joins rows from both inputs, returning a filtered cross-product of the pipe input table and the +table expression following the JOIN keyword. This behaves in a similar manner to the `JOIN` clause +in regular SQL where the pipe operator input table becomes the left side of the join and the table +argument becomes the right side of the join. + +Standard join modifiers like `LEFT`, `RIGHT`, and `FULL` are supported before the `JOIN` keyword. + +The join predicate may need to refer to columns from both inputs to the join. In this case, it may +be necessary to use table aliases to differentiate between columns in the event that both inputs +have columns with the same names. The `AS` operator can be useful here to introduce a new alias for +the pipe input table that becomes the left side of the join. Use standard syntax to assign an alias +to the table argument that becomes the right side of the join, if needed. 
For example: ```sql -VALUES (0, 1) tab(a, b) -|> JOIN VALUES (0, 2) tab(c, d) ON a = c; +SELECT 0 AS a, 1 AS b +|> AS lhs +|> JOIN VALUES (0, 2) rhs(a, b) ON (lhs.a = rhs.a); +---+---+---+---+ | a| b| c| d| +---+---+---+---+ | 0| 1| 0| 2| +---+---+---+---+ + +VALUES ('apples', 3), ('bananas', 4) t(item, sales) +|> AS produce_sales +|> LEFT JOIN + (SELECT "apples" AS item, 123 AS id) AS produce_data + USING (item) +|> SELECT produce_sales.item, sales, id; + +/*---------+-------+------+ + | item | sales | id | + +---------+-------+------+ + | apples | 3 | 123 | + | bananas | 4 | NULL | + +---------+-------+------*/ ``` #### ORDER BY @@ -419,7 +521,8 @@ VALUES (0, 1) tab(a, b) |> ORDER BY [ASC | DESC], ... ``` -Returns the input rows after sorting as indicated. Standard modifiers are supported including NULLS FIRST/LAST. +Returns the input rows after sorting as indicated. Standard modifiers are supported including NULLS +FIRST/LAST. For example: @@ -438,10 +541,10 @@ VALUES (0), (1) tab(col) #### UNION, INTERSECT, EXCEPT ```sql -|> {UNION | INTERSECT | EXCEPT} {ALL | DISTINCT} (), (), ... +|> {UNION | INTERSECT | EXCEPT} {ALL | DISTINCT} () ``` -Performs the union or other set operation over the combined rows from the input table plus one or more tables provided as input arguments. +Performs the union or other set operation over the combined rows from the input table or subquery. For example: @@ -469,12 +572,22 @@ For example: ```sql VALUES (0), (0), (0), (0) tab(col) -|> TABLESAMPLE BERNOULLI(1 ROWS); +|> TABLESAMPLE (1 ROWS); + ++---+ +|col| ++---+ +| 0| ++---+ + +VALUES (0), (0) tab(col) +|> TABLESAMPLE (100 PERCENT); +---+ |col| +---+ | 0| +| 0| +---+ ``` diff --git a/docs/sql-ref-ansi-compliance.md b/docs/sql-ref-ansi-compliance.md index 7af54850f5da7..3b1138b9ee0e5 100644 --- a/docs/sql-ref-ansi-compliance.md +++ b/docs/sql-ref-ansi-compliance.md @@ -568,6 +568,7 @@ Below is a list of all the keywords in Spark SQL. 
|ITEMS|non-reserved|non-reserved|non-reserved| |ITERATE|non-reserved|non-reserved|non-reserved| |JOIN|reserved|strict-non-reserved|reserved| +|JSON|non-reserved|non-reserved|non-reserved| |KEYS|non-reserved|non-reserved|non-reserved| |LANGUAGE|non-reserved|non-reserved|reserved| |LAST|non-reserved|non-reserved|non-reserved| @@ -651,6 +652,7 @@ Below is a list of all the keywords in Spark SQL. |RECORDREADER|non-reserved|non-reserved|non-reserved| |RECORDWRITER|non-reserved|non-reserved|non-reserved| |RECOVER|non-reserved|non-reserved|non-reserved| +|RECURSIVE|reserved|non-reserved|reserved| |REDUCE|non-reserved|non-reserved|non-reserved| |REFERENCES|reserved|non-reserved|reserved| |REFRESH|non-reserved|non-reserved|non-reserved| diff --git a/docs/sql-ref-literals.md b/docs/sql-ref-literals.md index 141f985b0beac..7a10676cce237 100644 --- a/docs/sql-ref-literals.md +++ b/docs/sql-ref-literals.md @@ -46,6 +46,7 @@ A string literal is used to specify a character string value. One character from the character set. Use `\` to escape special characters (e.g., `'` or `\`). To represent unicode characters, use 16-bit or 32-bit unicode escape of the form `\uxxxx` or `\Uxxxxxxxx`, where xxxx and xxxxxxxx are 16-bit and 32-bit code points in hexadecimal respectively (e.g., `\u3042` for `あ` and `\U0001F44D` for `👍`). + An ASCII character can also be represented as an octal number preceded by `\` like `\101`, which represents `A`. * **r** @@ -78,14 +79,14 @@ SELECT "SPARK SQL" AS col; +---------+ | col| +---------+ -|Spark SQL| +|SPARK SQL| +---------+ SELECT 'it\'s $10.' AS col; +---------+ | col| +---------+ -|It's $10.| +|it's $10.| +---------+ SELECT r"'\n' represents newline character." 
AS col; diff --git a/docs/sql-ref-syntax-aux-describe-function.md b/docs/sql-ref-syntax-aux-describe-function.md index a871fb5bfd406..0c5a3d751a564 100644 --- a/docs/sql-ref-syntax-aux-describe-function.md +++ b/docs/sql-ref-syntax-aux-describe-function.md @@ -85,7 +85,7 @@ DESC FUNCTION max; -- Describe a builtin user defined aggregate function -- Returns function name, implementing class and usage and examples. -DESC FUNCTION EXTENDED explode +DESC FUNCTION EXTENDED explode; +---------------------------------------------------------------+ |function_desc | +---------------------------------------------------------------+ diff --git a/docs/sql-ref-syntax-aux-describe-table.md b/docs/sql-ref-syntax-aux-describe-table.md index 4b6e1e8c3461e..5f5fd27c865e6 100644 --- a/docs/sql-ref-syntax-aux-describe-table.md +++ b/docs/sql-ref-syntax-aux-describe-table.md @@ -29,16 +29,17 @@ to return the metadata pertaining to a partition or column respectively. ### Syntax ```sql -{ DESC | DESCRIBE } [ TABLE ] [ format ] table_identifier [ partition_spec ] [ col_name ] +{ DESC | DESCRIBE } [ TABLE ] [ format ] table_identifier [ partition_spec ] [ col_name ] [ AS JSON ] ``` ### Parameters * **format** - Specifies the optional format of describe output. If `EXTENDED` is specified + Specifies the optional format of describe output. If `EXTENDED` or `FORMATTED` is specified then additional metadata information (such as parent database, owner, and access time) - is returned. + is returned. Also if `EXTENDED` or `FORMATTED` is specified, then the metadata can be returned + in JSON format by specifying `AS JSON` at the end of the statement. * **table_identifier** @@ -60,8 +61,96 @@ to return the metadata pertaining to a partition or column respectively. and `col_name` are mutually exclusive and can not be specified together. Currently nested columns are not allowed to be specified. + JSON format is not currently supported for individual columns. + **Syntax:** `[ database_name. 
] [ table_name. ] column_name` +* **AS JSON** + + An optional parameter to return the table metadata in JSON format. Only supported when `EXTENDED` + or `FORMATTED` format is specified (both produce equivalent JSON). + + **Syntax:** `[ AS JSON ]` + + **Schema:** + + Below is the full JSON schema. + In actual output, null fields are omitted and the JSON is not pretty-printed (see Examples). + + ```sql + { + "table_name": "", + "catalog_name": "", + "schema_name": "", + "namespace": [""], + "type": "", + "provider": "", + "columns": [ + { + "name": "", + "type": , + "comment": "", + "nullable": , + "default": "" + } + ], + "partition_values": { + "": "" + }, + "location": "", + "view_text": "", + "view_original_text": "", + "view_schema_mode": "", + "view_catalog_and_namespace": "", + "view_query_output_columns": ["col1", "col2"], + "comment": "", + "table_properties": { + "property1": "", + "property2": "" + }, + "storage_properties": { + "property1": "", + "property2": "" + }, + "serde_library": "", + "input_format": "", + "output_format": "", + "num_buckets": , + "bucket_columns": [""], + "sort_columns": [""], + "created_time": "", + "created_by": "", + "last_access": "", + "partition_provider": "" + } + ``` + + Below are the schema definitions for ``: + +| Spark SQL Data Types | JSON Representation | +|-----------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| ByteType | `{ "name" : "tinyint" }` | +| ShortType | `{ "name" : "smallint" }` | +| IntegerType | `{ "name" : "int" }` | +| LongType | `{ "name" : "bigint" }` | +| FloatType | `{ "name" : "float" }` | +| DoubleType | `{ "name" : "double" }` | +| DecimalType | `{ "name" : "decimal", "precision": p, "scale": s }` | +| StringType | `{ "name" : "string" }` | +| VarCharType | `{ "name" : "varchar", "length": n }` | +| CharType | `{ "name" : "char", "length": n }` | +| BinaryType | 
`{ "name" : "binary" }` | +| BooleanType | `{ "name" : "boolean" }` | +| DateType | `{ "name" : "date" }` | +| VariantType | `{ "name" : "variant" }` | +| TimestampType | `{ "name" : "timestamp_ltz" }` | +| TimestampNTZType | `{ "name" : "timestamp_ntz" }` | +| YearMonthIntervalType | `{ "name" : "interval", "start_unit": "", "end_unit": "" }` | +| DayTimeIntervalType | `{ "name" : "interval", "start_unit": "", "end_unit": "" }` | +| ArrayType | `{ "name" : "array", "element_type": , "element_nullable": }` | +| MapType | `{ "name" : "map", "key_type": , "value_type": , "value_nullable": }` | +| StructType | `{ "name" : "struct", "fields": [ {"name" : "field1", "type" : , “nullable”: , "comment": “”, "default": “”}, ... ] }` | + ### Examples ```sql @@ -173,6 +262,10 @@ DESCRIBE customer salesdb.customer.name; |data_type| string| | comment|Short name| +---------+----------+ + +-- Returns the table metadata in JSON format. +DESC FORMATTED customer AS JSON; +{"table_name":"customer","catalog_name":"spark_catalog","schema_name":"default","namespace":["default"],"columns":[{"name":"cust_id","type":{"name":"integer"},"nullable":true},{"name":"name","type":{"name":"string"},"comment":"Short name","nullable":true},{"name":"state","type":{"name":"varchar","length":20},"nullable":true}],"location": "file:/tmp/salesdb.db/custom...","created_time":"2020-04-07T14:05:43Z","last_access":"UNKNOWN","created_by":"None","type":"MANAGED","provider":"parquet","partition_provider":"Catalog","partition_columns":["state"]} ``` ### Related Statements diff --git a/docs/sql-ref-syntax-aux-resource-mgmt-add-jar.md b/docs/sql-ref-syntax-aux-resource-mgmt-add-jar.md index 7ad56bf0657b2..9a50db992f9c8 100644 --- a/docs/sql-ref-syntax-aux-resource-mgmt-add-jar.md +++ b/docs/sql-ref-syntax-aux-resource-mgmt-add-jar.md @@ -54,9 +54,9 @@ ADD JAR '/some/other.jar'; ADD JAR "/path with space/abc.jar"; ADD JARS "/path with space/def.jar" '/path with space/ghi.jar'; ADD JAR "ivy://group:module:version"; 
-ADD JAR "ivy://group:module:version?transitive=false" -ADD JAR "ivy://group:module:version?transitive=true" -ADD JAR "ivy://group:module:version?exclude=group:module&transitive=true" +ADD JAR "ivy://group:module:version?transitive=false"; +ADD JAR "ivy://group:module:version?transitive=true"; +ADD JAR "ivy://group:module:version?exclude=group:module&transitive=true"; ``` ### Related Statements diff --git a/docs/sql-ref-syntax-ddl-alter-table.md b/docs/sql-ref-syntax-ddl-alter-table.md index adcfa8db06f12..28ecc44a5bf7e 100644 --- a/docs/sql-ref-syntax-ddl-alter-table.md +++ b/docs/sql-ref-syntax-ddl-alter-table.md @@ -673,12 +673,12 @@ ALTER TABLE loc_orc SET fileformat orc; ALTER TABLE p1 partition (month=2, day=2) SET fileformat parquet; -- Change the file Location -ALTER TABLE dbx.tab1 PARTITION (a='1', b='2') SET LOCATION '/path/to/part/ways' +ALTER TABLE dbx.tab1 PARTITION (a='1', b='2') SET LOCATION '/path/to/part/ways'; -- SET SERDE/ SERDE Properties ALTER TABLE test_tab SET SERDE 'org.apache.hadoop.hive.serde2.columnar.LazyBinaryColumnarSerDe'; -ALTER TABLE dbx.tab1 SET SERDE 'org.apache.hadoop' WITH SERDEPROPERTIES ('k' = 'v', 'kay' = 'vee') +ALTER TABLE dbx.tab1 SET SERDE 'org.apache.hadoop' WITH SERDEPROPERTIES ('k' = 'v', 'kay' = 'vee'); -- SET TABLE PROPERTIES ALTER TABLE dbx.tab1 SET TBLPROPERTIES ('winner' = 'loser'); diff --git a/docs/sql-ref-syntax-ddl-declare-variable.md b/docs/sql-ref-syntax-ddl-declare-variable.md index ba9857bf1917a..41ecba1364361 100644 --- a/docs/sql-ref-syntax-ddl-declare-variable.md +++ b/docs/sql-ref-syntax-ddl-declare-variable.md @@ -83,7 +83,7 @@ DECLARE OR REPLACE five = 55; -- Explicitly declare the default value of a variable using the keyword `DEFAULT` DECLARE VARIABLE size DEFAULT 6; --- STRING variable initialialized to `NULL` +-- STRING variable initialized to `NULL` DECLARE some_var STRING; ``` diff --git a/docs/sql-ref-syntax-dml-insert-table.md b/docs/sql-ref-syntax-dml-insert-table.md index 
6ca062e081747..6f85d4401d3b1 100644 --- a/docs/sql-ref-syntax-dml-insert-table.md +++ b/docs/sql-ref-syntax-dml-insert-table.md @@ -379,7 +379,7 @@ SELECT * FROM persons2; +-------------+--------------------------+---------+ -- in an atomic operation, 1) delete rows with ssn = 123456789 and 2) insert rows from persons2 -INSERT INTO persons REPLACE WHERE ssn = 123456789 SELECT * FROM persons2 +INSERT INTO persons REPLACE WHERE ssn = 123456789 SELECT * FROM persons2; SELECT * FROM persons; +-------------+--------------------------+---------+ diff --git a/docs/sql-ref-syntax-qry-star.md b/docs/sql-ref-syntax-qry-star.md index 3a997dad644b9..c575727e820e8 100644 --- a/docs/sql-ref-syntax-qry-star.md +++ b/docs/sql-ref-syntax-qry-star.md @@ -21,7 +21,7 @@ license: | ### Description -A shorthand to name all the referencable columns in the FROM clause or a specific table reference's columns or fields in the FROM clause. +A shorthand to name all the referenceable columns in the FROM clause or a specific table reference's columns or fields in the FROM clause. The star clause is most frequently used in the SELECT list. Spark also supports its use in function invocation and certain n-ary operations within the SELECT list and WHERE clause. @@ -38,11 +38,11 @@ except_clause * **name** - If present limits the columns or fields to be named to those in the specified referencable field, column, or table. + If present limits the columns or fields to be named to those in the specified referenceable field, column, or table. * **except_clause** - Optionally prunes columns or fields from the referencable set of columns identified in the select_star clause. + Optionally prunes columns or fields from the referenceable set of columns identified in the select_star clause. 
* **column_name** diff --git a/docs/sql-ref.md b/docs/sql-ref.md index 6eb2bf77c6ac0..6d557caaca3d6 100644 --- a/docs/sql-ref.md +++ b/docs/sql-ref.md @@ -26,6 +26,7 @@ Spark SQL is Apache Spark's module for working with structured data. This guide * [Data Types](sql-ref-datatypes.html) * [Datetime Pattern](sql-ref-datetime-pattern.html) * [Number Pattern](sql-ref-number-pattern.html) + * [Operators](sql-ref-operators.html) * [Functions](sql-ref-functions.html) * [Built-in Functions](sql-ref-functions-builtin.html) * [Scalar User-Defined Functions (UDFs)](sql-ref-functions-udf-scalar.html) diff --git a/examples/src/main/scala/org/apache/spark/examples/streaming/StreamingExamples.scala b/examples/src/main/scala/org/apache/spark/examples/streaming/StreamingExamples.scala index 20c5eb1700155..9289b005e3ba4 100644 --- a/examples/src/main/scala/org/apache/spark/examples/streaming/StreamingExamples.scala +++ b/examples/src/main/scala/org/apache/spark/examples/streaming/StreamingExamples.scala @@ -31,7 +31,7 @@ object StreamingExamples extends Logging { // We first log something to initialize Spark's default logging, then we override the // logging level. logInfo("Setting log level to [WARN] for streaming example." 
+ - " To override add a custom log4j.properties to the classpath.") + " To override add a custom log4j2.properties to the classpath.") Configurator.setRootLevel(Level.WARN) } } diff --git a/graphx/src/main/scala/org/apache/spark/graphx/Pregel.scala b/graphx/src/main/scala/org/apache/spark/graphx/Pregel.scala index 1493d8114c699..823143f9b9abb 100644 --- a/graphx/src/main/scala/org/apache/spark/graphx/Pregel.scala +++ b/graphx/src/main/scala/org/apache/spark/graphx/Pregel.scala @@ -126,7 +126,7 @@ object Pregel extends Logging { require(maxIterations > 0, s"Maximum number of iterations must be greater than 0," + s" but got ${maxIterations}") - val checkpointInterval = graph.vertices.sparkContext.getConf + val checkpointInterval = graph.vertices.sparkContext.getReadOnlyConf .getInt("spark.graphx.pregel.checkpointInterval", -1) var g = graph.mapVertices((vid, vdata) => vprog(vid, vdata, initialMsg)) val graphCheckpointer = new PeriodicGraphCheckpointer[VD, ED]( diff --git a/hadoop-cloud/pom.xml b/hadoop-cloud/pom.xml index 9c2f21e7ab617..3361081549242 100644 --- a/hadoop-cloud/pom.xml +++ b/hadoop-cloud/pom.xml @@ -34,6 +34,8 @@ hadoop-cloud + 3.12.12 + 1.17.6 @@ -191,6 +193,16 @@ ${jetty.version} ${hadoop.deps.scope} + + com.squareup.okhttp3 + okhttp + ${okhttp.version} + + + com.squareup.okio + okio + ${okio.version} + diff --git a/mllib-local/benchmarks/BLASBenchmark-jdk21-results.txt b/mllib-local/benchmarks/BLASBenchmark-jdk21-results.txt index 97f88e2fe2de4..b2ec64828b395 100644 --- a/mllib-local/benchmarks/BLASBenchmark-jdk21-results.txt +++ b/mllib-local/benchmarks/BLASBenchmark-jdk21-results.txt @@ -2,337 +2,311 @@ daxpy ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor daxpy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
------------------------------------------------------------------------------------------------------------------------ -f2j 143 149 3 700.3 1.4 1.0X -java 126 146 8 791.3 1.3 1.1X -native 142 149 4 705.8 1.4 1.0X +f2j 148 155 4 676.7 1.5 1.0X +java 146 153 7 684.9 1.5 1.0X ================================================================================================ saxpy ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor saxpy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -f2j 75 82 6 1339.1 0.7 1.0X -java 68 71 2 1471.4 0.7 1.1X -native 76 84 6 1321.9 0.8 1.0X +f2j 79 85 4 1270.6 0.8 1.0X +java 69 73 2 1447.1 0.7 1.1X ================================================================================================ dcopy ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor dcopy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -f2j 139 145 3 718.2 1.4 1.0X -java 127 143 9 786.7 1.3 1.1X -native 126 145 8 792.5 1.3 1.1X +f2j 131 151 10 766.2 1.3 1.0X +java 132 150 10 757.4 1.3 1.0X ================================================================================================ scopy ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 
21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor scopy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -f2j 73 80 4 1370.0 0.7 1.0X -java 69 72 2 1450.9 0.7 1.1X -native 73 80 4 1374.3 0.7 1.0X +f2j 73 83 7 1363.1 0.7 1.0X +java 72 75 2 1394.6 0.7 1.0X ================================================================================================ ddot ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor ddot: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -f2j 95 95 0 1052.1 1.0 1.0X -java 45 47 1 2236.4 0.4 2.1X -native 95 95 0 1053.4 0.9 1.0X +f2j 96 96 0 1044.5 1.0 1.0X +java 47 51 3 2147.8 0.5 2.1X ================================================================================================ sdot ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor sdot: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -f2j 93 93 0 1074.4 0.9 1.0X -java 23 23 1 4444.2 0.2 4.1X -native 93 93 0 1075.1 0.9 1.0X +f2j 94 94 0 1067.9 0.9 1.0X +java 23 25 1 4320.8 0.2 4.0X ================================================================================================ dnrm2 
================================================================================================ -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor dnrm2: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -f2j 142 143 0 701.9 1.4 1.0X -java 33 33 0 3066.4 0.3 4.4X -native 94 95 1 1060.7 0.9 1.5X +f2j 143 144 1 698.6 1.4 1.0X +java 34 35 1 2981.8 0.3 4.3X ================================================================================================ snrm2 ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor snrm2: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -f2j 113 114 1 885.5 1.1 1.0X -java 16 16 0 6158.1 0.2 7.0X -native 93 93 0 1073.9 0.9 1.2X +f2j 125 125 2 801.4 1.2 1.0X +java 16 17 1 6092.6 0.2 7.6X ================================================================================================ dscal ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor dscal: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -f2j 132 137 2 755.3 1.3 1.0X -java 120 125 2 830.5 1.2 1.1X -native 128 133 3 779.8 1.3 1.0X +f2j 141 150 7 710.1 1.4 
1.0X +java 127 135 3 784.5 1.3 1.1X ================================================================================================ sscal ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor sscal: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -f2j 78 86 7 1288.3 0.8 1.0X -java 57 63 4 1756.3 0.6 1.4X -native 69 77 7 1445.6 0.7 1.1X +f2j 79 88 8 1264.4 0.8 1.0X +java 58 67 5 1711.0 0.6 1.4X ================================================================================================ dgemv[N] ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor dgemv[N]: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -f2j 67 68 1 1491.1 0.7 1.0X -java 22 23 1 4447.1 0.2 3.0X -native 44 46 1 2264.6 0.4 1.5X +f2j 100 100 1 1003.2 1.0 1.0X +java 23 25 1 4266.2 0.2 4.3X ================================================================================================ dgemv[T] ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor dgemv[T]: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
------------------------------------------------------------------------------------------------------------------------ -f2j 93 93 0 1072.3 0.9 1.0X -java 23 23 1 4439.7 0.2 4.1X -native 93 93 0 1073.1 0.9 1.0X +f2j 94 94 1 1065.2 0.9 1.0X +java 23 24 1 4374.5 0.2 4.1X ================================================================================================ sgemv[N] ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor sgemv[N]: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -f2j 95 95 0 1053.6 0.9 1.0X -java 11 11 0 9024.3 0.1 8.6X -native 34 35 1 2939.9 0.3 2.8X +f2j 96 96 0 1042.4 1.0 1.0X +java 12 12 1 8626.4 0.1 8.3X ================================================================================================ sgemv[T] ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor sgemv[T]: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -f2j 93 93 0 1078.0 0.9 1.0X -java 11 12 0 8986.9 0.1 8.3X -native 93 93 0 1079.0 0.9 1.0X +f2j 93 94 1 1071.4 0.9 1.0X +java 11 12 1 8768.3 0.1 8.2X ================================================================================================ dger ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 
21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor dger: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -f2j 137 141 2 731.0 1.4 1.0X -java 120 123 2 836.1 1.2 1.1X -native 134 139 3 743.8 1.3 1.0X +f2j 139 144 2 717.0 1.4 1.0X +java 121 126 3 828.1 1.2 1.2X ================================================================================================ dspmv[U] ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor dspmv[U]: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -f2j 93 93 0 538.1 1.9 1.0X -java 11 12 0 4370.3 0.2 8.1X -native 47 47 0 1066.7 0.9 2.0X +f2j 92 93 2 541.7 1.8 1.0X +java 12 12 1 4276.6 0.2 7.9X ================================================================================================ dspr[U] ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor dspr[U]: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -f2j 87 95 7 574.0 1.7 1.0X -java 87 95 7 575.3 1.7 1.0X -native 62 70 6 812.0 1.2 1.4X +f2j 93 96 2 536.8 1.9 1.0X +java 88 96 7 567.6 1.8 1.1X ================================================================================================ dsyr[U] 
================================================================================================ -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor dsyr[U]: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -f2j 142 145 2 353.3 2.8 1.0X -java 142 145 2 352.9 2.8 1.0X -native 117 124 3 427.3 2.3 1.2X +f2j 136 141 2 366.7 2.7 1.0X +java 138 143 4 361.4 2.8 1.0X ================================================================================================ dgemm[N,N] ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor dgemm[N,N]: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -f2j 661 662 1 1512.5 0.7 1.0X -java 63 68 4 15787.8 0.1 10.4X -native 631 633 2 1583.8 0.6 1.0X +f2j 665 667 1 1503.9 0.7 1.0X +java 63 69 4 15804.0 0.1 10.5X ================================================================================================ dgemm[N,T] ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor dgemm[N,T]: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -f2j 669 670 2 1495.3 0.7 1.0X -java 64 70 3 15673.5 0.1 10.5X -native 375 377 5 2665.8 
0.4 1.8X +f2j 667 669 2 1499.4 0.7 1.0X +java 64 70 4 15724.9 0.1 10.5X ================================================================================================ dgemm[T,N] ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor dgemm[T,N]: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -f2j 900 901 1 1111.3 0.9 1.0X -java 63 68 4 15822.8 0.1 14.2X -native 903 904 1 1107.9 0.9 1.0X +f2j 911 913 2 1097.3 0.9 1.0X +java 63 69 4 15900.2 0.1 14.5X ================================================================================================ dgemm[T,T] ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor dgemm[T,T]: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -f2j 941 943 3 1062.6 0.9 1.0X -java 63 69 5 15771.6 0.1 14.8X -native 915 916 1 1092.7 0.9 1.0X +f2j 950 953 5 1053.1 0.9 1.0X +java 63 69 4 15828.3 0.1 15.0X ================================================================================================ sgemm[N,N] ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor sgemm[N,N]: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
------------------------------------------------------------------------------------------------------------------------ -f2j 650 651 1 1537.8 0.7 1.0X -java 40 41 1 24986.7 0.0 16.2X -native 372 372 1 2691.6 0.4 1.8X +f2j 653 655 3 1530.4 0.7 1.0X +java 40 41 1 25035.2 0.0 16.4X ================================================================================================ sgemm[N,T] ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor sgemm[N,T]: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -f2j 651 652 1 1536.7 0.7 1.0X -java 41 41 1 24643.9 0.0 16.0X -native 372 373 1 2688.8 0.4 1.7X +f2j 655 657 3 1526.9 0.7 1.0X +java 40 41 1 24749.3 0.0 16.2X ================================================================================================ sgemm[T,N] ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor sgemm[T,N]: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -f2j 901 902 0 1109.8 0.9 1.0X -java 40 41 1 25107.2 0.0 22.6X -native 918 919 1 1089.3 0.9 1.0X +f2j 906 907 2 1104.0 0.9 1.0X +java 40 41 1 25083.9 0.0 22.7X ================================================================================================ sgemm[T,T] ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 
6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor sgemm[T,T]: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -f2j 942 944 2 1061.1 0.9 1.0X -java 40 41 1 24888.3 0.0 23.5X -native 914 915 0 1093.7 0.9 1.0X +f2j 946 949 4 1056.7 0.9 1.0X +java 40 41 1 24924.3 0.0 23.6X diff --git a/mllib-local/benchmarks/BLASBenchmark-results.txt b/mllib-local/benchmarks/BLASBenchmark-results.txt index db92355b7a3c0..cb9a670c990f4 100644 --- a/mllib-local/benchmarks/BLASBenchmark-results.txt +++ b/mllib-local/benchmarks/BLASBenchmark-results.txt @@ -2,337 +2,311 @@ daxpy ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor daxpy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -f2j 150 158 4 667.3 1.5 1.0X -java 142 147 3 703.2 1.4 1.1X -native 150 158 4 668.3 1.5 1.0X +f2j 136 141 6 733.6 1.4 1.0X +java 128 131 1 782.3 1.3 1.1X ================================================================================================ saxpy ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor saxpy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -f2j 85 89 2 1173.5 0.9 1.0X -java 71 74 2 1409.0 0.7 1.2X 
-native 86 89 2 1158.6 0.9 1.0X +f2j 78 80 1 1278.9 0.8 1.0X +java 65 67 2 1540.9 0.6 1.2X ================================================================================================ dcopy ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor dcopy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -f2j 143 151 4 698.9 1.4 1.0X -java 142 150 4 705.2 1.4 1.0X -native 143 148 3 697.2 1.4 1.0X +f2j 131 136 3 766.0 1.3 1.0X +java 136 141 3 734.3 1.4 1.0X ================================================================================================ scopy ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor scopy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -f2j 82 85 2 1215.8 0.8 1.0X -java 72 75 2 1398.0 0.7 1.1X -native 80 83 2 1250.7 0.8 1.0X +f2j 77 81 1 1300.7 0.8 1.0X +java 69 72 1 1439.3 0.7 1.1X ================================================================================================ ddot ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor ddot: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
------------------------------------------------------------------------------------------------------------------------ -f2j 95 95 0 1051.8 1.0 1.0X -java 44 46 2 2279.3 0.4 2.2X -native 95 95 0 1057.0 0.9 1.0X +f2j 96 96 0 1043.9 1.0 1.0X +java 44 46 1 2251.5 0.4 2.2X ================================================================================================ sdot ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor sdot: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -f2j 93 93 0 1074.0 0.9 1.0X -java 21 22 1 4768.4 0.2 4.4X -native 93 93 1 1075.7 0.9 1.0X +f2j 94 94 2 1066.8 0.9 1.0X +java 22 23 0 4546.9 0.2 4.3X ================================================================================================ dnrm2 ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor dnrm2: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -f2j 143 143 0 699.8 1.4 1.0X -java 32 33 1 3105.2 0.3 4.4X -native 94 95 1 1061.0 0.9 1.5X +f2j 144 144 1 695.9 1.4 1.0X +java 32 33 0 3089.0 0.3 4.4X ================================================================================================ snrm2 ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS 
on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor snrm2: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -f2j 120 120 1 834.9 1.2 1.0X -java 16 16 0 6220.1 0.2 7.5X -native 93 93 2 1074.9 0.9 1.3X +f2j 121 121 1 828.5 1.2 1.0X +java 16 16 0 6186.1 0.2 7.5X ================================================================================================ dscal ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor dscal: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -f2j 142 147 2 704.2 1.4 1.0X -java 130 134 2 772.1 1.3 1.1X -native 135 142 3 740.7 1.4 1.1X +f2j 125 130 6 799.4 1.3 1.0X +java 120 123 2 832.6 1.2 1.0X ================================================================================================ sscal ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor sscal: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -f2j 80 82 1 1253.8 0.8 1.0X -java 64 68 1 1554.0 0.6 1.2X -native 80 83 2 1256.6 0.8 1.0X +f2j 73 75 1 1372.9 0.7 1.0X +java 54 59 4 1858.4 0.5 1.4X ================================================================================================ dgemv[N] 
================================================================================================ -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor dgemv[N]: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -f2j 96 96 0 1043.0 1.0 1.0X -java 22 23 1 4563.6 0.2 4.4X -native 45 47 1 2229.3 0.4 2.1X +f2j 96 97 2 1036.6 1.0 1.0X +java 23 23 1 4407.0 0.2 4.3X ================================================================================================ dgemv[T] ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor dgemv[T]: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -f2j 94 94 0 1065.7 0.9 1.0X -java 22 24 1 4467.6 0.2 4.2X -native 93 93 0 1073.1 0.9 1.0X +f2j 95 95 0 1056.5 0.9 1.0X +java 23 23 0 4436.2 0.2 4.2X ================================================================================================ sgemv[N] ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor sgemv[N]: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -f2j 95 96 0 1050.3 1.0 1.0X -java 11 12 0 8901.1 0.1 8.5X -native 34 35 1 2956.0 0.3 2.8X +f2j 96 96 2 
1042.7 1.0 1.0X +java 11 12 0 9009.9 0.1 8.6X ================================================================================================ sgemv[T] ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor sgemv[T]: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -f2j 93 93 0 1077.7 0.9 1.0X -java 11 12 0 8874.5 0.1 8.2X -native 93 93 0 1079.6 0.9 1.0X +f2j 93 94 0 1070.0 0.9 1.0X +java 11 12 0 8956.4 0.1 8.4X ================================================================================================ dger ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor dger: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -f2j 150 154 2 666.9 1.5 1.0X -java 125 130 2 801.5 1.2 1.2X -native 143 149 3 698.6 1.4 1.0X +f2j 133 136 1 750.3 1.3 1.0X +java 114 116 2 878.5 1.1 1.2X ================================================================================================ dspmv[U] ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor dspmv[U]: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
------------------------------------------------------------------------------------------------------------------------ -f2j 99 100 0 503.9 2.0 1.0X -java 11 12 0 4411.4 0.2 8.8X -native 47 47 0 1067.1 0.9 2.1X +f2j 100 101 1 498.5 2.0 1.0X +java 11 12 1 4354.1 0.2 8.7X ================================================================================================ dspr[U] ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor dspr[U]: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -f2j 96 99 2 520.9 1.9 1.0X -java 97 98 1 517.4 1.9 1.0X -native 73 77 2 681.8 1.5 1.3X +f2j 89 91 1 561.5 1.8 1.0X +java 89 91 1 559.8 1.8 1.0X ================================================================================================ dsyr[U] ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor dsyr[U]: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -f2j 144 149 2 347.0 2.9 1.0X -java 144 148 2 346.2 2.9 1.0X -native 121 126 2 413.6 2.4 1.2X +f2j 130 133 3 384.7 2.6 1.0X +java 129 132 3 386.4 2.6 1.0X ================================================================================================ dgemm[N,N] ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 
17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor dgemm[N,N]: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -f2j 665 666 2 1503.6 0.7 1.0X -java 65 71 3 15448.3 0.1 10.3X -native 630 632 2 1586.6 0.6 1.1X +f2j 670 673 4 1493.6 0.7 1.0X +java 72 73 1 13968.9 0.1 9.4X ================================================================================================ dgemm[N,T] ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor dgemm[N,T]: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -f2j 672 674 2 1487.3 0.7 1.0X -java 65 71 3 15366.3 0.1 10.3X -native 376 377 1 2661.0 0.4 1.8X +f2j 675 678 4 1482.3 0.7 1.0X +java 72 73 1 13923.4 0.1 9.4X ================================================================================================ dgemm[T,N] ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor dgemm[T,N]: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -f2j 921 921 1 1086.1 0.9 1.0X -java 64 70 3 15574.7 0.1 14.3X -native 901 902 1 1109.5 0.9 1.0X +f2j 927 929 4 1078.8 0.9 1.0X +java 71 72 1 14079.1 0.1 13.1X ================================================================================================ 
dgemm[T,T] ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor dgemm[T,T]: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -f2j 950 952 5 1052.4 1.0 1.0X -java 71 72 1 14034.5 0.1 13.3X -native 914 914 1 1094.4 0.9 1.0X +f2j 960 964 6 1041.4 1.0 1.0X +java 71 73 1 13994.1 0.1 13.4X ================================================================================================ sgemm[N,N] ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor sgemm[N,N]: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -f2j 648 649 2 1543.9 0.6 1.0X -java 41 42 1 24403.6 0.0 15.8X -native 371 371 1 2699.0 0.4 1.7X +f2j 653 655 3 1531.0 0.7 1.0X +java 41 41 1 24509.8 0.0 16.0X ================================================================================================ sgemm[N,T] ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor sgemm[N,T]: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -f2j 652 653 2 1533.0 0.7 1.0X -java 42 43 1 24056.0 0.0 15.7X 
-native 371 372 5 2697.5 0.4 1.8X +f2j 658 663 4 1520.1 0.7 1.0X +java 41 43 1 24199.3 0.0 15.9X ================================================================================================ sgemm[T,N] ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor sgemm[T,N]: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -f2j 900 901 2 1111.2 0.9 1.0X -java 40 41 1 24704.4 0.0 22.2X -native 917 919 2 1090.4 0.9 1.0X +f2j 907 908 2 1103.0 0.9 1.0X +java 40 42 1 24844.4 0.0 22.5X ================================================================================================ sgemm[T,T] ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor sgemm[T,T]: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -f2j 948 950 1 1054.4 0.9 1.0X -java 41 42 1 24366.6 0.0 23.1X -native 913 915 1 1094.8 0.9 1.0X +f2j 955 958 5 1047.1 1.0 1.0X +java 41 42 1 24509.7 0.0 23.4X diff --git a/mllib/benchmarks/UDTSerializationBenchmark-jdk21-results.txt b/mllib/benchmarks/UDTSerializationBenchmark-jdk21-results.txt index 00de1f2d104f9..f3a4fbcb8fc2a 100644 --- a/mllib/benchmarks/UDTSerializationBenchmark-jdk21-results.txt +++ b/mllib/benchmarks/UDTSerializationBenchmark-jdk21-results.txt @@ -2,11 +2,11 @@ VectorUDT de/serialization 
================================================================================================ -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor VectorUDT de/serialization: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -serialize 92 103 6 0.0 92038.9 1.0X -deserialize 69 74 3 0.0 69046.7 1.3X +serialize 102 107 2 0.0 101911.3 1.0X +deserialize 75 78 1 0.0 74761.1 1.4X diff --git a/mllib/benchmarks/UDTSerializationBenchmark-results.txt b/mllib/benchmarks/UDTSerializationBenchmark-results.txt index 05004e6f74f3c..e77536e502c3b 100644 --- a/mllib/benchmarks/UDTSerializationBenchmark-results.txt +++ b/mllib/benchmarks/UDTSerializationBenchmark-results.txt @@ -2,11 +2,11 @@ VectorUDT de/serialization ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor VectorUDT de/serialization: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -serialize 92 109 9 0.0 91694.5 1.0X -deserialize 69 71 1 0.0 69297.4 1.3X +serialize 95 97 1 0.0 94889.7 1.0X +deserialize 70 74 3 0.0 69517.2 1.4X diff --git a/mllib/src/main/resources/META-INF/services/org.apache.spark.ml.Estimator b/mllib/src/main/resources/META-INF/services/org.apache.spark.ml.Estimator new file mode 100644 index 0000000000000..e6902f62c4d60 --- /dev/null +++ b/mllib/src/main/resources/META-INF/services/org.apache.spark.ml.Estimator @@ -0,0 +1,20 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# 
contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# Spark Connect ML uses ServiceLoader to find out the supported Spark Ml estimators. +# So register the supported estimator here if you're trying to add a new one. +org.apache.spark.ml.classification.LogisticRegression diff --git a/mllib/src/main/resources/META-INF/services/org.apache.spark.ml.Transformer b/mllib/src/main/resources/META-INF/services/org.apache.spark.ml.Transformer new file mode 100644 index 0000000000000..004ec8aeff8cf --- /dev/null +++ b/mllib/src/main/resources/META-INF/services/org.apache.spark.ml.Transformer @@ -0,0 +1,20 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +# + +# Spark Connect ML uses ServiceLoader to find out the supported Spark Ml non-model transformer. +# So register the supported transformer here if you're trying to add a new one. +org.apache.spark.ml.feature.VectorAssembler diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/ClassificationSummary.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/ClassificationSummary.scala index 9f3428db484c2..88cfb703fca41 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/ClassificationSummary.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/ClassificationSummary.scala @@ -19,6 +19,7 @@ package org.apache.spark.ml.classification import org.apache.spark.annotation.Since import org.apache.spark.ml.linalg.Vector +import org.apache.spark.ml.util.Summary import org.apache.spark.mllib.evaluation.{BinaryClassificationMetrics, MulticlassMetrics} import org.apache.spark.sql.{DataFrame, Row} import org.apache.spark.sql.functions.{col, lit} @@ -28,7 +29,7 @@ import org.apache.spark.sql.types.DoubleType /** * Abstraction for multiclass classification results for a given model. */ -private[classification] trait ClassificationSummary extends Serializable { +private[classification] trait ClassificationSummary extends Summary with Serializable { /** * Dataframe output by the model's `transform` method. 
diff --git a/mllib/src/main/scala/org/apache/spark/ml/param/params.scala b/mllib/src/main/scala/org/apache/spark/ml/param/params.scala index 055c1c4d4228e..43016a32e570b 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/param/params.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/param/params.scala @@ -19,11 +19,11 @@ package org.apache.spark.ml.param import java.lang.reflect.Modifier import java.util.{List => JList} -import java.util.NoSuchElementException import scala.annotation.varargs import scala.collection.mutable import scala.jdk.CollectionConverters._ +import scala.reflect.ClassTag import org.json4s._ import org.json4s.jackson.JsonMethods._ @@ -45,9 +45,14 @@ import org.apache.spark.util.ArrayImplicits._ * See [[ParamValidators]] for factory methods for common validation functions. * @tparam T param value type */ -class Param[T](val parent: String, val name: String, val doc: String, val isValid: T => Boolean) +class Param[T: ClassTag]( + val parent: String, val name: String, val doc: String, val isValid: T => Boolean) extends Serializable { + // Spark Connect ML needs T type information which has been erased when compiling, + // Use classTag to preserve the T type. 
+ val paramValueClassTag = implicitly[ClassTag[T]] + def this(parent: Identifiable, name: String, doc: String, isValid: T => Boolean) = this(parent.uid, name, doc, isValid) diff --git a/mllib/src/main/scala/org/apache/spark/ml/stat/Summarizer.scala b/mllib/src/main/scala/org/apache/spark/ml/stat/Summarizer.scala index 4c3242c132090..e67b72e090601 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/stat/Summarizer.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/stat/Summarizer.scala @@ -29,8 +29,9 @@ import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{Expression, ImplicitCastInputTypes} import org.apache.spark.sql.catalyst.expressions.aggregate.TypedImperativeAggregate import org.apache.spark.sql.catalyst.trees.BinaryLike +import org.apache.spark.sql.classic.ClassicConversions._ import org.apache.spark.sql.functions.lit -import org.apache.spark.sql.internal.ExpressionUtils.{column, expression} +import org.apache.spark.sql.internal.ExpressionUtils.expression import org.apache.spark.sql.types._ import org.apache.spark.util.Utils @@ -249,13 +250,13 @@ private[ml] class SummaryBuilderImpl( ) extends SummaryBuilder { override def summary(featuresCol: Column, weightCol: Column): Column = { - SummaryBuilderImpl.MetricsAggregate( + Column(SummaryBuilderImpl.MetricsAggregate( requestedMetrics, requestedCompMetrics, - featuresCol, - weightCol, + expression(featuresCol), + expression(weightCol), mutableAggBufferOffset = 0, - inputAggBufferOffset = 0) + inputAggBufferOffset = 0)) } } diff --git a/mllib/src/main/scala/org/apache/spark/ml/util/Summary.scala b/mllib/src/main/scala/org/apache/spark/ml/util/Summary.scala new file mode 100644 index 0000000000000..6205fea92ef83 --- /dev/null +++ b/mllib/src/main/scala/org/apache/spark/ml/util/Summary.scala @@ -0,0 +1,28 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.ml.util + +import org.apache.spark.annotation.Since + +/** + * Trait for the Summary + * All the summaries should extend from this Summary in order to + * support connect. + */ +@Since("4.0.0") +private[spark] trait Summary diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala b/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala index b5b2233ecb756..100fa13db5180 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala @@ -520,9 +520,15 @@ class Word2VecModel private[spark] ( } } + // Auxiliary constructor must begin with call to 'this'. + // Helper constructor for `def this(model: Map[String, Array[Float]])`. 
+ private def this(model: (Map[String, Int], Array[Float])) = { + this(model._1, model._2) + } + @Since("1.5.0") def this(model: Map[String, Array[Float]]) = { - this(Word2VecModel.buildWordIndex(model), Word2VecModel.buildWordVectors(model)) + this(Word2VecModel.buildFromVecMap(model)) } @Since("1.4.0") @@ -642,21 +648,22 @@ class Word2VecModel private[spark] ( @Since("1.4.0") object Word2VecModel extends Loader[Word2VecModel] { - private def buildWordIndex(model: Map[String, Array[Float]]): Map[String, Int] = { - CUtils.toMapWithIndex(model.keys) - } - - private def buildWordVectors(model: Map[String, Array[Float]]): Array[Float] = { + private def buildFromVecMap( + model: Map[String, Array[Float]]): (Map[String, Int], Array[Float]) = { require(model.nonEmpty, "Word2VecMap should be non-empty") + val (vectorSize, numWords) = (model.head._2.length, model.size) - val wordList = model.keys.toArray val wordVectors = new Array[Float](vectorSize * numWords) - var i = 0 - while (i < numWords) { - Array.copy(model(wordList(i)), 0, wordVectors, i * vectorSize, vectorSize) - i += 1 + + val wordIndex = collection.immutable.Map.newBuilder[String, Int] + wordIndex.sizeHint(numWords) + + model.iterator.zipWithIndex.foreach { + case ((word, vector), i) => + wordIndex += ((word, i)) + Array.copy(vector, 0, wordVectors, i * vectorSize, vectorSize) } - wordVectors + (wordIndex.result(), wordVectors) } private object SaveLoadV1_0 { diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/model/DecisionTreeModel.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/model/DecisionTreeModel.scala index b45211c1689c7..2acc49e218f2d 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/tree/model/DecisionTreeModel.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/model/DecisionTreeModel.scala @@ -204,7 +204,7 @@ object DecisionTreeModel extends Loader[DecisionTreeModel] with Logging { // TODO: Fix this issue for real. 
val memThreshold = 768 if (sc.isLocal) { - val driverMemory = sc.getConf.getOption("spark.driver.memory") + val driverMemory = sc.getReadOnlyConf.getOption("spark.driver.memory") .orElse(Option(System.getenv("SPARK_DRIVER_MEMORY"))) .map(Utils.memoryStringToMb) .getOrElse(Utils.DEFAULT_DRIVER_MEM_MB) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/model/treeEnsembleModels.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/model/treeEnsembleModels.scala index 7251dfd07a1fa..af922dda13f6b 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/tree/model/treeEnsembleModels.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/model/treeEnsembleModels.scala @@ -402,7 +402,7 @@ private[tree] object TreeEnsembleModel extends Logging { // TODO: Fix this issue for real. val memThreshold = 768 if (sc.isLocal) { - val driverMemory = sc.getConf.getOption("spark.driver.memory") + val driverMemory = sc.getReadOnlyConf.getOption("spark.driver.memory") .orElse(Option(System.getenv("SPARK_DRIVER_MEMORY"))) .map(Utils.memoryStringToMb) .getOrElse(Utils.DEFAULT_DRIVER_MEM_MB) diff --git a/mllib/src/test/java/org/apache/spark/ml/param/JavaTestParams.java b/mllib/src/test/java/org/apache/spark/ml/param/JavaTestParams.java index 1ad5f7a442daa..b3993c453e91f 100644 --- a/mllib/src/test/java/org/apache/spark/ml/param/JavaTestParams.java +++ b/mllib/src/test/java/org/apache/spark/ml/param/JavaTestParams.java @@ -21,6 +21,7 @@ import java.util.List; import org.apache.spark.ml.util.Identifiable$; +import scala.reflect.ClassTag; /** * A subclass of Params for testing. 
@@ -110,7 +111,7 @@ private void init() { ParamValidators.inRange(0.0, 1.0)); List validStrings = Arrays.asList("a", "b"); myStringParam_ = new Param<>(this, "myStringParam", "this is a string param", - ParamValidators.inArray(validStrings)); + ParamValidators.inArray(validStrings), ClassTag.apply(String.class)); myDoubleArrayParam_ = new DoubleArrayParam(this, "myDoubleArrayParam", "this is a double param"); diff --git a/pom.xml b/pom.xml index 9fa0b3cc8a4b7..41a5ce0c5592c 100644 --- a/pom.xml +++ b/pom.xml @@ -121,11 +121,11 @@ spark 9.7.1 2.0.16 - 2.24.1 + 2.24.3 3.4.1 - 4.28.3 + 4.29.3 3.11.4 3.9.3 5.7.1 @@ -137,21 +137,21 @@ 3.9.0 10.16.1.1 - 1.14.4 - 2.0.3 + 1.15.0 + 2.1.0 shaded-protobuf 11.0.24 5.0.0 4.0.1 0.10.0 - 2.5.2 + 2.5.3 2.0.8 - 4.2.28 + 4.2.29 1.12.0 1.12.0 @@ -180,12 +180,12 @@ true true 1.9.13 - 2.18.1 - 2.18.1 + 2.18.2 + 2.18.2 2.3.1 1.1.10.7 3.0.3 - 1.17.1 + 1.17.2 1.27.1 2.18.0 @@ -195,7 +195,7 @@ 2.12.0 4.1.17 - 33.2.1-jre + 33.3.1-jre 2.11.0 3.1.9 3.0.16 @@ -212,31 +212,30 @@ 1.1.0 1.9.0 1.79 - 1.15.0 + 1.16.0 6.1.1 - 4.1.114.Final + 4.1.115.Final 2.0.69.Final 76.1 - 5.11.3 - 1.11.3 + 5.11.4 + 1.11.4 - 0.13.1 + 0.13.3 - 18.0.0 + 18.1.0 3.0.0 0.12.6 org.fusesource.leveldbjni - 6.13.4 - 1.17.6 + 7.0.1 ${java.home} @@ -295,7 +294,7 @@ true - 33.2.1-jre + 33.3.1-jre 1.0.2 1.67.1 1.1.4 @@ -334,7 +333,7 @@ 42.7.4 11.5.9.0 12.8.1.jre11 - 23.6.0.24.10 + 23.6.0.24.10 ${project.version} @@ -631,7 +630,7 @@ org.apache.commons commons-text - 1.12.0 + 1.13.0 commons-lang @@ -839,7 +838,7 @@ com.github.luben zstd-jni - 1.5.6-7 + 1.5.6-9 com.clearspring.analytics @@ -1348,7 +1347,7 @@ com.oracle.database.jdbc ojdbc17 - ${ojdbc11.version} + ${ojdbc17.version} test @@ -2594,11 +2593,6 @@ javax.servlet-api ${javaxservlet.version} - - com.squareup.okio - okio - ${okio.version} - @@ -3225,7 +3219,7 @@ --> com.puppycrawl.tools checkstyle - 10.20.0 + 10.20.2 diff --git a/project/MimaExcludes.scala b/project/MimaExcludes.scala index 1c3e2f16cb0f8..a3a56a6f02dad 
100644 --- a/project/MimaExcludes.scala +++ b/project/MimaExcludes.scala @@ -201,6 +201,11 @@ object MimaExcludes { // SPARK-50112: Moving avro files from connector to sql/core ProblemFilters.exclude[Problem]("org.apache.spark.sql.avro.*"), + + // SPARK-50768: Introduce TaskContext.createResourceUninterruptibly to avoid stream leak by task interruption + ProblemFilters.exclude[ReversedMissingMethodProblem]("org.apache.spark.TaskContext.interruptible"), + ProblemFilters.exclude[ReversedMissingMethodProblem]("org.apache.spark.TaskContext.pendingInterrupt"), + ProblemFilters.exclude[ReversedMissingMethodProblem]("org.apache.spark.TaskContext.createResourceUninterruptibly"), ) ++ loggingExcludes("org.apache.spark.sql.DataFrameReader") ++ loggingExcludes("org.apache.spark.sql.streaming.DataStreamReader") ++ loggingExcludes("org.apache.spark.sql.SparkSession#Builder") diff --git a/project/SparkBuild.scala b/project/SparkBuild.scala index 48b243618eea3..d84c0f17d2b2b 100644 --- a/project/SparkBuild.scala +++ b/project/SparkBuild.scala @@ -65,10 +65,10 @@ object BuildCommons { ).map(ProjectRef(buildLocation, _)) ++ sqlProjects ++ streamingProjects ++ connectProjects val optionallyEnabledProjects@Seq(kubernetes, yarn, - sparkGangliaLgpl, streamingKinesisAsl, + sparkGangliaLgpl, streamingKinesisAsl, profiler, dockerIntegrationTests, hadoopCloud, kubernetesIntegrationTests) = Seq("kubernetes", "yarn", - "ganglia-lgpl", "streaming-kinesis-asl", + "ganglia-lgpl", "streaming-kinesis-asl", "profiler", "docker-integration-tests", "hadoop-cloud", "kubernetes-integration-tests").map(ProjectRef(buildLocation, _)) val assemblyProjects@Seq(networkYarn, streamingKafka010Assembly, streamingKinesisAslAssembly) = @@ -89,7 +89,7 @@ object BuildCommons { // Google Protobuf version used for generating the protobuf. // SPARK-41247: needs to be consistent with `protobuf.version` in `pom.xml`. - val protoVersion = "4.28.3" + val protoVersion = "4.29.3" // GRPC version used for Spark Connect. 
val grpcVersion = "1.67.1" } @@ -371,7 +371,7 @@ object SparkBuild extends PomBuild { Seq( spark, hive, hiveThriftServer, repl, networkCommon, networkShuffle, networkYarn, unsafe, tags, tokenProviderKafka010, sqlKafka010, connectCommon, connect, connectClient, - variant, connectShims + variant, connectShims, profiler ).contains(x) } @@ -1057,7 +1057,7 @@ object KubernetesIntegrationTests { * Overrides to work around sbt's dependency resolution being different from Maven's. */ object DependencyOverrides { - lazy val guavaVersion = sys.props.get("guava.version").getOrElse("33.1.0-jre") + lazy val guavaVersion = sys.props.get("guava.version").getOrElse("33.3.1-jre") lazy val settings = Seq( dependencyOverrides += "com.google.guava" % "guava" % guavaVersion, dependencyOverrides += "jline" % "jline" % "2.14.6", @@ -1469,11 +1469,11 @@ object SparkUnidoc extends SharedUnidocSettings { (ScalaUnidoc / unidoc / unidocProjectFilter) := inAnyProject -- inProjects(OldDeps.project, repl, examples, tools, kubernetes, yarn, tags, streamingKafka010, sqlKafka010, connectCommon, connect, connectClient, - connectShims, protobuf), + connectShims, protobuf, profiler), (JavaUnidoc / unidoc / unidocProjectFilter) := inAnyProject -- inProjects(OldDeps.project, repl, examples, tools, kubernetes, yarn, tags, streamingKafka010, sqlKafka010, connectCommon, connect, connectClient, - connectShims, protobuf), + connectShims, protobuf, profiler), ) } @@ -1724,7 +1724,7 @@ object TestSettings { (Test / testOptions) += Tests.Argument(TestFrameworks.ScalaTest, "-W", "120", "300"), (Test / testOptions) += Tests.Argument(TestFrameworks.JUnit, "-v", "-a"), // Enable Junit testing. - libraryDependencies += "com.github.sbt.junit" % "jupiter-interface" % "0.13.1" % "test", + libraryDependencies += "com.github.sbt.junit" % "jupiter-interface" % "0.13.3" % "test", // `parallelExecutionInTest` controls whether test suites belonging to the same SBT project // can run in parallel with one another. 
It does NOT control whether tests execute in parallel // within the same JVM (which is controlled by `testForkedParallel`) or whether test cases diff --git a/project/plugins.sbt b/project/plugins.sbt index 8ae03c0995132..4e1c282dcf315 100644 --- a/project/plugins.sbt +++ b/project/plugins.sbt @@ -19,10 +19,10 @@ addSbtPlugin("software.purpledragon" % "sbt-checkstyle-plugin" % "4.0.1") // If you are changing the dependency setting for checkstyle plugin, // please check pom.xml in the root of the source tree too. -libraryDependencies += "com.puppycrawl.tools" % "checkstyle" % "10.20.0" +libraryDependencies += "com.puppycrawl.tools" % "checkstyle" % "10.20.2" -// checkstyle uses guava 33.1.0-jre. -libraryDependencies += "com.google.guava" % "guava" % "33.1.0-jre" +// checkstyle uses guava 33.3.1-jre. +libraryDependencies += "com.google.guava" % "guava" % "33.3.1-jre" addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "2.3.0") @@ -44,6 +44,6 @@ addSbtPlugin("com.simplytyped" % "sbt-antlr4" % "0.8.3") addSbtPlugin("com.github.sbt" % "sbt-pom-reader" % "2.4.0") -addSbtPlugin("com.github.sbt.junit" % "sbt-jupiter-interface" % "0.13.1") +addSbtPlugin("com.github.sbt.junit" % "sbt-jupiter-interface" % "0.13.3") addSbtPlugin("com.thesamet" % "sbt-protoc" % "1.0.7") diff --git a/python/docs/Makefile b/python/docs/Makefile index 428b0d24b568e..045b03a1afd1b 100644 --- a/python/docs/Makefile +++ b/python/docs/Makefile @@ -21,7 +21,7 @@ SPHINXBUILD ?= sphinx-build SOURCEDIR ?= source BUILDDIR ?= build -export PYTHONPATH=$(realpath ..):$(realpath ../lib/py4j-0.10.9.7-src.zip) +export PYTHONPATH=$(realpath ..):$(realpath ../lib/py4j-0.10.9.9-src.zip) # Put it first so that "make" without argument is like "make help". 
help: diff --git a/python/docs/make2.bat b/python/docs/make2.bat index 41e33cd07d418..ff0c8f991b958 100644 --- a/python/docs/make2.bat +++ b/python/docs/make2.bat @@ -25,7 +25,7 @@ if "%SPHINXBUILD%" == "" ( set SOURCEDIR=source set BUILDDIR=build -set PYTHONPATH=..;..\lib\py4j-0.10.9.7-src.zip +set PYTHONPATH=..;..\lib\py4j-0.10.9.9-src.zip if "%1" == "" goto help diff --git a/python/docs/source/_static/css/pyspark.css b/python/docs/source/_static/css/pyspark.css index 565eaea299359..6f47dd80e9503 100644 --- a/python/docs/source/_static/css/pyspark.css +++ b/python/docs/source/_static/css/pyspark.css @@ -91,16 +91,3 @@ u.bd-sidebar .nav>li>ul>.active:hover>a,.bd-sidebar .nav>li>ul>.active>a { .spec_table tr, td, th { border-top: none!important; } - -/* Styling to the version dropdown */ -#version-button { - padding-left: 0.2rem; - padding-right: 3.2rem; -} - -#version_switcher { - height: auto; - max-height: 300px; - width: 165px; - overflow-y: auto; -} diff --git a/python/docs/source/_templates/version-switcher.html b/python/docs/source/_templates/version-switcher.html deleted file mode 100644 index 16c443229f4be..0000000000000 --- a/python/docs/source/_templates/version-switcher.html +++ /dev/null @@ -1,77 +0,0 @@ - - - - - diff --git a/python/docs/source/conf.py b/python/docs/source/conf.py index 5640ba151176d..20c13cd768deb 100644 --- a/python/docs/source/conf.py +++ b/python/docs/source/conf.py @@ -188,19 +188,19 @@ # a list of builtin themes. html_theme = 'pydata_sphinx_theme' -html_context = { - # When releasing a new Spark version, please update the file - # "site/static/versions.json" under the code repository "spark-website" - # (item should be added in order), and also set the local environment - # variable "RELEASE_VERSION". 
- "switcher_json_url": "https://spark.apache.org/static/versions.json", - "switcher_template_url": "https://spark.apache.org/docs/{version}/api/python/index.html", -} - # Theme options are theme-specific and customize the look and feel of a theme # further. For a list of options available for each theme, see the # documentation. html_theme_options = { + "check_switcher": False, + "switcher": { + # When releasing a new Spark version, please update the file + # "site/static/versions.json" under the code repository "spark-website" + # (item should be added in order), and also set the local environment + # variable "RELEASE_VERSION". + "json_url": "https://spark.apache.org/static/versions.json", + "version_match": release, + }, "header_links_before_dropdown": 6, "navbar_end": ["version-switcher", "theme-switcher", "navbar-icon-links"], "footer_start": ["spark_footer", "sphinx-version"], diff --git a/python/docs/source/development/debugging.rst b/python/docs/source/development/debugging.rst index 9510fe0abde1e..0aa2426cf862d 100644 --- a/python/docs/source/development/debugging.rst +++ b/python/docs/source/development/debugging.rst @@ -669,7 +669,7 @@ Stack Traces There are Spark configurations to control stack traces: -- ``spark.sql.execution.pyspark.udf.simplifiedTraceback.enabled`` is true by default to simplify traceback from Python UDFs. +- ``spark.sql.execution.pyspark.udf.simplifiedTraceback.enabled`` is true by default to simplify traceback from Python UDFs and Data Sources. - ``spark.sql.pyspark.jvmStacktrace.enabled`` is false by default to hide JVM stacktrace and to show a Python-friendly exception only. diff --git a/python/docs/source/getting_started/install.rst b/python/docs/source/getting_started/install.rst index d0dc285b5257c..c60839025eef6 100644 --- a/python/docs/source/getting_started/install.rst +++ b/python/docs/source/getting_started/install.rst @@ -177,7 +177,7 @@ PySpark requires the following dependencies. 
========================== ========================= ============================= Package Supported version Note ========================== ========================= ============================= -`py4j` >=0.10.9.7 Required to interact with JVM +`py4j` >=0.10.9.9 Required to interact with JVM ========================== ========================= ============================= Additional libraries that enhance functionality but are not included in the installation packages: @@ -207,7 +207,7 @@ Installable with ``pip install "pyspark[connect]"``. Package Supported version Note ========================== ================= ========================== `pandas` >=2.0.0 Required for Spark Connect -`pyarrow` >=10.0.0 Required for Spark Connect +`pyarrow` >=11.0.0 Required for Spark Connect `grpcio` >=1.67.0 Required for Spark Connect `grpcio-status` >=1.67.0 Required for Spark Connect `googleapis-common-protos` >=1.65.0 Required for Spark Connect @@ -223,7 +223,7 @@ Installable with ``pip install "pyspark[sql]"``. Package Supported version Note ========= ================= ====================== `pandas` >=2.0.0 Required for Spark SQL -`pyarrow` >=10.0.0 Required for Spark SQL +`pyarrow` >=11.0.0 Required for Spark SQL ========= ================= ====================== Additional libraries that enhance functionality but are not included in the installation packages: @@ -239,8 +239,8 @@ Installable with ``pip install "pyspark[pandas_on_spark]"``. 
========= ================= ================================ Package Supported version Note ========= ================= ================================ -`pandas` >=2.0.0 Required for Pandas API on Spark -`pyarrow` >=10.0.0 Required for Pandas API on Spark +`pandas` >=2.2.0 Required for Pandas API on Spark +`pyarrow` >=11.0.0 Required for Pandas API on Spark ========= ================= ================================ Additional libraries that enhance functionality but are not included in the installation packages: diff --git a/python/docs/source/migration_guide/pyspark_upgrade.rst b/python/docs/source/migration_guide/pyspark_upgrade.rst index 5292530420025..55d067eb5fa2d 100644 --- a/python/docs/source/migration_guide/pyspark_upgrade.rst +++ b/python/docs/source/migration_guide/pyspark_upgrade.rst @@ -25,7 +25,7 @@ Upgrading from PySpark 3.5 to 4.0 * In Spark 4.0, Python 3.8 support was dropped in PySpark. * In Spark 4.0, the minimum supported version for Pandas has been raised from 1.0.5 to 2.0.0 in PySpark. * In Spark 4.0, the minimum supported version for Numpy has been raised from 1.15 to 1.21 in PySpark. -* In Spark 4.0, the minimum supported version for PyArrow has been raised from 4.0.0 to 10.0.0 in PySpark. +* In Spark 4.0, the minimum supported version for PyArrow has been raised from 4.0.0 to 11.0.0 in PySpark. * In Spark 4.0, ``Int64Index`` and ``Float64Index`` have been removed from pandas API on Spark, ``Index`` should be used directly. * In Spark 4.0, ``DataFrame.iteritems`` has been removed from pandas API on Spark, use ``DataFrame.items`` instead. * In Spark 4.0, ``Series.iteritems`` has been removed from pandas API on Spark, use ``Series.items`` instead. 
diff --git a/python/docs/source/reference/pyspark.sql/dataframe.rst b/python/docs/source/reference/pyspark.sql/dataframe.rst index 569c5cec69557..5aaea4c32577f 100644 --- a/python/docs/source/reference/pyspark.sql/dataframe.rst +++ b/python/docs/source/reference/pyspark.sql/dataframe.rst @@ -30,6 +30,7 @@ DataFrame DataFrame.agg DataFrame.alias DataFrame.approxQuantile + DataFrame.asTable DataFrame.cache DataFrame.checkpoint DataFrame.coalesce @@ -56,6 +57,7 @@ DataFrame DataFrame.dtypes DataFrame.exceptAll DataFrame.executionInfo + DataFrame.exists DataFrame.explain DataFrame.fillna DataFrame.filter @@ -75,9 +77,11 @@ DataFrame DataFrame.isStreaming DataFrame.join DataFrame.limit + DataFrame.lateralJoin DataFrame.localCheckpoint DataFrame.mapInPandas DataFrame.mapInArrow + DataFrame.metadataColumn DataFrame.melt DataFrame.na DataFrame.observe @@ -96,6 +100,7 @@ DataFrame DataFrame.sameSemantics DataFrame.sample DataFrame.sampleBy + DataFrame.scalar DataFrame.schema DataFrame.select DataFrame.selectExpr @@ -117,6 +122,7 @@ DataFrame DataFrame.toLocalIterator DataFrame.toPandas DataFrame.transform + DataFrame.transpose DataFrame.union DataFrame.unionAll DataFrame.unionByName diff --git a/python/docs/source/reference/pyspark.sql/functions.rst b/python/docs/source/reference/pyspark.sql/functions.rst index 430e353dd701c..a1ba153110f10 100644 --- a/python/docs/source/reference/pyspark.sql/functions.rst +++ b/python/docs/source/reference/pyspark.sql/functions.rst @@ -451,6 +451,8 @@ Aggregate Functions kurtosis last last_value + listagg + listagg_distinct max max_by mean @@ -476,6 +478,8 @@ Aggregate Functions stddev stddev_pop stddev_samp + string_agg + string_agg_distinct sum sum_distinct try_avg diff --git a/python/docs/source/reference/pyspark.sql/spark_session.rst b/python/docs/source/reference/pyspark.sql/spark_session.rst index 859332fa5e428..0d6a1bc79b902 100644 --- a/python/docs/source/reference/pyspark.sql/spark_session.rst +++ 
b/python/docs/source/reference/pyspark.sql/spark_session.rst @@ -44,13 +44,20 @@ See also :class:`SparkSession`. .. autosummary:: :toctree: api/ + SparkSession.addTag SparkSession.catalog + SparkSession.clearTags SparkSession.conf SparkSession.createDataFrame SparkSession.dataSource SparkSession.getActiveSession + SparkSession.getTags + SparkSession.interruptAll + SparkSession.interruptOperation + SparkSession.interruptTag SparkSession.newSession SparkSession.profile + SparkSession.removeTag SparkSession.range SparkSession.read SparkSession.readStream @@ -79,15 +86,8 @@ Spark Connect Only SparkSession.addArtifact SparkSession.addArtifacts - SparkSession.addTag SparkSession.clearProgressHandlers - SparkSession.clearTags SparkSession.client SparkSession.copyFromLocalToFs - SparkSession.getTags - SparkSession.interruptAll - SparkSession.interruptOperation - SparkSession.interruptTag SparkSession.registerProgressHandler SparkSession.removeProgressHandler - SparkSession.removeTag diff --git a/python/docs/source/reference/pyspark.sql/variant_val.rst b/python/docs/source/reference/pyspark.sql/variant_val.rst index 8630ae8aace14..883b4c8fdc3d5 100644 --- a/python/docs/source/reference/pyspark.sql/variant_val.rst +++ b/python/docs/source/reference/pyspark.sql/variant_val.rst @@ -26,3 +26,4 @@ VariantVal VariantVal.toPython VariantVal.toJson + VariantVal.parseJson diff --git a/python/docs/source/reference/pyspark.ss/index.rst b/python/docs/source/reference/pyspark.ss/index.rst index 2cb0b1216eff9..440228134fac9 100644 --- a/python/docs/source/reference/pyspark.ss/index.rst +++ b/python/docs/source/reference/pyspark.ss/index.rst @@ -20,7 +20,7 @@ Structured Streaming ==================== -This page gives an overview of all public Structed Streaming API. +This page gives an overview of all public Structured Streaming API. .. 
toctree:: :maxdepth: 2 diff --git a/python/docs/source/user_guide/sql/python_data_source.rst b/python/docs/source/user_guide/sql/python_data_source.rst index 832987d19e5a4..22b2a0b5f3c7b 100644 --- a/python/docs/source/user_guide/sql/python_data_source.rst +++ b/python/docs/source/user_guide/sql/python_data_source.rst @@ -516,3 +516,8 @@ The following example demonstrates how to implement a basic Data Source using Ar df = spark.read.format("arrowbatch").load() df.show() + +Usage Notes +----------- + +- During Data Source resolution, built-in and Scala/Java Data Sources take precedence over Python Data Sources with the same name; to explicitly use a Python Data Source, make sure its name does not conflict with the other Data Sources. diff --git a/python/lib/py4j-0.10.9.7-src.zip b/python/lib/py4j-0.10.9.7-src.zip deleted file mode 100644 index 6abba4efa0f42..0000000000000 Binary files a/python/lib/py4j-0.10.9.7-src.zip and /dev/null differ diff --git a/python/lib/py4j-0.10.9.9-src.zip b/python/lib/py4j-0.10.9.9-src.zip new file mode 100644 index 0000000000000..035bbd38ba852 Binary files /dev/null and b/python/lib/py4j-0.10.9.9-src.zip differ diff --git a/python/packaging/classic/setup.py b/python/packaging/classic/setup.py index d799af1216345..ae20fc1efdef6 100755 --- a/python/packaging/classic/setup.py +++ b/python/packaging/classic/setup.py @@ -152,7 +152,7 @@ def _supports_symlinks(): # python/packaging/connect/setup.py _minimum_pandas_version = "2.0.0" _minimum_numpy_version = "1.21" -_minimum_pyarrow_version = "10.0.0" +_minimum_pyarrow_version = "11.0.0" _minimum_grpc_version = "1.67.0" _minimum_googleapis_common_protos_version = "1.65.0" @@ -343,7 +343,7 @@ def run(self): license="http://www.apache.org/licenses/LICENSE-2.0", # Don't forget to update python/docs/source/getting_started/install.rst # if you're updating the versions or dependencies. 
- install_requires=["py4j==0.10.9.7"], + install_requires=["py4j==0.10.9.9"], extras_require={ "ml": ["numpy>=%s" % _minimum_numpy_version], "mllib": ["numpy>=%s" % _minimum_numpy_version], diff --git a/python/packaging/connect/setup.py b/python/packaging/connect/setup.py index de76d51d0cfdc..51d0a4c9e3601 100755 --- a/python/packaging/connect/setup.py +++ b/python/packaging/connect/setup.py @@ -72,9 +72,12 @@ "pyspark.testing", "pyspark.resource.tests", "pyspark.sql.tests", + "pyspark.sql.tests.arrow", "pyspark.sql.tests.connect", + "pyspark.sql.tests.connect.arrow", "pyspark.sql.tests.connect.streaming", "pyspark.sql.tests.connect.client", + "pyspark.sql.tests.connect.pandas", "pyspark.sql.tests.connect.shell", "pyspark.sql.tests.pandas", "pyspark.sql.tests.plot", @@ -129,7 +132,7 @@ # python/packaging/classic/setup.py _minimum_pandas_version = "2.0.0" _minimum_numpy_version = "1.21" - _minimum_pyarrow_version = "10.0.0" + _minimum_pyarrow_version = "11.0.0" _minimum_grpc_version = "1.59.3" _minimum_googleapis_common_protos_version = "1.56.4" diff --git a/python/pyspark/cloudpickle/__init__.py b/python/pyspark/cloudpickle/__init__.py index a3348e8b3da28..bdb1738611b3b 100644 --- a/python/pyspark/cloudpickle/__init__.py +++ b/python/pyspark/cloudpickle/__init__.py @@ -3,7 +3,7 @@ __doc__ = cloudpickle.__doc__ -__version__ = "3.0.0" +__version__ = "3.1.1" __all__ = [ # noqa "__version__", diff --git a/python/pyspark/cloudpickle/cloudpickle.py b/python/pyspark/cloudpickle/cloudpickle.py index eb43a9676bbb1..4d532e5de9f2c 100644 --- a/python/pyspark/cloudpickle/cloudpickle.py +++ b/python/pyspark/cloudpickle/cloudpickle.py @@ -63,7 +63,7 @@ import logging import opcode import pickle -from pickle import _getattribute +from pickle import _getattribute as _pickle_getattribute import platform import struct import sys @@ -126,7 +126,7 @@ def _lookup_class_or_track(class_tracker_id, class_def): def register_pickle_by_value(module): - """Register a module to make it 
functions and classes picklable by value. + """Register a module to make its functions and classes picklable by value. By default, functions and classes that are attributes of an importable module are to be pickled by reference, that is relying on re-importing @@ -192,6 +192,14 @@ def _is_registered_pickle_by_value(module): return False +if sys.version_info >= (3, 14): + def _getattribute(obj, name): + return _pickle_getattribute(obj, name.split('.')) +else: + def _getattribute(obj, name): + return _pickle_getattribute(obj, name)[0] + + def _whichmodule(obj, name): """Find the module an object belongs to. @@ -213,12 +221,13 @@ def _whichmodule(obj, name): # sys.modules if ( module_name == "__main__" + or module_name == "__mp_main__" or module is None or not isinstance(module, types.ModuleType) ): continue try: - if _getattribute(module, name)[0] is obj: + if _getattribute(module, name) is obj: return module_name except Exception: pass @@ -292,7 +301,7 @@ def _lookup_module_and_qualname(obj, name=None): return None try: - obj2, parent = _getattribute(module, name) + obj2 = _getattribute(module, name) except AttributeError: # obj was not found inside the module it points to return None @@ -409,7 +418,10 @@ def _walk_global_ops(code): def _extract_class_dict(cls): """Retrieve a copy of the dict of a class without the inherited method.""" - clsdict = dict(cls.__dict__) # copy dict proxy to a dict + # Hack to circumvent non-predictable memoization caused by string interning. + # See the inline comment in _class_setstate for details. + clsdict = {"".join(k): cls.__dict__[k] for k in sorted(cls.__dict__)} + if len(cls.__bases__) == 1: inherited_dict = cls.__bases__[0].__dict__ else: @@ -533,9 +545,15 @@ class id will also reuse this class definition. The "extra" variable is meant to be a dict (or None) that can be used for forward compatibility shall the need arise. 
""" + # We need to intern the keys of the type_kwargs dict to avoid having + # different pickles for the same dynamic class depending on whether it was + # dynamically created or reconstructed from a pickled stream. + type_kwargs = {sys.intern(k): v for k, v in type_kwargs.items()} + skeleton_class = types.new_class( name, bases, {"metaclass": type_constructor}, lambda ns: ns.update(type_kwargs) ) + return _lookup_class_or_track(class_tracker_id, skeleton_class) @@ -694,8 +712,10 @@ def _function_getstate(func): # unpickling time by iterating over slotstate and calling setattr(func, # slotname, slotvalue) slotstate = { - "__name__": func.__name__, - "__qualname__": func.__qualname__, + # Hack to circumvent non-predictable memoization caused by string interning. + # See the inline comment in _class_setstate for details. + "__name__": "".join(func.__name__), + "__qualname__": "".join(func.__qualname__), "__annotations__": func.__annotations__, "__kwdefaults__": func.__kwdefaults__, "__defaults__": func.__defaults__, @@ -721,7 +741,9 @@ def _function_getstate(func): ) slotstate["__globals__"] = f_globals - state = func.__dict__ + # Hack to circumvent non-predictable memoization caused by string interning. + # See the inline comment in _class_setstate for details. + state = {"".join(k): v for k, v in func.__dict__.items()} return state, slotstate @@ -802,6 +824,19 @@ def _code_reduce(obj): # of the specific type from types, for example: # >>> from types import CodeType # >>> help(CodeType) + + # Hack to circumvent non-predictable memoization caused by string interning. + # See the inline comment in _class_setstate for details. + co_name = "".join(obj.co_name) + + # Create shallow copies of these tuple to make cloudpickle payload deterministic. + # When creating a code object during load, copies of these four tuples are + # created, while in the main process, these tuples can be shared. + # By always creating copies, we make sure the resulting payload is deterministic. 
+ co_names = tuple(name for name in obj.co_names) + co_varnames = tuple(name for name in obj.co_varnames) + co_freevars = tuple(name for name in obj.co_freevars) + co_cellvars = tuple(name for name in obj.co_cellvars) if hasattr(obj, "co_exceptiontable"): # Python 3.11 and later: there are some new attributes # related to the enhanced exceptions. @@ -814,16 +849,16 @@ def _code_reduce(obj): obj.co_flags, obj.co_code, obj.co_consts, - obj.co_names, - obj.co_varnames, + co_names, + co_varnames, obj.co_filename, - obj.co_name, + co_name, obj.co_qualname, obj.co_firstlineno, obj.co_linetable, obj.co_exceptiontable, - obj.co_freevars, - obj.co_cellvars, + co_freevars, + co_cellvars, ) elif hasattr(obj, "co_linetable"): # Python 3.10 and later: obj.co_lnotab is deprecated and constructor @@ -837,14 +872,14 @@ def _code_reduce(obj): obj.co_flags, obj.co_code, obj.co_consts, - obj.co_names, - obj.co_varnames, + co_names, + co_varnames, obj.co_filename, - obj.co_name, + co_name, obj.co_firstlineno, obj.co_linetable, - obj.co_freevars, - obj.co_cellvars, + co_freevars, + co_cellvars, ) elif hasattr(obj, "co_nmeta"): # pragma: no cover # "nogil" Python: modified attributes from 3.9 @@ -859,15 +894,15 @@ def _code_reduce(obj): obj.co_flags, obj.co_code, obj.co_consts, - obj.co_varnames, + co_varnames, obj.co_filename, - obj.co_name, + co_name, obj.co_firstlineno, obj.co_lnotab, obj.co_exc_handlers, obj.co_jump_table, - obj.co_freevars, - obj.co_cellvars, + co_freevars, + co_cellvars, obj.co_free2reg, obj.co_cell2reg, ) @@ -882,14 +917,14 @@ def _code_reduce(obj): obj.co_flags, obj.co_code, obj.co_consts, - obj.co_names, - obj.co_varnames, + co_names, + co_varnames, obj.co_filename, - obj.co_name, + co_name, obj.co_firstlineno, obj.co_lnotab, - obj.co_freevars, - obj.co_cellvars, + co_freevars, + co_cellvars, ) return types.CodeType, args @@ -1127,7 +1162,30 @@ def _class_setstate(obj, state): if attrname == "_abc_impl": registry = attr else: + # Note: setting attribute names 
on a class automatically triggers their + # interning in CPython: + # https://github.com/python/cpython/blob/v3.12.0/Objects/object.c#L957 + # + # This means that to get deterministic pickling for a dynamic class that + # was initially defined in a different Python process, the pickler + # needs to ensure that dynamic class and function attribute names are + # systematically copied into a non-interned version to avoid + # unpredictable pickle payloads. + # + # Indeed the Pickler's memoizer relies on physical object identity to break + # cycles in the reference graph of the object being serialized. setattr(obj, attrname, attr) + + if sys.version_info >= (3, 13) and "__firstlineno__" in state: + # Set the Python 3.13+ only __firstlineno__ attribute one more time, as it + # will be automatically deleted by the `setattr(obj, attrname, attr)` call + # above when `attrname` is "__firstlineno__". We assume that preserving this + # information might be important for some users and that it not stale in the + # context of cloudpickle usage, hence legitimate to propagate. Furthermore it + # is necessary to do so to keep deterministic chained pickling as tested in + # test_deterministic_str_interning_for_chained_dynamic_class_pickling. + obj.__firstlineno__ = state["__firstlineno__"] + if registry is not None: for subclass in registry: obj.register(subclass) diff --git a/python/pyspark/cloudpickle/cloudpickle_fast.py b/python/pyspark/cloudpickle/cloudpickle_fast.py index 52d6732e44ebc..20280f0ca354a 100644 --- a/python/pyspark/cloudpickle/cloudpickle_fast.py +++ b/python/pyspark/cloudpickle/cloudpickle_fast.py @@ -6,6 +6,7 @@ See: tests/test_backward_compat.py """ + from . 
import cloudpickle diff --git a/python/pyspark/cloudpickle/compat.py b/python/pyspark/cloudpickle/compat.py deleted file mode 100644 index 5e9b52773d279..0000000000000 --- a/python/pyspark/cloudpickle/compat.py +++ /dev/null @@ -1,18 +0,0 @@ -import sys - - -if sys.version_info < (3, 8): - try: - import pickle5 as pickle # noqa: F401 - from pickle5 import Pickler # noqa: F401 - except ImportError: - import pickle # noqa: F401 - - # Use the Python pickler for old CPython versions - from pickle import _Pickler as Pickler # noqa: F401 -else: - import pickle # noqa: F401 - - # Pickler will the C implementation in CPython and the Python - # implementation in PyPy - from pickle import Pickler # noqa: F401 diff --git a/python/pyspark/core/context.py b/python/pyspark/core/context.py index 6ea793a118389..5fcd4ffb09210 100644 --- a/python/pyspark/core/context.py +++ b/python/pyspark/core/context.py @@ -75,6 +75,7 @@ if TYPE_CHECKING: from pyspark.accumulators import AccumulatorParam + from pyspark.sql.types import DataType, StructType __all__ = ["SparkContext"] @@ -362,10 +363,14 @@ def _do_init( # Create a temporary directory inside spark.local.dir: assert self._jvm is not None - local_dir = self._jvm.org.apache.spark.util.Utils.getLocalDir(self._jsc.sc().conf()) - self._temp_dir = self._jvm.org.apache.spark.util.Utils.createTempDir( - local_dir, "pyspark" - ).getAbsolutePath() + local_dir = getattr(self._jvm, "org.apache.spark.util.Utils").getLocalDir( + self._jsc.sc().conf() + ) + self._temp_dir = ( + getattr(self._jvm, "org.apache.spark.util.Utils") + .createTempDir(local_dir, "pyspark") + .getAbsolutePath() + ) # profiling stats collected for each PythonRDD if ( @@ -554,7 +559,7 @@ def setSystemProperty(cls, key: str, value: str) -> None: """ SparkContext._ensure_initialized() assert SparkContext._jvm is not None - SparkContext._jvm.java.lang.System.setProperty(key, value) + getattr(SparkContext._jvm, "java.lang.System").setProperty(key, value) @classmethod def 
getSystemProperty(cls, key: str) -> str: @@ -576,7 +581,7 @@ def getSystemProperty(cls, key: str) -> str: """ SparkContext._ensure_initialized() assert SparkContext._jvm is not None - return SparkContext._jvm.java.lang.System.getProperty(key) + return getattr(SparkContext._jvm, "java.lang.System").getProperty(key) @property def version(self) -> str: @@ -1201,7 +1206,7 @@ def binaryRecords(self, path: str, recordLength: int) -> RDD[bytes]: def _dictToJavaMap(self, d: Optional[Dict[str, str]]) -> JavaMap: assert self._jvm is not None - jm = self._jvm.java.util.HashMap() + jm = getattr(self._jvm, "java.util.HashMap")() if not d: d = {} for k, v in d.items(): @@ -1740,9 +1745,9 @@ def union(self, rdds: List[RDD[T]]) -> RDD[T]: assert gw is not None jvm = SparkContext._jvm assert jvm is not None - jrdd_cls = jvm.org.apache.spark.api.java.JavaRDD - jpair_rdd_cls = jvm.org.apache.spark.api.java.JavaPairRDD - jdouble_rdd_cls = jvm.org.apache.spark.api.java.JavaDoubleRDD + jrdd_cls = getattr(jvm, "org.apache.spark.api.java.JavaRDD") + jpair_rdd_cls = getattr(jvm, "org.apache.spark.api.java.JavaPairRDD") + jdouble_rdd_cls = getattr(jvm, "org.apache.spark.api.java.JavaDoubleRDD") if is_instance_of(gw, rdds[0]._jrdd, jrdd_cls): cls = jrdd_cls elif is_instance_of(gw, rdds[0]._jrdd, jpair_rdd_cls): @@ -1933,7 +1938,7 @@ def listFiles(self) -> List[str]: :meth:`SparkContext.addFile` """ return list( - self._jvm.scala.jdk.javaapi.CollectionConverters.asJava( # type: ignore[union-attr] + getattr(self._jvm, "scala.jdk.javaapi.CollectionConverters").asJava( self._jsc.sc().listFiles() ) ) @@ -2061,7 +2066,7 @@ def listArchives(self) -> List[str]: :meth:`SparkContext.addArchive` """ return list( - self._jvm.scala.jdk.javaapi.CollectionConverters.asJava( # type: ignore[union-attr] + getattr(self._jvm, "scala.jdk.javaapi.CollectionConverters").asJava( self._jsc.sc().listArchives() ) ) @@ -2111,7 +2116,7 @@ def _getJavaStorageLevel(self, storageLevel: StorageLevel) -> JavaObject: if not 
isinstance(storageLevel, StorageLevel): raise TypeError("storageLevel must be of type pyspark.StorageLevel") assert self._jvm is not None - newStorageLevel = self._jvm.org.apache.spark.storage.StorageLevel + newStorageLevel = getattr(self._jvm, "org.apache.spark.storage.StorageLevel") return newStorageLevel( storageLevel.useDisk, storageLevel.useMemory, @@ -2619,6 +2624,16 @@ def _assert_on_driver() -> None: messageParameters={}, ) + def _to_ddl(self, struct: "StructType") -> str: + assert self._jvm is not None + return self._jvm.PythonSQLUtils.jsonToDDL(struct.json()) + + def _parse_ddl(self, ddl: str) -> "DataType": + from pyspark.sql.types import _parse_datatype_json_string + + assert self._jvm is not None + return _parse_datatype_json_string(self._jvm.PythonSQLUtils.ddlToJson(ddl)) + def _test() -> None: import doctest diff --git a/python/pyspark/core/files.py b/python/pyspark/core/files.py index 83b98726aee70..a2544425af0cf 100644 --- a/python/pyspark/core/files.py +++ b/python/pyspark/core/files.py @@ -145,7 +145,7 @@ def getRootDirectory(cls) -> str: # This will have to change if we support multiple SparkContexts: assert cls._sc is not None assert cls._sc._jvm is not None - return cls._sc._jvm.org.apache.spark.SparkFiles.getRootDirectory() + return getattr(cls._sc._jvm, "org.apache.spark.SparkFiles").getRootDirectory() def _test() -> None: diff --git a/python/pyspark/core/rdd.py b/python/pyspark/core/rdd.py index a40af3e551584..bbf17dbed7fa2 100644 --- a/python/pyspark/core/rdd.py +++ b/python/pyspark/core/rdd.py @@ -3286,7 +3286,9 @@ def func(split: int, iterator: Iterable[Any]) -> Iterable[bytes]: assert self.ctx._jvm is not None if compressionCodecClass: - compressionCodec = self.ctx._jvm.java.lang.Class.forName(compressionCodecClass) + compressionCodec = getattr(self.ctx._jvm, "java.lang.Class").forName( + compressionCodecClass + ) keyed._jrdd.map(self.ctx._jvm.BytesToString()).saveAsTextFile(path, compressionCodec) else: 
keyed._jrdd.map(self.ctx._jvm.BytesToString()).saveAsTextFile(path) @@ -4998,8 +5000,8 @@ def barrier(self: "RDD[T]") -> "RDDBarrier[T]": ----- For additional information see - - `SPIP: Barrier Execution Mode `_ - - `Design Doc `_ + - `SPIP: Barrier Execution Mode `_ + - `Design Doc `_ This API is experimental """ @@ -5044,7 +5046,7 @@ def withResources(self: "RDD[T]", profile: ResourceProfile) -> "RDD[T]": else: assert self.ctx._jvm is not None - builder = self.ctx._jvm.org.apache.spark.resource.ResourceProfileBuilder() + builder = getattr(self.ctx._jvm, "org.apache.spark.resource.ResourceProfileBuilder")() ereqs = ExecutorResourceRequests(self.ctx._jvm, profile._executor_resource_requests) treqs = TaskResourceRequests(self.ctx._jvm, profile._task_resource_requests) builder.require(ereqs._java_executor_resource_requests) diff --git a/python/pyspark/errors/error-conditions.json b/python/pyspark/errors/error-conditions.json index b2a68a83bfa70..b7c1ec23c3af9 100644 --- a/python/pyspark/errors/error-conditions.json +++ b/python/pyspark/errors/error-conditions.json @@ -189,11 +189,6 @@ "Remote client cannot create a SparkContext. Create SparkSession instead." ] }, - "DATA_SOURCE_CREATE_ERROR": { - "message": [ - "Failed to create python data source instance, error: ." - ] - }, "DATA_SOURCE_INVALID_RETURN_TYPE": { "message": [ "Unsupported return type ('') from Python data source ''. Expected types: ." @@ -1075,7 +1070,7 @@ }, "UNSUPPORTED_JOIN_TYPE": { "message": [ - "Unsupported join type: . Supported join types include: 'inner', 'outer', 'full', 'fullouter', 'full_outer', 'leftouter', 'left', 'left_outer', 'rightouter', 'right', 'right_outer', 'leftsemi', 'left_semi', 'semi', 'leftanti', 'left_anti', 'anti', 'cross'." + "Unsupported join type: ''. Supported join types include: ." ] }, "UNSUPPORTED_LITERAL": { @@ -1108,6 +1103,11 @@ "Function `` should use only POSITIONAL or POSITIONAL OR KEYWORD arguments." 
] }, + "UNSUPPORTED_PIE_PLOT_PARAM": { + "message": [ + "Pie plot requires either a `y` column or `subplots=True`." + ] + }, "UNSUPPORTED_PLOT_BACKEND": { "message": [ "`` is not supported, it should be one of the values from " diff --git a/python/pyspark/errors/exceptions/captured.py b/python/pyspark/errors/exceptions/captured.py index 749b0cca96b78..b27c61d7563fb 100644 --- a/python/pyspark/errors/exceptions/captured.py +++ b/python/pyspark/errors/exceptions/captured.py @@ -67,7 +67,7 @@ def __init__( self._stackTrace = ( stackTrace if stackTrace is not None - else (SparkContext._jvm.org.apache.spark.util.Utils.exceptionString(origin)) + else (getattr(SparkContext._jvm, "org.apache.spark.util.Utils").exceptionString(origin)) ) self._cause = convert_exception(cause) if cause is not None else None if self._cause is None and origin is not None and origin.getCause() is not None: @@ -85,7 +85,7 @@ def __str__(self) -> str: # SPARK-42752: default to True to see issues with initialization debug_enabled = True try: - sql_conf = jvm.org.apache.spark.sql.internal.SQLConf.get() + sql_conf = getattr(jvm, "org.apache.spark.sql.internal.SQLConf").get() debug_enabled = sql_conf.pysparkJVMStacktraceEnabled() except BaseException: pass @@ -149,7 +149,7 @@ def getMessage(self) -> str: errorClass = self._origin.getErrorClass() messageParameters = self._origin.getMessageParameters() - error_message = gw.jvm.org.apache.spark.SparkThrowableHelper.getMessage( + error_message = getattr(gw.jvm, "org.apache.spark.SparkThrowableHelper").getMessage( errorClass, messageParameters ) @@ -220,7 +220,7 @@ def convert_exception(e: "Py4JJavaError") -> CapturedException: return SparkNoSuchElementException(origin=e) c: "Py4JJavaError" = e.getCause() - stacktrace: str = jvm.org.apache.spark.util.Utils.exceptionString(e) + stacktrace: str = getattr(jvm, "org.apache.spark.util.Utils").exceptionString(e) if c is not None and ( is_instance_of(gw, c, "org.apache.spark.api.python.PythonException") # To 
make sure this only catches Python UDFs. diff --git a/python/pyspark/errors/exceptions/connect.py b/python/pyspark/errors/exceptions/connect.py index c24b25af01631..ef90f8559b425 100644 --- a/python/pyspark/errors/exceptions/connect.py +++ b/python/pyspark/errors/exceptions/connect.py @@ -54,206 +54,67 @@ def convert_exception( resp: Optional[pb2.FetchErrorDetailsResponse], display_server_stacktrace: bool = False, ) -> SparkConnectException: - classes = [] - sql_state = None - errorClass = None - messageParameters = None - contexts: Optional[List[BaseQueryContext]] = None - - if "classes" in info.metadata: - classes = json.loads(info.metadata["classes"]) - - if "sqlState" in info.metadata: - sql_state = info.metadata["sqlState"] - - if "errorClass" in info.metadata: - errorClass = info.metadata["errorClass"] - - if "messageParameters" in info.metadata: - messageParameters = json.loads(info.metadata["messageParameters"]) - + raw_classes = info.metadata.get("classes") + classes: List[str] = json.loads(raw_classes) if raw_classes else [] + sql_state = info.metadata.get("sqlState") + error_class = info.metadata.get("errorClass") + raw_message_parameters = info.metadata.get("messageParameters") + message_parameters: Dict[str, str] = ( + json.loads(raw_message_parameters) if raw_message_parameters else {} + ) stacktrace: Optional[str] = None + if resp is not None and resp.HasField("root_error_idx"): message = resp.errors[resp.root_error_idx].message stacktrace = _extract_jvm_stacktrace(resp) else: message = truncated_message - stacktrace = info.metadata["stackTrace"] if "stackTrace" in info.metadata else None - display_server_stacktrace = display_server_stacktrace if stacktrace is not None else False - - if ( - resp is not None - and resp.errors - and hasattr(resp.errors[resp.root_error_idx], "spark_throwable") - ): - messageParameters = dict( - resp.errors[resp.root_error_idx].spark_throwable.message_parameters - ) - contexts = [] - for context in 
resp.errors[resp.root_error_idx].spark_throwable.query_contexts: - if context.context_type == pb2.FetchErrorDetailsResponse.QueryContext.SQL: - contexts.append(SQLQueryContext(context)) - else: - contexts.append(DataFrameQueryContext(context)) - - if "org.apache.spark.sql.catalyst.parser.ParseException" in classes: - return ParseException( - message, - errorClass=errorClass, - messageParameters=messageParameters, - sql_state=sql_state, - server_stacktrace=stacktrace, - display_server_stacktrace=display_server_stacktrace, - contexts=contexts, - ) - # Order matters. ParseException inherits AnalysisException. - elif "org.apache.spark.sql.AnalysisException" in classes: - return AnalysisException( - message, - errorClass=errorClass, - messageParameters=messageParameters, - sql_state=sql_state, - server_stacktrace=stacktrace, - display_server_stacktrace=display_server_stacktrace, - contexts=contexts, - ) - elif "org.apache.spark.sql.streaming.StreamingQueryException" in classes: - return StreamingQueryException( - message, - errorClass=errorClass, - messageParameters=messageParameters, - sql_state=sql_state, - server_stacktrace=stacktrace, - display_server_stacktrace=display_server_stacktrace, - contexts=contexts, - ) - elif "org.apache.spark.sql.execution.QueryExecutionException" in classes: - return QueryExecutionException( - message, - errorClass=errorClass, - messageParameters=messageParameters, - sql_state=sql_state, - server_stacktrace=stacktrace, - display_server_stacktrace=display_server_stacktrace, - contexts=contexts, - ) - # Order matters. NumberFormatException inherits IllegalArgumentException. 
- elif "java.lang.NumberFormatException" in classes: - return NumberFormatException( - message, - errorClass=errorClass, - messageParameters=messageParameters, - sql_state=sql_state, - server_stacktrace=stacktrace, - display_server_stacktrace=display_server_stacktrace, - contexts=contexts, - ) - elif "java.lang.IllegalArgumentException" in classes: - return IllegalArgumentException( - message, - errorClass=errorClass, - messageParameters=messageParameters, - sql_state=sql_state, - server_stacktrace=stacktrace, - display_server_stacktrace=display_server_stacktrace, - contexts=contexts, - ) - elif "java.lang.ArithmeticException" in classes: - return ArithmeticException( - message, - errorClass=errorClass, - messageParameters=messageParameters, - sql_state=sql_state, - server_stacktrace=stacktrace, - display_server_stacktrace=display_server_stacktrace, - contexts=contexts, - ) - elif "java.lang.UnsupportedOperationException" in classes: - return UnsupportedOperationException( - message, - errorClass=errorClass, - messageParameters=messageParameters, - sql_state=sql_state, - server_stacktrace=stacktrace, - display_server_stacktrace=display_server_stacktrace, - contexts=contexts, - ) - elif "java.lang.ArrayIndexOutOfBoundsException" in classes: - return ArrayIndexOutOfBoundsException( - message, - errorClass=errorClass, - messageParameters=messageParameters, - sql_state=sql_state, - server_stacktrace=stacktrace, - display_server_stacktrace=display_server_stacktrace, - contexts=contexts, - ) - elif "java.time.DateTimeException" in classes: - return DateTimeException( - message, - errorClass=errorClass, - sql_state=sql_state, - server_stacktrace=stacktrace, - display_server_stacktrace=display_server_stacktrace, - contexts=contexts, - ) - elif "org.apache.spark.SparkRuntimeException" in classes: - return SparkRuntimeException( - message, - errorClass=errorClass, - messageParameters=messageParameters, - sql_state=sql_state, - server_stacktrace=stacktrace, - 
display_server_stacktrace=display_server_stacktrace, - contexts=contexts, - ) - elif "org.apache.spark.SparkUpgradeException" in classes: - return SparkUpgradeException( - message, - errorClass=errorClass, - messageParameters=messageParameters, - sql_state=sql_state, - server_stacktrace=stacktrace, - display_server_stacktrace=display_server_stacktrace, - contexts=contexts, - ) - elif "org.apache.spark.api.python.PythonException" in classes: + stacktrace = info.metadata.get("stackTrace") + display_server_stacktrace = display_server_stacktrace if stacktrace else False + + contexts = None + if resp and resp.HasField("root_error_idx"): + root_error = resp.errors[resp.root_error_idx] + if hasattr(root_error, "spark_throwable"): + message_parameters = dict(root_error.spark_throwable.message_parameters) + contexts = [ + SQLQueryContext(c) + if c.context_type == pb2.FetchErrorDetailsResponse.QueryContext.SQL + else DataFrameQueryContext(c) + for c in root_error.spark_throwable.query_contexts + ] + + if "org.apache.spark.api.python.PythonException" in classes: return PythonException( "\n An exception was thrown from the Python worker. " "Please see the stack trace below.\n%s" % message ) - elif "org.apache.spark.SparkNoSuchElementException" in classes: - return SparkNoSuchElementException( - message, - errorClass=errorClass, - messageParameters=messageParameters, - sql_state=sql_state, - server_stacktrace=stacktrace, - display_server_stacktrace=display_server_stacktrace, - contexts=contexts, - ) - # Make sure that the generic SparkException is handled last. 
- elif "org.apache.spark.SparkException" in classes: - return SparkException( - message, - errorClass=errorClass, - messageParameters=messageParameters, - sql_state=sql_state, - server_stacktrace=stacktrace, - display_server_stacktrace=display_server_stacktrace, - contexts=contexts, - ) - else: - return SparkConnectGrpcException( - message, - reason=info.reason, - messageParameters=messageParameters, - errorClass=errorClass, - sql_state=sql_state, - server_stacktrace=stacktrace, - display_server_stacktrace=display_server_stacktrace, - contexts=contexts, - ) + + # Return exception based on class mapping + for error_class_name in classes: + ExceptionClass = EXCEPTION_CLASS_MAPPING.get(error_class_name) + if ExceptionClass: + return ExceptionClass( + message, + errorClass=error_class, + messageParameters=message_parameters, + sql_state=sql_state, + server_stacktrace=stacktrace, + display_server_stacktrace=display_server_stacktrace, + contexts=contexts, + ) + + # Return SparkConnectGrpcException if there is no matched exception class + return SparkConnectGrpcException( + message, + reason=info.reason, + messageParameters=message_parameters, + errorClass=error_class, + sql_state=sql_state, + server_stacktrace=stacktrace, + display_server_stacktrace=display_server_stacktrace, + contexts=contexts, + ) def _extract_jvm_stacktrace(resp: pb2.FetchErrorDetailsResponse) -> str: @@ -434,6 +295,26 @@ class SparkNoSuchElementException(SparkConnectGrpcException, BaseNoSuchElementEx """ +# Update EXCEPTION_CLASS_MAPPING here when adding a new exception +EXCEPTION_CLASS_MAPPING = { + "org.apache.spark.sql.catalyst.parser.ParseException": ParseException, + "org.apache.spark.sql.AnalysisException": AnalysisException, + "org.apache.spark.sql.streaming.StreamingQueryException": StreamingQueryException, + "org.apache.spark.sql.execution.QueryExecutionException": QueryExecutionException, + "java.lang.NumberFormatException": NumberFormatException, + "java.lang.IllegalArgumentException": 
IllegalArgumentException, + "java.lang.ArithmeticException": ArithmeticException, + "java.lang.UnsupportedOperationException": UnsupportedOperationException, + "java.lang.ArrayIndexOutOfBoundsException": ArrayIndexOutOfBoundsException, + "java.time.DateTimeException": DateTimeException, + "org.apache.spark.SparkRuntimeException": SparkRuntimeException, + "org.apache.spark.SparkUpgradeException": SparkUpgradeException, + "org.apache.spark.api.python.PythonException": PythonException, + "org.apache.spark.SparkNoSuchElementException": SparkNoSuchElementException, + "org.apache.spark.SparkException": SparkException, +} + + class SQLQueryContext(BaseQueryContext): def __init__(self, q: pb2.FetchErrorDetailsResponse.QueryContext): self._q = q diff --git a/python/pyspark/errors/tests/test_connect_errors_conversion.py b/python/pyspark/errors/tests/test_connect_errors_conversion.py new file mode 100644 index 0000000000000..a6ed5e7d391ee --- /dev/null +++ b/python/pyspark/errors/tests/test_connect_errors_conversion.py @@ -0,0 +1,169 @@ +# -*- encoding: utf-8 -*- +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +import unittest +from pyspark.errors.exceptions.connect import ( + convert_exception, + EXCEPTION_CLASS_MAPPING, + SparkConnectGrpcException, + PythonException, + AnalysisException, +) +from pyspark.sql.connect.proto import FetchErrorDetailsResponse as pb2 +from google.rpc.error_details_pb2 import ErrorInfo + + +class ConnectErrorsTest(unittest.TestCase): + def test_convert_exception_known_class(self): + # Mock ErrorInfo with a known error class + info = { + "reason": "org.apache.spark.sql.AnalysisException", + "metadata": { + "classes": '["org.apache.spark.sql.AnalysisException"]', + "sqlState": "42000", + "errorClass": "ANALYSIS.ERROR", + "messageParameters": '{"param1": "value1"}', + }, + } + truncated_message = "Analysis error occurred" + exception = convert_exception( + info=ErrorInfo(**info), truncated_message=truncated_message, resp=None + ) + + self.assertIsInstance(exception, AnalysisException) + self.assertEqual(exception.getSqlState(), "42000") + self.assertEqual(exception._errorClass, "ANALYSIS.ERROR") + self.assertEqual(exception._messageParameters, {"param1": "value1"}) + + def test_convert_exception_python_exception(self): + # Mock ErrorInfo for PythonException + info = { + "reason": "org.apache.spark.api.python.PythonException", + "metadata": { + "classes": '["org.apache.spark.api.python.PythonException"]', + }, + } + truncated_message = "Python worker error occurred" + exception = convert_exception( + info=ErrorInfo(**info), truncated_message=truncated_message, resp=None + ) + + self.assertIsInstance(exception, PythonException) + self.assertIn("An exception was thrown from the Python worker", exception.getMessage()) + + def test_convert_exception_unknown_class(self): + # Mock ErrorInfo with an unknown error class + info = { + "reason": "org.apache.spark.UnknownException", + "metadata": {"classes": '["org.apache.spark.UnknownException"]'}, + } + truncated_message = "Unknown error occurred" + exception = convert_exception( + 
info=ErrorInfo(**info), truncated_message=truncated_message, resp=None + ) + + self.assertIsInstance(exception, SparkConnectGrpcException) + self.assertEqual( + exception.getMessage(), "(org.apache.spark.UnknownException) Unknown error occurred" + ) + + def test_exception_class_mapping(self): + # Ensure that all keys in EXCEPTION_CLASS_MAPPING are valid + for error_class_name, exception_class in EXCEPTION_CLASS_MAPPING.items(): + self.assertTrue( + hasattr(exception_class, "__name__"), + f"{exception_class} in EXCEPTION_CLASS_MAPPING is not a valid class", + ) + + def test_convert_exception_with_stacktrace(self): + # Mock FetchErrorDetailsResponse with stacktrace + resp = pb2( + root_error_idx=0, + errors=[ + pb2.Error( + message="Root error message", + error_type_hierarchy=["org.apache.spark.SparkException"], + stack_trace=[ + pb2.StackTraceElement( + declaring_class="org.apache.spark.Main", + method_name="main", + file_name="Main.scala", + line_number=42, + ), + ], + cause_idx=1, + ), + pb2.Error( + message="Cause error message", + error_type_hierarchy=["java.lang.RuntimeException"], + stack_trace=[ + pb2.StackTraceElement( + declaring_class="org.apache.utils.Helper", + method_name="help", + file_name="Helper.java", + line_number=10, + ), + ], + ), + ], + ) + + info = { + "reason": "org.apache.spark.SparkException", + "metadata": { + "classes": '["org.apache.spark.SparkException"]', + "sqlState": "42000", + }, + } + truncated_message = "Root error message" + exception = convert_exception( + info=ErrorInfo(**info), truncated_message=truncated_message, resp=resp + ) + + self.assertIsInstance(exception, SparkConnectGrpcException) + self.assertIn("Root error message", exception.getMessage()) + self.assertIn("Caused by", exception.getMessage()) + + def test_convert_exception_fallback(self): + # Mock ErrorInfo with missing class information + info = { + "reason": "org.apache.spark.UnknownReason", + "metadata": {}, + } + truncated_message = "Fallback error occurred" + 
exception = convert_exception( + info=ErrorInfo(**info), truncated_message=truncated_message, resp=None + ) + + self.assertIsInstance(exception, SparkConnectGrpcException) + self.assertEqual( + exception.getMessage(), "(org.apache.spark.UnknownReason) Fallback error occurred" + ) + + +if __name__ == "__main__": + import unittest + from pyspark.errors.tests.test_errors import * # noqa: F401 + + try: + import xmlrunner + + testRunner = xmlrunner.XMLTestRunner(output="target/test-reports", verbosity=2) + except ImportError: + testRunner = None + unittest.main(testRunner=testRunner, verbosity=2) diff --git a/python/pyspark/errors/utils.py b/python/pyspark/errors/utils.py index cbe5739204ac1..0d01cbb961bb6 100644 --- a/python/pyspark/errors/utils.py +++ b/python/pyspark/errors/utils.py @@ -31,21 +31,44 @@ Type, Optional, Union, - TYPE_CHECKING, overload, cast, ) +from types import FrameType + import pyspark from pyspark.errors.error_classes import ERROR_CLASSES_MAP -if TYPE_CHECKING: - from pyspark.sql import SparkSession - T = TypeVar("T") FuncT = TypeVar("FuncT", bound=Callable[..., Any]) _current_origin = threading.local() +# Providing DataFrame debugging options to reduce performance slowdown. +# Default is True. 
+_enable_debugging_cache = None + + +def is_debugging_enabled() -> bool: + global _enable_debugging_cache + + if _enable_debugging_cache is None: + from pyspark.sql import SparkSession + + spark = SparkSession.getActiveSession() + if spark is not None: + _enable_debugging_cache = ( + spark.conf.get( + "spark.python.sql.dataFrameDebugging.enabled", + "true", # type: ignore[union-attr] + ).lower() + == "true" + ) + else: + _enable_debugging_cache = False + + return _enable_debugging_cache + def current_origin() -> threading.local: global _current_origin @@ -164,17 +187,12 @@ def get_message_template(self, errorClass: str) -> str: return message_template -def _capture_call_site(spark_session: "SparkSession", depth: int) -> str: +def _capture_call_site(depth: int) -> str: """ Capture the call site information including file name, line number, and function name. This function updates the thread-local storage from JVM side (PySparkCurrentOrigin) with the current call site information when a PySpark API function is called. - Parameters - ---------- - spark_session : SparkSession - Current active Spark session. 
- Notes ----- The call site information is used to enhance error messages with the exact location @@ -183,18 +201,15 @@ def _capture_call_site(spark_session: "SparkSession", depth: int) -> str: # Filtering out PySpark code and keeping user code only pyspark_root = os.path.dirname(pyspark.__file__) - def inspect_stack() -> Iterator[inspect.FrameInfo]: + def inspect_stack() -> Iterator[FrameType]: frame = inspect.currentframe() while frame: - frameinfo = (frame,) + inspect.getframeinfo(frame, context=0) - yield inspect.FrameInfo(*frameinfo) + yield frame frame = frame.f_back - stack = ( - frame_info for frame_info in inspect_stack() if pyspark_root not in frame_info.filename - ) + stack = (f for f in inspect_stack() if pyspark_root not in f.f_code.co_filename) - selected_frames: Iterator[inspect.FrameInfo] = itertools.islice(stack, depth) + selected_frames: Iterator[FrameType] = itertools.islice(stack, depth) # We try import here since IPython is not a required dependency try: @@ -210,7 +225,8 @@ def inspect_stack() -> Iterator[inspect.FrameInfo]: selected_frames = ( frame for frame in selected_frames - if (ipy_root not in frame.filename) and (ipykernel_root not in frame.filename) + if (ipy_root not in frame.f_code.co_filename) + and (ipykernel_root not in frame.f_code.co_filename) ) except ImportError: ipython = None @@ -218,10 +234,11 @@ def inspect_stack() -> Iterator[inspect.FrameInfo]: # Identifying the cell is useful when the error is generated from IPython Notebook if ipython: call_sites = [ - f"line {frame.lineno} in cell [{ipython.execution_count}]" for frame in selected_frames + f"line {frame.f_lineno} in cell [{ipython.execution_count}]" + for frame in selected_frames ] else: - call_sites = [f"{frame.filename}:{frame.lineno}" for frame in selected_frames] + call_sites = [f"{frame.f_code.co_filename}:{frame.f_lineno}" for frame in selected_frames] call_sites_str = "\n".join(call_sites) return call_sites_str @@ -239,13 +256,12 @@ def wrapper(*args: Any, 
**kwargs: Any) -> Any: from pyspark.sql.utils import is_remote spark = SparkSession.getActiveSession() - if spark is not None and hasattr(func, "__name__"): - if is_remote(): - global current_origin + if spark is not None and hasattr(func, "__name__") and is_debugging_enabled(): + if is_remote(): # Getting the configuration requires RPC call. Uses the default value for now. depth = 1 - set_current_origin(func.__name__, _capture_call_site(spark, depth)) + set_current_origin(func.__name__, _capture_call_site(depth)) try: return func(*args, **kwargs) @@ -253,8 +269,8 @@ def wrapper(*args: Any, **kwargs: Any) -> Any: set_current_origin(None, None) else: assert spark._jvm is not None - jvm_pyspark_origin = ( - spark._jvm.org.apache.spark.sql.catalyst.trees.PySparkCurrentOrigin + jvm_pyspark_origin = getattr( + spark._jvm, "org.apache.spark.sql.catalyst.trees.PySparkCurrentOrigin" ) depth = int( spark.conf.get( # type: ignore[arg-type] @@ -262,7 +278,7 @@ def wrapper(*args: Any, **kwargs: Any) -> Any: ) ) # Update call site when the function is called - jvm_pyspark_origin.set(func.__name__, _capture_call_site(spark, depth)) + jvm_pyspark_origin.set(func.__name__, _capture_call_site(depth)) try: return func(*args, **kwargs) diff --git a/python/pyspark/ml/classification.py b/python/pyspark/ml/classification.py index b89755d9c18a5..e003ba43ec7c8 100644 --- a/python/pyspark/ml/classification.py +++ b/python/pyspark/ml/classification.py @@ -62,6 +62,7 @@ HasSolver, HasParallelism, ) +from pyspark.ml.remote.util import try_remote_attribute_relation from pyspark.ml.tree import ( _DecisionTreeModel, _DecisionTreeParams, @@ -336,6 +337,7 @@ class _ClassificationSummary(JavaWrapper): @property @since("3.1.0") + @try_remote_attribute_relation def predictions(self) -> DataFrame: """ Dataframe outputted by the model's `transform` method. 
@@ -521,6 +523,7 @@ def scoreCol(self) -> str: return self._call_java("scoreCol") @property + @try_remote_attribute_relation def roc(self) -> DataFrame: """ Returns the receiver operating characteristic (ROC) curve, @@ -546,6 +549,7 @@ def areaUnderROC(self) -> float: @property @since("3.1.0") + @try_remote_attribute_relation def pr(self) -> DataFrame: """ Returns the precision-recall curve, which is a Dataframe @@ -556,6 +560,7 @@ def pr(self) -> DataFrame: @property @since("3.1.0") + @try_remote_attribute_relation def fMeasureByThreshold(self) -> DataFrame: """ Returns a dataframe with two fields (threshold, F-Measure) curve @@ -565,6 +570,7 @@ def fMeasureByThreshold(self) -> DataFrame: @property @since("3.1.0") + @try_remote_attribute_relation def precisionByThreshold(self) -> DataFrame: """ Returns a dataframe with two fields (threshold, precision) curve. @@ -575,6 +581,7 @@ def precisionByThreshold(self) -> DataFrame: @property @since("3.1.0") + @try_remote_attribute_relation def recallByThreshold(self) -> DataFrame: """ Returns a dataframe with two fields (threshold, recall) curve. 
@@ -3788,7 +3795,8 @@ def __init__(self, models: List[ClassificationModel]): assert sc is not None and sc._gateway is not None java_models_array = JavaWrapper._new_java_array( - java_models, sc._gateway.jvm.org.apache.spark.ml.classification.ClassificationModel + java_models, + getattr(sc._gateway.jvm, "org.apache.spark.ml.classification.ClassificationModel"), ) # TODO: need to set metadata metadata = JavaParams._new_java_obj("org.apache.spark.sql.types.Metadata") @@ -3928,7 +3936,8 @@ def _to_java(self) -> "JavaObject": java_models = [cast(_JavaClassificationModel, model)._to_java() for model in self.models] java_models_array = JavaWrapper._new_java_array( - java_models, sc._gateway.jvm.org.apache.spark.ml.classification.ClassificationModel + java_models, + getattr(sc._gateway.jvm, "org.apache.spark.ml.classification.ClassificationModel"), ) metadata = JavaParams._new_java_obj("org.apache.spark.sql.types.Metadata") _java_obj = JavaParams._new_java_obj( diff --git a/python/pyspark/ml/common.py b/python/pyspark/ml/common.py index 1ae15fdf547eb..2417df6ab9eb3 100644 --- a/python/pyspark/ml/common.py +++ b/python/pyspark/ml/common.py @@ -74,7 +74,7 @@ def _to_java_object_rdd(rdd: "RDD") -> "JavaObject": """ rdd = rdd._reserialize(AutoBatchedSerializer(CPickleSerializer())) assert rdd.ctx._jvm is not None - return rdd.ctx._jvm.org.apache.spark.ml.python.MLSerDe.pythonToJava(rdd._jrdd, True) + return getattr(rdd.ctx._jvm, "org.apache.spark.ml.python.MLSerDe").pythonToJava(rdd._jrdd, True) def _py2java(sc: "SparkContext", obj: Any) -> "JavaObject": @@ -98,7 +98,7 @@ def _py2java(sc: "SparkContext", obj: Any) -> "JavaObject": else: data = bytearray(CPickleSerializer().dumps(obj)) assert sc._jvm is not None - obj = sc._jvm.org.apache.spark.ml.python.MLSerDe.loads(data) + obj = getattr(sc._jvm, "org.apache.spark.ml.python.MLSerDe").loads(data) return obj @@ -117,17 +117,17 @@ def _java2py(sc: "SparkContext", r: "JavaObjectOrPickleDump", encoding: str = "b assert sc._jvm is 
not None if clsName == "JavaRDD": - jrdd = sc._jvm.org.apache.spark.ml.python.MLSerDe.javaToPython(r) + jrdd = getattr(sc._jvm, "org.apache.spark.ml.python.MLSerDe").javaToPython(r) return RDD(jrdd, sc) if clsName == "Dataset": return DataFrame(r, SparkSession._getActiveSessionOrCreate()) if clsName in _picklable_classes: - r = sc._jvm.org.apache.spark.ml.python.MLSerDe.dumps(r) + r = getattr(sc._jvm, "org.apache.spark.ml.python.MLSerDe").dumps(r) elif isinstance(r, (JavaArray, JavaList)): try: - r = sc._jvm.org.apache.spark.ml.python.MLSerDe.dumps(r) + r = getattr(sc._jvm, "org.apache.spark.ml.python.MLSerDe").dumps(r) except Py4JJavaError: pass # not picklable diff --git a/python/pyspark/ml/connect/io_utils.py b/python/pyspark/ml/connect/io_utils.py index c401e3e76676a..fdaa23ff9011c 100644 --- a/python/pyspark/ml/connect/io_utils.py +++ b/python/pyspark/ml/connect/io_utils.py @@ -38,7 +38,9 @@ def _copy_file_from_local_to_fs(local_path: str, dest_path: str) -> None: session.copyFromLocalToFs(local_path, dest_path) else: jvm = session.sparkContext._gateway.jvm # type: ignore[union-attr] - jvm.org.apache.spark.ml.python.MLUtil.copyFileFromLocalToFs(local_path, dest_path) + getattr(jvm, "org.apache.spark.ml.python.MLUtil").copyFileFromLocalToFs( + local_path, dest_path + ) def _copy_dir_from_local_to_fs(local_path: str, dest_path: str) -> None: @@ -74,7 +76,7 @@ class ParamsReadWrite(Params): def _get_extra_metadata(self) -> Any: """ - Returns exta metadata of the instance + Returns extra metadata of the instance """ return None diff --git a/python/pyspark/ml/connect/tuning.py b/python/pyspark/ml/connect/tuning.py index cdb606048a59a..190fc683acf7d 100644 --- a/python/pyspark/ml/connect/tuning.py +++ b/python/pyspark/ml/connect/tuning.py @@ -170,7 +170,7 @@ def _parallelFitTasks( if active_session is None: raise RuntimeError( - "An active SparkSession is required for running cross valiator fit tasks." 
+ "An active SparkSession is required for running cross validator fit tasks." ) def get_single_task(index: int, param_map: Any) -> Callable[[], Tuple[int, float]]: diff --git a/python/pyspark/ml/deepspeed/deepspeed_distributor.py b/python/pyspark/ml/deepspeed/deepspeed_distributor.py index 4ac5ff2fb4207..3fd1d3bb32463 100644 --- a/python/pyspark/ml/deepspeed/deepspeed_distributor.py +++ b/python/pyspark/ml/deepspeed/deepspeed_distributor.py @@ -49,7 +49,7 @@ def __init__( Parameters ---------- numGpus: int - The number of GPUs to use per node (analagous to num_gpus in deepspeed command). + The number of GPUs to use per node (analogous to num_gpus in deepspeed command). nnodes: int The number of nodes that should be used for the run. localMode: bool diff --git a/python/pyspark/ml/dl_util.py b/python/pyspark/ml/dl_util.py index 8ead529d7b729..3b87049ef2777 100644 --- a/python/pyspark/ml/dl_util.py +++ b/python/pyspark/ml/dl_util.py @@ -27,7 +27,7 @@ class FunctionPickler: This class provides a way to pickle a function and its arguments. It also provides a way to create a script that can run a function with arguments if they have them pickled to a file. - It also provides a way of extracting the conents of a pickle file. + It also provides a way of extracting the contents of a pickle file. 
""" @staticmethod diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py index e053ea273140c..cf12a5390746f 100755 --- a/python/pyspark/ml/feature.py +++ b/python/pyspark/ml/feature.py @@ -1208,7 +1208,7 @@ def from_vocabulary( sc = SparkContext._active_spark_context assert sc is not None and sc._gateway is not None - java_class = sc._gateway.jvm.java.lang.String + java_class = getattr(sc._gateway.jvm, "java.lang.String") jvocab = CountVectorizerModel._new_java_array(vocabulary, java_class) model = CountVectorizerModel._create_from_java_class( "org.apache.spark.ml.feature.CountVectorizerModel", jvocab @@ -4799,7 +4799,7 @@ def from_labels( sc = SparkContext._active_spark_context assert sc is not None and sc._gateway is not None - java_class = sc._gateway.jvm.java.lang.String + java_class = getattr(sc._gateway.jvm, "java.lang.String") jlabels = StringIndexerModel._new_java_array(labels, java_class) model = StringIndexerModel._create_from_java_class( "org.apache.spark.ml.feature.StringIndexerModel", jlabels @@ -4828,7 +4828,7 @@ def from_arrays_of_labels( sc = SparkContext._active_spark_context assert sc is not None and sc._gateway is not None - java_class = sc._gateway.jvm.java.lang.String + java_class = getattr(sc._gateway.jvm, "java.lang.String") jlabels = StringIndexerModel._new_java_array(arrayOfLabels, java_class) model = StringIndexerModel._create_from_java_class( "org.apache.spark.ml.feature.StringIndexerModel", jlabels @@ -5198,7 +5198,7 @@ def loadDefaultStopWords(language: str) -> List[str]: Supported languages: danish, dutch, english, finnish, french, german, hungarian, italian, norwegian, portuguese, russian, spanish, swedish, turkish """ - stopWordsObj = _jvm().org.apache.spark.ml.feature.StopWordsRemover + stopWordsObj = getattr(_jvm(), "org.apache.spark.ml.feature.StopWordsRemover") return list(stopWordsObj.loadDefaultStopWords(language)) diff --git a/python/pyspark/ml/functions.py b/python/pyspark/ml/functions.py index 
32941b33c4603..de5539afd4a1a 100644 --- a/python/pyspark/ml/functions.py +++ b/python/pyspark/ml/functions.py @@ -121,7 +121,9 @@ def vector_to_array(col: Column, dtype: str = "float64") -> Column: sc = SparkContext._active_spark_context assert sc is not None and sc._jvm is not None return Column( - sc._jvm.org.apache.spark.ml.functions.vector_to_array(_to_java_column(col), dtype) + getattr(sc._jvm, "org.apache.spark.ml.functions").vector_to_array( + _to_java_column(col), dtype + ) ) @@ -164,7 +166,9 @@ def array_to_vector(col: Column) -> Column: sc = SparkContext._active_spark_context assert sc is not None and sc._jvm is not None - return Column(sc._jvm.org.apache.spark.ml.functions.array_to_vector(_to_java_column(col))) + return Column( + getattr(sc._jvm, "org.apache.spark.ml.functions").array_to_vector(_to_java_column(col)) + ) def _batched( diff --git a/python/pyspark/ml/image.py b/python/pyspark/ml/image.py index d0223739ffdf8..325992c085802 100644 --- a/python/pyspark/ml/image.py +++ b/python/pyspark/ml/image.py @@ -25,7 +25,8 @@ """ import sys -from typing import Any, Dict, List, NoReturn, Optional, cast +from typing import Any, Dict, List, NoReturn, cast +from functools import cached_property import numpy as np @@ -42,14 +43,7 @@ class _ImageSchema: APIs of this class. """ - def __init__(self) -> None: - self._imageSchema: Optional[StructType] = None - self._ocvTypes: Optional[Dict[str, int]] = None - self._columnSchema: Optional[StructType] = None - self._imageFields: Optional[List[str]] = None - self._undefinedImageType: Optional[str] = None - - @property + @cached_property def imageSchema(self) -> StructType: """ Returns the image schema. 
@@ -64,14 +58,12 @@ def imageSchema(self) -> StructType: """ from pyspark.core.context import SparkContext - if self._imageSchema is None: - ctx = SparkContext._active_spark_context - assert ctx is not None and ctx._jvm is not None - jschema = ctx._jvm.org.apache.spark.ml.image.ImageSchema.imageSchema() - self._imageSchema = cast(StructType, _parse_datatype_json_string(jschema.json())) - return self._imageSchema + ctx = SparkContext._active_spark_context + assert ctx is not None and ctx._jvm is not None + jschema = getattr(ctx._jvm, "org.apache.spark.ml.image.ImageSchema").imageSchema() + return cast(StructType, _parse_datatype_json_string(jschema.json())) - @property + @cached_property def ocvTypes(self) -> Dict[str, int]: """ Returns the OpenCV type mapping supported. @@ -85,13 +77,11 @@ def ocvTypes(self) -> Dict[str, int]: """ from pyspark.core.context import SparkContext - if self._ocvTypes is None: - ctx = SparkContext._active_spark_context - assert ctx is not None and ctx._jvm is not None - self._ocvTypes = dict(ctx._jvm.org.apache.spark.ml.image.ImageSchema.javaOcvTypes()) - return self._ocvTypes + ctx = SparkContext._active_spark_context + assert ctx is not None and ctx._jvm is not None + return dict(getattr(ctx._jvm, "org.apache.spark.ml.image.ImageSchema").javaOcvTypes()) - @property + @cached_property def columnSchema(self) -> StructType: """ Returns the schema for the image column. 
@@ -106,14 +96,12 @@ def columnSchema(self) -> StructType: """ from pyspark.core.context import SparkContext - if self._columnSchema is None: - ctx = SparkContext._active_spark_context - assert ctx is not None and ctx._jvm is not None - jschema = ctx._jvm.org.apache.spark.ml.image.ImageSchema.columnSchema() - self._columnSchema = cast(StructType, _parse_datatype_json_string(jschema.json())) - return self._columnSchema + ctx = SparkContext._active_spark_context + assert ctx is not None and ctx._jvm is not None + jschema = getattr(ctx._jvm, "org.apache.spark.ml.image.ImageSchema").columnSchema() + return cast(StructType, _parse_datatype_json_string(jschema.json())) - @property + @cached_property def imageFields(self) -> List[str]: """ Returns field names of image columns. @@ -127,13 +115,11 @@ def imageFields(self) -> List[str]: """ from pyspark.core.context import SparkContext - if self._imageFields is None: - ctx = SparkContext._active_spark_context - assert ctx is not None and ctx._jvm is not None - self._imageFields = list(ctx._jvm.org.apache.spark.ml.image.ImageSchema.imageFields()) - return self._imageFields + ctx = SparkContext._active_spark_context + assert ctx is not None and ctx._jvm is not None + return list(getattr(ctx._jvm, "org.apache.spark.ml.image.ImageSchema").imageFields()) - @property + @cached_property def undefinedImageType(self) -> str: """ Returns the name of undefined image type for the invalid image. 
@@ -142,13 +128,9 @@ def undefinedImageType(self) -> str: """ from pyspark.core.context import SparkContext - if self._undefinedImageType is None: - ctx = SparkContext._active_spark_context - assert ctx is not None and ctx._jvm is not None - self._undefinedImageType = ( - ctx._jvm.org.apache.spark.ml.image.ImageSchema.undefinedImageType() - ) - return self._undefinedImageType + ctx = SparkContext._active_spark_context + assert ctx is not None and ctx._jvm is not None + return getattr(ctx._jvm, "org.apache.spark.ml.image.ImageSchema").undefinedImageType() def toNDArray(self, image: Row) -> np.ndarray: """ diff --git a/python/pyspark/ml/pipeline.py b/python/pyspark/ml/pipeline.py index 01339283839e1..0ffacde3bb423 100644 --- a/python/pyspark/ml/pipeline.py +++ b/python/pyspark/ml/pipeline.py @@ -207,7 +207,7 @@ def _to_java(self) -> "JavaObject": gateway = SparkContext._gateway assert gateway is not None and SparkContext._jvm is not None - cls = SparkContext._jvm.org.apache.spark.ml.PipelineStage + cls = getattr(SparkContext._jvm, "org.apache.spark.ml.PipelineStage") java_stages = gateway.new_array(cls, len(self.getStages())) for idx, stage in enumerate(self.getStages()): java_stages[idx] = cast(JavaParams, stage)._to_java() @@ -361,7 +361,7 @@ def _to_java(self) -> "JavaObject": gateway = SparkContext._gateway assert gateway is not None and SparkContext._jvm is not None - cls = SparkContext._jvm.org.apache.spark.ml.Transformer + cls = getattr(SparkContext._jvm, "org.apache.spark.ml.Transformer") java_stages = gateway.new_array(cls, len(self.stages)) for idx, stage in enumerate(self.stages): java_stages[idx] = cast(JavaParams, stage)._to_java() diff --git a/python/pyspark/ml/remote/__init__.py b/python/pyspark/ml/remote/__init__.py new file mode 100644 index 0000000000000..cce3acad34a49 --- /dev/null +++ b/python/pyspark/ml/remote/__init__.py @@ -0,0 +1,16 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. 
See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# diff --git a/python/pyspark/ml/remote/proto.py b/python/pyspark/ml/remote/proto.py new file mode 100644 index 0000000000000..3a81e74b6aec3 --- /dev/null +++ b/python/pyspark/ml/remote/proto.py @@ -0,0 +1,76 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +from typing import Optional, TYPE_CHECKING, List + +import pyspark.sql.connect.proto as pb2 +from pyspark.sql.connect.plan import LogicalPlan + +if TYPE_CHECKING: + from pyspark.sql.connect.client import SparkConnectClient + + +class TransformerRelation(LogicalPlan): + """A logical plan for transforming of a transformer which could be a cached model + or a non-model transformer like VectorAssembler.""" + + def __init__( + self, + child: Optional["LogicalPlan"], + name: str, + ml_params: pb2.MlParams, + uid: str = "", + is_model: bool = True, + ) -> None: + super().__init__(child) + self._name = name + self._ml_params = ml_params + self._uid = uid + self._is_model = is_model + + def plan(self, session: "SparkConnectClient") -> pb2.Relation: + assert self._child is not None + plan = self._create_proto_relation() + plan.ml_relation.transform.input.CopyFrom(self._child.plan(session)) + + if self._is_model: + plan.ml_relation.transform.obj_ref.CopyFrom(pb2.ObjectRef(id=self._name)) + else: + plan.ml_relation.transform.transformer.CopyFrom( + pb2.MlOperator(name=self._name, uid=self._uid, type=pb2.MlOperator.TRANSFORMER) + ) + + if self._ml_params is not None: + plan.ml_relation.transform.params.CopyFrom(self._ml_params) + + return plan + + +class AttributeRelation(LogicalPlan): + """A logical plan used in ML to represent an attribute of an instance, which + could be a model or a summary. This attribute returns a DataFrame. 
+ """ + + def __init__(self, ref_id: str, methods: List[pb2.Fetch.Method]) -> None: + super().__init__(None) + self._ref_id = ref_id + self._methods = methods + + def plan(self, session: "SparkConnectClient") -> pb2.Relation: + plan = self._create_proto_relation() + plan.ml_relation.fetch.obj_ref.CopyFrom(pb2.ObjectRef(id=self._ref_id)) + plan.ml_relation.fetch.methods.extend(self._methods) + return plan diff --git a/python/pyspark/ml/remote/readwrite.py b/python/pyspark/ml/remote/readwrite.py new file mode 100644 index 0000000000000..9149ab3bfd454 --- /dev/null +++ b/python/pyspark/ml/remote/readwrite.py @@ -0,0 +1,134 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +from typing import cast, Type, TYPE_CHECKING + +import pyspark.sql.connect.proto as pb2 +from pyspark.ml.remote.serialize import serialize_ml_params, deserialize, deserialize_param +from pyspark.ml.util import MLWriter, MLReader, RL +from pyspark.ml.wrapper import JavaWrapper + +if TYPE_CHECKING: + from pyspark.ml.util import JavaMLReadable, JavaMLWritable + from pyspark.core.context import SparkContext + + +class RemoteMLWriter(MLWriter): + def __init__(self, instance: "JavaMLWritable") -> None: + super().__init__() + self._instance = instance + + @property + def sc(self) -> "SparkContext": + raise RuntimeError("Accessing SparkContext is not supported on Connect") + + def save(self, path: str) -> None: + from pyspark.ml.wrapper import JavaModel, JavaEstimator + from pyspark.sql.connect.session import SparkSession + + session = SparkSession.getActiveSession() + assert session is not None + + # Spark Connect ML is built on scala Spark.ML, that means we're only + # supporting JavaModel or JavaEstimator or JavaEvaluator + if isinstance(self._instance, JavaModel): + model = cast("JavaModel", self._instance) + params = serialize_ml_params(model, session.client) + assert isinstance(model._java_obj, str) + writer = pb2.MlCommand.Write( + obj_ref=pb2.ObjectRef(id=model._java_obj), + params=params, + path=path, + should_overwrite=self.shouldOverwrite, + options=self.optionMap, + ) + elif isinstance(self._instance, JavaEstimator): + estimator = cast("JavaEstimator", self._instance) + params = serialize_ml_params(estimator, session.client) + assert isinstance(estimator._java_obj, str) + writer = pb2.MlCommand.Write( + operator=pb2.MlOperator( + name=estimator._java_obj, uid=estimator.uid, type=pb2.MlOperator.ESTIMATOR + ), + params=params, + path=path, + should_overwrite=self.shouldOverwrite, + options=self.optionMap, + ) + else: + raise NotImplementedError(f"Unsupported writing for {self._instance}") + + command = pb2.Command() + 
command.ml_command.write.CopyFrom(writer) + session.client.execute_command(command) + + +class RemoteMLReader(MLReader[RL]): + def __init__(self, clazz: Type["JavaMLReadable[RL]"]) -> None: + super().__init__() + self._clazz = clazz + + def load(self, path: str) -> RL: + from pyspark.sql.connect.session import SparkSession + from pyspark.ml.wrapper import JavaModel, JavaEstimator + + session = SparkSession.getActiveSession() + assert session is not None + # to get the java corresponding qualified class name + java_qualified_class_name = ( + self._clazz.__module__.replace("pyspark", "org.apache.spark") + + "." + + self._clazz.__name__ + ) + + if issubclass(self._clazz, JavaModel): + ml_type = pb2.MlOperator.MODEL + elif issubclass(self._clazz, JavaEstimator): + ml_type = pb2.MlOperator.ESTIMATOR + else: + raise ValueError(f"Unsupported reading for {java_qualified_class_name}") + + command = pb2.Command() + command.ml_command.read.CopyFrom( + pb2.MlCommand.Read( + operator=pb2.MlOperator(name=java_qualified_class_name, type=ml_type), path=path + ) + ) + (_, properties, _) = session.client.execute_command(command) + result = deserialize(properties) + + # Get the python type + def _get_class() -> Type[RL]: + parts = (self._clazz.__module__ + "." 
+ self._clazz.__name__).split(".") + module = ".".join(parts[:-1]) + m = __import__(module, fromlist=[parts[-1]]) + return getattr(m, parts[-1]) + + py_type = _get_class() + # It must be JavaWrapper, since we're passing the string to the _java_obj + if issubclass(py_type, JavaWrapper): + if ml_type == pb2.MlOperator.MODEL: + session.client.add_ml_cache(result.obj_ref.id) + instance = py_type(result.obj_ref.id) + else: + instance = py_type() + instance._resetUid(result.uid) + params = {k: deserialize_param(v) for k, v in result.params.params.items()} + instance._set(**params) + return instance + else: + raise RuntimeError(f"Unsupported class {self._clazz}") diff --git a/python/pyspark/ml/remote/serialize.py b/python/pyspark/ml/remote/serialize.py new file mode 100644 index 0000000000000..69e3af1f4c787 --- /dev/null +++ b/python/pyspark/ml/remote/serialize.py @@ -0,0 +1,132 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +from typing import Any, List, TYPE_CHECKING, Mapping, Dict + +import pyspark.sql.connect.proto as pb2 +from pyspark.ml.linalg import ( + Vectors, + Matrices, + DenseVector, + SparseVector, + DenseMatrix, + SparseMatrix, +) +from pyspark.sql.connect.expressions import LiteralExpression + +if TYPE_CHECKING: + from pyspark.sql.connect.client import SparkConnectClient + from pyspark.ml.param import Params + + +def serialize_param(value: Any, client: "SparkConnectClient") -> pb2.Param: + if isinstance(value, DenseVector): + return pb2.Param(vector=pb2.Vector(dense=pb2.Vector.Dense(value=value.values.tolist()))) + elif isinstance(value, SparseVector): + return pb2.Param( + vector=pb2.Vector( + sparse=pb2.Vector.Sparse( + size=value.size, index=value.indices.tolist(), value=value.values.tolist() + ) + ) + ) + elif isinstance(value, DenseMatrix): + return pb2.Param( + matrix=pb2.Matrix( + dense=pb2.Matrix.Dense( + num_rows=value.numRows, num_cols=value.numCols, value=value.values.tolist() + ) + ) + ) + elif isinstance(value, SparseMatrix): + return pb2.Param( + matrix=pb2.Matrix( + sparse=pb2.Matrix.Sparse( + num_rows=value.numRows, + num_cols=value.numCols, + colptr=value.colPtrs.tolist(), + row_index=value.rowIndices.tolist(), + value=value.values.tolist(), + ) + ) + ) + else: + literal = LiteralExpression._from_value(value).to_plan(client).literal + return pb2.Param(literal=literal) + + +def serialize(client: "SparkConnectClient", *args: Any) -> List[Any]: + from pyspark.sql.connect.dataframe import DataFrame as ConnectDataFrame + + result = [] + for arg in args: + if isinstance(arg, ConnectDataFrame): + result.append(pb2.Fetch.Method.Args(input=arg._plan.plan(client))) + else: + result.append(pb2.Fetch.Method.Args(param=serialize_param(arg, client))) + return result + + +def deserialize_param(param: pb2.Param) -> Any: + if param.HasField("literal"): + return LiteralExpression._to_value(param.literal) + if param.HasField("vector"): + vector = param.vector + if 
vector.HasField("dense"): + return Vectors.dense(vector.dense.value) + elif vector.HasField("sparse"): + return Vectors.sparse(vector.sparse.size, vector.sparse.index, vector.sparse.value) + else: + raise ValueError("Unsupported vector type") + if param.HasField("matrix"): + matrix = param.matrix + if matrix.HasField("dense"): + return DenseMatrix( + matrix.dense.num_rows, + matrix.dense.num_cols, + matrix.dense.value, + matrix.dense.is_transposed, + ) + elif matrix.HasField("sparse"): + return Matrices.sparse( + matrix.sparse.num_rows, + matrix.sparse.num_cols, + matrix.sparse.colptr, + matrix.sparse.row_index, + matrix.sparse.value, + ) + else: + raise ValueError("Unsupported matrix type") + + raise ValueError("Unsupported param type") + + +def deserialize(ml_command_result_properties: Dict[str, Any]) -> Any: + ml_command_result = ml_command_result_properties["ml_command_result"] + if ml_command_result.HasField("operator_info"): + return ml_command_result.operator_info + + if ml_command_result.HasField("param"): + return deserialize_param(ml_command_result.param) + + raise ValueError("Unsupported result type") + + +def serialize_ml_params(instance: "Params", client: "SparkConnectClient") -> pb2.MlParams: + params: Mapping[str, pb2.Param] = { + k.name: serialize_param(v, client) for k, v in instance._paramMap.items() + } + return pb2.MlParams(params=params) diff --git a/python/pyspark/ml/remote/util.py b/python/pyspark/ml/remote/util.py new file mode 100644 index 0000000000000..cb34dae165863 --- /dev/null +++ b/python/pyspark/ml/remote/util.py @@ -0,0 +1,293 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +import functools +import os +from typing import Any, cast, TypeVar, Callable, TYPE_CHECKING, Type, List, Tuple + +import pyspark.sql.connect.proto as pb2 +from pyspark.ml.remote.serialize import serialize_ml_params, serialize, deserialize +from pyspark.sql import is_remote + +if TYPE_CHECKING: + from pyspark.sql.connect.dataframe import DataFrame as ConnectDataFrame + from pyspark.ml.wrapper import JavaWrapper, JavaEstimator + from pyspark.ml.util import JavaMLReadable, JavaMLWritable + +FuncT = TypeVar("FuncT", bound=Callable[..., Any]) + + +def _extract_id_methods(obj_identifier: str) -> Tuple[List[pb2.Fetch.Method], str]: + """Extract the obj reference id and the methods. Eg, model.summary""" + method_chain = obj_identifier.split(".") + obj_ref = method_chain[0] + methods: List[pb2.Fetch.Method] = [] + if len(method_chain) > 1: + methods = [pb2.Fetch.Method(method=m) for m in method_chain[1:]] + return methods, obj_ref + + +def try_remote_intermediate_result(f: FuncT) -> FuncT: + """Mark the function/property that returns the intermediate result of the remote call. + Eg, model.summary""" + + @functools.wraps(f) + def wrapped(self: "JavaWrapper") -> Any: + if is_remote() and "PYSPARK_NO_NAMESPACE_SHARE" not in os.environ: + return f"{self._java_obj}.{f.__name__}" + else: + return f(self) + + return cast(FuncT, wrapped) + + +def try_remote_attribute_relation(f: FuncT) -> FuncT: + """Mark the function/property that returns a Relation. 
+ Eg, model.summary.roc""" + + @functools.wraps(f) + def wrapped(self: "JavaWrapper", *args: Any, **kwargs: Any) -> Any: + if is_remote() and "PYSPARK_NO_NAMESPACE_SHARE" not in os.environ: + # The attribute returns a dataframe, we need to wrap it + # in the AttributeRelation + from pyspark.ml.remote.proto import AttributeRelation + from pyspark.sql.connect.session import SparkSession + from pyspark.sql.connect.dataframe import DataFrame as ConnectDataFrame + + session = SparkSession.getActiveSession() + assert session is not None + + assert isinstance(self._java_obj, str) + + methods, obj_ref = _extract_id_methods(self._java_obj) + methods.append( + pb2.Fetch.Method(method=f.__name__, args=serialize(session.client, *args)) + ) + plan = AttributeRelation(obj_ref, methods) + return ConnectDataFrame(plan, session) + else: + return f(self, *args, **kwargs) + + return cast(FuncT, wrapped) + + +def try_remote_fit(f: FuncT) -> FuncT: + """Mark the function that fits a model.""" + + @functools.wraps(f) + def wrapped(self: "JavaEstimator", dataset: "ConnectDataFrame") -> Any: + if is_remote() and "PYSPARK_NO_NAMESPACE_SHARE" not in os.environ: + client = dataset.sparkSession.client + input = dataset._plan.plan(client) + assert isinstance(self._java_obj, str) + estimator = pb2.MlOperator( + name=self._java_obj, uid=self.uid, type=pb2.MlOperator.ESTIMATOR + ) + command = pb2.Command() + command.ml_command.fit.CopyFrom( + pb2.MlCommand.Fit( + estimator=estimator, + params=serialize_ml_params(self, client), + dataset=input, + ) + ) + (_, properties, _) = client.execute_command(command) + model_info = deserialize(properties) + client.add_ml_cache(model_info.obj_ref.id) + return model_info.obj_ref.id + else: + return f(self, dataset) + + return cast(FuncT, wrapped) + + +def try_remote_transform_relation(f: FuncT) -> FuncT: + """Mark the function/property that returns a relation for model transform.""" + + @functools.wraps(f) + def wrapped(self: "JavaWrapper", dataset: 
"ConnectDataFrame") -> Any: + if is_remote() and "PYSPARK_NO_NAMESPACE_SHARE" not in os.environ: + from pyspark.ml import Model, Transformer + from pyspark.sql.connect.session import SparkSession + from pyspark.sql.connect.dataframe import DataFrame as ConnectDataFrame + + session = SparkSession.getActiveSession() + assert session is not None + # Model is also a Transformer, so we much match Model first + if isinstance(self, Model): + params = serialize_ml_params(self, session.client) + from pyspark.ml.remote.proto import TransformerRelation + + assert isinstance(self._java_obj, str) + return ConnectDataFrame( + TransformerRelation( + child=dataset._plan, name=self._java_obj, ml_params=params, is_model=True + ), + session, + ) + elif isinstance(self, Transformer): + params = serialize_ml_params(self, session.client) + from pyspark.ml.remote.proto import TransformerRelation + + assert isinstance(self._java_obj, str) + return ConnectDataFrame( + TransformerRelation( + child=dataset._plan, + name=self._java_obj, + ml_params=params, + uid=self.uid, + is_model=False, + ), + session, + ) + else: + raise RuntimeError(f"Unsupported {self}") + else: + return f(self, dataset) + + return cast(FuncT, wrapped) + + +def try_remote_call(f: FuncT) -> FuncT: + """Mark the function/property for the remote call. 
+ Eg, model.coefficients""" + + @functools.wraps(f) + def wrapped(self: "JavaWrapper", name: str, *args: Any) -> Any: + if is_remote() and "PYSPARK_NO_NAMESPACE_SHARE" not in os.environ: + """Launch a remote call if possible""" + from pyspark.sql.connect.session import SparkSession + + session = SparkSession.getActiveSession() + assert session is not None + assert isinstance(self._java_obj, str) + methods, obj_ref = _extract_id_methods(self._java_obj) + methods.append(pb2.Fetch.Method(method=name, args=serialize(session.client, *args))) + command = pb2.Command() + command.ml_command.fetch.CopyFrom( + pb2.Fetch(obj_ref=pb2.ObjectRef(id=obj_ref), methods=methods) + ) + (_, properties, _) = session.client.execute_command(command) + ml_command_result = properties["ml_command_result"] + if ml_command_result.HasField("summary"): + summary = ml_command_result.summary + session.client.add_ml_cache(summary) + return summary + else: + return deserialize(properties) + else: + return f(self, name, *args) + + return cast(FuncT, wrapped) + + +def try_remote_del(f: FuncT) -> FuncT: + """Mark the function/property to delete a model on the server side.""" + + @functools.wraps(f) + def wrapped(self: "JavaWrapper") -> Any: + try: + in_remote = is_remote() and "PYSPARK_NO_NAMESPACE_SHARE" not in os.environ + except Exception: + return + + if in_remote: + # Delete the model if possible + model_id = self._java_obj + if model_id is not None and "." not in model_id: + try: + from pyspark.sql.connect.session import SparkSession + + session = SparkSession.getActiveSession() + if session is not None: + session.client.remove_ml_cache(model_id) + return + except Exception: + # SparkSession's down. 
+ return + else: + return f(self) + + return cast(FuncT, wrapped) + + +def try_remote_return_java_class(f: FuncT) -> FuncT: + """Mark the function/property that returns the Java class name when running remotely.""" + + @functools.wraps(f) + def wrapped(java_class: str, *args: Any) -> Any: + if is_remote() and "PYSPARK_NO_NAMESPACE_SHARE" not in os.environ: + return java_class + else: + return f(java_class, *args) + + return cast(FuncT, wrapped) + + +def try_remote_write(f: FuncT) -> FuncT: + """Mark the function that writes an estimator/model or evaluator""" + + @functools.wraps(f) + def wrapped(self: "JavaMLWritable") -> Any: + if is_remote() and "PYSPARK_NO_NAMESPACE_SHARE" not in os.environ: + from pyspark.ml.remote.readwrite import RemoteMLWriter + + return RemoteMLWriter(self) + else: + return f(self) + + return cast(FuncT, wrapped) + + +def try_remote_read(f: FuncT) -> FuncT: + """Mark the function to read an estimator/model or evaluator""" + + @functools.wraps(f) + def wrapped(cls: Type["JavaMLReadable"]) -> Any: + if is_remote() and "PYSPARK_NO_NAMESPACE_SHARE" not in os.environ: + from pyspark.ml.remote.readwrite import RemoteMLReader + + return RemoteMLReader(cls) + else: + return f(cls) + + return cast(FuncT, wrapped) + + +def try_remote_intercept(f: FuncT) -> FuncT: + """Mark the function/property that returns none.""" + + @functools.wraps(f) + def wrapped(java_class: str, *args: Any) -> Any: + if is_remote() and "PYSPARK_NO_NAMESPACE_SHARE" not in os.environ: + return None + else: + return f(java_class, *args) + + return cast(FuncT, wrapped) + + +def try_remote_not_supporting(f: FuncT) -> FuncT: + """Mark the function/property that has not been supported yet""" + + @functools.wraps(f) + def wrapped(*args: Any) -> Any: + if is_remote() and "PYSPARK_NO_NAMESPACE_SHARE" not in os.environ: + raise NotImplementedError("") + else: + return f(*args) + + return cast(FuncT, wrapped) diff --git a/python/pyspark/ml/stat.py b/python/pyspark/ml/stat.py index 4dcc961909520..04b0c7278a717 --- 
a/python/pyspark/ml/stat.py +++ b/python/pyspark/ml/stat.py @@ -107,7 +107,7 @@ def test( sc = SparkContext._active_spark_context assert sc is not None - javaTestObj = _jvm().org.apache.spark.ml.stat.ChiSquareTest + javaTestObj = getattr(_jvm(), "org.apache.spark.ml.stat.ChiSquareTest") args = [_py2java(sc, arg) for arg in (dataset, featuresCol, labelCol, flatten)] return _java2py(sc, javaTestObj.test(*args)) @@ -178,7 +178,7 @@ def corr(dataset: DataFrame, column: str, method: str = "pearson") -> DataFrame: sc = SparkContext._active_spark_context assert sc is not None - javaCorrObj = _jvm().org.apache.spark.ml.stat.Correlation + javaCorrObj = getattr(_jvm(), "org.apache.spark.ml.stat.Correlation") args = [_py2java(sc, arg) for arg in (dataset, column, method)] return _java2py(sc, javaCorrObj.corr(*args)) @@ -248,7 +248,7 @@ def test(dataset: DataFrame, sampleCol: str, distName: str, *params: float) -> D sc = SparkContext._active_spark_context assert sc is not None - javaTestObj = _jvm().org.apache.spark.ml.stat.KolmogorovSmirnovTest + javaTestObj = getattr(_jvm(), "org.apache.spark.ml.stat.KolmogorovSmirnovTest") dataset = _py2java(sc, dataset) params = [float(param) for param in params] # type: ignore[assignment] return _java2py( diff --git a/python/pyspark/ml/tests/connect/test_connect_function.py b/python/pyspark/ml/tests/connect/test_connect_function.py index 393d38fdc426a..7d3a115ab0619 100644 --- a/python/pyspark/ml/tests/connect/test_connect_function.py +++ b/python/pyspark/ml/tests/connect/test_connect_function.py @@ -43,7 +43,7 @@ def setUpClass(cls): # Disable the shared namespace so pyspark.sql.functions, etc point the regular # PySpark libraries. os.environ["PYSPARK_NO_NAMESPACE_SHARE"] = "1" - cls.connect = cls.spark # Switch Spark Connect session and regular PySpark sesion. + cls.connect = cls.spark # Switch Spark Connect session and regular PySpark session. 
cls.spark = PySparkSession._instantiatedSession assert cls.spark is not None diff --git a/python/pyspark/ml/tests/connect/test_connect_spark_ml_classification.py b/python/pyspark/ml/tests/connect/test_connect_spark_ml_classification.py new file mode 100644 index 0000000000000..2000a38d9e616 --- /dev/null +++ b/python/pyspark/ml/tests/connect/test_connect_spark_ml_classification.py @@ -0,0 +1,49 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +import os +import unittest + +from pyspark.ml.tests.test_classification import ClassificationTestsMixin +from pyspark.sql import SparkSession + + +class ClassificationTestsOnConnect(ClassificationTestsMixin, unittest.TestCase): + def setUp(self) -> None: + self.spark = SparkSession.builder.remote( + os.environ.get("SPARK_CONNECT_TESTING_REMOTE", "local[2]") + ).getOrCreate() + + def test_assert_remote_mode(self): + from pyspark.sql import is_remote + + self.assertTrue(is_remote()) + + def tearDown(self) -> None: + self.spark.stop() + + +if __name__ == "__main__": + from pyspark.ml.tests.connect.test_connect_spark_ml_classification import * # noqa: F401 + + try: + import xmlrunner # type: ignore[import] + + testRunner = xmlrunner.XMLTestRunner(output="target/test-reports", verbosity=2) + except ImportError: + testRunner = None + unittest.main(testRunner=testRunner, verbosity=2) diff --git a/python/pyspark/ml/tests/test_algorithms.py b/python/pyspark/ml/tests/test_algorithms.py index eeb342c4238dd..d0e2600a9a8b3 100644 --- a/python/pyspark/ml/tests/test_algorithms.py +++ b/python/pyspark/ml/tests/test_algorithms.py @@ -29,93 +29,13 @@ ) from pyspark.ml.clustering import DistributedLDAModel, KMeans, LocalLDAModel, LDA, LDAModel from pyspark.ml.fpm import FPGrowth -from pyspark.ml.linalg import Matrices, Vectors, DenseVector +from pyspark.ml.linalg import Vectors, DenseVector from pyspark.ml.recommendation import ALS from pyspark.ml.regression import GeneralizedLinearRegression, LinearRegression from pyspark.sql import Row from pyspark.testing.mlutils import SparkSessionTestCase -class LogisticRegressionTest(SparkSessionTestCase): - def test_binomial_logistic_regression_with_bound(self): - df = self.spark.createDataFrame( - [ - (1.0, 1.0, Vectors.dense(0.0, 5.0)), - (0.0, 2.0, Vectors.dense(1.0, 2.0)), - (1.0, 3.0, Vectors.dense(2.0, 1.0)), - (0.0, 4.0, Vectors.dense(3.0, 3.0)), - ], - ["label", "weight", "features"], - ) - - lor = LogisticRegression( - 
regParam=0.01, - weightCol="weight", - lowerBoundsOnCoefficients=Matrices.dense(1, 2, [-1.0, -1.0]), - upperBoundsOnIntercepts=Vectors.dense(0.0), - ) - model = lor.fit(df) - self.assertTrue(np.allclose(model.coefficients.toArray(), [-0.2944, -0.0484], atol=1e-4)) - self.assertTrue(np.isclose(model.intercept, 0.0, atol=1e-4)) - - def test_multinomial_logistic_regression_with_bound(self): - data_path = "data/mllib/sample_multiclass_classification_data.txt" - df = self.spark.read.format("libsvm").load(data_path) - - lor = LogisticRegression( - regParam=0.01, - lowerBoundsOnCoefficients=Matrices.dense(3, 4, range(12)), - upperBoundsOnIntercepts=Vectors.dense(0.0, 0.0, 0.0), - ) - model = lor.fit(df) - expected = [ - [4.593, 4.5516, 9.0099, 12.2904], - [1.0, 8.1093, 7.0, 10.0], - [3.041, 5.0, 8.0, 11.0], - ] - for i in range(0, len(expected)): - self.assertTrue( - np.allclose(model.coefficientMatrix.toArray()[i], expected[i], atol=1e-4) - ) - self.assertTrue( - np.allclose(model.interceptVector.toArray(), [-0.9057, -1.1392, -0.0033], atol=1e-4) - ) - - def test_logistic_regression_with_threshold(self): - df = self.spark.createDataFrame( - [ - (1.0, 1.0, Vectors.dense(0.0, 5.0)), - (0.0, 2.0, Vectors.dense(1.0, 2.0)), - (1.0, 3.0, Vectors.dense(2.0, 1.0)), - (0.0, 4.0, Vectors.dense(3.0, 3.0)), - ], - ["label", "weight", "features"], - ) - - lor = LogisticRegression(weightCol="weight") - model = lor.fit(df) - - # status changes 1 - for t in [0.0, 0.1, 0.2, 0.5, 1.0]: - model.setThreshold(t).transform(df) - - # status changes 2 - [model.setThreshold(t).predict(Vectors.dense(0.0, 5.0)) for t in [0.0, 0.1, 0.2, 0.5, 1.0]] - - self.assertEqual( - [row.prediction for row in model.setThreshold(0.0).transform(df).collect()], - [1.0, 1.0, 1.0, 1.0], - ) - self.assertEqual( - [row.prediction for row in model.setThreshold(0.5).transform(df).collect()], - [0.0, 1.0, 1.0, 0.0], - ) - self.assertEqual( - [row.prediction for row in model.setThreshold(1.0).transform(df).collect()], - 
[0.0, 0.0, 0.0, 0.0], - ) - - class MultilayerPerceptronClassifierTest(SparkSessionTestCase): def test_raw_and_probability_prediction(self): data_path = "data/mllib/sample_multiclass_classification_data.txt" diff --git a/python/pyspark/ml/tests/test_classification.py b/python/pyspark/ml/tests/test_classification.py new file mode 100644 index 0000000000000..ee72e0394e3a0 --- /dev/null +++ b/python/pyspark/ml/tests/test_classification.py @@ -0,0 +1,304 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +import os +import tempfile +import unittest +from shutil import rmtree + +import numpy as np + +from pyspark.ml.linalg import Vectors, Matrices +from pyspark.sql import SparkSession, DataFrame +from pyspark.ml.classification import ( + LogisticRegression, + LogisticRegressionModel, + LogisticRegressionSummary, + BinaryLogisticRegressionSummary, +) + + +class ClassificationTestsMixin: + def test_binomial_logistic_regression_with_bound(self): + df = self.spark.createDataFrame( + [ + (1.0, 1.0, Vectors.dense(0.0, 5.0)), + (0.0, 2.0, Vectors.dense(1.0, 2.0)), + (1.0, 3.0, Vectors.dense(2.0, 1.0)), + (0.0, 4.0, Vectors.dense(3.0, 3.0)), + ], + ["label", "weight", "features"], + ) + + lor = LogisticRegression( + regParam=0.01, + weightCol="weight", + lowerBoundsOnCoefficients=Matrices.dense(1, 2, [-1.0, -1.0]), + upperBoundsOnIntercepts=Vectors.dense(0.0), + ) + lor_model = lor.fit(df) + + def check_result(model: LogisticRegressionModel) -> None: + self.assertTrue( + np.allclose(model.coefficients.toArray(), [-0.2944, -0.0484], atol=1e-4) + ) + self.assertTrue(np.isclose(model.intercept, 0.0, atol=1e-4)) + + check_result(lor_model) + + # Model save + with tempfile.TemporaryDirectory(prefix="model_save") as tmp_dir: + local_path = os.path.join(tmp_dir, "model") + lor_model.write().save(local_path) + loaded_model = LogisticRegressionModel.load(local_path) + check_result(loaded_model) + + def test_multinomial_logistic_regression_with_bound(self): + data_path = "data/mllib/sample_multiclass_classification_data.txt" + df = self.spark.read.format("libsvm").load(data_path) + + lor = LogisticRegression( + regParam=0.01, + lowerBoundsOnCoefficients=Matrices.dense(3, 4, range(12)), + upperBoundsOnIntercepts=Vectors.dense(0.0, 0.0, 0.0), + ) + lor_model = lor.fit(df) + + def check_result(model: LogisticRegressionModel) -> None: + expected = [ + [4.593, 4.5516, 9.0099, 12.2904], + [1.0, 8.1093, 7.0, 10.0], + [3.041, 5.0, 8.0, 11.0], + ] + for i in range(0, len(expected)): + 
self.assertTrue( + np.allclose(model.coefficientMatrix.toArray()[i], expected[i], atol=1e-4) + ) + self.assertTrue( + np.allclose(model.interceptVector.toArray(), [-0.9057, -1.1392, -0.0033], atol=1e-4) + ) + + check_result(lor_model) + + # Model save + with tempfile.TemporaryDirectory(prefix="model_save") as tmp_dir: + local_path = os.path.join(tmp_dir, "model") + lor_model.write().save(local_path) + loaded_model = LogisticRegressionModel.load(local_path) + check_result(loaded_model) + + def test_logistic_regression_with_threshold(self): + df = self.spark.createDataFrame( + [ + (1.0, 1.0, Vectors.dense(0.0, 5.0)), + (0.0, 2.0, Vectors.dense(1.0, 2.0)), + (1.0, 3.0, Vectors.dense(2.0, 1.0)), + (0.0, 4.0, Vectors.dense(3.0, 3.0)), + ], + ["label", "weight", "features"], + ) + + lor = LogisticRegression(weightCol="weight") + model = lor.fit(df) + + # status changes 1 + for t in [0.0, 0.1, 0.2, 0.5, 1.0]: + model.setThreshold(t).transform(df) + + # status changes 2 + [model.setThreshold(t).predict(Vectors.dense(0.0, 5.0)) for t in [0.0, 0.1, 0.2, 0.5, 1.0]] + + self.assertEqual( + [row.prediction for row in model.setThreshold(0.0).transform(df).collect()], + [1.0, 1.0, 1.0, 1.0], + ) + self.assertEqual( + [row.prediction for row in model.setThreshold(0.5).transform(df).collect()], + [0.0, 1.0, 1.0, 0.0], + ) + self.assertEqual( + [row.prediction for row in model.setThreshold(1.0).transform(df).collect()], + [0.0, 0.0, 0.0, 0.0], + ) + + def test_binary_logistic_regression_summary(self): + df = self.spark.createDataFrame( + [(1.0, 2.0, Vectors.dense(1.0)), (0.0, 2.0, Vectors.sparse(1, [], []))], + ["label", "weight", "features"], + ) + lr = LogisticRegression(maxIter=5, regParam=0.01, weightCol="weight", fitIntercept=False) + model = lr.fit(df) + self.assertTrue(model.hasSummary) + s = model.summary + # test that api is callable and returns expected types + self.assertTrue(isinstance(s.predictions, DataFrame)) + self.assertEqual(s.probabilityCol, "probability") + 
self.assertEqual(s.labelCol, "label") + self.assertEqual(s.featuresCol, "features") + self.assertEqual(s.predictionCol, "prediction") + objHist = s.objectiveHistory + self.assertTrue(isinstance(objHist, list) and isinstance(objHist[0], float)) + self.assertGreater(s.totalIterations, 0) + self.assertTrue(isinstance(s.labels, list)) + self.assertTrue(isinstance(s.truePositiveRateByLabel, list)) + self.assertTrue(isinstance(s.falsePositiveRateByLabel, list)) + self.assertTrue(isinstance(s.precisionByLabel, list)) + self.assertTrue(isinstance(s.recallByLabel, list)) + self.assertTrue(isinstance(s.fMeasureByLabel(), list)) + self.assertTrue(isinstance(s.fMeasureByLabel(1.0), list)) + self.assertTrue(isinstance(s.roc, DataFrame)) + self.assertAlmostEqual(s.areaUnderROC, 1.0, 2) + self.assertTrue(isinstance(s.pr, DataFrame)) + self.assertTrue(isinstance(s.fMeasureByThreshold, DataFrame)) + self.assertTrue(isinstance(s.precisionByThreshold, DataFrame)) + self.assertTrue(isinstance(s.recallByThreshold, DataFrame)) + self.assertAlmostEqual(s.accuracy, 1.0, 2) + self.assertAlmostEqual(s.weightedTruePositiveRate, 1.0, 2) + self.assertAlmostEqual(s.weightedFalsePositiveRate, 0.0, 2) + self.assertAlmostEqual(s.weightedRecall, 1.0, 2) + self.assertAlmostEqual(s.weightedPrecision, 1.0, 2) + self.assertAlmostEqual(s.weightedFMeasure(), 1.0, 2) + self.assertAlmostEqual(s.weightedFMeasure(1.0), 1.0, 2) + + # test evaluation (with training dataset) produces a summary with same values + # one check is enough to verify a summary is returned, Scala version runs full test + sameSummary = model.evaluate(df) + self.assertTrue(isinstance(sameSummary, BinaryLogisticRegressionSummary)) + self.assertAlmostEqual(sameSummary.areaUnderROC, s.areaUnderROC) + self.assertEqual(sorted(sameSummary.predictions.collect()), sorted(s.predictions.collect())) + + def test_multiclass_logistic_regression_summary(self): + df = self.spark.createDataFrame( + [ + (1.0, 2.0, Vectors.dense(1.0)), + (0.0, 2.0, 
Vectors.sparse(1, [], [])), + (2.0, 2.0, Vectors.dense(2.0)), + (2.0, 2.0, Vectors.dense(1.9)), + ], + ["label", "weight", "features"], + ) + lr = LogisticRegression(maxIter=5, regParam=0.01, weightCol="weight", fitIntercept=False) + model = lr.fit(df) + self.assertTrue(model.hasSummary) + s = model.summary + # test that api is callable and returns expected types + self.assertTrue(isinstance(s.predictions, DataFrame)) + self.assertEqual(s.probabilityCol, "probability") + self.assertEqual(s.labelCol, "label") + self.assertEqual(s.featuresCol, "features") + self.assertEqual(s.predictionCol, "prediction") + objHist = s.objectiveHistory + self.assertTrue(isinstance(objHist, list) and isinstance(objHist[0], float)) + self.assertGreater(s.totalIterations, 0) + self.assertTrue(isinstance(s.labels, list)) + self.assertTrue(isinstance(s.truePositiveRateByLabel, list)) + self.assertTrue(isinstance(s.falsePositiveRateByLabel, list)) + self.assertTrue(isinstance(s.precisionByLabel, list)) + self.assertTrue(isinstance(s.recallByLabel, list)) + self.assertTrue(isinstance(s.fMeasureByLabel(), list)) + self.assertTrue(isinstance(s.fMeasureByLabel(1.0), list)) + self.assertAlmostEqual(s.accuracy, 0.75, 2) + self.assertAlmostEqual(s.weightedTruePositiveRate, 0.75, 2) + self.assertAlmostEqual(s.weightedFalsePositiveRate, 0.25, 2) + self.assertAlmostEqual(s.weightedRecall, 0.75, 2) + self.assertAlmostEqual(s.weightedPrecision, 0.583, 2) + self.assertAlmostEqual(s.weightedFMeasure(), 0.65, 2) + self.assertAlmostEqual(s.weightedFMeasure(1.0), 0.65, 2) + + # test evaluation (with training dataset) produces a summary with same values + # one check is enough to verify a summary is returned, Scala version runs full test + sameSummary = model.evaluate(df) + self.assertTrue(isinstance(sameSummary, LogisticRegressionSummary)) + self.assertFalse(isinstance(sameSummary, BinaryLogisticRegressionSummary)) + self.assertAlmostEqual(sameSummary.accuracy, s.accuracy) + + # We can't use 
sorted(s.predictions.collect()), since the DenseVector doesn't support "<" + self.assertEqual( + sameSummary.predictions.coalesce(1).sort("label", "weight", "prediction").collect(), + s.predictions.coalesce(1).sort("label", "weight", "prediction").collect(), + ) + + def test_logistic_regression(self): + # test sparse/dense vector and matrix + lower_intercepts = Vectors.dense([1, 2, 3, 4]) + upper_intercepts = Vectors.sparse(4, [(1, 1.0), (3, 5.5)]) + lower_coefficients = Matrices.dense(3, 2, [0, 1, 4, 5, 9, 10]) + upper_coefficients = Matrices.sparse(1, 1, [0, 1], [0], [2.0]) + + lr = LogisticRegression( + maxIter=1, + lowerBoundsOnIntercepts=lower_intercepts, + upperBoundsOnIntercepts=upper_intercepts, + lowerBoundsOnCoefficients=lower_coefficients, + upperBoundsOnCoefficients=upper_coefficients, + ) + path = tempfile.mkdtemp() + lr_path = path + "/logreg" + lr.save(lr_path) + lr2 = LogisticRegression.load(lr_path) + self.assertEqual( + lr2.uid, + lr2.maxIter.parent, + "Loaded LogisticRegression instance uid (%s) " + "did not match Param's uid (%s)" % (lr2.uid, lr2.maxIter.parent), + ) + self.assertEqual( + lr._defaultParamMap[lr.maxIter], + lr2._defaultParamMap[lr2.maxIter], + "Loaded LogisticRegression instance default params did not match " + + "original defaults", + ) + self.assertEqual( + lr.getLowerBoundsOnIntercepts(), + lr2.getLowerBoundsOnIntercepts(), + ) + self.assertEqual( + lr.getUpperBoundsOnIntercepts(), + lr2.getUpperBoundsOnIntercepts(), + ) + self.assertEqual( + lr.getLowerBoundsOnCoefficients(), + lr2.getLowerBoundsOnCoefficients(), + ) + self.assertEqual( + lr.getUpperBoundsOnCoefficients(), + lr2.getUpperBoundsOnCoefficients(), + ) + try: + rmtree(path) + except OSError: + pass + + +class ClassificationTests(ClassificationTestsMixin, unittest.TestCase): + def setUp(self) -> None: + self.spark = SparkSession.builder.master("local[4]").getOrCreate() + + def tearDown(self) -> None: + self.spark.stop() + + +if __name__ == "__main__": + from 
pyspark.ml.tests.test_classification import * # noqa: F401,F403 + + try: + import xmlrunner # type: ignore[import] + + testRunner = xmlrunner.XMLTestRunner(output="target/test-reports", verbosity=2) + except ImportError: + testRunner = None + unittest.main(testRunner=testRunner, verbosity=2) diff --git a/python/pyspark/ml/tests/test_dl_util.py b/python/pyspark/ml/tests/test_dl_util.py index e5e2c6bc191d8..c130cf1ff6b9d 100644 --- a/python/pyspark/ml/tests/test_dl_util.py +++ b/python/pyspark/ml/tests/test_dl_util.py @@ -137,7 +137,7 @@ def _are_two_files_identical(self, fpath1: str, fpath2: str) -> bool: "", ), ( - "Check if it creates the correct file with only suffix + boddy", + "Check if it creates the correct file with only suffix + body", "", "print('goodbye')", ), diff --git a/python/pyspark/ml/tests/test_functions.py b/python/pyspark/ml/tests/test_functions.py index e67e46ded67bd..7719b2b27e0ab 100644 --- a/python/pyspark/ml/tests/test_functions.py +++ b/python/pyspark/ml/tests/test_functions.py @@ -265,14 +265,14 @@ def predict(a, b, c): with self.assertRaisesRegex(Exception, "Model expected 3 inputs, but received 4 columns"): preds = self.df.withColumn("preds", sum_cols(*columns)).toPandas() - # muliple scalar columns with one tensor_input_shape => single numpy array + # multiple scalar columns with one tensor_input_shape => single numpy array sum_cols = predict_batch_udf( array_sum_fn, return_type=DoubleType(), batch_size=5, input_tensor_shapes=[[4]] ) preds = self.df.withColumn("preds", sum_cols(struct(*columns))).toPandas() self.assertTrue(np.array_equal(np.sum(self.data, axis=1), preds["preds"].to_numpy())) - # muliple scalar columns with wrong tensor_input_shape => ERROR + # multiple scalar columns with wrong tensor_input_shape => ERROR sum_cols = predict_batch_udf( array_sum_fn, return_type=DoubleType(), batch_size=5, input_tensor_shapes=[[3]] ) diff --git a/python/pyspark/ml/tests/test_param.py b/python/pyspark/ml/tests/test_param.py index 
8df50a5963e6b..0aa9827124954 100644 --- a/python/pyspark/ml/tests/test_param.py +++ b/python/pyspark/ml/tests/test_param.py @@ -368,12 +368,12 @@ def test_default_params_transferred(self): self.assertFalse(binarizer.isSet(binarizer.outputCol)) self.assertEqual(result[0][0], 1.0) - def test_lr_evaluate_invaild_type(self): + def test_lr_evaluate_invalid_type(self): lr = LinearRegressionModel() invalid_type = "" self.assertRaises(TypeError, lr.evaluate, invalid_type) - def test_glr_evaluate_invaild_type(self): + def test_glr_evaluate_invalid_type(self): glr = GeneralizedLinearRegressionModel() invalid_type = "" self.assertRaises(TypeError, glr.evaluate, invalid_type) diff --git a/python/pyspark/ml/tests/test_persistence.py b/python/pyspark/ml/tests/test_persistence.py index 406180d9a6391..481c2f236d46f 100644 --- a/python/pyspark/ml/tests/test_persistence.py +++ b/python/pyspark/ml/tests/test_persistence.py @@ -153,29 +153,6 @@ def test_linear_regression_pmml_basic(self): self.assertIn("Apache Spark", pmml_text) self.assertIn("PMML", pmml_text) - def test_logistic_regression(self): - lr = LogisticRegression(maxIter=1) - path = tempfile.mkdtemp() - lr_path = path + "/logreg" - lr.save(lr_path) - lr2 = LogisticRegression.load(lr_path) - self.assertEqual( - lr2.uid, - lr2.maxIter.parent, - "Loaded LogisticRegression instance uid (%s) " - "did not match Param's uid (%s)" % (lr2.uid, lr2.maxIter.parent), - ) - self.assertEqual( - lr._defaultParamMap[lr.maxIter], - lr2._defaultParamMap[lr2.maxIter], - "Loaded LogisticRegression instance default params did not match " - + "original defaults", - ) - try: - rmtree(path) - except OSError: - pass - def test_kmeans(self): kmeans = KMeans(k=2, seed=1) path = tempfile.mkdtemp() diff --git a/python/pyspark/ml/tests/test_training_summary.py b/python/pyspark/ml/tests/test_training_summary.py index 5704d7186734f..e1c8f4197e3c7 100644 --- a/python/pyspark/ml/tests/test_training_summary.py +++ 
b/python/pyspark/ml/tests/test_training_summary.py @@ -18,14 +18,11 @@ import unittest from pyspark.ml.classification import ( - BinaryLogisticRegressionSummary, BinaryRandomForestClassificationSummary, FMClassifier, FMClassificationSummary, LinearSVC, LinearSVCSummary, - LogisticRegression, - LogisticRegressionSummary, MultilayerPerceptronClassifier, MultilayerPerceptronClassificationSummary, RandomForestClassificationSummary, @@ -122,94 +119,6 @@ def test_glr_summary(self): sameSummary = model.evaluate(df) self.assertAlmostEqual(sameSummary.deviance, s.deviance) - def test_binary_logistic_regression_summary(self): - df = self.spark.createDataFrame( - [(1.0, 2.0, Vectors.dense(1.0)), (0.0, 2.0, Vectors.sparse(1, [], []))], - ["label", "weight", "features"], - ) - lr = LogisticRegression(maxIter=5, regParam=0.01, weightCol="weight", fitIntercept=False) - model = lr.fit(df) - self.assertTrue(model.hasSummary) - s = model.summary - # test that api is callable and returns expected types - self.assertTrue(isinstance(s.predictions, DataFrame)) - self.assertEqual(s.probabilityCol, "probability") - self.assertEqual(s.labelCol, "label") - self.assertEqual(s.featuresCol, "features") - self.assertEqual(s.predictionCol, "prediction") - objHist = s.objectiveHistory - self.assertTrue(isinstance(objHist, list) and isinstance(objHist[0], float)) - self.assertGreater(s.totalIterations, 0) - self.assertTrue(isinstance(s.labels, list)) - self.assertTrue(isinstance(s.truePositiveRateByLabel, list)) - self.assertTrue(isinstance(s.falsePositiveRateByLabel, list)) - self.assertTrue(isinstance(s.precisionByLabel, list)) - self.assertTrue(isinstance(s.recallByLabel, list)) - self.assertTrue(isinstance(s.fMeasureByLabel(), list)) - self.assertTrue(isinstance(s.fMeasureByLabel(1.0), list)) - self.assertTrue(isinstance(s.roc, DataFrame)) - self.assertAlmostEqual(s.areaUnderROC, 1.0, 2) - self.assertTrue(isinstance(s.pr, DataFrame)) - self.assertTrue(isinstance(s.fMeasureByThreshold, 
DataFrame)) - self.assertTrue(isinstance(s.precisionByThreshold, DataFrame)) - self.assertTrue(isinstance(s.recallByThreshold, DataFrame)) - self.assertAlmostEqual(s.accuracy, 1.0, 2) - self.assertAlmostEqual(s.weightedTruePositiveRate, 1.0, 2) - self.assertAlmostEqual(s.weightedFalsePositiveRate, 0.0, 2) - self.assertAlmostEqual(s.weightedRecall, 1.0, 2) - self.assertAlmostEqual(s.weightedPrecision, 1.0, 2) - self.assertAlmostEqual(s.weightedFMeasure(), 1.0, 2) - self.assertAlmostEqual(s.weightedFMeasure(1.0), 1.0, 2) - # test evaluation (with training dataset) produces a summary with same values - # one check is enough to verify a summary is returned, Scala version runs full test - sameSummary = model.evaluate(df) - self.assertTrue(isinstance(sameSummary, BinaryLogisticRegressionSummary)) - self.assertAlmostEqual(sameSummary.areaUnderROC, s.areaUnderROC) - - def test_multiclass_logistic_regression_summary(self): - df = self.spark.createDataFrame( - [ - (1.0, 2.0, Vectors.dense(1.0)), - (0.0, 2.0, Vectors.sparse(1, [], [])), - (2.0, 2.0, Vectors.dense(2.0)), - (2.0, 2.0, Vectors.dense(1.9)), - ], - ["label", "weight", "features"], - ) - lr = LogisticRegression(maxIter=5, regParam=0.01, weightCol="weight", fitIntercept=False) - model = lr.fit(df) - self.assertTrue(model.hasSummary) - s = model.summary - # test that api is callable and returns expected types - self.assertTrue(isinstance(s.predictions, DataFrame)) - self.assertEqual(s.probabilityCol, "probability") - self.assertEqual(s.labelCol, "label") - self.assertEqual(s.featuresCol, "features") - self.assertEqual(s.predictionCol, "prediction") - objHist = s.objectiveHistory - self.assertTrue(isinstance(objHist, list) and isinstance(objHist[0], float)) - self.assertGreater(s.totalIterations, 0) - self.assertTrue(isinstance(s.labels, list)) - self.assertTrue(isinstance(s.truePositiveRateByLabel, list)) - self.assertTrue(isinstance(s.falsePositiveRateByLabel, list)) - self.assertTrue(isinstance(s.precisionByLabel, 
list)) - self.assertTrue(isinstance(s.recallByLabel, list)) - self.assertTrue(isinstance(s.fMeasureByLabel(), list)) - self.assertTrue(isinstance(s.fMeasureByLabel(1.0), list)) - self.assertAlmostEqual(s.accuracy, 0.75, 2) - self.assertAlmostEqual(s.weightedTruePositiveRate, 0.75, 2) - self.assertAlmostEqual(s.weightedFalsePositiveRate, 0.25, 2) - self.assertAlmostEqual(s.weightedRecall, 0.75, 2) - self.assertAlmostEqual(s.weightedPrecision, 0.583, 2) - self.assertAlmostEqual(s.weightedFMeasure(), 0.65, 2) - self.assertAlmostEqual(s.weightedFMeasure(1.0), 0.65, 2) - # test evaluation (with training dataset) produces a summary with same values - # one check is enough to verify a summary is returned, Scala version runs full test - sameSummary = model.evaluate(df) - self.assertTrue(isinstance(sameSummary, LogisticRegressionSummary)) - self.assertFalse(isinstance(sameSummary, BinaryLogisticRegressionSummary)) - self.assertAlmostEqual(sameSummary.accuracy, s.accuracy) - def test_linear_svc_summary(self): df = self.spark.createDataFrame( [(1.0, 2.0, Vectors.dense(1.0, 1.0, 1.0)), (0.0, 2.0, Vectors.dense(1.0, 2.0, 3.0))], diff --git a/python/pyspark/ml/torch/distributor.py b/python/pyspark/ml/torch/distributor.py index 62a71c5a96af4..ef86f38b716b7 100644 --- a/python/pyspark/ml/torch/distributor.py +++ b/python/pyspark/ml/torch/distributor.py @@ -232,10 +232,10 @@ def _get_num_tasks(self) -> int: def _validate_input_params(self) -> None: if self.num_processes <= 0: - raise ValueError("num_proccesses has to be a positive integer") + raise ValueError("num_processes has to be a positive integer") def _check_encryption(self) -> None: - """Checks to see if the user requires encrpytion of data. + """Checks to see if the user requires encryption of data. If required, throw an exception since we don't support that. 
Raises diff --git a/python/pyspark/ml/tuning.py b/python/pyspark/ml/tuning.py index 888beff663523..695bbf98517c3 100644 --- a/python/pyspark/ml/tuning.py +++ b/python/pyspark/ml/tuning.py @@ -278,7 +278,7 @@ def _to_java_impl(self) -> Tuple["JavaObject", "JavaObject", "JavaObject"]: gateway = SparkContext._gateway assert gateway is not None and SparkContext._jvm is not None - cls = SparkContext._jvm.org.apache.spark.ml.param.ParamMap + cls = getattr(SparkContext._jvm, "org.apache.spark.ml.param.ParamMap") estimator = self.getEstimator() if isinstance(estimator, JavaEstimator): @@ -313,7 +313,7 @@ def meta_estimator_transfer_param_maps_to_java( sc is not None and SparkContext._jvm is not None and SparkContext._gateway is not None ) - paramMapCls = SparkContext._jvm.org.apache.spark.ml.param.ParamMap + paramMapCls = getattr(SparkContext._jvm, "org.apache.spark.ml.param.ParamMap") javaParamMaps = SparkContext._gateway.new_array(paramMapCls, len(pyParamMaps)) for idx, pyParamMap in enumerate(pyParamMaps): diff --git a/python/pyspark/ml/util.py b/python/pyspark/ml/util.py index 9bbd64d2aef5a..3fe97f44619c0 100644 --- a/python/pyspark/ml/util.py +++ b/python/pyspark/ml/util.py @@ -37,6 +37,7 @@ from pyspark import since from pyspark.ml.common import inherit_doc +from pyspark.ml.remote.util import try_remote_intermediate_result, try_remote_write, try_remote_read from pyspark.sql import SparkSession from pyspark.sql.utils import is_remote from pyspark.util import VersionUtils @@ -270,6 +271,7 @@ class JavaMLWritable(MLWritable): (Private) Mixin for ML instances that provide :py:class:`JavaMLWriter`. 
""" + @try_remote_write def write(self) -> JavaMLWriter: """Returns an MLWriter instance for this ML instance.""" return JavaMLWriter(self) @@ -378,6 +380,7 @@ class JavaMLReadable(MLReadable[RL]): """ @classmethod + @try_remote_read def read(cls) -> JavaMLReader[RL]: """Returns an MLReader instance for this class.""" return JavaMLReader(cls) @@ -680,6 +683,7 @@ def hasSummary(self) -> bool: @property @since("2.1.0") + @try_remote_intermediate_result def summary(self) -> T: """ Gets summary of the model trained on the training set. An exception is thrown if diff --git a/python/pyspark/ml/wrapper.py b/python/pyspark/ml/wrapper.py index eed7781dc71e3..e2bf25386c77a 100644 --- a/python/pyspark/ml/wrapper.py +++ b/python/pyspark/ml/wrapper.py @@ -19,7 +19,15 @@ from typing import Any, Generic, Optional, List, Type, TypeVar, TYPE_CHECKING from pyspark import since -from pyspark.sql import DataFrame +from pyspark.ml.remote.util import ( + try_remote_transform_relation, + try_remote_call, + try_remote_fit, + try_remote_del, + try_remote_return_java_class, + try_remote_intercept, +) +from pyspark.sql import DataFrame, is_remote from pyspark.ml import Estimator, Predictor, PredictionModel, Transformer, Model from pyspark.ml.base import _PredictorParams from pyspark.ml.param import Param, Params @@ -47,6 +55,7 @@ def __init__(self, java_obj: Optional["JavaObject"] = None): super(JavaWrapper, self).__init__() self._java_obj = java_obj + @try_remote_del def __del__(self) -> None: from pyspark.core.context import SparkContext @@ -63,6 +72,7 @@ def _create_from_java_class(cls: Type[JW], java_class: str, *args: Any) -> JW: java_obj = JavaWrapper._new_java_obj(java_class, *args) return cls(java_obj) + @try_remote_call def _call_java(self, name: str, *args: Any) -> Any: from pyspark.core.context import SparkContext @@ -74,6 +84,7 @@ def _call_java(self, name: str, *args: Any) -> Any: return _java2py(sc, m(*java_args)) @staticmethod + @try_remote_return_java_class def 
_new_java_obj(java_class: str, *args: Any) -> "JavaObject": """ Returns a new Java object. @@ -347,6 +358,7 @@ def copy(self: "JP", extra: Optional["ParamMap"] = None) -> "JP": that._transfer_params_to_java() return that + @try_remote_intercept def clear(self, param: Param) -> None: """ Clears a param from the param map if it has been explicitly set. @@ -372,6 +384,7 @@ def _create_model(self, java_model: "JavaObject") -> JM: """ raise NotImplementedError() + @try_remote_fit def _fit_java(self, dataset: DataFrame) -> "JavaObject": """ Fits a Java model to the input dataset. @@ -405,6 +418,7 @@ class JavaTransformer(JavaParams, Transformer, metaclass=ABCMeta): available as _java_obj. """ + @try_remote_transform_relation def _transform(self, dataset: DataFrame) -> DataFrame: assert self._java_obj is not None @@ -435,7 +449,7 @@ def __init__(self, java_model: Optional["JavaObject"] = None): other ML classes). """ super(JavaModel, self).__init__(java_model) - if java_model is not None: + if java_model is not None and not is_remote(): # SPARK-10931: This is a temporary fix to allow models to own params # from estimators. Eventually, these params should be in models through # using common base classes between estimators and models. diff --git a/python/pyspark/pandas/accessors.py b/python/pyspark/pandas/accessors.py index 4c36f7976af83..77757e4b60873 100644 --- a/python/pyspark/pandas/accessors.py +++ b/python/pyspark/pandas/accessors.py @@ -936,7 +936,7 @@ def _transform_batch( def pandas_concat(*series: pd.Series) -> pd.DataFrame: # The input can only be a DataFrame for struct from Spark 3.0. - # This works around makeing the input as a frame. See SPARK-27240 + # This works around making the input as a frame. 
See SPARK-27240 pdf = pd.concat(series, axis=1) pdf.columns = columns return pdf diff --git a/python/pyspark/pandas/base.py b/python/pyspark/pandas/base.py index bc54d8b9b17cb..01e23214d662d 100644 --- a/python/pyspark/pandas/base.py +++ b/python/pyspark/pandas/base.py @@ -1123,7 +1123,7 @@ def shift( Shift Series/Index by desired number of periods. .. note:: the current implementation of shift uses Spark's Window without - specifying partition specification. This leads to moveing all data into + specifying partition specification. This leads to moving all data into a single partition in a single machine and could cause serious performance degradation. Avoid this method with very large datasets. diff --git a/python/pyspark/pandas/frame.py b/python/pyspark/pandas/frame.py index 49aa49f65e35b..86820573344ea 100644 --- a/python/pyspark/pandas/frame.py +++ b/python/pyspark/pandas/frame.py @@ -2632,7 +2632,7 @@ def to_latex( ... 'mask': ['red', 'purple'], ... 'weapon': ['sai', 'bo staff']}, ... 
columns=['name', 'mask', 'weapon']) - >>> print(df.to_latex(index=False)) # doctest: +NORMALIZE_WHITESPACE + >>> print(df.to_latex(index=False)) # doctest: +SKIP \begin{tabular}{lll} \toprule name & mask & weapon \\ @@ -7292,8 +7292,6 @@ def select_dtypes( 4 1 True 1.0 5 2 False 2.0 """ - from pyspark.sql.types import _parse_datatype_string - include_list: List[str] if not is_list_like(include): include_list = [cast(str, include)] if include is not None else [] @@ -7320,14 +7318,14 @@ def select_dtypes( include_spark_type = [] for inc in include_list: try: - include_spark_type.append(_parse_datatype_string(inc)) + include_spark_type.append(self._internal.spark_frame._session._parse_ddl(inc)) except BaseException: pass exclude_spark_type = [] for exc in exclude_list: try: - exclude_spark_type.append(_parse_datatype_string(exc)) + exclude_spark_type.append(self._internal.spark_frame._session._parse_ddl(exc)) except BaseException: pass @@ -7686,7 +7684,7 @@ def _sort( if na_position not in ("first", "last"): raise ValueError("invalid na_position: '{}'".format(na_position)) - # Mapper: Get a spark colum + # Mapper: Get a spark column # n function for (ascending, na_position) combination mapper = { (True, "first"): PySparkColumn.asc_nulls_first, @@ -9808,7 +9806,7 @@ def describe(self, percentiles: Optional[List[float]] = None) -> "DataFrame": if is_all_string_type: # Handling string type columns - # We will retrive the `count`, `unique`, `top` and `freq`. + # We will retrieve the `count`, `unique`, `top` and `freq`. 
internal = self._internal.resolved_copy exprs_string = [ internal.spark_column_for(psser._column_label) for psser in psser_string diff --git a/python/pyspark/pandas/namespace.py b/python/pyspark/pandas/namespace.py index c77cdf51a2f6d..d31bc1f48d112 100644 --- a/python/pyspark/pandas/namespace.py +++ b/python/pyspark/pandas/namespace.py @@ -138,14 +138,44 @@ def from_pandas(pobj: Union[pd.DataFrame, pd.Series, pd.Index]) -> Union[Series, Parameters ---------- - pobj : pandas.DataFrame or pandas.Series - pandas DataFrame or Series to read. + pobj : pandas.DataFrame, pandas.Series or pandas.Index + pandas DataFrame, Series or Index to read. Returns ------- - Series or DataFrame - If a pandas Series is passed in, this function returns a pandas-on-Spark Series. + DataFrame, Series or Index If a pandas DataFrame is passed in, this function returns a pandas-on-Spark DataFrame. + If a pandas Series is passed in, this function returns a pandas-on-Spark Series. + If a pandas Index is passed in, this function returns a pandas-on-Spark Index. 
+ + Examples + -------- + >>> import pandas as pd + >>> import pyspark.pandas as ps + + Convert a pandas DataFrame: + >>> pdf = pd.DataFrame({'a': [1, 2, 3]}) + >>> psdf = ps.from_pandas(pdf) + >>> psdf + a + 0 1 + 1 2 + 2 3 + + Convert a pandas Series: + >>> pser = pd.Series([1, 2, 3]) + >>> psser = ps.from_pandas(pser) + >>> psser + 0 1 + 1 2 + 2 3 + dtype: int64 + + Convert a pandas Index: + >>> pidx = pd.Index([1, 2, 3]) + >>> psidx = ps.from_pandas(pidx) + >>> psidx + Index([1, 2, 3], dtype='int64') """ if isinstance(pobj, pd.Series): return Series(pobj) diff --git a/python/pyspark/pandas/tests/io/test_dataframe_conversion.py b/python/pyspark/pandas/tests/io/test_dataframe_conversion.py index d4b03a855d382..7a4c635ee2941 100644 --- a/python/pyspark/pandas/tests/io/test_dataframe_conversion.py +++ b/python/pyspark/pandas/tests/io/test_dataframe_conversion.py @@ -26,6 +26,12 @@ from pyspark import pandas as ps from pyspark.testing.pandasutils import PandasOnSparkTestCase, TestUtils from pyspark.testing.sqlutils import SQLTestUtils +from pyspark.testing.utils import ( + have_openpyxl, + openpyxl_requirement_message, + have_jinja2, + jinja2_requirement_message, +) class DataFrameConversionMixin: @@ -86,6 +92,7 @@ def get_excel_dfs(pandas_on_spark_location, pandas_location): "expected": pd.read_excel(pandas_location, index_col=0), } + @unittest.skipIf(not have_openpyxl, openpyxl_requirement_message) def test_to_excel(self): with self.temp_dir() as dirpath: pandas_location = dirpath + "/" + "output1.xlsx" @@ -199,6 +206,7 @@ def test_to_clipboard(self): psdf.to_clipboard(sep=";", index=False), pdf.to_clipboard(sep=";", index=False) ) + @unittest.skipIf(not have_jinja2, jinja2_requirement_message) def test_to_latex(self): pdf = self.pdf psdf = self.psdf diff --git a/python/pyspark/pandas/tests/io/test_io.py b/python/pyspark/pandas/tests/io/test_io.py index 6fbdc366dd76a..da5817b86b984 100644 --- a/python/pyspark/pandas/tests/io/test_io.py +++ 
b/python/pyspark/pandas/tests/io/test_io.py @@ -24,7 +24,12 @@ from pyspark import pandas as ps from pyspark.testing.pandasutils import PandasOnSparkTestCase from pyspark.testing.sqlutils import SQLTestUtils -from pyspark.testing.utils import have_tabulate, tabulate_requirement_message +from pyspark.testing.utils import ( + have_jinja2, + jinja2_requirement_message, + have_tabulate, + tabulate_requirement_message, +) # This file contains test cases for 'Serialization / IO / Conversion' @@ -91,6 +96,7 @@ def test_from_dict(self): psdf = ps.DataFrame.from_dict(data, orient="index", columns=["A", "B", "C", "D"]) self.assert_eq(pdf, psdf) + @unittest.skipIf(not have_jinja2, jinja2_requirement_message) def test_style(self): # Currently, the `style` function returns a pandas object `Styler` as it is, # processing only the number of rows declared in `compute.max_rows`. diff --git a/python/pyspark/pandas/tests/io/test_series_conversion.py b/python/pyspark/pandas/tests/io/test_series_conversion.py index 2ae40e92b489a..06d923816633d 100644 --- a/python/pyspark/pandas/tests/io/test_series_conversion.py +++ b/python/pyspark/pandas/tests/io/test_series_conversion.py @@ -23,6 +23,7 @@ from pyspark import pandas as ps from pyspark.testing.pandasutils import PandasOnSparkTestCase from pyspark.testing.sqlutils import SQLTestUtils +from pyspark.testing.utils import have_jinja2, jinja2_requirement_message class SeriesConversionTestsMixin: @@ -48,6 +49,7 @@ def test_to_clipboard(self): psser.to_clipboard(sep=",", index=False), pser.to_clipboard(sep=",", index=False) ) + @unittest.skipIf(not have_jinja2, jinja2_requirement_message) def test_to_latex(self): pser = self.pser psser = self.psser diff --git a/python/pyspark/resource/profile.py b/python/pyspark/resource/profile.py index e9e6ef3520eea..f0fb8f0b32d5b 100644 --- a/python/pyspark/resource/profile.py +++ b/python/pyspark/resource/profile.py @@ -211,9 +211,9 @@ def __init__(self) -> None: if _jvm is not None: self._jvm = _jvm - 
self._java_resource_profile_builder = ( - _jvm.org.apache.spark.resource.ResourceProfileBuilder() - ) + self._java_resource_profile_builder = getattr( + _jvm, "org.apache.spark.resource.ResourceProfileBuilder" + )() else: self._jvm = None self._java_resource_profile_builder = None diff --git a/python/pyspark/resource/requests.py b/python/pyspark/resource/requests.py index fa8bb43ee2c49..805cecd5dbbe0 100644 --- a/python/pyspark/resource/requests.py +++ b/python/pyspark/resource/requests.py @@ -173,9 +173,9 @@ def __init__( jvm = _jvm or SparkContext._jvm if jvm is not None: - self._java_executor_resource_requests = ( - jvm.org.apache.spark.resource.ExecutorResourceRequests() - ) + self._java_executor_resource_requests = getattr( + jvm, "org.apache.spark.resource.ExecutorResourceRequests" + )() if _requests is not None: for k, v in _requests.items(): if k == self._MEMORY: @@ -474,9 +474,9 @@ def __init__( jvm = _jvm or SparkContext._jvm if jvm is not None: - self._java_task_resource_requests: Optional[ - "JavaObject" - ] = jvm.org.apache.spark.resource.TaskResourceRequests() + self._java_task_resource_requests: Optional["JavaObject"] = getattr( + jvm, "org.apache.spark.resource.TaskResourceRequests" + )() if _requests is not None: for k, v in _requests.items(): if k == self._CPUS: diff --git a/python/pyspark/sql/_typing.pyi b/python/pyspark/sql/_typing.pyi index 4969268939adf..27fa0f2a90133 100644 --- a/python/pyspark/sql/_typing.pyi +++ b/python/pyspark/sql/_typing.pyi @@ -36,8 +36,10 @@ from pyspark._typing import PrimitiveType from pyspark.profiler import CodeMapDict import pyspark.sql.types from pyspark.sql.column import Column +from pyspark.sql.tvf_argument import TableValuedFunctionArgument ColumnOrName = Union[Column, str] +TVFArgumentOrName = Union[TableValuedFunctionArgument, str] ColumnOrNameOrOrdinal = Union[Column, str, int] DecimalLiteral = decimal.Decimal DateTimeLiteral = Union[datetime.datetime, datetime.date] diff --git 
a/python/pyspark/sql/avro/functions.py b/python/pyspark/sql/avro/functions.py index a9e41f20357e8..0b18212faf605 100644 --- a/python/pyspark/sql/avro/functions.py +++ b/python/pyspark/sql/avro/functions.py @@ -102,7 +102,7 @@ def from_avro( sc = get_active_spark_context() try: - jc = cast(JVMView, sc._jvm).org.apache.spark.sql.avro.functions.from_avro( + jc = getattr(cast(JVMView, sc._jvm), "org.apache.spark.sql.avro.functions").from_avro( _to_java_column(data), jsonFormatSchema, options or {} ) except TypeError as e: @@ -168,11 +168,11 @@ def to_avro(data: "ColumnOrName", jsonFormatSchema: str = "") -> Column: sc = get_active_spark_context() try: if jsonFormatSchema == "": - jc = cast(JVMView, sc._jvm).org.apache.spark.sql.avro.functions.to_avro( + jc = getattr(cast(JVMView, sc._jvm), "org.apache.spark.sql.avro.functions").to_avro( _to_java_column(data) ) else: - jc = cast(JVMView, sc._jvm).org.apache.spark.sql.avro.functions.to_avro( + jc = getattr(cast(JVMView, sc._jvm), "org.apache.spark.sql.avro.functions").to_avro( _to_java_column(data), jsonFormatSchema ) except TypeError as e: diff --git a/python/pyspark/sql/catalog.py b/python/pyspark/sql/catalog.py index 8c35aafa7066c..40a0d9346ccc3 100644 --- a/python/pyspark/sql/catalog.py +++ b/python/pyspark/sql/catalog.py @@ -479,7 +479,6 @@ def listFunctions( """ if dbName is None: dbName = self.currentDatabase() - iter = self._jcatalog.listFunctions(dbName).toLocalIterator() if pattern is None: iter = self._jcatalog.listFunctions(dbName).toLocalIterator() else: diff --git a/python/pyspark/sql/classic/column.py b/python/pyspark/sql/classic/column.py index c08eac7f6a049..fe0e440203c36 100644 --- a/python/pyspark/sql/classic/column.py +++ b/python/pyspark/sql/classic/column.py @@ -522,7 +522,9 @@ def alias(self, *alias: str, **kwargs: Any) -> ParentColumn: if len(alias) == 1: if metadata: assert sc._jvm is not None - jmeta = sc._jvm.org.apache.spark.sql.types.Metadata.fromJson(json.dumps(metadata)) + jmeta = 
getattr(sc._jvm, "org.apache.spark.sql.types.Metadata").fromJson( + json.dumps(metadata) + ) return Column(getattr(self._jc, "as")(alias[0], jmeta)) else: return Column(getattr(self._jc, "as")(alias[0])) diff --git a/python/pyspark/sql/classic/dataframe.py b/python/pyspark/sql/classic/dataframe.py index 169755c753907..84498f1b2294d 100644 --- a/python/pyspark/sql/classic/dataframe.py +++ b/python/pyspark/sql/classic/dataframe.py @@ -21,7 +21,7 @@ import random import warnings from collections.abc import Iterable -from functools import reduce +from functools import reduce, cached_property from typing import ( Any, Callable, @@ -74,6 +74,7 @@ from pyspark.sql.utils import get_active_spark_context, to_java_array, to_scala_map from pyspark.sql.pandas.conversion import PandasConversionMixin from pyspark.sql.pandas.map_ops import PandasMapOpsMixin +from pyspark.sql.table_arg import TableArg if TYPE_CHECKING: @@ -118,8 +119,6 @@ def __init__( ): from pyspark.sql.context import SQLContext - self._sql_ctx: Optional["SQLContext"] = None - if isinstance(sql_ctx, SQLContext): assert not os.environ.get("SPARK_TESTING") # Sanity check for our internal usage. assert isinstance(sql_ctx, SQLContext) @@ -136,14 +135,11 @@ def __init__( self._sc: "SparkContext" = sql_ctx._sc self._jdf: "JavaObject" = jdf self.is_cached = False - # initialized lazily - self._schema: Optional[StructType] = None - self._lazy_rdd: Optional["RDD[Row]"] = None # Check whether _repr_html is supported or not, we use it to avoid calling _jdf twice # by __repr__ and _repr_html_ while eager evaluation opens. self._support_repr_html = False - @property + @cached_property def sql_ctx(self) -> "SQLContext": from pyspark.sql.context import SQLContext @@ -151,24 +147,18 @@ def sql_ctx(self) -> "SQLContext": "DataFrame.sql_ctx is an internal property, and will be removed " "in future releases. Use DataFrame.sparkSession instead." 
) - if self._sql_ctx is None: - self._sql_ctx = SQLContext._get_or_create(self._sc) - return self._sql_ctx + return SQLContext._get_or_create(self._sc) @property def sparkSession(self) -> "SparkSession": return self._session - @property + @cached_property def rdd(self) -> "RDD[Row]": from pyspark.core.rdd import RDD - if self._lazy_rdd is None: - jrdd = self._jdf.javaToPython() - self._lazy_rdd = RDD( - jrdd, self.sparkSession._sc, BatchedSerializer(CPickleSerializer()) - ) - return self._lazy_rdd + jrdd = self._jdf.javaToPython() + return RDD(jrdd, self.sparkSession._sc, BatchedSerializer(CPickleSerializer())) @property def na(self) -> ParentDataFrameNaFunctions: @@ -208,21 +198,17 @@ def write(self) -> DataFrameWriter: def writeStream(self) -> DataStreamWriter: return DataStreamWriter(self) - @property + @cached_property def schema(self) -> StructType: - if self._schema is None: - try: - self._schema = cast( - StructType, _parse_datatype_json_string(self._jdf.schema().json()) - ) - except AnalysisException as e: - raise e - except Exception as e: - raise PySparkValueError( - errorClass="CANNOT_PARSE_DATATYPE", - messageParameters={"error": str(e)}, - ) - return self._schema + try: + return cast(StructType, _parse_datatype_json_string(self._jdf.schema().json())) + except AnalysisException as e: + raise e + except Exception as e: + raise PySparkValueError( + errorClass="CANNOT_PARSE_DATATYPE", + messageParameters={"error": str(e)}, + ) def printSchema(self, level: Optional[int] = None) -> None: if level: @@ -665,6 +651,15 @@ def dtypes(self) -> List[Tuple[str, str]]: def columns(self) -> List[str]: return [f.name for f in self.schema.fields] + def metadataColumn(self, colName: str) -> Column: + if not isinstance(colName, str): + raise PySparkTypeError( + errorClass="NOT_STR", + messageParameters={"arg_name": "colName", "arg_type": type(colName).__name__}, + ) + jc = self._jdf.metadataColumn(colName) + return Column(jc) + def colRegex(self, colName: str) -> Column: 
if not isinstance(colName, str): raise PySparkTypeError( @@ -715,6 +710,22 @@ def join( jdf = self._jdf.join(other._jdf, on, how) return DataFrame(jdf, self.sparkSession) + def lateralJoin( + self, + other: ParentDataFrame, + on: Optional[Column] = None, + how: Optional[str] = None, + ) -> ParentDataFrame: + if on is None and how is None: + jdf = self._jdf.lateralJoin(other._jdf) + elif on is None: + jdf = self._jdf.lateralJoin(other._jdf, how) + elif how is None: + jdf = self._jdf.lateralJoin(other._jdf, on._jc) + else: + jdf = self._jdf.lateralJoin(other._jdf, on._jc, how) + return DataFrame(jdf, self.sparkSession) + # TODO(SPARK-22947): Fix the DataFrame API. def _joinAsOf( self, @@ -1786,6 +1797,9 @@ def transpose(self, indexColumn: Optional["ColumnOrName"] = None) -> ParentDataF else: return DataFrame(self._jdf.transpose(), self.sparkSession) + def asTable(self) -> TableArg: + return TableArg(self._jdf.asTable()) + def scalar(self) -> Column: return Column(self._jdf.scalar()) diff --git a/python/pyspark/sql/classic/window.py b/python/pyspark/sql/classic/window.py index 63e9a337c0c2e..c7bc92739b240 100644 --- a/python/pyspark/sql/classic/window.py +++ b/python/pyspark/sql/classic/window.py @@ -48,9 +48,9 @@ def partitionBy(*cols: Union["ColumnOrName", Sequence["ColumnOrName"]]) -> Paren from py4j.java_gateway import JVMView sc = get_active_spark_context() - jspec = cast(JVMView, sc._jvm).org.apache.spark.sql.expressions.Window.partitionBy( - _to_java_cols(cols) - ) + jspec = getattr( + cast(JVMView, sc._jvm), "org.apache.spark.sql.expressions.Window" + ).partitionBy(_to_java_cols(cols)) return WindowSpec(jspec) @staticmethod @@ -58,7 +58,7 @@ def orderBy(*cols: Union["ColumnOrName", Sequence["ColumnOrName"]]) -> ParentWin from py4j.java_gateway import JVMView sc = get_active_spark_context() - jspec = cast(JVMView, sc._jvm).org.apache.spark.sql.expressions.Window.orderBy( + jspec = getattr(cast(JVMView, sc._jvm), 
"org.apache.spark.sql.expressions.Window").orderBy( _to_java_cols(cols) ) return WindowSpec(jspec) @@ -72,9 +72,9 @@ def rowsBetween(start: int, end: int) -> ParentWindowSpec: if end >= Window._FOLLOWING_THRESHOLD: end = Window.unboundedFollowing sc = get_active_spark_context() - jspec = cast(JVMView, sc._jvm).org.apache.spark.sql.expressions.Window.rowsBetween( - start, end - ) + jspec = getattr( + cast(JVMView, sc._jvm), "org.apache.spark.sql.expressions.Window" + ).rowsBetween(start, end) return WindowSpec(jspec) @staticmethod @@ -86,9 +86,9 @@ def rangeBetween(start: int, end: int) -> ParentWindowSpec: if end >= Window._FOLLOWING_THRESHOLD: end = Window.unboundedFollowing sc = get_active_spark_context() - jspec = cast(JVMView, sc._jvm).org.apache.spark.sql.expressions.Window.rangeBetween( - start, end - ) + jspec = getattr( + cast(JVMView, sc._jvm), "org.apache.spark.sql.expressions.Window" + ).rangeBetween(start, end) return WindowSpec(jspec) diff --git a/python/pyspark/sql/column.py b/python/pyspark/sql/column.py index 06dd2860fe406..e5640dd81b1fb 100644 --- a/python/pyspark/sql/column.py +++ b/python/pyspark/sql/column.py @@ -25,6 +25,7 @@ Union, ) +from pyspark.sql.tvf_argument import TableValuedFunctionArgument from pyspark.sql.utils import dispatch_col_method from pyspark.sql.types import DataType from pyspark.errors import PySparkValueError @@ -37,7 +38,7 @@ __all__ = ["Column"] -class Column: +class Column(TableValuedFunctionArgument): """ A column in a DataFrame. @@ -1524,7 +1525,11 @@ def over(self, window: "WindowSpec") -> "Column": @dispatch_col_method def outer(self) -> "Column": """ - Mark this column reference as an outer reference for subqueries. + Mark this column as an outer column if its expression refers to columns from an outer query. + + This is used to trigger lazy analysis of Spark Classic DataFrame, so that we can use it + to build subquery expressions. 
Spark Connect DataFrame is always lazily analyzed and + does not need to use this function. .. versionadded:: 4.0.0 diff --git a/python/pyspark/sql/connect/client/core.py b/python/pyspark/sql/connect/client/core.py index 78d4e0fc1c4f4..4a85ca26b64de 100644 --- a/python/pyspark/sql/connect/client/core.py +++ b/python/pyspark/sql/connect/client/core.py @@ -20,6 +20,8 @@ "SparkConnectClient", ] +import atexit + from pyspark.sql.connect.utils import check_dependencies check_dependencies(__name__) @@ -329,8 +331,8 @@ def default_port() -> int: jvm = PySparkSession._instantiatedSession._jvm # type: ignore[union-attr] return getattr( getattr( - jvm.org.apache.spark.sql.connect.service, # type: ignore[union-attr] - "SparkConnectService$", + jvm, + "org.apache.spark.sql.connect.service.SparkConnectService$", ), "MODULE$", ).localPort() @@ -494,6 +496,7 @@ def __init__( is_same_semantics: Optional[bool], semantic_hash: Optional[int], storage_level: Optional[StorageLevel], + ddl_string: Optional[str], ): self.schema = schema self.explain_string = explain_string @@ -506,6 +509,7 @@ def __init__( self.is_same_semantics = is_same_semantics self.semantic_hash = semantic_hash self.storage_level = storage_level + self.ddl_string = ddl_string @classmethod def fromProto(cls, pb: Any) -> "AnalyzeResult": @@ -520,6 +524,7 @@ def fromProto(cls, pb: Any) -> "AnalyzeResult": is_same_semantics: Optional[bool] = None semantic_hash: Optional[int] = None storage_level: Optional[StorageLevel] = None + ddl_string: Optional[str] = None if pb.HasField("schema"): schema = types.proto_schema_to_pyspark_data_type(pb.schema.schema) @@ -547,6 +552,8 @@ def fromProto(cls, pb: Any) -> "AnalyzeResult": pass elif pb.HasField("get_storage_level"): storage_level = proto_to_storage_level(pb.get_storage_level.storage_level) + elif pb.HasField("json_to_ddl"): + ddl_string = pb.json_to_ddl.ddl_string else: raise SparkConnectException("No analyze result found!") @@ -562,6 +569,7 @@ def fromProto(cls, pb: Any) -> 
"AnalyzeResult": is_same_semantics, semantic_hash, storage_level, + ddl_string, ) @@ -669,6 +677,9 @@ def __init__( self._progress_handlers: List[ProgressHandler] = [] + # cleanup ml cache if possible + atexit.register(self._cleanup_ml) + def register_progress_handler(self, handler: ProgressHandler) -> None: """ Register a progress handler to be called when a progress message is received. @@ -1284,6 +1295,8 @@ def _analyze(self, method: str, **kwargs: Any) -> AnalyzeResult: req.unpersist.blocking = cast(bool, kwargs.get("blocking")) elif method == "get_storage_level": req.get_storage_level.relation.CopyFrom(cast(pb2.Relation, kwargs.get("relation"))) + elif method == "json_to_ddl": + req.json_to_ddl.json_string = cast(str, kwargs.get("json_string")) else: raise PySparkValueError( errorClass="UNSUPPORTED_OPERATION", @@ -1471,6 +1484,8 @@ def handle_response( b.checkpoint_command_result.relation ) } + if b.HasField("ml_command_result"): + yield {"ml_command_result": b.ml_command_result} try: if self._use_reattachable_execute: @@ -1923,3 +1938,33 @@ def _create_profile(self, profile: pb2.ResourceProfile) -> int: (_, properties, _) = self.execute_command(cmd) profile_id = properties["create_resource_profile_command_result"] return profile_id + + def add_ml_cache(self, cache_id: str) -> None: + if not hasattr(self.thread_local, "ml_caches"): + self.thread_local.ml_caches = set() + self.thread_local.ml_caches.add(cache_id) + + def remove_ml_cache(self, cache_id: str) -> None: + if not hasattr(self.thread_local, "ml_caches"): + self.thread_local.ml_caches = set() + + if cache_id in self.thread_local.ml_caches: + self._delete_ml_cache(cache_id) + + def _delete_ml_cache(self, cache_id: str) -> None: + # try best to delete the cache + try: + command = pb2.Command() + command.ml_command.delete.obj_ref.CopyFrom(pb2.ObjectRef(id=cache_id)) + self.execute_command(command) + except Exception: + pass + + def _cleanup_ml(self) -> None: + if not hasattr(self.thread_local, 
"ml_caches"): + self.thread_local.ml_caches = set() + + self.disable_reattachable_execute() + # Todo add a pattern to delete all model in one command + for model_id in self.thread_local.ml_caches: + self._delete_ml_cache(model_id) diff --git a/python/pyspark/sql/connect/column.py b/python/pyspark/sql/connect/column.py index e840081146340..c5733801814eb 100644 --- a/python/pyspark/sql/connect/column.py +++ b/python/pyspark/sql/connect/column.py @@ -34,7 +34,6 @@ PySparkTypeError, PySparkAttributeError, PySparkValueError, - PySparkNotImplementedError, ) from pyspark.sql.types import DataType from pyspark.sql.utils import enum_to_value @@ -44,6 +43,7 @@ Expression, UnresolvedFunction, UnresolvedExtractValue, + LazyExpression, LiteralExpression, CaseWhen, SortOrder, @@ -460,11 +460,7 @@ def over(self, window: "WindowSpec") -> ParentColumn: # type: ignore[override] return Column(WindowExpression(windowFunction=self._expr, windowSpec=window)) def outer(self) -> ParentColumn: - # TODO(SPARK-50134): Implement this method - raise PySparkNotImplementedError( - errorClass="NOT_IMPLEMENTED", - messageParameters={"feature": "outer()"}, - ) + return Column(LazyExpression(self._expr)) def isin(self, *cols: Any) -> ParentColumn: if len(cols) == 1 and isinstance(cols[0], (list, set)): diff --git a/python/pyspark/sql/connect/conf.py b/python/pyspark/sql/connect/conf.py index 1ef72ee3cfa43..84d7ad34fb360 100644 --- a/python/pyspark/sql/connect/conf.py +++ b/python/pyspark/sql/connect/conf.py @@ -49,6 +49,20 @@ def set(self, key: str, value: Union[str, int, bool]) -> None: set.__doc__ = PySparkRuntimeConfig.set.__doc__ + def _set_all(self, configs: Dict[str, Union[str, int, bool]], silent: bool) -> None: + conf_list = [] + for key, value in configs.items(): + if isinstance(value, bool): + value = "true" if value else "false" + elif isinstance(value, int): + value = str(value) + conf_list.append(proto.KeyValue(key=key, value=value)) + op_set = proto.ConfigRequest.Set(pairs=conf_list, 
silent=silent) + operation = proto.ConfigRequest.Operation(set=op_set) + result = self._client.config(operation) + for warn in result.warnings: + warnings.warn(warn) + def get( self, key: str, default: Union[Optional[str], _NoValueType] = _NoValue ) -> Optional[str]: diff --git a/python/pyspark/sql/connect/conversion.py b/python/pyspark/sql/connect/conversion.py index d803f37c5b9f1..b6b0bd65adcb8 100644 --- a/python/pyspark/sql/connect/conversion.py +++ b/python/pyspark/sql/connect/conversion.py @@ -104,6 +104,7 @@ def _need_converter( def _create_converter( dataType: DataType, nullable: bool = True, + variants_as_dicts: bool = False, # some code paths may require python internal types ) -> Callable: assert dataType is not None and isinstance(dataType, DataType) assert isinstance(nullable, bool) @@ -126,8 +127,7 @@ def convert_null(value: Any) -> Any: field_convs = [ LocalDataToArrowConversion._create_converter( - field.dataType, - field.nullable, + field.dataType, field.nullable, variants_as_dicts ) for field in dataType.fields ] @@ -170,8 +170,7 @@ def convert_struct(value: Any) -> Any: elif isinstance(dataType, ArrayType): element_conv = LocalDataToArrowConversion._create_converter( - dataType.elementType, - dataType.containsNull, + dataType.elementType, dataType.containsNull, variants_as_dicts ) def convert_array(value: Any) -> Any: @@ -188,8 +187,7 @@ def convert_array(value: Any) -> Any: elif isinstance(dataType, MapType): key_conv = LocalDataToArrowConversion._create_converter(dataType.keyType) value_conv = LocalDataToArrowConversion._create_converter( - dataType.valueType, - dataType.valueContainsNull, + dataType.valueType, dataType.valueContainsNull, variants_as_dicts ) def convert_map(value: Any) -> Any: @@ -303,8 +301,11 @@ def convert_variant(value: Any) -> Any: isinstance(value, dict) and all(key in value for key in ["value", "metadata"]) and all(isinstance(value[key], bytes) for key in ["value", "metadata"]) + and not variants_as_dicts ): return 
VariantVal(value["value"], value["metadata"]) + elif isinstance(value, VariantVal) and variants_as_dicts: + return VariantType().toInternal(value) else: raise PySparkValueError(errorClass="MALFORMED_VARIANT") @@ -331,8 +332,7 @@ def convert(data: Sequence[Any], schema: StructType) -> "pa.Table": column_convs = [ LocalDataToArrowConversion._create_converter( - field.dataType, - field.nullable, + field.dataType, field.nullable, variants_as_dicts=True ) for field in schema.fields ] diff --git a/python/pyspark/sql/connect/dataframe.py b/python/pyspark/sql/connect/dataframe.py index e85efeb592dff..76b7881f234ff 100644 --- a/python/pyspark/sql/connect/dataframe.py +++ b/python/pyspark/sql/connect/dataframe.py @@ -79,12 +79,14 @@ from pyspark.sql.column import Column from pyspark.sql.connect.expressions import ( ColumnReference, + SubqueryExpression, UnresolvedRegex, UnresolvedStar, ) from pyspark.sql.connect.functions import builtin as F from pyspark.sql.pandas.types import from_arrow_schema, to_arrow_schema from pyspark.sql.pandas.functions import _validate_pandas_udf # type: ignore[attr-defined] +from pyspark.sql.table_arg import TableArg if TYPE_CHECKING: @@ -272,6 +274,14 @@ def alias(self, alias: str) -> ParentDataFrame: res._cached_schema = self._cached_schema return res + def metadataColumn(self, colName: str) -> Column: + if not isinstance(colName, str): + raise PySparkTypeError( + errorClass="NOT_STR", + messageParameters={"arg_name": "colName", "arg_type": type(colName).__name__}, + ) + return self._col(colName, is_metadata_column=True) + def colRegex(self, colName: str) -> Column: from pyspark.sql.connect.column import Column as ConnectColumn @@ -686,6 +696,22 @@ def join( session=self._session, ) + def lateralJoin( + self, + other: ParentDataFrame, + on: Optional[Column] = None, + how: Optional[str] = None, + ) -> ParentDataFrame: + self._check_same_session(other) + if how is not None and isinstance(how, str): + how = how.lower().replace("_", "") + return 
DataFrame( + plan.LateralJoin( + left=self._plan, right=cast(plan.LogicalPlan, other._plan), on=on, how=how + ), + session=self._session, + ) + def _joinAsOf( self, other: ParentDataFrame, @@ -1732,13 +1758,14 @@ def __getitem__( messageParameters={"arg_name": "item", "arg_type": type(item).__name__}, ) - def _col(self, name: str) -> Column: + def _col(self, name: str, is_metadata_column: bool = False) -> Column: from pyspark.sql.connect.column import Column as ConnectColumn return ConnectColumn( ColumnReference( unparsed_identifier=name, plan_id=self._plan._plan_id, + is_metadata_column=is_metadata_column, ) ) @@ -1784,19 +1811,22 @@ def transpose(self, indexColumn: Optional["ColumnOrName"] = None) -> ParentDataF self._session, ) - def scalar(self) -> Column: - # TODO(SPARK-50134): Implement this method + def asTable(self) -> TableArg: + # TODO(SPARK-50393): Support DataFrame conversion to table argument in Spark Connect raise PySparkNotImplementedError( errorClass="NOT_IMPLEMENTED", - messageParameters={"feature": "scalar()"}, + messageParameters={"feature": "asTable()"}, ) + def scalar(self) -> Column: + from pyspark.sql.connect.column import Column as ConnectColumn + + return ConnectColumn(SubqueryExpression(self._plan, subquery_type="scalar")) + def exists(self) -> Column: - # TODO(SPARK-50134): Implement this method - raise PySparkNotImplementedError( - errorClass="NOT_IMPLEMENTED", - messageParameters={"feature": "exists()"}, - ) + from pyspark.sql.connect.column import Column as ConnectColumn + + return ConnectColumn(SubqueryExpression(self._plan, subquery_type="exists")) @property def schema(self) -> StructType: @@ -2023,6 +2053,8 @@ def _map_partitions( from pyspark.sql.connect.udf import UserDefinedFunction _validate_pandas_udf(func, evalType) + if isinstance(schema, str): + schema = cast(StructType, self._session._parse_ddl(schema)) udf_obj = UserDefinedFunction( func, returnType=schema, @@ -2262,10 +2294,6 @@ def _test() -> None: del 
pyspark.sql.dataframe.DataFrame.toJSON.__doc__ del pyspark.sql.dataframe.DataFrame.rdd.__doc__ - # TODO(SPARK-50134): Support subquery in connect - del pyspark.sql.dataframe.DataFrame.scalar.__doc__ - del pyspark.sql.dataframe.DataFrame.exists.__doc__ - globs["spark"] = ( PySparkSession.builder.appName("sql.connect.dataframe tests") .remote(os.environ.get("SPARK_CONNECT_TESTING_REMOTE", "local[4]")) diff --git a/python/pyspark/sql/connect/expressions.py b/python/pyspark/sql/connect/expressions.py index 5a5320366f666..c32db14968c6b 100644 --- a/python/pyspark/sql/connect/expressions.py +++ b/python/pyspark/sql/connect/expressions.py @@ -82,6 +82,7 @@ if TYPE_CHECKING: from pyspark.sql.connect.client import SparkConnectClient from pyspark.sql.connect.window import WindowSpec + from pyspark.sql.connect.plan import LogicalPlan class Expression: @@ -128,6 +129,15 @@ def _create_proto_expression(self) -> proto.Expression: plan.common.origin.CopyFrom(self.origin) return plan + @property + def children(self) -> Sequence["Expression"]: + return [] + + def foreach(self, f: Callable[["Expression"], None]) -> None: + f(self) + for c in self.children: + c.foreach(f) + class CaseWhen(Expression): def __init__( @@ -162,6 +172,16 @@ def to_plan(self, session: "SparkConnectClient") -> "proto.Expression": return unresolved_function.to_plan(session) + @property + def children(self) -> Sequence["Expression"]: + children = [] + for branch in self._branches: + children.append(branch[0]) + children.append(branch[1]) + if self._else_value is not None: + children.append(self._else_value) + return children + def __repr__(self) -> str: _cases = "".join([f" WHEN {c} THEN {v}" for c, v in self._branches]) _else = f" ELSE {self._else_value}" if self._else_value is not None else "" @@ -196,6 +216,10 @@ def to_plan(self, session: "SparkConnectClient") -> "proto.Expression": exp.alias.expr.CopyFrom(self._child.to_plan(session)) return exp + @property + def children(self) -> Sequence["Expression"]: 
+ return [self._child] + def __repr__(self) -> str: return f"{self._child} AS {','.join(self._alias)}" @@ -500,7 +524,12 @@ class ColumnReference(Expression): treat it as an unresolved attribute. Attributes that have the same fully qualified name are identical""" - def __init__(self, unparsed_identifier: str, plan_id: Optional[int] = None) -> None: + def __init__( + self, + unparsed_identifier: str, + plan_id: Optional[int] = None, + is_metadata_column: bool = False, + ) -> None: super().__init__() assert isinstance(unparsed_identifier, str) self._unparsed_identifier = unparsed_identifier @@ -508,6 +537,8 @@ def __init__(self, unparsed_identifier: str, plan_id: Optional[int] = None) -> N assert plan_id is None or isinstance(plan_id, int) self._plan_id = plan_id + self._is_metadata_column = is_metadata_column + def name(self) -> str: """Returns the qualified name of the column reference.""" return self._unparsed_identifier @@ -518,6 +549,7 @@ def to_plan(self, session: "SparkConnectClient") -> proto.Expression: expr.unresolved_attribute.unparsed_identifier = self._unparsed_identifier if self._plan_id is not None: expr.unresolved_attribute.plan_id = self._plan_id + expr.unresolved_attribute.is_metadata_column = self._is_metadata_column return expr def __repr__(self) -> str: @@ -622,6 +654,10 @@ def to_plan(self, session: "SparkConnectClient") -> proto.Expression: return sort + @property + def children(self) -> Sequence["Expression"]: + return [self._child] + class UnresolvedFunction(Expression): def __init__( @@ -649,6 +685,10 @@ def to_plan(self, session: "SparkConnectClient") -> proto.Expression: fun.unresolved_function.is_distinct = self._is_distinct return fun + @property + def children(self) -> Sequence["Expression"]: + return self._args + def __repr__(self) -> str: # Default print handling: if self._is_distinct: @@ -730,12 +770,12 @@ def __init__( function_name: str, function: Union[PythonUDF, JavaUDF], deterministic: bool = False, - arguments: 
Sequence[Expression] = [], + arguments: Optional[Sequence[Expression]] = None, ): super().__init__() self._function_name = function_name self._deterministic = deterministic - self._arguments = arguments + self._arguments: Sequence[Expression] = arguments or [] self._function = function def to_plan(self, session: "SparkConnectClient") -> "proto.Expression": @@ -770,6 +810,10 @@ def to_plan_judf( expr.java_udf.CopyFrom(cast(proto.JavaUDF, self._function.to_plan(session))) return expr + @property + def children(self) -> Sequence["Expression"]: + return self._arguments + def __repr__(self) -> str: return f"{self._function_name}({', '.join([str(arg) for arg in self._arguments])})" @@ -799,6 +843,10 @@ def to_plan(self, session: "SparkConnectClient") -> proto.Expression: expr.update_fields.value_expression.CopyFrom(self._valueExpr.to_plan(session)) return expr + @property + def children(self) -> Sequence["Expression"]: + return [self._structExpr, self._valueExpr] + def __repr__(self) -> str: return f"update_field({self._structExpr}, {self._fieldName}, {self._valueExpr})" @@ -823,6 +871,10 @@ def to_plan(self, session: "SparkConnectClient") -> proto.Expression: expr.update_fields.field_name = self._fieldName return expr + @property + def children(self) -> Sequence["Expression"]: + return [self._structExpr] + def __repr__(self) -> str: return f"drop_field({self._structExpr}, {self._fieldName})" @@ -847,6 +899,10 @@ def to_plan(self, session: "SparkConnectClient") -> proto.Expression: expr.unresolved_extract_value.extraction.CopyFrom(self._extraction.to_plan(session)) return expr + @property + def children(self) -> Sequence["Expression"]: + return [self._child, self._extraction] + def __repr__(self) -> str: return f"{self._child}['{self._extraction}']" @@ -906,6 +962,10 @@ def to_plan(self, session: "SparkConnectClient") -> proto.Expression: return fun + @property + def children(self) -> Sequence["Expression"]: + return [self._expr] + def __repr__(self) -> str: # We cannot 
guarantee the string representations be exactly the same, e.g. # str(sf.col("a").cast("long")): @@ -989,6 +1049,10 @@ def to_plan(self, session: "SparkConnectClient") -> proto.Expression: ) return expr + @property + def children(self) -> Sequence["Expression"]: + return [self._function] + self._arguments + def __repr__(self) -> str: return ( f"LambdaFunction({str(self._function)}, " @@ -1098,6 +1162,12 @@ def to_plan(self, session: "SparkConnectClient") -> proto.Expression: return expr + @property + def children(self) -> Sequence["Expression"]: + return ( + [self._windowFunction] + self._windowSpec._partitionSpec + self._windowSpec._orderSpec + ) + def __repr__(self) -> str: return f"WindowExpression({str(self._windowFunction)}, ({str(self._windowSpec)}))" @@ -1128,6 +1198,10 @@ def to_plan(self, session: "SparkConnectClient") -> "proto.Expression": expr.call_function.arguments.extend([arg.to_plan(session) for arg in self._args]) return expr + @property + def children(self) -> Sequence["Expression"]: + return self._args + def __repr__(self) -> str: if len(self._args) > 0: return f"CallFunction('{self._name}', {', '.join([str(arg) for arg in self._args])})" @@ -1151,5 +1225,50 @@ def to_plan(self, session: "SparkConnectClient") -> "proto.Expression": expr.named_argument_expression.value.CopyFrom(self._value.to_plan(session)) return expr + @property + def children(self) -> Sequence["Expression"]: + return [self._value] + def __repr__(self) -> str: return f"{self._key} => {self._value}" + + +class LazyExpression(Expression): + def __init__(self, expr: Expression): + assert isinstance(expr, Expression) + super().__init__() + self._expr = expr + + def to_plan(self, session: "SparkConnectClient") -> proto.Expression: + expr = self._create_proto_expression() + expr.lazy_expression.child.CopyFrom(self._expr.to_plan(session)) + return expr + + @property + def children(self) -> Sequence["Expression"]: + return [self._expr] + + def __repr__(self) -> str: + return 
f"lazy({self._expr})" + + +class SubqueryExpression(Expression): + def __init__(self, plan: "LogicalPlan", subquery_type: str) -> None: + assert isinstance(subquery_type, str) + assert subquery_type in ("scalar", "exists") + + super().__init__() + self._plan = plan + self._subquery_type = subquery_type + + def to_plan(self, session: "SparkConnectClient") -> proto.Expression: + expr = self._create_proto_expression() + expr.subquery_expression.plan_id = self._plan._plan_id + if self._subquery_type == "scalar": + expr.subquery_expression.subquery_type = proto.SubqueryExpression.SUBQUERY_TYPE_SCALAR + elif self._subquery_type == "exists": + expr.subquery_expression.subquery_type = proto.SubqueryExpression.SUBQUERY_TYPE_EXISTS + return expr + + def __repr__(self) -> str: + return f"SubqueryExpression({self._plan}, {self._subquery_type})" diff --git a/python/pyspark/sql/connect/functions/builtin.py b/python/pyspark/sql/connect/functions/builtin.py index f52cdffb84b7c..f13eeab12dd35 100644 --- a/python/pyspark/sql/connect/functions/builtin.py +++ b/python/pyspark/sql/connect/functions/builtin.py @@ -1064,6 +1064,64 @@ def collect_set(col: "ColumnOrName") -> Column: collect_set.__doc__ = pysparkfuncs.collect_set.__doc__ +def listagg(col: "ColumnOrName", delimiter: Optional[Union[Column, str, bytes]] = None) -> Column: + if delimiter is None: + return _invoke_function_over_columns("listagg", col) + else: + return _invoke_function_over_columns("listagg", col, lit(delimiter)) + + +listagg.__doc__ = pysparkfuncs.listagg.__doc__ + + +def listagg_distinct( + col: "ColumnOrName", delimiter: Optional[Union[Column, str, bytes]] = None +) -> Column: + from pyspark.sql.connect.column import Column as ConnectColumn + + args = [col] + if delimiter is not None: + args += [lit(delimiter)] + + _exprs = [_to_col(c)._expr for c in args] + return ConnectColumn( + UnresolvedFunction("listagg", _exprs, is_distinct=True) # type: ignore[arg-type] + ) + + +listagg_distinct.__doc__ = 
pysparkfuncs.listagg_distinct.__doc__ + + +def string_agg( + col: "ColumnOrName", delimiter: Optional[Union[Column, str, bytes]] = None +) -> Column: + if delimiter is None: + return _invoke_function_over_columns("string_agg", col) + else: + return _invoke_function_over_columns("string_agg", col, lit(delimiter)) + + +string_agg.__doc__ = pysparkfuncs.string_agg.__doc__ + + +def string_agg_distinct( + col: "ColumnOrName", delimiter: Optional[Union[Column, str, bytes]] = None +) -> Column: + from pyspark.sql.connect.column import Column as ConnectColumn + + args = [col] + if delimiter is not None: + args += [lit(delimiter)] + + _exprs = [_to_col(c)._expr for c in args] + return ConnectColumn( + UnresolvedFunction("string_agg", _exprs, is_distinct=True) # type: ignore[arg-type] + ) + + +string_agg_distinct.__doc__ = pysparkfuncs.string_agg_distinct.__doc__ + + def corr(col1: "ColumnOrName", col2: "ColumnOrName") -> Column: return _invoke_function_over_columns("corr", col1, col2) diff --git a/python/pyspark/sql/connect/group.py b/python/pyspark/sql/connect/group.py index 863461da10ec9..11adc8850fec1 100644 --- a/python/pyspark/sql/connect/group.py +++ b/python/pyspark/sql/connect/group.py @@ -35,8 +35,7 @@ from pyspark.sql.group import GroupedData as PySparkGroupedData from pyspark.sql.pandas.group_ops import PandasCogroupedOps as PySparkPandasCogroupedOps from pyspark.sql.pandas.functions import _validate_pandas_udf # type: ignore[attr-defined] -from pyspark.sql.types import NumericType -from pyspark.sql.types import StructType +from pyspark.sql.types import NumericType, StructType import pyspark.sql.connect.plan as plan from pyspark.sql.column import Column @@ -295,6 +294,8 @@ def applyInPandas( from pyspark.sql.connect.dataframe import DataFrame _validate_pandas_udf(func, PythonEvalType.SQL_GROUPED_MAP_PANDAS_UDF) + if isinstance(schema, str): + schema = cast(StructType, self._df._session._parse_ddl(schema)) udf_obj = UserDefinedFunction( func, returnType=schema, @@ 
-367,6 +368,8 @@ def applyInArrow( from pyspark.sql.connect.dataframe import DataFrame _validate_pandas_udf(func, PythonEvalType.SQL_GROUPED_MAP_ARROW_UDF) + if isinstance(schema, str): + schema = cast(StructType, self._df._session._parse_ddl(schema)) udf_obj = UserDefinedFunction( func, returnType=schema, @@ -410,6 +413,8 @@ def applyInPandas( from pyspark.sql.connect.dataframe import DataFrame _validate_pandas_udf(func, PythonEvalType.SQL_COGROUPED_MAP_PANDAS_UDF) + if isinstance(schema, str): + schema = cast(StructType, self._gd1._df._session._parse_ddl(schema)) udf_obj = UserDefinedFunction( func, returnType=schema, @@ -439,6 +444,8 @@ def applyInArrow( from pyspark.sql.connect.dataframe import DataFrame _validate_pandas_udf(func, PythonEvalType.SQL_COGROUPED_MAP_ARROW_UDF) + if isinstance(schema, str): + schema = cast(StructType, self._gd1._df._session._parse_ddl(schema)) udf_obj = UserDefinedFunction( func, returnType=schema, diff --git a/python/pyspark/sql/connect/logging.py b/python/pyspark/sql/connect/logging.py index b80342cf99743..099193fd7ce45 100644 --- a/python/pyspark/sql/connect/logging.py +++ b/python/pyspark/sql/connect/logging.py @@ -21,13 +21,18 @@ import os from typing import Optional -__all__ = [ - "getLogLevel", -] +__all__ = ["configureLogging", "getLogLevel"] -def _configure_logging() -> logging.Logger: - """Configure logging for the Spark Connect clients.""" +def configureLogging(level: Optional[str] = None) -> logging.Logger: + """ + Configure log level for Spark Connect components. + When not specified as a parameter, log level will be configured based on + the SPARK_CONNECT_LOG_LEVEL environment variable. + When both are absent, logging is disabled. + + .. 
versionadded:: 4.0.0 + """ logger = PySparkLogger.getLogger(__name__) handler = logging.StreamHandler() handler.setFormatter( @@ -35,8 +40,9 @@ def _configure_logging() -> logging.Logger: ) logger.addHandler(handler) - # Check the environment variables for log levels: - if "SPARK_CONNECT_LOG_LEVEL" in os.environ: + if level is not None: + logger.setLevel(level.upper()) + elif "SPARK_CONNECT_LOG_LEVEL" in os.environ: logger.setLevel(os.environ["SPARK_CONNECT_LOG_LEVEL"].upper()) else: logger.disabled = True @@ -44,7 +50,7 @@ def _configure_logging() -> logging.Logger: # Instantiate the logger based on the environment configuration. -logger = _configure_logging() +logger = configureLogging() def getLogLevel() -> Optional[int]: diff --git a/python/pyspark/sql/connect/plan.py b/python/pyspark/sql/connect/plan.py index b387ca1d4e508..02b60381ab939 100644 --- a/python/pyspark/sql/connect/plan.py +++ b/python/pyspark/sql/connect/plan.py @@ -52,7 +52,7 @@ from pyspark.sql.connect.logging import logger from pyspark.sql.connect.proto import base_pb2 as spark_dot_connect_dot_base__pb2 from pyspark.sql.connect.conversion import storage_level_to_proto -from pyspark.sql.connect.expressions import Expression +from pyspark.sql.connect.expressions import Expression, SubqueryExpression from pyspark.sql.connect.types import pyspark_types_to_proto_types, UnparsedDataType from pyspark.errors import ( AnalysisException, @@ -73,9 +73,30 @@ class LogicalPlan: INDENT = 2 - def __init__(self, child: Optional["LogicalPlan"]) -> None: + def __init__( + self, child: Optional["LogicalPlan"], references: Optional[Sequence["LogicalPlan"]] = None + ) -> None: + """ + + Parameters + ---------- + child : :class:`LogicalPlan`, optional. + The child logical plan. + references : list of :class:`LogicalPlan`, optional. + The list of logical plans that are referenced as subqueries in this logical plan. 
+ """ self._child = child - self._plan_id = LogicalPlan._fresh_plan_id() + self._root_plan_id = LogicalPlan._fresh_plan_id() + + self._references: Sequence["LogicalPlan"] = references or [] + self._plan_id_with_rel: Optional[int] = None + if len(self._references) > 0: + assert all(isinstance(r, LogicalPlan) for r in self._references) + self._plan_id_with_rel = LogicalPlan._fresh_plan_id() + + @property + def _plan_id(self) -> int: + return self._plan_id_with_rel or self._root_plan_id @staticmethod def _fresh_plan_id() -> int: @@ -89,7 +110,7 @@ def _fresh_plan_id() -> int: def _create_proto_relation(self) -> proto.Relation: plan = proto.Relation() - plan.common.plan_id = self._plan_id + plan.common.plan_id = self._root_plan_id return plan def plan(self, session: "SparkConnectClient") -> proto.Relation: # type: ignore[empty-body] @@ -136,6 +157,42 @@ def observations(self) -> Dict[str, "Observation"]: else: return self._child.observations + @staticmethod + def _collect_references( + cols_or_exprs: Sequence[Union[Column, Expression]] + ) -> Sequence["LogicalPlan"]: + references: List[LogicalPlan] = [] + + def append_reference(e: Expression) -> None: + if isinstance(e, SubqueryExpression): + references.append(e._plan) + + for col_or_expr in cols_or_exprs: + if isinstance(col_or_expr, Column): + col_or_expr._expr.foreach(append_reference) + else: + col_or_expr.foreach(append_reference) + return references + + def _with_relations( + self, root: proto.Relation, session: "SparkConnectClient" + ) -> proto.Relation: + if len(self._references) == 0: + return root + else: + # When there are references to other DataFrame, e.g., subqueries, build new plan like: + # with_relations [id 10] + # root: plan [id 9] + # reference: + # refs#1: [id 8] + # refs#2: [id 5] + plan = proto.Relation() + assert isinstance(self._plan_id_with_rel, int) + plan.common.plan_id = self._plan_id_with_rel + plan.with_relations.root.CopyFrom(root) + 
plan.with_relations.references.extend([ref.plan(session) for ref in self._references]) + return plan + def _parameters_to_print(self, parameters: Mapping[str, Any]) -> Mapping[str, Any]: """ Extracts the parameters that are able to be printed. It looks up the signature @@ -192,6 +249,7 @@ def _parameters_to_print(self, parameters: Mapping[str, Any]) -> Mapping[str, An getattr(a, "__forward_arg__", "").endswith("LogicalPlan") for a in getattr(tpe.annotation, "__args__", ()) ) + if ( not is_logical_plan and not is_forwardref_logical_plan @@ -205,7 +263,7 @@ def _parameters_to_print(self, parameters: Mapping[str, Any]) -> Mapping[str, An try: params[name] = getattr(self, "_" + name) except AttributeError: - pass # Simpy ignore + pass # Simply ignore return params def print(self, indent: int = 0) -> str: @@ -473,8 +531,8 @@ def __init__( child: Optional["LogicalPlan"], columns: List[Column], ) -> None: - super().__init__(child) assert all(isinstance(c, Column) for c in columns) + super().__init__(child, self._collect_references(columns)) self._columns = columns def plan(self, session: "SparkConnectClient") -> proto.Relation: @@ -482,7 +540,8 @@ def plan(self, session: "SparkConnectClient") -> proto.Relation: plan = self._create_proto_relation() plan.project.input.CopyFrom(self._child.plan(session)) plan.project.expressions.extend([c.to_plan(session) for c in self._columns]) - return plan + + return self._with_relations(plan, session) class WithColumns(LogicalPlan): @@ -495,8 +554,6 @@ def __init__( columns: Sequence[Column], metadata: Optional[Sequence[str]] = None, ) -> None: - super().__init__(child) - assert isinstance(columnNames, list) assert len(columnNames) > 0 assert all(isinstance(c, str) for c in columnNames) @@ -513,6 +570,8 @@ def __init__( # validate json string assert m == "" or json.loads(m) is not None + super().__init__(child, self._collect_references(columns)) + self._columnNames = columnNames self._columns = columns self._metadata = metadata @@ 
-530,7 +589,7 @@ def plan(self, session: "SparkConnectClient") -> proto.Relation: alias.metadata = self._metadata[i] plan.with_columns.aliases.append(alias) - return plan + return self._with_relations(plan, session) class WithWatermark(LogicalPlan): @@ -608,16 +667,14 @@ def __init__( name: str, parameters: Sequence[Column], ) -> None: - super().__init__(child) - assert isinstance(name, str) - self._name = name - assert parameters is not None and isinstance(parameters, List) for param in parameters: assert isinstance(param, Column) + super().__init__(child, self._collect_references(parameters)) + self._name = name self._parameters = parameters def plan(self, session: "SparkConnectClient") -> proto.Relation: @@ -626,12 +683,12 @@ def plan(self, session: "SparkConnectClient") -> proto.Relation: plan.hint.input.CopyFrom(self._child.plan(session)) plan.hint.name = self._name plan.hint.parameters.extend([param.to_plan(session) for param in self._parameters]) - return plan + return self._with_relations(plan, session) class Filter(LogicalPlan): def __init__(self, child: Optional["LogicalPlan"], filter: Column) -> None: - super().__init__(child) + super().__init__(child, self._collect_references([filter])) self.filter = filter def plan(self, session: "SparkConnectClient") -> proto.Relation: @@ -639,7 +696,7 @@ def plan(self, session: "SparkConnectClient") -> proto.Relation: plan = self._create_proto_relation() plan.filter.input.CopyFrom(self._child.plan(session)) plan.filter.condition.CopyFrom(self.filter.to_plan(session)) - return plan + return self._with_relations(plan, session) class Limit(LogicalPlan): @@ -712,11 +769,10 @@ def __init__( columns: List[Column], is_global: bool, ) -> None: - super().__init__(child) - assert all(isinstance(c, Column) for c in columns) assert isinstance(is_global, bool) + super().__init__(child, self._collect_references(columns)) self.columns = columns self.is_global = is_global @@ -726,7 +782,7 @@ def plan(self, session: 
"SparkConnectClient") -> proto.Relation: plan.sort.input.CopyFrom(self._child.plan(session)) plan.sort.order.extend([c.to_plan(session).sort_order for c in self.columns]) plan.sort.is_global = self.is_global - return plan + return self._with_relations(plan, session) class Drop(LogicalPlan): @@ -735,9 +791,12 @@ def __init__( child: Optional["LogicalPlan"], columns: List[Union[Column, str]], ) -> None: - super().__init__(child) if len(columns) > 0: assert all(isinstance(c, (Column, str)) for c in columns) + + super().__init__( + child, self._collect_references([c for c in columns if isinstance(c, Column)]) + ) self._columns = columns def plan(self, session: "SparkConnectClient") -> proto.Relation: @@ -749,7 +808,7 @@ def plan(self, session: "SparkConnectClient") -> proto.Relation: plan.drop.columns.append(c.to_plan(session)) else: plan.drop.column_names.append(c) - return plan + return self._with_relations(plan, session) class Sample(LogicalPlan): @@ -792,8 +851,6 @@ def __init__( pivot_values: Optional[Sequence[Column]], grouping_sets: Optional[Sequence[Sequence[Column]]], ) -> None: - super().__init__(child) - assert isinstance(group_type, str) and group_type in [ "groupby", "rollup", @@ -801,15 +858,12 @@ def __init__( "pivot", "grouping_sets", ] - self._group_type = group_type assert isinstance(grouping_cols, list) and all(isinstance(c, Column) for c in grouping_cols) - self._grouping_cols = grouping_cols assert isinstance(aggregate_cols, list) and all( isinstance(c, Column) for c in aggregate_cols ) - self._aggregate_cols = aggregate_cols if group_type == "pivot": assert pivot_col is not None and isinstance(pivot_col, Column) @@ -821,6 +875,19 @@ def __init__( assert pivot_values is None assert grouping_sets is None + super().__init__( + child, + self._collect_references( + grouping_cols + + aggregate_cols + + ([pivot_col] if pivot_col is not None else []) + + (pivot_values if pivot_values is not None else []) + + ([g for gs in grouping_sets for g in gs] if 
grouping_sets is not None else []) + ), + ) + self._group_type = group_type + self._grouping_cols = grouping_cols + self._aggregate_cols = aggregate_cols self._pivot_col = pivot_col self._pivot_values = pivot_values self._grouping_sets = grouping_sets @@ -859,7 +926,7 @@ def plan(self, session: "SparkConnectClient") -> proto.Relation: grouping_set=[c.to_plan(session) for c in grouping_set] ) ) - return plan + return self._with_relations(plan, session) class Join(LogicalPlan): @@ -870,7 +937,16 @@ def __init__( on: Optional[Union[str, List[str], Column, List[Column]]], how: Optional[str], ) -> None: - super().__init__(left) + super().__init__( + left, + self._collect_references( + [] + if on is None or isinstance(on, str) + else [on] + if isinstance(on, Column) + else [c for c in on if isinstance(c, Column)] + ), + ) self.left = cast(LogicalPlan, left) self.right = right self.on = on @@ -893,7 +969,35 @@ def __init__( else: raise AnalysisException( errorClass="UNSUPPORTED_JOIN_TYPE", - messageParameters={"join_type": how}, + messageParameters={ + "typ": how, + "supported": ( + "'" + + "', '".join( + [ + "inner", + "outer", + "full", + "fullouter", + "full_outer", + "leftouter", + "left", + "left_outer", + "rightouter", + "right", + "right_outer", + "leftsemi", + "left_semi", + "semi", + "leftanti", + "left_anti", + "anti", + "cross", + ] + ) + + "'" + ), + }, ) self.how = join_type @@ -914,7 +1018,7 @@ def plan(self, session: "SparkConnectClient") -> proto.Relation: merge_column = functools.reduce(lambda c1, c2: c1 & c2, self.on) plan.join.join_condition.CopyFrom(cast(Column, merge_column).to_plan(session)) plan.join.join_type = self.how - return plan + return self._with_relations(plan, session) @property def observations(self) -> Dict[str, "Observation"]: @@ -954,7 +1058,20 @@ def __init__( allow_exact_matches: bool, direction: str, ) -> None: - super().__init__(left) + super().__init__( + left, + self._collect_references( + [left_as_of, right_as_of] + + ( + [] + 
if on is None or isinstance(on, str) + else [on] + if isinstance(on, Column) + else [c for c in on if isinstance(c, Column)] + ) + + ([tolerance] if tolerance is not None else []) + ), + ) self.left = left self.right = right self.left_as_of = left_as_of @@ -994,7 +1111,7 @@ def plan(self, session: "SparkConnectClient") -> proto.Relation: plan.as_of_join.allow_exact_matches = self.allow_exact_matches plan.as_of_join.direction = self.direction - return plan + return self._with_relations(plan, session) @property def observations(self) -> Dict[str, "Observation"]: @@ -1028,6 +1145,74 @@ def _repr_html_(self) -> str: """ +class LateralJoin(LogicalPlan): + def __init__( + self, + left: Optional[LogicalPlan], + right: LogicalPlan, + on: Optional[Column], + how: Optional[str], + ) -> None: + super().__init__(left, self._collect_references([on] if on is not None else [])) + self.left = cast(LogicalPlan, left) + self.right = right + self.on = on + if how is None: + join_type = proto.Join.JoinType.JOIN_TYPE_INNER + elif how == "inner": + join_type = proto.Join.JoinType.JOIN_TYPE_INNER + elif how in ["leftouter", "left"]: + join_type = proto.Join.JoinType.JOIN_TYPE_LEFT_OUTER + elif how == "cross": + join_type = proto.Join.JoinType.JOIN_TYPE_CROSS + else: + raise AnalysisException( + errorClass="UNSUPPORTED_JOIN_TYPE", + messageParameters={ + "typ": how, + "supported": ( + "'" + + "', '".join(["inner", "leftouter", "left", "left_outer", "cross"]) + + "'" + ), + }, + ) + self.how = join_type + + def plan(self, session: "SparkConnectClient") -> proto.Relation: + plan = self._create_proto_relation() + plan.lateral_join.left.CopyFrom(self.left.plan(session)) + plan.lateral_join.right.CopyFrom(self.right.plan(session)) + if self.on is not None: + plan.lateral_join.join_condition.CopyFrom(self.on.to_plan(session)) + plan.lateral_join.join_type = self.how + return self._with_relations(plan, session) + + @property + def observations(self) -> Dict[str, "Observation"]: + return 
dict(**super().observations, **self.right.observations) + + def print(self, indent: int = 0) -> str: + i = " " * indent + o = " " * (indent + LogicalPlan.INDENT) + n = indent + LogicalPlan.INDENT * 2 + return ( + f"{i}\n{o}" + f"left=\n{self.left.print(n)}\n{o}right=\n{self.right.print(n)}" + ) + + def _repr_html_(self) -> str: + return f""" +
    +
  • + LateralJoin
    + Left: {self.left._repr_html_()} + Right: {self.right._repr_html_()} +
  • +
+ """ + + class SetOperation(LogicalPlan): def __init__( self, @@ -1129,9 +1314,9 @@ def __init__( num_partitions: Optional[int], columns: List[Column], ) -> None: - super().__init__(child) - self.num_partitions = num_partitions assert all(isinstance(c, Column) for c in columns) + super().__init__(child, self._collect_references(columns)) + self.num_partitions = num_partitions self.columns = columns def plan(self, session: "SparkConnectClient") -> proto.Relation: @@ -1144,7 +1329,7 @@ def plan(self, session: "SparkConnectClient") -> proto.Relation: plan.repartition_by_expression.input.CopyFrom(self._child.plan(session)) if self.num_partitions is not None: plan.repartition_by_expression.num_partitions = self.num_partitions - return plan + return self._with_relations(plan, session) class SubqueryAlias(LogicalPlan): @@ -1190,8 +1375,6 @@ def __init__( named_args: Optional[Dict[str, Column]] = None, views: Optional[Sequence[SubqueryAlias]] = None, ) -> None: - super().__init__(None) - if args is not None: assert isinstance(args, List) assert all(isinstance(arg, Column) for arg in args) @@ -1205,10 +1388,8 @@ def __init__( if views is not None: assert isinstance(views, List) assert all(isinstance(v, SubqueryAlias) for v in views) - if len(views) > 0: - # reserved plan id for WithRelations - self._plan_id_with_rel = LogicalPlan._fresh_plan_id() + super().__init__(None, views) self._query = query self._args = args self._named_args = named_args @@ -1224,20 +1405,7 @@ def plan(self, session: "SparkConnectClient") -> proto.Relation: for k, arg in self._named_args.items(): plan.sql.named_arguments[k].CopyFrom(arg.to_plan(session)) - if self._views is not None and len(self._views) > 0: - # build new plan like - # with_relations [id 10] - # root: sql [id 9] - # reference: - # view#1: [id 8] - # view#2: [id 5] - sql_plan = plan - plan = proto.Relation() - plan.common.plan_id = self._plan_id_with_rel - plan.with_relations.root.CopyFrom(sql_plan) - 
plan.with_relations.references.extend([v.plan(session) for v in self._views]) - - return plan + return self._with_relations(plan, session) def command(self, session: "SparkConnectClient") -> proto.Command: cmd = proto.Command() @@ -1311,7 +1479,7 @@ def __init__( variable_column_name: str, value_column_name: str, ) -> None: - super().__init__(child) + super().__init__(child, self._collect_references(ids + (values or []))) self.ids = ids self.values = values self.variable_column_name = variable_column_name @@ -1326,7 +1494,7 @@ def plan(self, session: "SparkConnectClient") -> proto.Relation: plan.unpivot.values.values.extend([v.to_plan(session) for v in self.values]) plan.unpivot.variable_column_name = self.variable_column_name plan.unpivot.value_column_name = self.value_column_name - return plan + return self._with_relations(plan, session) class Transpose(LogicalPlan): @@ -1337,7 +1505,7 @@ def __init__( child: Optional["LogicalPlan"], index_columns: Sequence[Column], ) -> None: - super().__init__(child) + super().__init__(child, self._collect_references(index_columns)) self.index_columns = index_columns def plan(self, session: "SparkConnectClient") -> proto.Relation: @@ -1347,12 +1515,12 @@ def plan(self, session: "SparkConnectClient") -> proto.Relation: if self.index_columns is not None and len(self.index_columns) > 0: for index_column in self.index_columns: plan.transpose.index_columns.append(index_column.to_plan(session)) - return plan + return self._with_relations(plan, session) class UnresolvedTableValuedFunction(LogicalPlan): def __init__(self, name: str, args: Sequence[Column]): - super().__init__(None) + super().__init__(None, self._collect_references(args)) self._name = name self._args = args @@ -1361,7 +1529,7 @@ def plan(self, session: "SparkConnectClient") -> proto.Relation: plan.unresolved_table_valued_function.function_name = self._name for arg in self._args: plan.unresolved_table_valued_function.arguments.append(arg.to_plan(session)) - return plan + 
return self._with_relations(plan, session) class CollectMetrics(LogicalPlan): @@ -1373,9 +1541,9 @@ def __init__( observation: Union[str, "Observation"], exprs: List[Column], ) -> None: - super().__init__(child) - self._observation = observation assert all(isinstance(e, Column) for e in exprs) + super().__init__(child, self._collect_references(exprs)) + self._observation = observation self._exprs = exprs def plan(self, session: "SparkConnectClient") -> proto.Relation: @@ -1388,7 +1556,7 @@ def plan(self, session: "SparkConnectClient") -> proto.Relation: else str(self._observation._name) ) plan.collect_metrics.metrics.extend([e.to_plan(session) for e in self._exprs]) - return plan + return self._with_relations(plan, session) @property def observations(self) -> Dict[str, "Observation"]: @@ -1473,13 +1641,13 @@ def __init__( cols: Optional[List[str]], replacements: Sequence[Tuple[Column, Column]], ) -> None: - super().__init__(child) - self.cols = cols - assert replacements is not None and isinstance(replacements, List) for k, v in replacements: assert k is not None and isinstance(k, Column) assert v is not None and isinstance(v, Column) + + super().__init__(child, self._collect_references([e for t in replacements for e in t])) + self.cols = cols self.replacements = replacements def plan(self, session: "SparkConnectClient") -> proto.Relation: @@ -1494,7 +1662,7 @@ def plan(self, session: "SparkConnectClient") -> proto.Relation: replacement.old_value.CopyFrom(old_value.to_plan(session).literal) replacement.new_value.CopyFrom(new_value.to_plan(session).literal) plan.replace.replacements.append(replacement) - return plan + return self._with_relations(plan, session) class StatSummary(LogicalPlan): @@ -1604,8 +1772,6 @@ def __init__( fractions: Sequence[Tuple[Column, float]], seed: int, ) -> None: - super().__init__(child) - assert col is not None and isinstance(col, (Column, str)) assert fractions is not None and isinstance(fractions, List) @@ -1615,6 +1781,12 @@ def 
__init__( assert seed is None or isinstance(seed, int) + super().__init__( + child, + self._collect_references( + [col] if isinstance(col, Column) else [] + [c for c, _ in fractions] + ), + ) self._col = col self._fractions = fractions self._seed = seed @@ -1631,7 +1803,7 @@ def plan(self, session: "SparkConnectClient") -> proto.Relation: fraction.fraction = float(v) plan.sample_by.fractions.append(fraction) plan.sample_by.seed = self._seed - return plan + return self._with_relations(plan, session) class StatCorr(LogicalPlan): @@ -2279,7 +2451,7 @@ def __init__( ): assert isinstance(grouping_cols, list) and all(isinstance(c, Column) for c in grouping_cols) - super().__init__(child) + super().__init__(child, self._collect_references(grouping_cols)) self._grouping_cols = grouping_cols self._function = function._build_common_inline_user_defined_function(*cols) @@ -2291,7 +2463,7 @@ def plan(self, session: "SparkConnectClient") -> proto.Relation: [c.to_plan(session) for c in self._grouping_cols] ) plan.group_map.func.CopyFrom(self._function.to_plan_udf(session)) - return plan + return self._with_relations(plan, session) class CoGroupMap(LogicalPlan): @@ -2312,7 +2484,7 @@ def __init__( isinstance(c, Column) for c in other_grouping_cols ) - super().__init__(input) + super().__init__(input, self._collect_references(input_grouping_cols + other_grouping_cols)) self._input_grouping_cols = input_grouping_cols self._other_grouping_cols = other_grouping_cols self._other = cast(LogicalPlan, other) @@ -2332,7 +2504,7 @@ def plan(self, session: "SparkConnectClient") -> proto.Relation: [c.to_plan(session) for c in self._other_grouping_cols] ) plan.co_group_map.func.CopyFrom(self._function.to_plan_udf(session)) - return plan + return self._with_relations(plan, session) class ApplyInPandasWithState(LogicalPlan): @@ -2351,7 +2523,7 @@ def __init__( ): assert isinstance(grouping_cols, list) and all(isinstance(c, Column) for c in grouping_cols) - super().__init__(child) + 
super().__init__(child, self._collect_references(grouping_cols)) self._grouping_cols = grouping_cols self._function = function._build_common_inline_user_defined_function(*cols) self._output_schema = output_schema @@ -2371,7 +2543,7 @@ def plan(self, session: "SparkConnectClient") -> proto.Relation: plan.apply_in_pandas_with_state.state_schema = self._state_schema plan.apply_in_pandas_with_state.output_mode = self._output_mode plan.apply_in_pandas_with_state.timeout_conf = self._timeout_conf - return plan + return self._with_relations(plan, session) class PythonUDTF: @@ -2435,7 +2607,7 @@ def __init__( deterministic: bool, arguments: Sequence[Expression], ) -> None: - super().__init__(None) + super().__init__(None, self._collect_references(arguments)) self._function_name = function_name self._deterministic = deterministic self._arguments = arguments @@ -2452,7 +2624,7 @@ def plan(self, session: "SparkConnectClient") -> proto.Relation: plan.common_inline_user_defined_table_function.python_udtf.CopyFrom( self._function.to_plan(session) ) - return plan + return self._with_relations(plan, session) def udtf_plan( self, session: "SparkConnectClient" diff --git a/python/pyspark/sql/connect/proto/__init__.py b/python/pyspark/sql/connect/proto/__init__.py index 3e8d074d963dc..0877696c2680e 100644 --- a/python/pyspark/sql/connect/proto/__init__.py +++ b/python/pyspark/sql/connect/proto/__init__.py @@ -23,3 +23,5 @@ from pyspark.sql.connect.proto.relations_pb2 import * from pyspark.sql.connect.proto.catalog_pb2 import * from pyspark.sql.connect.proto.common_pb2 import * +from pyspark.sql.connect.proto.ml_pb2 import * +from pyspark.sql.connect.proto.ml_common_pb2 import * diff --git a/python/pyspark/sql/connect/proto/base_pb2.py b/python/pyspark/sql/connect/proto/base_pb2.py index 620f413f62c00..2fbc4287db786 100644 --- a/python/pyspark/sql/connect/proto/base_pb2.py +++ b/python/pyspark/sql/connect/proto/base_pb2.py @@ -40,10 +40,11 @@ from pyspark.sql.connect.proto import 
expressions_pb2 as spark_dot_connect_dot_expressions__pb2 from pyspark.sql.connect.proto import relations_pb2 as spark_dot_connect_dot_relations__pb2 from pyspark.sql.connect.proto import types_pb2 as spark_dot_connect_dot_types__pb2 +from pyspark.sql.connect.proto import ml_pb2 as spark_dot_connect_dot_ml__pb2 DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile( - b'\n\x18spark/connect/base.proto\x12\rspark.connect\x1a\x19google/protobuf/any.proto\x1a\x1cspark/connect/commands.proto\x1a\x1aspark/connect/common.proto\x1a\x1fspark/connect/expressions.proto\x1a\x1dspark/connect/relations.proto\x1a\x19spark/connect/types.proto"t\n\x04Plan\x12-\n\x04root\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationH\x00R\x04root\x12\x32\n\x07\x63ommand\x18\x02 \x01(\x0b\x32\x16.spark.connect.CommandH\x00R\x07\x63ommandB\t\n\x07op_type"z\n\x0bUserContext\x12\x17\n\x07user_id\x18\x01 \x01(\tR\x06userId\x12\x1b\n\tuser_name\x18\x02 \x01(\tR\x08userName\x12\x35\n\nextensions\x18\xe7\x07 \x03(\x0b\x32\x14.google.protobuf.AnyR\nextensions"\xf8\x13\n\x12\x41nalyzePlanRequest\x12\x1d\n\nsession_id\x18\x01 \x01(\tR\tsessionId\x12V\n&client_observed_server_side_session_id\x18\x11 \x01(\tH\x01R!clientObservedServerSideSessionId\x88\x01\x01\x12=\n\x0cuser_context\x18\x02 \x01(\x0b\x32\x1a.spark.connect.UserContextR\x0buserContext\x12$\n\x0b\x63lient_type\x18\x03 \x01(\tH\x02R\nclientType\x88\x01\x01\x12\x42\n\x06schema\x18\x04 \x01(\x0b\x32(.spark.connect.AnalyzePlanRequest.SchemaH\x00R\x06schema\x12\x45\n\x07\x65xplain\x18\x05 \x01(\x0b\x32).spark.connect.AnalyzePlanRequest.ExplainH\x00R\x07\x65xplain\x12O\n\x0btree_string\x18\x06 \x01(\x0b\x32,.spark.connect.AnalyzePlanRequest.TreeStringH\x00R\ntreeString\x12\x46\n\x08is_local\x18\x07 \x01(\x0b\x32).spark.connect.AnalyzePlanRequest.IsLocalH\x00R\x07isLocal\x12R\n\x0cis_streaming\x18\x08 \x01(\x0b\x32-.spark.connect.AnalyzePlanRequest.IsStreamingH\x00R\x0bisStreaming\x12O\n\x0binput_files\x18\t 
\x01(\x0b\x32,.spark.connect.AnalyzePlanRequest.InputFilesH\x00R\ninputFiles\x12U\n\rspark_version\x18\n \x01(\x0b\x32..spark.connect.AnalyzePlanRequest.SparkVersionH\x00R\x0csparkVersion\x12I\n\tddl_parse\x18\x0b \x01(\x0b\x32*.spark.connect.AnalyzePlanRequest.DDLParseH\x00R\x08\x64\x64lParse\x12X\n\x0esame_semantics\x18\x0c \x01(\x0b\x32/.spark.connect.AnalyzePlanRequest.SameSemanticsH\x00R\rsameSemantics\x12U\n\rsemantic_hash\x18\r \x01(\x0b\x32..spark.connect.AnalyzePlanRequest.SemanticHashH\x00R\x0csemanticHash\x12\x45\n\x07persist\x18\x0e \x01(\x0b\x32).spark.connect.AnalyzePlanRequest.PersistH\x00R\x07persist\x12K\n\tunpersist\x18\x0f \x01(\x0b\x32+.spark.connect.AnalyzePlanRequest.UnpersistH\x00R\tunpersist\x12_\n\x11get_storage_level\x18\x10 \x01(\x0b\x32\x31.spark.connect.AnalyzePlanRequest.GetStorageLevelH\x00R\x0fgetStorageLevel\x1a\x31\n\x06Schema\x12\'\n\x04plan\x18\x01 \x01(\x0b\x32\x13.spark.connect.PlanR\x04plan\x1a\xbb\x02\n\x07\x45xplain\x12\'\n\x04plan\x18\x01 \x01(\x0b\x32\x13.spark.connect.PlanR\x04plan\x12X\n\x0c\x65xplain_mode\x18\x02 \x01(\x0e\x32\x35.spark.connect.AnalyzePlanRequest.Explain.ExplainModeR\x0b\x65xplainMode"\xac\x01\n\x0b\x45xplainMode\x12\x1c\n\x18\x45XPLAIN_MODE_UNSPECIFIED\x10\x00\x12\x17\n\x13\x45XPLAIN_MODE_SIMPLE\x10\x01\x12\x19\n\x15\x45XPLAIN_MODE_EXTENDED\x10\x02\x12\x18\n\x14\x45XPLAIN_MODE_CODEGEN\x10\x03\x12\x15\n\x11\x45XPLAIN_MODE_COST\x10\x04\x12\x1a\n\x16\x45XPLAIN_MODE_FORMATTED\x10\x05\x1aZ\n\nTreeString\x12\'\n\x04plan\x18\x01 \x01(\x0b\x32\x13.spark.connect.PlanR\x04plan\x12\x19\n\x05level\x18\x02 \x01(\x05H\x00R\x05level\x88\x01\x01\x42\x08\n\x06_level\x1a\x32\n\x07IsLocal\x12\'\n\x04plan\x18\x01 \x01(\x0b\x32\x13.spark.connect.PlanR\x04plan\x1a\x36\n\x0bIsStreaming\x12\'\n\x04plan\x18\x01 \x01(\x0b\x32\x13.spark.connect.PlanR\x04plan\x1a\x35\n\nInputFiles\x12\'\n\x04plan\x18\x01 \x01(\x0b\x32\x13.spark.connect.PlanR\x04plan\x1a\x0e\n\x0cSparkVersion\x1a)\n\x08\x44\x44LParse\x12\x1d\n\nddl_string\x18\x01 
\x01(\tR\tddlString\x1ay\n\rSameSemantics\x12\x34\n\x0btarget_plan\x18\x01 \x01(\x0b\x32\x13.spark.connect.PlanR\ntargetPlan\x12\x32\n\nother_plan\x18\x02 \x01(\x0b\x32\x13.spark.connect.PlanR\totherPlan\x1a\x37\n\x0cSemanticHash\x12\'\n\x04plan\x18\x01 \x01(\x0b\x32\x13.spark.connect.PlanR\x04plan\x1a\x97\x01\n\x07Persist\x12\x33\n\x08relation\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x08relation\x12\x45\n\rstorage_level\x18\x02 \x01(\x0b\x32\x1b.spark.connect.StorageLevelH\x00R\x0cstorageLevel\x88\x01\x01\x42\x10\n\x0e_storage_level\x1an\n\tUnpersist\x12\x33\n\x08relation\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x08relation\x12\x1f\n\x08\x62locking\x18\x02 \x01(\x08H\x00R\x08\x62locking\x88\x01\x01\x42\x0b\n\t_blocking\x1a\x46\n\x0fGetStorageLevel\x12\x33\n\x08relation\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x08relationB\t\n\x07\x61nalyzeB)\n\'_client_observed_server_side_session_idB\x0e\n\x0c_client_type"\xce\r\n\x13\x41nalyzePlanResponse\x12\x1d\n\nsession_id\x18\x01 \x01(\tR\tsessionId\x12\x33\n\x16server_side_session_id\x18\x0f \x01(\tR\x13serverSideSessionId\x12\x43\n\x06schema\x18\x02 \x01(\x0b\x32).spark.connect.AnalyzePlanResponse.SchemaH\x00R\x06schema\x12\x46\n\x07\x65xplain\x18\x03 \x01(\x0b\x32*.spark.connect.AnalyzePlanResponse.ExplainH\x00R\x07\x65xplain\x12P\n\x0btree_string\x18\x04 \x01(\x0b\x32-.spark.connect.AnalyzePlanResponse.TreeStringH\x00R\ntreeString\x12G\n\x08is_local\x18\x05 \x01(\x0b\x32*.spark.connect.AnalyzePlanResponse.IsLocalH\x00R\x07isLocal\x12S\n\x0cis_streaming\x18\x06 \x01(\x0b\x32..spark.connect.AnalyzePlanResponse.IsStreamingH\x00R\x0bisStreaming\x12P\n\x0binput_files\x18\x07 \x01(\x0b\x32-.spark.connect.AnalyzePlanResponse.InputFilesH\x00R\ninputFiles\x12V\n\rspark_version\x18\x08 \x01(\x0b\x32/.spark.connect.AnalyzePlanResponse.SparkVersionH\x00R\x0csparkVersion\x12J\n\tddl_parse\x18\t \x01(\x0b\x32+.spark.connect.AnalyzePlanResponse.DDLParseH\x00R\x08\x64\x64lParse\x12Y\n\x0esame_semantics\x18\n 
\x01(\x0b\x32\x30.spark.connect.AnalyzePlanResponse.SameSemanticsH\x00R\rsameSemantics\x12V\n\rsemantic_hash\x18\x0b \x01(\x0b\x32/.spark.connect.AnalyzePlanResponse.SemanticHashH\x00R\x0csemanticHash\x12\x46\n\x07persist\x18\x0c \x01(\x0b\x32*.spark.connect.AnalyzePlanResponse.PersistH\x00R\x07persist\x12L\n\tunpersist\x18\r \x01(\x0b\x32,.spark.connect.AnalyzePlanResponse.UnpersistH\x00R\tunpersist\x12`\n\x11get_storage_level\x18\x0e \x01(\x0b\x32\x32.spark.connect.AnalyzePlanResponse.GetStorageLevelH\x00R\x0fgetStorageLevel\x1a\x39\n\x06Schema\x12/\n\x06schema\x18\x01 \x01(\x0b\x32\x17.spark.connect.DataTypeR\x06schema\x1a\x30\n\x07\x45xplain\x12%\n\x0e\x65xplain_string\x18\x01 \x01(\tR\rexplainString\x1a-\n\nTreeString\x12\x1f\n\x0btree_string\x18\x01 \x01(\tR\ntreeString\x1a$\n\x07IsLocal\x12\x19\n\x08is_local\x18\x01 \x01(\x08R\x07isLocal\x1a\x30\n\x0bIsStreaming\x12!\n\x0cis_streaming\x18\x01 \x01(\x08R\x0bisStreaming\x1a"\n\nInputFiles\x12\x14\n\x05\x66iles\x18\x01 \x03(\tR\x05\x66iles\x1a(\n\x0cSparkVersion\x12\x18\n\x07version\x18\x01 \x01(\tR\x07version\x1a;\n\x08\x44\x44LParse\x12/\n\x06parsed\x18\x01 \x01(\x0b\x32\x17.spark.connect.DataTypeR\x06parsed\x1a\'\n\rSameSemantics\x12\x16\n\x06result\x18\x01 \x01(\x08R\x06result\x1a&\n\x0cSemanticHash\x12\x16\n\x06result\x18\x01 \x01(\x05R\x06result\x1a\t\n\x07Persist\x1a\x0b\n\tUnpersist\x1aS\n\x0fGetStorageLevel\x12@\n\rstorage_level\x18\x01 \x01(\x0b\x32\x1b.spark.connect.StorageLevelR\x0cstorageLevelB\x08\n\x06result"\xa3\x05\n\x12\x45xecutePlanRequest\x12\x1d\n\nsession_id\x18\x01 \x01(\tR\tsessionId\x12V\n&client_observed_server_side_session_id\x18\x08 \x01(\tH\x00R!clientObservedServerSideSessionId\x88\x01\x01\x12=\n\x0cuser_context\x18\x02 \x01(\x0b\x32\x1a.spark.connect.UserContextR\x0buserContext\x12&\n\x0coperation_id\x18\x06 \x01(\tH\x01R\x0boperationId\x88\x01\x01\x12\'\n\x04plan\x18\x03 \x01(\x0b\x32\x13.spark.connect.PlanR\x04plan\x12$\n\x0b\x63lient_type\x18\x04 
\x01(\tH\x02R\nclientType\x88\x01\x01\x12X\n\x0frequest_options\x18\x05 \x03(\x0b\x32/.spark.connect.ExecutePlanRequest.RequestOptionR\x0erequestOptions\x12\x12\n\x04tags\x18\x07 \x03(\tR\x04tags\x1a\xa5\x01\n\rRequestOption\x12K\n\x10reattach_options\x18\x01 \x01(\x0b\x32\x1e.spark.connect.ReattachOptionsH\x00R\x0freattachOptions\x12\x35\n\textension\x18\xe7\x07 \x01(\x0b\x32\x14.google.protobuf.AnyH\x00R\textensionB\x10\n\x0erequest_optionB)\n\'_client_observed_server_side_session_idB\x0f\n\r_operation_idB\x0e\n\x0c_client_type"\xe6\x16\n\x13\x45xecutePlanResponse\x12\x1d\n\nsession_id\x18\x01 \x01(\tR\tsessionId\x12\x33\n\x16server_side_session_id\x18\x0f \x01(\tR\x13serverSideSessionId\x12!\n\x0coperation_id\x18\x0c \x01(\tR\x0boperationId\x12\x1f\n\x0bresponse_id\x18\r \x01(\tR\nresponseId\x12P\n\x0b\x61rrow_batch\x18\x02 \x01(\x0b\x32-.spark.connect.ExecutePlanResponse.ArrowBatchH\x00R\narrowBatch\x12\x63\n\x12sql_command_result\x18\x05 \x01(\x0b\x32\x33.spark.connect.ExecutePlanResponse.SqlCommandResultH\x00R\x10sqlCommandResult\x12~\n#write_stream_operation_start_result\x18\x08 \x01(\x0b\x32..spark.connect.WriteStreamOperationStartResultH\x00R\x1fwriteStreamOperationStartResult\x12q\n\x1estreaming_query_command_result\x18\t \x01(\x0b\x32*.spark.connect.StreamingQueryCommandResultH\x00R\x1bstreamingQueryCommandResult\x12k\n\x1cget_resources_command_result\x18\n \x01(\x0b\x32(.spark.connect.GetResourcesCommandResultH\x00R\x19getResourcesCommandResult\x12\x87\x01\n&streaming_query_manager_command_result\x18\x0b \x01(\x0b\x32\x31.spark.connect.StreamingQueryManagerCommandResultH\x00R"streamingQueryManagerCommandResult\x12\x87\x01\n&streaming_query_listener_events_result\x18\x10 \x01(\x0b\x32\x31.spark.connect.StreamingQueryListenerEventsResultH\x00R"streamingQueryListenerEventsResult\x12\\\n\x0fresult_complete\x18\x0e 
\x01(\x0b\x32\x31.spark.connect.ExecutePlanResponse.ResultCompleteH\x00R\x0eresultComplete\x12\x87\x01\n&create_resource_profile_command_result\x18\x11 \x01(\x0b\x32\x31.spark.connect.CreateResourceProfileCommandResultH\x00R"createResourceProfileCommandResult\x12\x65\n\x12\x65xecution_progress\x18\x12 \x01(\x0b\x32\x34.spark.connect.ExecutePlanResponse.ExecutionProgressH\x00R\x11\x65xecutionProgress\x12\x64\n\x19\x63heckpoint_command_result\x18\x13 \x01(\x0b\x32&.spark.connect.CheckpointCommandResultH\x00R\x17\x63heckpointCommandResult\x12\x35\n\textension\x18\xe7\x07 \x01(\x0b\x32\x14.google.protobuf.AnyH\x00R\textension\x12\x44\n\x07metrics\x18\x04 \x01(\x0b\x32*.spark.connect.ExecutePlanResponse.MetricsR\x07metrics\x12]\n\x10observed_metrics\x18\x06 \x03(\x0b\x32\x32.spark.connect.ExecutePlanResponse.ObservedMetricsR\x0fobservedMetrics\x12/\n\x06schema\x18\x07 \x01(\x0b\x32\x17.spark.connect.DataTypeR\x06schema\x1aG\n\x10SqlCommandResult\x12\x33\n\x08relation\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x08relation\x1av\n\nArrowBatch\x12\x1b\n\trow_count\x18\x01 \x01(\x03R\x08rowCount\x12\x12\n\x04\x64\x61ta\x18\x02 \x01(\x0cR\x04\x64\x61ta\x12&\n\x0cstart_offset\x18\x03 \x01(\x03H\x00R\x0bstartOffset\x88\x01\x01\x42\x0f\n\r_start_offset\x1a\x85\x04\n\x07Metrics\x12Q\n\x07metrics\x18\x01 \x03(\x0b\x32\x37.spark.connect.ExecutePlanResponse.Metrics.MetricObjectR\x07metrics\x1a\xcc\x02\n\x0cMetricObject\x12\x12\n\x04name\x18\x01 \x01(\tR\x04name\x12\x17\n\x07plan_id\x18\x02 \x01(\x03R\x06planId\x12\x16\n\x06parent\x18\x03 \x01(\x03R\x06parent\x12z\n\x11\x65xecution_metrics\x18\x04 \x03(\x0b\x32M.spark.connect.ExecutePlanResponse.Metrics.MetricObject.ExecutionMetricsEntryR\x10\x65xecutionMetrics\x1a{\n\x15\x45xecutionMetricsEntry\x12\x10\n\x03key\x18\x01 \x01(\tR\x03key\x12L\n\x05value\x18\x02 \x01(\x0b\x32\x36.spark.connect.ExecutePlanResponse.Metrics.MetricValueR\x05value:\x02\x38\x01\x1aX\n\x0bMetricValue\x12\x12\n\x04name\x18\x01 
\x01(\tR\x04name\x12\x14\n\x05value\x18\x02 \x01(\x03R\x05value\x12\x1f\n\x0bmetric_type\x18\x03 \x01(\tR\nmetricType\x1a\x8d\x01\n\x0fObservedMetrics\x12\x12\n\x04name\x18\x01 \x01(\tR\x04name\x12\x39\n\x06values\x18\x02 \x03(\x0b\x32!.spark.connect.Expression.LiteralR\x06values\x12\x12\n\x04keys\x18\x03 \x03(\tR\x04keys\x12\x17\n\x07plan_id\x18\x04 \x01(\x03R\x06planId\x1a\x10\n\x0eResultComplete\x1a\xcd\x02\n\x11\x45xecutionProgress\x12V\n\x06stages\x18\x01 \x03(\x0b\x32>.spark.connect.ExecutePlanResponse.ExecutionProgress.StageInfoR\x06stages\x12,\n\x12num_inflight_tasks\x18\x02 \x01(\x03R\x10numInflightTasks\x1a\xb1\x01\n\tStageInfo\x12\x19\n\x08stage_id\x18\x01 \x01(\x03R\x07stageId\x12\x1b\n\tnum_tasks\x18\x02 \x01(\x03R\x08numTasks\x12.\n\x13num_completed_tasks\x18\x03 \x01(\x03R\x11numCompletedTasks\x12(\n\x10input_bytes_read\x18\x04 \x01(\x03R\x0einputBytesRead\x12\x12\n\x04\x64one\x18\x05 \x01(\x08R\x04\x64oneB\x0f\n\rresponse_type"A\n\x08KeyValue\x12\x10\n\x03key\x18\x01 \x01(\tR\x03key\x12\x19\n\x05value\x18\x02 \x01(\tH\x00R\x05value\x88\x01\x01\x42\x08\n\x06_value"\x87\t\n\rConfigRequest\x12\x1d\n\nsession_id\x18\x01 \x01(\tR\tsessionId\x12V\n&client_observed_server_side_session_id\x18\x08 \x01(\tH\x00R!clientObservedServerSideSessionId\x88\x01\x01\x12=\n\x0cuser_context\x18\x02 \x01(\x0b\x32\x1a.spark.connect.UserContextR\x0buserContext\x12\x44\n\toperation\x18\x03 \x01(\x0b\x32&.spark.connect.ConfigRequest.OperationR\toperation\x12$\n\x0b\x63lient_type\x18\x04 \x01(\tH\x01R\nclientType\x88\x01\x01\x1a\xf2\x03\n\tOperation\x12\x34\n\x03set\x18\x01 \x01(\x0b\x32 .spark.connect.ConfigRequest.SetH\x00R\x03set\x12\x34\n\x03get\x18\x02 \x01(\x0b\x32 .spark.connect.ConfigRequest.GetH\x00R\x03get\x12W\n\x10get_with_default\x18\x03 \x01(\x0b\x32+.spark.connect.ConfigRequest.GetWithDefaultH\x00R\x0egetWithDefault\x12G\n\nget_option\x18\x04 \x01(\x0b\x32&.spark.connect.ConfigRequest.GetOptionH\x00R\tgetOption\x12>\n\x07get_all\x18\x05 
\x01(\x0b\x32#.spark.connect.ConfigRequest.GetAllH\x00R\x06getAll\x12:\n\x05unset\x18\x06 \x01(\x0b\x32".spark.connect.ConfigRequest.UnsetH\x00R\x05unset\x12P\n\ris_modifiable\x18\x07 \x01(\x0b\x32).spark.connect.ConfigRequest.IsModifiableH\x00R\x0cisModifiableB\t\n\x07op_type\x1a\x34\n\x03Set\x12-\n\x05pairs\x18\x01 \x03(\x0b\x32\x17.spark.connect.KeyValueR\x05pairs\x1a\x19\n\x03Get\x12\x12\n\x04keys\x18\x01 \x03(\tR\x04keys\x1a?\n\x0eGetWithDefault\x12-\n\x05pairs\x18\x01 \x03(\x0b\x32\x17.spark.connect.KeyValueR\x05pairs\x1a\x1f\n\tGetOption\x12\x12\n\x04keys\x18\x01 \x03(\tR\x04keys\x1a\x30\n\x06GetAll\x12\x1b\n\x06prefix\x18\x01 \x01(\tH\x00R\x06prefix\x88\x01\x01\x42\t\n\x07_prefix\x1a\x1b\n\x05Unset\x12\x12\n\x04keys\x18\x01 \x03(\tR\x04keys\x1a"\n\x0cIsModifiable\x12\x12\n\x04keys\x18\x01 \x03(\tR\x04keysB)\n\'_client_observed_server_side_session_idB\x0e\n\x0c_client_type"\xaf\x01\n\x0e\x43onfigResponse\x12\x1d\n\nsession_id\x18\x01 \x01(\tR\tsessionId\x12\x33\n\x16server_side_session_id\x18\x04 \x01(\tR\x13serverSideSessionId\x12-\n\x05pairs\x18\x02 \x03(\x0b\x32\x17.spark.connect.KeyValueR\x05pairs\x12\x1a\n\x08warnings\x18\x03 \x03(\tR\x08warnings"\xea\x07\n\x13\x41\x64\x64\x41rtifactsRequest\x12\x1d\n\nsession_id\x18\x01 \x01(\tR\tsessionId\x12=\n\x0cuser_context\x18\x02 \x01(\x0b\x32\x1a.spark.connect.UserContextR\x0buserContext\x12V\n&client_observed_server_side_session_id\x18\x07 \x01(\tH\x01R!clientObservedServerSideSessionId\x88\x01\x01\x12$\n\x0b\x63lient_type\x18\x06 \x01(\tH\x02R\nclientType\x88\x01\x01\x12@\n\x05\x62\x61tch\x18\x03 \x01(\x0b\x32(.spark.connect.AddArtifactsRequest.BatchH\x00R\x05\x62\x61tch\x12Z\n\x0b\x62\x65gin_chunk\x18\x04 \x01(\x0b\x32\x37.spark.connect.AddArtifactsRequest.BeginChunkedArtifactH\x00R\nbeginChunk\x12H\n\x05\x63hunk\x18\x05 \x01(\x0b\x32\x30.spark.connect.AddArtifactsRequest.ArtifactChunkH\x00R\x05\x63hunk\x1a\x35\n\rArtifactChunk\x12\x12\n\x04\x64\x61ta\x18\x01 
\x01(\x0cR\x04\x64\x61ta\x12\x10\n\x03\x63rc\x18\x02 \x01(\x03R\x03\x63rc\x1ao\n\x13SingleChunkArtifact\x12\x12\n\x04name\x18\x01 \x01(\tR\x04name\x12\x44\n\x04\x64\x61ta\x18\x02 \x01(\x0b\x32\x30.spark.connect.AddArtifactsRequest.ArtifactChunkR\x04\x64\x61ta\x1a]\n\x05\x42\x61tch\x12T\n\tartifacts\x18\x01 \x03(\x0b\x32\x36.spark.connect.AddArtifactsRequest.SingleChunkArtifactR\tartifacts\x1a\xc1\x01\n\x14\x42\x65ginChunkedArtifact\x12\x12\n\x04name\x18\x01 \x01(\tR\x04name\x12\x1f\n\x0btotal_bytes\x18\x02 \x01(\x03R\ntotalBytes\x12\x1d\n\nnum_chunks\x18\x03 \x01(\x03R\tnumChunks\x12U\n\rinitial_chunk\x18\x04 \x01(\x0b\x32\x30.spark.connect.AddArtifactsRequest.ArtifactChunkR\x0cinitialChunkB\t\n\x07payloadB)\n\'_client_observed_server_side_session_idB\x0e\n\x0c_client_type"\x90\x02\n\x14\x41\x64\x64\x41rtifactsResponse\x12\x1d\n\nsession_id\x18\x02 \x01(\tR\tsessionId\x12\x33\n\x16server_side_session_id\x18\x03 \x01(\tR\x13serverSideSessionId\x12Q\n\tartifacts\x18\x01 \x03(\x0b\x32\x33.spark.connect.AddArtifactsResponse.ArtifactSummaryR\tartifacts\x1aQ\n\x0f\x41rtifactSummary\x12\x12\n\x04name\x18\x01 \x01(\tR\x04name\x12*\n\x11is_crc_successful\x18\x02 \x01(\x08R\x0fisCrcSuccessful"\xc6\x02\n\x17\x41rtifactStatusesRequest\x12\x1d\n\nsession_id\x18\x01 \x01(\tR\tsessionId\x12V\n&client_observed_server_side_session_id\x18\x05 \x01(\tH\x00R!clientObservedServerSideSessionId\x88\x01\x01\x12=\n\x0cuser_context\x18\x02 \x01(\x0b\x32\x1a.spark.connect.UserContextR\x0buserContext\x12$\n\x0b\x63lient_type\x18\x03 \x01(\tH\x01R\nclientType\x88\x01\x01\x12\x14\n\x05names\x18\x04 \x03(\tR\x05namesB)\n\'_client_observed_server_side_session_idB\x0e\n\x0c_client_type"\xe0\x02\n\x18\x41rtifactStatusesResponse\x12\x1d\n\nsession_id\x18\x02 \x01(\tR\tsessionId\x12\x33\n\x16server_side_session_id\x18\x03 \x01(\tR\x13serverSideSessionId\x12Q\n\x08statuses\x18\x01 
\x03(\x0b\x32\x35.spark.connect.ArtifactStatusesResponse.StatusesEntryR\x08statuses\x1as\n\rStatusesEntry\x12\x10\n\x03key\x18\x01 \x01(\tR\x03key\x12L\n\x05value\x18\x02 \x01(\x0b\x32\x36.spark.connect.ArtifactStatusesResponse.ArtifactStatusR\x05value:\x02\x38\x01\x1a(\n\x0e\x41rtifactStatus\x12\x16\n\x06\x65xists\x18\x01 \x01(\x08R\x06\x65xists"\xdb\x04\n\x10InterruptRequest\x12\x1d\n\nsession_id\x18\x01 \x01(\tR\tsessionId\x12V\n&client_observed_server_side_session_id\x18\x07 \x01(\tH\x01R!clientObservedServerSideSessionId\x88\x01\x01\x12=\n\x0cuser_context\x18\x02 \x01(\x0b\x32\x1a.spark.connect.UserContextR\x0buserContext\x12$\n\x0b\x63lient_type\x18\x03 \x01(\tH\x02R\nclientType\x88\x01\x01\x12T\n\x0einterrupt_type\x18\x04 \x01(\x0e\x32-.spark.connect.InterruptRequest.InterruptTypeR\rinterruptType\x12%\n\roperation_tag\x18\x05 \x01(\tH\x00R\x0coperationTag\x12#\n\x0coperation_id\x18\x06 \x01(\tH\x00R\x0boperationId"\x80\x01\n\rInterruptType\x12\x1e\n\x1aINTERRUPT_TYPE_UNSPECIFIED\x10\x00\x12\x16\n\x12INTERRUPT_TYPE_ALL\x10\x01\x12\x16\n\x12INTERRUPT_TYPE_TAG\x10\x02\x12\x1f\n\x1bINTERRUPT_TYPE_OPERATION_ID\x10\x03\x42\x0b\n\tinterruptB)\n\'_client_observed_server_side_session_idB\x0e\n\x0c_client_type"\x90\x01\n\x11InterruptResponse\x12\x1d\n\nsession_id\x18\x01 \x01(\tR\tsessionId\x12\x33\n\x16server_side_session_id\x18\x03 \x01(\tR\x13serverSideSessionId\x12\'\n\x0finterrupted_ids\x18\x02 \x03(\tR\x0einterruptedIds"5\n\x0fReattachOptions\x12"\n\x0creattachable\x18\x01 \x01(\x08R\x0creattachable"\x96\x03\n\x16ReattachExecuteRequest\x12\x1d\n\nsession_id\x18\x01 \x01(\tR\tsessionId\x12V\n&client_observed_server_side_session_id\x18\x06 \x01(\tH\x00R!clientObservedServerSideSessionId\x88\x01\x01\x12=\n\x0cuser_context\x18\x02 \x01(\x0b\x32\x1a.spark.connect.UserContextR\x0buserContext\x12!\n\x0coperation_id\x18\x03 \x01(\tR\x0boperationId\x12$\n\x0b\x63lient_type\x18\x04 \x01(\tH\x01R\nclientType\x88\x01\x01\x12-\n\x10last_response_id\x18\x05 
\x01(\tH\x02R\x0elastResponseId\x88\x01\x01\x42)\n\'_client_observed_server_side_session_idB\x0e\n\x0c_client_typeB\x13\n\x11_last_response_id"\xc9\x04\n\x15ReleaseExecuteRequest\x12\x1d\n\nsession_id\x18\x01 \x01(\tR\tsessionId\x12V\n&client_observed_server_side_session_id\x18\x07 \x01(\tH\x01R!clientObservedServerSideSessionId\x88\x01\x01\x12=\n\x0cuser_context\x18\x02 \x01(\x0b\x32\x1a.spark.connect.UserContextR\x0buserContext\x12!\n\x0coperation_id\x18\x03 \x01(\tR\x0boperationId\x12$\n\x0b\x63lient_type\x18\x04 \x01(\tH\x02R\nclientType\x88\x01\x01\x12R\n\x0brelease_all\x18\x05 \x01(\x0b\x32/.spark.connect.ReleaseExecuteRequest.ReleaseAllH\x00R\nreleaseAll\x12X\n\rrelease_until\x18\x06 \x01(\x0b\x32\x31.spark.connect.ReleaseExecuteRequest.ReleaseUntilH\x00R\x0creleaseUntil\x1a\x0c\n\nReleaseAll\x1a/\n\x0cReleaseUntil\x12\x1f\n\x0bresponse_id\x18\x01 \x01(\tR\nresponseIdB\t\n\x07releaseB)\n\'_client_observed_server_side_session_idB\x0e\n\x0c_client_type"\xa5\x01\n\x16ReleaseExecuteResponse\x12\x1d\n\nsession_id\x18\x01 \x01(\tR\tsessionId\x12\x33\n\x16server_side_session_id\x18\x03 \x01(\tR\x13serverSideSessionId\x12&\n\x0coperation_id\x18\x02 \x01(\tH\x00R\x0boperationId\x88\x01\x01\x42\x0f\n\r_operation_id"\xab\x01\n\x15ReleaseSessionRequest\x12\x1d\n\nsession_id\x18\x01 \x01(\tR\tsessionId\x12=\n\x0cuser_context\x18\x02 \x01(\x0b\x32\x1a.spark.connect.UserContextR\x0buserContext\x12$\n\x0b\x63lient_type\x18\x03 \x01(\tH\x00R\nclientType\x88\x01\x01\x42\x0e\n\x0c_client_type"l\n\x16ReleaseSessionResponse\x12\x1d\n\nsession_id\x18\x01 \x01(\tR\tsessionId\x12\x33\n\x16server_side_session_id\x18\x02 \x01(\tR\x13serverSideSessionId"\xcc\x02\n\x18\x46\x65tchErrorDetailsRequest\x12\x1d\n\nsession_id\x18\x01 \x01(\tR\tsessionId\x12V\n&client_observed_server_side_session_id\x18\x05 \x01(\tH\x00R!clientObservedServerSideSessionId\x88\x01\x01\x12=\n\x0cuser_context\x18\x02 \x01(\x0b\x32\x1a.spark.connect.UserContextR\x0buserContext\x12\x19\n\x08\x65rror_id\x18\x03 
\x01(\tR\x07\x65rrorId\x12$\n\x0b\x63lient_type\x18\x04 \x01(\tH\x01R\nclientType\x88\x01\x01\x42)\n\'_client_observed_server_side_session_idB\x0e\n\x0c_client_type"\x93\x0c\n\x19\x46\x65tchErrorDetailsResponse\x12\x33\n\x16server_side_session_id\x18\x03 \x01(\tR\x13serverSideSessionId\x12\x1d\n\nsession_id\x18\x04 \x01(\tR\tsessionId\x12)\n\x0eroot_error_idx\x18\x01 \x01(\x05H\x00R\x0crootErrorIdx\x88\x01\x01\x12\x46\n\x06\x65rrors\x18\x02 \x03(\x0b\x32..spark.connect.FetchErrorDetailsResponse.ErrorR\x06\x65rrors\x1a\xae\x01\n\x11StackTraceElement\x12\'\n\x0f\x64\x65\x63laring_class\x18\x01 \x01(\tR\x0e\x64\x65\x63laringClass\x12\x1f\n\x0bmethod_name\x18\x02 \x01(\tR\nmethodName\x12 \n\tfile_name\x18\x03 \x01(\tH\x00R\x08\x66ileName\x88\x01\x01\x12\x1f\n\x0bline_number\x18\x04 \x01(\x05R\nlineNumberB\x0c\n\n_file_name\x1a\xf0\x02\n\x0cQueryContext\x12\x64\n\x0c\x63ontext_type\x18\n \x01(\x0e\x32\x41.spark.connect.FetchErrorDetailsResponse.QueryContext.ContextTypeR\x0b\x63ontextType\x12\x1f\n\x0bobject_type\x18\x01 \x01(\tR\nobjectType\x12\x1f\n\x0bobject_name\x18\x02 \x01(\tR\nobjectName\x12\x1f\n\x0bstart_index\x18\x03 \x01(\x05R\nstartIndex\x12\x1d\n\nstop_index\x18\x04 \x01(\x05R\tstopIndex\x12\x1a\n\x08\x66ragment\x18\x05 \x01(\tR\x08\x66ragment\x12\x1b\n\tcall_site\x18\x06 \x01(\tR\x08\x63\x61llSite\x12\x18\n\x07summary\x18\x07 \x01(\tR\x07summary"%\n\x0b\x43ontextType\x12\x07\n\x03SQL\x10\x00\x12\r\n\tDATAFRAME\x10\x01\x1a\x99\x03\n\x0eSparkThrowable\x12$\n\x0b\x65rror_class\x18\x01 \x01(\tH\x00R\nerrorClass\x88\x01\x01\x12}\n\x12message_parameters\x18\x02 \x03(\x0b\x32N.spark.connect.FetchErrorDetailsResponse.SparkThrowable.MessageParametersEntryR\x11messageParameters\x12\\\n\x0equery_contexts\x18\x03 \x03(\x0b\x32\x35.spark.connect.FetchErrorDetailsResponse.QueryContextR\rqueryContexts\x12 \n\tsql_state\x18\x04 \x01(\tH\x01R\x08sqlState\x88\x01\x01\x1a\x44\n\x16MessageParametersEntry\x12\x10\n\x03key\x18\x01 \x01(\tR\x03key\x12\x14\n\x05value\x18\x02 
\x01(\tR\x05value:\x02\x38\x01\x42\x0e\n\x0c_error_classB\x0c\n\n_sql_state\x1a\xdb\x02\n\x05\x45rror\x12\x30\n\x14\x65rror_type_hierarchy\x18\x01 \x03(\tR\x12\x65rrorTypeHierarchy\x12\x18\n\x07message\x18\x02 \x01(\tR\x07message\x12[\n\x0bstack_trace\x18\x03 \x03(\x0b\x32:.spark.connect.FetchErrorDetailsResponse.StackTraceElementR\nstackTrace\x12 \n\tcause_idx\x18\x04 \x01(\x05H\x00R\x08\x63\x61useIdx\x88\x01\x01\x12\x65\n\x0fspark_throwable\x18\x05 \x01(\x0b\x32\x37.spark.connect.FetchErrorDetailsResponse.SparkThrowableH\x01R\x0esparkThrowable\x88\x01\x01\x42\x0c\n\n_cause_idxB\x12\n\x10_spark_throwableB\x11\n\x0f_root_error_idx"Z\n\x17\x43heckpointCommandResult\x12?\n\x08relation\x18\x01 \x01(\x0b\x32#.spark.connect.CachedRemoteRelationR\x08relation2\xb2\x07\n\x13SparkConnectService\x12X\n\x0b\x45xecutePlan\x12!.spark.connect.ExecutePlanRequest\x1a".spark.connect.ExecutePlanResponse"\x00\x30\x01\x12V\n\x0b\x41nalyzePlan\x12!.spark.connect.AnalyzePlanRequest\x1a".spark.connect.AnalyzePlanResponse"\x00\x12G\n\x06\x43onfig\x12\x1c.spark.connect.ConfigRequest\x1a\x1d.spark.connect.ConfigResponse"\x00\x12[\n\x0c\x41\x64\x64\x41rtifacts\x12".spark.connect.AddArtifactsRequest\x1a#.spark.connect.AddArtifactsResponse"\x00(\x01\x12\x63\n\x0e\x41rtifactStatus\x12&.spark.connect.ArtifactStatusesRequest\x1a\'.spark.connect.ArtifactStatusesResponse"\x00\x12P\n\tInterrupt\x12\x1f.spark.connect.InterruptRequest\x1a 
.spark.connect.InterruptResponse"\x00\x12`\n\x0fReattachExecute\x12%.spark.connect.ReattachExecuteRequest\x1a".spark.connect.ExecutePlanResponse"\x00\x30\x01\x12_\n\x0eReleaseExecute\x12$.spark.connect.ReleaseExecuteRequest\x1a%.spark.connect.ReleaseExecuteResponse"\x00\x12_\n\x0eReleaseSession\x12$.spark.connect.ReleaseSessionRequest\x1a%.spark.connect.ReleaseSessionResponse"\x00\x12h\n\x11\x46\x65tchErrorDetails\x12\'.spark.connect.FetchErrorDetailsRequest\x1a(.spark.connect.FetchErrorDetailsResponse"\x00\x42\x36\n\x1eorg.apache.spark.connect.protoP\x01Z\x12internal/generatedb\x06proto3' + b'\n\x18spark/connect/base.proto\x12\rspark.connect\x1a\x19google/protobuf/any.proto\x1a\x1cspark/connect/commands.proto\x1a\x1aspark/connect/common.proto\x1a\x1fspark/connect/expressions.proto\x1a\x1dspark/connect/relations.proto\x1a\x19spark/connect/types.proto\x1a\x16spark/connect/ml.proto"t\n\x04Plan\x12-\n\x04root\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationH\x00R\x04root\x12\x32\n\x07\x63ommand\x18\x02 \x01(\x0b\x32\x16.spark.connect.CommandH\x00R\x07\x63ommandB\t\n\x07op_type"z\n\x0bUserContext\x12\x17\n\x07user_id\x18\x01 \x01(\tR\x06userId\x12\x1b\n\tuser_name\x18\x02 \x01(\tR\x08userName\x12\x35\n\nextensions\x18\xe7\x07 \x03(\x0b\x32\x14.google.protobuf.AnyR\nextensions"\xf5\x14\n\x12\x41nalyzePlanRequest\x12\x1d\n\nsession_id\x18\x01 \x01(\tR\tsessionId\x12V\n&client_observed_server_side_session_id\x18\x11 \x01(\tH\x01R!clientObservedServerSideSessionId\x88\x01\x01\x12=\n\x0cuser_context\x18\x02 \x01(\x0b\x32\x1a.spark.connect.UserContextR\x0buserContext\x12$\n\x0b\x63lient_type\x18\x03 \x01(\tH\x02R\nclientType\x88\x01\x01\x12\x42\n\x06schema\x18\x04 \x01(\x0b\x32(.spark.connect.AnalyzePlanRequest.SchemaH\x00R\x06schema\x12\x45\n\x07\x65xplain\x18\x05 \x01(\x0b\x32).spark.connect.AnalyzePlanRequest.ExplainH\x00R\x07\x65xplain\x12O\n\x0btree_string\x18\x06 \x01(\x0b\x32,.spark.connect.AnalyzePlanRequest.TreeStringH\x00R\ntreeString\x12\x46\n\x08is_local\x18\x07 
\x01(\x0b\x32).spark.connect.AnalyzePlanRequest.IsLocalH\x00R\x07isLocal\x12R\n\x0cis_streaming\x18\x08 \x01(\x0b\x32-.spark.connect.AnalyzePlanRequest.IsStreamingH\x00R\x0bisStreaming\x12O\n\x0binput_files\x18\t \x01(\x0b\x32,.spark.connect.AnalyzePlanRequest.InputFilesH\x00R\ninputFiles\x12U\n\rspark_version\x18\n \x01(\x0b\x32..spark.connect.AnalyzePlanRequest.SparkVersionH\x00R\x0csparkVersion\x12I\n\tddl_parse\x18\x0b \x01(\x0b\x32*.spark.connect.AnalyzePlanRequest.DDLParseH\x00R\x08\x64\x64lParse\x12X\n\x0esame_semantics\x18\x0c \x01(\x0b\x32/.spark.connect.AnalyzePlanRequest.SameSemanticsH\x00R\rsameSemantics\x12U\n\rsemantic_hash\x18\r \x01(\x0b\x32..spark.connect.AnalyzePlanRequest.SemanticHashH\x00R\x0csemanticHash\x12\x45\n\x07persist\x18\x0e \x01(\x0b\x32).spark.connect.AnalyzePlanRequest.PersistH\x00R\x07persist\x12K\n\tunpersist\x18\x0f \x01(\x0b\x32+.spark.connect.AnalyzePlanRequest.UnpersistH\x00R\tunpersist\x12_\n\x11get_storage_level\x18\x10 \x01(\x0b\x32\x31.spark.connect.AnalyzePlanRequest.GetStorageLevelH\x00R\x0fgetStorageLevel\x12M\n\x0bjson_to_ddl\x18\x12 \x01(\x0b\x32+.spark.connect.AnalyzePlanRequest.JsonToDDLH\x00R\tjsonToDdl\x1a\x31\n\x06Schema\x12\'\n\x04plan\x18\x01 \x01(\x0b\x32\x13.spark.connect.PlanR\x04plan\x1a\xbb\x02\n\x07\x45xplain\x12\'\n\x04plan\x18\x01 \x01(\x0b\x32\x13.spark.connect.PlanR\x04plan\x12X\n\x0c\x65xplain_mode\x18\x02 \x01(\x0e\x32\x35.spark.connect.AnalyzePlanRequest.Explain.ExplainModeR\x0b\x65xplainMode"\xac\x01\n\x0b\x45xplainMode\x12\x1c\n\x18\x45XPLAIN_MODE_UNSPECIFIED\x10\x00\x12\x17\n\x13\x45XPLAIN_MODE_SIMPLE\x10\x01\x12\x19\n\x15\x45XPLAIN_MODE_EXTENDED\x10\x02\x12\x18\n\x14\x45XPLAIN_MODE_CODEGEN\x10\x03\x12\x15\n\x11\x45XPLAIN_MODE_COST\x10\x04\x12\x1a\n\x16\x45XPLAIN_MODE_FORMATTED\x10\x05\x1aZ\n\nTreeString\x12\'\n\x04plan\x18\x01 \x01(\x0b\x32\x13.spark.connect.PlanR\x04plan\x12\x19\n\x05level\x18\x02 
\x01(\x05H\x00R\x05level\x88\x01\x01\x42\x08\n\x06_level\x1a\x32\n\x07IsLocal\x12\'\n\x04plan\x18\x01 \x01(\x0b\x32\x13.spark.connect.PlanR\x04plan\x1a\x36\n\x0bIsStreaming\x12\'\n\x04plan\x18\x01 \x01(\x0b\x32\x13.spark.connect.PlanR\x04plan\x1a\x35\n\nInputFiles\x12\'\n\x04plan\x18\x01 \x01(\x0b\x32\x13.spark.connect.PlanR\x04plan\x1a\x0e\n\x0cSparkVersion\x1a)\n\x08\x44\x44LParse\x12\x1d\n\nddl_string\x18\x01 \x01(\tR\tddlString\x1ay\n\rSameSemantics\x12\x34\n\x0btarget_plan\x18\x01 \x01(\x0b\x32\x13.spark.connect.PlanR\ntargetPlan\x12\x32\n\nother_plan\x18\x02 \x01(\x0b\x32\x13.spark.connect.PlanR\totherPlan\x1a\x37\n\x0cSemanticHash\x12\'\n\x04plan\x18\x01 \x01(\x0b\x32\x13.spark.connect.PlanR\x04plan\x1a\x97\x01\n\x07Persist\x12\x33\n\x08relation\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x08relation\x12\x45\n\rstorage_level\x18\x02 \x01(\x0b\x32\x1b.spark.connect.StorageLevelH\x00R\x0cstorageLevel\x88\x01\x01\x42\x10\n\x0e_storage_level\x1an\n\tUnpersist\x12\x33\n\x08relation\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x08relation\x12\x1f\n\x08\x62locking\x18\x02 \x01(\x08H\x00R\x08\x62locking\x88\x01\x01\x42\x0b\n\t_blocking\x1a\x46\n\x0fGetStorageLevel\x12\x33\n\x08relation\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x08relation\x1a,\n\tJsonToDDL\x12\x1f\n\x0bjson_string\x18\x01 \x01(\tR\njsonStringB\t\n\x07\x61nalyzeB)\n\'_client_observed_server_side_session_idB\x0e\n\x0c_client_type"\xca\x0e\n\x13\x41nalyzePlanResponse\x12\x1d\n\nsession_id\x18\x01 \x01(\tR\tsessionId\x12\x33\n\x16server_side_session_id\x18\x0f \x01(\tR\x13serverSideSessionId\x12\x43\n\x06schema\x18\x02 \x01(\x0b\x32).spark.connect.AnalyzePlanResponse.SchemaH\x00R\x06schema\x12\x46\n\x07\x65xplain\x18\x03 \x01(\x0b\x32*.spark.connect.AnalyzePlanResponse.ExplainH\x00R\x07\x65xplain\x12P\n\x0btree_string\x18\x04 \x01(\x0b\x32-.spark.connect.AnalyzePlanResponse.TreeStringH\x00R\ntreeString\x12G\n\x08is_local\x18\x05 
\x01(\x0b\x32*.spark.connect.AnalyzePlanResponse.IsLocalH\x00R\x07isLocal\x12S\n\x0cis_streaming\x18\x06 \x01(\x0b\x32..spark.connect.AnalyzePlanResponse.IsStreamingH\x00R\x0bisStreaming\x12P\n\x0binput_files\x18\x07 \x01(\x0b\x32-.spark.connect.AnalyzePlanResponse.InputFilesH\x00R\ninputFiles\x12V\n\rspark_version\x18\x08 \x01(\x0b\x32/.spark.connect.AnalyzePlanResponse.SparkVersionH\x00R\x0csparkVersion\x12J\n\tddl_parse\x18\t \x01(\x0b\x32+.spark.connect.AnalyzePlanResponse.DDLParseH\x00R\x08\x64\x64lParse\x12Y\n\x0esame_semantics\x18\n \x01(\x0b\x32\x30.spark.connect.AnalyzePlanResponse.SameSemanticsH\x00R\rsameSemantics\x12V\n\rsemantic_hash\x18\x0b \x01(\x0b\x32/.spark.connect.AnalyzePlanResponse.SemanticHashH\x00R\x0csemanticHash\x12\x46\n\x07persist\x18\x0c \x01(\x0b\x32*.spark.connect.AnalyzePlanResponse.PersistH\x00R\x07persist\x12L\n\tunpersist\x18\r \x01(\x0b\x32,.spark.connect.AnalyzePlanResponse.UnpersistH\x00R\tunpersist\x12`\n\x11get_storage_level\x18\x0e \x01(\x0b\x32\x32.spark.connect.AnalyzePlanResponse.GetStorageLevelH\x00R\x0fgetStorageLevel\x12N\n\x0bjson_to_ddl\x18\x10 \x01(\x0b\x32,.spark.connect.AnalyzePlanResponse.JsonToDDLH\x00R\tjsonToDdl\x1a\x39\n\x06Schema\x12/\n\x06schema\x18\x01 \x01(\x0b\x32\x17.spark.connect.DataTypeR\x06schema\x1a\x30\n\x07\x45xplain\x12%\n\x0e\x65xplain_string\x18\x01 \x01(\tR\rexplainString\x1a-\n\nTreeString\x12\x1f\n\x0btree_string\x18\x01 \x01(\tR\ntreeString\x1a$\n\x07IsLocal\x12\x19\n\x08is_local\x18\x01 \x01(\x08R\x07isLocal\x1a\x30\n\x0bIsStreaming\x12!\n\x0cis_streaming\x18\x01 \x01(\x08R\x0bisStreaming\x1a"\n\nInputFiles\x12\x14\n\x05\x66iles\x18\x01 \x03(\tR\x05\x66iles\x1a(\n\x0cSparkVersion\x12\x18\n\x07version\x18\x01 \x01(\tR\x07version\x1a;\n\x08\x44\x44LParse\x12/\n\x06parsed\x18\x01 \x01(\x0b\x32\x17.spark.connect.DataTypeR\x06parsed\x1a\'\n\rSameSemantics\x12\x16\n\x06result\x18\x01 \x01(\x08R\x06result\x1a&\n\x0cSemanticHash\x12\x16\n\x06result\x18\x01 
\x01(\x05R\x06result\x1a\t\n\x07Persist\x1a\x0b\n\tUnpersist\x1aS\n\x0fGetStorageLevel\x12@\n\rstorage_level\x18\x01 \x01(\x0b\x32\x1b.spark.connect.StorageLevelR\x0cstorageLevel\x1a*\n\tJsonToDDL\x12\x1d\n\nddl_string\x18\x01 \x01(\tR\tddlStringB\x08\n\x06result"\xa3\x05\n\x12\x45xecutePlanRequest\x12\x1d\n\nsession_id\x18\x01 \x01(\tR\tsessionId\x12V\n&client_observed_server_side_session_id\x18\x08 \x01(\tH\x00R!clientObservedServerSideSessionId\x88\x01\x01\x12=\n\x0cuser_context\x18\x02 \x01(\x0b\x32\x1a.spark.connect.UserContextR\x0buserContext\x12&\n\x0coperation_id\x18\x06 \x01(\tH\x01R\x0boperationId\x88\x01\x01\x12\'\n\x04plan\x18\x03 \x01(\x0b\x32\x13.spark.connect.PlanR\x04plan\x12$\n\x0b\x63lient_type\x18\x04 \x01(\tH\x02R\nclientType\x88\x01\x01\x12X\n\x0frequest_options\x18\x05 \x03(\x0b\x32/.spark.connect.ExecutePlanRequest.RequestOptionR\x0erequestOptions\x12\x12\n\x04tags\x18\x07 \x03(\tR\x04tags\x1a\xa5\x01\n\rRequestOption\x12K\n\x10reattach_options\x18\x01 \x01(\x0b\x32\x1e.spark.connect.ReattachOptionsH\x00R\x0freattachOptions\x12\x35\n\textension\x18\xe7\x07 \x01(\x0b\x32\x14.google.protobuf.AnyH\x00R\textensionB\x10\n\x0erequest_optionB)\n\'_client_observed_server_side_session_idB\x0f\n\r_operation_idB\x0e\n\x0c_client_type"\xb4\x17\n\x13\x45xecutePlanResponse\x12\x1d\n\nsession_id\x18\x01 \x01(\tR\tsessionId\x12\x33\n\x16server_side_session_id\x18\x0f \x01(\tR\x13serverSideSessionId\x12!\n\x0coperation_id\x18\x0c \x01(\tR\x0boperationId\x12\x1f\n\x0bresponse_id\x18\r \x01(\tR\nresponseId\x12P\n\x0b\x61rrow_batch\x18\x02 \x01(\x0b\x32-.spark.connect.ExecutePlanResponse.ArrowBatchH\x00R\narrowBatch\x12\x63\n\x12sql_command_result\x18\x05 \x01(\x0b\x32\x33.spark.connect.ExecutePlanResponse.SqlCommandResultH\x00R\x10sqlCommandResult\x12~\n#write_stream_operation_start_result\x18\x08 \x01(\x0b\x32..spark.connect.WriteStreamOperationStartResultH\x00R\x1fwriteStreamOperationStartResult\x12q\n\x1estreaming_query_command_result\x18\t 
\x01(\x0b\x32*.spark.connect.StreamingQueryCommandResultH\x00R\x1bstreamingQueryCommandResult\x12k\n\x1cget_resources_command_result\x18\n \x01(\x0b\x32(.spark.connect.GetResourcesCommandResultH\x00R\x19getResourcesCommandResult\x12\x87\x01\n&streaming_query_manager_command_result\x18\x0b \x01(\x0b\x32\x31.spark.connect.StreamingQueryManagerCommandResultH\x00R"streamingQueryManagerCommandResult\x12\x87\x01\n&streaming_query_listener_events_result\x18\x10 \x01(\x0b\x32\x31.spark.connect.StreamingQueryListenerEventsResultH\x00R"streamingQueryListenerEventsResult\x12\\\n\x0fresult_complete\x18\x0e \x01(\x0b\x32\x31.spark.connect.ExecutePlanResponse.ResultCompleteH\x00R\x0eresultComplete\x12\x87\x01\n&create_resource_profile_command_result\x18\x11 \x01(\x0b\x32\x31.spark.connect.CreateResourceProfileCommandResultH\x00R"createResourceProfileCommandResult\x12\x65\n\x12\x65xecution_progress\x18\x12 \x01(\x0b\x32\x34.spark.connect.ExecutePlanResponse.ExecutionProgressH\x00R\x11\x65xecutionProgress\x12\x64\n\x19\x63heckpoint_command_result\x18\x13 \x01(\x0b\x32&.spark.connect.CheckpointCommandResultH\x00R\x17\x63heckpointCommandResult\x12L\n\x11ml_command_result\x18\x14 \x01(\x0b\x32\x1e.spark.connect.MlCommandResultH\x00R\x0fmlCommandResult\x12\x35\n\textension\x18\xe7\x07 \x01(\x0b\x32\x14.google.protobuf.AnyH\x00R\textension\x12\x44\n\x07metrics\x18\x04 \x01(\x0b\x32*.spark.connect.ExecutePlanResponse.MetricsR\x07metrics\x12]\n\x10observed_metrics\x18\x06 \x03(\x0b\x32\x32.spark.connect.ExecutePlanResponse.ObservedMetricsR\x0fobservedMetrics\x12/\n\x06schema\x18\x07 \x01(\x0b\x32\x17.spark.connect.DataTypeR\x06schema\x1aG\n\x10SqlCommandResult\x12\x33\n\x08relation\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x08relation\x1av\n\nArrowBatch\x12\x1b\n\trow_count\x18\x01 \x01(\x03R\x08rowCount\x12\x12\n\x04\x64\x61ta\x18\x02 \x01(\x0cR\x04\x64\x61ta\x12&\n\x0cstart_offset\x18\x03 
\x01(\x03H\x00R\x0bstartOffset\x88\x01\x01\x42\x0f\n\r_start_offset\x1a\x85\x04\n\x07Metrics\x12Q\n\x07metrics\x18\x01 \x03(\x0b\x32\x37.spark.connect.ExecutePlanResponse.Metrics.MetricObjectR\x07metrics\x1a\xcc\x02\n\x0cMetricObject\x12\x12\n\x04name\x18\x01 \x01(\tR\x04name\x12\x17\n\x07plan_id\x18\x02 \x01(\x03R\x06planId\x12\x16\n\x06parent\x18\x03 \x01(\x03R\x06parent\x12z\n\x11\x65xecution_metrics\x18\x04 \x03(\x0b\x32M.spark.connect.ExecutePlanResponse.Metrics.MetricObject.ExecutionMetricsEntryR\x10\x65xecutionMetrics\x1a{\n\x15\x45xecutionMetricsEntry\x12\x10\n\x03key\x18\x01 \x01(\tR\x03key\x12L\n\x05value\x18\x02 \x01(\x0b\x32\x36.spark.connect.ExecutePlanResponse.Metrics.MetricValueR\x05value:\x02\x38\x01\x1aX\n\x0bMetricValue\x12\x12\n\x04name\x18\x01 \x01(\tR\x04name\x12\x14\n\x05value\x18\x02 \x01(\x03R\x05value\x12\x1f\n\x0bmetric_type\x18\x03 \x01(\tR\nmetricType\x1a\x8d\x01\n\x0fObservedMetrics\x12\x12\n\x04name\x18\x01 \x01(\tR\x04name\x12\x39\n\x06values\x18\x02 \x03(\x0b\x32!.spark.connect.Expression.LiteralR\x06values\x12\x12\n\x04keys\x18\x03 \x03(\tR\x04keys\x12\x17\n\x07plan_id\x18\x04 \x01(\x03R\x06planId\x1a\x10\n\x0eResultComplete\x1a\xcd\x02\n\x11\x45xecutionProgress\x12V\n\x06stages\x18\x01 \x03(\x0b\x32>.spark.connect.ExecutePlanResponse.ExecutionProgress.StageInfoR\x06stages\x12,\n\x12num_inflight_tasks\x18\x02 \x01(\x03R\x10numInflightTasks\x1a\xb1\x01\n\tStageInfo\x12\x19\n\x08stage_id\x18\x01 \x01(\x03R\x07stageId\x12\x1b\n\tnum_tasks\x18\x02 \x01(\x03R\x08numTasks\x12.\n\x13num_completed_tasks\x18\x03 \x01(\x03R\x11numCompletedTasks\x12(\n\x10input_bytes_read\x18\x04 \x01(\x03R\x0einputBytesRead\x12\x12\n\x04\x64one\x18\x05 \x01(\x08R\x04\x64oneB\x0f\n\rresponse_type"A\n\x08KeyValue\x12\x10\n\x03key\x18\x01 \x01(\tR\x03key\x12\x19\n\x05value\x18\x02 \x01(\tH\x00R\x05value\x88\x01\x01\x42\x08\n\x06_value"\xaf\t\n\rConfigRequest\x12\x1d\n\nsession_id\x18\x01 \x01(\tR\tsessionId\x12V\n&client_observed_server_side_session_id\x18\x08 
\x01(\tH\x00R!clientObservedServerSideSessionId\x88\x01\x01\x12=\n\x0cuser_context\x18\x02 \x01(\x0b\x32\x1a.spark.connect.UserContextR\x0buserContext\x12\x44\n\toperation\x18\x03 \x01(\x0b\x32&.spark.connect.ConfigRequest.OperationR\toperation\x12$\n\x0b\x63lient_type\x18\x04 \x01(\tH\x01R\nclientType\x88\x01\x01\x1a\xf2\x03\n\tOperation\x12\x34\n\x03set\x18\x01 \x01(\x0b\x32 .spark.connect.ConfigRequest.SetH\x00R\x03set\x12\x34\n\x03get\x18\x02 \x01(\x0b\x32 .spark.connect.ConfigRequest.GetH\x00R\x03get\x12W\n\x10get_with_default\x18\x03 \x01(\x0b\x32+.spark.connect.ConfigRequest.GetWithDefaultH\x00R\x0egetWithDefault\x12G\n\nget_option\x18\x04 \x01(\x0b\x32&.spark.connect.ConfigRequest.GetOptionH\x00R\tgetOption\x12>\n\x07get_all\x18\x05 \x01(\x0b\x32#.spark.connect.ConfigRequest.GetAllH\x00R\x06getAll\x12:\n\x05unset\x18\x06 \x01(\x0b\x32".spark.connect.ConfigRequest.UnsetH\x00R\x05unset\x12P\n\ris_modifiable\x18\x07 \x01(\x0b\x32).spark.connect.ConfigRequest.IsModifiableH\x00R\x0cisModifiableB\t\n\x07op_type\x1a\\\n\x03Set\x12-\n\x05pairs\x18\x01 \x03(\x0b\x32\x17.spark.connect.KeyValueR\x05pairs\x12\x1b\n\x06silent\x18\x02 \x01(\x08H\x00R\x06silent\x88\x01\x01\x42\t\n\x07_silent\x1a\x19\n\x03Get\x12\x12\n\x04keys\x18\x01 \x03(\tR\x04keys\x1a?\n\x0eGetWithDefault\x12-\n\x05pairs\x18\x01 \x03(\x0b\x32\x17.spark.connect.KeyValueR\x05pairs\x1a\x1f\n\tGetOption\x12\x12\n\x04keys\x18\x01 \x03(\tR\x04keys\x1a\x30\n\x06GetAll\x12\x1b\n\x06prefix\x18\x01 \x01(\tH\x00R\x06prefix\x88\x01\x01\x42\t\n\x07_prefix\x1a\x1b\n\x05Unset\x12\x12\n\x04keys\x18\x01 \x03(\tR\x04keys\x1a"\n\x0cIsModifiable\x12\x12\n\x04keys\x18\x01 \x03(\tR\x04keysB)\n\'_client_observed_server_side_session_idB\x0e\n\x0c_client_type"\xaf\x01\n\x0e\x43onfigResponse\x12\x1d\n\nsession_id\x18\x01 \x01(\tR\tsessionId\x12\x33\n\x16server_side_session_id\x18\x04 \x01(\tR\x13serverSideSessionId\x12-\n\x05pairs\x18\x02 \x03(\x0b\x32\x17.spark.connect.KeyValueR\x05pairs\x12\x1a\n\x08warnings\x18\x03 
\x03(\tR\x08warnings"\xea\x07\n\x13\x41\x64\x64\x41rtifactsRequest\x12\x1d\n\nsession_id\x18\x01 \x01(\tR\tsessionId\x12=\n\x0cuser_context\x18\x02 \x01(\x0b\x32\x1a.spark.connect.UserContextR\x0buserContext\x12V\n&client_observed_server_side_session_id\x18\x07 \x01(\tH\x01R!clientObservedServerSideSessionId\x88\x01\x01\x12$\n\x0b\x63lient_type\x18\x06 \x01(\tH\x02R\nclientType\x88\x01\x01\x12@\n\x05\x62\x61tch\x18\x03 \x01(\x0b\x32(.spark.connect.AddArtifactsRequest.BatchH\x00R\x05\x62\x61tch\x12Z\n\x0b\x62\x65gin_chunk\x18\x04 \x01(\x0b\x32\x37.spark.connect.AddArtifactsRequest.BeginChunkedArtifactH\x00R\nbeginChunk\x12H\n\x05\x63hunk\x18\x05 \x01(\x0b\x32\x30.spark.connect.AddArtifactsRequest.ArtifactChunkH\x00R\x05\x63hunk\x1a\x35\n\rArtifactChunk\x12\x12\n\x04\x64\x61ta\x18\x01 \x01(\x0cR\x04\x64\x61ta\x12\x10\n\x03\x63rc\x18\x02 \x01(\x03R\x03\x63rc\x1ao\n\x13SingleChunkArtifact\x12\x12\n\x04name\x18\x01 \x01(\tR\x04name\x12\x44\n\x04\x64\x61ta\x18\x02 \x01(\x0b\x32\x30.spark.connect.AddArtifactsRequest.ArtifactChunkR\x04\x64\x61ta\x1a]\n\x05\x42\x61tch\x12T\n\tartifacts\x18\x01 \x03(\x0b\x32\x36.spark.connect.AddArtifactsRequest.SingleChunkArtifactR\tartifacts\x1a\xc1\x01\n\x14\x42\x65ginChunkedArtifact\x12\x12\n\x04name\x18\x01 \x01(\tR\x04name\x12\x1f\n\x0btotal_bytes\x18\x02 \x01(\x03R\ntotalBytes\x12\x1d\n\nnum_chunks\x18\x03 \x01(\x03R\tnumChunks\x12U\n\rinitial_chunk\x18\x04 \x01(\x0b\x32\x30.spark.connect.AddArtifactsRequest.ArtifactChunkR\x0cinitialChunkB\t\n\x07payloadB)\n\'_client_observed_server_side_session_idB\x0e\n\x0c_client_type"\x90\x02\n\x14\x41\x64\x64\x41rtifactsResponse\x12\x1d\n\nsession_id\x18\x02 \x01(\tR\tsessionId\x12\x33\n\x16server_side_session_id\x18\x03 \x01(\tR\x13serverSideSessionId\x12Q\n\tartifacts\x18\x01 \x03(\x0b\x32\x33.spark.connect.AddArtifactsResponse.ArtifactSummaryR\tartifacts\x1aQ\n\x0f\x41rtifactSummary\x12\x12\n\x04name\x18\x01 \x01(\tR\x04name\x12*\n\x11is_crc_successful\x18\x02 
\x01(\x08R\x0fisCrcSuccessful"\xc6\x02\n\x17\x41rtifactStatusesRequest\x12\x1d\n\nsession_id\x18\x01 \x01(\tR\tsessionId\x12V\n&client_observed_server_side_session_id\x18\x05 \x01(\tH\x00R!clientObservedServerSideSessionId\x88\x01\x01\x12=\n\x0cuser_context\x18\x02 \x01(\x0b\x32\x1a.spark.connect.UserContextR\x0buserContext\x12$\n\x0b\x63lient_type\x18\x03 \x01(\tH\x01R\nclientType\x88\x01\x01\x12\x14\n\x05names\x18\x04 \x03(\tR\x05namesB)\n\'_client_observed_server_side_session_idB\x0e\n\x0c_client_type"\xe0\x02\n\x18\x41rtifactStatusesResponse\x12\x1d\n\nsession_id\x18\x02 \x01(\tR\tsessionId\x12\x33\n\x16server_side_session_id\x18\x03 \x01(\tR\x13serverSideSessionId\x12Q\n\x08statuses\x18\x01 \x03(\x0b\x32\x35.spark.connect.ArtifactStatusesResponse.StatusesEntryR\x08statuses\x1as\n\rStatusesEntry\x12\x10\n\x03key\x18\x01 \x01(\tR\x03key\x12L\n\x05value\x18\x02 \x01(\x0b\x32\x36.spark.connect.ArtifactStatusesResponse.ArtifactStatusR\x05value:\x02\x38\x01\x1a(\n\x0e\x41rtifactStatus\x12\x16\n\x06\x65xists\x18\x01 \x01(\x08R\x06\x65xists"\xdb\x04\n\x10InterruptRequest\x12\x1d\n\nsession_id\x18\x01 \x01(\tR\tsessionId\x12V\n&client_observed_server_side_session_id\x18\x07 \x01(\tH\x01R!clientObservedServerSideSessionId\x88\x01\x01\x12=\n\x0cuser_context\x18\x02 \x01(\x0b\x32\x1a.spark.connect.UserContextR\x0buserContext\x12$\n\x0b\x63lient_type\x18\x03 \x01(\tH\x02R\nclientType\x88\x01\x01\x12T\n\x0einterrupt_type\x18\x04 \x01(\x0e\x32-.spark.connect.InterruptRequest.InterruptTypeR\rinterruptType\x12%\n\roperation_tag\x18\x05 \x01(\tH\x00R\x0coperationTag\x12#\n\x0coperation_id\x18\x06 \x01(\tH\x00R\x0boperationId"\x80\x01\n\rInterruptType\x12\x1e\n\x1aINTERRUPT_TYPE_UNSPECIFIED\x10\x00\x12\x16\n\x12INTERRUPT_TYPE_ALL\x10\x01\x12\x16\n\x12INTERRUPT_TYPE_TAG\x10\x02\x12\x1f\n\x1bINTERRUPT_TYPE_OPERATION_ID\x10\x03\x42\x0b\n\tinterruptB)\n\'_client_observed_server_side_session_idB\x0e\n\x0c_client_type"\x90\x01\n\x11InterruptResponse\x12\x1d\n\nsession_id\x18\x01 
\x01(\tR\tsessionId\x12\x33\n\x16server_side_session_id\x18\x03 \x01(\tR\x13serverSideSessionId\x12\'\n\x0finterrupted_ids\x18\x02 \x03(\tR\x0einterruptedIds"5\n\x0fReattachOptions\x12"\n\x0creattachable\x18\x01 \x01(\x08R\x0creattachable"\x96\x03\n\x16ReattachExecuteRequest\x12\x1d\n\nsession_id\x18\x01 \x01(\tR\tsessionId\x12V\n&client_observed_server_side_session_id\x18\x06 \x01(\tH\x00R!clientObservedServerSideSessionId\x88\x01\x01\x12=\n\x0cuser_context\x18\x02 \x01(\x0b\x32\x1a.spark.connect.UserContextR\x0buserContext\x12!\n\x0coperation_id\x18\x03 \x01(\tR\x0boperationId\x12$\n\x0b\x63lient_type\x18\x04 \x01(\tH\x01R\nclientType\x88\x01\x01\x12-\n\x10last_response_id\x18\x05 \x01(\tH\x02R\x0elastResponseId\x88\x01\x01\x42)\n\'_client_observed_server_side_session_idB\x0e\n\x0c_client_typeB\x13\n\x11_last_response_id"\xc9\x04\n\x15ReleaseExecuteRequest\x12\x1d\n\nsession_id\x18\x01 \x01(\tR\tsessionId\x12V\n&client_observed_server_side_session_id\x18\x07 \x01(\tH\x01R!clientObservedServerSideSessionId\x88\x01\x01\x12=\n\x0cuser_context\x18\x02 \x01(\x0b\x32\x1a.spark.connect.UserContextR\x0buserContext\x12!\n\x0coperation_id\x18\x03 \x01(\tR\x0boperationId\x12$\n\x0b\x63lient_type\x18\x04 \x01(\tH\x02R\nclientType\x88\x01\x01\x12R\n\x0brelease_all\x18\x05 \x01(\x0b\x32/.spark.connect.ReleaseExecuteRequest.ReleaseAllH\x00R\nreleaseAll\x12X\n\rrelease_until\x18\x06 \x01(\x0b\x32\x31.spark.connect.ReleaseExecuteRequest.ReleaseUntilH\x00R\x0creleaseUntil\x1a\x0c\n\nReleaseAll\x1a/\n\x0cReleaseUntil\x12\x1f\n\x0bresponse_id\x18\x01 \x01(\tR\nresponseIdB\t\n\x07releaseB)\n\'_client_observed_server_side_session_idB\x0e\n\x0c_client_type"\xa5\x01\n\x16ReleaseExecuteResponse\x12\x1d\n\nsession_id\x18\x01 \x01(\tR\tsessionId\x12\x33\n\x16server_side_session_id\x18\x03 \x01(\tR\x13serverSideSessionId\x12&\n\x0coperation_id\x18\x02 \x01(\tH\x00R\x0boperationId\x88\x01\x01\x42\x0f\n\r_operation_id"\xd4\x01\n\x15ReleaseSessionRequest\x12\x1d\n\nsession_id\x18\x01 
\x01(\tR\tsessionId\x12=\n\x0cuser_context\x18\x02 \x01(\x0b\x32\x1a.spark.connect.UserContextR\x0buserContext\x12$\n\x0b\x63lient_type\x18\x03 \x01(\tH\x00R\nclientType\x88\x01\x01\x12\'\n\x0f\x61llow_reconnect\x18\x04 \x01(\x08R\x0e\x61llowReconnectB\x0e\n\x0c_client_type"l\n\x16ReleaseSessionResponse\x12\x1d\n\nsession_id\x18\x01 \x01(\tR\tsessionId\x12\x33\n\x16server_side_session_id\x18\x02 \x01(\tR\x13serverSideSessionId"\xcc\x02\n\x18\x46\x65tchErrorDetailsRequest\x12\x1d\n\nsession_id\x18\x01 \x01(\tR\tsessionId\x12V\n&client_observed_server_side_session_id\x18\x05 \x01(\tH\x00R!clientObservedServerSideSessionId\x88\x01\x01\x12=\n\x0cuser_context\x18\x02 \x01(\x0b\x32\x1a.spark.connect.UserContextR\x0buserContext\x12\x19\n\x08\x65rror_id\x18\x03 \x01(\tR\x07\x65rrorId\x12$\n\x0b\x63lient_type\x18\x04 \x01(\tH\x01R\nclientType\x88\x01\x01\x42)\n\'_client_observed_server_side_session_idB\x0e\n\x0c_client_type"\x93\x0c\n\x19\x46\x65tchErrorDetailsResponse\x12\x33\n\x16server_side_session_id\x18\x03 \x01(\tR\x13serverSideSessionId\x12\x1d\n\nsession_id\x18\x04 \x01(\tR\tsessionId\x12)\n\x0eroot_error_idx\x18\x01 \x01(\x05H\x00R\x0crootErrorIdx\x88\x01\x01\x12\x46\n\x06\x65rrors\x18\x02 \x03(\x0b\x32..spark.connect.FetchErrorDetailsResponse.ErrorR\x06\x65rrors\x1a\xae\x01\n\x11StackTraceElement\x12\'\n\x0f\x64\x65\x63laring_class\x18\x01 \x01(\tR\x0e\x64\x65\x63laringClass\x12\x1f\n\x0bmethod_name\x18\x02 \x01(\tR\nmethodName\x12 \n\tfile_name\x18\x03 \x01(\tH\x00R\x08\x66ileName\x88\x01\x01\x12\x1f\n\x0bline_number\x18\x04 \x01(\x05R\nlineNumberB\x0c\n\n_file_name\x1a\xf0\x02\n\x0cQueryContext\x12\x64\n\x0c\x63ontext_type\x18\n \x01(\x0e\x32\x41.spark.connect.FetchErrorDetailsResponse.QueryContext.ContextTypeR\x0b\x63ontextType\x12\x1f\n\x0bobject_type\x18\x01 \x01(\tR\nobjectType\x12\x1f\n\x0bobject_name\x18\x02 \x01(\tR\nobjectName\x12\x1f\n\x0bstart_index\x18\x03 \x01(\x05R\nstartIndex\x12\x1d\n\nstop_index\x18\x04 
\x01(\x05R\tstopIndex\x12\x1a\n\x08\x66ragment\x18\x05 \x01(\tR\x08\x66ragment\x12\x1b\n\tcall_site\x18\x06 \x01(\tR\x08\x63\x61llSite\x12\x18\n\x07summary\x18\x07 \x01(\tR\x07summary"%\n\x0b\x43ontextType\x12\x07\n\x03SQL\x10\x00\x12\r\n\tDATAFRAME\x10\x01\x1a\x99\x03\n\x0eSparkThrowable\x12$\n\x0b\x65rror_class\x18\x01 \x01(\tH\x00R\nerrorClass\x88\x01\x01\x12}\n\x12message_parameters\x18\x02 \x03(\x0b\x32N.spark.connect.FetchErrorDetailsResponse.SparkThrowable.MessageParametersEntryR\x11messageParameters\x12\\\n\x0equery_contexts\x18\x03 \x03(\x0b\x32\x35.spark.connect.FetchErrorDetailsResponse.QueryContextR\rqueryContexts\x12 \n\tsql_state\x18\x04 \x01(\tH\x01R\x08sqlState\x88\x01\x01\x1a\x44\n\x16MessageParametersEntry\x12\x10\n\x03key\x18\x01 \x01(\tR\x03key\x12\x14\n\x05value\x18\x02 \x01(\tR\x05value:\x02\x38\x01\x42\x0e\n\x0c_error_classB\x0c\n\n_sql_state\x1a\xdb\x02\n\x05\x45rror\x12\x30\n\x14\x65rror_type_hierarchy\x18\x01 \x03(\tR\x12\x65rrorTypeHierarchy\x12\x18\n\x07message\x18\x02 \x01(\tR\x07message\x12[\n\x0bstack_trace\x18\x03 \x03(\x0b\x32:.spark.connect.FetchErrorDetailsResponse.StackTraceElementR\nstackTrace\x12 \n\tcause_idx\x18\x04 \x01(\x05H\x00R\x08\x63\x61useIdx\x88\x01\x01\x12\x65\n\x0fspark_throwable\x18\x05 \x01(\x0b\x32\x37.spark.connect.FetchErrorDetailsResponse.SparkThrowableH\x01R\x0esparkThrowable\x88\x01\x01\x42\x0c\n\n_cause_idxB\x12\n\x10_spark_throwableB\x11\n\x0f_root_error_idx"Z\n\x17\x43heckpointCommandResult\x12?\n\x08relation\x18\x01 
\x01(\x0b\x32#.spark.connect.CachedRemoteRelationR\x08relation2\xb2\x07\n\x13SparkConnectService\x12X\n\x0b\x45xecutePlan\x12!.spark.connect.ExecutePlanRequest\x1a".spark.connect.ExecutePlanResponse"\x00\x30\x01\x12V\n\x0b\x41nalyzePlan\x12!.spark.connect.AnalyzePlanRequest\x1a".spark.connect.AnalyzePlanResponse"\x00\x12G\n\x06\x43onfig\x12\x1c.spark.connect.ConfigRequest\x1a\x1d.spark.connect.ConfigResponse"\x00\x12[\n\x0c\x41\x64\x64\x41rtifacts\x12".spark.connect.AddArtifactsRequest\x1a#.spark.connect.AddArtifactsResponse"\x00(\x01\x12\x63\n\x0e\x41rtifactStatus\x12&.spark.connect.ArtifactStatusesRequest\x1a\'.spark.connect.ArtifactStatusesResponse"\x00\x12P\n\tInterrupt\x12\x1f.spark.connect.InterruptRequest\x1a .spark.connect.InterruptResponse"\x00\x12`\n\x0fReattachExecute\x12%.spark.connect.ReattachExecuteRequest\x1a".spark.connect.ExecutePlanResponse"\x00\x30\x01\x12_\n\x0eReleaseExecute\x12$.spark.connect.ReleaseExecuteRequest\x1a%.spark.connect.ReleaseExecuteResponse"\x00\x12_\n\x0eReleaseSession\x12$.spark.connect.ReleaseSessionRequest\x1a%.spark.connect.ReleaseSessionResponse"\x00\x12h\n\x11\x46\x65tchErrorDetails\x12\'.spark.connect.FetchErrorDetailsRequest\x1a(.spark.connect.FetchErrorDetailsResponse"\x00\x42\x36\n\x1eorg.apache.spark.connect.protoP\x01Z\x12internal/generatedb\x06proto3' ) _globals = globals() @@ -68,186 +69,190 @@ _globals[ "_FETCHERRORDETAILSRESPONSE_SPARKTHROWABLE_MESSAGEPARAMETERSENTRY" ]._serialized_options = b"8\001" - _globals["_PLAN"]._serialized_start = 219 - _globals["_PLAN"]._serialized_end = 335 - _globals["_USERCONTEXT"]._serialized_start = 337 - _globals["_USERCONTEXT"]._serialized_end = 459 - _globals["_ANALYZEPLANREQUEST"]._serialized_start = 462 - _globals["_ANALYZEPLANREQUEST"]._serialized_end = 3014 - _globals["_ANALYZEPLANREQUEST_SCHEMA"]._serialized_start = 1745 - _globals["_ANALYZEPLANREQUEST_SCHEMA"]._serialized_end = 1794 - _globals["_ANALYZEPLANREQUEST_EXPLAIN"]._serialized_start = 1797 - 
_globals["_ANALYZEPLANREQUEST_EXPLAIN"]._serialized_end = 2112 - _globals["_ANALYZEPLANREQUEST_EXPLAIN_EXPLAINMODE"]._serialized_start = 1940 - _globals["_ANALYZEPLANREQUEST_EXPLAIN_EXPLAINMODE"]._serialized_end = 2112 - _globals["_ANALYZEPLANREQUEST_TREESTRING"]._serialized_start = 2114 - _globals["_ANALYZEPLANREQUEST_TREESTRING"]._serialized_end = 2204 - _globals["_ANALYZEPLANREQUEST_ISLOCAL"]._serialized_start = 2206 - _globals["_ANALYZEPLANREQUEST_ISLOCAL"]._serialized_end = 2256 - _globals["_ANALYZEPLANREQUEST_ISSTREAMING"]._serialized_start = 2258 - _globals["_ANALYZEPLANREQUEST_ISSTREAMING"]._serialized_end = 2312 - _globals["_ANALYZEPLANREQUEST_INPUTFILES"]._serialized_start = 2314 - _globals["_ANALYZEPLANREQUEST_INPUTFILES"]._serialized_end = 2367 - _globals["_ANALYZEPLANREQUEST_SPARKVERSION"]._serialized_start = 2369 - _globals["_ANALYZEPLANREQUEST_SPARKVERSION"]._serialized_end = 2383 - _globals["_ANALYZEPLANREQUEST_DDLPARSE"]._serialized_start = 2385 - _globals["_ANALYZEPLANREQUEST_DDLPARSE"]._serialized_end = 2426 - _globals["_ANALYZEPLANREQUEST_SAMESEMANTICS"]._serialized_start = 2428 - _globals["_ANALYZEPLANREQUEST_SAMESEMANTICS"]._serialized_end = 2549 - _globals["_ANALYZEPLANREQUEST_SEMANTICHASH"]._serialized_start = 2551 - _globals["_ANALYZEPLANREQUEST_SEMANTICHASH"]._serialized_end = 2606 - _globals["_ANALYZEPLANREQUEST_PERSIST"]._serialized_start = 2609 - _globals["_ANALYZEPLANREQUEST_PERSIST"]._serialized_end = 2760 - _globals["_ANALYZEPLANREQUEST_UNPERSIST"]._serialized_start = 2762 - _globals["_ANALYZEPLANREQUEST_UNPERSIST"]._serialized_end = 2872 - _globals["_ANALYZEPLANREQUEST_GETSTORAGELEVEL"]._serialized_start = 2874 - _globals["_ANALYZEPLANREQUEST_GETSTORAGELEVEL"]._serialized_end = 2944 - _globals["_ANALYZEPLANRESPONSE"]._serialized_start = 3017 - _globals["_ANALYZEPLANRESPONSE"]._serialized_end = 4759 - _globals["_ANALYZEPLANRESPONSE_SCHEMA"]._serialized_start = 4178 - _globals["_ANALYZEPLANRESPONSE_SCHEMA"]._serialized_end = 4235 - 
_globals["_ANALYZEPLANRESPONSE_EXPLAIN"]._serialized_start = 4237 - _globals["_ANALYZEPLANRESPONSE_EXPLAIN"]._serialized_end = 4285 - _globals["_ANALYZEPLANRESPONSE_TREESTRING"]._serialized_start = 4287 - _globals["_ANALYZEPLANRESPONSE_TREESTRING"]._serialized_end = 4332 - _globals["_ANALYZEPLANRESPONSE_ISLOCAL"]._serialized_start = 4334 - _globals["_ANALYZEPLANRESPONSE_ISLOCAL"]._serialized_end = 4370 - _globals["_ANALYZEPLANRESPONSE_ISSTREAMING"]._serialized_start = 4372 - _globals["_ANALYZEPLANRESPONSE_ISSTREAMING"]._serialized_end = 4420 - _globals["_ANALYZEPLANRESPONSE_INPUTFILES"]._serialized_start = 4422 - _globals["_ANALYZEPLANRESPONSE_INPUTFILES"]._serialized_end = 4456 - _globals["_ANALYZEPLANRESPONSE_SPARKVERSION"]._serialized_start = 4458 - _globals["_ANALYZEPLANRESPONSE_SPARKVERSION"]._serialized_end = 4498 - _globals["_ANALYZEPLANRESPONSE_DDLPARSE"]._serialized_start = 4500 - _globals["_ANALYZEPLANRESPONSE_DDLPARSE"]._serialized_end = 4559 - _globals["_ANALYZEPLANRESPONSE_SAMESEMANTICS"]._serialized_start = 4561 - _globals["_ANALYZEPLANRESPONSE_SAMESEMANTICS"]._serialized_end = 4600 - _globals["_ANALYZEPLANRESPONSE_SEMANTICHASH"]._serialized_start = 4602 - _globals["_ANALYZEPLANRESPONSE_SEMANTICHASH"]._serialized_end = 4640 - _globals["_ANALYZEPLANRESPONSE_PERSIST"]._serialized_start = 2609 - _globals["_ANALYZEPLANRESPONSE_PERSIST"]._serialized_end = 2618 - _globals["_ANALYZEPLANRESPONSE_UNPERSIST"]._serialized_start = 2762 - _globals["_ANALYZEPLANRESPONSE_UNPERSIST"]._serialized_end = 2773 - _globals["_ANALYZEPLANRESPONSE_GETSTORAGELEVEL"]._serialized_start = 4666 - _globals["_ANALYZEPLANRESPONSE_GETSTORAGELEVEL"]._serialized_end = 4749 - _globals["_EXECUTEPLANREQUEST"]._serialized_start = 4762 - _globals["_EXECUTEPLANREQUEST"]._serialized_end = 5437 - _globals["_EXECUTEPLANREQUEST_REQUESTOPTION"]._serialized_start = 5196 - _globals["_EXECUTEPLANREQUEST_REQUESTOPTION"]._serialized_end = 5361 - _globals["_EXECUTEPLANRESPONSE"]._serialized_start = 5440 
- _globals["_EXECUTEPLANRESPONSE"]._serialized_end = 8358 - _globals["_EXECUTEPLANRESPONSE_SQLCOMMANDRESULT"]._serialized_start = 7132 - _globals["_EXECUTEPLANRESPONSE_SQLCOMMANDRESULT"]._serialized_end = 7203 - _globals["_EXECUTEPLANRESPONSE_ARROWBATCH"]._serialized_start = 7205 - _globals["_EXECUTEPLANRESPONSE_ARROWBATCH"]._serialized_end = 7323 - _globals["_EXECUTEPLANRESPONSE_METRICS"]._serialized_start = 7326 - _globals["_EXECUTEPLANRESPONSE_METRICS"]._serialized_end = 7843 - _globals["_EXECUTEPLANRESPONSE_METRICS_METRICOBJECT"]._serialized_start = 7421 - _globals["_EXECUTEPLANRESPONSE_METRICS_METRICOBJECT"]._serialized_end = 7753 + _globals["_PLAN"]._serialized_start = 243 + _globals["_PLAN"]._serialized_end = 359 + _globals["_USERCONTEXT"]._serialized_start = 361 + _globals["_USERCONTEXT"]._serialized_end = 483 + _globals["_ANALYZEPLANREQUEST"]._serialized_start = 486 + _globals["_ANALYZEPLANREQUEST"]._serialized_end = 3163 + _globals["_ANALYZEPLANREQUEST_SCHEMA"]._serialized_start = 1848 + _globals["_ANALYZEPLANREQUEST_SCHEMA"]._serialized_end = 1897 + _globals["_ANALYZEPLANREQUEST_EXPLAIN"]._serialized_start = 1900 + _globals["_ANALYZEPLANREQUEST_EXPLAIN"]._serialized_end = 2215 + _globals["_ANALYZEPLANREQUEST_EXPLAIN_EXPLAINMODE"]._serialized_start = 2043 + _globals["_ANALYZEPLANREQUEST_EXPLAIN_EXPLAINMODE"]._serialized_end = 2215 + _globals["_ANALYZEPLANREQUEST_TREESTRING"]._serialized_start = 2217 + _globals["_ANALYZEPLANREQUEST_TREESTRING"]._serialized_end = 2307 + _globals["_ANALYZEPLANREQUEST_ISLOCAL"]._serialized_start = 2309 + _globals["_ANALYZEPLANREQUEST_ISLOCAL"]._serialized_end = 2359 + _globals["_ANALYZEPLANREQUEST_ISSTREAMING"]._serialized_start = 2361 + _globals["_ANALYZEPLANREQUEST_ISSTREAMING"]._serialized_end = 2415 + _globals["_ANALYZEPLANREQUEST_INPUTFILES"]._serialized_start = 2417 + _globals["_ANALYZEPLANREQUEST_INPUTFILES"]._serialized_end = 2470 + _globals["_ANALYZEPLANREQUEST_SPARKVERSION"]._serialized_start = 2472 + 
_globals["_ANALYZEPLANREQUEST_SPARKVERSION"]._serialized_end = 2486 + _globals["_ANALYZEPLANREQUEST_DDLPARSE"]._serialized_start = 2488 + _globals["_ANALYZEPLANREQUEST_DDLPARSE"]._serialized_end = 2529 + _globals["_ANALYZEPLANREQUEST_SAMESEMANTICS"]._serialized_start = 2531 + _globals["_ANALYZEPLANREQUEST_SAMESEMANTICS"]._serialized_end = 2652 + _globals["_ANALYZEPLANREQUEST_SEMANTICHASH"]._serialized_start = 2654 + _globals["_ANALYZEPLANREQUEST_SEMANTICHASH"]._serialized_end = 2709 + _globals["_ANALYZEPLANREQUEST_PERSIST"]._serialized_start = 2712 + _globals["_ANALYZEPLANREQUEST_PERSIST"]._serialized_end = 2863 + _globals["_ANALYZEPLANREQUEST_UNPERSIST"]._serialized_start = 2865 + _globals["_ANALYZEPLANREQUEST_UNPERSIST"]._serialized_end = 2975 + _globals["_ANALYZEPLANREQUEST_GETSTORAGELEVEL"]._serialized_start = 2977 + _globals["_ANALYZEPLANREQUEST_GETSTORAGELEVEL"]._serialized_end = 3047 + _globals["_ANALYZEPLANREQUEST_JSONTODDL"]._serialized_start = 3049 + _globals["_ANALYZEPLANREQUEST_JSONTODDL"]._serialized_end = 3093 + _globals["_ANALYZEPLANRESPONSE"]._serialized_start = 3166 + _globals["_ANALYZEPLANRESPONSE"]._serialized_end = 5032 + _globals["_ANALYZEPLANRESPONSE_SCHEMA"]._serialized_start = 4407 + _globals["_ANALYZEPLANRESPONSE_SCHEMA"]._serialized_end = 4464 + _globals["_ANALYZEPLANRESPONSE_EXPLAIN"]._serialized_start = 4466 + _globals["_ANALYZEPLANRESPONSE_EXPLAIN"]._serialized_end = 4514 + _globals["_ANALYZEPLANRESPONSE_TREESTRING"]._serialized_start = 4516 + _globals["_ANALYZEPLANRESPONSE_TREESTRING"]._serialized_end = 4561 + _globals["_ANALYZEPLANRESPONSE_ISLOCAL"]._serialized_start = 4563 + _globals["_ANALYZEPLANRESPONSE_ISLOCAL"]._serialized_end = 4599 + _globals["_ANALYZEPLANRESPONSE_ISSTREAMING"]._serialized_start = 4601 + _globals["_ANALYZEPLANRESPONSE_ISSTREAMING"]._serialized_end = 4649 + _globals["_ANALYZEPLANRESPONSE_INPUTFILES"]._serialized_start = 4651 + _globals["_ANALYZEPLANRESPONSE_INPUTFILES"]._serialized_end = 4685 + 
_globals["_ANALYZEPLANRESPONSE_SPARKVERSION"]._serialized_start = 4687 + _globals["_ANALYZEPLANRESPONSE_SPARKVERSION"]._serialized_end = 4727 + _globals["_ANALYZEPLANRESPONSE_DDLPARSE"]._serialized_start = 4729 + _globals["_ANALYZEPLANRESPONSE_DDLPARSE"]._serialized_end = 4788 + _globals["_ANALYZEPLANRESPONSE_SAMESEMANTICS"]._serialized_start = 4790 + _globals["_ANALYZEPLANRESPONSE_SAMESEMANTICS"]._serialized_end = 4829 + _globals["_ANALYZEPLANRESPONSE_SEMANTICHASH"]._serialized_start = 4831 + _globals["_ANALYZEPLANRESPONSE_SEMANTICHASH"]._serialized_end = 4869 + _globals["_ANALYZEPLANRESPONSE_PERSIST"]._serialized_start = 2712 + _globals["_ANALYZEPLANRESPONSE_PERSIST"]._serialized_end = 2721 + _globals["_ANALYZEPLANRESPONSE_UNPERSIST"]._serialized_start = 2865 + _globals["_ANALYZEPLANRESPONSE_UNPERSIST"]._serialized_end = 2876 + _globals["_ANALYZEPLANRESPONSE_GETSTORAGELEVEL"]._serialized_start = 4895 + _globals["_ANALYZEPLANRESPONSE_GETSTORAGELEVEL"]._serialized_end = 4978 + _globals["_ANALYZEPLANRESPONSE_JSONTODDL"]._serialized_start = 4980 + _globals["_ANALYZEPLANRESPONSE_JSONTODDL"]._serialized_end = 5022 + _globals["_EXECUTEPLANREQUEST"]._serialized_start = 5035 + _globals["_EXECUTEPLANREQUEST"]._serialized_end = 5710 + _globals["_EXECUTEPLANREQUEST_REQUESTOPTION"]._serialized_start = 5469 + _globals["_EXECUTEPLANREQUEST_REQUESTOPTION"]._serialized_end = 5634 + _globals["_EXECUTEPLANRESPONSE"]._serialized_start = 5713 + _globals["_EXECUTEPLANRESPONSE"]._serialized_end = 8709 + _globals["_EXECUTEPLANRESPONSE_SQLCOMMANDRESULT"]._serialized_start = 7483 + _globals["_EXECUTEPLANRESPONSE_SQLCOMMANDRESULT"]._serialized_end = 7554 + _globals["_EXECUTEPLANRESPONSE_ARROWBATCH"]._serialized_start = 7556 + _globals["_EXECUTEPLANRESPONSE_ARROWBATCH"]._serialized_end = 7674 + _globals["_EXECUTEPLANRESPONSE_METRICS"]._serialized_start = 7677 + _globals["_EXECUTEPLANRESPONSE_METRICS"]._serialized_end = 8194 + 
_globals["_EXECUTEPLANRESPONSE_METRICS_METRICOBJECT"]._serialized_start = 7772 + _globals["_EXECUTEPLANRESPONSE_METRICS_METRICOBJECT"]._serialized_end = 8104 _globals[ "_EXECUTEPLANRESPONSE_METRICS_METRICOBJECT_EXECUTIONMETRICSENTRY" - ]._serialized_start = 7630 + ]._serialized_start = 7981 _globals[ "_EXECUTEPLANRESPONSE_METRICS_METRICOBJECT_EXECUTIONMETRICSENTRY" - ]._serialized_end = 7753 - _globals["_EXECUTEPLANRESPONSE_METRICS_METRICVALUE"]._serialized_start = 7755 - _globals["_EXECUTEPLANRESPONSE_METRICS_METRICVALUE"]._serialized_end = 7843 - _globals["_EXECUTEPLANRESPONSE_OBSERVEDMETRICS"]._serialized_start = 7846 - _globals["_EXECUTEPLANRESPONSE_OBSERVEDMETRICS"]._serialized_end = 7987 - _globals["_EXECUTEPLANRESPONSE_RESULTCOMPLETE"]._serialized_start = 7989 - _globals["_EXECUTEPLANRESPONSE_RESULTCOMPLETE"]._serialized_end = 8005 - _globals["_EXECUTEPLANRESPONSE_EXECUTIONPROGRESS"]._serialized_start = 8008 - _globals["_EXECUTEPLANRESPONSE_EXECUTIONPROGRESS"]._serialized_end = 8341 - _globals["_EXECUTEPLANRESPONSE_EXECUTIONPROGRESS_STAGEINFO"]._serialized_start = 8164 - _globals["_EXECUTEPLANRESPONSE_EXECUTIONPROGRESS_STAGEINFO"]._serialized_end = 8341 - _globals["_KEYVALUE"]._serialized_start = 8360 - _globals["_KEYVALUE"]._serialized_end = 8425 - _globals["_CONFIGREQUEST"]._serialized_start = 8428 - _globals["_CONFIGREQUEST"]._serialized_end = 9587 - _globals["_CONFIGREQUEST_OPERATION"]._serialized_start = 8736 - _globals["_CONFIGREQUEST_OPERATION"]._serialized_end = 9234 - _globals["_CONFIGREQUEST_SET"]._serialized_start = 9236 - _globals["_CONFIGREQUEST_SET"]._serialized_end = 9288 - _globals["_CONFIGREQUEST_GET"]._serialized_start = 9290 - _globals["_CONFIGREQUEST_GET"]._serialized_end = 9315 - _globals["_CONFIGREQUEST_GETWITHDEFAULT"]._serialized_start = 9317 - _globals["_CONFIGREQUEST_GETWITHDEFAULT"]._serialized_end = 9380 - _globals["_CONFIGREQUEST_GETOPTION"]._serialized_start = 9382 - _globals["_CONFIGREQUEST_GETOPTION"]._serialized_end = 9413 - 
_globals["_CONFIGREQUEST_GETALL"]._serialized_start = 9415 - _globals["_CONFIGREQUEST_GETALL"]._serialized_end = 9463 - _globals["_CONFIGREQUEST_UNSET"]._serialized_start = 9465 - _globals["_CONFIGREQUEST_UNSET"]._serialized_end = 9492 - _globals["_CONFIGREQUEST_ISMODIFIABLE"]._serialized_start = 9494 - _globals["_CONFIGREQUEST_ISMODIFIABLE"]._serialized_end = 9528 - _globals["_CONFIGRESPONSE"]._serialized_start = 9590 - _globals["_CONFIGRESPONSE"]._serialized_end = 9765 - _globals["_ADDARTIFACTSREQUEST"]._serialized_start = 9768 - _globals["_ADDARTIFACTSREQUEST"]._serialized_end = 10770 - _globals["_ADDARTIFACTSREQUEST_ARTIFACTCHUNK"]._serialized_start = 10243 - _globals["_ADDARTIFACTSREQUEST_ARTIFACTCHUNK"]._serialized_end = 10296 - _globals["_ADDARTIFACTSREQUEST_SINGLECHUNKARTIFACT"]._serialized_start = 10298 - _globals["_ADDARTIFACTSREQUEST_SINGLECHUNKARTIFACT"]._serialized_end = 10409 - _globals["_ADDARTIFACTSREQUEST_BATCH"]._serialized_start = 10411 - _globals["_ADDARTIFACTSREQUEST_BATCH"]._serialized_end = 10504 - _globals["_ADDARTIFACTSREQUEST_BEGINCHUNKEDARTIFACT"]._serialized_start = 10507 - _globals["_ADDARTIFACTSREQUEST_BEGINCHUNKEDARTIFACT"]._serialized_end = 10700 - _globals["_ADDARTIFACTSRESPONSE"]._serialized_start = 10773 - _globals["_ADDARTIFACTSRESPONSE"]._serialized_end = 11045 - _globals["_ADDARTIFACTSRESPONSE_ARTIFACTSUMMARY"]._serialized_start = 10964 - _globals["_ADDARTIFACTSRESPONSE_ARTIFACTSUMMARY"]._serialized_end = 11045 - _globals["_ARTIFACTSTATUSESREQUEST"]._serialized_start = 11048 - _globals["_ARTIFACTSTATUSESREQUEST"]._serialized_end = 11374 - _globals["_ARTIFACTSTATUSESRESPONSE"]._serialized_start = 11377 - _globals["_ARTIFACTSTATUSESRESPONSE"]._serialized_end = 11729 - _globals["_ARTIFACTSTATUSESRESPONSE_STATUSESENTRY"]._serialized_start = 11572 - _globals["_ARTIFACTSTATUSESRESPONSE_STATUSESENTRY"]._serialized_end = 11687 - _globals["_ARTIFACTSTATUSESRESPONSE_ARTIFACTSTATUS"]._serialized_start = 11689 - 
_globals["_ARTIFACTSTATUSESRESPONSE_ARTIFACTSTATUS"]._serialized_end = 11729 - _globals["_INTERRUPTREQUEST"]._serialized_start = 11732 - _globals["_INTERRUPTREQUEST"]._serialized_end = 12335 - _globals["_INTERRUPTREQUEST_INTERRUPTTYPE"]._serialized_start = 12135 - _globals["_INTERRUPTREQUEST_INTERRUPTTYPE"]._serialized_end = 12263 - _globals["_INTERRUPTRESPONSE"]._serialized_start = 12338 - _globals["_INTERRUPTRESPONSE"]._serialized_end = 12482 - _globals["_REATTACHOPTIONS"]._serialized_start = 12484 - _globals["_REATTACHOPTIONS"]._serialized_end = 12537 - _globals["_REATTACHEXECUTEREQUEST"]._serialized_start = 12540 - _globals["_REATTACHEXECUTEREQUEST"]._serialized_end = 12946 - _globals["_RELEASEEXECUTEREQUEST"]._serialized_start = 12949 - _globals["_RELEASEEXECUTEREQUEST"]._serialized_end = 13534 - _globals["_RELEASEEXECUTEREQUEST_RELEASEALL"]._serialized_start = 13403 - _globals["_RELEASEEXECUTEREQUEST_RELEASEALL"]._serialized_end = 13415 - _globals["_RELEASEEXECUTEREQUEST_RELEASEUNTIL"]._serialized_start = 13417 - _globals["_RELEASEEXECUTEREQUEST_RELEASEUNTIL"]._serialized_end = 13464 - _globals["_RELEASEEXECUTERESPONSE"]._serialized_start = 13537 - _globals["_RELEASEEXECUTERESPONSE"]._serialized_end = 13702 - _globals["_RELEASESESSIONREQUEST"]._serialized_start = 13705 - _globals["_RELEASESESSIONREQUEST"]._serialized_end = 13876 - _globals["_RELEASESESSIONRESPONSE"]._serialized_start = 13878 - _globals["_RELEASESESSIONRESPONSE"]._serialized_end = 13986 - _globals["_FETCHERRORDETAILSREQUEST"]._serialized_start = 13989 - _globals["_FETCHERRORDETAILSREQUEST"]._serialized_end = 14321 - _globals["_FETCHERRORDETAILSRESPONSE"]._serialized_start = 14324 - _globals["_FETCHERRORDETAILSRESPONSE"]._serialized_end = 15879 - _globals["_FETCHERRORDETAILSRESPONSE_STACKTRACEELEMENT"]._serialized_start = 14553 - _globals["_FETCHERRORDETAILSRESPONSE_STACKTRACEELEMENT"]._serialized_end = 14727 - _globals["_FETCHERRORDETAILSRESPONSE_QUERYCONTEXT"]._serialized_start = 14730 - 
_globals["_FETCHERRORDETAILSRESPONSE_QUERYCONTEXT"]._serialized_end = 15098 - _globals["_FETCHERRORDETAILSRESPONSE_QUERYCONTEXT_CONTEXTTYPE"]._serialized_start = 15061 - _globals["_FETCHERRORDETAILSRESPONSE_QUERYCONTEXT_CONTEXTTYPE"]._serialized_end = 15098 - _globals["_FETCHERRORDETAILSRESPONSE_SPARKTHROWABLE"]._serialized_start = 15101 - _globals["_FETCHERRORDETAILSRESPONSE_SPARKTHROWABLE"]._serialized_end = 15510 + ]._serialized_end = 8104 + _globals["_EXECUTEPLANRESPONSE_METRICS_METRICVALUE"]._serialized_start = 8106 + _globals["_EXECUTEPLANRESPONSE_METRICS_METRICVALUE"]._serialized_end = 8194 + _globals["_EXECUTEPLANRESPONSE_OBSERVEDMETRICS"]._serialized_start = 8197 + _globals["_EXECUTEPLANRESPONSE_OBSERVEDMETRICS"]._serialized_end = 8338 + _globals["_EXECUTEPLANRESPONSE_RESULTCOMPLETE"]._serialized_start = 8340 + _globals["_EXECUTEPLANRESPONSE_RESULTCOMPLETE"]._serialized_end = 8356 + _globals["_EXECUTEPLANRESPONSE_EXECUTIONPROGRESS"]._serialized_start = 8359 + _globals["_EXECUTEPLANRESPONSE_EXECUTIONPROGRESS"]._serialized_end = 8692 + _globals["_EXECUTEPLANRESPONSE_EXECUTIONPROGRESS_STAGEINFO"]._serialized_start = 8515 + _globals["_EXECUTEPLANRESPONSE_EXECUTIONPROGRESS_STAGEINFO"]._serialized_end = 8692 + _globals["_KEYVALUE"]._serialized_start = 8711 + _globals["_KEYVALUE"]._serialized_end = 8776 + _globals["_CONFIGREQUEST"]._serialized_start = 8779 + _globals["_CONFIGREQUEST"]._serialized_end = 9978 + _globals["_CONFIGREQUEST_OPERATION"]._serialized_start = 9087 + _globals["_CONFIGREQUEST_OPERATION"]._serialized_end = 9585 + _globals["_CONFIGREQUEST_SET"]._serialized_start = 9587 + _globals["_CONFIGREQUEST_SET"]._serialized_end = 9679 + _globals["_CONFIGREQUEST_GET"]._serialized_start = 9681 + _globals["_CONFIGREQUEST_GET"]._serialized_end = 9706 + _globals["_CONFIGREQUEST_GETWITHDEFAULT"]._serialized_start = 9708 + _globals["_CONFIGREQUEST_GETWITHDEFAULT"]._serialized_end = 9771 + _globals["_CONFIGREQUEST_GETOPTION"]._serialized_start = 9773 + 
_globals["_CONFIGREQUEST_GETOPTION"]._serialized_end = 9804 + _globals["_CONFIGREQUEST_GETALL"]._serialized_start = 9806 + _globals["_CONFIGREQUEST_GETALL"]._serialized_end = 9854 + _globals["_CONFIGREQUEST_UNSET"]._serialized_start = 9856 + _globals["_CONFIGREQUEST_UNSET"]._serialized_end = 9883 + _globals["_CONFIGREQUEST_ISMODIFIABLE"]._serialized_start = 9885 + _globals["_CONFIGREQUEST_ISMODIFIABLE"]._serialized_end = 9919 + _globals["_CONFIGRESPONSE"]._serialized_start = 9981 + _globals["_CONFIGRESPONSE"]._serialized_end = 10156 + _globals["_ADDARTIFACTSREQUEST"]._serialized_start = 10159 + _globals["_ADDARTIFACTSREQUEST"]._serialized_end = 11161 + _globals["_ADDARTIFACTSREQUEST_ARTIFACTCHUNK"]._serialized_start = 10634 + _globals["_ADDARTIFACTSREQUEST_ARTIFACTCHUNK"]._serialized_end = 10687 + _globals["_ADDARTIFACTSREQUEST_SINGLECHUNKARTIFACT"]._serialized_start = 10689 + _globals["_ADDARTIFACTSREQUEST_SINGLECHUNKARTIFACT"]._serialized_end = 10800 + _globals["_ADDARTIFACTSREQUEST_BATCH"]._serialized_start = 10802 + _globals["_ADDARTIFACTSREQUEST_BATCH"]._serialized_end = 10895 + _globals["_ADDARTIFACTSREQUEST_BEGINCHUNKEDARTIFACT"]._serialized_start = 10898 + _globals["_ADDARTIFACTSREQUEST_BEGINCHUNKEDARTIFACT"]._serialized_end = 11091 + _globals["_ADDARTIFACTSRESPONSE"]._serialized_start = 11164 + _globals["_ADDARTIFACTSRESPONSE"]._serialized_end = 11436 + _globals["_ADDARTIFACTSRESPONSE_ARTIFACTSUMMARY"]._serialized_start = 11355 + _globals["_ADDARTIFACTSRESPONSE_ARTIFACTSUMMARY"]._serialized_end = 11436 + _globals["_ARTIFACTSTATUSESREQUEST"]._serialized_start = 11439 + _globals["_ARTIFACTSTATUSESREQUEST"]._serialized_end = 11765 + _globals["_ARTIFACTSTATUSESRESPONSE"]._serialized_start = 11768 + _globals["_ARTIFACTSTATUSESRESPONSE"]._serialized_end = 12120 + _globals["_ARTIFACTSTATUSESRESPONSE_STATUSESENTRY"]._serialized_start = 11963 + _globals["_ARTIFACTSTATUSESRESPONSE_STATUSESENTRY"]._serialized_end = 12078 + 
_globals["_ARTIFACTSTATUSESRESPONSE_ARTIFACTSTATUS"]._serialized_start = 12080 + _globals["_ARTIFACTSTATUSESRESPONSE_ARTIFACTSTATUS"]._serialized_end = 12120 + _globals["_INTERRUPTREQUEST"]._serialized_start = 12123 + _globals["_INTERRUPTREQUEST"]._serialized_end = 12726 + _globals["_INTERRUPTREQUEST_INTERRUPTTYPE"]._serialized_start = 12526 + _globals["_INTERRUPTREQUEST_INTERRUPTTYPE"]._serialized_end = 12654 + _globals["_INTERRUPTRESPONSE"]._serialized_start = 12729 + _globals["_INTERRUPTRESPONSE"]._serialized_end = 12873 + _globals["_REATTACHOPTIONS"]._serialized_start = 12875 + _globals["_REATTACHOPTIONS"]._serialized_end = 12928 + _globals["_REATTACHEXECUTEREQUEST"]._serialized_start = 12931 + _globals["_REATTACHEXECUTEREQUEST"]._serialized_end = 13337 + _globals["_RELEASEEXECUTEREQUEST"]._serialized_start = 13340 + _globals["_RELEASEEXECUTEREQUEST"]._serialized_end = 13925 + _globals["_RELEASEEXECUTEREQUEST_RELEASEALL"]._serialized_start = 13794 + _globals["_RELEASEEXECUTEREQUEST_RELEASEALL"]._serialized_end = 13806 + _globals["_RELEASEEXECUTEREQUEST_RELEASEUNTIL"]._serialized_start = 13808 + _globals["_RELEASEEXECUTEREQUEST_RELEASEUNTIL"]._serialized_end = 13855 + _globals["_RELEASEEXECUTERESPONSE"]._serialized_start = 13928 + _globals["_RELEASEEXECUTERESPONSE"]._serialized_end = 14093 + _globals["_RELEASESESSIONREQUEST"]._serialized_start = 14096 + _globals["_RELEASESESSIONREQUEST"]._serialized_end = 14308 + _globals["_RELEASESESSIONRESPONSE"]._serialized_start = 14310 + _globals["_RELEASESESSIONRESPONSE"]._serialized_end = 14418 + _globals["_FETCHERRORDETAILSREQUEST"]._serialized_start = 14421 + _globals["_FETCHERRORDETAILSREQUEST"]._serialized_end = 14753 + _globals["_FETCHERRORDETAILSRESPONSE"]._serialized_start = 14756 + _globals["_FETCHERRORDETAILSRESPONSE"]._serialized_end = 16311 + _globals["_FETCHERRORDETAILSRESPONSE_STACKTRACEELEMENT"]._serialized_start = 14985 + _globals["_FETCHERRORDETAILSRESPONSE_STACKTRACEELEMENT"]._serialized_end = 15159 + 
_globals["_FETCHERRORDETAILSRESPONSE_QUERYCONTEXT"]._serialized_start = 15162 + _globals["_FETCHERRORDETAILSRESPONSE_QUERYCONTEXT"]._serialized_end = 15530 + _globals["_FETCHERRORDETAILSRESPONSE_QUERYCONTEXT_CONTEXTTYPE"]._serialized_start = 15493 + _globals["_FETCHERRORDETAILSRESPONSE_QUERYCONTEXT_CONTEXTTYPE"]._serialized_end = 15530 + _globals["_FETCHERRORDETAILSRESPONSE_SPARKTHROWABLE"]._serialized_start = 15533 + _globals["_FETCHERRORDETAILSRESPONSE_SPARKTHROWABLE"]._serialized_end = 15942 _globals[ "_FETCHERRORDETAILSRESPONSE_SPARKTHROWABLE_MESSAGEPARAMETERSENTRY" - ]._serialized_start = 15412 + ]._serialized_start = 15844 _globals[ "_FETCHERRORDETAILSRESPONSE_SPARKTHROWABLE_MESSAGEPARAMETERSENTRY" - ]._serialized_end = 15480 - _globals["_FETCHERRORDETAILSRESPONSE_ERROR"]._serialized_start = 15513 - _globals["_FETCHERRORDETAILSRESPONSE_ERROR"]._serialized_end = 15860 - _globals["_CHECKPOINTCOMMANDRESULT"]._serialized_start = 15881 - _globals["_CHECKPOINTCOMMANDRESULT"]._serialized_end = 15971 - _globals["_SPARKCONNECTSERVICE"]._serialized_start = 15974 - _globals["_SPARKCONNECTSERVICE"]._serialized_end = 16920 + ]._serialized_end = 15912 + _globals["_FETCHERRORDETAILSRESPONSE_ERROR"]._serialized_start = 15945 + _globals["_FETCHERRORDETAILSRESPONSE_ERROR"]._serialized_end = 16292 + _globals["_CHECKPOINTCOMMANDRESULT"]._serialized_start = 16313 + _globals["_CHECKPOINTCOMMANDRESULT"]._serialized_end = 16403 + _globals["_SPARKCONNECTSERVICE"]._serialized_start = 16406 + _globals["_SPARKCONNECTSERVICE"]._serialized_end = 17352 # @@protoc_insertion_point(module_scope) diff --git a/python/pyspark/sql/connect/proto/base_pb2.pyi b/python/pyspark/sql/connect/proto/base_pb2.pyi index 5db25569828b7..738339fa968ec 100644 --- a/python/pyspark/sql/connect/proto/base_pb2.pyi +++ b/python/pyspark/sql/connect/proto/base_pb2.pyi @@ -43,6 +43,7 @@ import google.protobuf.message import pyspark.sql.connect.proto.commands_pb2 import pyspark.sql.connect.proto.common_pb2 import 
pyspark.sql.connect.proto.expressions_pb2 +import pyspark.sql.connect.proto.ml_pb2 import pyspark.sql.connect.proto.relations_pb2 import pyspark.sql.connect.proto.types_pb2 import sys @@ -477,6 +478,21 @@ class AnalyzePlanRequest(google.protobuf.message.Message): self, field_name: typing_extensions.Literal["relation", b"relation"] ) -> None: ... + class JsonToDDL(google.protobuf.message.Message): + DESCRIPTOR: google.protobuf.descriptor.Descriptor + + JSON_STRING_FIELD_NUMBER: builtins.int + json_string: builtins.str + """(Required) The JSON formatted string to be converted to DDL.""" + def __init__( + self, + *, + json_string: builtins.str = ..., + ) -> None: ... + def ClearField( + self, field_name: typing_extensions.Literal["json_string", b"json_string"] + ) -> None: ... + SESSION_ID_FIELD_NUMBER: builtins.int CLIENT_OBSERVED_SERVER_SIDE_SESSION_ID_FIELD_NUMBER: builtins.int USER_CONTEXT_FIELD_NUMBER: builtins.int @@ -494,6 +510,7 @@ class AnalyzePlanRequest(google.protobuf.message.Message): PERSIST_FIELD_NUMBER: builtins.int UNPERSIST_FIELD_NUMBER: builtins.int GET_STORAGE_LEVEL_FIELD_NUMBER: builtins.int + JSON_TO_DDL_FIELD_NUMBER: builtins.int session_id: builtins.str """(Required) @@ -542,6 +559,8 @@ class AnalyzePlanRequest(google.protobuf.message.Message): def unpersist(self) -> global___AnalyzePlanRequest.Unpersist: ... @property def get_storage_level(self) -> global___AnalyzePlanRequest.GetStorageLevel: ... + @property + def json_to_ddl(self) -> global___AnalyzePlanRequest.JsonToDDL: ... def __init__( self, *, @@ -562,6 +581,7 @@ class AnalyzePlanRequest(google.protobuf.message.Message): persist: global___AnalyzePlanRequest.Persist | None = ..., unpersist: global___AnalyzePlanRequest.Unpersist | None = ..., get_storage_level: global___AnalyzePlanRequest.GetStorageLevel | None = ..., + json_to_ddl: global___AnalyzePlanRequest.JsonToDDL | None = ..., ) -> None: ... 
def HasField( self, @@ -588,6 +608,8 @@ class AnalyzePlanRequest(google.protobuf.message.Message): b"is_local", "is_streaming", b"is_streaming", + "json_to_ddl", + b"json_to_ddl", "persist", b"persist", "same_semantics", @@ -631,6 +653,8 @@ class AnalyzePlanRequest(google.protobuf.message.Message): b"is_local", "is_streaming", b"is_streaming", + "json_to_ddl", + b"json_to_ddl", "persist", b"persist", "same_semantics", @@ -680,6 +704,7 @@ class AnalyzePlanRequest(google.protobuf.message.Message): "persist", "unpersist", "get_storage_level", + "json_to_ddl", ] | None ): ... @@ -877,6 +902,20 @@ class AnalyzePlanResponse(google.protobuf.message.Message): self, field_name: typing_extensions.Literal["storage_level", b"storage_level"] ) -> None: ... + class JsonToDDL(google.protobuf.message.Message): + DESCRIPTOR: google.protobuf.descriptor.Descriptor + + DDL_STRING_FIELD_NUMBER: builtins.int + ddl_string: builtins.str + def __init__( + self, + *, + ddl_string: builtins.str = ..., + ) -> None: ... + def ClearField( + self, field_name: typing_extensions.Literal["ddl_string", b"ddl_string"] + ) -> None: ... + SESSION_ID_FIELD_NUMBER: builtins.int SERVER_SIDE_SESSION_ID_FIELD_NUMBER: builtins.int SCHEMA_FIELD_NUMBER: builtins.int @@ -892,6 +931,7 @@ class AnalyzePlanResponse(google.protobuf.message.Message): PERSIST_FIELD_NUMBER: builtins.int UNPERSIST_FIELD_NUMBER: builtins.int GET_STORAGE_LEVEL_FIELD_NUMBER: builtins.int + JSON_TO_DDL_FIELD_NUMBER: builtins.int session_id: builtins.str server_side_session_id: builtins.str """Server-side generated idempotency key that the client can use to assert that the server side @@ -923,6 +963,8 @@ class AnalyzePlanResponse(google.protobuf.message.Message): def unpersist(self) -> global___AnalyzePlanResponse.Unpersist: ... @property def get_storage_level(self) -> global___AnalyzePlanResponse.GetStorageLevel: ... + @property + def json_to_ddl(self) -> global___AnalyzePlanResponse.JsonToDDL: ... 
def __init__( self, *, @@ -941,6 +983,7 @@ class AnalyzePlanResponse(google.protobuf.message.Message): persist: global___AnalyzePlanResponse.Persist | None = ..., unpersist: global___AnalyzePlanResponse.Unpersist | None = ..., get_storage_level: global___AnalyzePlanResponse.GetStorageLevel | None = ..., + json_to_ddl: global___AnalyzePlanResponse.JsonToDDL | None = ..., ) -> None: ... def HasField( self, @@ -957,6 +1000,8 @@ class AnalyzePlanResponse(google.protobuf.message.Message): b"is_local", "is_streaming", b"is_streaming", + "json_to_ddl", + b"json_to_ddl", "persist", b"persist", "result", @@ -990,6 +1035,8 @@ class AnalyzePlanResponse(google.protobuf.message.Message): b"is_local", "is_streaming", b"is_streaming", + "json_to_ddl", + b"json_to_ddl", "persist", b"persist", "result", @@ -1029,6 +1076,7 @@ class AnalyzePlanResponse(google.protobuf.message.Message): "persist", "unpersist", "get_storage_level", + "json_to_ddl", ] | None ): ... @@ -1534,6 +1582,7 @@ class ExecutePlanResponse(google.protobuf.message.Message): CREATE_RESOURCE_PROFILE_COMMAND_RESULT_FIELD_NUMBER: builtins.int EXECUTION_PROGRESS_FIELD_NUMBER: builtins.int CHECKPOINT_COMMAND_RESULT_FIELD_NUMBER: builtins.int + ML_COMMAND_RESULT_FIELD_NUMBER: builtins.int EXTENSION_FIELD_NUMBER: builtins.int METRICS_FIELD_NUMBER: builtins.int OBSERVED_METRICS_FIELD_NUMBER: builtins.int @@ -1598,6 +1647,9 @@ class ExecutePlanResponse(google.protobuf.message.Message): def checkpoint_command_result(self) -> global___CheckpointCommandResult: """Response for command that checkpoints a DataFrame.""" @property + def ml_command_result(self) -> pyspark.sql.connect.proto.ml_pb2.MlCommandResult: + """ML command response""" + @property def extension(self) -> google.protobuf.any_pb2.Any: """Support arbitrary result objects.""" @property @@ -1639,6 +1691,7 @@ class ExecutePlanResponse(google.protobuf.message.Message): | None = ..., execution_progress: global___ExecutePlanResponse.ExecutionProgress | None = ..., 
checkpoint_command_result: global___CheckpointCommandResult | None = ..., + ml_command_result: pyspark.sql.connect.proto.ml_pb2.MlCommandResult | None = ..., extension: google.protobuf.any_pb2.Any | None = ..., metrics: global___ExecutePlanResponse.Metrics | None = ..., observed_metrics: collections.abc.Iterable[global___ExecutePlanResponse.ObservedMetrics] @@ -1662,6 +1715,8 @@ class ExecutePlanResponse(google.protobuf.message.Message): b"get_resources_command_result", "metrics", b"metrics", + "ml_command_result", + b"ml_command_result", "response_type", b"response_type", "result_complete", @@ -1697,6 +1752,8 @@ class ExecutePlanResponse(google.protobuf.message.Message): b"get_resources_command_result", "metrics", b"metrics", + "ml_command_result", + b"ml_command_result", "observed_metrics", b"observed_metrics", "operation_id", @@ -1740,6 +1797,7 @@ class ExecutePlanResponse(google.protobuf.message.Message): "create_resource_profile_command_result", "execution_progress", "checkpoint_command_result", + "ml_command_result", "extension", ] | None @@ -1874,17 +1932,32 @@ class ConfigRequest(google.protobuf.message.Message): DESCRIPTOR: google.protobuf.descriptor.Descriptor PAIRS_FIELD_NUMBER: builtins.int + SILENT_FIELD_NUMBER: builtins.int @property def pairs( self, ) -> google.protobuf.internal.containers.RepeatedCompositeFieldContainer[global___KeyValue]: """(Required) The config key-value pairs to set.""" + silent: builtins.bool + """(Optional) Whether to ignore failures.""" def __init__( self, *, pairs: collections.abc.Iterable[global___KeyValue] | None = ..., + silent: builtins.bool | None = ..., ) -> None: ... - def ClearField(self, field_name: typing_extensions.Literal["pairs", b"pairs"]) -> None: ... + def HasField( + self, field_name: typing_extensions.Literal["_silent", b"_silent", "silent", b"silent"] + ) -> builtins.bool: ... 
+ def ClearField( + self, + field_name: typing_extensions.Literal[ + "_silent", b"_silent", "pairs", b"pairs", "silent", b"silent" + ], + ) -> None: ... + def WhichOneof( + self, oneof_group: typing_extensions.Literal["_silent", b"_silent"] + ) -> typing_extensions.Literal["silent"] | None: ... class Get(google.protobuf.message.Message): DESCRIPTOR: google.protobuf.descriptor.Descriptor @@ -3169,6 +3242,7 @@ class ReleaseSessionRequest(google.protobuf.message.Message): SESSION_ID_FIELD_NUMBER: builtins.int USER_CONTEXT_FIELD_NUMBER: builtins.int CLIENT_TYPE_FIELD_NUMBER: builtins.int + ALLOW_RECONNECT_FIELD_NUMBER: builtins.int session_id: builtins.str """(Required) @@ -3187,12 +3261,27 @@ class ReleaseSessionRequest(google.protobuf.message.Message): can be used for language or version specific information and is only intended for logging purposes and will not be interpreted by the server. """ + allow_reconnect: builtins.bool + """Signals the server to allow the client to reconnect to the session after it is released. + + By default, the server tombstones the session upon release, preventing reconnections and + fully cleaning the session state. + + If this flag is set to true, the server may permit the client to reconnect to the session + post-release, even if the session state has been cleaned. This can result in missing state, + such as Temporary Views, Temporary UDFs, or the Current Catalog, in the reconnected session. + + Use this option sparingly and only when the client fully understands the implications of + reconnecting to a released session. The client must ensure that any queries executed do not + rely on the session state prior to its release. + """ def __init__( self, *, session_id: builtins.str = ..., user_context: global___UserContext | None = ..., client_type: builtins.str | None = ..., + allow_reconnect: builtins.bool = ..., ) -> None: ... 
def HasField( self, @@ -3210,6 +3299,8 @@ class ReleaseSessionRequest(google.protobuf.message.Message): field_name: typing_extensions.Literal[ "_client_type", b"_client_type", + "allow_reconnect", + b"allow_reconnect", "client_type", b"client_type", "session_id", diff --git a/python/pyspark/sql/connect/proto/commands_pb2.py b/python/pyspark/sql/connect/proto/commands_pb2.py index a7fcc1d7e0908..57a770f0226d9 100644 --- a/python/pyspark/sql/connect/proto/commands_pb2.py +++ b/python/pyspark/sql/connect/proto/commands_pb2.py @@ -38,10 +38,11 @@ from pyspark.sql.connect.proto import common_pb2 as spark_dot_connect_dot_common__pb2 from pyspark.sql.connect.proto import expressions_pb2 as spark_dot_connect_dot_expressions__pb2 from pyspark.sql.connect.proto import relations_pb2 as spark_dot_connect_dot_relations__pb2 +from pyspark.sql.connect.proto import ml_pb2 as spark_dot_connect_dot_ml__pb2 DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile( - b'\n\x1cspark/connect/commands.proto\x12\rspark.connect\x1a\x19google/protobuf/any.proto\x1a\x1aspark/connect/common.proto\x1a\x1fspark/connect/expressions.proto\x1a\x1dspark/connect/relations.proto"\x90\r\n\x07\x43ommand\x12]\n\x11register_function\x18\x01 \x01(\x0b\x32..spark.connect.CommonInlineUserDefinedFunctionH\x00R\x10registerFunction\x12H\n\x0fwrite_operation\x18\x02 \x01(\x0b\x32\x1d.spark.connect.WriteOperationH\x00R\x0ewriteOperation\x12_\n\x15\x63reate_dataframe_view\x18\x03 \x01(\x0b\x32).spark.connect.CreateDataFrameViewCommandH\x00R\x13\x63reateDataframeView\x12O\n\x12write_operation_v2\x18\x04 \x01(\x0b\x32\x1f.spark.connect.WriteOperationV2H\x00R\x10writeOperationV2\x12<\n\x0bsql_command\x18\x05 \x01(\x0b\x32\x19.spark.connect.SqlCommandH\x00R\nsqlCommand\x12k\n\x1cwrite_stream_operation_start\x18\x06 \x01(\x0b\x32(.spark.connect.WriteStreamOperationStartH\x00R\x19writeStreamOperationStart\x12^\n\x17streaming_query_command\x18\x07 
\x01(\x0b\x32$.spark.connect.StreamingQueryCommandH\x00R\x15streamingQueryCommand\x12X\n\x15get_resources_command\x18\x08 \x01(\x0b\x32".spark.connect.GetResourcesCommandH\x00R\x13getResourcesCommand\x12t\n\x1fstreaming_query_manager_command\x18\t \x01(\x0b\x32+.spark.connect.StreamingQueryManagerCommandH\x00R\x1cstreamingQueryManagerCommand\x12m\n\x17register_table_function\x18\n \x01(\x0b\x32\x33.spark.connect.CommonInlineUserDefinedTableFunctionH\x00R\x15registerTableFunction\x12\x81\x01\n$streaming_query_listener_bus_command\x18\x0b \x01(\x0b\x32/.spark.connect.StreamingQueryListenerBusCommandH\x00R streamingQueryListenerBusCommand\x12\x64\n\x14register_data_source\x18\x0c \x01(\x0b\x32\x30.spark.connect.CommonInlineUserDefinedDataSourceH\x00R\x12registerDataSource\x12t\n\x1f\x63reate_resource_profile_command\x18\r \x01(\x0b\x32+.spark.connect.CreateResourceProfileCommandH\x00R\x1c\x63reateResourceProfileCommand\x12Q\n\x12\x63heckpoint_command\x18\x0e \x01(\x0b\x32 .spark.connect.CheckpointCommandH\x00R\x11\x63heckpointCommand\x12\x84\x01\n%remove_cached_remote_relation_command\x18\x0f \x01(\x0b\x32\x30.spark.connect.RemoveCachedRemoteRelationCommandH\x00R!removeCachedRemoteRelationCommand\x12_\n\x18merge_into_table_command\x18\x10 \x01(\x0b\x32$.spark.connect.MergeIntoTableCommandH\x00R\x15mergeIntoTableCommand\x12\x35\n\textension\x18\xe7\x07 \x01(\x0b\x32\x14.google.protobuf.AnyH\x00R\textensionB\x0e\n\x0c\x63ommand_type"\xaa\x04\n\nSqlCommand\x12\x14\n\x03sql\x18\x01 \x01(\tB\x02\x18\x01R\x03sql\x12;\n\x04\x61rgs\x18\x02 \x03(\x0b\x32#.spark.connect.SqlCommand.ArgsEntryB\x02\x18\x01R\x04\x61rgs\x12@\n\x08pos_args\x18\x03 \x03(\x0b\x32!.spark.connect.Expression.LiteralB\x02\x18\x01R\x07posArgs\x12Z\n\x0fnamed_arguments\x18\x04 \x03(\x0b\x32-.spark.connect.SqlCommand.NamedArgumentsEntryB\x02\x18\x01R\x0enamedArguments\x12\x42\n\rpos_arguments\x18\x05 \x03(\x0b\x32\x19.spark.connect.ExpressionB\x02\x18\x01R\x0cposArguments\x12-\n\x05input\x18\x06 
\x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x1aZ\n\tArgsEntry\x12\x10\n\x03key\x18\x01 \x01(\tR\x03key\x12\x37\n\x05value\x18\x02 \x01(\x0b\x32!.spark.connect.Expression.LiteralR\x05value:\x02\x38\x01\x1a\\\n\x13NamedArgumentsEntry\x12\x10\n\x03key\x18\x01 \x01(\tR\x03key\x12/\n\x05value\x18\x02 \x01(\x0b\x32\x19.spark.connect.ExpressionR\x05value:\x02\x38\x01"\x96\x01\n\x1a\x43reateDataFrameViewCommand\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12\x12\n\x04name\x18\x02 \x01(\tR\x04name\x12\x1b\n\tis_global\x18\x03 \x01(\x08R\x08isGlobal\x12\x18\n\x07replace\x18\x04 \x01(\x08R\x07replace"\xca\x08\n\x0eWriteOperation\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12\x1b\n\x06source\x18\x02 \x01(\tH\x01R\x06source\x88\x01\x01\x12\x14\n\x04path\x18\x03 \x01(\tH\x00R\x04path\x12?\n\x05table\x18\x04 \x01(\x0b\x32\'.spark.connect.WriteOperation.SaveTableH\x00R\x05table\x12:\n\x04mode\x18\x05 \x01(\x0e\x32&.spark.connect.WriteOperation.SaveModeR\x04mode\x12*\n\x11sort_column_names\x18\x06 \x03(\tR\x0fsortColumnNames\x12\x31\n\x14partitioning_columns\x18\x07 \x03(\tR\x13partitioningColumns\x12\x43\n\tbucket_by\x18\x08 \x01(\x0b\x32&.spark.connect.WriteOperation.BucketByR\x08\x62ucketBy\x12\x44\n\x07options\x18\t \x03(\x0b\x32*.spark.connect.WriteOperation.OptionsEntryR\x07options\x12-\n\x12\x63lustering_columns\x18\n \x03(\tR\x11\x63lusteringColumns\x1a:\n\x0cOptionsEntry\x12\x10\n\x03key\x18\x01 \x01(\tR\x03key\x12\x14\n\x05value\x18\x02 \x01(\tR\x05value:\x02\x38\x01\x1a\x82\x02\n\tSaveTable\x12\x1d\n\ntable_name\x18\x01 \x01(\tR\ttableName\x12X\n\x0bsave_method\x18\x02 \x01(\x0e\x32\x37.spark.connect.WriteOperation.SaveTable.TableSaveMethodR\nsaveMethod"|\n\x0fTableSaveMethod\x12!\n\x1dTABLE_SAVE_METHOD_UNSPECIFIED\x10\x00\x12#\n\x1fTABLE_SAVE_METHOD_SAVE_AS_TABLE\x10\x01\x12!\n\x1dTABLE_SAVE_METHOD_INSERT_INTO\x10\x02\x1a[\n\x08\x42ucketBy\x12.\n\x13\x62ucket_column_names\x18\x01 
\x03(\tR\x11\x62ucketColumnNames\x12\x1f\n\x0bnum_buckets\x18\x02 \x01(\x05R\nnumBuckets"\x89\x01\n\x08SaveMode\x12\x19\n\x15SAVE_MODE_UNSPECIFIED\x10\x00\x12\x14\n\x10SAVE_MODE_APPEND\x10\x01\x12\x17\n\x13SAVE_MODE_OVERWRITE\x10\x02\x12\x1d\n\x19SAVE_MODE_ERROR_IF_EXISTS\x10\x03\x12\x14\n\x10SAVE_MODE_IGNORE\x10\x04\x42\x0b\n\tsave_typeB\t\n\x07_source"\xdc\x06\n\x10WriteOperationV2\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12\x1d\n\ntable_name\x18\x02 \x01(\tR\ttableName\x12\x1f\n\x08provider\x18\x03 \x01(\tH\x00R\x08provider\x88\x01\x01\x12L\n\x14partitioning_columns\x18\x04 \x03(\x0b\x32\x19.spark.connect.ExpressionR\x13partitioningColumns\x12\x46\n\x07options\x18\x05 \x03(\x0b\x32,.spark.connect.WriteOperationV2.OptionsEntryR\x07options\x12_\n\x10table_properties\x18\x06 \x03(\x0b\x32\x34.spark.connect.WriteOperationV2.TablePropertiesEntryR\x0ftableProperties\x12\x38\n\x04mode\x18\x07 \x01(\x0e\x32$.spark.connect.WriteOperationV2.ModeR\x04mode\x12J\n\x13overwrite_condition\x18\x08 \x01(\x0b\x32\x19.spark.connect.ExpressionR\x12overwriteCondition\x12-\n\x12\x63lustering_columns\x18\t \x03(\tR\x11\x63lusteringColumns\x1a:\n\x0cOptionsEntry\x12\x10\n\x03key\x18\x01 \x01(\tR\x03key\x12\x14\n\x05value\x18\x02 \x01(\tR\x05value:\x02\x38\x01\x1a\x42\n\x14TablePropertiesEntry\x12\x10\n\x03key\x18\x01 \x01(\tR\x03key\x12\x14\n\x05value\x18\x02 \x01(\tR\x05value:\x02\x38\x01"\x9f\x01\n\x04Mode\x12\x14\n\x10MODE_UNSPECIFIED\x10\x00\x12\x0f\n\x0bMODE_CREATE\x10\x01\x12\x12\n\x0eMODE_OVERWRITE\x10\x02\x12\x1d\n\x19MODE_OVERWRITE_PARTITIONS\x10\x03\x12\x0f\n\x0bMODE_APPEND\x10\x04\x12\x10\n\x0cMODE_REPLACE\x10\x05\x12\x1a\n\x16MODE_CREATE_OR_REPLACE\x10\x06\x42\x0b\n\t_provider"\xd8\x06\n\x19WriteStreamOperationStart\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12\x16\n\x06\x66ormat\x18\x02 \x01(\tR\x06\x66ormat\x12O\n\x07options\x18\x03 
\x03(\x0b\x32\x35.spark.connect.WriteStreamOperationStart.OptionsEntryR\x07options\x12:\n\x19partitioning_column_names\x18\x04 \x03(\tR\x17partitioningColumnNames\x12:\n\x18processing_time_interval\x18\x05 \x01(\tH\x00R\x16processingTimeInterval\x12%\n\ravailable_now\x18\x06 \x01(\x08H\x00R\x0c\x61vailableNow\x12\x14\n\x04once\x18\x07 \x01(\x08H\x00R\x04once\x12\x46\n\x1e\x63ontinuous_checkpoint_interval\x18\x08 \x01(\tH\x00R\x1c\x63ontinuousCheckpointInterval\x12\x1f\n\x0boutput_mode\x18\t \x01(\tR\noutputMode\x12\x1d\n\nquery_name\x18\n \x01(\tR\tqueryName\x12\x14\n\x04path\x18\x0b \x01(\tH\x01R\x04path\x12\x1f\n\ntable_name\x18\x0c \x01(\tH\x01R\ttableName\x12N\n\x0e\x66oreach_writer\x18\r \x01(\x0b\x32\'.spark.connect.StreamingForeachFunctionR\rforeachWriter\x12L\n\rforeach_batch\x18\x0e \x01(\x0b\x32\'.spark.connect.StreamingForeachFunctionR\x0c\x66oreachBatch\x12\x36\n\x17\x63lustering_column_names\x18\x0f \x03(\tR\x15\x63lusteringColumnNames\x1a:\n\x0cOptionsEntry\x12\x10\n\x03key\x18\x01 \x01(\tR\x03key\x12\x14\n\x05value\x18\x02 \x01(\tR\x05value:\x02\x38\x01\x42\t\n\x07triggerB\x12\n\x10sink_destination"\xb3\x01\n\x18StreamingForeachFunction\x12\x43\n\x0fpython_function\x18\x01 \x01(\x0b\x32\x18.spark.connect.PythonUDFH\x00R\x0epythonFunction\x12\x46\n\x0escala_function\x18\x02 \x01(\x0b\x32\x1d.spark.connect.ScalarScalaUDFH\x00R\rscalaFunctionB\n\n\x08\x66unction"\xd4\x01\n\x1fWriteStreamOperationStartResult\x12\x42\n\x08query_id\x18\x01 \x01(\x0b\x32\'.spark.connect.StreamingQueryInstanceIdR\x07queryId\x12\x12\n\x04name\x18\x02 \x01(\tR\x04name\x12<\n\x18query_started_event_json\x18\x03 \x01(\tH\x00R\x15queryStartedEventJson\x88\x01\x01\x42\x1b\n\x19_query_started_event_json"A\n\x18StreamingQueryInstanceId\x12\x0e\n\x02id\x18\x01 \x01(\tR\x02id\x12\x15\n\x06run_id\x18\x02 \x01(\tR\x05runId"\xf8\x04\n\x15StreamingQueryCommand\x12\x42\n\x08query_id\x18\x01 \x01(\x0b\x32\'.spark.connect.StreamingQueryInstanceIdR\x07queryId\x12\x18\n\x06status\x18\x02 
\x01(\x08H\x00R\x06status\x12%\n\rlast_progress\x18\x03 \x01(\x08H\x00R\x0clastProgress\x12)\n\x0frecent_progress\x18\x04 \x01(\x08H\x00R\x0erecentProgress\x12\x14\n\x04stop\x18\x05 \x01(\x08H\x00R\x04stop\x12\x34\n\x15process_all_available\x18\x06 \x01(\x08H\x00R\x13processAllAvailable\x12O\n\x07\x65xplain\x18\x07 \x01(\x0b\x32\x33.spark.connect.StreamingQueryCommand.ExplainCommandH\x00R\x07\x65xplain\x12\x1e\n\texception\x18\x08 \x01(\x08H\x00R\texception\x12k\n\x11\x61wait_termination\x18\t \x01(\x0b\x32<.spark.connect.StreamingQueryCommand.AwaitTerminationCommandH\x00R\x10\x61waitTermination\x1a,\n\x0e\x45xplainCommand\x12\x1a\n\x08\x65xtended\x18\x01 \x01(\x08R\x08\x65xtended\x1aL\n\x17\x41waitTerminationCommand\x12"\n\ntimeout_ms\x18\x02 \x01(\x03H\x00R\ttimeoutMs\x88\x01\x01\x42\r\n\x0b_timeout_msB\t\n\x07\x63ommand"\xf5\x08\n\x1bStreamingQueryCommandResult\x12\x42\n\x08query_id\x18\x01 \x01(\x0b\x32\'.spark.connect.StreamingQueryInstanceIdR\x07queryId\x12Q\n\x06status\x18\x02 \x01(\x0b\x32\x37.spark.connect.StreamingQueryCommandResult.StatusResultH\x00R\x06status\x12j\n\x0frecent_progress\x18\x03 \x01(\x0b\x32?.spark.connect.StreamingQueryCommandResult.RecentProgressResultH\x00R\x0erecentProgress\x12T\n\x07\x65xplain\x18\x04 \x01(\x0b\x32\x38.spark.connect.StreamingQueryCommandResult.ExplainResultH\x00R\x07\x65xplain\x12Z\n\texception\x18\x05 \x01(\x0b\x32:.spark.connect.StreamingQueryCommandResult.ExceptionResultH\x00R\texception\x12p\n\x11\x61wait_termination\x18\x06 \x01(\x0b\x32\x41.spark.connect.StreamingQueryCommandResult.AwaitTerminationResultH\x00R\x10\x61waitTermination\x1a\xaa\x01\n\x0cStatusResult\x12%\n\x0estatus_message\x18\x01 \x01(\tR\rstatusMessage\x12*\n\x11is_data_available\x18\x02 \x01(\x08R\x0fisDataAvailable\x12*\n\x11is_trigger_active\x18\x03 \x01(\x08R\x0fisTriggerActive\x12\x1b\n\tis_active\x18\x04 \x01(\x08R\x08isActive\x1aH\n\x14RecentProgressResult\x12\x30\n\x14recent_progress_json\x18\x05 
\x03(\tR\x12recentProgressJson\x1a\'\n\rExplainResult\x12\x16\n\x06result\x18\x01 \x01(\tR\x06result\x1a\xc5\x01\n\x0f\x45xceptionResult\x12\x30\n\x11\x65xception_message\x18\x01 \x01(\tH\x00R\x10\x65xceptionMessage\x88\x01\x01\x12$\n\x0b\x65rror_class\x18\x02 \x01(\tH\x01R\nerrorClass\x88\x01\x01\x12$\n\x0bstack_trace\x18\x03 \x01(\tH\x02R\nstackTrace\x88\x01\x01\x42\x14\n\x12_exception_messageB\x0e\n\x0c_error_classB\x0e\n\x0c_stack_trace\x1a\x38\n\x16\x41waitTerminationResult\x12\x1e\n\nterminated\x18\x01 \x01(\x08R\nterminatedB\r\n\x0bresult_type"\xbd\x06\n\x1cStreamingQueryManagerCommand\x12\x18\n\x06\x61\x63tive\x18\x01 \x01(\x08H\x00R\x06\x61\x63tive\x12\x1d\n\tget_query\x18\x02 \x01(\tH\x00R\x08getQuery\x12|\n\x15\x61wait_any_termination\x18\x03 \x01(\x0b\x32\x46.spark.connect.StreamingQueryManagerCommand.AwaitAnyTerminationCommandH\x00R\x13\x61waitAnyTermination\x12+\n\x10reset_terminated\x18\x04 \x01(\x08H\x00R\x0fresetTerminated\x12n\n\x0c\x61\x64\x64_listener\x18\x05 \x01(\x0b\x32I.spark.connect.StreamingQueryManagerCommand.StreamingQueryListenerCommandH\x00R\x0b\x61\x64\x64Listener\x12t\n\x0fremove_listener\x18\x06 \x01(\x0b\x32I.spark.connect.StreamingQueryManagerCommand.StreamingQueryListenerCommandH\x00R\x0eremoveListener\x12\'\n\x0elist_listeners\x18\x07 \x01(\x08H\x00R\rlistListeners\x1aO\n\x1a\x41waitAnyTerminationCommand\x12"\n\ntimeout_ms\x18\x01 \x01(\x03H\x00R\ttimeoutMs\x88\x01\x01\x42\r\n\x0b_timeout_ms\x1a\xcd\x01\n\x1dStreamingQueryListenerCommand\x12)\n\x10listener_payload\x18\x01 \x01(\x0cR\x0flistenerPayload\x12U\n\x17python_listener_payload\x18\x02 \x01(\x0b\x32\x18.spark.connect.PythonUDFH\x00R\x15pythonListenerPayload\x88\x01\x01\x12\x0e\n\x02id\x18\x03 \x01(\tR\x02idB\x1a\n\x18_python_listener_payloadB\t\n\x07\x63ommand"\xb4\x08\n"StreamingQueryManagerCommandResult\x12X\n\x06\x61\x63tive\x18\x01 \x01(\x0b\x32>.spark.connect.StreamingQueryManagerCommandResult.ActiveResultH\x00R\x06\x61\x63tive\x12`\n\x05query\x18\x02 
\x01(\x0b\x32H.spark.connect.StreamingQueryManagerCommandResult.StreamingQueryInstanceH\x00R\x05query\x12\x81\x01\n\x15\x61wait_any_termination\x18\x03 \x01(\x0b\x32K.spark.connect.StreamingQueryManagerCommandResult.AwaitAnyTerminationResultH\x00R\x13\x61waitAnyTermination\x12+\n\x10reset_terminated\x18\x04 \x01(\x08H\x00R\x0fresetTerminated\x12#\n\x0c\x61\x64\x64_listener\x18\x05 \x01(\x08H\x00R\x0b\x61\x64\x64Listener\x12)\n\x0fremove_listener\x18\x06 \x01(\x08H\x00R\x0eremoveListener\x12{\n\x0elist_listeners\x18\x07 \x01(\x0b\x32R.spark.connect.StreamingQueryManagerCommandResult.ListStreamingQueryListenerResultH\x00R\rlistListeners\x1a\x7f\n\x0c\x41\x63tiveResult\x12o\n\x0e\x61\x63tive_queries\x18\x01 \x03(\x0b\x32H.spark.connect.StreamingQueryManagerCommandResult.StreamingQueryInstanceR\ractiveQueries\x1as\n\x16StreamingQueryInstance\x12\x37\n\x02id\x18\x01 \x01(\x0b\x32\'.spark.connect.StreamingQueryInstanceIdR\x02id\x12\x17\n\x04name\x18\x02 \x01(\tH\x00R\x04name\x88\x01\x01\x42\x07\n\x05_name\x1a;\n\x19\x41waitAnyTerminationResult\x12\x1e\n\nterminated\x18\x01 \x01(\x08R\nterminated\x1aK\n\x1eStreamingQueryListenerInstance\x12)\n\x10listener_payload\x18\x01 \x01(\x0cR\x0flistenerPayload\x1a\x45\n ListStreamingQueryListenerResult\x12!\n\x0clistener_ids\x18\x01 \x03(\tR\x0blistenerIdsB\r\n\x0bresult_type"\xad\x01\n StreamingQueryListenerBusCommand\x12;\n\x19\x61\x64\x64_listener_bus_listener\x18\x01 \x01(\x08H\x00R\x16\x61\x64\x64ListenerBusListener\x12\x41\n\x1cremove_listener_bus_listener\x18\x02 \x01(\x08H\x00R\x19removeListenerBusListenerB\t\n\x07\x63ommand"\x83\x01\n\x1bStreamingQueryListenerEvent\x12\x1d\n\nevent_json\x18\x01 \x01(\tR\teventJson\x12\x45\n\nevent_type\x18\x02 \x01(\x0e\x32&.spark.connect.StreamingQueryEventTypeR\teventType"\xcc\x01\n"StreamingQueryListenerEventsResult\x12\x42\n\x06\x65vents\x18\x01 \x03(\x0b\x32*.spark.connect.StreamingQueryListenerEventR\x06\x65vents\x12\x42\n\x1blistener_bus_listener_added\x18\x02 
\x01(\x08H\x00R\x18listenerBusListenerAdded\x88\x01\x01\x42\x1e\n\x1c_listener_bus_listener_added"\x15\n\x13GetResourcesCommand"\xd4\x01\n\x19GetResourcesCommandResult\x12U\n\tresources\x18\x01 \x03(\x0b\x32\x37.spark.connect.GetResourcesCommandResult.ResourcesEntryR\tresources\x1a`\n\x0eResourcesEntry\x12\x10\n\x03key\x18\x01 \x01(\tR\x03key\x12\x38\n\x05value\x18\x02 \x01(\x0b\x32".spark.connect.ResourceInformationR\x05value:\x02\x38\x01"X\n\x1c\x43reateResourceProfileCommand\x12\x38\n\x07profile\x18\x01 \x01(\x0b\x32\x1e.spark.connect.ResourceProfileR\x07profile"C\n"CreateResourceProfileCommandResult\x12\x1d\n\nprofile_id\x18\x01 \x01(\x05R\tprofileId"d\n!RemoveCachedRemoteRelationCommand\x12?\n\x08relation\x18\x01 \x01(\x0b\x32#.spark.connect.CachedRemoteRelationR\x08relation"\xcd\x01\n\x11\x43heckpointCommand\x12\x33\n\x08relation\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x08relation\x12\x14\n\x05local\x18\x02 \x01(\x08R\x05local\x12\x14\n\x05\x65\x61ger\x18\x03 \x01(\x08R\x05\x65\x61ger\x12\x45\n\rstorage_level\x18\x04 \x01(\x0b\x32\x1b.spark.connect.StorageLevelH\x00R\x0cstorageLevel\x88\x01\x01\x42\x10\n\x0e_storage_level"\xe8\x03\n\x15MergeIntoTableCommand\x12*\n\x11target_table_name\x18\x01 \x01(\tR\x0ftargetTableName\x12\x43\n\x11source_table_plan\x18\x02 \x01(\x0b\x32\x17.spark.connect.RelationR\x0fsourceTablePlan\x12\x42\n\x0fmerge_condition\x18\x03 \x01(\x0b\x32\x19.spark.connect.ExpressionR\x0emergeCondition\x12>\n\rmatch_actions\x18\x04 \x03(\x0b\x32\x19.spark.connect.ExpressionR\x0cmatchActions\x12I\n\x13not_matched_actions\x18\x05 \x03(\x0b\x32\x19.spark.connect.ExpressionR\x11notMatchedActions\x12[\n\x1dnot_matched_by_source_actions\x18\x06 \x03(\x0b\x32\x19.spark.connect.ExpressionR\x19notMatchedBySourceActions\x12\x32\n\x15with_schema_evolution\x18\x07 
\x01(\x08R\x13withSchemaEvolution*\x85\x01\n\x17StreamingQueryEventType\x12\x1e\n\x1aQUERY_PROGRESS_UNSPECIFIED\x10\x00\x12\x18\n\x14QUERY_PROGRESS_EVENT\x10\x01\x12\x1a\n\x16QUERY_TERMINATED_EVENT\x10\x02\x12\x14\n\x10QUERY_IDLE_EVENT\x10\x03\x42\x36\n\x1eorg.apache.spark.connect.protoP\x01Z\x12internal/generatedb\x06proto3' + b'\n\x1cspark/connect/commands.proto\x12\rspark.connect\x1a\x19google/protobuf/any.proto\x1a\x1aspark/connect/common.proto\x1a\x1fspark/connect/expressions.proto\x1a\x1dspark/connect/relations.proto\x1a\x16spark/connect/ml.proto"\xcb\r\n\x07\x43ommand\x12]\n\x11register_function\x18\x01 \x01(\x0b\x32..spark.connect.CommonInlineUserDefinedFunctionH\x00R\x10registerFunction\x12H\n\x0fwrite_operation\x18\x02 \x01(\x0b\x32\x1d.spark.connect.WriteOperationH\x00R\x0ewriteOperation\x12_\n\x15\x63reate_dataframe_view\x18\x03 \x01(\x0b\x32).spark.connect.CreateDataFrameViewCommandH\x00R\x13\x63reateDataframeView\x12O\n\x12write_operation_v2\x18\x04 \x01(\x0b\x32\x1f.spark.connect.WriteOperationV2H\x00R\x10writeOperationV2\x12<\n\x0bsql_command\x18\x05 \x01(\x0b\x32\x19.spark.connect.SqlCommandH\x00R\nsqlCommand\x12k\n\x1cwrite_stream_operation_start\x18\x06 \x01(\x0b\x32(.spark.connect.WriteStreamOperationStartH\x00R\x19writeStreamOperationStart\x12^\n\x17streaming_query_command\x18\x07 \x01(\x0b\x32$.spark.connect.StreamingQueryCommandH\x00R\x15streamingQueryCommand\x12X\n\x15get_resources_command\x18\x08 \x01(\x0b\x32".spark.connect.GetResourcesCommandH\x00R\x13getResourcesCommand\x12t\n\x1fstreaming_query_manager_command\x18\t \x01(\x0b\x32+.spark.connect.StreamingQueryManagerCommandH\x00R\x1cstreamingQueryManagerCommand\x12m\n\x17register_table_function\x18\n \x01(\x0b\x32\x33.spark.connect.CommonInlineUserDefinedTableFunctionH\x00R\x15registerTableFunction\x12\x81\x01\n$streaming_query_listener_bus_command\x18\x0b \x01(\x0b\x32/.spark.connect.StreamingQueryListenerBusCommandH\x00R 
streamingQueryListenerBusCommand\x12\x64\n\x14register_data_source\x18\x0c \x01(\x0b\x32\x30.spark.connect.CommonInlineUserDefinedDataSourceH\x00R\x12registerDataSource\x12t\n\x1f\x63reate_resource_profile_command\x18\r \x01(\x0b\x32+.spark.connect.CreateResourceProfileCommandH\x00R\x1c\x63reateResourceProfileCommand\x12Q\n\x12\x63heckpoint_command\x18\x0e \x01(\x0b\x32 .spark.connect.CheckpointCommandH\x00R\x11\x63heckpointCommand\x12\x84\x01\n%remove_cached_remote_relation_command\x18\x0f \x01(\x0b\x32\x30.spark.connect.RemoveCachedRemoteRelationCommandH\x00R!removeCachedRemoteRelationCommand\x12_\n\x18merge_into_table_command\x18\x10 \x01(\x0b\x32$.spark.connect.MergeIntoTableCommandH\x00R\x15mergeIntoTableCommand\x12\x39\n\nml_command\x18\x11 \x01(\x0b\x32\x18.spark.connect.MlCommandH\x00R\tmlCommand\x12\x35\n\textension\x18\xe7\x07 \x01(\x0b\x32\x14.google.protobuf.AnyH\x00R\textensionB\x0e\n\x0c\x63ommand_type"\xaa\x04\n\nSqlCommand\x12\x14\n\x03sql\x18\x01 \x01(\tB\x02\x18\x01R\x03sql\x12;\n\x04\x61rgs\x18\x02 \x03(\x0b\x32#.spark.connect.SqlCommand.ArgsEntryB\x02\x18\x01R\x04\x61rgs\x12@\n\x08pos_args\x18\x03 \x03(\x0b\x32!.spark.connect.Expression.LiteralB\x02\x18\x01R\x07posArgs\x12Z\n\x0fnamed_arguments\x18\x04 \x03(\x0b\x32-.spark.connect.SqlCommand.NamedArgumentsEntryB\x02\x18\x01R\x0enamedArguments\x12\x42\n\rpos_arguments\x18\x05 \x03(\x0b\x32\x19.spark.connect.ExpressionB\x02\x18\x01R\x0cposArguments\x12-\n\x05input\x18\x06 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x1aZ\n\tArgsEntry\x12\x10\n\x03key\x18\x01 \x01(\tR\x03key\x12\x37\n\x05value\x18\x02 \x01(\x0b\x32!.spark.connect.Expression.LiteralR\x05value:\x02\x38\x01\x1a\\\n\x13NamedArgumentsEntry\x12\x10\n\x03key\x18\x01 \x01(\tR\x03key\x12/\n\x05value\x18\x02 \x01(\x0b\x32\x19.spark.connect.ExpressionR\x05value:\x02\x38\x01"\x96\x01\n\x1a\x43reateDataFrameViewCommand\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12\x12\n\x04name\x18\x02 
\x01(\tR\x04name\x12\x1b\n\tis_global\x18\x03 \x01(\x08R\x08isGlobal\x12\x18\n\x07replace\x18\x04 \x01(\x08R\x07replace"\xca\x08\n\x0eWriteOperation\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12\x1b\n\x06source\x18\x02 \x01(\tH\x01R\x06source\x88\x01\x01\x12\x14\n\x04path\x18\x03 \x01(\tH\x00R\x04path\x12?\n\x05table\x18\x04 \x01(\x0b\x32\'.spark.connect.WriteOperation.SaveTableH\x00R\x05table\x12:\n\x04mode\x18\x05 \x01(\x0e\x32&.spark.connect.WriteOperation.SaveModeR\x04mode\x12*\n\x11sort_column_names\x18\x06 \x03(\tR\x0fsortColumnNames\x12\x31\n\x14partitioning_columns\x18\x07 \x03(\tR\x13partitioningColumns\x12\x43\n\tbucket_by\x18\x08 \x01(\x0b\x32&.spark.connect.WriteOperation.BucketByR\x08\x62ucketBy\x12\x44\n\x07options\x18\t \x03(\x0b\x32*.spark.connect.WriteOperation.OptionsEntryR\x07options\x12-\n\x12\x63lustering_columns\x18\n \x03(\tR\x11\x63lusteringColumns\x1a:\n\x0cOptionsEntry\x12\x10\n\x03key\x18\x01 \x01(\tR\x03key\x12\x14\n\x05value\x18\x02 \x01(\tR\x05value:\x02\x38\x01\x1a\x82\x02\n\tSaveTable\x12\x1d\n\ntable_name\x18\x01 \x01(\tR\ttableName\x12X\n\x0bsave_method\x18\x02 \x01(\x0e\x32\x37.spark.connect.WriteOperation.SaveTable.TableSaveMethodR\nsaveMethod"|\n\x0fTableSaveMethod\x12!\n\x1dTABLE_SAVE_METHOD_UNSPECIFIED\x10\x00\x12#\n\x1fTABLE_SAVE_METHOD_SAVE_AS_TABLE\x10\x01\x12!\n\x1dTABLE_SAVE_METHOD_INSERT_INTO\x10\x02\x1a[\n\x08\x42ucketBy\x12.\n\x13\x62ucket_column_names\x18\x01 \x03(\tR\x11\x62ucketColumnNames\x12\x1f\n\x0bnum_buckets\x18\x02 \x01(\x05R\nnumBuckets"\x89\x01\n\x08SaveMode\x12\x19\n\x15SAVE_MODE_UNSPECIFIED\x10\x00\x12\x14\n\x10SAVE_MODE_APPEND\x10\x01\x12\x17\n\x13SAVE_MODE_OVERWRITE\x10\x02\x12\x1d\n\x19SAVE_MODE_ERROR_IF_EXISTS\x10\x03\x12\x14\n\x10SAVE_MODE_IGNORE\x10\x04\x42\x0b\n\tsave_typeB\t\n\x07_source"\xdc\x06\n\x10WriteOperationV2\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12\x1d\n\ntable_name\x18\x02 
\x01(\tR\ttableName\x12\x1f\n\x08provider\x18\x03 \x01(\tH\x00R\x08provider\x88\x01\x01\x12L\n\x14partitioning_columns\x18\x04 \x03(\x0b\x32\x19.spark.connect.ExpressionR\x13partitioningColumns\x12\x46\n\x07options\x18\x05 \x03(\x0b\x32,.spark.connect.WriteOperationV2.OptionsEntryR\x07options\x12_\n\x10table_properties\x18\x06 \x03(\x0b\x32\x34.spark.connect.WriteOperationV2.TablePropertiesEntryR\x0ftableProperties\x12\x38\n\x04mode\x18\x07 \x01(\x0e\x32$.spark.connect.WriteOperationV2.ModeR\x04mode\x12J\n\x13overwrite_condition\x18\x08 \x01(\x0b\x32\x19.spark.connect.ExpressionR\x12overwriteCondition\x12-\n\x12\x63lustering_columns\x18\t \x03(\tR\x11\x63lusteringColumns\x1a:\n\x0cOptionsEntry\x12\x10\n\x03key\x18\x01 \x01(\tR\x03key\x12\x14\n\x05value\x18\x02 \x01(\tR\x05value:\x02\x38\x01\x1a\x42\n\x14TablePropertiesEntry\x12\x10\n\x03key\x18\x01 \x01(\tR\x03key\x12\x14\n\x05value\x18\x02 \x01(\tR\x05value:\x02\x38\x01"\x9f\x01\n\x04Mode\x12\x14\n\x10MODE_UNSPECIFIED\x10\x00\x12\x0f\n\x0bMODE_CREATE\x10\x01\x12\x12\n\x0eMODE_OVERWRITE\x10\x02\x12\x1d\n\x19MODE_OVERWRITE_PARTITIONS\x10\x03\x12\x0f\n\x0bMODE_APPEND\x10\x04\x12\x10\n\x0cMODE_REPLACE\x10\x05\x12\x1a\n\x16MODE_CREATE_OR_REPLACE\x10\x06\x42\x0b\n\t_provider"\xd8\x06\n\x19WriteStreamOperationStart\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12\x16\n\x06\x66ormat\x18\x02 \x01(\tR\x06\x66ormat\x12O\n\x07options\x18\x03 \x03(\x0b\x32\x35.spark.connect.WriteStreamOperationStart.OptionsEntryR\x07options\x12:\n\x19partitioning_column_names\x18\x04 \x03(\tR\x17partitioningColumnNames\x12:\n\x18processing_time_interval\x18\x05 \x01(\tH\x00R\x16processingTimeInterval\x12%\n\ravailable_now\x18\x06 \x01(\x08H\x00R\x0c\x61vailableNow\x12\x14\n\x04once\x18\x07 \x01(\x08H\x00R\x04once\x12\x46\n\x1e\x63ontinuous_checkpoint_interval\x18\x08 \x01(\tH\x00R\x1c\x63ontinuousCheckpointInterval\x12\x1f\n\x0boutput_mode\x18\t \x01(\tR\noutputMode\x12\x1d\n\nquery_name\x18\n 
\x01(\tR\tqueryName\x12\x14\n\x04path\x18\x0b \x01(\tH\x01R\x04path\x12\x1f\n\ntable_name\x18\x0c \x01(\tH\x01R\ttableName\x12N\n\x0e\x66oreach_writer\x18\r \x01(\x0b\x32\'.spark.connect.StreamingForeachFunctionR\rforeachWriter\x12L\n\rforeach_batch\x18\x0e \x01(\x0b\x32\'.spark.connect.StreamingForeachFunctionR\x0c\x66oreachBatch\x12\x36\n\x17\x63lustering_column_names\x18\x0f \x03(\tR\x15\x63lusteringColumnNames\x1a:\n\x0cOptionsEntry\x12\x10\n\x03key\x18\x01 \x01(\tR\x03key\x12\x14\n\x05value\x18\x02 \x01(\tR\x05value:\x02\x38\x01\x42\t\n\x07triggerB\x12\n\x10sink_destination"\xb3\x01\n\x18StreamingForeachFunction\x12\x43\n\x0fpython_function\x18\x01 \x01(\x0b\x32\x18.spark.connect.PythonUDFH\x00R\x0epythonFunction\x12\x46\n\x0escala_function\x18\x02 \x01(\x0b\x32\x1d.spark.connect.ScalarScalaUDFH\x00R\rscalaFunctionB\n\n\x08\x66unction"\xd4\x01\n\x1fWriteStreamOperationStartResult\x12\x42\n\x08query_id\x18\x01 \x01(\x0b\x32\'.spark.connect.StreamingQueryInstanceIdR\x07queryId\x12\x12\n\x04name\x18\x02 \x01(\tR\x04name\x12<\n\x18query_started_event_json\x18\x03 \x01(\tH\x00R\x15queryStartedEventJson\x88\x01\x01\x42\x1b\n\x19_query_started_event_json"A\n\x18StreamingQueryInstanceId\x12\x0e\n\x02id\x18\x01 \x01(\tR\x02id\x12\x15\n\x06run_id\x18\x02 \x01(\tR\x05runId"\xf8\x04\n\x15StreamingQueryCommand\x12\x42\n\x08query_id\x18\x01 \x01(\x0b\x32\'.spark.connect.StreamingQueryInstanceIdR\x07queryId\x12\x18\n\x06status\x18\x02 \x01(\x08H\x00R\x06status\x12%\n\rlast_progress\x18\x03 \x01(\x08H\x00R\x0clastProgress\x12)\n\x0frecent_progress\x18\x04 \x01(\x08H\x00R\x0erecentProgress\x12\x14\n\x04stop\x18\x05 \x01(\x08H\x00R\x04stop\x12\x34\n\x15process_all_available\x18\x06 \x01(\x08H\x00R\x13processAllAvailable\x12O\n\x07\x65xplain\x18\x07 \x01(\x0b\x32\x33.spark.connect.StreamingQueryCommand.ExplainCommandH\x00R\x07\x65xplain\x12\x1e\n\texception\x18\x08 \x01(\x08H\x00R\texception\x12k\n\x11\x61wait_termination\x18\t 
\x01(\x0b\x32<.spark.connect.StreamingQueryCommand.AwaitTerminationCommandH\x00R\x10\x61waitTermination\x1a,\n\x0e\x45xplainCommand\x12\x1a\n\x08\x65xtended\x18\x01 \x01(\x08R\x08\x65xtended\x1aL\n\x17\x41waitTerminationCommand\x12"\n\ntimeout_ms\x18\x02 \x01(\x03H\x00R\ttimeoutMs\x88\x01\x01\x42\r\n\x0b_timeout_msB\t\n\x07\x63ommand"\xf5\x08\n\x1bStreamingQueryCommandResult\x12\x42\n\x08query_id\x18\x01 \x01(\x0b\x32\'.spark.connect.StreamingQueryInstanceIdR\x07queryId\x12Q\n\x06status\x18\x02 \x01(\x0b\x32\x37.spark.connect.StreamingQueryCommandResult.StatusResultH\x00R\x06status\x12j\n\x0frecent_progress\x18\x03 \x01(\x0b\x32?.spark.connect.StreamingQueryCommandResult.RecentProgressResultH\x00R\x0erecentProgress\x12T\n\x07\x65xplain\x18\x04 \x01(\x0b\x32\x38.spark.connect.StreamingQueryCommandResult.ExplainResultH\x00R\x07\x65xplain\x12Z\n\texception\x18\x05 \x01(\x0b\x32:.spark.connect.StreamingQueryCommandResult.ExceptionResultH\x00R\texception\x12p\n\x11\x61wait_termination\x18\x06 \x01(\x0b\x32\x41.spark.connect.StreamingQueryCommandResult.AwaitTerminationResultH\x00R\x10\x61waitTermination\x1a\xaa\x01\n\x0cStatusResult\x12%\n\x0estatus_message\x18\x01 \x01(\tR\rstatusMessage\x12*\n\x11is_data_available\x18\x02 \x01(\x08R\x0fisDataAvailable\x12*\n\x11is_trigger_active\x18\x03 \x01(\x08R\x0fisTriggerActive\x12\x1b\n\tis_active\x18\x04 \x01(\x08R\x08isActive\x1aH\n\x14RecentProgressResult\x12\x30\n\x14recent_progress_json\x18\x05 \x03(\tR\x12recentProgressJson\x1a\'\n\rExplainResult\x12\x16\n\x06result\x18\x01 \x01(\tR\x06result\x1a\xc5\x01\n\x0f\x45xceptionResult\x12\x30\n\x11\x65xception_message\x18\x01 \x01(\tH\x00R\x10\x65xceptionMessage\x88\x01\x01\x12$\n\x0b\x65rror_class\x18\x02 \x01(\tH\x01R\nerrorClass\x88\x01\x01\x12$\n\x0bstack_trace\x18\x03 \x01(\tH\x02R\nstackTrace\x88\x01\x01\x42\x14\n\x12_exception_messageB\x0e\n\x0c_error_classB\x0e\n\x0c_stack_trace\x1a\x38\n\x16\x41waitTerminationResult\x12\x1e\n\nterminated\x18\x01 
\x01(\x08R\nterminatedB\r\n\x0bresult_type"\xbd\x06\n\x1cStreamingQueryManagerCommand\x12\x18\n\x06\x61\x63tive\x18\x01 \x01(\x08H\x00R\x06\x61\x63tive\x12\x1d\n\tget_query\x18\x02 \x01(\tH\x00R\x08getQuery\x12|\n\x15\x61wait_any_termination\x18\x03 \x01(\x0b\x32\x46.spark.connect.StreamingQueryManagerCommand.AwaitAnyTerminationCommandH\x00R\x13\x61waitAnyTermination\x12+\n\x10reset_terminated\x18\x04 \x01(\x08H\x00R\x0fresetTerminated\x12n\n\x0c\x61\x64\x64_listener\x18\x05 \x01(\x0b\x32I.spark.connect.StreamingQueryManagerCommand.StreamingQueryListenerCommandH\x00R\x0b\x61\x64\x64Listener\x12t\n\x0fremove_listener\x18\x06 \x01(\x0b\x32I.spark.connect.StreamingQueryManagerCommand.StreamingQueryListenerCommandH\x00R\x0eremoveListener\x12\'\n\x0elist_listeners\x18\x07 \x01(\x08H\x00R\rlistListeners\x1aO\n\x1a\x41waitAnyTerminationCommand\x12"\n\ntimeout_ms\x18\x01 \x01(\x03H\x00R\ttimeoutMs\x88\x01\x01\x42\r\n\x0b_timeout_ms\x1a\xcd\x01\n\x1dStreamingQueryListenerCommand\x12)\n\x10listener_payload\x18\x01 \x01(\x0cR\x0flistenerPayload\x12U\n\x17python_listener_payload\x18\x02 \x01(\x0b\x32\x18.spark.connect.PythonUDFH\x00R\x15pythonListenerPayload\x88\x01\x01\x12\x0e\n\x02id\x18\x03 \x01(\tR\x02idB\x1a\n\x18_python_listener_payloadB\t\n\x07\x63ommand"\xb4\x08\n"StreamingQueryManagerCommandResult\x12X\n\x06\x61\x63tive\x18\x01 \x01(\x0b\x32>.spark.connect.StreamingQueryManagerCommandResult.ActiveResultH\x00R\x06\x61\x63tive\x12`\n\x05query\x18\x02 \x01(\x0b\x32H.spark.connect.StreamingQueryManagerCommandResult.StreamingQueryInstanceH\x00R\x05query\x12\x81\x01\n\x15\x61wait_any_termination\x18\x03 \x01(\x0b\x32K.spark.connect.StreamingQueryManagerCommandResult.AwaitAnyTerminationResultH\x00R\x13\x61waitAnyTermination\x12+\n\x10reset_terminated\x18\x04 \x01(\x08H\x00R\x0fresetTerminated\x12#\n\x0c\x61\x64\x64_listener\x18\x05 \x01(\x08H\x00R\x0b\x61\x64\x64Listener\x12)\n\x0fremove_listener\x18\x06 \x01(\x08H\x00R\x0eremoveListener\x12{\n\x0elist_listeners\x18\x07 
\x01(\x0b\x32R.spark.connect.StreamingQueryManagerCommandResult.ListStreamingQueryListenerResultH\x00R\rlistListeners\x1a\x7f\n\x0c\x41\x63tiveResult\x12o\n\x0e\x61\x63tive_queries\x18\x01 \x03(\x0b\x32H.spark.connect.StreamingQueryManagerCommandResult.StreamingQueryInstanceR\ractiveQueries\x1as\n\x16StreamingQueryInstance\x12\x37\n\x02id\x18\x01 \x01(\x0b\x32\'.spark.connect.StreamingQueryInstanceIdR\x02id\x12\x17\n\x04name\x18\x02 \x01(\tH\x00R\x04name\x88\x01\x01\x42\x07\n\x05_name\x1a;\n\x19\x41waitAnyTerminationResult\x12\x1e\n\nterminated\x18\x01 \x01(\x08R\nterminated\x1aK\n\x1eStreamingQueryListenerInstance\x12)\n\x10listener_payload\x18\x01 \x01(\x0cR\x0flistenerPayload\x1a\x45\n ListStreamingQueryListenerResult\x12!\n\x0clistener_ids\x18\x01 \x03(\tR\x0blistenerIdsB\r\n\x0bresult_type"\xad\x01\n StreamingQueryListenerBusCommand\x12;\n\x19\x61\x64\x64_listener_bus_listener\x18\x01 \x01(\x08H\x00R\x16\x61\x64\x64ListenerBusListener\x12\x41\n\x1cremove_listener_bus_listener\x18\x02 \x01(\x08H\x00R\x19removeListenerBusListenerB\t\n\x07\x63ommand"\x83\x01\n\x1bStreamingQueryListenerEvent\x12\x1d\n\nevent_json\x18\x01 \x01(\tR\teventJson\x12\x45\n\nevent_type\x18\x02 \x01(\x0e\x32&.spark.connect.StreamingQueryEventTypeR\teventType"\xcc\x01\n"StreamingQueryListenerEventsResult\x12\x42\n\x06\x65vents\x18\x01 \x03(\x0b\x32*.spark.connect.StreamingQueryListenerEventR\x06\x65vents\x12\x42\n\x1blistener_bus_listener_added\x18\x02 \x01(\x08H\x00R\x18listenerBusListenerAdded\x88\x01\x01\x42\x1e\n\x1c_listener_bus_listener_added"\x15\n\x13GetResourcesCommand"\xd4\x01\n\x19GetResourcesCommandResult\x12U\n\tresources\x18\x01 \x03(\x0b\x32\x37.spark.connect.GetResourcesCommandResult.ResourcesEntryR\tresources\x1a`\n\x0eResourcesEntry\x12\x10\n\x03key\x18\x01 \x01(\tR\x03key\x12\x38\n\x05value\x18\x02 \x01(\x0b\x32".spark.connect.ResourceInformationR\x05value:\x02\x38\x01"X\n\x1c\x43reateResourceProfileCommand\x12\x38\n\x07profile\x18\x01 
\x01(\x0b\x32\x1e.spark.connect.ResourceProfileR\x07profile"C\n"CreateResourceProfileCommandResult\x12\x1d\n\nprofile_id\x18\x01 \x01(\x05R\tprofileId"d\n!RemoveCachedRemoteRelationCommand\x12?\n\x08relation\x18\x01 \x01(\x0b\x32#.spark.connect.CachedRemoteRelationR\x08relation"\xcd\x01\n\x11\x43heckpointCommand\x12\x33\n\x08relation\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x08relation\x12\x14\n\x05local\x18\x02 \x01(\x08R\x05local\x12\x14\n\x05\x65\x61ger\x18\x03 \x01(\x08R\x05\x65\x61ger\x12\x45\n\rstorage_level\x18\x04 \x01(\x0b\x32\x1b.spark.connect.StorageLevelH\x00R\x0cstorageLevel\x88\x01\x01\x42\x10\n\x0e_storage_level"\xe8\x03\n\x15MergeIntoTableCommand\x12*\n\x11target_table_name\x18\x01 \x01(\tR\x0ftargetTableName\x12\x43\n\x11source_table_plan\x18\x02 \x01(\x0b\x32\x17.spark.connect.RelationR\x0fsourceTablePlan\x12\x42\n\x0fmerge_condition\x18\x03 \x01(\x0b\x32\x19.spark.connect.ExpressionR\x0emergeCondition\x12>\n\rmatch_actions\x18\x04 \x03(\x0b\x32\x19.spark.connect.ExpressionR\x0cmatchActions\x12I\n\x13not_matched_actions\x18\x05 \x03(\x0b\x32\x19.spark.connect.ExpressionR\x11notMatchedActions\x12[\n\x1dnot_matched_by_source_actions\x18\x06 \x03(\x0b\x32\x19.spark.connect.ExpressionR\x19notMatchedBySourceActions\x12\x32\n\x15with_schema_evolution\x18\x07 \x01(\x08R\x13withSchemaEvolution*\x85\x01\n\x17StreamingQueryEventType\x12\x1e\n\x1aQUERY_PROGRESS_UNSPECIFIED\x10\x00\x12\x18\n\x14QUERY_PROGRESS_EVENT\x10\x01\x12\x1a\n\x16QUERY_TERMINATED_EVENT\x10\x02\x12\x14\n\x10QUERY_IDLE_EVENT\x10\x03\x42\x36\n\x1eorg.apache.spark.connect.protoP\x01Z\x12internal/generatedb\x06proto3' ) _globals = globals() @@ -78,114 +79,114 @@ _globals["_WRITESTREAMOPERATIONSTART_OPTIONSENTRY"]._serialized_options = b"8\001" _globals["_GETRESOURCESCOMMANDRESULT_RESOURCESENTRY"]._loaded_options = None _globals["_GETRESOURCESCOMMANDRESULT_RESOURCESENTRY"]._serialized_options = b"8\001" - _globals["_STREAMINGQUERYEVENTTYPE"]._serialized_start = 11252 - 
_globals["_STREAMINGQUERYEVENTTYPE"]._serialized_end = 11385 - _globals["_COMMAND"]._serialized_start = 167 - _globals["_COMMAND"]._serialized_end = 1847 - _globals["_SQLCOMMAND"]._serialized_start = 1850 - _globals["_SQLCOMMAND"]._serialized_end = 2404 - _globals["_SQLCOMMAND_ARGSENTRY"]._serialized_start = 2220 - _globals["_SQLCOMMAND_ARGSENTRY"]._serialized_end = 2310 - _globals["_SQLCOMMAND_NAMEDARGUMENTSENTRY"]._serialized_start = 2312 - _globals["_SQLCOMMAND_NAMEDARGUMENTSENTRY"]._serialized_end = 2404 - _globals["_CREATEDATAFRAMEVIEWCOMMAND"]._serialized_start = 2407 - _globals["_CREATEDATAFRAMEVIEWCOMMAND"]._serialized_end = 2557 - _globals["_WRITEOPERATION"]._serialized_start = 2560 - _globals["_WRITEOPERATION"]._serialized_end = 3658 - _globals["_WRITEOPERATION_OPTIONSENTRY"]._serialized_start = 3082 - _globals["_WRITEOPERATION_OPTIONSENTRY"]._serialized_end = 3140 - _globals["_WRITEOPERATION_SAVETABLE"]._serialized_start = 3143 - _globals["_WRITEOPERATION_SAVETABLE"]._serialized_end = 3401 - _globals["_WRITEOPERATION_SAVETABLE_TABLESAVEMETHOD"]._serialized_start = 3277 - _globals["_WRITEOPERATION_SAVETABLE_TABLESAVEMETHOD"]._serialized_end = 3401 - _globals["_WRITEOPERATION_BUCKETBY"]._serialized_start = 3403 - _globals["_WRITEOPERATION_BUCKETBY"]._serialized_end = 3494 - _globals["_WRITEOPERATION_SAVEMODE"]._serialized_start = 3497 - _globals["_WRITEOPERATION_SAVEMODE"]._serialized_end = 3634 - _globals["_WRITEOPERATIONV2"]._serialized_start = 3661 - _globals["_WRITEOPERATIONV2"]._serialized_end = 4521 - _globals["_WRITEOPERATIONV2_OPTIONSENTRY"]._serialized_start = 3082 - _globals["_WRITEOPERATIONV2_OPTIONSENTRY"]._serialized_end = 3140 - _globals["_WRITEOPERATIONV2_TABLEPROPERTIESENTRY"]._serialized_start = 4280 - _globals["_WRITEOPERATIONV2_TABLEPROPERTIESENTRY"]._serialized_end = 4346 - _globals["_WRITEOPERATIONV2_MODE"]._serialized_start = 4349 - _globals["_WRITEOPERATIONV2_MODE"]._serialized_end = 4508 - 
_globals["_WRITESTREAMOPERATIONSTART"]._serialized_start = 4524 - _globals["_WRITESTREAMOPERATIONSTART"]._serialized_end = 5380 - _globals["_WRITESTREAMOPERATIONSTART_OPTIONSENTRY"]._serialized_start = 3082 - _globals["_WRITESTREAMOPERATIONSTART_OPTIONSENTRY"]._serialized_end = 3140 - _globals["_STREAMINGFOREACHFUNCTION"]._serialized_start = 5383 - _globals["_STREAMINGFOREACHFUNCTION"]._serialized_end = 5562 - _globals["_WRITESTREAMOPERATIONSTARTRESULT"]._serialized_start = 5565 - _globals["_WRITESTREAMOPERATIONSTARTRESULT"]._serialized_end = 5777 - _globals["_STREAMINGQUERYINSTANCEID"]._serialized_start = 5779 - _globals["_STREAMINGQUERYINSTANCEID"]._serialized_end = 5844 - _globals["_STREAMINGQUERYCOMMAND"]._serialized_start = 5847 - _globals["_STREAMINGQUERYCOMMAND"]._serialized_end = 6479 - _globals["_STREAMINGQUERYCOMMAND_EXPLAINCOMMAND"]._serialized_start = 6346 - _globals["_STREAMINGQUERYCOMMAND_EXPLAINCOMMAND"]._serialized_end = 6390 - _globals["_STREAMINGQUERYCOMMAND_AWAITTERMINATIONCOMMAND"]._serialized_start = 6392 - _globals["_STREAMINGQUERYCOMMAND_AWAITTERMINATIONCOMMAND"]._serialized_end = 6468 - _globals["_STREAMINGQUERYCOMMANDRESULT"]._serialized_start = 6482 - _globals["_STREAMINGQUERYCOMMANDRESULT"]._serialized_end = 7623 - _globals["_STREAMINGQUERYCOMMANDRESULT_STATUSRESULT"]._serialized_start = 7065 - _globals["_STREAMINGQUERYCOMMANDRESULT_STATUSRESULT"]._serialized_end = 7235 - _globals["_STREAMINGQUERYCOMMANDRESULT_RECENTPROGRESSRESULT"]._serialized_start = 7237 - _globals["_STREAMINGQUERYCOMMANDRESULT_RECENTPROGRESSRESULT"]._serialized_end = 7309 - _globals["_STREAMINGQUERYCOMMANDRESULT_EXPLAINRESULT"]._serialized_start = 7311 - _globals["_STREAMINGQUERYCOMMANDRESULT_EXPLAINRESULT"]._serialized_end = 7350 - _globals["_STREAMINGQUERYCOMMANDRESULT_EXCEPTIONRESULT"]._serialized_start = 7353 - _globals["_STREAMINGQUERYCOMMANDRESULT_EXCEPTIONRESULT"]._serialized_end = 7550 - 
_globals["_STREAMINGQUERYCOMMANDRESULT_AWAITTERMINATIONRESULT"]._serialized_start = 7552 - _globals["_STREAMINGQUERYCOMMANDRESULT_AWAITTERMINATIONRESULT"]._serialized_end = 7608 - _globals["_STREAMINGQUERYMANAGERCOMMAND"]._serialized_start = 7626 - _globals["_STREAMINGQUERYMANAGERCOMMAND"]._serialized_end = 8455 - _globals["_STREAMINGQUERYMANAGERCOMMAND_AWAITANYTERMINATIONCOMMAND"]._serialized_start = 8157 - _globals["_STREAMINGQUERYMANAGERCOMMAND_AWAITANYTERMINATIONCOMMAND"]._serialized_end = 8236 - _globals["_STREAMINGQUERYMANAGERCOMMAND_STREAMINGQUERYLISTENERCOMMAND"]._serialized_start = 8239 - _globals["_STREAMINGQUERYMANAGERCOMMAND_STREAMINGQUERYLISTENERCOMMAND"]._serialized_end = 8444 - _globals["_STREAMINGQUERYMANAGERCOMMANDRESULT"]._serialized_start = 8458 - _globals["_STREAMINGQUERYMANAGERCOMMANDRESULT"]._serialized_end = 9534 - _globals["_STREAMINGQUERYMANAGERCOMMANDRESULT_ACTIVERESULT"]._serialized_start = 9066 - _globals["_STREAMINGQUERYMANAGERCOMMANDRESULT_ACTIVERESULT"]._serialized_end = 9193 - _globals["_STREAMINGQUERYMANAGERCOMMANDRESULT_STREAMINGQUERYINSTANCE"]._serialized_start = 9195 - _globals["_STREAMINGQUERYMANAGERCOMMANDRESULT_STREAMINGQUERYINSTANCE"]._serialized_end = 9310 + _globals["_STREAMINGQUERYEVENTTYPE"]._serialized_start = 11335 + _globals["_STREAMINGQUERYEVENTTYPE"]._serialized_end = 11468 + _globals["_COMMAND"]._serialized_start = 191 + _globals["_COMMAND"]._serialized_end = 1930 + _globals["_SQLCOMMAND"]._serialized_start = 1933 + _globals["_SQLCOMMAND"]._serialized_end = 2487 + _globals["_SQLCOMMAND_ARGSENTRY"]._serialized_start = 2303 + _globals["_SQLCOMMAND_ARGSENTRY"]._serialized_end = 2393 + _globals["_SQLCOMMAND_NAMEDARGUMENTSENTRY"]._serialized_start = 2395 + _globals["_SQLCOMMAND_NAMEDARGUMENTSENTRY"]._serialized_end = 2487 + _globals["_CREATEDATAFRAMEVIEWCOMMAND"]._serialized_start = 2490 + _globals["_CREATEDATAFRAMEVIEWCOMMAND"]._serialized_end = 2640 + _globals["_WRITEOPERATION"]._serialized_start = 2643 + 
_globals["_WRITEOPERATION"]._serialized_end = 3741 + _globals["_WRITEOPERATION_OPTIONSENTRY"]._serialized_start = 3165 + _globals["_WRITEOPERATION_OPTIONSENTRY"]._serialized_end = 3223 + _globals["_WRITEOPERATION_SAVETABLE"]._serialized_start = 3226 + _globals["_WRITEOPERATION_SAVETABLE"]._serialized_end = 3484 + _globals["_WRITEOPERATION_SAVETABLE_TABLESAVEMETHOD"]._serialized_start = 3360 + _globals["_WRITEOPERATION_SAVETABLE_TABLESAVEMETHOD"]._serialized_end = 3484 + _globals["_WRITEOPERATION_BUCKETBY"]._serialized_start = 3486 + _globals["_WRITEOPERATION_BUCKETBY"]._serialized_end = 3577 + _globals["_WRITEOPERATION_SAVEMODE"]._serialized_start = 3580 + _globals["_WRITEOPERATION_SAVEMODE"]._serialized_end = 3717 + _globals["_WRITEOPERATIONV2"]._serialized_start = 3744 + _globals["_WRITEOPERATIONV2"]._serialized_end = 4604 + _globals["_WRITEOPERATIONV2_OPTIONSENTRY"]._serialized_start = 3165 + _globals["_WRITEOPERATIONV2_OPTIONSENTRY"]._serialized_end = 3223 + _globals["_WRITEOPERATIONV2_TABLEPROPERTIESENTRY"]._serialized_start = 4363 + _globals["_WRITEOPERATIONV2_TABLEPROPERTIESENTRY"]._serialized_end = 4429 + _globals["_WRITEOPERATIONV2_MODE"]._serialized_start = 4432 + _globals["_WRITEOPERATIONV2_MODE"]._serialized_end = 4591 + _globals["_WRITESTREAMOPERATIONSTART"]._serialized_start = 4607 + _globals["_WRITESTREAMOPERATIONSTART"]._serialized_end = 5463 + _globals["_WRITESTREAMOPERATIONSTART_OPTIONSENTRY"]._serialized_start = 3165 + _globals["_WRITESTREAMOPERATIONSTART_OPTIONSENTRY"]._serialized_end = 3223 + _globals["_STREAMINGFOREACHFUNCTION"]._serialized_start = 5466 + _globals["_STREAMINGFOREACHFUNCTION"]._serialized_end = 5645 + _globals["_WRITESTREAMOPERATIONSTARTRESULT"]._serialized_start = 5648 + _globals["_WRITESTREAMOPERATIONSTARTRESULT"]._serialized_end = 5860 + _globals["_STREAMINGQUERYINSTANCEID"]._serialized_start = 5862 + _globals["_STREAMINGQUERYINSTANCEID"]._serialized_end = 5927 + _globals["_STREAMINGQUERYCOMMAND"]._serialized_start = 5930 + 
_globals["_STREAMINGQUERYCOMMAND"]._serialized_end = 6562 + _globals["_STREAMINGQUERYCOMMAND_EXPLAINCOMMAND"]._serialized_start = 6429 + _globals["_STREAMINGQUERYCOMMAND_EXPLAINCOMMAND"]._serialized_end = 6473 + _globals["_STREAMINGQUERYCOMMAND_AWAITTERMINATIONCOMMAND"]._serialized_start = 6475 + _globals["_STREAMINGQUERYCOMMAND_AWAITTERMINATIONCOMMAND"]._serialized_end = 6551 + _globals["_STREAMINGQUERYCOMMANDRESULT"]._serialized_start = 6565 + _globals["_STREAMINGQUERYCOMMANDRESULT"]._serialized_end = 7706 + _globals["_STREAMINGQUERYCOMMANDRESULT_STATUSRESULT"]._serialized_start = 7148 + _globals["_STREAMINGQUERYCOMMANDRESULT_STATUSRESULT"]._serialized_end = 7318 + _globals["_STREAMINGQUERYCOMMANDRESULT_RECENTPROGRESSRESULT"]._serialized_start = 7320 + _globals["_STREAMINGQUERYCOMMANDRESULT_RECENTPROGRESSRESULT"]._serialized_end = 7392 + _globals["_STREAMINGQUERYCOMMANDRESULT_EXPLAINRESULT"]._serialized_start = 7394 + _globals["_STREAMINGQUERYCOMMANDRESULT_EXPLAINRESULT"]._serialized_end = 7433 + _globals["_STREAMINGQUERYCOMMANDRESULT_EXCEPTIONRESULT"]._serialized_start = 7436 + _globals["_STREAMINGQUERYCOMMANDRESULT_EXCEPTIONRESULT"]._serialized_end = 7633 + _globals["_STREAMINGQUERYCOMMANDRESULT_AWAITTERMINATIONRESULT"]._serialized_start = 7635 + _globals["_STREAMINGQUERYCOMMANDRESULT_AWAITTERMINATIONRESULT"]._serialized_end = 7691 + _globals["_STREAMINGQUERYMANAGERCOMMAND"]._serialized_start = 7709 + _globals["_STREAMINGQUERYMANAGERCOMMAND"]._serialized_end = 8538 + _globals["_STREAMINGQUERYMANAGERCOMMAND_AWAITANYTERMINATIONCOMMAND"]._serialized_start = 8240 + _globals["_STREAMINGQUERYMANAGERCOMMAND_AWAITANYTERMINATIONCOMMAND"]._serialized_end = 8319 + _globals["_STREAMINGQUERYMANAGERCOMMAND_STREAMINGQUERYLISTENERCOMMAND"]._serialized_start = 8322 + _globals["_STREAMINGQUERYMANAGERCOMMAND_STREAMINGQUERYLISTENERCOMMAND"]._serialized_end = 8527 + _globals["_STREAMINGQUERYMANAGERCOMMANDRESULT"]._serialized_start = 8541 + 
_globals["_STREAMINGQUERYMANAGERCOMMANDRESULT"]._serialized_end = 9617 + _globals["_STREAMINGQUERYMANAGERCOMMANDRESULT_ACTIVERESULT"]._serialized_start = 9149 + _globals["_STREAMINGQUERYMANAGERCOMMANDRESULT_ACTIVERESULT"]._serialized_end = 9276 + _globals["_STREAMINGQUERYMANAGERCOMMANDRESULT_STREAMINGQUERYINSTANCE"]._serialized_start = 9278 + _globals["_STREAMINGQUERYMANAGERCOMMANDRESULT_STREAMINGQUERYINSTANCE"]._serialized_end = 9393 _globals[ "_STREAMINGQUERYMANAGERCOMMANDRESULT_AWAITANYTERMINATIONRESULT" - ]._serialized_start = 9312 - _globals["_STREAMINGQUERYMANAGERCOMMANDRESULT_AWAITANYTERMINATIONRESULT"]._serialized_end = 9371 + ]._serialized_start = 9395 + _globals["_STREAMINGQUERYMANAGERCOMMANDRESULT_AWAITANYTERMINATIONRESULT"]._serialized_end = 9454 _globals[ "_STREAMINGQUERYMANAGERCOMMANDRESULT_STREAMINGQUERYLISTENERINSTANCE" - ]._serialized_start = 9373 + ]._serialized_start = 9456 _globals[ "_STREAMINGQUERYMANAGERCOMMANDRESULT_STREAMINGQUERYLISTENERINSTANCE" - ]._serialized_end = 9448 + ]._serialized_end = 9531 _globals[ "_STREAMINGQUERYMANAGERCOMMANDRESULT_LISTSTREAMINGQUERYLISTENERRESULT" - ]._serialized_start = 9450 + ]._serialized_start = 9533 _globals[ "_STREAMINGQUERYMANAGERCOMMANDRESULT_LISTSTREAMINGQUERYLISTENERRESULT" - ]._serialized_end = 9519 - _globals["_STREAMINGQUERYLISTENERBUSCOMMAND"]._serialized_start = 9537 - _globals["_STREAMINGQUERYLISTENERBUSCOMMAND"]._serialized_end = 9710 - _globals["_STREAMINGQUERYLISTENEREVENT"]._serialized_start = 9713 - _globals["_STREAMINGQUERYLISTENEREVENT"]._serialized_end = 9844 - _globals["_STREAMINGQUERYLISTENEREVENTSRESULT"]._serialized_start = 9847 - _globals["_STREAMINGQUERYLISTENEREVENTSRESULT"]._serialized_end = 10051 - _globals["_GETRESOURCESCOMMAND"]._serialized_start = 10053 - _globals["_GETRESOURCESCOMMAND"]._serialized_end = 10074 - _globals["_GETRESOURCESCOMMANDRESULT"]._serialized_start = 10077 - _globals["_GETRESOURCESCOMMANDRESULT"]._serialized_end = 10289 - 
_globals["_GETRESOURCESCOMMANDRESULT_RESOURCESENTRY"]._serialized_start = 10193 - _globals["_GETRESOURCESCOMMANDRESULT_RESOURCESENTRY"]._serialized_end = 10289 - _globals["_CREATERESOURCEPROFILECOMMAND"]._serialized_start = 10291 - _globals["_CREATERESOURCEPROFILECOMMAND"]._serialized_end = 10379 - _globals["_CREATERESOURCEPROFILECOMMANDRESULT"]._serialized_start = 10381 - _globals["_CREATERESOURCEPROFILECOMMANDRESULT"]._serialized_end = 10448 - _globals["_REMOVECACHEDREMOTERELATIONCOMMAND"]._serialized_start = 10450 - _globals["_REMOVECACHEDREMOTERELATIONCOMMAND"]._serialized_end = 10550 - _globals["_CHECKPOINTCOMMAND"]._serialized_start = 10553 - _globals["_CHECKPOINTCOMMAND"]._serialized_end = 10758 - _globals["_MERGEINTOTABLECOMMAND"]._serialized_start = 10761 - _globals["_MERGEINTOTABLECOMMAND"]._serialized_end = 11249 + ]._serialized_end = 9602 + _globals["_STREAMINGQUERYLISTENERBUSCOMMAND"]._serialized_start = 9620 + _globals["_STREAMINGQUERYLISTENERBUSCOMMAND"]._serialized_end = 9793 + _globals["_STREAMINGQUERYLISTENEREVENT"]._serialized_start = 9796 + _globals["_STREAMINGQUERYLISTENEREVENT"]._serialized_end = 9927 + _globals["_STREAMINGQUERYLISTENEREVENTSRESULT"]._serialized_start = 9930 + _globals["_STREAMINGQUERYLISTENEREVENTSRESULT"]._serialized_end = 10134 + _globals["_GETRESOURCESCOMMAND"]._serialized_start = 10136 + _globals["_GETRESOURCESCOMMAND"]._serialized_end = 10157 + _globals["_GETRESOURCESCOMMANDRESULT"]._serialized_start = 10160 + _globals["_GETRESOURCESCOMMANDRESULT"]._serialized_end = 10372 + _globals["_GETRESOURCESCOMMANDRESULT_RESOURCESENTRY"]._serialized_start = 10276 + _globals["_GETRESOURCESCOMMANDRESULT_RESOURCESENTRY"]._serialized_end = 10372 + _globals["_CREATERESOURCEPROFILECOMMAND"]._serialized_start = 10374 + _globals["_CREATERESOURCEPROFILECOMMAND"]._serialized_end = 10462 + _globals["_CREATERESOURCEPROFILECOMMANDRESULT"]._serialized_start = 10464 + _globals["_CREATERESOURCEPROFILECOMMANDRESULT"]._serialized_end = 10531 + 
_globals["_REMOVECACHEDREMOTERELATIONCOMMAND"]._serialized_start = 10533 + _globals["_REMOVECACHEDREMOTERELATIONCOMMAND"]._serialized_end = 10633 + _globals["_CHECKPOINTCOMMAND"]._serialized_start = 10636 + _globals["_CHECKPOINTCOMMAND"]._serialized_end = 10841 + _globals["_MERGEINTOTABLECOMMAND"]._serialized_start = 10844 + _globals["_MERGEINTOTABLECOMMAND"]._serialized_end = 11332 # @@protoc_insertion_point(module_scope) diff --git a/python/pyspark/sql/connect/proto/commands_pb2.pyi b/python/pyspark/sql/connect/proto/commands_pb2.pyi index 6192a29607cbf..906f1aad10574 100644 --- a/python/pyspark/sql/connect/proto/commands_pb2.pyi +++ b/python/pyspark/sql/connect/proto/commands_pb2.pyi @@ -42,6 +42,7 @@ import google.protobuf.internal.enum_type_wrapper import google.protobuf.message import pyspark.sql.connect.proto.common_pb2 import pyspark.sql.connect.proto.expressions_pb2 +import pyspark.sql.connect.proto.ml_pb2 import pyspark.sql.connect.proto.relations_pb2 import sys import typing @@ -104,6 +105,7 @@ class Command(google.protobuf.message.Message): CHECKPOINT_COMMAND_FIELD_NUMBER: builtins.int REMOVE_CACHED_REMOTE_RELATION_COMMAND_FIELD_NUMBER: builtins.int MERGE_INTO_TABLE_COMMAND_FIELD_NUMBER: builtins.int + ML_COMMAND_FIELD_NUMBER: builtins.int EXTENSION_FIELD_NUMBER: builtins.int @property def register_function( @@ -146,6 +148,8 @@ class Command(google.protobuf.message.Message): @property def merge_into_table_command(self) -> global___MergeIntoTableCommand: ... @property + def ml_command(self) -> pyspark.sql.connect.proto.ml_pb2.MlCommand: ... + @property def extension(self) -> google.protobuf.any_pb2.Any: """This field is used to mark extensions to the protocol. When plugins generate arbitrary Commands they can add them here. During the planning the correct resolution is done. 
@@ -174,6 +178,7 @@ class Command(google.protobuf.message.Message): remove_cached_remote_relation_command: global___RemoveCachedRemoteRelationCommand | None = ..., merge_into_table_command: global___MergeIntoTableCommand | None = ..., + ml_command: pyspark.sql.connect.proto.ml_pb2.MlCommand | None = ..., extension: google.protobuf.any_pb2.Any | None = ..., ) -> None: ... def HasField( @@ -193,6 +198,8 @@ class Command(google.protobuf.message.Message): b"get_resources_command", "merge_into_table_command", b"merge_into_table_command", + "ml_command", + b"ml_command", "register_data_source", b"register_data_source", "register_function", @@ -234,6 +241,8 @@ class Command(google.protobuf.message.Message): b"get_resources_command", "merge_into_table_command", b"merge_into_table_command", + "ml_command", + b"ml_command", "register_data_source", b"register_data_source", "register_function", @@ -278,6 +287,7 @@ class Command(google.protobuf.message.Message): "checkpoint_command", "remove_cached_remote_relation_command", "merge_into_table_command", + "ml_command", "extension", ] | None diff --git a/python/pyspark/sql/connect/proto/expressions_pb2.py b/python/pyspark/sql/connect/proto/expressions_pb2.py index 0d4730ac736e3..7edcbcac15c73 100644 --- a/python/pyspark/sql/connect/proto/expressions_pb2.py +++ b/python/pyspark/sql/connect/proto/expressions_pb2.py @@ -40,7 +40,7 @@ DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile( - b'\n\x1fspark/connect/expressions.proto\x12\rspark.connect\x1a\x19google/protobuf/any.proto\x1a\x19spark/connect/types.proto\x1a\x1aspark/connect/common.proto"\xc1\x30\n\nExpression\x12\x37\n\x06\x63ommon\x18\x12 \x01(\x0b\x32\x1f.spark.connect.ExpressionCommonR\x06\x63ommon\x12=\n\x07literal\x18\x01 \x01(\x0b\x32!.spark.connect.Expression.LiteralH\x00R\x07literal\x12\x62\n\x14unresolved_attribute\x18\x02 \x01(\x0b\x32-.spark.connect.Expression.UnresolvedAttributeH\x00R\x13unresolvedAttribute\x12_\n\x13unresolved_function\x18\x03 
\x01(\x0b\x32,.spark.connect.Expression.UnresolvedFunctionH\x00R\x12unresolvedFunction\x12Y\n\x11\x65xpression_string\x18\x04 \x01(\x0b\x32*.spark.connect.Expression.ExpressionStringH\x00R\x10\x65xpressionString\x12S\n\x0funresolved_star\x18\x05 \x01(\x0b\x32(.spark.connect.Expression.UnresolvedStarH\x00R\x0eunresolvedStar\x12\x37\n\x05\x61lias\x18\x06 \x01(\x0b\x32\x1f.spark.connect.Expression.AliasH\x00R\x05\x61lias\x12\x34\n\x04\x63\x61st\x18\x07 \x01(\x0b\x32\x1e.spark.connect.Expression.CastH\x00R\x04\x63\x61st\x12V\n\x10unresolved_regex\x18\x08 \x01(\x0b\x32).spark.connect.Expression.UnresolvedRegexH\x00R\x0funresolvedRegex\x12\x44\n\nsort_order\x18\t \x01(\x0b\x32#.spark.connect.Expression.SortOrderH\x00R\tsortOrder\x12S\n\x0flambda_function\x18\n \x01(\x0b\x32(.spark.connect.Expression.LambdaFunctionH\x00R\x0elambdaFunction\x12:\n\x06window\x18\x0b \x01(\x0b\x32 .spark.connect.Expression.WindowH\x00R\x06window\x12l\n\x18unresolved_extract_value\x18\x0c \x01(\x0b\x32\x30.spark.connect.Expression.UnresolvedExtractValueH\x00R\x16unresolvedExtractValue\x12M\n\rupdate_fields\x18\r \x01(\x0b\x32&.spark.connect.Expression.UpdateFieldsH\x00R\x0cupdateFields\x12\x82\x01\n unresolved_named_lambda_variable\x18\x0e \x01(\x0b\x32\x37.spark.connect.Expression.UnresolvedNamedLambdaVariableH\x00R\x1dunresolvedNamedLambdaVariable\x12~\n#common_inline_user_defined_function\x18\x0f \x01(\x0b\x32..spark.connect.CommonInlineUserDefinedFunctionH\x00R\x1f\x63ommonInlineUserDefinedFunction\x12\x42\n\rcall_function\x18\x10 \x01(\x0b\x32\x1b.spark.connect.CallFunctionH\x00R\x0c\x63\x61llFunction\x12\x64\n\x19named_argument_expression\x18\x11 \x01(\x0b\x32&.spark.connect.NamedArgumentExpressionH\x00R\x17namedArgumentExpression\x12?\n\x0cmerge_action\x18\x13 \x01(\x0b\x32\x1a.spark.connect.MergeActionH\x00R\x0bmergeAction\x12g\n\x1atyped_aggregate_expression\x18\x14 
\x01(\x0b\x32\'.spark.connect.TypedAggregateExpressionH\x00R\x18typedAggregateExpression\x12\x35\n\textension\x18\xe7\x07 \x01(\x0b\x32\x14.google.protobuf.AnyH\x00R\textension\x1a\x8f\x06\n\x06Window\x12\x42\n\x0fwindow_function\x18\x01 \x01(\x0b\x32\x19.spark.connect.ExpressionR\x0ewindowFunction\x12@\n\x0epartition_spec\x18\x02 \x03(\x0b\x32\x19.spark.connect.ExpressionR\rpartitionSpec\x12\x42\n\norder_spec\x18\x03 \x03(\x0b\x32#.spark.connect.Expression.SortOrderR\torderSpec\x12K\n\nframe_spec\x18\x04 \x01(\x0b\x32,.spark.connect.Expression.Window.WindowFrameR\tframeSpec\x1a\xed\x03\n\x0bWindowFrame\x12U\n\nframe_type\x18\x01 \x01(\x0e\x32\x36.spark.connect.Expression.Window.WindowFrame.FrameTypeR\tframeType\x12P\n\x05lower\x18\x02 \x01(\x0b\x32:.spark.connect.Expression.Window.WindowFrame.FrameBoundaryR\x05lower\x12P\n\x05upper\x18\x03 \x01(\x0b\x32:.spark.connect.Expression.Window.WindowFrame.FrameBoundaryR\x05upper\x1a\x91\x01\n\rFrameBoundary\x12!\n\x0b\x63urrent_row\x18\x01 \x01(\x08H\x00R\ncurrentRow\x12\x1e\n\tunbounded\x18\x02 \x01(\x08H\x00R\tunbounded\x12\x31\n\x05value\x18\x03 \x01(\x0b\x32\x19.spark.connect.ExpressionH\x00R\x05valueB\n\n\x08\x62oundary"O\n\tFrameType\x12\x18\n\x14\x46RAME_TYPE_UNDEFINED\x10\x00\x12\x12\n\x0e\x46RAME_TYPE_ROW\x10\x01\x12\x14\n\x10\x46RAME_TYPE_RANGE\x10\x02\x1a\xa9\x03\n\tSortOrder\x12/\n\x05\x63hild\x18\x01 \x01(\x0b\x32\x19.spark.connect.ExpressionR\x05\x63hild\x12O\n\tdirection\x18\x02 \x01(\x0e\x32\x31.spark.connect.Expression.SortOrder.SortDirectionR\tdirection\x12U\n\rnull_ordering\x18\x03 
\x01(\x0e\x32\x30.spark.connect.Expression.SortOrder.NullOrderingR\x0cnullOrdering"l\n\rSortDirection\x12\x1e\n\x1aSORT_DIRECTION_UNSPECIFIED\x10\x00\x12\x1c\n\x18SORT_DIRECTION_ASCENDING\x10\x01\x12\x1d\n\x19SORT_DIRECTION_DESCENDING\x10\x02"U\n\x0cNullOrdering\x12\x1a\n\x16SORT_NULLS_UNSPECIFIED\x10\x00\x12\x14\n\x10SORT_NULLS_FIRST\x10\x01\x12\x13\n\x0fSORT_NULLS_LAST\x10\x02\x1a\xbb\x02\n\x04\x43\x61st\x12-\n\x04\x65xpr\x18\x01 \x01(\x0b\x32\x19.spark.connect.ExpressionR\x04\x65xpr\x12-\n\x04type\x18\x02 \x01(\x0b\x32\x17.spark.connect.DataTypeH\x00R\x04type\x12\x1b\n\x08type_str\x18\x03 \x01(\tH\x00R\x07typeStr\x12\x44\n\teval_mode\x18\x04 \x01(\x0e\x32\'.spark.connect.Expression.Cast.EvalModeR\x08\x65valMode"b\n\x08\x45valMode\x12\x19\n\x15\x45VAL_MODE_UNSPECIFIED\x10\x00\x12\x14\n\x10\x45VAL_MODE_LEGACY\x10\x01\x12\x12\n\x0e\x45VAL_MODE_ANSI\x10\x02\x12\x11\n\rEVAL_MODE_TRY\x10\x03\x42\x0e\n\x0c\x63\x61st_to_type\x1a\x9b\x0c\n\x07Literal\x12-\n\x04null\x18\x01 \x01(\x0b\x32\x17.spark.connect.DataTypeH\x00R\x04null\x12\x18\n\x06\x62inary\x18\x02 \x01(\x0cH\x00R\x06\x62inary\x12\x1a\n\x07\x62oolean\x18\x03 \x01(\x08H\x00R\x07\x62oolean\x12\x14\n\x04\x62yte\x18\x04 \x01(\x05H\x00R\x04\x62yte\x12\x16\n\x05short\x18\x05 \x01(\x05H\x00R\x05short\x12\x1a\n\x07integer\x18\x06 \x01(\x05H\x00R\x07integer\x12\x14\n\x04long\x18\x07 \x01(\x03H\x00R\x04long\x12\x16\n\x05\x66loat\x18\n \x01(\x02H\x00R\x05\x66loat\x12\x18\n\x06\x64ouble\x18\x0b \x01(\x01H\x00R\x06\x64ouble\x12\x45\n\x07\x64\x65\x63imal\x18\x0c \x01(\x0b\x32).spark.connect.Expression.Literal.DecimalH\x00R\x07\x64\x65\x63imal\x12\x18\n\x06string\x18\r \x01(\tH\x00R\x06string\x12\x14\n\x04\x64\x61te\x18\x10 \x01(\x05H\x00R\x04\x64\x61te\x12\x1e\n\ttimestamp\x18\x11 \x01(\x03H\x00R\ttimestamp\x12%\n\rtimestamp_ntz\x18\x12 \x01(\x03H\x00R\x0ctimestampNtz\x12\x61\n\x11\x63\x61lendar_interval\x18\x13 
\x01(\x0b\x32\x32.spark.connect.Expression.Literal.CalendarIntervalH\x00R\x10\x63\x61lendarInterval\x12\x30\n\x13year_month_interval\x18\x14 \x01(\x05H\x00R\x11yearMonthInterval\x12,\n\x11\x64\x61y_time_interval\x18\x15 \x01(\x03H\x00R\x0f\x64\x61yTimeInterval\x12?\n\x05\x61rray\x18\x16 \x01(\x0b\x32\'.spark.connect.Expression.Literal.ArrayH\x00R\x05\x61rray\x12\x39\n\x03map\x18\x17 \x01(\x0b\x32%.spark.connect.Expression.Literal.MapH\x00R\x03map\x12\x42\n\x06struct\x18\x18 \x01(\x0b\x32(.spark.connect.Expression.Literal.StructH\x00R\x06struct\x1au\n\x07\x44\x65\x63imal\x12\x14\n\x05value\x18\x01 \x01(\tR\x05value\x12!\n\tprecision\x18\x02 \x01(\x05H\x00R\tprecision\x88\x01\x01\x12\x19\n\x05scale\x18\x03 \x01(\x05H\x01R\x05scale\x88\x01\x01\x42\x0c\n\n_precisionB\x08\n\x06_scale\x1a\x62\n\x10\x43\x61lendarInterval\x12\x16\n\x06months\x18\x01 \x01(\x05R\x06months\x12\x12\n\x04\x64\x61ys\x18\x02 \x01(\x05R\x04\x64\x61ys\x12"\n\x0cmicroseconds\x18\x03 \x01(\x03R\x0cmicroseconds\x1a\x82\x01\n\x05\x41rray\x12:\n\x0c\x65lement_type\x18\x01 \x01(\x0b\x32\x17.spark.connect.DataTypeR\x0b\x65lementType\x12=\n\x08\x65lements\x18\x02 \x03(\x0b\x32!.spark.connect.Expression.LiteralR\x08\x65lements\x1a\xe3\x01\n\x03Map\x12\x32\n\x08key_type\x18\x01 \x01(\x0b\x32\x17.spark.connect.DataTypeR\x07keyType\x12\x36\n\nvalue_type\x18\x02 \x01(\x0b\x32\x17.spark.connect.DataTypeR\tvalueType\x12\x35\n\x04keys\x18\x03 \x03(\x0b\x32!.spark.connect.Expression.LiteralR\x04keys\x12\x39\n\x06values\x18\x04 \x03(\x0b\x32!.spark.connect.Expression.LiteralR\x06values\x1a\x81\x01\n\x06Struct\x12\x38\n\x0bstruct_type\x18\x01 \x01(\x0b\x32\x17.spark.connect.DataTypeR\nstructType\x12=\n\x08\x65lements\x18\x02 \x03(\x0b\x32!.spark.connect.Expression.LiteralR\x08\x65lementsB\x0e\n\x0cliteral_type\x1a\xba\x01\n\x13UnresolvedAttribute\x12/\n\x13unparsed_identifier\x18\x01 \x01(\tR\x12unparsedIdentifier\x12\x1c\n\x07plan_id\x18\x02 
\x01(\x03H\x00R\x06planId\x88\x01\x01\x12\x31\n\x12is_metadata_column\x18\x03 \x01(\x08H\x01R\x10isMetadataColumn\x88\x01\x01\x42\n\n\x08_plan_idB\x15\n\x13_is_metadata_column\x1a\xcc\x01\n\x12UnresolvedFunction\x12#\n\rfunction_name\x18\x01 \x01(\tR\x0c\x66unctionName\x12\x37\n\targuments\x18\x02 \x03(\x0b\x32\x19.spark.connect.ExpressionR\targuments\x12\x1f\n\x0bis_distinct\x18\x03 \x01(\x08R\nisDistinct\x12\x37\n\x18is_user_defined_function\x18\x04 \x01(\x08R\x15isUserDefinedFunction\x1a\x32\n\x10\x45xpressionString\x12\x1e\n\nexpression\x18\x01 \x01(\tR\nexpression\x1a|\n\x0eUnresolvedStar\x12,\n\x0funparsed_target\x18\x01 \x01(\tH\x00R\x0eunparsedTarget\x88\x01\x01\x12\x1c\n\x07plan_id\x18\x02 \x01(\x03H\x01R\x06planId\x88\x01\x01\x42\x12\n\x10_unparsed_targetB\n\n\x08_plan_id\x1aV\n\x0fUnresolvedRegex\x12\x19\n\x08\x63ol_name\x18\x01 \x01(\tR\x07\x63olName\x12\x1c\n\x07plan_id\x18\x02 \x01(\x03H\x00R\x06planId\x88\x01\x01\x42\n\n\x08_plan_id\x1a\x84\x01\n\x16UnresolvedExtractValue\x12/\n\x05\x63hild\x18\x01 \x01(\x0b\x32\x19.spark.connect.ExpressionR\x05\x63hild\x12\x39\n\nextraction\x18\x02 \x01(\x0b\x32\x19.spark.connect.ExpressionR\nextraction\x1a\xbb\x01\n\x0cUpdateFields\x12\x46\n\x11struct_expression\x18\x01 \x01(\x0b\x32\x19.spark.connect.ExpressionR\x10structExpression\x12\x1d\n\nfield_name\x18\x02 \x01(\tR\tfieldName\x12\x44\n\x10value_expression\x18\x03 \x01(\x0b\x32\x19.spark.connect.ExpressionR\x0fvalueExpression\x1ax\n\x05\x41lias\x12-\n\x04\x65xpr\x18\x01 \x01(\x0b\x32\x19.spark.connect.ExpressionR\x04\x65xpr\x12\x12\n\x04name\x18\x02 \x03(\tR\x04name\x12\x1f\n\x08metadata\x18\x03 \x01(\tH\x00R\x08metadata\x88\x01\x01\x42\x0b\n\t_metadata\x1a\x9e\x01\n\x0eLambdaFunction\x12\x35\n\x08\x66unction\x18\x01 \x01(\x0b\x32\x19.spark.connect.ExpressionR\x08\x66unction\x12U\n\targuments\x18\x02 \x03(\x0b\x32\x37.spark.connect.Expression.UnresolvedNamedLambdaVariableR\targuments\x1a>\n\x1dUnresolvedNamedLambdaVariable\x12\x1d\n\nname_parts\x18\x01 
\x03(\tR\tnamePartsB\x0b\n\texpr_type"A\n\x10\x45xpressionCommon\x12-\n\x06origin\x18\x01 \x01(\x0b\x32\x15.spark.connect.OriginR\x06origin"\xec\x02\n\x1f\x43ommonInlineUserDefinedFunction\x12#\n\rfunction_name\x18\x01 \x01(\tR\x0c\x66unctionName\x12$\n\rdeterministic\x18\x02 \x01(\x08R\rdeterministic\x12\x37\n\targuments\x18\x03 \x03(\x0b\x32\x19.spark.connect.ExpressionR\targuments\x12\x39\n\npython_udf\x18\x04 \x01(\x0b\x32\x18.spark.connect.PythonUDFH\x00R\tpythonUdf\x12I\n\x10scalar_scala_udf\x18\x05 \x01(\x0b\x32\x1d.spark.connect.ScalarScalaUDFH\x00R\x0escalarScalaUdf\x12\x33\n\x08java_udf\x18\x06 \x01(\x0b\x32\x16.spark.connect.JavaUDFH\x00R\x07javaUdfB\n\n\x08\x66unction"\xcc\x01\n\tPythonUDF\x12\x38\n\x0boutput_type\x18\x01 \x01(\x0b\x32\x17.spark.connect.DataTypeR\noutputType\x12\x1b\n\teval_type\x18\x02 \x01(\x05R\x08\x65valType\x12\x18\n\x07\x63ommand\x18\x03 \x01(\x0cR\x07\x63ommand\x12\x1d\n\npython_ver\x18\x04 \x01(\tR\tpythonVer\x12/\n\x13\x61\x64\x64itional_includes\x18\x05 \x03(\tR\x12\x61\x64\x64itionalIncludes"\xd6\x01\n\x0eScalarScalaUDF\x12\x18\n\x07payload\x18\x01 \x01(\x0cR\x07payload\x12\x37\n\ninputTypes\x18\x02 \x03(\x0b\x32\x17.spark.connect.DataTypeR\ninputTypes\x12\x37\n\noutputType\x18\x03 \x01(\x0b\x32\x17.spark.connect.DataTypeR\noutputType\x12\x1a\n\x08nullable\x18\x04 \x01(\x08R\x08nullable\x12\x1c\n\taggregate\x18\x05 \x01(\x08R\taggregate"\x95\x01\n\x07JavaUDF\x12\x1d\n\nclass_name\x18\x01 \x01(\tR\tclassName\x12=\n\x0boutput_type\x18\x02 \x01(\x0b\x32\x17.spark.connect.DataTypeH\x00R\noutputType\x88\x01\x01\x12\x1c\n\taggregate\x18\x03 \x01(\x08R\taggregateB\x0e\n\x0c_output_type"c\n\x18TypedAggregateExpression\x12G\n\x10scalar_scala_udf\x18\x01 \x01(\x0b\x32\x1d.spark.connect.ScalarScalaUDFR\x0escalarScalaUdf"l\n\x0c\x43\x61llFunction\x12#\n\rfunction_name\x18\x01 \x01(\tR\x0c\x66unctionName\x12\x37\n\targuments\x18\x02 
\x03(\x0b\x32\x19.spark.connect.ExpressionR\targuments"\\\n\x17NamedArgumentExpression\x12\x10\n\x03key\x18\x01 \x01(\tR\x03key\x12/\n\x05value\x18\x02 \x01(\x0b\x32\x19.spark.connect.ExpressionR\x05value"\x80\x04\n\x0bMergeAction\x12\x46\n\x0b\x61\x63tion_type\x18\x01 \x01(\x0e\x32%.spark.connect.MergeAction.ActionTypeR\nactionType\x12<\n\tcondition\x18\x02 \x01(\x0b\x32\x19.spark.connect.ExpressionH\x00R\tcondition\x88\x01\x01\x12G\n\x0b\x61ssignments\x18\x03 \x03(\x0b\x32%.spark.connect.MergeAction.AssignmentR\x0b\x61ssignments\x1aj\n\nAssignment\x12+\n\x03key\x18\x01 \x01(\x0b\x32\x19.spark.connect.ExpressionR\x03key\x12/\n\x05value\x18\x02 \x01(\x0b\x32\x19.spark.connect.ExpressionR\x05value"\xa7\x01\n\nActionType\x12\x17\n\x13\x41\x43TION_TYPE_INVALID\x10\x00\x12\x16\n\x12\x41\x43TION_TYPE_DELETE\x10\x01\x12\x16\n\x12\x41\x43TION_TYPE_INSERT\x10\x02\x12\x1b\n\x17\x41\x43TION_TYPE_INSERT_STAR\x10\x03\x12\x16\n\x12\x41\x43TION_TYPE_UPDATE\x10\x04\x12\x1b\n\x17\x41\x43TION_TYPE_UPDATE_STAR\x10\x05\x42\x0c\n\n_conditionB6\n\x1eorg.apache.spark.connect.protoP\x01Z\x12internal/generatedb\x06proto3' + b'\n\x1fspark/connect/expressions.proto\x12\rspark.connect\x1a\x19google/protobuf/any.proto\x1a\x19spark/connect/types.proto\x1a\x1aspark/connect/common.proto"\x97\x32\n\nExpression\x12\x37\n\x06\x63ommon\x18\x12 \x01(\x0b\x32\x1f.spark.connect.ExpressionCommonR\x06\x63ommon\x12=\n\x07literal\x18\x01 \x01(\x0b\x32!.spark.connect.Expression.LiteralH\x00R\x07literal\x12\x62\n\x14unresolved_attribute\x18\x02 \x01(\x0b\x32-.spark.connect.Expression.UnresolvedAttributeH\x00R\x13unresolvedAttribute\x12_\n\x13unresolved_function\x18\x03 \x01(\x0b\x32,.spark.connect.Expression.UnresolvedFunctionH\x00R\x12unresolvedFunction\x12Y\n\x11\x65xpression_string\x18\x04 \x01(\x0b\x32*.spark.connect.Expression.ExpressionStringH\x00R\x10\x65xpressionString\x12S\n\x0funresolved_star\x18\x05 
\x01(\x0b\x32(.spark.connect.Expression.UnresolvedStarH\x00R\x0eunresolvedStar\x12\x37\n\x05\x61lias\x18\x06 \x01(\x0b\x32\x1f.spark.connect.Expression.AliasH\x00R\x05\x61lias\x12\x34\n\x04\x63\x61st\x18\x07 \x01(\x0b\x32\x1e.spark.connect.Expression.CastH\x00R\x04\x63\x61st\x12V\n\x10unresolved_regex\x18\x08 \x01(\x0b\x32).spark.connect.Expression.UnresolvedRegexH\x00R\x0funresolvedRegex\x12\x44\n\nsort_order\x18\t \x01(\x0b\x32#.spark.connect.Expression.SortOrderH\x00R\tsortOrder\x12S\n\x0flambda_function\x18\n \x01(\x0b\x32(.spark.connect.Expression.LambdaFunctionH\x00R\x0elambdaFunction\x12:\n\x06window\x18\x0b \x01(\x0b\x32 .spark.connect.Expression.WindowH\x00R\x06window\x12l\n\x18unresolved_extract_value\x18\x0c \x01(\x0b\x32\x30.spark.connect.Expression.UnresolvedExtractValueH\x00R\x16unresolvedExtractValue\x12M\n\rupdate_fields\x18\r \x01(\x0b\x32&.spark.connect.Expression.UpdateFieldsH\x00R\x0cupdateFields\x12\x82\x01\n unresolved_named_lambda_variable\x18\x0e \x01(\x0b\x32\x37.spark.connect.Expression.UnresolvedNamedLambdaVariableH\x00R\x1dunresolvedNamedLambdaVariable\x12~\n#common_inline_user_defined_function\x18\x0f \x01(\x0b\x32..spark.connect.CommonInlineUserDefinedFunctionH\x00R\x1f\x63ommonInlineUserDefinedFunction\x12\x42\n\rcall_function\x18\x10 \x01(\x0b\x32\x1b.spark.connect.CallFunctionH\x00R\x0c\x63\x61llFunction\x12\x64\n\x19named_argument_expression\x18\x11 \x01(\x0b\x32&.spark.connect.NamedArgumentExpressionH\x00R\x17namedArgumentExpression\x12?\n\x0cmerge_action\x18\x13 \x01(\x0b\x32\x1a.spark.connect.MergeActionH\x00R\x0bmergeAction\x12g\n\x1atyped_aggregate_expression\x18\x14 \x01(\x0b\x32\'.spark.connect.TypedAggregateExpressionH\x00R\x18typedAggregateExpression\x12H\n\x0flazy_expression\x18\x15 \x01(\x0b\x32\x1d.spark.connect.LazyExpressionH\x00R\x0elazyExpression\x12T\n\x13subquery_expression\x18\x16 \x01(\x0b\x32!.spark.connect.SubqueryExpressionH\x00R\x12subqueryExpression\x12\x35\n\textension\x18\xe7\x07 
\x01(\x0b\x32\x14.google.protobuf.AnyH\x00R\textension\x1a\x8f\x06\n\x06Window\x12\x42\n\x0fwindow_function\x18\x01 \x01(\x0b\x32\x19.spark.connect.ExpressionR\x0ewindowFunction\x12@\n\x0epartition_spec\x18\x02 \x03(\x0b\x32\x19.spark.connect.ExpressionR\rpartitionSpec\x12\x42\n\norder_spec\x18\x03 \x03(\x0b\x32#.spark.connect.Expression.SortOrderR\torderSpec\x12K\n\nframe_spec\x18\x04 \x01(\x0b\x32,.spark.connect.Expression.Window.WindowFrameR\tframeSpec\x1a\xed\x03\n\x0bWindowFrame\x12U\n\nframe_type\x18\x01 \x01(\x0e\x32\x36.spark.connect.Expression.Window.WindowFrame.FrameTypeR\tframeType\x12P\n\x05lower\x18\x02 \x01(\x0b\x32:.spark.connect.Expression.Window.WindowFrame.FrameBoundaryR\x05lower\x12P\n\x05upper\x18\x03 \x01(\x0b\x32:.spark.connect.Expression.Window.WindowFrame.FrameBoundaryR\x05upper\x1a\x91\x01\n\rFrameBoundary\x12!\n\x0b\x63urrent_row\x18\x01 \x01(\x08H\x00R\ncurrentRow\x12\x1e\n\tunbounded\x18\x02 \x01(\x08H\x00R\tunbounded\x12\x31\n\x05value\x18\x03 \x01(\x0b\x32\x19.spark.connect.ExpressionH\x00R\x05valueB\n\n\x08\x62oundary"O\n\tFrameType\x12\x18\n\x14\x46RAME_TYPE_UNDEFINED\x10\x00\x12\x12\n\x0e\x46RAME_TYPE_ROW\x10\x01\x12\x14\n\x10\x46RAME_TYPE_RANGE\x10\x02\x1a\xa9\x03\n\tSortOrder\x12/\n\x05\x63hild\x18\x01 \x01(\x0b\x32\x19.spark.connect.ExpressionR\x05\x63hild\x12O\n\tdirection\x18\x02 \x01(\x0e\x32\x31.spark.connect.Expression.SortOrder.SortDirectionR\tdirection\x12U\n\rnull_ordering\x18\x03 \x01(\x0e\x32\x30.spark.connect.Expression.SortOrder.NullOrderingR\x0cnullOrdering"l\n\rSortDirection\x12\x1e\n\x1aSORT_DIRECTION_UNSPECIFIED\x10\x00\x12\x1c\n\x18SORT_DIRECTION_ASCENDING\x10\x01\x12\x1d\n\x19SORT_DIRECTION_DESCENDING\x10\x02"U\n\x0cNullOrdering\x12\x1a\n\x16SORT_NULLS_UNSPECIFIED\x10\x00\x12\x14\n\x10SORT_NULLS_FIRST\x10\x01\x12\x13\n\x0fSORT_NULLS_LAST\x10\x02\x1a\xbb\x02\n\x04\x43\x61st\x12-\n\x04\x65xpr\x18\x01 \x01(\x0b\x32\x19.spark.connect.ExpressionR\x04\x65xpr\x12-\n\x04type\x18\x02 
\x01(\x0b\x32\x17.spark.connect.DataTypeH\x00R\x04type\x12\x1b\n\x08type_str\x18\x03 \x01(\tH\x00R\x07typeStr\x12\x44\n\teval_mode\x18\x04 \x01(\x0e\x32\'.spark.connect.Expression.Cast.EvalModeR\x08\x65valMode"b\n\x08\x45valMode\x12\x19\n\x15\x45VAL_MODE_UNSPECIFIED\x10\x00\x12\x14\n\x10\x45VAL_MODE_LEGACY\x10\x01\x12\x12\n\x0e\x45VAL_MODE_ANSI\x10\x02\x12\x11\n\rEVAL_MODE_TRY\x10\x03\x42\x0e\n\x0c\x63\x61st_to_type\x1a\x9b\x0c\n\x07Literal\x12-\n\x04null\x18\x01 \x01(\x0b\x32\x17.spark.connect.DataTypeH\x00R\x04null\x12\x18\n\x06\x62inary\x18\x02 \x01(\x0cH\x00R\x06\x62inary\x12\x1a\n\x07\x62oolean\x18\x03 \x01(\x08H\x00R\x07\x62oolean\x12\x14\n\x04\x62yte\x18\x04 \x01(\x05H\x00R\x04\x62yte\x12\x16\n\x05short\x18\x05 \x01(\x05H\x00R\x05short\x12\x1a\n\x07integer\x18\x06 \x01(\x05H\x00R\x07integer\x12\x14\n\x04long\x18\x07 \x01(\x03H\x00R\x04long\x12\x16\n\x05\x66loat\x18\n \x01(\x02H\x00R\x05\x66loat\x12\x18\n\x06\x64ouble\x18\x0b \x01(\x01H\x00R\x06\x64ouble\x12\x45\n\x07\x64\x65\x63imal\x18\x0c \x01(\x0b\x32).spark.connect.Expression.Literal.DecimalH\x00R\x07\x64\x65\x63imal\x12\x18\n\x06string\x18\r \x01(\tH\x00R\x06string\x12\x14\n\x04\x64\x61te\x18\x10 \x01(\x05H\x00R\x04\x64\x61te\x12\x1e\n\ttimestamp\x18\x11 \x01(\x03H\x00R\ttimestamp\x12%\n\rtimestamp_ntz\x18\x12 \x01(\x03H\x00R\x0ctimestampNtz\x12\x61\n\x11\x63\x61lendar_interval\x18\x13 \x01(\x0b\x32\x32.spark.connect.Expression.Literal.CalendarIntervalH\x00R\x10\x63\x61lendarInterval\x12\x30\n\x13year_month_interval\x18\x14 \x01(\x05H\x00R\x11yearMonthInterval\x12,\n\x11\x64\x61y_time_interval\x18\x15 \x01(\x03H\x00R\x0f\x64\x61yTimeInterval\x12?\n\x05\x61rray\x18\x16 \x01(\x0b\x32\'.spark.connect.Expression.Literal.ArrayH\x00R\x05\x61rray\x12\x39\n\x03map\x18\x17 \x01(\x0b\x32%.spark.connect.Expression.Literal.MapH\x00R\x03map\x12\x42\n\x06struct\x18\x18 \x01(\x0b\x32(.spark.connect.Expression.Literal.StructH\x00R\x06struct\x1au\n\x07\x44\x65\x63imal\x12\x14\n\x05value\x18\x01 
\x01(\tR\x05value\x12!\n\tprecision\x18\x02 \x01(\x05H\x00R\tprecision\x88\x01\x01\x12\x19\n\x05scale\x18\x03 \x01(\x05H\x01R\x05scale\x88\x01\x01\x42\x0c\n\n_precisionB\x08\n\x06_scale\x1a\x62\n\x10\x43\x61lendarInterval\x12\x16\n\x06months\x18\x01 \x01(\x05R\x06months\x12\x12\n\x04\x64\x61ys\x18\x02 \x01(\x05R\x04\x64\x61ys\x12"\n\x0cmicroseconds\x18\x03 \x01(\x03R\x0cmicroseconds\x1a\x82\x01\n\x05\x41rray\x12:\n\x0c\x65lement_type\x18\x01 \x01(\x0b\x32\x17.spark.connect.DataTypeR\x0b\x65lementType\x12=\n\x08\x65lements\x18\x02 \x03(\x0b\x32!.spark.connect.Expression.LiteralR\x08\x65lements\x1a\xe3\x01\n\x03Map\x12\x32\n\x08key_type\x18\x01 \x01(\x0b\x32\x17.spark.connect.DataTypeR\x07keyType\x12\x36\n\nvalue_type\x18\x02 \x01(\x0b\x32\x17.spark.connect.DataTypeR\tvalueType\x12\x35\n\x04keys\x18\x03 \x03(\x0b\x32!.spark.connect.Expression.LiteralR\x04keys\x12\x39\n\x06values\x18\x04 \x03(\x0b\x32!.spark.connect.Expression.LiteralR\x06values\x1a\x81\x01\n\x06Struct\x12\x38\n\x0bstruct_type\x18\x01 \x01(\x0b\x32\x17.spark.connect.DataTypeR\nstructType\x12=\n\x08\x65lements\x18\x02 \x03(\x0b\x32!.spark.connect.Expression.LiteralR\x08\x65lementsB\x0e\n\x0cliteral_type\x1a\xba\x01\n\x13UnresolvedAttribute\x12/\n\x13unparsed_identifier\x18\x01 \x01(\tR\x12unparsedIdentifier\x12\x1c\n\x07plan_id\x18\x02 \x01(\x03H\x00R\x06planId\x88\x01\x01\x12\x31\n\x12is_metadata_column\x18\x03 \x01(\x08H\x01R\x10isMetadataColumn\x88\x01\x01\x42\n\n\x08_plan_idB\x15\n\x13_is_metadata_column\x1a\x82\x02\n\x12UnresolvedFunction\x12#\n\rfunction_name\x18\x01 \x01(\tR\x0c\x66unctionName\x12\x37\n\targuments\x18\x02 \x03(\x0b\x32\x19.spark.connect.ExpressionR\targuments\x12\x1f\n\x0bis_distinct\x18\x03 \x01(\x08R\nisDistinct\x12\x37\n\x18is_user_defined_function\x18\x04 \x01(\x08R\x15isUserDefinedFunction\x12$\n\x0bis_internal\x18\x05 \x01(\x08H\x00R\nisInternal\x88\x01\x01\x42\x0e\n\x0c_is_internal\x1a\x32\n\x10\x45xpressionString\x12\x1e\n\nexpression\x18\x01 
\x01(\tR\nexpression\x1a|\n\x0eUnresolvedStar\x12,\n\x0funparsed_target\x18\x01 \x01(\tH\x00R\x0eunparsedTarget\x88\x01\x01\x12\x1c\n\x07plan_id\x18\x02 \x01(\x03H\x01R\x06planId\x88\x01\x01\x42\x12\n\x10_unparsed_targetB\n\n\x08_plan_id\x1aV\n\x0fUnresolvedRegex\x12\x19\n\x08\x63ol_name\x18\x01 \x01(\tR\x07\x63olName\x12\x1c\n\x07plan_id\x18\x02 \x01(\x03H\x00R\x06planId\x88\x01\x01\x42\n\n\x08_plan_id\x1a\x84\x01\n\x16UnresolvedExtractValue\x12/\n\x05\x63hild\x18\x01 \x01(\x0b\x32\x19.spark.connect.ExpressionR\x05\x63hild\x12\x39\n\nextraction\x18\x02 \x01(\x0b\x32\x19.spark.connect.ExpressionR\nextraction\x1a\xbb\x01\n\x0cUpdateFields\x12\x46\n\x11struct_expression\x18\x01 \x01(\x0b\x32\x19.spark.connect.ExpressionR\x10structExpression\x12\x1d\n\nfield_name\x18\x02 \x01(\tR\tfieldName\x12\x44\n\x10value_expression\x18\x03 \x01(\x0b\x32\x19.spark.connect.ExpressionR\x0fvalueExpression\x1ax\n\x05\x41lias\x12-\n\x04\x65xpr\x18\x01 \x01(\x0b\x32\x19.spark.connect.ExpressionR\x04\x65xpr\x12\x12\n\x04name\x18\x02 \x03(\tR\x04name\x12\x1f\n\x08metadata\x18\x03 \x01(\tH\x00R\x08metadata\x88\x01\x01\x42\x0b\n\t_metadata\x1a\x9e\x01\n\x0eLambdaFunction\x12\x35\n\x08\x66unction\x18\x01 \x01(\x0b\x32\x19.spark.connect.ExpressionR\x08\x66unction\x12U\n\targuments\x18\x02 \x03(\x0b\x32\x37.spark.connect.Expression.UnresolvedNamedLambdaVariableR\targuments\x1a>\n\x1dUnresolvedNamedLambdaVariable\x12\x1d\n\nname_parts\x18\x01 \x03(\tR\tnamePartsB\x0b\n\texpr_type"A\n\x10\x45xpressionCommon\x12-\n\x06origin\x18\x01 \x01(\x0b\x32\x15.spark.connect.OriginR\x06origin"\xec\x02\n\x1f\x43ommonInlineUserDefinedFunction\x12#\n\rfunction_name\x18\x01 \x01(\tR\x0c\x66unctionName\x12$\n\rdeterministic\x18\x02 \x01(\x08R\rdeterministic\x12\x37\n\targuments\x18\x03 \x03(\x0b\x32\x19.spark.connect.ExpressionR\targuments\x12\x39\n\npython_udf\x18\x04 \x01(\x0b\x32\x18.spark.connect.PythonUDFH\x00R\tpythonUdf\x12I\n\x10scalar_scala_udf\x18\x05 
\x01(\x0b\x32\x1d.spark.connect.ScalarScalaUDFH\x00R\x0escalarScalaUdf\x12\x33\n\x08java_udf\x18\x06 \x01(\x0b\x32\x16.spark.connect.JavaUDFH\x00R\x07javaUdfB\n\n\x08\x66unction"\xcc\x01\n\tPythonUDF\x12\x38\n\x0boutput_type\x18\x01 \x01(\x0b\x32\x17.spark.connect.DataTypeR\noutputType\x12\x1b\n\teval_type\x18\x02 \x01(\x05R\x08\x65valType\x12\x18\n\x07\x63ommand\x18\x03 \x01(\x0cR\x07\x63ommand\x12\x1d\n\npython_ver\x18\x04 \x01(\tR\tpythonVer\x12/\n\x13\x61\x64\x64itional_includes\x18\x05 \x03(\tR\x12\x61\x64\x64itionalIncludes"\xd6\x01\n\x0eScalarScalaUDF\x12\x18\n\x07payload\x18\x01 \x01(\x0cR\x07payload\x12\x37\n\ninputTypes\x18\x02 \x03(\x0b\x32\x17.spark.connect.DataTypeR\ninputTypes\x12\x37\n\noutputType\x18\x03 \x01(\x0b\x32\x17.spark.connect.DataTypeR\noutputType\x12\x1a\n\x08nullable\x18\x04 \x01(\x08R\x08nullable\x12\x1c\n\taggregate\x18\x05 \x01(\x08R\taggregate"\x95\x01\n\x07JavaUDF\x12\x1d\n\nclass_name\x18\x01 \x01(\tR\tclassName\x12=\n\x0boutput_type\x18\x02 \x01(\x0b\x32\x17.spark.connect.DataTypeH\x00R\noutputType\x88\x01\x01\x12\x1c\n\taggregate\x18\x03 \x01(\x08R\taggregateB\x0e\n\x0c_output_type"c\n\x18TypedAggregateExpression\x12G\n\x10scalar_scala_udf\x18\x01 \x01(\x0b\x32\x1d.spark.connect.ScalarScalaUDFR\x0escalarScalaUdf"l\n\x0c\x43\x61llFunction\x12#\n\rfunction_name\x18\x01 \x01(\tR\x0c\x66unctionName\x12\x37\n\targuments\x18\x02 \x03(\x0b\x32\x19.spark.connect.ExpressionR\targuments"\\\n\x17NamedArgumentExpression\x12\x10\n\x03key\x18\x01 \x01(\tR\x03key\x12/\n\x05value\x18\x02 \x01(\x0b\x32\x19.spark.connect.ExpressionR\x05value"\x80\x04\n\x0bMergeAction\x12\x46\n\x0b\x61\x63tion_type\x18\x01 \x01(\x0e\x32%.spark.connect.MergeAction.ActionTypeR\nactionType\x12<\n\tcondition\x18\x02 \x01(\x0b\x32\x19.spark.connect.ExpressionH\x00R\tcondition\x88\x01\x01\x12G\n\x0b\x61ssignments\x18\x03 \x03(\x0b\x32%.spark.connect.MergeAction.AssignmentR\x0b\x61ssignments\x1aj\n\nAssignment\x12+\n\x03key\x18\x01 
\x01(\x0b\x32\x19.spark.connect.ExpressionR\x03key\x12/\n\x05value\x18\x02 \x01(\x0b\x32\x19.spark.connect.ExpressionR\x05value"\xa7\x01\n\nActionType\x12\x17\n\x13\x41\x43TION_TYPE_INVALID\x10\x00\x12\x16\n\x12\x41\x43TION_TYPE_DELETE\x10\x01\x12\x16\n\x12\x41\x43TION_TYPE_INSERT\x10\x02\x12\x1b\n\x17\x41\x43TION_TYPE_INSERT_STAR\x10\x03\x12\x16\n\x12\x41\x43TION_TYPE_UPDATE\x10\x04\x12\x1b\n\x17\x41\x43TION_TYPE_UPDATE_STAR\x10\x05\x42\x0c\n\n_condition"A\n\x0eLazyExpression\x12/\n\x05\x63hild\x18\x01 \x01(\x0b\x32\x19.spark.connect.ExpressionR\x05\x63hild"\xe1\x01\n\x12SubqueryExpression\x12\x17\n\x07plan_id\x18\x01 \x01(\x03R\x06planId\x12S\n\rsubquery_type\x18\x02 \x01(\x0e\x32..spark.connect.SubqueryExpression.SubqueryTypeR\x0csubqueryType"]\n\x0cSubqueryType\x12\x19\n\x15SUBQUERY_TYPE_UNKNOWN\x10\x00\x12\x18\n\x14SUBQUERY_TYPE_SCALAR\x10\x01\x12\x18\n\x14SUBQUERY_TYPE_EXISTS\x10\x02\x42\x36\n\x1eorg.apache.spark.connect.protoP\x01Z\x12internal/generatedb\x06proto3' ) _globals = globals() @@ -54,77 +54,83 @@ "DESCRIPTOR" ]._serialized_options = b"\n\036org.apache.spark.connect.protoP\001Z\022internal/generated" _globals["_EXPRESSION"]._serialized_start = 133 - _globals["_EXPRESSION"]._serialized_end = 6342 - _globals["_EXPRESSION_WINDOW"]._serialized_start = 1900 - _globals["_EXPRESSION_WINDOW"]._serialized_end = 2683 - _globals["_EXPRESSION_WINDOW_WINDOWFRAME"]._serialized_start = 2190 - _globals["_EXPRESSION_WINDOW_WINDOWFRAME"]._serialized_end = 2683 - _globals["_EXPRESSION_WINDOW_WINDOWFRAME_FRAMEBOUNDARY"]._serialized_start = 2457 - _globals["_EXPRESSION_WINDOW_WINDOWFRAME_FRAMEBOUNDARY"]._serialized_end = 2602 - _globals["_EXPRESSION_WINDOW_WINDOWFRAME_FRAMETYPE"]._serialized_start = 2604 - _globals["_EXPRESSION_WINDOW_WINDOWFRAME_FRAMETYPE"]._serialized_end = 2683 - _globals["_EXPRESSION_SORTORDER"]._serialized_start = 2686 - _globals["_EXPRESSION_SORTORDER"]._serialized_end = 3111 - _globals["_EXPRESSION_SORTORDER_SORTDIRECTION"]._serialized_start = 
2916 - _globals["_EXPRESSION_SORTORDER_SORTDIRECTION"]._serialized_end = 3024 - _globals["_EXPRESSION_SORTORDER_NULLORDERING"]._serialized_start = 3026 - _globals["_EXPRESSION_SORTORDER_NULLORDERING"]._serialized_end = 3111 - _globals["_EXPRESSION_CAST"]._serialized_start = 3114 - _globals["_EXPRESSION_CAST"]._serialized_end = 3429 - _globals["_EXPRESSION_CAST_EVALMODE"]._serialized_start = 3315 - _globals["_EXPRESSION_CAST_EVALMODE"]._serialized_end = 3413 - _globals["_EXPRESSION_LITERAL"]._serialized_start = 3432 - _globals["_EXPRESSION_LITERAL"]._serialized_end = 4995 - _globals["_EXPRESSION_LITERAL_DECIMAL"]._serialized_start = 4267 - _globals["_EXPRESSION_LITERAL_DECIMAL"]._serialized_end = 4384 - _globals["_EXPRESSION_LITERAL_CALENDARINTERVAL"]._serialized_start = 4386 - _globals["_EXPRESSION_LITERAL_CALENDARINTERVAL"]._serialized_end = 4484 - _globals["_EXPRESSION_LITERAL_ARRAY"]._serialized_start = 4487 - _globals["_EXPRESSION_LITERAL_ARRAY"]._serialized_end = 4617 - _globals["_EXPRESSION_LITERAL_MAP"]._serialized_start = 4620 - _globals["_EXPRESSION_LITERAL_MAP"]._serialized_end = 4847 - _globals["_EXPRESSION_LITERAL_STRUCT"]._serialized_start = 4850 - _globals["_EXPRESSION_LITERAL_STRUCT"]._serialized_end = 4979 - _globals["_EXPRESSION_UNRESOLVEDATTRIBUTE"]._serialized_start = 4998 - _globals["_EXPRESSION_UNRESOLVEDATTRIBUTE"]._serialized_end = 5184 - _globals["_EXPRESSION_UNRESOLVEDFUNCTION"]._serialized_start = 5187 - _globals["_EXPRESSION_UNRESOLVEDFUNCTION"]._serialized_end = 5391 - _globals["_EXPRESSION_EXPRESSIONSTRING"]._serialized_start = 5393 - _globals["_EXPRESSION_EXPRESSIONSTRING"]._serialized_end = 5443 - _globals["_EXPRESSION_UNRESOLVEDSTAR"]._serialized_start = 5445 - _globals["_EXPRESSION_UNRESOLVEDSTAR"]._serialized_end = 5569 - _globals["_EXPRESSION_UNRESOLVEDREGEX"]._serialized_start = 5571 - _globals["_EXPRESSION_UNRESOLVEDREGEX"]._serialized_end = 5657 - _globals["_EXPRESSION_UNRESOLVEDEXTRACTVALUE"]._serialized_start = 5660 - 
_globals["_EXPRESSION_UNRESOLVEDEXTRACTVALUE"]._serialized_end = 5792 - _globals["_EXPRESSION_UPDATEFIELDS"]._serialized_start = 5795 - _globals["_EXPRESSION_UPDATEFIELDS"]._serialized_end = 5982 - _globals["_EXPRESSION_ALIAS"]._serialized_start = 5984 - _globals["_EXPRESSION_ALIAS"]._serialized_end = 6104 - _globals["_EXPRESSION_LAMBDAFUNCTION"]._serialized_start = 6107 - _globals["_EXPRESSION_LAMBDAFUNCTION"]._serialized_end = 6265 - _globals["_EXPRESSION_UNRESOLVEDNAMEDLAMBDAVARIABLE"]._serialized_start = 6267 - _globals["_EXPRESSION_UNRESOLVEDNAMEDLAMBDAVARIABLE"]._serialized_end = 6329 - _globals["_EXPRESSIONCOMMON"]._serialized_start = 6344 - _globals["_EXPRESSIONCOMMON"]._serialized_end = 6409 - _globals["_COMMONINLINEUSERDEFINEDFUNCTION"]._serialized_start = 6412 - _globals["_COMMONINLINEUSERDEFINEDFUNCTION"]._serialized_end = 6776 - _globals["_PYTHONUDF"]._serialized_start = 6779 - _globals["_PYTHONUDF"]._serialized_end = 6983 - _globals["_SCALARSCALAUDF"]._serialized_start = 6986 - _globals["_SCALARSCALAUDF"]._serialized_end = 7200 - _globals["_JAVAUDF"]._serialized_start = 7203 - _globals["_JAVAUDF"]._serialized_end = 7352 - _globals["_TYPEDAGGREGATEEXPRESSION"]._serialized_start = 7354 - _globals["_TYPEDAGGREGATEEXPRESSION"]._serialized_end = 7453 - _globals["_CALLFUNCTION"]._serialized_start = 7455 - _globals["_CALLFUNCTION"]._serialized_end = 7563 - _globals["_NAMEDARGUMENTEXPRESSION"]._serialized_start = 7565 - _globals["_NAMEDARGUMENTEXPRESSION"]._serialized_end = 7657 - _globals["_MERGEACTION"]._serialized_start = 7660 - _globals["_MERGEACTION"]._serialized_end = 8172 - _globals["_MERGEACTION_ASSIGNMENT"]._serialized_start = 7882 - _globals["_MERGEACTION_ASSIGNMENT"]._serialized_end = 7988 - _globals["_MERGEACTION_ACTIONTYPE"]._serialized_start = 7991 - _globals["_MERGEACTION_ACTIONTYPE"]._serialized_end = 8158 + _globals["_EXPRESSION"]._serialized_end = 6556 + _globals["_EXPRESSION_WINDOW"]._serialized_start = 2060 + 
_globals["_EXPRESSION_WINDOW"]._serialized_end = 2843 + _globals["_EXPRESSION_WINDOW_WINDOWFRAME"]._serialized_start = 2350 + _globals["_EXPRESSION_WINDOW_WINDOWFRAME"]._serialized_end = 2843 + _globals["_EXPRESSION_WINDOW_WINDOWFRAME_FRAMEBOUNDARY"]._serialized_start = 2617 + _globals["_EXPRESSION_WINDOW_WINDOWFRAME_FRAMEBOUNDARY"]._serialized_end = 2762 + _globals["_EXPRESSION_WINDOW_WINDOWFRAME_FRAMETYPE"]._serialized_start = 2764 + _globals["_EXPRESSION_WINDOW_WINDOWFRAME_FRAMETYPE"]._serialized_end = 2843 + _globals["_EXPRESSION_SORTORDER"]._serialized_start = 2846 + _globals["_EXPRESSION_SORTORDER"]._serialized_end = 3271 + _globals["_EXPRESSION_SORTORDER_SORTDIRECTION"]._serialized_start = 3076 + _globals["_EXPRESSION_SORTORDER_SORTDIRECTION"]._serialized_end = 3184 + _globals["_EXPRESSION_SORTORDER_NULLORDERING"]._serialized_start = 3186 + _globals["_EXPRESSION_SORTORDER_NULLORDERING"]._serialized_end = 3271 + _globals["_EXPRESSION_CAST"]._serialized_start = 3274 + _globals["_EXPRESSION_CAST"]._serialized_end = 3589 + _globals["_EXPRESSION_CAST_EVALMODE"]._serialized_start = 3475 + _globals["_EXPRESSION_CAST_EVALMODE"]._serialized_end = 3573 + _globals["_EXPRESSION_LITERAL"]._serialized_start = 3592 + _globals["_EXPRESSION_LITERAL"]._serialized_end = 5155 + _globals["_EXPRESSION_LITERAL_DECIMAL"]._serialized_start = 4427 + _globals["_EXPRESSION_LITERAL_DECIMAL"]._serialized_end = 4544 + _globals["_EXPRESSION_LITERAL_CALENDARINTERVAL"]._serialized_start = 4546 + _globals["_EXPRESSION_LITERAL_CALENDARINTERVAL"]._serialized_end = 4644 + _globals["_EXPRESSION_LITERAL_ARRAY"]._serialized_start = 4647 + _globals["_EXPRESSION_LITERAL_ARRAY"]._serialized_end = 4777 + _globals["_EXPRESSION_LITERAL_MAP"]._serialized_start = 4780 + _globals["_EXPRESSION_LITERAL_MAP"]._serialized_end = 5007 + _globals["_EXPRESSION_LITERAL_STRUCT"]._serialized_start = 5010 + _globals["_EXPRESSION_LITERAL_STRUCT"]._serialized_end = 5139 + 
_globals["_EXPRESSION_UNRESOLVEDATTRIBUTE"]._serialized_start = 5158 + _globals["_EXPRESSION_UNRESOLVEDATTRIBUTE"]._serialized_end = 5344 + _globals["_EXPRESSION_UNRESOLVEDFUNCTION"]._serialized_start = 5347 + _globals["_EXPRESSION_UNRESOLVEDFUNCTION"]._serialized_end = 5605 + _globals["_EXPRESSION_EXPRESSIONSTRING"]._serialized_start = 5607 + _globals["_EXPRESSION_EXPRESSIONSTRING"]._serialized_end = 5657 + _globals["_EXPRESSION_UNRESOLVEDSTAR"]._serialized_start = 5659 + _globals["_EXPRESSION_UNRESOLVEDSTAR"]._serialized_end = 5783 + _globals["_EXPRESSION_UNRESOLVEDREGEX"]._serialized_start = 5785 + _globals["_EXPRESSION_UNRESOLVEDREGEX"]._serialized_end = 5871 + _globals["_EXPRESSION_UNRESOLVEDEXTRACTVALUE"]._serialized_start = 5874 + _globals["_EXPRESSION_UNRESOLVEDEXTRACTVALUE"]._serialized_end = 6006 + _globals["_EXPRESSION_UPDATEFIELDS"]._serialized_start = 6009 + _globals["_EXPRESSION_UPDATEFIELDS"]._serialized_end = 6196 + _globals["_EXPRESSION_ALIAS"]._serialized_start = 6198 + _globals["_EXPRESSION_ALIAS"]._serialized_end = 6318 + _globals["_EXPRESSION_LAMBDAFUNCTION"]._serialized_start = 6321 + _globals["_EXPRESSION_LAMBDAFUNCTION"]._serialized_end = 6479 + _globals["_EXPRESSION_UNRESOLVEDNAMEDLAMBDAVARIABLE"]._serialized_start = 6481 + _globals["_EXPRESSION_UNRESOLVEDNAMEDLAMBDAVARIABLE"]._serialized_end = 6543 + _globals["_EXPRESSIONCOMMON"]._serialized_start = 6558 + _globals["_EXPRESSIONCOMMON"]._serialized_end = 6623 + _globals["_COMMONINLINEUSERDEFINEDFUNCTION"]._serialized_start = 6626 + _globals["_COMMONINLINEUSERDEFINEDFUNCTION"]._serialized_end = 6990 + _globals["_PYTHONUDF"]._serialized_start = 6993 + _globals["_PYTHONUDF"]._serialized_end = 7197 + _globals["_SCALARSCALAUDF"]._serialized_start = 7200 + _globals["_SCALARSCALAUDF"]._serialized_end = 7414 + _globals["_JAVAUDF"]._serialized_start = 7417 + _globals["_JAVAUDF"]._serialized_end = 7566 + _globals["_TYPEDAGGREGATEEXPRESSION"]._serialized_start = 7568 + 
_globals["_TYPEDAGGREGATEEXPRESSION"]._serialized_end = 7667 + _globals["_CALLFUNCTION"]._serialized_start = 7669 + _globals["_CALLFUNCTION"]._serialized_end = 7777 + _globals["_NAMEDARGUMENTEXPRESSION"]._serialized_start = 7779 + _globals["_NAMEDARGUMENTEXPRESSION"]._serialized_end = 7871 + _globals["_MERGEACTION"]._serialized_start = 7874 + _globals["_MERGEACTION"]._serialized_end = 8386 + _globals["_MERGEACTION_ASSIGNMENT"]._serialized_start = 8096 + _globals["_MERGEACTION_ASSIGNMENT"]._serialized_end = 8202 + _globals["_MERGEACTION_ACTIONTYPE"]._serialized_start = 8205 + _globals["_MERGEACTION_ACTIONTYPE"]._serialized_end = 8372 + _globals["_LAZYEXPRESSION"]._serialized_start = 8388 + _globals["_LAZYEXPRESSION"]._serialized_end = 8453 + _globals["_SUBQUERYEXPRESSION"]._serialized_start = 8456 + _globals["_SUBQUERYEXPRESSION"]._serialized_end = 8681 + _globals["_SUBQUERYEXPRESSION_SUBQUERYTYPE"]._serialized_start = 8588 + _globals["_SUBQUERYEXPRESSION_SUBQUERYTYPE"]._serialized_end = 8681 # @@protoc_insertion_point(module_scope) diff --git a/python/pyspark/sql/connect/proto/expressions_pb2.pyi b/python/pyspark/sql/connect/proto/expressions_pb2.pyi index 1566eb1b1e9e2..1a8c60f673054 100644 --- a/python/pyspark/sql/connect/proto/expressions_pb2.pyi +++ b/python/pyspark/sql/connect/proto/expressions_pb2.pyi @@ -847,6 +847,7 @@ class Expression(google.protobuf.message.Message): ARGUMENTS_FIELD_NUMBER: builtins.int IS_DISTINCT_FIELD_NUMBER: builtins.int IS_USER_DEFINED_FUNCTION_FIELD_NUMBER: builtins.int + IS_INTERNAL_FIELD_NUMBER: builtins.int function_name: builtins.str """(Required) name (or unparsed name for user defined function) for the unresolved function.""" @property @@ -864,6 +865,11 @@ class Expression(google.protobuf.message.Message): When it is not a user defined function, Connect will use the function name directly. When it is a user defined function, Connect will parse the function name first. 
""" + is_internal: builtins.bool + """(Optional) Indicate if this function is defined in the internal function registry. + If not set, the server will try to look up the function in the internal function registry + and decide appropriately. + """ def __init__( self, *, @@ -871,20 +877,34 @@ class Expression(google.protobuf.message.Message): arguments: collections.abc.Iterable[global___Expression] | None = ..., is_distinct: builtins.bool = ..., is_user_defined_function: builtins.bool = ..., + is_internal: builtins.bool | None = ..., ) -> None: ... + def HasField( + self, + field_name: typing_extensions.Literal[ + "_is_internal", b"_is_internal", "is_internal", b"is_internal" + ], + ) -> builtins.bool: ... def ClearField( self, field_name: typing_extensions.Literal[ + "_is_internal", + b"_is_internal", "arguments", b"arguments", "function_name", b"function_name", "is_distinct", b"is_distinct", + "is_internal", + b"is_internal", "is_user_defined_function", b"is_user_defined_function", ], ) -> None: ... + def WhichOneof( + self, oneof_group: typing_extensions.Literal["_is_internal", b"_is_internal"] + ) -> typing_extensions.Literal["is_internal"] | None: ... class ExpressionString(google.protobuf.message.Message): """Expression as string.""" @@ -1184,6 +1204,8 @@ class Expression(google.protobuf.message.Message): NAMED_ARGUMENT_EXPRESSION_FIELD_NUMBER: builtins.int MERGE_ACTION_FIELD_NUMBER: builtins.int TYPED_AGGREGATE_EXPRESSION_FIELD_NUMBER: builtins.int + LAZY_EXPRESSION_FIELD_NUMBER: builtins.int + SUBQUERY_EXPRESSION_FIELD_NUMBER: builtins.int EXTENSION_FIELD_NUMBER: builtins.int @property def common(self) -> global___ExpressionCommon: ... @@ -1228,6 +1250,10 @@ class Expression(google.protobuf.message.Message): @property def typed_aggregate_expression(self) -> global___TypedAggregateExpression: ... @property + def lazy_expression(self) -> global___LazyExpression: ... + @property + def subquery_expression(self) -> global___SubqueryExpression: ... 
+ @property def extension(self) -> google.protobuf.any_pb2.Any: """This field is used to mark extensions to the protocol. When plugins generate arbitrary relations they can add them here. During the planning the correct resolution is done. @@ -1256,6 +1282,8 @@ class Expression(google.protobuf.message.Message): named_argument_expression: global___NamedArgumentExpression | None = ..., merge_action: global___MergeAction | None = ..., typed_aggregate_expression: global___TypedAggregateExpression | None = ..., + lazy_expression: global___LazyExpression | None = ..., + subquery_expression: global___SubqueryExpression | None = ..., extension: google.protobuf.any_pb2.Any | None = ..., ) -> None: ... def HasField( @@ -1279,6 +1307,8 @@ class Expression(google.protobuf.message.Message): b"extension", "lambda_function", b"lambda_function", + "lazy_expression", + b"lazy_expression", "literal", b"literal", "merge_action", @@ -1287,6 +1317,8 @@ class Expression(google.protobuf.message.Message): b"named_argument_expression", "sort_order", b"sort_order", + "subquery_expression", + b"subquery_expression", "typed_aggregate_expression", b"typed_aggregate_expression", "unresolved_attribute", @@ -1328,6 +1360,8 @@ class Expression(google.protobuf.message.Message): b"extension", "lambda_function", b"lambda_function", + "lazy_expression", + b"lazy_expression", "literal", b"literal", "merge_action", @@ -1336,6 +1370,8 @@ class Expression(google.protobuf.message.Message): b"named_argument_expression", "sort_order", b"sort_order", + "subquery_expression", + b"subquery_expression", "typed_aggregate_expression", b"typed_aggregate_expression", "unresolved_attribute", @@ -1379,6 +1415,8 @@ class Expression(google.protobuf.message.Message): "named_argument_expression", "merge_action", "typed_aggregate_expression", + "lazy_expression", + "subquery_expression", "extension", ] | None @@ -1801,3 +1839,66 @@ class MergeAction(google.protobuf.message.Message): ) -> 
typing_extensions.Literal["condition"] | None: ... global___MergeAction = MergeAction + +class LazyExpression(google.protobuf.message.Message): + DESCRIPTOR: google.protobuf.descriptor.Descriptor + + CHILD_FIELD_NUMBER: builtins.int + @property + def child(self) -> global___Expression: + """(Required) The expression to be marked as lazy.""" + def __init__( + self, + *, + child: global___Expression | None = ..., + ) -> None: ... + def HasField( + self, field_name: typing_extensions.Literal["child", b"child"] + ) -> builtins.bool: ... + def ClearField(self, field_name: typing_extensions.Literal["child", b"child"]) -> None: ... + +global___LazyExpression = LazyExpression + +class SubqueryExpression(google.protobuf.message.Message): + DESCRIPTOR: google.protobuf.descriptor.Descriptor + + class _SubqueryType: + ValueType = typing.NewType("ValueType", builtins.int) + V: typing_extensions.TypeAlias = ValueType + + class _SubqueryTypeEnumTypeWrapper( + google.protobuf.internal.enum_type_wrapper._EnumTypeWrapper[ + SubqueryExpression._SubqueryType.ValueType + ], + builtins.type, + ): # noqa: F821 + DESCRIPTOR: google.protobuf.descriptor.EnumDescriptor + SUBQUERY_TYPE_UNKNOWN: SubqueryExpression._SubqueryType.ValueType # 0 + SUBQUERY_TYPE_SCALAR: SubqueryExpression._SubqueryType.ValueType # 1 + SUBQUERY_TYPE_EXISTS: SubqueryExpression._SubqueryType.ValueType # 2 + + class SubqueryType(_SubqueryType, metaclass=_SubqueryTypeEnumTypeWrapper): ... 
+ SUBQUERY_TYPE_UNKNOWN: SubqueryExpression.SubqueryType.ValueType # 0 + SUBQUERY_TYPE_SCALAR: SubqueryExpression.SubqueryType.ValueType # 1 + SUBQUERY_TYPE_EXISTS: SubqueryExpression.SubqueryType.ValueType # 2 + + PLAN_ID_FIELD_NUMBER: builtins.int + SUBQUERY_TYPE_FIELD_NUMBER: builtins.int + plan_id: builtins.int + """(Required) The id of corresponding connect plan.""" + subquery_type: global___SubqueryExpression.SubqueryType.ValueType + """(Required) The type of the subquery.""" + def __init__( + self, + *, + plan_id: builtins.int = ..., + subquery_type: global___SubqueryExpression.SubqueryType.ValueType = ..., + ) -> None: ... + def ClearField( + self, + field_name: typing_extensions.Literal[ + "plan_id", b"plan_id", "subquery_type", b"subquery_type" + ], + ) -> None: ... + +global___SubqueryExpression = SubqueryExpression diff --git a/python/pyspark/sql/connect/proto/ml_common_pb2.py b/python/pyspark/sql/connect/proto/ml_common_pb2.py new file mode 100644 index 0000000000000..70e0e91652892 --- /dev/null +++ b/python/pyspark/sql/connect/proto/ml_common_pb2.py @@ -0,0 +1,80 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# -*- coding: utf-8 -*- +# Generated by the protocol buffer compiler. DO NOT EDIT! 
+# NO CHECKED-IN PROTOBUF GENCODE +# source: spark/connect/ml_common.proto +# Protobuf Python Version: 5.28.3 +"""Generated protocol buffer code.""" +from google.protobuf import descriptor as _descriptor +from google.protobuf import descriptor_pool as _descriptor_pool +from google.protobuf import runtime_version as _runtime_version +from google.protobuf import symbol_database as _symbol_database +from google.protobuf.internal import builder as _builder + +_runtime_version.ValidateProtobufRuntimeVersion( + _runtime_version.Domain.PUBLIC, 5, 28, 3, "", "spark/connect/ml_common.proto" +) +# @@protoc_insertion_point(imports) + +_sym_db = _symbol_database.Default() + + +from pyspark.sql.connect.proto import expressions_pb2 as spark_dot_connect_dot_expressions__pb2 + + +DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile( + b'\n\x1dspark/connect/ml_common.proto\x12\rspark.connect\x1a\x1fspark/connect/expressions.proto"\x98\x01\n\x08MlParams\x12;\n\x06params\x18\x01 \x03(\x0b\x32#.spark.connect.MlParams.ParamsEntryR\x06params\x1aO\n\x0bParamsEntry\x12\x10\n\x03key\x18\x01 \x01(\tR\x03key\x12*\n\x05value\x18\x02 \x01(\x0b\x32\x14.spark.connect.ParamR\x05value:\x02\x38\x01"\xb6\x01\n\x05Param\x12=\n\x07literal\x18\x01 \x01(\x0b\x32!.spark.connect.Expression.LiteralH\x00R\x07literal\x12/\n\x06vector\x18\x02 \x01(\x0b\x32\x15.spark.connect.VectorH\x00R\x06vector\x12/\n\x06matrix\x18\x03 \x01(\x0b\x32\x15.spark.connect.MatrixH\x00R\x06matrixB\x0c\n\nparam_type"\xc9\x01\n\nMlOperator\x12\x12\n\x04name\x18\x01 \x01(\tR\x04name\x12\x10\n\x03uid\x18\x02 \x01(\tR\x03uid\x12:\n\x04type\x18\x03 \x01(\x0e\x32&.spark.connect.MlOperator.OperatorTypeR\x04type"Y\n\x0cOperatorType\x12\x0f\n\x0bUNSPECIFIED\x10\x00\x12\r\n\tESTIMATOR\x10\x01\x12\x0f\n\x0bTRANSFORMER\x10\x02\x12\r\n\tEVALUATOR\x10\x03\x12\t\n\x05MODEL\x10\x04"\x1b\n\tObjectRef\x12\x0e\n\x02id\x18\x01 \x01(\tR\x02id"\xed\x01\n\x06Vector\x12\x33\n\x05\x64\x65nse\x18\x01 
\x01(\x0b\x32\x1b.spark.connect.Vector.DenseH\x00R\x05\x64\x65nse\x12\x36\n\x06sparse\x18\x02 \x01(\x0b\x32\x1c.spark.connect.Vector.SparseH\x00R\x06sparse\x1a\x1d\n\x05\x44\x65nse\x12\x14\n\x05value\x18\x01 \x03(\x01R\x05value\x1aH\n\x06Sparse\x12\x12\n\x04size\x18\x01 \x01(\x05R\x04size\x12\x14\n\x05index\x18\x02 \x03(\x05R\x05index\x12\x14\n\x05value\x18\x03 \x03(\x01R\x05valueB\r\n\x0bvector_type"\xaf\x03\n\x06Matrix\x12\x33\n\x05\x64\x65nse\x18\x01 \x01(\x0b\x32\x1b.spark.connect.Matrix.DenseH\x00R\x05\x64\x65nse\x12\x36\n\x06sparse\x18\x02 \x01(\x0b\x32\x1c.spark.connect.Matrix.SparseH\x00R\x06sparse\x1ax\n\x05\x44\x65nse\x12\x19\n\x08num_rows\x18\x01 \x01(\x05R\x07numRows\x12\x19\n\x08num_cols\x18\x02 \x01(\x05R\x07numCols\x12\x14\n\x05value\x18\x03 \x03(\x01R\x05value\x12#\n\ris_transposed\x18\x04 \x01(\x08R\x0cisTransposed\x1a\xae\x01\n\x06Sparse\x12\x19\n\x08num_rows\x18\x01 \x01(\x05R\x07numRows\x12\x19\n\x08num_cols\x18\x02 \x01(\x05R\x07numCols\x12\x16\n\x06\x63olptr\x18\x03 \x03(\x05R\x06\x63olptr\x12\x1b\n\trow_index\x18\x04 \x03(\x05R\x08rowIndex\x12\x14\n\x05value\x18\x05 \x03(\x01R\x05value\x12#\n\ris_transposed\x18\x06 \x01(\x08R\x0cisTransposedB\r\n\x0bmatrix_typeB6\n\x1eorg.apache.spark.connect.protoP\x01Z\x12internal/generatedb\x06proto3' +) + +_globals = globals() +_builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, _globals) +_builder.BuildTopDescriptorsAndMessages( + DESCRIPTOR, "pyspark.sql.connect.proto.ml_common_pb2", _globals +) +if not _descriptor._USE_C_DESCRIPTORS: + _globals["DESCRIPTOR"]._loaded_options = None + _globals[ + "DESCRIPTOR" + ]._serialized_options = b"\n\036org.apache.spark.connect.protoP\001Z\022internal/generated" + _globals["_MLPARAMS_PARAMSENTRY"]._loaded_options = None + _globals["_MLPARAMS_PARAMSENTRY"]._serialized_options = b"8\001" + _globals["_MLPARAMS"]._serialized_start = 82 + _globals["_MLPARAMS"]._serialized_end = 234 + _globals["_MLPARAMS_PARAMSENTRY"]._serialized_start = 155 + 
_globals["_MLPARAMS_PARAMSENTRY"]._serialized_end = 234 + _globals["_PARAM"]._serialized_start = 237 + _globals["_PARAM"]._serialized_end = 419 + _globals["_MLOPERATOR"]._serialized_start = 422 + _globals["_MLOPERATOR"]._serialized_end = 623 + _globals["_MLOPERATOR_OPERATORTYPE"]._serialized_start = 534 + _globals["_MLOPERATOR_OPERATORTYPE"]._serialized_end = 623 + _globals["_OBJECTREF"]._serialized_start = 625 + _globals["_OBJECTREF"]._serialized_end = 652 + _globals["_VECTOR"]._serialized_start = 655 + _globals["_VECTOR"]._serialized_end = 892 + _globals["_VECTOR_DENSE"]._serialized_start = 774 + _globals["_VECTOR_DENSE"]._serialized_end = 803 + _globals["_VECTOR_SPARSE"]._serialized_start = 805 + _globals["_VECTOR_SPARSE"]._serialized_end = 877 + _globals["_MATRIX"]._serialized_start = 895 + _globals["_MATRIX"]._serialized_end = 1326 + _globals["_MATRIX_DENSE"]._serialized_start = 1014 + _globals["_MATRIX_DENSE"]._serialized_end = 1134 + _globals["_MATRIX_SPARSE"]._serialized_start = 1137 + _globals["_MATRIX_SPARSE"]._serialized_end = 1311 +# @@protoc_insertion_point(module_scope) diff --git a/python/pyspark/sql/connect/proto/ml_common_pb2.pyi b/python/pyspark/sql/connect/proto/ml_common_pb2.pyi new file mode 100644 index 0000000000000..64029b6679f19 --- /dev/null +++ b/python/pyspark/sql/connect/proto/ml_common_pb2.pyi @@ -0,0 +1,427 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +""" +@generated by mypy-protobuf. Do not edit manually! +isort:skip_file + +Licensed to the Apache Software Foundation (ASF) under one or more +contributor license agreements. See the NOTICE file distributed with +this work for additional information regarding copyright ownership. +The ASF licenses this file to You under the Apache License, Version 2.0 +(the "License"); you may not use this file except in compliance with +the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+""" +import builtins +import collections.abc +import google.protobuf.descriptor +import google.protobuf.internal.containers +import google.protobuf.internal.enum_type_wrapper +import google.protobuf.message +import pyspark.sql.connect.proto.expressions_pb2 +import sys +import typing + +if sys.version_info >= (3, 10): + import typing as typing_extensions +else: + import typing_extensions + +DESCRIPTOR: google.protobuf.descriptor.FileDescriptor + +class MlParams(google.protobuf.message.Message): + """MlParams stores param settings for ML Estimator / Transformer / Evaluator""" + + DESCRIPTOR: google.protobuf.descriptor.Descriptor + + class ParamsEntry(google.protobuf.message.Message): + DESCRIPTOR: google.protobuf.descriptor.Descriptor + + KEY_FIELD_NUMBER: builtins.int + VALUE_FIELD_NUMBER: builtins.int + key: builtins.str + @property + def value(self) -> global___Param: ... + def __init__( + self, + *, + key: builtins.str = ..., + value: global___Param | None = ..., + ) -> None: ... + def HasField( + self, field_name: typing_extensions.Literal["value", b"value"] + ) -> builtins.bool: ... + def ClearField( + self, field_name: typing_extensions.Literal["key", b"key", "value", b"value"] + ) -> None: ... + + PARAMS_FIELD_NUMBER: builtins.int + @property + def params( + self, + ) -> google.protobuf.internal.containers.MessageMap[builtins.str, global___Param]: + """User-supplied params""" + def __init__( + self, + *, + params: collections.abc.Mapping[builtins.str, global___Param] | None = ..., + ) -> None: ... + def ClearField(self, field_name: typing_extensions.Literal["params", b"params"]) -> None: ... 
+ +global___MlParams = MlParams + +class Param(google.protobuf.message.Message): + """Represents the parameter type of the ML instance, or the returned value + of the attribute + """ + + DESCRIPTOR: google.protobuf.descriptor.Descriptor + + LITERAL_FIELD_NUMBER: builtins.int + VECTOR_FIELD_NUMBER: builtins.int + MATRIX_FIELD_NUMBER: builtins.int + @property + def literal(self) -> pyspark.sql.connect.proto.expressions_pb2.Expression.Literal: ... + @property + def vector(self) -> global___Vector: ... + @property + def matrix(self) -> global___Matrix: ... + def __init__( + self, + *, + literal: pyspark.sql.connect.proto.expressions_pb2.Expression.Literal | None = ..., + vector: global___Vector | None = ..., + matrix: global___Matrix | None = ..., + ) -> None: ... + def HasField( + self, + field_name: typing_extensions.Literal[ + "literal", + b"literal", + "matrix", + b"matrix", + "param_type", + b"param_type", + "vector", + b"vector", + ], + ) -> builtins.bool: ... + def ClearField( + self, + field_name: typing_extensions.Literal[ + "literal", + b"literal", + "matrix", + b"matrix", + "param_type", + b"param_type", + "vector", + b"vector", + ], + ) -> None: ... + def WhichOneof( + self, oneof_group: typing_extensions.Literal["param_type", b"param_type"] + ) -> typing_extensions.Literal["literal", "vector", "matrix"] | None: ... 
+ +global___Param = Param + +class MlOperator(google.protobuf.message.Message): + """MLOperator represents the ML operators like (Estimator, Transformer or Evaluator)""" + + DESCRIPTOR: google.protobuf.descriptor.Descriptor + + class _OperatorType: + ValueType = typing.NewType("ValueType", builtins.int) + V: typing_extensions.TypeAlias = ValueType + + class _OperatorTypeEnumTypeWrapper( + google.protobuf.internal.enum_type_wrapper._EnumTypeWrapper[ + MlOperator._OperatorType.ValueType + ], + builtins.type, + ): # noqa: F821 + DESCRIPTOR: google.protobuf.descriptor.EnumDescriptor + UNSPECIFIED: MlOperator._OperatorType.ValueType # 0 + ESTIMATOR: MlOperator._OperatorType.ValueType # 1 + TRANSFORMER: MlOperator._OperatorType.ValueType # 2 + EVALUATOR: MlOperator._OperatorType.ValueType # 3 + MODEL: MlOperator._OperatorType.ValueType # 4 + + class OperatorType(_OperatorType, metaclass=_OperatorTypeEnumTypeWrapper): ... + UNSPECIFIED: MlOperator.OperatorType.ValueType # 0 + ESTIMATOR: MlOperator.OperatorType.ValueType # 1 + TRANSFORMER: MlOperator.OperatorType.ValueType # 2 + EVALUATOR: MlOperator.OperatorType.ValueType # 3 + MODEL: MlOperator.OperatorType.ValueType # 4 + + NAME_FIELD_NUMBER: builtins.int + UID_FIELD_NUMBER: builtins.int + TYPE_FIELD_NUMBER: builtins.int + name: builtins.str + """The qualified name of the ML operator.""" + uid: builtins.str + """Unique id of the ML operator""" + type: global___MlOperator.OperatorType.ValueType + """Represents what the ML operator is""" + def __init__( + self, + *, + name: builtins.str = ..., + uid: builtins.str = ..., + type: global___MlOperator.OperatorType.ValueType = ..., + ) -> None: ... + def ClearField( + self, field_name: typing_extensions.Literal["name", b"name", "type", b"type", "uid", b"uid"] + ) -> None: ... 
+ +global___MlOperator = MlOperator + +class ObjectRef(google.protobuf.message.Message): + """Represents a reference to the cached object which could be a model + or summary evaluated by a model + """ + + DESCRIPTOR: google.protobuf.descriptor.Descriptor + + ID_FIELD_NUMBER: builtins.int + id: builtins.str + """The ID is used to lookup the object on the server side.""" + def __init__( + self, + *, + id: builtins.str = ..., + ) -> None: ... + def ClearField(self, field_name: typing_extensions.Literal["id", b"id"]) -> None: ... + +global___ObjectRef = ObjectRef + +class Vector(google.protobuf.message.Message): + """See pyspark.ml.linalg.Vector""" + + DESCRIPTOR: google.protobuf.descriptor.Descriptor + + class Dense(google.protobuf.message.Message): + """See pyspark.ml.linalg.DenseVector""" + + DESCRIPTOR: google.protobuf.descriptor.Descriptor + + VALUE_FIELD_NUMBER: builtins.int + @property + def value( + self, + ) -> google.protobuf.internal.containers.RepeatedScalarFieldContainer[builtins.float]: ... + def __init__( + self, + *, + value: collections.abc.Iterable[builtins.float] | None = ..., + ) -> None: ... + def ClearField(self, field_name: typing_extensions.Literal["value", b"value"]) -> None: ... + + class Sparse(google.protobuf.message.Message): + """See pyspark.ml.linalg.SparseVector""" + + DESCRIPTOR: google.protobuf.descriptor.Descriptor + + SIZE_FIELD_NUMBER: builtins.int + INDEX_FIELD_NUMBER: builtins.int + VALUE_FIELD_NUMBER: builtins.int + size: builtins.int + @property + def index( + self, + ) -> google.protobuf.internal.containers.RepeatedScalarFieldContainer[builtins.int]: ... + @property + def value( + self, + ) -> google.protobuf.internal.containers.RepeatedScalarFieldContainer[builtins.float]: ... + def __init__( + self, + *, + size: builtins.int = ..., + index: collections.abc.Iterable[builtins.int] | None = ..., + value: collections.abc.Iterable[builtins.float] | None = ..., + ) -> None: ... 
+ def ClearField( + self, + field_name: typing_extensions.Literal[ + "index", b"index", "size", b"size", "value", b"value" + ], + ) -> None: ... + + DENSE_FIELD_NUMBER: builtins.int + SPARSE_FIELD_NUMBER: builtins.int + @property + def dense(self) -> global___Vector.Dense: ... + @property + def sparse(self) -> global___Vector.Sparse: ... + def __init__( + self, + *, + dense: global___Vector.Dense | None = ..., + sparse: global___Vector.Sparse | None = ..., + ) -> None: ... + def HasField( + self, + field_name: typing_extensions.Literal[ + "dense", b"dense", "sparse", b"sparse", "vector_type", b"vector_type" + ], + ) -> builtins.bool: ... + def ClearField( + self, + field_name: typing_extensions.Literal[ + "dense", b"dense", "sparse", b"sparse", "vector_type", b"vector_type" + ], + ) -> None: ... + def WhichOneof( + self, oneof_group: typing_extensions.Literal["vector_type", b"vector_type"] + ) -> typing_extensions.Literal["dense", "sparse"] | None: ... + +global___Vector = Vector + +class Matrix(google.protobuf.message.Message): + """See pyspark.ml.linalg.Matrix""" + + DESCRIPTOR: google.protobuf.descriptor.Descriptor + + class Dense(google.protobuf.message.Message): + """See pyspark.ml.linalg.DenseMatrix""" + + DESCRIPTOR: google.protobuf.descriptor.Descriptor + + NUM_ROWS_FIELD_NUMBER: builtins.int + NUM_COLS_FIELD_NUMBER: builtins.int + VALUE_FIELD_NUMBER: builtins.int + IS_TRANSPOSED_FIELD_NUMBER: builtins.int + num_rows: builtins.int + num_cols: builtins.int + @property + def value( + self, + ) -> google.protobuf.internal.containers.RepeatedScalarFieldContainer[builtins.float]: ... + is_transposed: builtins.bool + def __init__( + self, + *, + num_rows: builtins.int = ..., + num_cols: builtins.int = ..., + value: collections.abc.Iterable[builtins.float] | None = ..., + is_transposed: builtins.bool = ..., + ) -> None: ... 
+ def ClearField( + self, + field_name: typing_extensions.Literal[ + "is_transposed", + b"is_transposed", + "num_cols", + b"num_cols", + "num_rows", + b"num_rows", + "value", + b"value", + ], + ) -> None: ... + + class Sparse(google.protobuf.message.Message): + """See pyspark.ml.linalg.SparseMatrix""" + + DESCRIPTOR: google.protobuf.descriptor.Descriptor + + NUM_ROWS_FIELD_NUMBER: builtins.int + NUM_COLS_FIELD_NUMBER: builtins.int + COLPTR_FIELD_NUMBER: builtins.int + ROW_INDEX_FIELD_NUMBER: builtins.int + VALUE_FIELD_NUMBER: builtins.int + IS_TRANSPOSED_FIELD_NUMBER: builtins.int + num_rows: builtins.int + num_cols: builtins.int + @property + def colptr( + self, + ) -> google.protobuf.internal.containers.RepeatedScalarFieldContainer[builtins.int]: ... + @property + def row_index( + self, + ) -> google.protobuf.internal.containers.RepeatedScalarFieldContainer[builtins.int]: ... + @property + def value( + self, + ) -> google.protobuf.internal.containers.RepeatedScalarFieldContainer[builtins.float]: ... + is_transposed: builtins.bool + def __init__( + self, + *, + num_rows: builtins.int = ..., + num_cols: builtins.int = ..., + colptr: collections.abc.Iterable[builtins.int] | None = ..., + row_index: collections.abc.Iterable[builtins.int] | None = ..., + value: collections.abc.Iterable[builtins.float] | None = ..., + is_transposed: builtins.bool = ..., + ) -> None: ... + def ClearField( + self, + field_name: typing_extensions.Literal[ + "colptr", + b"colptr", + "is_transposed", + b"is_transposed", + "num_cols", + b"num_cols", + "num_rows", + b"num_rows", + "row_index", + b"row_index", + "value", + b"value", + ], + ) -> None: ... + + DENSE_FIELD_NUMBER: builtins.int + SPARSE_FIELD_NUMBER: builtins.int + @property + def dense(self) -> global___Matrix.Dense: ... + @property + def sparse(self) -> global___Matrix.Sparse: ... + def __init__( + self, + *, + dense: global___Matrix.Dense | None = ..., + sparse: global___Matrix.Sparse | None = ..., + ) -> None: ... 
+ def HasField( + self, + field_name: typing_extensions.Literal[ + "dense", b"dense", "matrix_type", b"matrix_type", "sparse", b"sparse" + ], + ) -> builtins.bool: ... + def ClearField( + self, + field_name: typing_extensions.Literal[ + "dense", b"dense", "matrix_type", b"matrix_type", "sparse", b"sparse" + ], + ) -> None: ... + def WhichOneof( + self, oneof_group: typing_extensions.Literal["matrix_type", b"matrix_type"] + ) -> typing_extensions.Literal["dense", "sparse"] | None: ... + +global___Matrix = Matrix diff --git a/python/pyspark/sql/connect/proto/ml_pb2.py b/python/pyspark/sql/connect/proto/ml_pb2.py new file mode 100644 index 0000000000000..5005f82d5d533 --- /dev/null +++ b/python/pyspark/sql/connect/proto/ml_pb2.py @@ -0,0 +1,71 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# -*- coding: utf-8 -*- +# Generated by the protocol buffer compiler. DO NOT EDIT! 
+# NO CHECKED-IN PROTOBUF GENCODE +# source: spark/connect/ml.proto +# Protobuf Python Version: 5.28.3 +"""Generated protocol buffer code.""" +from google.protobuf import descriptor as _descriptor +from google.protobuf import descriptor_pool as _descriptor_pool +from google.protobuf import runtime_version as _runtime_version +from google.protobuf import symbol_database as _symbol_database +from google.protobuf.internal import builder as _builder + +_runtime_version.ValidateProtobufRuntimeVersion( + _runtime_version.Domain.PUBLIC, 5, 28, 3, "", "spark/connect/ml.proto" +) +# @@protoc_insertion_point(imports) + +_sym_db = _symbol_database.Default() + + +from pyspark.sql.connect.proto import relations_pb2 as spark_dot_connect_dot_relations__pb2 +from pyspark.sql.connect.proto import ml_common_pb2 as spark_dot_connect_dot_ml__common__pb2 + + +DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile( + b'\n\x16spark/connect/ml.proto\x12\rspark.connect\x1a\x1dspark/connect/relations.proto\x1a\x1dspark/connect/ml_common.proto"\xc6\x07\n\tMlCommand\x12\x30\n\x03\x66it\x18\x01 \x01(\x0b\x32\x1c.spark.connect.MlCommand.FitH\x00R\x03\x66it\x12,\n\x05\x66\x65tch\x18\x02 \x01(\x0b\x32\x14.spark.connect.FetchH\x00R\x05\x66\x65tch\x12\x39\n\x06\x64\x65lete\x18\x03 \x01(\x0b\x32\x1f.spark.connect.MlCommand.DeleteH\x00R\x06\x64\x65lete\x12\x36\n\x05write\x18\x04 \x01(\x0b\x32\x1e.spark.connect.MlCommand.WriteH\x00R\x05write\x12\x33\n\x04read\x18\x05 \x01(\x0b\x32\x1d.spark.connect.MlCommand.ReadH\x00R\x04read\x1a\xa2\x01\n\x03\x46it\x12\x37\n\testimator\x18\x01 \x01(\x0b\x32\x19.spark.connect.MlOperatorR\testimator\x12/\n\x06params\x18\x02 \x01(\x0b\x32\x17.spark.connect.MlParamsR\x06params\x12\x31\n\x07\x64\x61taset\x18\x03 \x01(\x0b\x32\x17.spark.connect.RelationR\x07\x64\x61taset\x1a;\n\x06\x44\x65lete\x12\x31\n\x07obj_ref\x18\x01 \x01(\x0b\x32\x18.spark.connect.ObjectRefR\x06objRef\x1a\xf0\x02\n\x05Write\x12\x37\n\x08operator\x18\x01 
\x01(\x0b\x32\x19.spark.connect.MlOperatorH\x00R\x08operator\x12\x33\n\x07obj_ref\x18\x02 \x01(\x0b\x32\x18.spark.connect.ObjectRefH\x00R\x06objRef\x12/\n\x06params\x18\x03 \x01(\x0b\x32\x17.spark.connect.MlParamsR\x06params\x12\x12\n\x04path\x18\x04 \x01(\tR\x04path\x12)\n\x10should_overwrite\x18\x05 \x01(\x08R\x0fshouldOverwrite\x12\x45\n\x07options\x18\x06 \x03(\x0b\x32+.spark.connect.MlCommand.Write.OptionsEntryR\x07options\x1a:\n\x0cOptionsEntry\x12\x10\n\x03key\x18\x01 \x01(\tR\x03key\x12\x14\n\x05value\x18\x02 \x01(\tR\x05value:\x02\x38\x01\x42\x06\n\x04type\x1aQ\n\x04Read\x12\x35\n\x08operator\x18\x01 \x01(\x0b\x32\x19.spark.connect.MlOperatorR\x08operator\x12\x12\n\x04path\x18\x02 \x01(\tR\x04pathB\t\n\x07\x63ommand"\xe9\x02\n\x0fMlCommandResult\x12,\n\x05param\x18\x01 \x01(\x0b\x32\x14.spark.connect.ParamH\x00R\x05param\x12\x1a\n\x07summary\x18\x02 \x01(\tH\x00R\x07summary\x12T\n\roperator_info\x18\x03 \x01(\x0b\x32-.spark.connect.MlCommandResult.MlOperatorInfoH\x00R\x0coperatorInfo\x1a\xa6\x01\n\x0eMlOperatorInfo\x12\x33\n\x07obj_ref\x18\x01 \x01(\x0b\x32\x18.spark.connect.ObjectRefH\x00R\x06objRef\x12\x14\n\x04name\x18\x02 \x01(\tH\x00R\x04name\x12\x10\n\x03uid\x18\x03 \x01(\tR\x03uid\x12/\n\x06params\x18\x04 \x01(\x0b\x32\x17.spark.connect.MlParamsR\x06paramsB\x06\n\x04typeB\r\n\x0bresult_typeB6\n\x1eorg.apache.spark.connect.protoP\x01Z\x12internal/generatedb\x06proto3' +) + +_globals = globals() +_builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, _globals) +_builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, "pyspark.sql.connect.proto.ml_pb2", _globals) +if not _descriptor._USE_C_DESCRIPTORS: + _globals["DESCRIPTOR"]._loaded_options = None + _globals[ + "DESCRIPTOR" + ]._serialized_options = b"\n\036org.apache.spark.connect.protoP\001Z\022internal/generated" + _globals["_MLCOMMAND_WRITE_OPTIONSENTRY"]._loaded_options = None + _globals["_MLCOMMAND_WRITE_OPTIONSENTRY"]._serialized_options = b"8\001" + _globals["_MLCOMMAND"]._serialized_start = 104 + 
_globals["_MLCOMMAND"]._serialized_end = 1070 + _globals["_MLCOMMAND_FIT"]._serialized_start = 382 + _globals["_MLCOMMAND_FIT"]._serialized_end = 544 + _globals["_MLCOMMAND_DELETE"]._serialized_start = 546 + _globals["_MLCOMMAND_DELETE"]._serialized_end = 605 + _globals["_MLCOMMAND_WRITE"]._serialized_start = 608 + _globals["_MLCOMMAND_WRITE"]._serialized_end = 976 + _globals["_MLCOMMAND_WRITE_OPTIONSENTRY"]._serialized_start = 910 + _globals["_MLCOMMAND_WRITE_OPTIONSENTRY"]._serialized_end = 968 + _globals["_MLCOMMAND_READ"]._serialized_start = 978 + _globals["_MLCOMMAND_READ"]._serialized_end = 1059 + _globals["_MLCOMMANDRESULT"]._serialized_start = 1073 + _globals["_MLCOMMANDRESULT"]._serialized_end = 1434 + _globals["_MLCOMMANDRESULT_MLOPERATORINFO"]._serialized_start = 1253 + _globals["_MLCOMMANDRESULT_MLOPERATORINFO"]._serialized_end = 1419 +# @@protoc_insertion_point(module_scope) diff --git a/python/pyspark/sql/connect/proto/ml_pb2.pyi b/python/pyspark/sql/connect/proto/ml_pb2.pyi new file mode 100644 index 0000000000000..95bfefb524e2a --- /dev/null +++ b/python/pyspark/sql/connect/proto/ml_pb2.pyi @@ -0,0 +1,393 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +""" +@generated by mypy-protobuf. Do not edit manually! 
+isort:skip_file + +Licensed to the Apache Software Foundation (ASF) under one or more +contributor license agreements. See the NOTICE file distributed with +this work for additional information regarding copyright ownership. +The ASF licenses this file to You under the Apache License, Version 2.0 +(the "License"); you may not use this file except in compliance with +the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" +import builtins +import collections.abc +import google.protobuf.descriptor +import google.protobuf.internal.containers +import google.protobuf.message +import pyspark.sql.connect.proto.ml_common_pb2 +import pyspark.sql.connect.proto.relations_pb2 +import sys + +if sys.version_info >= (3, 8): + import typing as typing_extensions +else: + import typing_extensions + +DESCRIPTOR: google.protobuf.descriptor.FileDescriptor + +class MlCommand(google.protobuf.message.Message): + """Command for ML""" + + DESCRIPTOR: google.protobuf.descriptor.Descriptor + + class Fit(google.protobuf.message.Message): + """Command for estimator.fit(dataset)""" + + DESCRIPTOR: google.protobuf.descriptor.Descriptor + + ESTIMATOR_FIELD_NUMBER: builtins.int + PARAMS_FIELD_NUMBER: builtins.int + DATASET_FIELD_NUMBER: builtins.int + @property + def estimator(self) -> pyspark.sql.connect.proto.ml_common_pb2.MlOperator: + """Estimator information""" + @property + def params(self) -> pyspark.sql.connect.proto.ml_common_pb2.MlParams: + """parameters of the Estimator""" + @property + def dataset(self) -> pyspark.sql.connect.proto.relations_pb2.Relation: + """the training dataset""" + def __init__( + self, + *, + estimator: 
pyspark.sql.connect.proto.ml_common_pb2.MlOperator | None = ..., + params: pyspark.sql.connect.proto.ml_common_pb2.MlParams | None = ..., + dataset: pyspark.sql.connect.proto.relations_pb2.Relation | None = ..., + ) -> None: ... + def HasField( + self, + field_name: typing_extensions.Literal[ + "dataset", b"dataset", "estimator", b"estimator", "params", b"params" + ], + ) -> builtins.bool: ... + def ClearField( + self, + field_name: typing_extensions.Literal[ + "dataset", b"dataset", "estimator", b"estimator", "params", b"params" + ], + ) -> None: ... + + class Delete(google.protobuf.message.Message): + """Command to delete the cached object which could be a model + or summary evaluated by a model + """ + + DESCRIPTOR: google.protobuf.descriptor.Descriptor + + OBJ_REF_FIELD_NUMBER: builtins.int + @property + def obj_ref(self) -> pyspark.sql.connect.proto.ml_common_pb2.ObjectRef: ... + def __init__( + self, + *, + obj_ref: pyspark.sql.connect.proto.ml_common_pb2.ObjectRef | None = ..., + ) -> None: ... + def HasField( + self, field_name: typing_extensions.Literal["obj_ref", b"obj_ref"] + ) -> builtins.bool: ... + def ClearField( + self, field_name: typing_extensions.Literal["obj_ref", b"obj_ref"] + ) -> None: ... + + class Write(google.protobuf.message.Message): + """Command to write ML operator""" + + DESCRIPTOR: google.protobuf.descriptor.Descriptor + + class OptionsEntry(google.protobuf.message.Message): + DESCRIPTOR: google.protobuf.descriptor.Descriptor + + KEY_FIELD_NUMBER: builtins.int + VALUE_FIELD_NUMBER: builtins.int + key: builtins.str + value: builtins.str + def __init__( + self, + *, + key: builtins.str = ..., + value: builtins.str = ..., + ) -> None: ... + def ClearField( + self, field_name: typing_extensions.Literal["key", b"key", "value", b"value"] + ) -> None: ... 
+ + OPERATOR_FIELD_NUMBER: builtins.int + OBJ_REF_FIELD_NUMBER: builtins.int + PARAMS_FIELD_NUMBER: builtins.int + PATH_FIELD_NUMBER: builtins.int + SHOULD_OVERWRITE_FIELD_NUMBER: builtins.int + OPTIONS_FIELD_NUMBER: builtins.int + @property + def operator(self) -> pyspark.sql.connect.proto.ml_common_pb2.MlOperator: + """Estimator or evaluator""" + @property + def obj_ref(self) -> pyspark.sql.connect.proto.ml_common_pb2.ObjectRef: + """The cached model""" + @property + def params(self) -> pyspark.sql.connect.proto.ml_common_pb2.MlParams: + """The parameters of operator which could be estimator/evaluator or a cached model""" + path: builtins.str + """Save the ML instance to the path""" + should_overwrite: builtins.bool + """Overwrites if the output path already exists.""" + @property + def options( + self, + ) -> google.protobuf.internal.containers.ScalarMap[builtins.str, builtins.str]: + """The options of the writer""" + def __init__( + self, + *, + operator: pyspark.sql.connect.proto.ml_common_pb2.MlOperator | None = ..., + obj_ref: pyspark.sql.connect.proto.ml_common_pb2.ObjectRef | None = ..., + params: pyspark.sql.connect.proto.ml_common_pb2.MlParams | None = ..., + path: builtins.str = ..., + should_overwrite: builtins.bool = ..., + options: collections.abc.Mapping[builtins.str, builtins.str] | None = ..., + ) -> None: ... + def HasField( + self, + field_name: typing_extensions.Literal[ + "obj_ref", b"obj_ref", "operator", b"operator", "params", b"params", "type", b"type" + ], + ) -> builtins.bool: ... + def ClearField( + self, + field_name: typing_extensions.Literal[ + "obj_ref", + b"obj_ref", + "operator", + b"operator", + "options", + b"options", + "params", + b"params", + "path", + b"path", + "should_overwrite", + b"should_overwrite", + "type", + b"type", + ], + ) -> None: ... + def WhichOneof( + self, oneof_group: typing_extensions.Literal["type", b"type"] + ) -> typing_extensions.Literal["operator", "obj_ref"] | None: ... 
+ + class Read(google.protobuf.message.Message): + """Command to load ML operator.""" + + DESCRIPTOR: google.protobuf.descriptor.Descriptor + + OPERATOR_FIELD_NUMBER: builtins.int + PATH_FIELD_NUMBER: builtins.int + @property + def operator(self) -> pyspark.sql.connect.proto.ml_common_pb2.MlOperator: + """ML operator information""" + path: builtins.str + """Load the ML instance from the input path""" + def __init__( + self, + *, + operator: pyspark.sql.connect.proto.ml_common_pb2.MlOperator | None = ..., + path: builtins.str = ..., + ) -> None: ... + def HasField( + self, field_name: typing_extensions.Literal["operator", b"operator"] + ) -> builtins.bool: ... + def ClearField( + self, field_name: typing_extensions.Literal["operator", b"operator", "path", b"path"] + ) -> None: ... + + FIT_FIELD_NUMBER: builtins.int + FETCH_FIELD_NUMBER: builtins.int + DELETE_FIELD_NUMBER: builtins.int + WRITE_FIELD_NUMBER: builtins.int + READ_FIELD_NUMBER: builtins.int + @property + def fit(self) -> global___MlCommand.Fit: ... + @property + def fetch(self) -> pyspark.sql.connect.proto.relations_pb2.Fetch: ... + @property + def delete(self) -> global___MlCommand.Delete: ... + @property + def write(self) -> global___MlCommand.Write: ... + @property + def read(self) -> global___MlCommand.Read: ... + def __init__( + self, + *, + fit: global___MlCommand.Fit | None = ..., + fetch: pyspark.sql.connect.proto.relations_pb2.Fetch | None = ..., + delete: global___MlCommand.Delete | None = ..., + write: global___MlCommand.Write | None = ..., + read: global___MlCommand.Read | None = ..., + ) -> None: ... + def HasField( + self, + field_name: typing_extensions.Literal[ + "command", + b"command", + "delete", + b"delete", + "fetch", + b"fetch", + "fit", + b"fit", + "read", + b"read", + "write", + b"write", + ], + ) -> builtins.bool: ... 
+ def ClearField( + self, + field_name: typing_extensions.Literal[ + "command", + b"command", + "delete", + b"delete", + "fetch", + b"fetch", + "fit", + b"fit", + "read", + b"read", + "write", + b"write", + ], + ) -> None: ... + def WhichOneof( + self, oneof_group: typing_extensions.Literal["command", b"command"] + ) -> typing_extensions.Literal["fit", "fetch", "delete", "write", "read"] | None: ... + +global___MlCommand = MlCommand + +class MlCommandResult(google.protobuf.message.Message): + """The result of MlCommand""" + + DESCRIPTOR: google.protobuf.descriptor.Descriptor + + class MlOperatorInfo(google.protobuf.message.Message): + """Represents an operator info""" + + DESCRIPTOR: google.protobuf.descriptor.Descriptor + + OBJ_REF_FIELD_NUMBER: builtins.int + NAME_FIELD_NUMBER: builtins.int + UID_FIELD_NUMBER: builtins.int + PARAMS_FIELD_NUMBER: builtins.int + @property + def obj_ref(self) -> pyspark.sql.connect.proto.ml_common_pb2.ObjectRef: + """The cached object which could be a model or summary evaluated by a model""" + name: builtins.str + """Operator name""" + uid: builtins.str + @property + def params(self) -> pyspark.sql.connect.proto.ml_common_pb2.MlParams: ... + def __init__( + self, + *, + obj_ref: pyspark.sql.connect.proto.ml_common_pb2.ObjectRef | None = ..., + name: builtins.str = ..., + uid: builtins.str = ..., + params: pyspark.sql.connect.proto.ml_common_pb2.MlParams | None = ..., + ) -> None: ... + def HasField( + self, + field_name: typing_extensions.Literal[ + "name", b"name", "obj_ref", b"obj_ref", "params", b"params", "type", b"type" + ], + ) -> builtins.bool: ... + def ClearField( + self, + field_name: typing_extensions.Literal[ + "name", + b"name", + "obj_ref", + b"obj_ref", + "params", + b"params", + "type", + b"type", + "uid", + b"uid", + ], + ) -> None: ... + def WhichOneof( + self, oneof_group: typing_extensions.Literal["type", b"type"] + ) -> typing_extensions.Literal["obj_ref", "name"] | None: ... 
+ + PARAM_FIELD_NUMBER: builtins.int + SUMMARY_FIELD_NUMBER: builtins.int + OPERATOR_INFO_FIELD_NUMBER: builtins.int + @property + def param(self) -> pyspark.sql.connect.proto.ml_common_pb2.Param: + """The result of the attribute""" + summary: builtins.str + """Evaluate a Dataset in a model and return the cached ID of summary""" + @property + def operator_info(self) -> global___MlCommandResult.MlOperatorInfo: + """Operator information""" + def __init__( + self, + *, + param: pyspark.sql.connect.proto.ml_common_pb2.Param | None = ..., + summary: builtins.str = ..., + operator_info: global___MlCommandResult.MlOperatorInfo | None = ..., + ) -> None: ... + def HasField( + self, + field_name: typing_extensions.Literal[ + "operator_info", + b"operator_info", + "param", + b"param", + "result_type", + b"result_type", + "summary", + b"summary", + ], + ) -> builtins.bool: ... + def ClearField( + self, + field_name: typing_extensions.Literal[ + "operator_info", + b"operator_info", + "param", + b"param", + "result_type", + b"result_type", + "summary", + b"summary", + ], + ) -> None: ... + def WhichOneof( + self, oneof_group: typing_extensions.Literal["result_type", b"result_type"] + ) -> typing_extensions.Literal["param", "summary", "operator_info"] | None: ... 
+ +global___MlCommandResult = MlCommandResult diff --git a/python/pyspark/sql/connect/proto/relations_pb2.py b/python/pyspark/sql/connect/proto/relations_pb2.py index 479abcfb597a1..4327d0240b355 100644 --- a/python/pyspark/sql/connect/proto/relations_pb2.py +++ b/python/pyspark/sql/connect/proto/relations_pb2.py @@ -39,10 +39,11 @@ from pyspark.sql.connect.proto import types_pb2 as spark_dot_connect_dot_types__pb2 from pyspark.sql.connect.proto import catalog_pb2 as spark_dot_connect_dot_catalog__pb2 from pyspark.sql.connect.proto import common_pb2 as spark_dot_connect_dot_common__pb2 +from pyspark.sql.connect.proto import ml_common_pb2 as spark_dot_connect_dot_ml__common__pb2 DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile( - b'\n\x1dspark/connect/relations.proto\x12\rspark.connect\x1a\x19google/protobuf/any.proto\x1a\x1fspark/connect/expressions.proto\x1a\x19spark/connect/types.proto\x1a\x1bspark/connect/catalog.proto\x1a\x1aspark/connect/common.proto"\x9c\x1c\n\x08Relation\x12\x35\n\x06\x63ommon\x18\x01 \x01(\x0b\x32\x1d.spark.connect.RelationCommonR\x06\x63ommon\x12)\n\x04read\x18\x02 \x01(\x0b\x32\x13.spark.connect.ReadH\x00R\x04read\x12\x32\n\x07project\x18\x03 \x01(\x0b\x32\x16.spark.connect.ProjectH\x00R\x07project\x12/\n\x06\x66ilter\x18\x04 \x01(\x0b\x32\x15.spark.connect.FilterH\x00R\x06\x66ilter\x12)\n\x04join\x18\x05 \x01(\x0b\x32\x13.spark.connect.JoinH\x00R\x04join\x12\x34\n\x06set_op\x18\x06 \x01(\x0b\x32\x1b.spark.connect.SetOperationH\x00R\x05setOp\x12)\n\x04sort\x18\x07 \x01(\x0b\x32\x13.spark.connect.SortH\x00R\x04sort\x12,\n\x05limit\x18\x08 \x01(\x0b\x32\x14.spark.connect.LimitH\x00R\x05limit\x12\x38\n\taggregate\x18\t \x01(\x0b\x32\x18.spark.connect.AggregateH\x00R\taggregate\x12&\n\x03sql\x18\n \x01(\x0b\x32\x12.spark.connect.SQLH\x00R\x03sql\x12\x45\n\x0elocal_relation\x18\x0b \x01(\x0b\x32\x1c.spark.connect.LocalRelationH\x00R\rlocalRelation\x12/\n\x06sample\x18\x0c 
\x01(\x0b\x32\x15.spark.connect.SampleH\x00R\x06sample\x12/\n\x06offset\x18\r \x01(\x0b\x32\x15.spark.connect.OffsetH\x00R\x06offset\x12>\n\x0b\x64\x65\x64uplicate\x18\x0e \x01(\x0b\x32\x1a.spark.connect.DeduplicateH\x00R\x0b\x64\x65\x64uplicate\x12,\n\x05range\x18\x0f \x01(\x0b\x32\x14.spark.connect.RangeH\x00R\x05range\x12\x45\n\x0esubquery_alias\x18\x10 \x01(\x0b\x32\x1c.spark.connect.SubqueryAliasH\x00R\rsubqueryAlias\x12>\n\x0brepartition\x18\x11 \x01(\x0b\x32\x1a.spark.connect.RepartitionH\x00R\x0brepartition\x12*\n\x05to_df\x18\x12 \x01(\x0b\x32\x13.spark.connect.ToDFH\x00R\x04toDf\x12U\n\x14with_columns_renamed\x18\x13 \x01(\x0b\x32!.spark.connect.WithColumnsRenamedH\x00R\x12withColumnsRenamed\x12<\n\x0bshow_string\x18\x14 \x01(\x0b\x32\x19.spark.connect.ShowStringH\x00R\nshowString\x12)\n\x04\x64rop\x18\x15 \x01(\x0b\x32\x13.spark.connect.DropH\x00R\x04\x64rop\x12)\n\x04tail\x18\x16 \x01(\x0b\x32\x13.spark.connect.TailH\x00R\x04tail\x12?\n\x0cwith_columns\x18\x17 \x01(\x0b\x32\x1a.spark.connect.WithColumnsH\x00R\x0bwithColumns\x12)\n\x04hint\x18\x18 \x01(\x0b\x32\x13.spark.connect.HintH\x00R\x04hint\x12\x32\n\x07unpivot\x18\x19 \x01(\x0b\x32\x16.spark.connect.UnpivotH\x00R\x07unpivot\x12\x36\n\tto_schema\x18\x1a \x01(\x0b\x32\x17.spark.connect.ToSchemaH\x00R\x08toSchema\x12\x64\n\x19repartition_by_expression\x18\x1b \x01(\x0b\x32&.spark.connect.RepartitionByExpressionH\x00R\x17repartitionByExpression\x12\x45\n\x0emap_partitions\x18\x1c \x01(\x0b\x32\x1c.spark.connect.MapPartitionsH\x00R\rmapPartitions\x12H\n\x0f\x63ollect_metrics\x18\x1d \x01(\x0b\x32\x1d.spark.connect.CollectMetricsH\x00R\x0e\x63ollectMetrics\x12,\n\x05parse\x18\x1e \x01(\x0b\x32\x14.spark.connect.ParseH\x00R\x05parse\x12\x36\n\tgroup_map\x18\x1f \x01(\x0b\x32\x17.spark.connect.GroupMapH\x00R\x08groupMap\x12=\n\x0c\x63o_group_map\x18 \x01(\x0b\x32\x19.spark.connect.CoGroupMapH\x00R\ncoGroupMap\x12\x45\n\x0ewith_watermark\x18! 
\x01(\x0b\x32\x1c.spark.connect.WithWatermarkH\x00R\rwithWatermark\x12\x63\n\x1a\x61pply_in_pandas_with_state\x18" \x01(\x0b\x32%.spark.connect.ApplyInPandasWithStateH\x00R\x16\x61pplyInPandasWithState\x12<\n\x0bhtml_string\x18# \x01(\x0b\x32\x19.spark.connect.HtmlStringH\x00R\nhtmlString\x12X\n\x15\x63\x61\x63hed_local_relation\x18$ \x01(\x0b\x32".spark.connect.CachedLocalRelationH\x00R\x13\x63\x61\x63hedLocalRelation\x12[\n\x16\x63\x61\x63hed_remote_relation\x18% \x01(\x0b\x32#.spark.connect.CachedRemoteRelationH\x00R\x14\x63\x61\x63hedRemoteRelation\x12\x8e\x01\n)common_inline_user_defined_table_function\x18& \x01(\x0b\x32\x33.spark.connect.CommonInlineUserDefinedTableFunctionH\x00R$commonInlineUserDefinedTableFunction\x12\x37\n\nas_of_join\x18\' \x01(\x0b\x32\x17.spark.connect.AsOfJoinH\x00R\x08\x61sOfJoin\x12\x85\x01\n&common_inline_user_defined_data_source\x18( \x01(\x0b\x32\x30.spark.connect.CommonInlineUserDefinedDataSourceH\x00R!commonInlineUserDefinedDataSource\x12\x45\n\x0ewith_relations\x18) \x01(\x0b\x32\x1c.spark.connect.WithRelationsH\x00R\rwithRelations\x12\x38\n\ttranspose\x18* \x01(\x0b\x32\x18.spark.connect.TransposeH\x00R\ttranspose\x12w\n unresolved_table_valued_function\x18+ \x01(\x0b\x32,.spark.connect.UnresolvedTableValuedFunctionH\x00R\x1dunresolvedTableValuedFunction\x12\x30\n\x07\x66ill_na\x18Z \x01(\x0b\x32\x15.spark.connect.NAFillH\x00R\x06\x66illNa\x12\x30\n\x07\x64rop_na\x18[ \x01(\x0b\x32\x15.spark.connect.NADropH\x00R\x06\x64ropNa\x12\x34\n\x07replace\x18\\ \x01(\x0b\x32\x18.spark.connect.NAReplaceH\x00R\x07replace\x12\x36\n\x07summary\x18\x64 \x01(\x0b\x32\x1a.spark.connect.StatSummaryH\x00R\x07summary\x12\x39\n\x08\x63rosstab\x18\x65 \x01(\x0b\x32\x1b.spark.connect.StatCrosstabH\x00R\x08\x63rosstab\x12\x39\n\x08\x64\x65scribe\x18\x66 \x01(\x0b\x32\x1b.spark.connect.StatDescribeH\x00R\x08\x64\x65scribe\x12*\n\x03\x63ov\x18g \x01(\x0b\x32\x16.spark.connect.StatCovH\x00R\x03\x63ov\x12-\n\x04\x63orr\x18h 
\x01(\x0b\x32\x17.spark.connect.StatCorrH\x00R\x04\x63orr\x12L\n\x0f\x61pprox_quantile\x18i \x01(\x0b\x32!.spark.connect.StatApproxQuantileH\x00R\x0e\x61pproxQuantile\x12=\n\nfreq_items\x18j \x01(\x0b\x32\x1c.spark.connect.StatFreqItemsH\x00R\tfreqItems\x12:\n\tsample_by\x18k \x01(\x0b\x32\x1b.spark.connect.StatSampleByH\x00R\x08sampleBy\x12\x33\n\x07\x63\x61talog\x18\xc8\x01 \x01(\x0b\x32\x16.spark.connect.CatalogH\x00R\x07\x63\x61talog\x12\x35\n\textension\x18\xe6\x07 \x01(\x0b\x32\x14.google.protobuf.AnyH\x00R\textension\x12\x33\n\x07unknown\x18\xe7\x07 \x01(\x0b\x32\x16.spark.connect.UnknownH\x00R\x07unknownB\n\n\x08rel_type"\t\n\x07Unknown"\x8e\x01\n\x0eRelationCommon\x12#\n\x0bsource_info\x18\x01 \x01(\tB\x02\x18\x01R\nsourceInfo\x12\x1c\n\x07plan_id\x18\x02 \x01(\x03H\x00R\x06planId\x88\x01\x01\x12-\n\x06origin\x18\x03 \x01(\x0b\x32\x15.spark.connect.OriginR\x06originB\n\n\x08_plan_id"\xde\x03\n\x03SQL\x12\x14\n\x05query\x18\x01 \x01(\tR\x05query\x12\x34\n\x04\x61rgs\x18\x02 \x03(\x0b\x32\x1c.spark.connect.SQL.ArgsEntryB\x02\x18\x01R\x04\x61rgs\x12@\n\x08pos_args\x18\x03 \x03(\x0b\x32!.spark.connect.Expression.LiteralB\x02\x18\x01R\x07posArgs\x12O\n\x0fnamed_arguments\x18\x04 \x03(\x0b\x32&.spark.connect.SQL.NamedArgumentsEntryR\x0enamedArguments\x12>\n\rpos_arguments\x18\x05 \x03(\x0b\x32\x19.spark.connect.ExpressionR\x0cposArguments\x1aZ\n\tArgsEntry\x12\x10\n\x03key\x18\x01 \x01(\tR\x03key\x12\x37\n\x05value\x18\x02 \x01(\x0b\x32!.spark.connect.Expression.LiteralR\x05value:\x02\x38\x01\x1a\\\n\x13NamedArgumentsEntry\x12\x10\n\x03key\x18\x01 \x01(\tR\x03key\x12/\n\x05value\x18\x02 \x01(\x0b\x32\x19.spark.connect.ExpressionR\x05value:\x02\x38\x01"u\n\rWithRelations\x12+\n\x04root\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x04root\x12\x37\n\nreferences\x18\x02 \x03(\x0b\x32\x17.spark.connect.RelationR\nreferences"\x97\x05\n\x04Read\x12\x41\n\x0bnamed_table\x18\x01 
\x01(\x0b\x32\x1e.spark.connect.Read.NamedTableH\x00R\nnamedTable\x12\x41\n\x0b\x64\x61ta_source\x18\x02 \x01(\x0b\x32\x1e.spark.connect.Read.DataSourceH\x00R\ndataSource\x12!\n\x0cis_streaming\x18\x03 \x01(\x08R\x0bisStreaming\x1a\xc0\x01\n\nNamedTable\x12/\n\x13unparsed_identifier\x18\x01 \x01(\tR\x12unparsedIdentifier\x12\x45\n\x07options\x18\x02 \x03(\x0b\x32+.spark.connect.Read.NamedTable.OptionsEntryR\x07options\x1a:\n\x0cOptionsEntry\x12\x10\n\x03key\x18\x01 \x01(\tR\x03key\x12\x14\n\x05value\x18\x02 \x01(\tR\x05value:\x02\x38\x01\x1a\x95\x02\n\nDataSource\x12\x1b\n\x06\x66ormat\x18\x01 \x01(\tH\x00R\x06\x66ormat\x88\x01\x01\x12\x1b\n\x06schema\x18\x02 \x01(\tH\x01R\x06schema\x88\x01\x01\x12\x45\n\x07options\x18\x03 \x03(\x0b\x32+.spark.connect.Read.DataSource.OptionsEntryR\x07options\x12\x14\n\x05paths\x18\x04 \x03(\tR\x05paths\x12\x1e\n\npredicates\x18\x05 \x03(\tR\npredicates\x1a:\n\x0cOptionsEntry\x12\x10\n\x03key\x18\x01 \x01(\tR\x03key\x12\x14\n\x05value\x18\x02 \x01(\tR\x05value:\x02\x38\x01\x42\t\n\x07_formatB\t\n\x07_schemaB\x0b\n\tread_type"u\n\x07Project\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12;\n\x0b\x65xpressions\x18\x03 \x03(\x0b\x32\x19.spark.connect.ExpressionR\x0b\x65xpressions"p\n\x06\x46ilter\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12\x37\n\tcondition\x18\x02 \x01(\x0b\x32\x19.spark.connect.ExpressionR\tcondition"\x95\x05\n\x04Join\x12+\n\x04left\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x04left\x12-\n\x05right\x18\x02 \x01(\x0b\x32\x17.spark.connect.RelationR\x05right\x12@\n\x0ejoin_condition\x18\x03 \x01(\x0b\x32\x19.spark.connect.ExpressionR\rjoinCondition\x12\x39\n\tjoin_type\x18\x04 \x01(\x0e\x32\x1c.spark.connect.Join.JoinTypeR\x08joinType\x12#\n\rusing_columns\x18\x05 \x03(\tR\x0cusingColumns\x12K\n\x0ejoin_data_type\x18\x06 \x01(\x0b\x32 
.spark.connect.Join.JoinDataTypeH\x00R\x0cjoinDataType\x88\x01\x01\x1a\\\n\x0cJoinDataType\x12$\n\x0eis_left_struct\x18\x01 \x01(\x08R\x0cisLeftStruct\x12&\n\x0fis_right_struct\x18\x02 \x01(\x08R\risRightStruct"\xd0\x01\n\x08JoinType\x12\x19\n\x15JOIN_TYPE_UNSPECIFIED\x10\x00\x12\x13\n\x0fJOIN_TYPE_INNER\x10\x01\x12\x18\n\x14JOIN_TYPE_FULL_OUTER\x10\x02\x12\x18\n\x14JOIN_TYPE_LEFT_OUTER\x10\x03\x12\x19\n\x15JOIN_TYPE_RIGHT_OUTER\x10\x04\x12\x17\n\x13JOIN_TYPE_LEFT_ANTI\x10\x05\x12\x17\n\x13JOIN_TYPE_LEFT_SEMI\x10\x06\x12\x13\n\x0fJOIN_TYPE_CROSS\x10\x07\x42\x11\n\x0f_join_data_type"\xdf\x03\n\x0cSetOperation\x12\x36\n\nleft_input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\tleftInput\x12\x38\n\x0bright_input\x18\x02 \x01(\x0b\x32\x17.spark.connect.RelationR\nrightInput\x12\x45\n\x0bset_op_type\x18\x03 \x01(\x0e\x32%.spark.connect.SetOperation.SetOpTypeR\tsetOpType\x12\x1a\n\x06is_all\x18\x04 \x01(\x08H\x00R\x05isAll\x88\x01\x01\x12\x1c\n\x07\x62y_name\x18\x05 \x01(\x08H\x01R\x06\x62yName\x88\x01\x01\x12\x37\n\x15\x61llow_missing_columns\x18\x06 \x01(\x08H\x02R\x13\x61llowMissingColumns\x88\x01\x01"r\n\tSetOpType\x12\x1b\n\x17SET_OP_TYPE_UNSPECIFIED\x10\x00\x12\x19\n\x15SET_OP_TYPE_INTERSECT\x10\x01\x12\x15\n\x11SET_OP_TYPE_UNION\x10\x02\x12\x16\n\x12SET_OP_TYPE_EXCEPT\x10\x03\x42\t\n\x07_is_allB\n\n\x08_by_nameB\x18\n\x16_allow_missing_columns"L\n\x05Limit\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12\x14\n\x05limit\x18\x02 \x01(\x05R\x05limit"O\n\x06Offset\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12\x16\n\x06offset\x18\x02 \x01(\x05R\x06offset"K\n\x04Tail\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12\x14\n\x05limit\x18\x02 \x01(\x05R\x05limit"\xfe\x05\n\tAggregate\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12\x41\n\ngroup_type\x18\x02 
\x01(\x0e\x32".spark.connect.Aggregate.GroupTypeR\tgroupType\x12L\n\x14grouping_expressions\x18\x03 \x03(\x0b\x32\x19.spark.connect.ExpressionR\x13groupingExpressions\x12N\n\x15\x61ggregate_expressions\x18\x04 \x03(\x0b\x32\x19.spark.connect.ExpressionR\x14\x61ggregateExpressions\x12\x34\n\x05pivot\x18\x05 \x01(\x0b\x32\x1e.spark.connect.Aggregate.PivotR\x05pivot\x12J\n\rgrouping_sets\x18\x06 \x03(\x0b\x32%.spark.connect.Aggregate.GroupingSetsR\x0cgroupingSets\x1ao\n\x05Pivot\x12+\n\x03\x63ol\x18\x01 \x01(\x0b\x32\x19.spark.connect.ExpressionR\x03\x63ol\x12\x39\n\x06values\x18\x02 \x03(\x0b\x32!.spark.connect.Expression.LiteralR\x06values\x1aL\n\x0cGroupingSets\x12<\n\x0cgrouping_set\x18\x01 \x03(\x0b\x32\x19.spark.connect.ExpressionR\x0bgroupingSet"\x9f\x01\n\tGroupType\x12\x1a\n\x16GROUP_TYPE_UNSPECIFIED\x10\x00\x12\x16\n\x12GROUP_TYPE_GROUPBY\x10\x01\x12\x15\n\x11GROUP_TYPE_ROLLUP\x10\x02\x12\x13\n\x0fGROUP_TYPE_CUBE\x10\x03\x12\x14\n\x10GROUP_TYPE_PIVOT\x10\x04\x12\x1c\n\x18GROUP_TYPE_GROUPING_SETS\x10\x05"\xa0\x01\n\x04Sort\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12\x39\n\x05order\x18\x02 \x03(\x0b\x32#.spark.connect.Expression.SortOrderR\x05order\x12 \n\tis_global\x18\x03 \x01(\x08H\x00R\x08isGlobal\x88\x01\x01\x42\x0c\n\n_is_global"\x8d\x01\n\x04\x44rop\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12\x33\n\x07\x63olumns\x18\x02 \x03(\x0b\x32\x19.spark.connect.ExpressionR\x07\x63olumns\x12!\n\x0c\x63olumn_names\x18\x03 \x03(\tR\x0b\x63olumnNames"\xf0\x01\n\x0b\x44\x65\x64uplicate\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12!\n\x0c\x63olumn_names\x18\x02 \x03(\tR\x0b\x63olumnNames\x12\x32\n\x13\x61ll_columns_as_keys\x18\x03 \x01(\x08H\x00R\x10\x61llColumnsAsKeys\x88\x01\x01\x12.\n\x10within_watermark\x18\x04 
\x01(\x08H\x01R\x0fwithinWatermark\x88\x01\x01\x42\x16\n\x14_all_columns_as_keysB\x13\n\x11_within_watermark"Y\n\rLocalRelation\x12\x17\n\x04\x64\x61ta\x18\x01 \x01(\x0cH\x00R\x04\x64\x61ta\x88\x01\x01\x12\x1b\n\x06schema\x18\x02 \x01(\tH\x01R\x06schema\x88\x01\x01\x42\x07\n\x05_dataB\t\n\x07_schema"H\n\x13\x43\x61\x63hedLocalRelation\x12\x12\n\x04hash\x18\x03 \x01(\tR\x04hashJ\x04\x08\x01\x10\x02J\x04\x08\x02\x10\x03R\x06userIdR\tsessionId"7\n\x14\x43\x61\x63hedRemoteRelation\x12\x1f\n\x0brelation_id\x18\x01 \x01(\tR\nrelationId"\x91\x02\n\x06Sample\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12\x1f\n\x0blower_bound\x18\x02 \x01(\x01R\nlowerBound\x12\x1f\n\x0bupper_bound\x18\x03 \x01(\x01R\nupperBound\x12.\n\x10with_replacement\x18\x04 \x01(\x08H\x00R\x0fwithReplacement\x88\x01\x01\x12\x17\n\x04seed\x18\x05 \x01(\x03H\x01R\x04seed\x88\x01\x01\x12/\n\x13\x64\x65terministic_order\x18\x06 \x01(\x08R\x12\x64\x65terministicOrderB\x13\n\x11_with_replacementB\x07\n\x05_seed"\x91\x01\n\x05Range\x12\x19\n\x05start\x18\x01 \x01(\x03H\x00R\x05start\x88\x01\x01\x12\x10\n\x03\x65nd\x18\x02 \x01(\x03R\x03\x65nd\x12\x12\n\x04step\x18\x03 \x01(\x03R\x04step\x12*\n\x0enum_partitions\x18\x04 \x01(\x05H\x01R\rnumPartitions\x88\x01\x01\x42\x08\n\x06_startB\x11\n\x0f_num_partitions"r\n\rSubqueryAlias\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12\x14\n\x05\x61lias\x18\x02 \x01(\tR\x05\x61lias\x12\x1c\n\tqualifier\x18\x03 \x03(\tR\tqualifier"\x8e\x01\n\x0bRepartition\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12%\n\x0enum_partitions\x18\x02 \x01(\x05R\rnumPartitions\x12\x1d\n\x07shuffle\x18\x03 \x01(\x08H\x00R\x07shuffle\x88\x01\x01\x42\n\n\x08_shuffle"\x8e\x01\n\nShowString\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12\x19\n\x08num_rows\x18\x02 \x01(\x05R\x07numRows\x12\x1a\n\x08truncate\x18\x03 \x01(\x05R\x08truncate\x12\x1a\n\x08vertical\x18\x04 
\x01(\x08R\x08vertical"r\n\nHtmlString\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12\x19\n\x08num_rows\x18\x02 \x01(\x05R\x07numRows\x12\x1a\n\x08truncate\x18\x03 \x01(\x05R\x08truncate"\\\n\x0bStatSummary\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12\x1e\n\nstatistics\x18\x02 \x03(\tR\nstatistics"Q\n\x0cStatDescribe\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12\x12\n\x04\x63ols\x18\x02 \x03(\tR\x04\x63ols"e\n\x0cStatCrosstab\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12\x12\n\x04\x63ol1\x18\x02 \x01(\tR\x04\x63ol1\x12\x12\n\x04\x63ol2\x18\x03 \x01(\tR\x04\x63ol2"`\n\x07StatCov\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12\x12\n\x04\x63ol1\x18\x02 \x01(\tR\x04\x63ol1\x12\x12\n\x04\x63ol2\x18\x03 \x01(\tR\x04\x63ol2"\x89\x01\n\x08StatCorr\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12\x12\n\x04\x63ol1\x18\x02 \x01(\tR\x04\x63ol1\x12\x12\n\x04\x63ol2\x18\x03 \x01(\tR\x04\x63ol2\x12\x1b\n\x06method\x18\x04 \x01(\tH\x00R\x06method\x88\x01\x01\x42\t\n\x07_method"\xa4\x01\n\x12StatApproxQuantile\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12\x12\n\x04\x63ols\x18\x02 \x03(\tR\x04\x63ols\x12$\n\rprobabilities\x18\x03 \x03(\x01R\rprobabilities\x12%\n\x0erelative_error\x18\x04 \x01(\x01R\rrelativeError"}\n\rStatFreqItems\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12\x12\n\x04\x63ols\x18\x02 \x03(\tR\x04\x63ols\x12\x1d\n\x07support\x18\x03 \x01(\x01H\x00R\x07support\x88\x01\x01\x42\n\n\x08_support"\xb5\x02\n\x0cStatSampleBy\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12+\n\x03\x63ol\x18\x02 \x01(\x0b\x32\x19.spark.connect.ExpressionR\x03\x63ol\x12\x42\n\tfractions\x18\x03 \x03(\x0b\x32$.spark.connect.StatSampleBy.FractionR\tfractions\x12\x17\n\x04seed\x18\x05 
\x01(\x03H\x00R\x04seed\x88\x01\x01\x1a\x63\n\x08\x46raction\x12;\n\x07stratum\x18\x01 \x01(\x0b\x32!.spark.connect.Expression.LiteralR\x07stratum\x12\x1a\n\x08\x66raction\x18\x02 \x01(\x01R\x08\x66ractionB\x07\n\x05_seed"\x86\x01\n\x06NAFill\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12\x12\n\x04\x63ols\x18\x02 \x03(\tR\x04\x63ols\x12\x39\n\x06values\x18\x03 \x03(\x0b\x32!.spark.connect.Expression.LiteralR\x06values"\x86\x01\n\x06NADrop\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12\x12\n\x04\x63ols\x18\x02 \x03(\tR\x04\x63ols\x12\'\n\rmin_non_nulls\x18\x03 \x01(\x05H\x00R\x0bminNonNulls\x88\x01\x01\x42\x10\n\x0e_min_non_nulls"\xa8\x02\n\tNAReplace\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12\x12\n\x04\x63ols\x18\x02 \x03(\tR\x04\x63ols\x12H\n\x0creplacements\x18\x03 \x03(\x0b\x32$.spark.connect.NAReplace.ReplacementR\x0creplacements\x1a\x8d\x01\n\x0bReplacement\x12>\n\told_value\x18\x01 \x01(\x0b\x32!.spark.connect.Expression.LiteralR\x08oldValue\x12>\n\tnew_value\x18\x02 \x01(\x0b\x32!.spark.connect.Expression.LiteralR\x08newValue"X\n\x04ToDF\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12!\n\x0c\x63olumn_names\x18\x02 \x03(\tR\x0b\x63olumnNames"\xfe\x02\n\x12WithColumnsRenamed\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12i\n\x12rename_columns_map\x18\x02 \x03(\x0b\x32\x37.spark.connect.WithColumnsRenamed.RenameColumnsMapEntryB\x02\x18\x01R\x10renameColumnsMap\x12\x42\n\x07renames\x18\x03 \x03(\x0b\x32(.spark.connect.WithColumnsRenamed.RenameR\x07renames\x1a\x43\n\x15RenameColumnsMapEntry\x12\x10\n\x03key\x18\x01 \x01(\tR\x03key\x12\x14\n\x05value\x18\x02 \x01(\tR\x05value:\x02\x38\x01\x1a\x45\n\x06Rename\x12\x19\n\x08\x63ol_name\x18\x01 \x01(\tR\x07\x63olName\x12 \n\x0cnew_col_name\x18\x02 \x01(\tR\nnewColName"w\n\x0bWithColumns\x12-\n\x05input\x18\x01 
\x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12\x39\n\x07\x61liases\x18\x02 \x03(\x0b\x32\x1f.spark.connect.Expression.AliasR\x07\x61liases"\x86\x01\n\rWithWatermark\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12\x1d\n\nevent_time\x18\x02 \x01(\tR\teventTime\x12\'\n\x0f\x64\x65lay_threshold\x18\x03 \x01(\tR\x0e\x64\x65layThreshold"\x84\x01\n\x04Hint\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12\x12\n\x04name\x18\x02 \x01(\tR\x04name\x12\x39\n\nparameters\x18\x03 \x03(\x0b\x32\x19.spark.connect.ExpressionR\nparameters"\xc7\x02\n\x07Unpivot\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12+\n\x03ids\x18\x02 \x03(\x0b\x32\x19.spark.connect.ExpressionR\x03ids\x12:\n\x06values\x18\x03 \x01(\x0b\x32\x1d.spark.connect.Unpivot.ValuesH\x00R\x06values\x88\x01\x01\x12\x30\n\x14variable_column_name\x18\x04 \x01(\tR\x12variableColumnName\x12*\n\x11value_column_name\x18\x05 \x01(\tR\x0fvalueColumnName\x1a;\n\x06Values\x12\x31\n\x06values\x18\x01 \x03(\x0b\x32\x19.spark.connect.ExpressionR\x06valuesB\t\n\x07_values"z\n\tTranspose\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12>\n\rindex_columns\x18\x02 \x03(\x0b\x32\x19.spark.connect.ExpressionR\x0cindexColumns"}\n\x1dUnresolvedTableValuedFunction\x12#\n\rfunction_name\x18\x01 \x01(\tR\x0c\x66unctionName\x12\x37\n\targuments\x18\x02 \x03(\x0b\x32\x19.spark.connect.ExpressionR\targuments"j\n\x08ToSchema\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12/\n\x06schema\x18\x02 \x01(\x0b\x32\x17.spark.connect.DataTypeR\x06schema"\xcb\x01\n\x17RepartitionByExpression\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12\x42\n\x0fpartition_exprs\x18\x02 \x03(\x0b\x32\x19.spark.connect.ExpressionR\x0epartitionExprs\x12*\n\x0enum_partitions\x18\x03 
\x01(\x05H\x00R\rnumPartitions\x88\x01\x01\x42\x11\n\x0f_num_partitions"\xe8\x01\n\rMapPartitions\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12\x42\n\x04\x66unc\x18\x02 \x01(\x0b\x32..spark.connect.CommonInlineUserDefinedFunctionR\x04\x66unc\x12"\n\nis_barrier\x18\x03 \x01(\x08H\x00R\tisBarrier\x88\x01\x01\x12"\n\nprofile_id\x18\x04 \x01(\x05H\x01R\tprofileId\x88\x01\x01\x42\r\n\x0b_is_barrierB\r\n\x0b_profile_id"\xfb\x04\n\x08GroupMap\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12L\n\x14grouping_expressions\x18\x02 \x03(\x0b\x32\x19.spark.connect.ExpressionR\x13groupingExpressions\x12\x42\n\x04\x66unc\x18\x03 \x01(\x0b\x32..spark.connect.CommonInlineUserDefinedFunctionR\x04\x66unc\x12J\n\x13sorting_expressions\x18\x04 \x03(\x0b\x32\x19.spark.connect.ExpressionR\x12sortingExpressions\x12<\n\rinitial_input\x18\x05 \x01(\x0b\x32\x17.spark.connect.RelationR\x0cinitialInput\x12[\n\x1cinitial_grouping_expressions\x18\x06 \x03(\x0b\x32\x19.spark.connect.ExpressionR\x1ainitialGroupingExpressions\x12;\n\x18is_map_groups_with_state\x18\x07 \x01(\x08H\x00R\x14isMapGroupsWithState\x88\x01\x01\x12$\n\x0boutput_mode\x18\x08 \x01(\tH\x01R\noutputMode\x88\x01\x01\x12&\n\x0ctimeout_conf\x18\t \x01(\tH\x02R\x0btimeoutConf\x88\x01\x01\x42\x1b\n\x19_is_map_groups_with_stateB\x0e\n\x0c_output_modeB\x0f\n\r_timeout_conf"\x8e\x04\n\nCoGroupMap\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12W\n\x1ainput_grouping_expressions\x18\x02 \x03(\x0b\x32\x19.spark.connect.ExpressionR\x18inputGroupingExpressions\x12-\n\x05other\x18\x03 \x01(\x0b\x32\x17.spark.connect.RelationR\x05other\x12W\n\x1aother_grouping_expressions\x18\x04 \x03(\x0b\x32\x19.spark.connect.ExpressionR\x18otherGroupingExpressions\x12\x42\n\x04\x66unc\x18\x05 \x01(\x0b\x32..spark.connect.CommonInlineUserDefinedFunctionR\x04\x66unc\x12U\n\x19input_sorting_expressions\x18\x06 
\x03(\x0b\x32\x19.spark.connect.ExpressionR\x17inputSortingExpressions\x12U\n\x19other_sorting_expressions\x18\x07 \x03(\x0b\x32\x19.spark.connect.ExpressionR\x17otherSortingExpressions"\xe5\x02\n\x16\x41pplyInPandasWithState\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12L\n\x14grouping_expressions\x18\x02 \x03(\x0b\x32\x19.spark.connect.ExpressionR\x13groupingExpressions\x12\x42\n\x04\x66unc\x18\x03 \x01(\x0b\x32..spark.connect.CommonInlineUserDefinedFunctionR\x04\x66unc\x12#\n\routput_schema\x18\x04 \x01(\tR\x0coutputSchema\x12!\n\x0cstate_schema\x18\x05 \x01(\tR\x0bstateSchema\x12\x1f\n\x0boutput_mode\x18\x06 \x01(\tR\noutputMode\x12!\n\x0ctimeout_conf\x18\x07 \x01(\tR\x0btimeoutConf"\xf4\x01\n$CommonInlineUserDefinedTableFunction\x12#\n\rfunction_name\x18\x01 \x01(\tR\x0c\x66unctionName\x12$\n\rdeterministic\x18\x02 \x01(\x08R\rdeterministic\x12\x37\n\targuments\x18\x03 \x03(\x0b\x32\x19.spark.connect.ExpressionR\targuments\x12<\n\x0bpython_udtf\x18\x04 \x01(\x0b\x32\x19.spark.connect.PythonUDTFH\x00R\npythonUdtfB\n\n\x08\x66unction"\xb1\x01\n\nPythonUDTF\x12=\n\x0breturn_type\x18\x01 \x01(\x0b\x32\x17.spark.connect.DataTypeH\x00R\nreturnType\x88\x01\x01\x12\x1b\n\teval_type\x18\x02 \x01(\x05R\x08\x65valType\x12\x18\n\x07\x63ommand\x18\x03 \x01(\x0cR\x07\x63ommand\x12\x1d\n\npython_ver\x18\x04 \x01(\tR\tpythonVerB\x0e\n\x0c_return_type"\x97\x01\n!CommonInlineUserDefinedDataSource\x12\x12\n\x04name\x18\x01 \x01(\tR\x04name\x12O\n\x12python_data_source\x18\x02 \x01(\x0b\x32\x1f.spark.connect.PythonDataSourceH\x00R\x10pythonDataSourceB\r\n\x0b\x64\x61ta_source"K\n\x10PythonDataSource\x12\x18\n\x07\x63ommand\x18\x01 \x01(\x0cR\x07\x63ommand\x12\x1d\n\npython_ver\x18\x02 \x01(\tR\tpythonVer"\x88\x01\n\x0e\x43ollectMetrics\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12\x12\n\x04name\x18\x02 \x01(\tR\x04name\x12\x33\n\x07metrics\x18\x03 
\x03(\x0b\x32\x19.spark.connect.ExpressionR\x07metrics"\x84\x03\n\x05Parse\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12\x38\n\x06\x66ormat\x18\x02 \x01(\x0e\x32 .spark.connect.Parse.ParseFormatR\x06\x66ormat\x12\x34\n\x06schema\x18\x03 \x01(\x0b\x32\x17.spark.connect.DataTypeH\x00R\x06schema\x88\x01\x01\x12;\n\x07options\x18\x04 \x03(\x0b\x32!.spark.connect.Parse.OptionsEntryR\x07options\x1a:\n\x0cOptionsEntry\x12\x10\n\x03key\x18\x01 \x01(\tR\x03key\x12\x14\n\x05value\x18\x02 \x01(\tR\x05value:\x02\x38\x01"X\n\x0bParseFormat\x12\x1c\n\x18PARSE_FORMAT_UNSPECIFIED\x10\x00\x12\x14\n\x10PARSE_FORMAT_CSV\x10\x01\x12\x15\n\x11PARSE_FORMAT_JSON\x10\x02\x42\t\n\x07_schema"\xdb\x03\n\x08\x41sOfJoin\x12+\n\x04left\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x04left\x12-\n\x05right\x18\x02 \x01(\x0b\x32\x17.spark.connect.RelationR\x05right\x12\x37\n\nleft_as_of\x18\x03 \x01(\x0b\x32\x19.spark.connect.ExpressionR\x08leftAsOf\x12\x39\n\x0bright_as_of\x18\x04 \x01(\x0b\x32\x19.spark.connect.ExpressionR\trightAsOf\x12\x36\n\tjoin_expr\x18\x05 \x01(\x0b\x32\x19.spark.connect.ExpressionR\x08joinExpr\x12#\n\rusing_columns\x18\x06 \x03(\tR\x0cusingColumns\x12\x1b\n\tjoin_type\x18\x07 \x01(\tR\x08joinType\x12\x37\n\ttolerance\x18\x08 \x01(\x0b\x32\x19.spark.connect.ExpressionR\ttolerance\x12.\n\x13\x61llow_exact_matches\x18\t \x01(\x08R\x11\x61llowExactMatches\x12\x1c\n\tdirection\x18\n \x01(\tR\tdirectionB6\n\x1eorg.apache.spark.connect.protoP\x01Z\x12internal/generatedb\x06proto3' + b'\n\x1dspark/connect/relations.proto\x12\rspark.connect\x1a\x19google/protobuf/any.proto\x1a\x1fspark/connect/expressions.proto\x1a\x19spark/connect/types.proto\x1a\x1bspark/connect/catalog.proto\x1a\x1aspark/connect/common.proto\x1a\x1dspark/connect/ml_common.proto"\x9c\x1d\n\x08Relation\x12\x35\n\x06\x63ommon\x18\x01 \x01(\x0b\x32\x1d.spark.connect.RelationCommonR\x06\x63ommon\x12)\n\x04read\x18\x02 
\x01(\x0b\x32\x13.spark.connect.ReadH\x00R\x04read\x12\x32\n\x07project\x18\x03 \x01(\x0b\x32\x16.spark.connect.ProjectH\x00R\x07project\x12/\n\x06\x66ilter\x18\x04 \x01(\x0b\x32\x15.spark.connect.FilterH\x00R\x06\x66ilter\x12)\n\x04join\x18\x05 \x01(\x0b\x32\x13.spark.connect.JoinH\x00R\x04join\x12\x34\n\x06set_op\x18\x06 \x01(\x0b\x32\x1b.spark.connect.SetOperationH\x00R\x05setOp\x12)\n\x04sort\x18\x07 \x01(\x0b\x32\x13.spark.connect.SortH\x00R\x04sort\x12,\n\x05limit\x18\x08 \x01(\x0b\x32\x14.spark.connect.LimitH\x00R\x05limit\x12\x38\n\taggregate\x18\t \x01(\x0b\x32\x18.spark.connect.AggregateH\x00R\taggregate\x12&\n\x03sql\x18\n \x01(\x0b\x32\x12.spark.connect.SQLH\x00R\x03sql\x12\x45\n\x0elocal_relation\x18\x0b \x01(\x0b\x32\x1c.spark.connect.LocalRelationH\x00R\rlocalRelation\x12/\n\x06sample\x18\x0c \x01(\x0b\x32\x15.spark.connect.SampleH\x00R\x06sample\x12/\n\x06offset\x18\r \x01(\x0b\x32\x15.spark.connect.OffsetH\x00R\x06offset\x12>\n\x0b\x64\x65\x64uplicate\x18\x0e \x01(\x0b\x32\x1a.spark.connect.DeduplicateH\x00R\x0b\x64\x65\x64uplicate\x12,\n\x05range\x18\x0f \x01(\x0b\x32\x14.spark.connect.RangeH\x00R\x05range\x12\x45\n\x0esubquery_alias\x18\x10 \x01(\x0b\x32\x1c.spark.connect.SubqueryAliasH\x00R\rsubqueryAlias\x12>\n\x0brepartition\x18\x11 \x01(\x0b\x32\x1a.spark.connect.RepartitionH\x00R\x0brepartition\x12*\n\x05to_df\x18\x12 \x01(\x0b\x32\x13.spark.connect.ToDFH\x00R\x04toDf\x12U\n\x14with_columns_renamed\x18\x13 \x01(\x0b\x32!.spark.connect.WithColumnsRenamedH\x00R\x12withColumnsRenamed\x12<\n\x0bshow_string\x18\x14 \x01(\x0b\x32\x19.spark.connect.ShowStringH\x00R\nshowString\x12)\n\x04\x64rop\x18\x15 \x01(\x0b\x32\x13.spark.connect.DropH\x00R\x04\x64rop\x12)\n\x04tail\x18\x16 \x01(\x0b\x32\x13.spark.connect.TailH\x00R\x04tail\x12?\n\x0cwith_columns\x18\x17 \x01(\x0b\x32\x1a.spark.connect.WithColumnsH\x00R\x0bwithColumns\x12)\n\x04hint\x18\x18 \x01(\x0b\x32\x13.spark.connect.HintH\x00R\x04hint\x12\x32\n\x07unpivot\x18\x19 
\x01(\x0b\x32\x16.spark.connect.UnpivotH\x00R\x07unpivot\x12\x36\n\tto_schema\x18\x1a \x01(\x0b\x32\x17.spark.connect.ToSchemaH\x00R\x08toSchema\x12\x64\n\x19repartition_by_expression\x18\x1b \x01(\x0b\x32&.spark.connect.RepartitionByExpressionH\x00R\x17repartitionByExpression\x12\x45\n\x0emap_partitions\x18\x1c \x01(\x0b\x32\x1c.spark.connect.MapPartitionsH\x00R\rmapPartitions\x12H\n\x0f\x63ollect_metrics\x18\x1d \x01(\x0b\x32\x1d.spark.connect.CollectMetricsH\x00R\x0e\x63ollectMetrics\x12,\n\x05parse\x18\x1e \x01(\x0b\x32\x14.spark.connect.ParseH\x00R\x05parse\x12\x36\n\tgroup_map\x18\x1f \x01(\x0b\x32\x17.spark.connect.GroupMapH\x00R\x08groupMap\x12=\n\x0c\x63o_group_map\x18 \x01(\x0b\x32\x19.spark.connect.CoGroupMapH\x00R\ncoGroupMap\x12\x45\n\x0ewith_watermark\x18! \x01(\x0b\x32\x1c.spark.connect.WithWatermarkH\x00R\rwithWatermark\x12\x63\n\x1a\x61pply_in_pandas_with_state\x18" \x01(\x0b\x32%.spark.connect.ApplyInPandasWithStateH\x00R\x16\x61pplyInPandasWithState\x12<\n\x0bhtml_string\x18# \x01(\x0b\x32\x19.spark.connect.HtmlStringH\x00R\nhtmlString\x12X\n\x15\x63\x61\x63hed_local_relation\x18$ \x01(\x0b\x32".spark.connect.CachedLocalRelationH\x00R\x13\x63\x61\x63hedLocalRelation\x12[\n\x16\x63\x61\x63hed_remote_relation\x18% \x01(\x0b\x32#.spark.connect.CachedRemoteRelationH\x00R\x14\x63\x61\x63hedRemoteRelation\x12\x8e\x01\n)common_inline_user_defined_table_function\x18& \x01(\x0b\x32\x33.spark.connect.CommonInlineUserDefinedTableFunctionH\x00R$commonInlineUserDefinedTableFunction\x12\x37\n\nas_of_join\x18\' \x01(\x0b\x32\x17.spark.connect.AsOfJoinH\x00R\x08\x61sOfJoin\x12\x85\x01\n&common_inline_user_defined_data_source\x18( \x01(\x0b\x32\x30.spark.connect.CommonInlineUserDefinedDataSourceH\x00R!commonInlineUserDefinedDataSource\x12\x45\n\x0ewith_relations\x18) \x01(\x0b\x32\x1c.spark.connect.WithRelationsH\x00R\rwithRelations\x12\x38\n\ttranspose\x18* \x01(\x0b\x32\x18.spark.connect.TransposeH\x00R\ttranspose\x12w\n unresolved_table_valued_function\x18+ 
\x01(\x0b\x32,.spark.connect.UnresolvedTableValuedFunctionH\x00R\x1dunresolvedTableValuedFunction\x12?\n\x0clateral_join\x18, \x01(\x0b\x32\x1a.spark.connect.LateralJoinH\x00R\x0blateralJoin\x12\x30\n\x07\x66ill_na\x18Z \x01(\x0b\x32\x15.spark.connect.NAFillH\x00R\x06\x66illNa\x12\x30\n\x07\x64rop_na\x18[ \x01(\x0b\x32\x15.spark.connect.NADropH\x00R\x06\x64ropNa\x12\x34\n\x07replace\x18\\ \x01(\x0b\x32\x18.spark.connect.NAReplaceH\x00R\x07replace\x12\x36\n\x07summary\x18\x64 \x01(\x0b\x32\x1a.spark.connect.StatSummaryH\x00R\x07summary\x12\x39\n\x08\x63rosstab\x18\x65 \x01(\x0b\x32\x1b.spark.connect.StatCrosstabH\x00R\x08\x63rosstab\x12\x39\n\x08\x64\x65scribe\x18\x66 \x01(\x0b\x32\x1b.spark.connect.StatDescribeH\x00R\x08\x64\x65scribe\x12*\n\x03\x63ov\x18g \x01(\x0b\x32\x16.spark.connect.StatCovH\x00R\x03\x63ov\x12-\n\x04\x63orr\x18h \x01(\x0b\x32\x17.spark.connect.StatCorrH\x00R\x04\x63orr\x12L\n\x0f\x61pprox_quantile\x18i \x01(\x0b\x32!.spark.connect.StatApproxQuantileH\x00R\x0e\x61pproxQuantile\x12=\n\nfreq_items\x18j \x01(\x0b\x32\x1c.spark.connect.StatFreqItemsH\x00R\tfreqItems\x12:\n\tsample_by\x18k \x01(\x0b\x32\x1b.spark.connect.StatSampleByH\x00R\x08sampleBy\x12\x33\n\x07\x63\x61talog\x18\xc8\x01 \x01(\x0b\x32\x16.spark.connect.CatalogH\x00R\x07\x63\x61talog\x12=\n\x0bml_relation\x18\xac\x02 \x01(\x0b\x32\x19.spark.connect.MlRelationH\x00R\nmlRelation\x12\x35\n\textension\x18\xe6\x07 \x01(\x0b\x32\x14.google.protobuf.AnyH\x00R\textension\x12\x33\n\x07unknown\x18\xe7\x07 \x01(\x0b\x32\x16.spark.connect.UnknownH\x00R\x07unknownB\n\n\x08rel_type"\xf8\x02\n\nMlRelation\x12\x43\n\ttransform\x18\x01 \x01(\x0b\x32#.spark.connect.MlRelation.TransformH\x00R\ttransform\x12,\n\x05\x66\x65tch\x18\x02 \x01(\x0b\x32\x14.spark.connect.FetchH\x00R\x05\x66\x65tch\x1a\xeb\x01\n\tTransform\x12\x33\n\x07obj_ref\x18\x01 \x01(\x0b\x32\x18.spark.connect.ObjectRefH\x00R\x06objRef\x12=\n\x0btransformer\x18\x02 
\x01(\x0b\x32\x19.spark.connect.MlOperatorH\x00R\x0btransformer\x12-\n\x05input\x18\x03 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12/\n\x06params\x18\x04 \x01(\x0b\x32\x17.spark.connect.MlParamsR\x06paramsB\n\n\x08operatorB\t\n\x07ml_type"\xbe\x02\n\x05\x46\x65tch\x12\x31\n\x07obj_ref\x18\x01 \x01(\x0b\x32\x18.spark.connect.ObjectRefR\x06objRef\x12\x35\n\x07methods\x18\x02 \x03(\x0b\x32\x1b.spark.connect.Fetch.MethodR\x07methods\x1a\xca\x01\n\x06Method\x12\x16\n\x06method\x18\x01 \x01(\tR\x06method\x12\x34\n\x04\x61rgs\x18\x02 \x03(\x0b\x32 .spark.connect.Fetch.Method.ArgsR\x04\x61rgs\x1ar\n\x04\x41rgs\x12,\n\x05param\x18\x01 \x01(\x0b\x32\x14.spark.connect.ParamH\x00R\x05param\x12/\n\x05input\x18\x02 \x01(\x0b\x32\x17.spark.connect.RelationH\x00R\x05inputB\x0b\n\targs_type"\t\n\x07Unknown"\x8e\x01\n\x0eRelationCommon\x12#\n\x0bsource_info\x18\x01 \x01(\tB\x02\x18\x01R\nsourceInfo\x12\x1c\n\x07plan_id\x18\x02 \x01(\x03H\x00R\x06planId\x88\x01\x01\x12-\n\x06origin\x18\x03 \x01(\x0b\x32\x15.spark.connect.OriginR\x06originB\n\n\x08_plan_id"\xde\x03\n\x03SQL\x12\x14\n\x05query\x18\x01 \x01(\tR\x05query\x12\x34\n\x04\x61rgs\x18\x02 \x03(\x0b\x32\x1c.spark.connect.SQL.ArgsEntryB\x02\x18\x01R\x04\x61rgs\x12@\n\x08pos_args\x18\x03 \x03(\x0b\x32!.spark.connect.Expression.LiteralB\x02\x18\x01R\x07posArgs\x12O\n\x0fnamed_arguments\x18\x04 \x03(\x0b\x32&.spark.connect.SQL.NamedArgumentsEntryR\x0enamedArguments\x12>\n\rpos_arguments\x18\x05 \x03(\x0b\x32\x19.spark.connect.ExpressionR\x0cposArguments\x1aZ\n\tArgsEntry\x12\x10\n\x03key\x18\x01 \x01(\tR\x03key\x12\x37\n\x05value\x18\x02 \x01(\x0b\x32!.spark.connect.Expression.LiteralR\x05value:\x02\x38\x01\x1a\\\n\x13NamedArgumentsEntry\x12\x10\n\x03key\x18\x01 \x01(\tR\x03key\x12/\n\x05value\x18\x02 \x01(\x0b\x32\x19.spark.connect.ExpressionR\x05value:\x02\x38\x01"u\n\rWithRelations\x12+\n\x04root\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x04root\x12\x37\n\nreferences\x18\x02 
\x03(\x0b\x32\x17.spark.connect.RelationR\nreferences"\x97\x05\n\x04Read\x12\x41\n\x0bnamed_table\x18\x01 \x01(\x0b\x32\x1e.spark.connect.Read.NamedTableH\x00R\nnamedTable\x12\x41\n\x0b\x64\x61ta_source\x18\x02 \x01(\x0b\x32\x1e.spark.connect.Read.DataSourceH\x00R\ndataSource\x12!\n\x0cis_streaming\x18\x03 \x01(\x08R\x0bisStreaming\x1a\xc0\x01\n\nNamedTable\x12/\n\x13unparsed_identifier\x18\x01 \x01(\tR\x12unparsedIdentifier\x12\x45\n\x07options\x18\x02 \x03(\x0b\x32+.spark.connect.Read.NamedTable.OptionsEntryR\x07options\x1a:\n\x0cOptionsEntry\x12\x10\n\x03key\x18\x01 \x01(\tR\x03key\x12\x14\n\x05value\x18\x02 \x01(\tR\x05value:\x02\x38\x01\x1a\x95\x02\n\nDataSource\x12\x1b\n\x06\x66ormat\x18\x01 \x01(\tH\x00R\x06\x66ormat\x88\x01\x01\x12\x1b\n\x06schema\x18\x02 \x01(\tH\x01R\x06schema\x88\x01\x01\x12\x45\n\x07options\x18\x03 \x03(\x0b\x32+.spark.connect.Read.DataSource.OptionsEntryR\x07options\x12\x14\n\x05paths\x18\x04 \x03(\tR\x05paths\x12\x1e\n\npredicates\x18\x05 \x03(\tR\npredicates\x1a:\n\x0cOptionsEntry\x12\x10\n\x03key\x18\x01 \x01(\tR\x03key\x12\x14\n\x05value\x18\x02 \x01(\tR\x05value:\x02\x38\x01\x42\t\n\x07_formatB\t\n\x07_schemaB\x0b\n\tread_type"u\n\x07Project\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12;\n\x0b\x65xpressions\x18\x03 \x03(\x0b\x32\x19.spark.connect.ExpressionR\x0b\x65xpressions"p\n\x06\x46ilter\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12\x37\n\tcondition\x18\x02 \x01(\x0b\x32\x19.spark.connect.ExpressionR\tcondition"\x95\x05\n\x04Join\x12+\n\x04left\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x04left\x12-\n\x05right\x18\x02 \x01(\x0b\x32\x17.spark.connect.RelationR\x05right\x12@\n\x0ejoin_condition\x18\x03 \x01(\x0b\x32\x19.spark.connect.ExpressionR\rjoinCondition\x12\x39\n\tjoin_type\x18\x04 \x01(\x0e\x32\x1c.spark.connect.Join.JoinTypeR\x08joinType\x12#\n\rusing_columns\x18\x05 \x03(\tR\x0cusingColumns\x12K\n\x0ejoin_data_type\x18\x06 \x01(\x0b\x32 
.spark.connect.Join.JoinDataTypeH\x00R\x0cjoinDataType\x88\x01\x01\x1a\\\n\x0cJoinDataType\x12$\n\x0eis_left_struct\x18\x01 \x01(\x08R\x0cisLeftStruct\x12&\n\x0fis_right_struct\x18\x02 \x01(\x08R\risRightStruct"\xd0\x01\n\x08JoinType\x12\x19\n\x15JOIN_TYPE_UNSPECIFIED\x10\x00\x12\x13\n\x0fJOIN_TYPE_INNER\x10\x01\x12\x18\n\x14JOIN_TYPE_FULL_OUTER\x10\x02\x12\x18\n\x14JOIN_TYPE_LEFT_OUTER\x10\x03\x12\x19\n\x15JOIN_TYPE_RIGHT_OUTER\x10\x04\x12\x17\n\x13JOIN_TYPE_LEFT_ANTI\x10\x05\x12\x17\n\x13JOIN_TYPE_LEFT_SEMI\x10\x06\x12\x13\n\x0fJOIN_TYPE_CROSS\x10\x07\x42\x11\n\x0f_join_data_type"\xdf\x03\n\x0cSetOperation\x12\x36\n\nleft_input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\tleftInput\x12\x38\n\x0bright_input\x18\x02 \x01(\x0b\x32\x17.spark.connect.RelationR\nrightInput\x12\x45\n\x0bset_op_type\x18\x03 \x01(\x0e\x32%.spark.connect.SetOperation.SetOpTypeR\tsetOpType\x12\x1a\n\x06is_all\x18\x04 \x01(\x08H\x00R\x05isAll\x88\x01\x01\x12\x1c\n\x07\x62y_name\x18\x05 \x01(\x08H\x01R\x06\x62yName\x88\x01\x01\x12\x37\n\x15\x61llow_missing_columns\x18\x06 \x01(\x08H\x02R\x13\x61llowMissingColumns\x88\x01\x01"r\n\tSetOpType\x12\x1b\n\x17SET_OP_TYPE_UNSPECIFIED\x10\x00\x12\x19\n\x15SET_OP_TYPE_INTERSECT\x10\x01\x12\x15\n\x11SET_OP_TYPE_UNION\x10\x02\x12\x16\n\x12SET_OP_TYPE_EXCEPT\x10\x03\x42\t\n\x07_is_allB\n\n\x08_by_nameB\x18\n\x16_allow_missing_columns"L\n\x05Limit\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12\x14\n\x05limit\x18\x02 \x01(\x05R\x05limit"O\n\x06Offset\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12\x16\n\x06offset\x18\x02 \x01(\x05R\x06offset"K\n\x04Tail\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12\x14\n\x05limit\x18\x02 \x01(\x05R\x05limit"\xfe\x05\n\tAggregate\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12\x41\n\ngroup_type\x18\x02 
\x01(\x0e\x32".spark.connect.Aggregate.GroupTypeR\tgroupType\x12L\n\x14grouping_expressions\x18\x03 \x03(\x0b\x32\x19.spark.connect.ExpressionR\x13groupingExpressions\x12N\n\x15\x61ggregate_expressions\x18\x04 \x03(\x0b\x32\x19.spark.connect.ExpressionR\x14\x61ggregateExpressions\x12\x34\n\x05pivot\x18\x05 \x01(\x0b\x32\x1e.spark.connect.Aggregate.PivotR\x05pivot\x12J\n\rgrouping_sets\x18\x06 \x03(\x0b\x32%.spark.connect.Aggregate.GroupingSetsR\x0cgroupingSets\x1ao\n\x05Pivot\x12+\n\x03\x63ol\x18\x01 \x01(\x0b\x32\x19.spark.connect.ExpressionR\x03\x63ol\x12\x39\n\x06values\x18\x02 \x03(\x0b\x32!.spark.connect.Expression.LiteralR\x06values\x1aL\n\x0cGroupingSets\x12<\n\x0cgrouping_set\x18\x01 \x03(\x0b\x32\x19.spark.connect.ExpressionR\x0bgroupingSet"\x9f\x01\n\tGroupType\x12\x1a\n\x16GROUP_TYPE_UNSPECIFIED\x10\x00\x12\x16\n\x12GROUP_TYPE_GROUPBY\x10\x01\x12\x15\n\x11GROUP_TYPE_ROLLUP\x10\x02\x12\x13\n\x0fGROUP_TYPE_CUBE\x10\x03\x12\x14\n\x10GROUP_TYPE_PIVOT\x10\x04\x12\x1c\n\x18GROUP_TYPE_GROUPING_SETS\x10\x05"\xa0\x01\n\x04Sort\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12\x39\n\x05order\x18\x02 \x03(\x0b\x32#.spark.connect.Expression.SortOrderR\x05order\x12 \n\tis_global\x18\x03 \x01(\x08H\x00R\x08isGlobal\x88\x01\x01\x42\x0c\n\n_is_global"\x8d\x01\n\x04\x44rop\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12\x33\n\x07\x63olumns\x18\x02 \x03(\x0b\x32\x19.spark.connect.ExpressionR\x07\x63olumns\x12!\n\x0c\x63olumn_names\x18\x03 \x03(\tR\x0b\x63olumnNames"\xf0\x01\n\x0b\x44\x65\x64uplicate\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12!\n\x0c\x63olumn_names\x18\x02 \x03(\tR\x0b\x63olumnNames\x12\x32\n\x13\x61ll_columns_as_keys\x18\x03 \x01(\x08H\x00R\x10\x61llColumnsAsKeys\x88\x01\x01\x12.\n\x10within_watermark\x18\x04 
\x01(\x08H\x01R\x0fwithinWatermark\x88\x01\x01\x42\x16\n\x14_all_columns_as_keysB\x13\n\x11_within_watermark"Y\n\rLocalRelation\x12\x17\n\x04\x64\x61ta\x18\x01 \x01(\x0cH\x00R\x04\x64\x61ta\x88\x01\x01\x12\x1b\n\x06schema\x18\x02 \x01(\tH\x01R\x06schema\x88\x01\x01\x42\x07\n\x05_dataB\t\n\x07_schema"H\n\x13\x43\x61\x63hedLocalRelation\x12\x12\n\x04hash\x18\x03 \x01(\tR\x04hashJ\x04\x08\x01\x10\x02J\x04\x08\x02\x10\x03R\x06userIdR\tsessionId"7\n\x14\x43\x61\x63hedRemoteRelation\x12\x1f\n\x0brelation_id\x18\x01 \x01(\tR\nrelationId"\x91\x02\n\x06Sample\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12\x1f\n\x0blower_bound\x18\x02 \x01(\x01R\nlowerBound\x12\x1f\n\x0bupper_bound\x18\x03 \x01(\x01R\nupperBound\x12.\n\x10with_replacement\x18\x04 \x01(\x08H\x00R\x0fwithReplacement\x88\x01\x01\x12\x17\n\x04seed\x18\x05 \x01(\x03H\x01R\x04seed\x88\x01\x01\x12/\n\x13\x64\x65terministic_order\x18\x06 \x01(\x08R\x12\x64\x65terministicOrderB\x13\n\x11_with_replacementB\x07\n\x05_seed"\x91\x01\n\x05Range\x12\x19\n\x05start\x18\x01 \x01(\x03H\x00R\x05start\x88\x01\x01\x12\x10\n\x03\x65nd\x18\x02 \x01(\x03R\x03\x65nd\x12\x12\n\x04step\x18\x03 \x01(\x03R\x04step\x12*\n\x0enum_partitions\x18\x04 \x01(\x05H\x01R\rnumPartitions\x88\x01\x01\x42\x08\n\x06_startB\x11\n\x0f_num_partitions"r\n\rSubqueryAlias\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12\x14\n\x05\x61lias\x18\x02 \x01(\tR\x05\x61lias\x12\x1c\n\tqualifier\x18\x03 \x03(\tR\tqualifier"\x8e\x01\n\x0bRepartition\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12%\n\x0enum_partitions\x18\x02 \x01(\x05R\rnumPartitions\x12\x1d\n\x07shuffle\x18\x03 \x01(\x08H\x00R\x07shuffle\x88\x01\x01\x42\n\n\x08_shuffle"\x8e\x01\n\nShowString\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12\x19\n\x08num_rows\x18\x02 \x01(\x05R\x07numRows\x12\x1a\n\x08truncate\x18\x03 \x01(\x05R\x08truncate\x12\x1a\n\x08vertical\x18\x04 
\x01(\x08R\x08vertical"r\n\nHtmlString\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12\x19\n\x08num_rows\x18\x02 \x01(\x05R\x07numRows\x12\x1a\n\x08truncate\x18\x03 \x01(\x05R\x08truncate"\\\n\x0bStatSummary\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12\x1e\n\nstatistics\x18\x02 \x03(\tR\nstatistics"Q\n\x0cStatDescribe\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12\x12\n\x04\x63ols\x18\x02 \x03(\tR\x04\x63ols"e\n\x0cStatCrosstab\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12\x12\n\x04\x63ol1\x18\x02 \x01(\tR\x04\x63ol1\x12\x12\n\x04\x63ol2\x18\x03 \x01(\tR\x04\x63ol2"`\n\x07StatCov\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12\x12\n\x04\x63ol1\x18\x02 \x01(\tR\x04\x63ol1\x12\x12\n\x04\x63ol2\x18\x03 \x01(\tR\x04\x63ol2"\x89\x01\n\x08StatCorr\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12\x12\n\x04\x63ol1\x18\x02 \x01(\tR\x04\x63ol1\x12\x12\n\x04\x63ol2\x18\x03 \x01(\tR\x04\x63ol2\x12\x1b\n\x06method\x18\x04 \x01(\tH\x00R\x06method\x88\x01\x01\x42\t\n\x07_method"\xa4\x01\n\x12StatApproxQuantile\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12\x12\n\x04\x63ols\x18\x02 \x03(\tR\x04\x63ols\x12$\n\rprobabilities\x18\x03 \x03(\x01R\rprobabilities\x12%\n\x0erelative_error\x18\x04 \x01(\x01R\rrelativeError"}\n\rStatFreqItems\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12\x12\n\x04\x63ols\x18\x02 \x03(\tR\x04\x63ols\x12\x1d\n\x07support\x18\x03 \x01(\x01H\x00R\x07support\x88\x01\x01\x42\n\n\x08_support"\xb5\x02\n\x0cStatSampleBy\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12+\n\x03\x63ol\x18\x02 \x01(\x0b\x32\x19.spark.connect.ExpressionR\x03\x63ol\x12\x42\n\tfractions\x18\x03 \x03(\x0b\x32$.spark.connect.StatSampleBy.FractionR\tfractions\x12\x17\n\x04seed\x18\x05 
\x01(\x03H\x00R\x04seed\x88\x01\x01\x1a\x63\n\x08\x46raction\x12;\n\x07stratum\x18\x01 \x01(\x0b\x32!.spark.connect.Expression.LiteralR\x07stratum\x12\x1a\n\x08\x66raction\x18\x02 \x01(\x01R\x08\x66ractionB\x07\n\x05_seed"\x86\x01\n\x06NAFill\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12\x12\n\x04\x63ols\x18\x02 \x03(\tR\x04\x63ols\x12\x39\n\x06values\x18\x03 \x03(\x0b\x32!.spark.connect.Expression.LiteralR\x06values"\x86\x01\n\x06NADrop\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12\x12\n\x04\x63ols\x18\x02 \x03(\tR\x04\x63ols\x12\'\n\rmin_non_nulls\x18\x03 \x01(\x05H\x00R\x0bminNonNulls\x88\x01\x01\x42\x10\n\x0e_min_non_nulls"\xa8\x02\n\tNAReplace\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12\x12\n\x04\x63ols\x18\x02 \x03(\tR\x04\x63ols\x12H\n\x0creplacements\x18\x03 \x03(\x0b\x32$.spark.connect.NAReplace.ReplacementR\x0creplacements\x1a\x8d\x01\n\x0bReplacement\x12>\n\told_value\x18\x01 \x01(\x0b\x32!.spark.connect.Expression.LiteralR\x08oldValue\x12>\n\tnew_value\x18\x02 \x01(\x0b\x32!.spark.connect.Expression.LiteralR\x08newValue"X\n\x04ToDF\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12!\n\x0c\x63olumn_names\x18\x02 \x03(\tR\x0b\x63olumnNames"\xfe\x02\n\x12WithColumnsRenamed\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12i\n\x12rename_columns_map\x18\x02 \x03(\x0b\x32\x37.spark.connect.WithColumnsRenamed.RenameColumnsMapEntryB\x02\x18\x01R\x10renameColumnsMap\x12\x42\n\x07renames\x18\x03 \x03(\x0b\x32(.spark.connect.WithColumnsRenamed.RenameR\x07renames\x1a\x43\n\x15RenameColumnsMapEntry\x12\x10\n\x03key\x18\x01 \x01(\tR\x03key\x12\x14\n\x05value\x18\x02 \x01(\tR\x05value:\x02\x38\x01\x1a\x45\n\x06Rename\x12\x19\n\x08\x63ol_name\x18\x01 \x01(\tR\x07\x63olName\x12 \n\x0cnew_col_name\x18\x02 \x01(\tR\nnewColName"w\n\x0bWithColumns\x12-\n\x05input\x18\x01 
\x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12\x39\n\x07\x61liases\x18\x02 \x03(\x0b\x32\x1f.spark.connect.Expression.AliasR\x07\x61liases"\x86\x01\n\rWithWatermark\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12\x1d\n\nevent_time\x18\x02 \x01(\tR\teventTime\x12\'\n\x0f\x64\x65lay_threshold\x18\x03 \x01(\tR\x0e\x64\x65layThreshold"\x84\x01\n\x04Hint\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12\x12\n\x04name\x18\x02 \x01(\tR\x04name\x12\x39\n\nparameters\x18\x03 \x03(\x0b\x32\x19.spark.connect.ExpressionR\nparameters"\xc7\x02\n\x07Unpivot\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12+\n\x03ids\x18\x02 \x03(\x0b\x32\x19.spark.connect.ExpressionR\x03ids\x12:\n\x06values\x18\x03 \x01(\x0b\x32\x1d.spark.connect.Unpivot.ValuesH\x00R\x06values\x88\x01\x01\x12\x30\n\x14variable_column_name\x18\x04 \x01(\tR\x12variableColumnName\x12*\n\x11value_column_name\x18\x05 \x01(\tR\x0fvalueColumnName\x1a;\n\x06Values\x12\x31\n\x06values\x18\x01 \x03(\x0b\x32\x19.spark.connect.ExpressionR\x06valuesB\t\n\x07_values"z\n\tTranspose\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12>\n\rindex_columns\x18\x02 \x03(\x0b\x32\x19.spark.connect.ExpressionR\x0cindexColumns"}\n\x1dUnresolvedTableValuedFunction\x12#\n\rfunction_name\x18\x01 \x01(\tR\x0c\x66unctionName\x12\x37\n\targuments\x18\x02 \x03(\x0b\x32\x19.spark.connect.ExpressionR\targuments"j\n\x08ToSchema\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12/\n\x06schema\x18\x02 \x01(\x0b\x32\x17.spark.connect.DataTypeR\x06schema"\xcb\x01\n\x17RepartitionByExpression\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12\x42\n\x0fpartition_exprs\x18\x02 \x03(\x0b\x32\x19.spark.connect.ExpressionR\x0epartitionExprs\x12*\n\x0enum_partitions\x18\x03 
\x01(\x05H\x00R\rnumPartitions\x88\x01\x01\x42\x11\n\x0f_num_partitions"\xe8\x01\n\rMapPartitions\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12\x42\n\x04\x66unc\x18\x02 \x01(\x0b\x32..spark.connect.CommonInlineUserDefinedFunctionR\x04\x66unc\x12"\n\nis_barrier\x18\x03 \x01(\x08H\x00R\tisBarrier\x88\x01\x01\x12"\n\nprofile_id\x18\x04 \x01(\x05H\x01R\tprofileId\x88\x01\x01\x42\r\n\x0b_is_barrierB\r\n\x0b_profile_id"\xcd\x05\n\x08GroupMap\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12L\n\x14grouping_expressions\x18\x02 \x03(\x0b\x32\x19.spark.connect.ExpressionR\x13groupingExpressions\x12\x42\n\x04\x66unc\x18\x03 \x01(\x0b\x32..spark.connect.CommonInlineUserDefinedFunctionR\x04\x66unc\x12J\n\x13sorting_expressions\x18\x04 \x03(\x0b\x32\x19.spark.connect.ExpressionR\x12sortingExpressions\x12<\n\rinitial_input\x18\x05 \x01(\x0b\x32\x17.spark.connect.RelationR\x0cinitialInput\x12[\n\x1cinitial_grouping_expressions\x18\x06 \x03(\x0b\x32\x19.spark.connect.ExpressionR\x1ainitialGroupingExpressions\x12;\n\x18is_map_groups_with_state\x18\x07 \x01(\x08H\x00R\x14isMapGroupsWithState\x88\x01\x01\x12$\n\x0boutput_mode\x18\x08 \x01(\tH\x01R\noutputMode\x88\x01\x01\x12&\n\x0ctimeout_conf\x18\t \x01(\tH\x02R\x0btimeoutConf\x88\x01\x01\x12?\n\x0cstate_schema\x18\n \x01(\x0b\x32\x17.spark.connect.DataTypeH\x03R\x0bstateSchema\x88\x01\x01\x42\x1b\n\x19_is_map_groups_with_stateB\x0e\n\x0c_output_modeB\x0f\n\r_timeout_confB\x0f\n\r_state_schema"\x8e\x04\n\nCoGroupMap\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12W\n\x1ainput_grouping_expressions\x18\x02 \x03(\x0b\x32\x19.spark.connect.ExpressionR\x18inputGroupingExpressions\x12-\n\x05other\x18\x03 \x01(\x0b\x32\x17.spark.connect.RelationR\x05other\x12W\n\x1aother_grouping_expressions\x18\x04 \x03(\x0b\x32\x19.spark.connect.ExpressionR\x18otherGroupingExpressions\x12\x42\n\x04\x66unc\x18\x05 
\x01(\x0b\x32..spark.connect.CommonInlineUserDefinedFunctionR\x04\x66unc\x12U\n\x19input_sorting_expressions\x18\x06 \x03(\x0b\x32\x19.spark.connect.ExpressionR\x17inputSortingExpressions\x12U\n\x19other_sorting_expressions\x18\x07 \x03(\x0b\x32\x19.spark.connect.ExpressionR\x17otherSortingExpressions"\xe5\x02\n\x16\x41pplyInPandasWithState\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12L\n\x14grouping_expressions\x18\x02 \x03(\x0b\x32\x19.spark.connect.ExpressionR\x13groupingExpressions\x12\x42\n\x04\x66unc\x18\x03 \x01(\x0b\x32..spark.connect.CommonInlineUserDefinedFunctionR\x04\x66unc\x12#\n\routput_schema\x18\x04 \x01(\tR\x0coutputSchema\x12!\n\x0cstate_schema\x18\x05 \x01(\tR\x0bstateSchema\x12\x1f\n\x0boutput_mode\x18\x06 \x01(\tR\noutputMode\x12!\n\x0ctimeout_conf\x18\x07 \x01(\tR\x0btimeoutConf"\xf4\x01\n$CommonInlineUserDefinedTableFunction\x12#\n\rfunction_name\x18\x01 \x01(\tR\x0c\x66unctionName\x12$\n\rdeterministic\x18\x02 \x01(\x08R\rdeterministic\x12\x37\n\targuments\x18\x03 \x03(\x0b\x32\x19.spark.connect.ExpressionR\targuments\x12<\n\x0bpython_udtf\x18\x04 \x01(\x0b\x32\x19.spark.connect.PythonUDTFH\x00R\npythonUdtfB\n\n\x08\x66unction"\xb1\x01\n\nPythonUDTF\x12=\n\x0breturn_type\x18\x01 \x01(\x0b\x32\x17.spark.connect.DataTypeH\x00R\nreturnType\x88\x01\x01\x12\x1b\n\teval_type\x18\x02 \x01(\x05R\x08\x65valType\x12\x18\n\x07\x63ommand\x18\x03 \x01(\x0cR\x07\x63ommand\x12\x1d\n\npython_ver\x18\x04 \x01(\tR\tpythonVerB\x0e\n\x0c_return_type"\x97\x01\n!CommonInlineUserDefinedDataSource\x12\x12\n\x04name\x18\x01 \x01(\tR\x04name\x12O\n\x12python_data_source\x18\x02 \x01(\x0b\x32\x1f.spark.connect.PythonDataSourceH\x00R\x10pythonDataSourceB\r\n\x0b\x64\x61ta_source"K\n\x10PythonDataSource\x12\x18\n\x07\x63ommand\x18\x01 \x01(\x0cR\x07\x63ommand\x12\x1d\n\npython_ver\x18\x02 \x01(\tR\tpythonVer"\x88\x01\n\x0e\x43ollectMetrics\x12-\n\x05input\x18\x01 
\x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12\x12\n\x04name\x18\x02 \x01(\tR\x04name\x12\x33\n\x07metrics\x18\x03 \x03(\x0b\x32\x19.spark.connect.ExpressionR\x07metrics"\x84\x03\n\x05Parse\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12\x38\n\x06\x66ormat\x18\x02 \x01(\x0e\x32 .spark.connect.Parse.ParseFormatR\x06\x66ormat\x12\x34\n\x06schema\x18\x03 \x01(\x0b\x32\x17.spark.connect.DataTypeH\x00R\x06schema\x88\x01\x01\x12;\n\x07options\x18\x04 \x03(\x0b\x32!.spark.connect.Parse.OptionsEntryR\x07options\x1a:\n\x0cOptionsEntry\x12\x10\n\x03key\x18\x01 \x01(\tR\x03key\x12\x14\n\x05value\x18\x02 \x01(\tR\x05value:\x02\x38\x01"X\n\x0bParseFormat\x12\x1c\n\x18PARSE_FORMAT_UNSPECIFIED\x10\x00\x12\x14\n\x10PARSE_FORMAT_CSV\x10\x01\x12\x15\n\x11PARSE_FORMAT_JSON\x10\x02\x42\t\n\x07_schema"\xdb\x03\n\x08\x41sOfJoin\x12+\n\x04left\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x04left\x12-\n\x05right\x18\x02 \x01(\x0b\x32\x17.spark.connect.RelationR\x05right\x12\x37\n\nleft_as_of\x18\x03 \x01(\x0b\x32\x19.spark.connect.ExpressionR\x08leftAsOf\x12\x39\n\x0bright_as_of\x18\x04 \x01(\x0b\x32\x19.spark.connect.ExpressionR\trightAsOf\x12\x36\n\tjoin_expr\x18\x05 \x01(\x0b\x32\x19.spark.connect.ExpressionR\x08joinExpr\x12#\n\rusing_columns\x18\x06 \x03(\tR\x0cusingColumns\x12\x1b\n\tjoin_type\x18\x07 \x01(\tR\x08joinType\x12\x37\n\ttolerance\x18\x08 \x01(\x0b\x32\x19.spark.connect.ExpressionR\ttolerance\x12.\n\x13\x61llow_exact_matches\x18\t \x01(\x08R\x11\x61llowExactMatches\x12\x1c\n\tdirection\x18\n \x01(\tR\tdirection"\xe6\x01\n\x0bLateralJoin\x12+\n\x04left\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x04left\x12-\n\x05right\x18\x02 \x01(\x0b\x32\x17.spark.connect.RelationR\x05right\x12@\n\x0ejoin_condition\x18\x03 \x01(\x0b\x32\x19.spark.connect.ExpressionR\rjoinCondition\x12\x39\n\tjoin_type\x18\x04 
\x01(\x0e\x32\x1c.spark.connect.Join.JoinTypeR\x08joinTypeB6\n\x1eorg.apache.spark.connect.protoP\x01Z\x12internal/generatedb\x06proto3' ) _globals = globals() @@ -77,158 +78,170 @@ ]._serialized_options = b"\030\001" _globals["_PARSE_OPTIONSENTRY"]._loaded_options = None _globals["_PARSE_OPTIONSENTRY"]._serialized_options = b"8\001" - _globals["_RELATION"]._serialized_start = 193 - _globals["_RELATION"]._serialized_end = 3805 - _globals["_UNKNOWN"]._serialized_start = 3807 - _globals["_UNKNOWN"]._serialized_end = 3816 - _globals["_RELATIONCOMMON"]._serialized_start = 3819 - _globals["_RELATIONCOMMON"]._serialized_end = 3961 - _globals["_SQL"]._serialized_start = 3964 - _globals["_SQL"]._serialized_end = 4442 - _globals["_SQL_ARGSENTRY"]._serialized_start = 4258 - _globals["_SQL_ARGSENTRY"]._serialized_end = 4348 - _globals["_SQL_NAMEDARGUMENTSENTRY"]._serialized_start = 4350 - _globals["_SQL_NAMEDARGUMENTSENTRY"]._serialized_end = 4442 - _globals["_WITHRELATIONS"]._serialized_start = 4444 - _globals["_WITHRELATIONS"]._serialized_end = 4561 - _globals["_READ"]._serialized_start = 4564 - _globals["_READ"]._serialized_end = 5227 - _globals["_READ_NAMEDTABLE"]._serialized_start = 4742 - _globals["_READ_NAMEDTABLE"]._serialized_end = 4934 - _globals["_READ_NAMEDTABLE_OPTIONSENTRY"]._serialized_start = 4876 - _globals["_READ_NAMEDTABLE_OPTIONSENTRY"]._serialized_end = 4934 - _globals["_READ_DATASOURCE"]._serialized_start = 4937 - _globals["_READ_DATASOURCE"]._serialized_end = 5214 - _globals["_READ_DATASOURCE_OPTIONSENTRY"]._serialized_start = 4876 - _globals["_READ_DATASOURCE_OPTIONSENTRY"]._serialized_end = 4934 - _globals["_PROJECT"]._serialized_start = 5229 - _globals["_PROJECT"]._serialized_end = 5346 - _globals["_FILTER"]._serialized_start = 5348 - _globals["_FILTER"]._serialized_end = 5460 - _globals["_JOIN"]._serialized_start = 5463 - _globals["_JOIN"]._serialized_end = 6124 - _globals["_JOIN_JOINDATATYPE"]._serialized_start = 5802 - 
_globals["_JOIN_JOINDATATYPE"]._serialized_end = 5894 - _globals["_JOIN_JOINTYPE"]._serialized_start = 5897 - _globals["_JOIN_JOINTYPE"]._serialized_end = 6105 - _globals["_SETOPERATION"]._serialized_start = 6127 - _globals["_SETOPERATION"]._serialized_end = 6606 - _globals["_SETOPERATION_SETOPTYPE"]._serialized_start = 6443 - _globals["_SETOPERATION_SETOPTYPE"]._serialized_end = 6557 - _globals["_LIMIT"]._serialized_start = 6608 - _globals["_LIMIT"]._serialized_end = 6684 - _globals["_OFFSET"]._serialized_start = 6686 - _globals["_OFFSET"]._serialized_end = 6765 - _globals["_TAIL"]._serialized_start = 6767 - _globals["_TAIL"]._serialized_end = 6842 - _globals["_AGGREGATE"]._serialized_start = 6845 - _globals["_AGGREGATE"]._serialized_end = 7611 - _globals["_AGGREGATE_PIVOT"]._serialized_start = 7260 - _globals["_AGGREGATE_PIVOT"]._serialized_end = 7371 - _globals["_AGGREGATE_GROUPINGSETS"]._serialized_start = 7373 - _globals["_AGGREGATE_GROUPINGSETS"]._serialized_end = 7449 - _globals["_AGGREGATE_GROUPTYPE"]._serialized_start = 7452 - _globals["_AGGREGATE_GROUPTYPE"]._serialized_end = 7611 - _globals["_SORT"]._serialized_start = 7614 - _globals["_SORT"]._serialized_end = 7774 - _globals["_DROP"]._serialized_start = 7777 - _globals["_DROP"]._serialized_end = 7918 - _globals["_DEDUPLICATE"]._serialized_start = 7921 - _globals["_DEDUPLICATE"]._serialized_end = 8161 - _globals["_LOCALRELATION"]._serialized_start = 8163 - _globals["_LOCALRELATION"]._serialized_end = 8252 - _globals["_CACHEDLOCALRELATION"]._serialized_start = 8254 - _globals["_CACHEDLOCALRELATION"]._serialized_end = 8326 - _globals["_CACHEDREMOTERELATION"]._serialized_start = 8328 - _globals["_CACHEDREMOTERELATION"]._serialized_end = 8383 - _globals["_SAMPLE"]._serialized_start = 8386 - _globals["_SAMPLE"]._serialized_end = 8659 - _globals["_RANGE"]._serialized_start = 8662 - _globals["_RANGE"]._serialized_end = 8807 - _globals["_SUBQUERYALIAS"]._serialized_start = 8809 - 
_globals["_SUBQUERYALIAS"]._serialized_end = 8923 - _globals["_REPARTITION"]._serialized_start = 8926 - _globals["_REPARTITION"]._serialized_end = 9068 - _globals["_SHOWSTRING"]._serialized_start = 9071 - _globals["_SHOWSTRING"]._serialized_end = 9213 - _globals["_HTMLSTRING"]._serialized_start = 9215 - _globals["_HTMLSTRING"]._serialized_end = 9329 - _globals["_STATSUMMARY"]._serialized_start = 9331 - _globals["_STATSUMMARY"]._serialized_end = 9423 - _globals["_STATDESCRIBE"]._serialized_start = 9425 - _globals["_STATDESCRIBE"]._serialized_end = 9506 - _globals["_STATCROSSTAB"]._serialized_start = 9508 - _globals["_STATCROSSTAB"]._serialized_end = 9609 - _globals["_STATCOV"]._serialized_start = 9611 - _globals["_STATCOV"]._serialized_end = 9707 - _globals["_STATCORR"]._serialized_start = 9710 - _globals["_STATCORR"]._serialized_end = 9847 - _globals["_STATAPPROXQUANTILE"]._serialized_start = 9850 - _globals["_STATAPPROXQUANTILE"]._serialized_end = 10014 - _globals["_STATFREQITEMS"]._serialized_start = 10016 - _globals["_STATFREQITEMS"]._serialized_end = 10141 - _globals["_STATSAMPLEBY"]._serialized_start = 10144 - _globals["_STATSAMPLEBY"]._serialized_end = 10453 - _globals["_STATSAMPLEBY_FRACTION"]._serialized_start = 10345 - _globals["_STATSAMPLEBY_FRACTION"]._serialized_end = 10444 - _globals["_NAFILL"]._serialized_start = 10456 - _globals["_NAFILL"]._serialized_end = 10590 - _globals["_NADROP"]._serialized_start = 10593 - _globals["_NADROP"]._serialized_end = 10727 - _globals["_NAREPLACE"]._serialized_start = 10730 - _globals["_NAREPLACE"]._serialized_end = 11026 - _globals["_NAREPLACE_REPLACEMENT"]._serialized_start = 10885 - _globals["_NAREPLACE_REPLACEMENT"]._serialized_end = 11026 - _globals["_TODF"]._serialized_start = 11028 - _globals["_TODF"]._serialized_end = 11116 - _globals["_WITHCOLUMNSRENAMED"]._serialized_start = 11119 - _globals["_WITHCOLUMNSRENAMED"]._serialized_end = 11501 - 
_globals["_WITHCOLUMNSRENAMED_RENAMECOLUMNSMAPENTRY"]._serialized_start = 11363 - _globals["_WITHCOLUMNSRENAMED_RENAMECOLUMNSMAPENTRY"]._serialized_end = 11430 - _globals["_WITHCOLUMNSRENAMED_RENAME"]._serialized_start = 11432 - _globals["_WITHCOLUMNSRENAMED_RENAME"]._serialized_end = 11501 - _globals["_WITHCOLUMNS"]._serialized_start = 11503 - _globals["_WITHCOLUMNS"]._serialized_end = 11622 - _globals["_WITHWATERMARK"]._serialized_start = 11625 - _globals["_WITHWATERMARK"]._serialized_end = 11759 - _globals["_HINT"]._serialized_start = 11762 - _globals["_HINT"]._serialized_end = 11894 - _globals["_UNPIVOT"]._serialized_start = 11897 - _globals["_UNPIVOT"]._serialized_end = 12224 - _globals["_UNPIVOT_VALUES"]._serialized_start = 12154 - _globals["_UNPIVOT_VALUES"]._serialized_end = 12213 - _globals["_TRANSPOSE"]._serialized_start = 12226 - _globals["_TRANSPOSE"]._serialized_end = 12348 - _globals["_UNRESOLVEDTABLEVALUEDFUNCTION"]._serialized_start = 12350 - _globals["_UNRESOLVEDTABLEVALUEDFUNCTION"]._serialized_end = 12475 - _globals["_TOSCHEMA"]._serialized_start = 12477 - _globals["_TOSCHEMA"]._serialized_end = 12583 - _globals["_REPARTITIONBYEXPRESSION"]._serialized_start = 12586 - _globals["_REPARTITIONBYEXPRESSION"]._serialized_end = 12789 - _globals["_MAPPARTITIONS"]._serialized_start = 12792 - _globals["_MAPPARTITIONS"]._serialized_end = 13024 - _globals["_GROUPMAP"]._serialized_start = 13027 - _globals["_GROUPMAP"]._serialized_end = 13662 - _globals["_COGROUPMAP"]._serialized_start = 13665 - _globals["_COGROUPMAP"]._serialized_end = 14191 - _globals["_APPLYINPANDASWITHSTATE"]._serialized_start = 14194 - _globals["_APPLYINPANDASWITHSTATE"]._serialized_end = 14551 - _globals["_COMMONINLINEUSERDEFINEDTABLEFUNCTION"]._serialized_start = 14554 - _globals["_COMMONINLINEUSERDEFINEDTABLEFUNCTION"]._serialized_end = 14798 - _globals["_PYTHONUDTF"]._serialized_start = 14801 - _globals["_PYTHONUDTF"]._serialized_end = 14978 - 
_globals["_COMMONINLINEUSERDEFINEDDATASOURCE"]._serialized_start = 14981 - _globals["_COMMONINLINEUSERDEFINEDDATASOURCE"]._serialized_end = 15132 - _globals["_PYTHONDATASOURCE"]._serialized_start = 15134 - _globals["_PYTHONDATASOURCE"]._serialized_end = 15209 - _globals["_COLLECTMETRICS"]._serialized_start = 15212 - _globals["_COLLECTMETRICS"]._serialized_end = 15348 - _globals["_PARSE"]._serialized_start = 15351 - _globals["_PARSE"]._serialized_end = 15739 - _globals["_PARSE_OPTIONSENTRY"]._serialized_start = 4876 - _globals["_PARSE_OPTIONSENTRY"]._serialized_end = 4934 - _globals["_PARSE_PARSEFORMAT"]._serialized_start = 15640 - _globals["_PARSE_PARSEFORMAT"]._serialized_end = 15728 - _globals["_ASOFJOIN"]._serialized_start = 15742 - _globals["_ASOFJOIN"]._serialized_end = 16217 + _globals["_RELATION"]._serialized_start = 224 + _globals["_RELATION"]._serialized_end = 3964 + _globals["_MLRELATION"]._serialized_start = 3967 + _globals["_MLRELATION"]._serialized_end = 4343 + _globals["_MLRELATION_TRANSFORM"]._serialized_start = 4097 + _globals["_MLRELATION_TRANSFORM"]._serialized_end = 4332 + _globals["_FETCH"]._serialized_start = 4346 + _globals["_FETCH"]._serialized_end = 4664 + _globals["_FETCH_METHOD"]._serialized_start = 4462 + _globals["_FETCH_METHOD"]._serialized_end = 4664 + _globals["_FETCH_METHOD_ARGS"]._serialized_start = 4550 + _globals["_FETCH_METHOD_ARGS"]._serialized_end = 4664 + _globals["_UNKNOWN"]._serialized_start = 4666 + _globals["_UNKNOWN"]._serialized_end = 4675 + _globals["_RELATIONCOMMON"]._serialized_start = 4678 + _globals["_RELATIONCOMMON"]._serialized_end = 4820 + _globals["_SQL"]._serialized_start = 4823 + _globals["_SQL"]._serialized_end = 5301 + _globals["_SQL_ARGSENTRY"]._serialized_start = 5117 + _globals["_SQL_ARGSENTRY"]._serialized_end = 5207 + _globals["_SQL_NAMEDARGUMENTSENTRY"]._serialized_start = 5209 + _globals["_SQL_NAMEDARGUMENTSENTRY"]._serialized_end = 5301 + _globals["_WITHRELATIONS"]._serialized_start = 5303 + 
_globals["_WITHRELATIONS"]._serialized_end = 5420 + _globals["_READ"]._serialized_start = 5423 + _globals["_READ"]._serialized_end = 6086 + _globals["_READ_NAMEDTABLE"]._serialized_start = 5601 + _globals["_READ_NAMEDTABLE"]._serialized_end = 5793 + _globals["_READ_NAMEDTABLE_OPTIONSENTRY"]._serialized_start = 5735 + _globals["_READ_NAMEDTABLE_OPTIONSENTRY"]._serialized_end = 5793 + _globals["_READ_DATASOURCE"]._serialized_start = 5796 + _globals["_READ_DATASOURCE"]._serialized_end = 6073 + _globals["_READ_DATASOURCE_OPTIONSENTRY"]._serialized_start = 5735 + _globals["_READ_DATASOURCE_OPTIONSENTRY"]._serialized_end = 5793 + _globals["_PROJECT"]._serialized_start = 6088 + _globals["_PROJECT"]._serialized_end = 6205 + _globals["_FILTER"]._serialized_start = 6207 + _globals["_FILTER"]._serialized_end = 6319 + _globals["_JOIN"]._serialized_start = 6322 + _globals["_JOIN"]._serialized_end = 6983 + _globals["_JOIN_JOINDATATYPE"]._serialized_start = 6661 + _globals["_JOIN_JOINDATATYPE"]._serialized_end = 6753 + _globals["_JOIN_JOINTYPE"]._serialized_start = 6756 + _globals["_JOIN_JOINTYPE"]._serialized_end = 6964 + _globals["_SETOPERATION"]._serialized_start = 6986 + _globals["_SETOPERATION"]._serialized_end = 7465 + _globals["_SETOPERATION_SETOPTYPE"]._serialized_start = 7302 + _globals["_SETOPERATION_SETOPTYPE"]._serialized_end = 7416 + _globals["_LIMIT"]._serialized_start = 7467 + _globals["_LIMIT"]._serialized_end = 7543 + _globals["_OFFSET"]._serialized_start = 7545 + _globals["_OFFSET"]._serialized_end = 7624 + _globals["_TAIL"]._serialized_start = 7626 + _globals["_TAIL"]._serialized_end = 7701 + _globals["_AGGREGATE"]._serialized_start = 7704 + _globals["_AGGREGATE"]._serialized_end = 8470 + _globals["_AGGREGATE_PIVOT"]._serialized_start = 8119 + _globals["_AGGREGATE_PIVOT"]._serialized_end = 8230 + _globals["_AGGREGATE_GROUPINGSETS"]._serialized_start = 8232 + _globals["_AGGREGATE_GROUPINGSETS"]._serialized_end = 8308 + 
_globals["_AGGREGATE_GROUPTYPE"]._serialized_start = 8311 + _globals["_AGGREGATE_GROUPTYPE"]._serialized_end = 8470 + _globals["_SORT"]._serialized_start = 8473 + _globals["_SORT"]._serialized_end = 8633 + _globals["_DROP"]._serialized_start = 8636 + _globals["_DROP"]._serialized_end = 8777 + _globals["_DEDUPLICATE"]._serialized_start = 8780 + _globals["_DEDUPLICATE"]._serialized_end = 9020 + _globals["_LOCALRELATION"]._serialized_start = 9022 + _globals["_LOCALRELATION"]._serialized_end = 9111 + _globals["_CACHEDLOCALRELATION"]._serialized_start = 9113 + _globals["_CACHEDLOCALRELATION"]._serialized_end = 9185 + _globals["_CACHEDREMOTERELATION"]._serialized_start = 9187 + _globals["_CACHEDREMOTERELATION"]._serialized_end = 9242 + _globals["_SAMPLE"]._serialized_start = 9245 + _globals["_SAMPLE"]._serialized_end = 9518 + _globals["_RANGE"]._serialized_start = 9521 + _globals["_RANGE"]._serialized_end = 9666 + _globals["_SUBQUERYALIAS"]._serialized_start = 9668 + _globals["_SUBQUERYALIAS"]._serialized_end = 9782 + _globals["_REPARTITION"]._serialized_start = 9785 + _globals["_REPARTITION"]._serialized_end = 9927 + _globals["_SHOWSTRING"]._serialized_start = 9930 + _globals["_SHOWSTRING"]._serialized_end = 10072 + _globals["_HTMLSTRING"]._serialized_start = 10074 + _globals["_HTMLSTRING"]._serialized_end = 10188 + _globals["_STATSUMMARY"]._serialized_start = 10190 + _globals["_STATSUMMARY"]._serialized_end = 10282 + _globals["_STATDESCRIBE"]._serialized_start = 10284 + _globals["_STATDESCRIBE"]._serialized_end = 10365 + _globals["_STATCROSSTAB"]._serialized_start = 10367 + _globals["_STATCROSSTAB"]._serialized_end = 10468 + _globals["_STATCOV"]._serialized_start = 10470 + _globals["_STATCOV"]._serialized_end = 10566 + _globals["_STATCORR"]._serialized_start = 10569 + _globals["_STATCORR"]._serialized_end = 10706 + _globals["_STATAPPROXQUANTILE"]._serialized_start = 10709 + _globals["_STATAPPROXQUANTILE"]._serialized_end = 10873 + 
_globals["_STATFREQITEMS"]._serialized_start = 10875 + _globals["_STATFREQITEMS"]._serialized_end = 11000 + _globals["_STATSAMPLEBY"]._serialized_start = 11003 + _globals["_STATSAMPLEBY"]._serialized_end = 11312 + _globals["_STATSAMPLEBY_FRACTION"]._serialized_start = 11204 + _globals["_STATSAMPLEBY_FRACTION"]._serialized_end = 11303 + _globals["_NAFILL"]._serialized_start = 11315 + _globals["_NAFILL"]._serialized_end = 11449 + _globals["_NADROP"]._serialized_start = 11452 + _globals["_NADROP"]._serialized_end = 11586 + _globals["_NAREPLACE"]._serialized_start = 11589 + _globals["_NAREPLACE"]._serialized_end = 11885 + _globals["_NAREPLACE_REPLACEMENT"]._serialized_start = 11744 + _globals["_NAREPLACE_REPLACEMENT"]._serialized_end = 11885 + _globals["_TODF"]._serialized_start = 11887 + _globals["_TODF"]._serialized_end = 11975 + _globals["_WITHCOLUMNSRENAMED"]._serialized_start = 11978 + _globals["_WITHCOLUMNSRENAMED"]._serialized_end = 12360 + _globals["_WITHCOLUMNSRENAMED_RENAMECOLUMNSMAPENTRY"]._serialized_start = 12222 + _globals["_WITHCOLUMNSRENAMED_RENAMECOLUMNSMAPENTRY"]._serialized_end = 12289 + _globals["_WITHCOLUMNSRENAMED_RENAME"]._serialized_start = 12291 + _globals["_WITHCOLUMNSRENAMED_RENAME"]._serialized_end = 12360 + _globals["_WITHCOLUMNS"]._serialized_start = 12362 + _globals["_WITHCOLUMNS"]._serialized_end = 12481 + _globals["_WITHWATERMARK"]._serialized_start = 12484 + _globals["_WITHWATERMARK"]._serialized_end = 12618 + _globals["_HINT"]._serialized_start = 12621 + _globals["_HINT"]._serialized_end = 12753 + _globals["_UNPIVOT"]._serialized_start = 12756 + _globals["_UNPIVOT"]._serialized_end = 13083 + _globals["_UNPIVOT_VALUES"]._serialized_start = 13013 + _globals["_UNPIVOT_VALUES"]._serialized_end = 13072 + _globals["_TRANSPOSE"]._serialized_start = 13085 + _globals["_TRANSPOSE"]._serialized_end = 13207 + _globals["_UNRESOLVEDTABLEVALUEDFUNCTION"]._serialized_start = 13209 + _globals["_UNRESOLVEDTABLEVALUEDFUNCTION"]._serialized_end = 13334 + 
_globals["_TOSCHEMA"]._serialized_start = 13336 + _globals["_TOSCHEMA"]._serialized_end = 13442 + _globals["_REPARTITIONBYEXPRESSION"]._serialized_start = 13445 + _globals["_REPARTITIONBYEXPRESSION"]._serialized_end = 13648 + _globals["_MAPPARTITIONS"]._serialized_start = 13651 + _globals["_MAPPARTITIONS"]._serialized_end = 13883 + _globals["_GROUPMAP"]._serialized_start = 13886 + _globals["_GROUPMAP"]._serialized_end = 14603 + _globals["_COGROUPMAP"]._serialized_start = 14606 + _globals["_COGROUPMAP"]._serialized_end = 15132 + _globals["_APPLYINPANDASWITHSTATE"]._serialized_start = 15135 + _globals["_APPLYINPANDASWITHSTATE"]._serialized_end = 15492 + _globals["_COMMONINLINEUSERDEFINEDTABLEFUNCTION"]._serialized_start = 15495 + _globals["_COMMONINLINEUSERDEFINEDTABLEFUNCTION"]._serialized_end = 15739 + _globals["_PYTHONUDTF"]._serialized_start = 15742 + _globals["_PYTHONUDTF"]._serialized_end = 15919 + _globals["_COMMONINLINEUSERDEFINEDDATASOURCE"]._serialized_start = 15922 + _globals["_COMMONINLINEUSERDEFINEDDATASOURCE"]._serialized_end = 16073 + _globals["_PYTHONDATASOURCE"]._serialized_start = 16075 + _globals["_PYTHONDATASOURCE"]._serialized_end = 16150 + _globals["_COLLECTMETRICS"]._serialized_start = 16153 + _globals["_COLLECTMETRICS"]._serialized_end = 16289 + _globals["_PARSE"]._serialized_start = 16292 + _globals["_PARSE"]._serialized_end = 16680 + _globals["_PARSE_OPTIONSENTRY"]._serialized_start = 5735 + _globals["_PARSE_OPTIONSENTRY"]._serialized_end = 5793 + _globals["_PARSE_PARSEFORMAT"]._serialized_start = 16581 + _globals["_PARSE_PARSEFORMAT"]._serialized_end = 16669 + _globals["_ASOFJOIN"]._serialized_start = 16683 + _globals["_ASOFJOIN"]._serialized_end = 17158 + _globals["_LATERALJOIN"]._serialized_start = 17161 + _globals["_LATERALJOIN"]._serialized_end = 17391 # @@protoc_insertion_point(module_scope) diff --git a/python/pyspark/sql/connect/proto/relations_pb2.pyi b/python/pyspark/sql/connect/proto/relations_pb2.pyi index 
03753056c6bf1..0c8cf8dd3eda8 100644 --- a/python/pyspark/sql/connect/proto/relations_pb2.pyi +++ b/python/pyspark/sql/connect/proto/relations_pb2.pyi @@ -43,6 +43,7 @@ import google.protobuf.message import pyspark.sql.connect.proto.catalog_pb2 import pyspark.sql.connect.proto.common_pb2 import pyspark.sql.connect.proto.expressions_pb2 +import pyspark.sql.connect.proto.ml_common_pb2 import pyspark.sql.connect.proto.types_pb2 import sys import typing @@ -106,6 +107,7 @@ class Relation(google.protobuf.message.Message): WITH_RELATIONS_FIELD_NUMBER: builtins.int TRANSPOSE_FIELD_NUMBER: builtins.int UNRESOLVED_TABLE_VALUED_FUNCTION_FIELD_NUMBER: builtins.int + LATERAL_JOIN_FIELD_NUMBER: builtins.int FILL_NA_FIELD_NUMBER: builtins.int DROP_NA_FIELD_NUMBER: builtins.int REPLACE_FIELD_NUMBER: builtins.int @@ -118,6 +120,7 @@ class Relation(google.protobuf.message.Message): FREQ_ITEMS_FIELD_NUMBER: builtins.int SAMPLE_BY_FIELD_NUMBER: builtins.int CATALOG_FIELD_NUMBER: builtins.int + ML_RELATION_FIELD_NUMBER: builtins.int EXTENSION_FIELD_NUMBER: builtins.int UNKNOWN_FIELD_NUMBER: builtins.int @property @@ -211,6 +214,8 @@ class Relation(google.protobuf.message.Message): @property def unresolved_table_valued_function(self) -> global___UnresolvedTableValuedFunction: ... @property + def lateral_join(self) -> global___LateralJoin: ... + @property def fill_na(self) -> global___NAFill: """NA functions""" @property @@ -238,6 +243,9 @@ class Relation(google.protobuf.message.Message): def catalog(self) -> pyspark.sql.connect.proto.catalog_pb2.Catalog: """Catalog API (experimental / unstable)""" @property + def ml_relation(self) -> global___MlRelation: + """ML relation""" + @property def extension(self) -> google.protobuf.any_pb2.Any: """This field is used to mark extensions to the protocol. When plugins generate arbitrary relations they can add them here. During the planning the correct resolution is done. 
@@ -292,6 +300,7 @@ class Relation(google.protobuf.message.Message): with_relations: global___WithRelations | None = ..., transpose: global___Transpose | None = ..., unresolved_table_valued_function: global___UnresolvedTableValuedFunction | None = ..., + lateral_join: global___LateralJoin | None = ..., fill_na: global___NAFill | None = ..., drop_na: global___NADrop | None = ..., replace: global___NAReplace | None = ..., @@ -304,6 +313,7 @@ class Relation(google.protobuf.message.Message): freq_items: global___StatFreqItems | None = ..., sample_by: global___StatSampleBy | None = ..., catalog: pyspark.sql.connect.proto.catalog_pb2.Catalog | None = ..., + ml_relation: global___MlRelation | None = ..., extension: google.protobuf.any_pb2.Any | None = ..., unknown: global___Unknown | None = ..., ) -> None: ... @@ -364,12 +374,16 @@ class Relation(google.protobuf.message.Message): b"html_string", "join", b"join", + "lateral_join", + b"lateral_join", "limit", b"limit", "local_relation", b"local_relation", "map_partitions", b"map_partitions", + "ml_relation", + b"ml_relation", "offset", b"offset", "parse", @@ -485,12 +499,16 @@ class Relation(google.protobuf.message.Message): b"html_string", "join", b"join", + "lateral_join", + b"lateral_join", "limit", b"limit", "local_relation", b"local_relation", "map_partitions", b"map_partitions", + "ml_relation", + b"ml_relation", "offset", b"offset", "parse", @@ -595,6 +613,7 @@ class Relation(google.protobuf.message.Message): "with_relations", "transpose", "unresolved_table_valued_function", + "lateral_join", "fill_na", "drop_na", "replace", @@ -607,6 +626,7 @@ class Relation(google.protobuf.message.Message): "freq_items", "sample_by", "catalog", + "ml_relation", "extension", "unknown", ] @@ -615,6 +635,198 @@ class Relation(google.protobuf.message.Message): global___Relation = Relation +class MlRelation(google.protobuf.message.Message): + """Relation to represent ML world""" + + DESCRIPTOR: google.protobuf.descriptor.Descriptor + + 
class Transform(google.protobuf.message.Message): + """Relation to represent transform(input) of the operator + which could be a cached model or a new transformer + """ + + DESCRIPTOR: google.protobuf.descriptor.Descriptor + + OBJ_REF_FIELD_NUMBER: builtins.int + TRANSFORMER_FIELD_NUMBER: builtins.int + INPUT_FIELD_NUMBER: builtins.int + PARAMS_FIELD_NUMBER: builtins.int + @property + def obj_ref(self) -> pyspark.sql.connect.proto.ml_common_pb2.ObjectRef: + """Object reference""" + @property + def transformer(self) -> pyspark.sql.connect.proto.ml_common_pb2.MlOperator: + """Could be an ML transformer like VectorAssembler""" + @property + def input(self) -> global___Relation: + """the input dataframe""" + @property + def params(self) -> pyspark.sql.connect.proto.ml_common_pb2.MlParams: + """the operator specific parameters""" + def __init__( + self, + *, + obj_ref: pyspark.sql.connect.proto.ml_common_pb2.ObjectRef | None = ..., + transformer: pyspark.sql.connect.proto.ml_common_pb2.MlOperator | None = ..., + input: global___Relation | None = ..., + params: pyspark.sql.connect.proto.ml_common_pb2.MlParams | None = ..., + ) -> None: ... + def HasField( + self, + field_name: typing_extensions.Literal[ + "input", + b"input", + "obj_ref", + b"obj_ref", + "operator", + b"operator", + "params", + b"params", + "transformer", + b"transformer", + ], + ) -> builtins.bool: ... + def ClearField( + self, + field_name: typing_extensions.Literal[ + "input", + b"input", + "obj_ref", + b"obj_ref", + "operator", + b"operator", + "params", + b"params", + "transformer", + b"transformer", + ], + ) -> None: ... + def WhichOneof( + self, oneof_group: typing_extensions.Literal["operator", b"operator"] + ) -> typing_extensions.Literal["obj_ref", "transformer"] | None: ... + + TRANSFORM_FIELD_NUMBER: builtins.int + FETCH_FIELD_NUMBER: builtins.int + @property + def transform(self) -> global___MlRelation.Transform: ... + @property + def fetch(self) -> global___Fetch: ... 
+ def __init__( + self, + *, + transform: global___MlRelation.Transform | None = ..., + fetch: global___Fetch | None = ..., + ) -> None: ... + def HasField( + self, + field_name: typing_extensions.Literal[ + "fetch", b"fetch", "ml_type", b"ml_type", "transform", b"transform" + ], + ) -> builtins.bool: ... + def ClearField( + self, + field_name: typing_extensions.Literal[ + "fetch", b"fetch", "ml_type", b"ml_type", "transform", b"transform" + ], + ) -> None: ... + def WhichOneof( + self, oneof_group: typing_extensions.Literal["ml_type", b"ml_type"] + ) -> typing_extensions.Literal["transform", "fetch"] | None: ... + +global___MlRelation = MlRelation + +class Fetch(google.protobuf.message.Message): + """Message for fetching attribute from object on the server side. + Fetch can be represented as a Relation or a ML command + Command: model.coefficients, model.summary.weightedPrecision which + returns the final literal result + Relation: model.summary.roc which returns a DataFrame (Relation) + """ + + DESCRIPTOR: google.protobuf.descriptor.Descriptor + + class Method(google.protobuf.message.Message): + """Represents a method with inclusion of method name and its arguments""" + + DESCRIPTOR: google.protobuf.descriptor.Descriptor + + class Args(google.protobuf.message.Message): + DESCRIPTOR: google.protobuf.descriptor.Descriptor + + PARAM_FIELD_NUMBER: builtins.int + INPUT_FIELD_NUMBER: builtins.int + @property + def param(self) -> pyspark.sql.connect.proto.ml_common_pb2.Param: ... + @property + def input(self) -> global___Relation: ... + def __init__( + self, + *, + param: pyspark.sql.connect.proto.ml_common_pb2.Param | None = ..., + input: global___Relation | None = ..., + ) -> None: ... + def HasField( + self, + field_name: typing_extensions.Literal[ + "args_type", b"args_type", "input", b"input", "param", b"param" + ], + ) -> builtins.bool: ... 
+ def ClearField( + self, + field_name: typing_extensions.Literal[ + "args_type", b"args_type", "input", b"input", "param", b"param" + ], + ) -> None: ... + def WhichOneof( + self, oneof_group: typing_extensions.Literal["args_type", b"args_type"] + ) -> typing_extensions.Literal["param", "input"] | None: ... + + METHOD_FIELD_NUMBER: builtins.int + ARGS_FIELD_NUMBER: builtins.int + method: builtins.str + """(Required) the method name""" + @property + def args( + self, + ) -> google.protobuf.internal.containers.RepeatedCompositeFieldContainer[ + global___Fetch.Method.Args + ]: + """(Optional) the arguments of the method""" + def __init__( + self, + *, + method: builtins.str = ..., + args: collections.abc.Iterable[global___Fetch.Method.Args] | None = ..., + ) -> None: ... + def ClearField( + self, field_name: typing_extensions.Literal["args", b"args", "method", b"method"] + ) -> None: ... + + OBJ_REF_FIELD_NUMBER: builtins.int + METHODS_FIELD_NUMBER: builtins.int + @property + def obj_ref(self) -> pyspark.sql.connect.proto.ml_common_pb2.ObjectRef: + """(Required) reference to the object on the server side""" + @property + def methods( + self, + ) -> google.protobuf.internal.containers.RepeatedCompositeFieldContainer[global___Fetch.Method]: + """(Required) the calling method chains""" + def __init__( + self, + *, + obj_ref: pyspark.sql.connect.proto.ml_common_pb2.ObjectRef | None = ..., + methods: collections.abc.Iterable[global___Fetch.Method] | None = ..., + ) -> None: ... + def HasField( + self, field_name: typing_extensions.Literal["obj_ref", b"obj_ref"] + ) -> builtins.bool: ... + def ClearField( + self, field_name: typing_extensions.Literal["methods", b"methods", "obj_ref", b"obj_ref"] + ) -> None: ... 
+ +global___Fetch = Fetch + class Unknown(google.protobuf.message.Message): """Used for testing purposes only.""" @@ -3400,6 +3612,7 @@ class GroupMap(google.protobuf.message.Message): IS_MAP_GROUPS_WITH_STATE_FIELD_NUMBER: builtins.int OUTPUT_MODE_FIELD_NUMBER: builtins.int TIMEOUT_CONF_FIELD_NUMBER: builtins.int + STATE_SCHEMA_FIELD_NUMBER: builtins.int @property def input(self) -> global___Relation: """(Required) Input relation for Group Map API: apply, applyInPandas.""" @@ -3438,6 +3651,9 @@ class GroupMap(google.protobuf.message.Message): """(Optional) The output mode of the function.""" timeout_conf: builtins.str """(Optional) Timeout configuration for groups that do not receive data for a while.""" + @property + def state_schema(self) -> pyspark.sql.connect.proto.types_pb2.DataType: + """(Optional) The schema for the grouped state.""" def __init__( self, *, @@ -3460,6 +3676,7 @@ class GroupMap(google.protobuf.message.Message): is_map_groups_with_state: builtins.bool | None = ..., output_mode: builtins.str | None = ..., timeout_conf: builtins.str | None = ..., + state_schema: pyspark.sql.connect.proto.types_pb2.DataType | None = ..., ) -> None: ... 
def HasField( self, @@ -3468,6 +3685,8 @@ class GroupMap(google.protobuf.message.Message): b"_is_map_groups_with_state", "_output_mode", b"_output_mode", + "_state_schema", + b"_state_schema", "_timeout_conf", b"_timeout_conf", "func", @@ -3480,6 +3699,8 @@ class GroupMap(google.protobuf.message.Message): b"is_map_groups_with_state", "output_mode", b"output_mode", + "state_schema", + b"state_schema", "timeout_conf", b"timeout_conf", ], @@ -3491,6 +3712,8 @@ class GroupMap(google.protobuf.message.Message): b"_is_map_groups_with_state", "_output_mode", b"_output_mode", + "_state_schema", + b"_state_schema", "_timeout_conf", b"_timeout_conf", "func", @@ -3509,6 +3732,8 @@ class GroupMap(google.protobuf.message.Message): b"output_mode", "sorting_expressions", b"sorting_expressions", + "state_schema", + b"state_schema", "timeout_conf", b"timeout_conf", ], @@ -3525,6 +3750,10 @@ class GroupMap(google.protobuf.message.Message): self, oneof_group: typing_extensions.Literal["_output_mode", b"_output_mode"] ) -> typing_extensions.Literal["output_mode"] | None: ... @typing.overload + def WhichOneof( + self, oneof_group: typing_extensions.Literal["_state_schema", b"_state_schema"] + ) -> typing_extensions.Literal["state_schema"] | None: ... + @typing.overload def WhichOneof( self, oneof_group: typing_extensions.Literal["_timeout_conf", b"_timeout_conf"] ) -> typing_extensions.Literal["timeout_conf"] | None: ... @@ -4109,3 +4338,56 @@ class AsOfJoin(google.protobuf.message.Message): ) -> None: ... global___AsOfJoin = AsOfJoin + +class LateralJoin(google.protobuf.message.Message): + """Relation of type [[LateralJoin]]. + + `left` and `right` must be present. 
+ """ + + DESCRIPTOR: google.protobuf.descriptor.Descriptor + + LEFT_FIELD_NUMBER: builtins.int + RIGHT_FIELD_NUMBER: builtins.int + JOIN_CONDITION_FIELD_NUMBER: builtins.int + JOIN_TYPE_FIELD_NUMBER: builtins.int + @property + def left(self) -> global___Relation: + """(Required) Left input relation for a Join.""" + @property + def right(self) -> global___Relation: + """(Required) Right input relation for a Join.""" + @property + def join_condition(self) -> pyspark.sql.connect.proto.expressions_pb2.Expression: + """(Optional) The join condition.""" + join_type: global___Join.JoinType.ValueType + """(Required) The join type.""" + def __init__( + self, + *, + left: global___Relation | None = ..., + right: global___Relation | None = ..., + join_condition: pyspark.sql.connect.proto.expressions_pb2.Expression | None = ..., + join_type: global___Join.JoinType.ValueType = ..., + ) -> None: ... + def HasField( + self, + field_name: typing_extensions.Literal[ + "join_condition", b"join_condition", "left", b"left", "right", b"right" + ], + ) -> builtins.bool: ... + def ClearField( + self, + field_name: typing_extensions.Literal[ + "join_condition", + b"join_condition", + "join_type", + b"join_type", + "left", + b"left", + "right", + b"right", + ], + ) -> None: ... 
+ +global___LateralJoin = LateralJoin diff --git a/python/pyspark/sql/connect/readwriter.py b/python/pyspark/sql/connect/readwriter.py index aeb0f98d71076..6cc38aca4fc4c 100644 --- a/python/pyspark/sql/connect/readwriter.py +++ b/python/pyspark/sql/connect/readwriter.py @@ -751,7 +751,7 @@ def parquet( self.mode(mode) if partitionBy is not None: self.partitionBy(partitionBy) - self.option("compression", compression) + self._set_opts(compression=compression) self.format("parquet").save(path) parquet.__doc__ = PySparkDataFrameWriter.parquet.__doc__ diff --git a/python/pyspark/sql/connect/session.py b/python/pyspark/sql/connect/session.py index bfd79092ccf4d..59349a17886bb 100644 --- a/python/pyspark/sql/connect/session.py +++ b/python/pyspark/sql/connect/session.py @@ -113,13 +113,6 @@ from pyspark.sql.connect.shell.progress import ProgressHandler from pyspark.sql.connect.datasource import DataSourceRegistration -try: - import memory_profiler # noqa: F401 - - has_memory_profiler = True -except Exception: - has_memory_profiler = False - class SparkSession: # The active SparkSession for the current thread @@ -207,34 +200,26 @@ def _apply_options(self, session: "SparkSession") -> None: for i in range(int(os.environ.get("PYSPARK_REMOTE_INIT_CONF_LEN", "0"))): init_opts = json.loads(os.environ[f"PYSPARK_REMOTE_INIT_CONF_{i}"]) + # The options are applied after session creation, + # so options ["spark.remote", "spark.master"] always take no effect. + invalid_opts = ["spark.remote", "spark.master"] + with self._lock: + opts = {} + + # Only attempts to set Spark SQL configurations. + # If the configurations are static, it might throw an exception so + # simply ignore it for now. for k, v in init_opts.items(): - # the options are applied after session creation, - # so following options always take no effect - if k not in [ - "spark.remote", - "spark.master", - ] and k.startswith("spark.sql."): - # Only attempts to set Spark SQL configurations. 
- # If the configurations are static, it might throw an exception so - # simply ignore it for now. - try: - session.conf.set(k, v) - except Exception as e: - logger.warn(f"Failed to set configuration {k} due to {e}") + if k not in invalid_opts and k.startswith("spark.sql."): + opts[k] = v - with self._lock: for k, v in self._options.items(): - # the options are applied after session creation, - # so following options always take no effect - if k not in [ - "spark.remote", - "spark.master", - ]: - try: - session.conf.set(k, v) - except Exception as e: - logger.warn(f"Failed to set configuration {k} due to {e}") + if k not in invalid_opts: + opts[k] = v + + if len(opts) > 0: + session.conf._set_all(configs=opts, silent=True) def create(self) -> "SparkSession": has_channel_builder = self._channel_builder is not None @@ -797,13 +782,11 @@ def range( range.__doc__ = PySparkSession.range.__doc__ - @property + @functools.cached_property def catalog(self) -> "Catalog": from pyspark.sql.connect.catalog import Catalog - if not hasattr(self, "_catalog"): - self._catalog = Catalog(self) - return self._catalog + return Catalog(self) catalog.__doc__ = PySparkSession.catalog.__doc__ @@ -1051,7 +1034,7 @@ def _start_connect_server(master: str, opts: Dict[str, Any]) -> None: default_conf = { "spark.plugins": "org.apache.spark.sql.connect.SparkConnectPlugin", "spark.sql.artifact.isolation.enabled": "true", - "spark.sql.artifact.isolation.always.apply.classloader": "true", + "spark.sql.artifact.isolation.alwaysApplyClassloader": "true", } if "SPARK_TESTING" in os.environ: @@ -1120,6 +1103,16 @@ def creator(old_session_id: str) -> "SparkSession": return creator, (self._session_id,) + def _to_ddl(self, struct: StructType) -> str: + ddl = self._client._analyze(method="json_to_ddl", json_string=struct.json()).ddl_string + assert ddl is not None + return ddl + + def _parse_ddl(self, ddl: str) -> DataType: + dt = self._client._analyze(method="ddl_parse", ddl_string=ddl).parsed + assert dt 
is not None + return dt + SparkSession.__doc__ = PySparkSession.__doc__ diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py index 8a5b982bc7f23..2d12704485ad2 100644 --- a/python/pyspark/sql/dataframe.py +++ b/python/pyspark/sql/dataframe.py @@ -42,6 +42,7 @@ from pyspark.sql.readwriter import DataFrameWriter, DataFrameWriterV2 from pyspark.sql.merge import MergeIntoWriter from pyspark.sql.streaming import DataStreamWriter +from pyspark.sql.table_arg import TableArg from pyspark.sql.types import StructType, Row from pyspark.sql.utils import dispatch_df_method @@ -2276,6 +2277,28 @@ def columns(self) -> List[str]: """ ... + @dispatch_df_method + def metadataColumn(self, colName: str) -> Column: + """ + Selects a metadata column based on its logical column name and returns it as a + :class:`Column`. + + A metadata column can be accessed this way even if the underlying data source defines a data + column with a conflicting name. + + .. versionadded:: 4.0.0 + + Parameters + ---------- + colName : str + string, metadata column name + + Returns + ------- + :class:`Column` + """ + ... + @dispatch_df_method def colRegex(self, colName: str) -> Column: """ @@ -2549,7 +2572,7 @@ def join( pyspark.errors.exceptions.captured.AnalysisException: Column name#0 are ambiguous... A better approach is to assign aliases to the dataframes, and then reference - the ouptut columns from the join operation using these aliases: + the output columns from the join operation using these aliases: >>> df.alias("a").join( ... df.alias("b"), sf.col("a.name") == sf.col("b.name"), "outer" @@ -2629,6 +2652,108 @@ def join( """ ... + def lateralJoin( + self, + other: "DataFrame", + on: Optional[Column] = None, + how: Optional[str] = None, + ) -> "DataFrame": + """ + Lateral joins with another :class:`DataFrame`, using the given join expression. 
+ + A lateral join (also known as a correlated join) is a type of join where each row from + one DataFrame is used as input to a subquery or a derived table that computes a result + specific to that row. The right side `DataFrame` can reference columns from the current + row of the left side `DataFrame`, allowing for more complex and context-dependent results + than a standard join. + + .. versionadded:: 4.0.0 + + Parameters + ---------- + other : :class:`DataFrame` + Right side of the join + on : :class:`Column`, optional + a join expression (Column). + how : str, optional + default ``inner``. Must be one of: ``inner``, ``cross``, ``left``, ``leftouter``, + and ``left_outer``. + + Returns + ------- + :class:`DataFrame` + Joined DataFrame. + + Examples + -------- + Setup a sample DataFrame. + + >>> from pyspark.sql import functions as sf + >>> from pyspark.sql import Row + >>> customers_data = [ + ... Row(customer_id=1, name="Alice"), Row(customer_id=2, name="Bob"), + ... Row(customer_id=3, name="Charlie"), Row(customer_id=4, name="Diana") + ... ] + >>> customers = spark.createDataFrame(customers_data) + >>> orders_data = [ + ... Row(order_id=101, customer_id=1, order_date="2024-01-10", + ... items=[Row(product="laptop", quantity=5), Row(product="mouse", quantity=12)]), + ... Row(order_id=102, customer_id=1, order_date="2024-02-15", + ... items=[Row(product="phone", quantity=2), Row(product="charger", quantity=15)]), + ... Row(order_id=105, customer_id=1, order_date="2024-03-20", + ... items=[Row(product="tablet", quantity=4)]), + ... Row(order_id=103, customer_id=2, order_date="2024-01-12", + ... items=[Row(product="tablet", quantity=8)]), + ... Row(order_id=104, customer_id=2, order_date="2024-03-05", + ... items=[Row(product="laptop", quantity=7)]), + ... Row(order_id=106, customer_id=3, order_date="2024-04-05", + ... items=[Row(product="monitor", quantity=1)]), + ... 
] + >>> orders = spark.createDataFrame(orders_data) + + Example 1 (use TVF): Expanding Items in Each Order into Separate Rows + + >>> customers.join(orders, "customer_id").lateralJoin( + ... spark.tvf.explode(sf.col("items").outer()).select("col.*") + ... ).select( + ... "customer_id", "name", "order_id", "order_date", "product", "quantity" + ... ).orderBy("customer_id", "order_id", "product").show() + +-----------+-------+--------+----------+-------+--------+ + |customer_id| name|order_id|order_date|product|quantity| + +-----------+-------+--------+----------+-------+--------+ + | 1| Alice| 101|2024-01-10| laptop| 5| + | 1| Alice| 101|2024-01-10| mouse| 12| + | 1| Alice| 102|2024-02-15|charger| 15| + | 1| Alice| 102|2024-02-15| phone| 2| + | 1| Alice| 105|2024-03-20| tablet| 4| + | 2| Bob| 103|2024-01-12| tablet| 8| + | 2| Bob| 104|2024-03-05| laptop| 7| + | 3|Charlie| 106|2024-04-05|monitor| 1| + +-----------+-------+--------+----------+-------+--------+ + + Example 2 (use subquery): Finding the Two Most Recent Orders for Customer + + >>> customers.alias("c").lateralJoin( + ... orders.alias("o") + ... .where(sf.col("o.customer_id") == sf.col("c.customer_id").outer()) + ... .select("order_id", "order_date") + ... .orderBy(sf.col("order_date").desc()) + ... .limit(2), + ... how="left" + ... ).orderBy("customer_id", "order_id").show() + +-----------+-------+--------+----------+ + |customer_id| name|order_id|order_date| + +-----------+-------+--------+----------+ + | 1| Alice| 102|2024-02-15| + | 1| Alice| 105|2024-03-20| + | 2| Bob| 103|2024-01-12| + | 2| Bob| 104|2024-03-05| + | 3|Charlie| 106|2024-04-05| + | 4| Diana| NULL| NULL| + +-----------+-------+--------+----------+ + """ + ... + # TODO(SPARK-22947): Fix the DataFrame API. @dispatch_df_method def _joinAsOf( @@ -3907,7 +4032,7 @@ def groupingSets( groupingSets : sequence of sequence of columns or str Individual set of columns to group on. 
cols : :class:`Column` or str - Addional grouping columns specified by users. + Additional grouping columns specified by users. Those columns are shown as the output columns after aggregation. Returns @@ -6476,6 +6601,29 @@ def transpose(self, indexColumn: Optional["ColumnOrName"] = None) -> "DataFrame" """ ... + def asTable(self) -> TableArg: + """ + Converts the DataFrame into a `TableArg` object, which can be used as a table argument + in a user-defined table function (UDTF). + + After obtaining a TableArg from a DataFrame using this method, you can specify partitioning + and ordering for the table argument by calling methods such as `partitionBy`, `orderBy`, and + `withSinglePartition` on the `TableArg` instance. + - partitionBy: Partitions the data based on the specified columns. This method cannot + be called after withSinglePartition() has been called. + - orderBy: Orders the data within partitions based on the specified columns. + - withSinglePartition: Indicates that the data should be treated as a single partition. + This method cannot be called after partitionBy() has been called. + + .. versionadded:: 4.0.0 + + Returns + ------- + :class:`TableArg` + A `TableArg` object representing a table argument. + """ + ... + def scalar(self) -> Column: """ Return a `Column` object for a SCALAR Subquery containing exactly one row and one column. @@ -6509,7 +6657,7 @@ def scalar(self) -> Column: >>> from pyspark.sql import functions as sf >>> employees.where( ... sf.col("salary") > employees.select(sf.avg("salary")).scalar() - ... ).select("name", "salary", "department_id").show() + ... ).select("name", "salary", "department_id").orderBy("name").show() +-----+------+-------------+ | name|salary|department_id| +-----+------+-------------+ @@ -6522,11 +6670,12 @@ def scalar(self) -> Column: in their department. >>> from pyspark.sql import functions as sf - >>> employees.where( + >>> employees.alias("e1").where( ... sf.col("salary") - ... 
> employees.where(sf.col("department_id") == sf.col("department_id").outer()) - ... .select(sf.avg("salary")).scalar() - ... ).select("name", "salary", "department_id").show() + ... > employees.alias("e2").where( + ... sf.col("e2.department_id") == sf.col("e1.department_id").outer() + ... ).select(sf.avg("salary")).scalar() + ... ).select("name", "salary", "department_id").orderBy("name").show() +-----+------+-------------+ | name|salary|department_id| +-----+------+-------------+ @@ -6538,23 +6687,24 @@ def scalar(self) -> Column: department. >>> from pyspark.sql import functions as sf - >>> employees.select( + >>> employees.alias("e1").select( ... "name", "salary", "department_id", ... sf.format_number( ... sf.lit(100) * sf.col("salary") / - ... employees.where(sf.col("department_id") == sf.col("department_id").outer()) - ... .select(sf.sum("salary")).scalar().alias("avg_salary"), + ... employees.alias("e2").where( + ... sf.col("e2.department_id") == sf.col("e1.department_id").outer() + ... ).select(sf.sum("salary")).scalar().alias("avg_salary"), ... 1 ... ).alias("salary_proportion_in_department") - ... ).show() + ... ).orderBy("name").show() +-------+------+-------------+-------------------------------+ | name|salary|department_id|salary_proportion_in_department| +-------+------+-------------+-------------------------------+ | Alice| 45000| 101| 30.6| | Bob| 54000| 101| 36.7| |Charlie| 29000| 102| 32.2| - | Eve| 48000| 101| 32.7| | David| 61000| 102| 67.8| + | Eve| 48000| 101| 32.7| +-------+------+-------------+-------------------------------+ """ ... @@ -6595,8 +6745,10 @@ def exists(self) -> Column: Example 1: Filter for customers who have placed at least one order. >>> from pyspark.sql import functions as sf - >>> customers.where( - ... orders.where(sf.col("customer_id") == sf.col("customer_id").outer()).exists() + >>> customers.alias("c").where( + ... orders.alias("o").where( + ... sf.col("o.customer_id") == sf.col("c.customer_id").outer() + ... 
).exists() ... ).orderBy("customer_id").show() +-----------+-------------+-------+ |customer_id|customer_name|country| @@ -6609,8 +6761,10 @@ def exists(self) -> Column: Example 2: Filter for customers who have never placed an order. >>> from pyspark.sql import functions as sf - >>> customers.where( - ... ~orders.where(sf.col("customer_id") == sf.col("customer_id").outer()).exists() + >>> customers.alias("c").where( + ... ~orders.alias("o").where( + ... sf.col("o.customer_id") == sf.col("c.customer_id").outer() + ... ).exists() ... ).orderBy("customer_id").show() +-----------+-------------+---------+ |customer_id|customer_name| country| @@ -6621,9 +6775,9 @@ def exists(self) -> Column: Example 3: Find Orders from Customers in the USA. >>> from pyspark.sql import functions as sf - >>> orders.where( - ... customers.where( - ... (sf.col("customer_id") == sf.col("customer_id").outer()) + >>> orders.alias("o").where( + ... customers.alias("c").where( + ... (sf.col("c.customer_id") == sf.col("o.customer_id").outer()) ... & (sf.col("country") == "USA") ... ).exists() ... ).orderBy("order_id").show() @@ -6676,6 +6830,9 @@ def plot(self) -> "PySparkPlotAccessor": Notes ----- This API is experimental. + It provides two ways to create plots: + 1. Chaining style (e.g., `df.plot.line(...)`). + 2. Explicit style (e.g., `df.plot(kind="line", ...)`). Examples -------- @@ -6685,6 +6842,7 @@ def plot(self) -> "PySparkPlotAccessor": >>> type(df.plot) >>> df.plot.line(x="category", y=["int_val", "float_val"]) # doctest: +SKIP + >>> df.plot(kind="line", x="category", y=["int_val", "float_val"]) # doctest: +SKIP """ ... 
diff --git a/python/pyspark/sql/datasource.py b/python/pyspark/sql/datasource.py index a51c96a9d178f..651e84e84390e 100644 --- a/python/pyspark/sql/datasource.py +++ b/python/pyspark/sql/datasource.py @@ -32,6 +32,7 @@ "DataSourceStreamReader", "SimpleDataSourceStreamReader", "DataSourceWriter", + "DataSourceArrowWriter", "DataSourceStreamWriter", "DataSourceRegistration", "InputPartition", @@ -666,6 +667,44 @@ def abort(self, messages: List[Optional["WriterCommitMessage"]]) -> None: ... +class DataSourceArrowWriter(DataSourceWriter): + """ + A base class for data source writers that process data using PyArrow’s `RecordBatch`. + + Unlike :class:`DataSourceWriter`, which works with an iterator of Spark Rows, this class + is optimized for using the Arrow format when writing data. It can offer better performance + when interfacing with systems or libraries that natively support Arrow. + + .. versionadded: 4.0.0 + """ + + @abstractmethod + def write(self, iterator: Iterator["RecordBatch"]) -> "WriterCommitMessage": + """ + Writes an iterator of PyArrow `RecordBatch` objects to the sink. + + This method is called once on each executor to write data to the data source. + It accepts an iterator of PyArrow `RecordBatch`\\s and returns a single row + representing a commit message, or None if there is no commit message. + + The driver collects commit messages, if any, from all executors and passes them + to the :class:`DataSourceWriter.commit` method if all tasks run successfully. If any + task fails, the :class:`DataSourceWriter.abort` method will be called with the + collected commit messages. + + Parameters + ---------- + iterator : iterator of :class:`RecordBatch`\\s + An iterator of PyArrow `RecordBatch` objects representing the input data. + + Returns + ------- + :class:`WriterCommitMessage` + a serializable commit message + """ + ... + + class DataSourceStreamWriter(ABC): """ A base class for data stream writers. 
Data stream writers are responsible for writing @@ -783,9 +822,9 @@ def register( wrapped = _wrap_function(sc, dataSource) assert sc._jvm is not None jvm = sc._jvm - ds = jvm.org.apache.spark.sql.execution.datasources.v2.python.UserDefinedPythonDataSource( - wrapped - ) + ds = getattr( + jvm, "org.apache.spark.sql.execution.datasources.v2.python.UserDefinedPythonDataSource" + )(wrapped) self.sparkSession._jsparkSession.dataSource().registerPython(name, ds) diff --git a/python/pyspark/sql/functions/__init__.py b/python/pyspark/sql/functions/__init__.py index dd09c4aa5c774..fc0120bc681d8 100644 --- a/python/pyspark/sql/functions/__init__.py +++ b/python/pyspark/sql/functions/__init__.py @@ -19,3 +19,491 @@ from pyspark.sql.functions.builtin import * # noqa: F401,F403 from pyspark.sql.functions import partitioning # noqa: F401,F403 + +__all__ = [ # noqa: F405 + # Normal functions + "broadcast", + "call_function", + "col", + "column", + "lit", + "expr", + # Conditional Functions + "coalesce", + "ifnull", + "nanvl", + "nullif", + "nullifzero", + "nvl", + "nvl2", + "when", + "zeroifnull", + # Predicate Functions + "equal_null", + "ilike", + "isnan", + "isnotnull", + "isnull", + "like", + "regexp", + "regexp_like", + "rlike", + # Sort Functions + "asc", + "asc_nulls_first", + "asc_nulls_last", + "desc", + "desc_nulls_first", + "desc_nulls_last", + # Mathematical Functions + "abs", + "acos", + "acosh", + "asin", + "asinh", + "atan", + "atan2", + "atanh", + "bin", + "bround", + "cbrt", + "ceil", + "ceiling", + "conv", + "cos", + "cosh", + "cot", + "csc", + "degrees", + "e", + "exp", + "expm1", + "factorial", + "floor", + "greatest", + "hex", + "hypot", + "least", + "ln", + "log", + "log10", + "log1p", + "log2", + "negate", + "negative", + "pi", + "pmod", + "positive", + "pow", + "power", + "radians", + "rand", + "randn", + "rint", + "round", + "sec", + "sign", + "signum", + "sin", + "sinh", + "sqrt", + "tan", + "tanh", + "try_add", + "try_divide", + "try_mod", + 
"try_multiply", + "try_subtract", + "unhex", + "uniform", + "width_bucket", + # String Functions + "ascii", + "base64", + "bit_length", + "btrim", + "char", + "char_length", + "character_length", + "collate", + "collation", + "concat_ws", + "contains", + "decode", + "elt", + "encode", + "endswith", + "find_in_set", + "format_number", + "format_string", + "initcap", + "instr", + "is_valid_utf8", + "lcase", + "left", + "length", + "levenshtein", + "locate", + "lower", + "lpad", + "ltrim", + "make_valid_utf8", + "mask", + "octet_length", + "overlay", + "position", + "printf", + "randstr", + "regexp_count", + "regexp_extract", + "regexp_extract_all", + "regexp_instr", + "regexp_replace", + "regexp_substr", + "repeat", + "replace", + "right", + "rpad", + "rtrim", + "sentences", + "soundex", + "split", + "split_part", + "startswith", + "substr", + "substring", + "substring_index", + "to_binary", + "to_char", + "to_number", + "to_varchar", + "translate", + "trim", + "try_to_binary", + "try_to_number", + "try_validate_utf8", + "ucase", + "unbase64", + "upper", + "validate_utf8", + # Bitwise Functions + "bit_count", + "bit_get", + "bitwise_not", + "getbit", + "shiftleft", + "shiftright", + "shiftrightunsigned", + # Date and Timestamp Functions + "add_months", + "convert_timezone", + "curdate", + "current_date", + "current_timestamp", + "current_timezone", + "date_add", + "date_diff", + "date_format", + "date_from_unix_date", + "date_part", + "date_sub", + "date_trunc", + "dateadd", + "datediff", + "datepart", + "day", + "dayname", + "dayofmonth", + "dayofweek", + "dayofyear", + "extract", + "from_unixtime", + "from_utc_timestamp", + "hour", + "last_day", + "localtimestamp", + "make_date", + "make_dt_interval", + "make_interval", + "make_timestamp", + "make_timestamp_ltz", + "make_timestamp_ntz", + "make_ym_interval", + "minute", + "month", + "monthname", + "months_between", + "next_day", + "now", + "quarter", + "second", + "session_window", + "timestamp_add", + 
"timestamp_diff", + "timestamp_micros", + "timestamp_millis", + "timestamp_seconds", + "to_date", + "to_timestamp", + "to_timestamp_ltz", + "to_timestamp_ntz", + "to_unix_timestamp", + "to_utc_timestamp", + "trunc", + "try_make_interval", + "try_make_timestamp", + "try_make_timestamp_ltz", + "try_make_timestamp_ntz", + "try_to_timestamp", + "unix_date", + "unix_micros", + "unix_millis", + "unix_seconds", + "unix_timestamp", + "weekday", + "weekofyear", + "window", + "window_time", + "year", + # Hash Functions + "crc32", + "hash", + "md5", + "sha", + "sha1", + "sha2", + "xxhash64", + # Collection Functions + "aggregate", + "array_sort", + "cardinality", + "concat", + "element_at", + "exists", + "filter", + "forall", + "map_filter", + "map_zip_with", + "reduce", + "reverse", + "size", + "transform", + "transform_keys", + "transform_values", + "try_element_at", + "zip_with", + # Array Functions + "array", + "array_append", + "array_compact", + "array_contains", + "array_distinct", + "array_except", + "array_insert", + "array_intersect", + "array_join", + "array_max", + "array_min", + "array_position", + "array_prepend", + "array_remove", + "array_repeat", + "array_size", + "array_union", + "arrays_overlap", + "arrays_zip", + "flatten", + "get", + "sequence", + "shuffle", + "slice", + "sort_array", + # Struct Functions + "named_struct", + "struct", + # Map Functions + "create_map", + "map_concat", + "map_contains_key", + "map_entries", + "map_from_arrays", + "map_from_entries", + "map_keys", + "map_values", + "str_to_map", + # Aggregate Functions + "any_value", + "approx_count_distinct", + "approx_percentile", + "array_agg", + "avg", + "bit_and", + "bit_or", + "bit_xor", + "bitmap_construct_agg", + "bitmap_or_agg", + "bool_and", + "bool_or", + "collect_list", + "collect_set", + "corr", + "count", + "count_distinct", + "count_if", + "count_min_sketch", + "covar_pop", + "covar_samp", + "every", + "first", + "first_value", + "grouping", + "grouping_id", + 
"histogram_numeric", + "hll_sketch_agg", + "hll_union_agg", + "kurtosis", + "last", + "last_value", + "listagg", + "listagg_distinct", + "max", + "max_by", + "mean", + "median", + "min", + "min_by", + "mode", + "percentile", + "percentile_approx", + "product", + "regr_avgx", + "regr_avgy", + "regr_count", + "regr_intercept", + "regr_r2", + "regr_slope", + "regr_sxx", + "regr_sxy", + "regr_syy", + "skewness", + "some", + "std", + "stddev", + "stddev_pop", + "stddev_samp", + "string_agg", + "string_agg_distinct", + "sum", + "sum_distinct", + "try_avg", + "try_sum", + "var_pop", + "var_samp", + "variance", + # Window Functions + "cume_dist", + "dense_rank", + "lag", + "lead", + "nth_value", + "ntile", + "percent_rank", + "rank", + "row_number", + # Generator Functions + "explode", + "explode_outer", + "inline", + "inline_outer", + "posexplode", + "posexplode_outer", + "stack", + # Partition Transformation Functions + "years", + "months", + "days", + "hours", + "bucket", + # CSV Functions + "from_csv", + "schema_of_csv", + "to_csv", + # JSON Functions + "from_json", + "get_json_object", + "json_array_length", + "json_object_keys", + "json_tuple", + "schema_of_json", + "to_json", + # VARIANT Functions + "is_variant_null", + "parse_json", + "schema_of_variant", + "schema_of_variant_agg", + "try_variant_get", + "variant_get", + "try_parse_json", + "to_variant_object", + # XML Functions + "from_xml", + "schema_of_xml", + "to_xml", + "xpath", + "xpath_boolean", + "xpath_double", + "xpath_float", + "xpath_int", + "xpath_long", + "xpath_number", + "xpath_short", + "xpath_string", + # URL Functions + "parse_url", + "try_parse_url", + "url_decode", + "url_encode", + "try_url_decode", + # Misc Functions + "aes_decrypt", + "aes_encrypt", + "assert_true", + "bitmap_bit_position", + "bitmap_bucket_number", + "bitmap_count", + "current_catalog", + "current_database", + "current_schema", + "current_user", + "hll_sketch_estimate", + "hll_union", + "input_file_block_length", + 
"input_file_block_start", + "input_file_name", + "java_method", + "monotonically_increasing_id", + "raise_error", + "reflect", + "session_user", + "spark_partition_id", + "try_aes_decrypt", + "try_reflect", + "typeof", + "user", + "version", + # UDF, UDTF and UDT + "AnalyzeArgument", + "AnalyzeResult", + "OrderingColumn", + "PandasUDFType", + "PartitioningColumn", + "SelectedColumn", + "SkipRestOfInputTableException", + "UserDefinedFunction", + "UserDefinedTableFunction", + "call_udf", + "pandas_udf", + "udf", + "udtf", + "unwrap_udt", +] diff --git a/python/pyspark/sql/functions/builtin.py b/python/pyspark/sql/functions/builtin.py index d2873a388617e..93ac6da1e14c5 100644 --- a/python/pyspark/sql/functions/builtin.py +++ b/python/pyspark/sql/functions/builtin.py @@ -65,7 +65,6 @@ from pyspark.sql.utils import ( to_str as _to_str, - has_numpy as _has_numpy, try_remote_functions as _try_remote_functions, get_active_spark_context as _get_active_spark_context, enum_to_value as _enum_to_value, @@ -79,8 +78,6 @@ UserDefinedFunctionLike, ) -if _has_numpy: - import numpy as np # Note to developers: all of PySpark functions here take string as column names whenever possible. 
# Namely, if columns are referred as arguments, they can always be both Column or string, @@ -254,6 +251,8 @@ def lit(col: Any) -> Column: | [true, false]| []| [1.5, 0.1]| [a, b, c]| +------------------+-------+-----------------+--------------------+ """ + from pyspark.testing.utils import have_numpy + if isinstance(col, Column): return col elif isinstance(col, list): @@ -262,7 +261,9 @@ def lit(col: Any) -> Column: errorClass="COLUMN_IN_LIST", messageParameters={"func_name": "lit"} ) return array(*[lit(item) for item in col]) - elif _has_numpy: + elif have_numpy: + import numpy as np + if isinstance(col, np.generic): dt = _from_numpy_type(col.dtype) if dt is None: @@ -1851,6 +1852,314 @@ def sum_distinct(col: "ColumnOrName") -> Column: return _invoke_function_over_columns("sum_distinct", col) +@_try_remote_functions +def listagg(col: "ColumnOrName", delimiter: Optional[Union[Column, str, bytes]] = None) -> Column: + """ + Aggregate function: returns the concatenation of non-null input values, + separated by the delimiter. + + .. versionadded:: 4.0.0 + + Parameters + ---------- + col : :class:`~pyspark.sql.Column` or column name + target column to compute on. + delimiter : :class:`~pyspark.sql.Column`, literal string or bytes, optional + the delimiter to separate the values. The default value is None. + + Returns + ------- + :class:`~pyspark.sql.Column` + the column for computed results. 
+ + Examples + -------- + Example 1: Using listagg function + + >>> from pyspark.sql import functions as sf + >>> df = spark.createDataFrame([('a',), ('b',), (None,), ('c',)], ['strings']) + >>> df.select(sf.listagg('strings')).show() + +----------------------+ + |listagg(strings, NULL)| + +----------------------+ + | abc| + +----------------------+ + + Example 2: Using listagg function with a delimiter + + >>> from pyspark.sql import functions as sf + >>> df = spark.createDataFrame([('a',), ('b',), (None,), ('c',)], ['strings']) + >>> df.select(sf.listagg('strings', ', ')).show() + +--------------------+ + |listagg(strings, , )| + +--------------------+ + | a, b, c| + +--------------------+ + + Example 3: Using listagg function with a binary column and delimiter + + >>> from pyspark.sql import functions as sf + >>> df = spark.createDataFrame([(b'\x01',), (b'\x02',), (None,), (b'\x03',)], ['bytes']) + >>> df.select(sf.listagg('bytes', b'\x42')).show() + +---------------------+ + |listagg(bytes, X'42')| + +---------------------+ + | [01 42 02 42 03]| + +---------------------+ + + Example 4: Using listagg function on a column with all None values + + >>> from pyspark.sql import functions as sf + >>> from pyspark.sql.types import StructType, StructField, StringType + >>> schema = StructType([StructField("strings", StringType(), True)]) + >>> df = spark.createDataFrame([(None,), (None,), (None,), (None,)], schema=schema) + >>> df.select(sf.listagg('strings')).show() + +----------------------+ + |listagg(strings, NULL)| + +----------------------+ + | NULL| + +----------------------+ + """ + if delimiter is None: + return _invoke_function_over_columns("listagg", col) + else: + return _invoke_function_over_columns("listagg", col, lit(delimiter)) + + +@_try_remote_functions +def listagg_distinct( + col: "ColumnOrName", delimiter: Optional[Union[Column, str, bytes]] = None +) -> Column: + """ + Aggregate function: returns the concatenation of distinct non-null input values, 
+ separated by the delimiter. + + .. versionadded:: 4.0.0 + + Parameters + ---------- + col : :class:`~pyspark.sql.Column` or column name + target column to compute on. + delimiter : :class:`~pyspark.sql.Column`, literal string or bytes, optional + the delimiter to separate the values. The default value is None. + + Returns + ------- + :class:`~pyspark.sql.Column` + the column for computed results. + + Examples + -------- + Example 1: Using listagg_distinct function + + >>> from pyspark.sql import functions as sf + >>> df = spark.createDataFrame([('a',), ('b',), (None,), ('c',), ('b',)], ['strings']) + >>> df.select(sf.listagg_distinct('strings')).show() + +-------------------------------+ + |listagg(DISTINCT strings, NULL)| + +-------------------------------+ + | abc| + +-------------------------------+ + + Example 2: Using listagg_distinct function with a delimiter + + >>> from pyspark.sql import functions as sf + >>> df = spark.createDataFrame([('a',), ('b',), (None,), ('c',), ('b',)], ['strings']) + >>> df.select(sf.listagg_distinct('strings', ', ')).show() + +-----------------------------+ + |listagg(DISTINCT strings, , )| + +-----------------------------+ + | a, b, c| + +-----------------------------+ + + Example 3: Using listagg_distinct function with a binary column and delimiter + + >>> from pyspark.sql import functions as sf + >>> df = spark.createDataFrame([(b'\x01',), (b'\x02',), (None,), (b'\x03',), (b'\x02',)], + ... 
['bytes']) + >>> df.select(sf.listagg_distinct('bytes', b'\x42')).show() + +------------------------------+ + |listagg(DISTINCT bytes, X'42')| + +------------------------------+ + | [01 42 02 42 03]| + +------------------------------+ + + Example 4: Using listagg_distinct function on a column with all None values + + >>> from pyspark.sql import functions as sf + >>> from pyspark.sql.types import StructType, StructField, StringType + >>> schema = StructType([StructField("strings", StringType(), True)]) + >>> df = spark.createDataFrame([(None,), (None,), (None,), (None,)], schema=schema) + >>> df.select(sf.listagg_distinct('strings')).show() + +-------------------------------+ + |listagg(DISTINCT strings, NULL)| + +-------------------------------+ + | NULL| + +-------------------------------+ + """ + if delimiter is None: + return _invoke_function_over_columns("listagg_distinct", col) + else: + return _invoke_function_over_columns("listagg_distinct", col, lit(delimiter)) + + +@_try_remote_functions +def string_agg( + col: "ColumnOrName", delimiter: Optional[Union[Column, str, bytes]] = None +) -> Column: + """ + Aggregate function: returns the concatenation of non-null input values, + separated by the delimiter. + + An alias of :func:`listagg`. + + .. versionadded:: 4.0.0 + + Parameters + ---------- + col : :class:`~pyspark.sql.Column` or column name + target column to compute on. + delimiter : :class:`~pyspark.sql.Column`, literal string or bytes, optional + the delimiter to separate the values. The default value is None. + + Returns + ------- + :class:`~pyspark.sql.Column` + the column for computed results. 
+ + Examples + -------- + Example 1: Using string_agg function + + >>> from pyspark.sql import functions as sf + >>> df = spark.createDataFrame([('a',), ('b',), (None,), ('c',)], ['strings']) + >>> df.select(sf.string_agg('strings')).show() + +-------------------------+ + |string_agg(strings, NULL)| + +-------------------------+ + | abc| + +-------------------------+ + + Example 2: Using string_agg function with a delimiter + + >>> from pyspark.sql import functions as sf + >>> df = spark.createDataFrame([('a',), ('b',), (None,), ('c',)], ['strings']) + >>> df.select(sf.string_agg('strings', ', ')).show() + +-----------------------+ + |string_agg(strings, , )| + +-----------------------+ + | a, b, c| + +-----------------------+ + + Example 3: Using string_agg function with a binary column and delimiter + + >>> from pyspark.sql import functions as sf + >>> df = spark.createDataFrame([(b'\x01',), (b'\x02',), (None,), (b'\x03',)], ['bytes']) + >>> df.select(sf.string_agg('bytes', b'\x42')).show() + +------------------------+ + |string_agg(bytes, X'42')| + +------------------------+ + | [01 42 02 42 03]| + +------------------------+ + + Example 4: Using string_agg function on a column with all None values + + >>> from pyspark.sql import functions as sf + >>> from pyspark.sql.types import StructType, StructField, StringType + >>> schema = StructType([StructField("strings", StringType(), True)]) + >>> df = spark.createDataFrame([(None,), (None,), (None,), (None,)], schema=schema) + >>> df.select(sf.string_agg('strings')).show() + +-------------------------+ + |string_agg(strings, NULL)| + +-------------------------+ + | NULL| + +-------------------------+ + """ + if delimiter is None: + return _invoke_function_over_columns("string_agg", col) + else: + return _invoke_function_over_columns("string_agg", col, lit(delimiter)) + + +@_try_remote_functions +def string_agg_distinct( + col: "ColumnOrName", delimiter: Optional[Union[Column, str, bytes]] = None +) -> Column: + """ + 
Aggregate function: returns the concatenation of distinct non-null input values, + separated by the delimiter. + + An alias of :func:`listagg_distinct`. + + .. versionadded:: 4.0.0 + + Parameters + ---------- + col : :class:`~pyspark.sql.Column` or column name + target column to compute on. + delimiter : :class:`~pyspark.sql.Column`, literal string or bytes, optional + the delimiter to separate the values. The default value is None. + + Returns + ------- + :class:`~pyspark.sql.Column` + the column for computed results. + + Examples + -------- + Example 1: Using string_agg_distinct function + + >>> from pyspark.sql import functions as sf + >>> df = spark.createDataFrame([('a',), ('b',), (None,), ('c',), ('b',)], ['strings']) + >>> df.select(sf.string_agg_distinct('strings')).show() + +----------------------------------+ + |string_agg(DISTINCT strings, NULL)| + +----------------------------------+ + | abc| + +----------------------------------+ + + Example 2: Using string_agg_distinct function with a delimiter + + >>> from pyspark.sql import functions as sf + >>> df = spark.createDataFrame([('a',), ('b',), (None,), ('c',), ('b',)], ['strings']) + >>> df.select(sf.string_agg_distinct('strings', ', ')).show() + +--------------------------------+ + |string_agg(DISTINCT strings, , )| + +--------------------------------+ + | a, b, c| + +--------------------------------+ + + Example 3: Using string_agg_distinct function with a binary column and delimiter + + >>> from pyspark.sql import functions as sf + >>> df = spark.createDataFrame([(b'\x01',), (b'\x02',), (None,), (b'\x03',), (b'\x02',)], + ... 
['bytes']) + >>> df.select(sf.string_agg_distinct('bytes', b'\x42')).show() + +---------------------------------+ + |string_agg(DISTINCT bytes, X'42')| + +---------------------------------+ + | [01 42 02 42 03]| + +---------------------------------+ + + Example 4: Using string_agg_distinct function on a column with all None values + + >>> from pyspark.sql import functions as sf + >>> from pyspark.sql.types import StructType, StructField, StringType + >>> schema = StructType([StructField("strings", StringType(), True)]) + >>> df = spark.createDataFrame([(None,), (None,), (None,), (None,)], schema=schema) + >>> df.select(sf.string_agg_distinct('strings')).show() + +----------------------------------+ + |string_agg(DISTINCT strings, NULL)| + +----------------------------------+ + | NULL| + +----------------------------------+ + """ + if delimiter is None: + return _invoke_function_over_columns("string_agg_distinct", col) + else: + return _invoke_function_over_columns("string_agg_distinct", col, lit(delimiter)) + + @_try_remote_functions def product(col: "ColumnOrName") -> Column: """ @@ -7343,6 +7652,12 @@ def rand(seed: Optional[int] = None) -> Column: :class:`~pyspark.sql.Column` A column of random values. + See Also + -------- + :meth:`pyspark.sql.functions.randn` + :meth:`pyspark.sql.functions.randstr` + :meth:`pyspark.sql.functions.uniform` + Examples -------- Example 1: Generate a random column without a seed @@ -7396,6 +7711,12 @@ def randn(seed: Optional[int] = None) -> Column: :class:`~pyspark.sql.Column` A column of random values. + See Also + -------- + :meth:`pyspark.sql.functions.rand` + :meth:`pyspark.sql.functions.randstr` + :meth:`pyspark.sql.functions.uniform` + Examples -------- Example 1: Generate a random column without a seed @@ -8864,6 +9185,13 @@ def curdate() -> Column: :class:`~pyspark.sql.Column` current date. 
+ See Also + -------- + :meth:`pyspark.sql.functions.now` + :meth:`pyspark.sql.functions.current_date` + :meth:`pyspark.sql.functions.current_timestamp` + :meth:`pyspark.sql.functions.localtimestamp` + Examples -------- >>> import pyspark.sql.functions as sf @@ -8893,6 +9221,13 @@ def current_date() -> Column: :class:`~pyspark.sql.Column` current date. + See Also + -------- + :meth:`pyspark.sql.functions.now` + :meth:`pyspark.sql.functions.curdate` + :meth:`pyspark.sql.functions.current_timestamp` + :meth:`pyspark.sql.functions.localtimestamp` + Examples -------- >>> from pyspark.sql import functions as sf @@ -8918,16 +9253,32 @@ def current_timezone() -> Column: :class:`~pyspark.sql.Column` current session local timezone. + See Also + -------- + :meth:`pyspark.sql.functions.convert_timezone` + Examples -------- - >>> from pyspark.sql import functions as sf >>> spark.conf.set("spark.sql.session.timeZone", "America/Los_Angeles") + + >>> from pyspark.sql import functions as sf >>> spark.range(1).select(sf.current_timezone()).show() +-------------------+ | current_timezone()| +-------------------+ |America/Los_Angeles| +-------------------+ + + Switch the timezone to Shanghai. + + >>> spark.conf.set("spark.sql.session.timeZone", "Asia/Shanghai") + >>> spark.range(1).select(sf.current_timezone()).show() + +------------------+ + |current_timezone()| + +------------------+ + | Asia/Shanghai| + +------------------+ + >>> spark.conf.unset("spark.sql.session.timeZone") """ return _invoke_function("current_timezone") @@ -8949,6 +9300,13 @@ def current_timestamp() -> Column: :class:`~pyspark.sql.Column` current date and time. 
+ See Also + -------- + :meth:`pyspark.sql.functions.now` + :meth:`pyspark.sql.functions.curdate` + :meth:`pyspark.sql.functions.current_date` + :meth:`pyspark.sql.functions.localtimestamp` + Examples -------- >>> from pyspark.sql import functions as sf @@ -8974,6 +9332,13 @@ def now() -> Column: :class:`~pyspark.sql.Column` current timestamp at the start of query evaluation. + See Also + -------- + :meth:`pyspark.sql.functions.curdate` + :meth:`pyspark.sql.functions.current_date` + :meth:`pyspark.sql.functions.current_timestamp` + :meth:`pyspark.sql.functions.localtimestamp` + Examples -------- >>> from pyspark.sql import functions as sf @@ -9004,6 +9369,13 @@ def localtimestamp() -> Column: :class:`~pyspark.sql.Column` current local date and time. + See Also + -------- + :meth:`pyspark.sql.functions.now` + :meth:`pyspark.sql.functions.curdate` + :meth:`pyspark.sql.functions.current_date` + :meth:`pyspark.sql.functions.current_timestamp` + Examples -------- >>> from pyspark.sql import functions as sf @@ -9044,6 +9416,15 @@ def date_format(date: "ColumnOrName", format: str) -> Column: format: literal string format to use to represent datetime values. + See Also + -------- + :meth:`pyspark.sql.functions.to_date` + :meth:`pyspark.sql.functions.to_timestamp` + :meth:`pyspark.sql.functions.to_timestamp_ltz` + :meth:`pyspark.sql.functions.to_timestamp_ntz` + :meth:`pyspark.sql.functions.to_utc_timestamp` + :meth:`pyspark.sql.functions.try_to_timestamp` + Returns ------- :class:`~pyspark.sql.Column` @@ -9130,6 +9511,18 @@ def year(col: "ColumnOrName") -> Column: :class:`~pyspark.sql.Column` year part of the date/timestamp as integer. 
+ See Also + -------- + :meth:`pyspark.sql.functions.quarter` + :meth:`pyspark.sql.functions.month` + :meth:`pyspark.sql.functions.day` + :meth:`pyspark.sql.functions.hour` + :meth:`pyspark.sql.functions.minute` + :meth:`pyspark.sql.functions.second` + :meth:`pyspark.sql.functions.extract` + :meth:`pyspark.sql.functions.datepart` + :meth:`pyspark.sql.functions.date_part` + Examples -------- Example 1: Extract the year from a string column representing dates @@ -9209,6 +9602,18 @@ def quarter(col: "ColumnOrName") -> Column: :class:`~pyspark.sql.Column` quarter of the date/timestamp as integer. + See Also + -------- + :meth:`pyspark.sql.functions.year` + :meth:`pyspark.sql.functions.month` + :meth:`pyspark.sql.functions.day` + :meth:`pyspark.sql.functions.hour` + :meth:`pyspark.sql.functions.minute` + :meth:`pyspark.sql.functions.second` + :meth:`pyspark.sql.functions.extract` + :meth:`pyspark.sql.functions.datepart` + :meth:`pyspark.sql.functions.date_part` + Examples -------- Example 1: Extract the quarter from a string column representing dates @@ -9288,6 +9693,19 @@ def month(col: "ColumnOrName") -> Column: :class:`~pyspark.sql.Column` month part of the date/timestamp as integer. + See Also + -------- + :meth:`pyspark.sql.functions.year` + :meth:`pyspark.sql.functions.quarter` + :meth:`pyspark.sql.functions.day` + :meth:`pyspark.sql.functions.hour` + :meth:`pyspark.sql.functions.minute` + :meth:`pyspark.sql.functions.second` + :meth:`pyspark.sql.functions.monthname` + :meth:`pyspark.sql.functions.extract` + :meth:`pyspark.sql.functions.datepart` + :meth:`pyspark.sql.functions.date_part` + Examples -------- Example 1: Extract the month from a string column representing dates @@ -9368,6 +9786,12 @@ def dayofweek(col: "ColumnOrName") -> Column: :class:`~pyspark.sql.Column` day of the week for given date/timestamp as integer. 
+ See Also + -------- + :meth:`pyspark.sql.functions.day` + :meth:`pyspark.sql.functions.dayofyear` + :meth:`pyspark.sql.functions.dayofmonth` + Examples -------- Example 1: Extract the day of the week from a string column representing dates @@ -9442,6 +9866,12 @@ def dayofmonth(col: "ColumnOrName") -> Column: col : :class:`~pyspark.sql.Column` or column name target date/timestamp column to work on. + See Also + -------- + :meth:`pyspark.sql.functions.day` + :meth:`pyspark.sql.functions.dayofyear` + :meth:`pyspark.sql.functions.dayofweek` + Returns ------- :class:`~pyspark.sql.Column` @@ -9523,9 +9953,25 @@ def day(col: "ColumnOrName") -> Column: :class:`~pyspark.sql.Column` day of the month for given date/timestamp as integer. - Examples + See Also -------- - Example 1: Extract the day of the month from a string column representing dates + :meth:`pyspark.sql.functions.year` + :meth:`pyspark.sql.functions.quarter` + :meth:`pyspark.sql.functions.month` + :meth:`pyspark.sql.functions.hour` + :meth:`pyspark.sql.functions.minute` + :meth:`pyspark.sql.functions.second` + :meth:`pyspark.sql.functions.dayname` + :meth:`pyspark.sql.functions.dayofyear` + :meth:`pyspark.sql.functions.dayofmonth` + :meth:`pyspark.sql.functions.dayofweek` + :meth:`pyspark.sql.functions.extract` + :meth:`pyspark.sql.functions.datepart` + :meth:`pyspark.sql.functions.date_part` + + Examples + -------- + Example 1: Extract the day of the month from a string column representing dates >>> from pyspark.sql import functions as sf >>> df = spark.createDataFrame([('2015-04-08',), ('2024-10-31',)], ['dt']) @@ -9602,6 +10048,12 @@ def dayofyear(col: "ColumnOrName") -> Column: :class:`~pyspark.sql.Column` day of the year for given date/timestamp as integer. 
+ See Also + -------- + :meth:`pyspark.sql.functions.day` + :meth:`pyspark.sql.functions.dayofweek` + :meth:`pyspark.sql.functions.dayofmonth` + Examples -------- Example 1: Extract the day of the year from a string column representing dates @@ -9681,6 +10133,18 @@ def hour(col: "ColumnOrName") -> Column: :class:`~pyspark.sql.Column` hour part of the timestamp as integer. + See Also + -------- + :meth:`pyspark.sql.functions.year` + :meth:`pyspark.sql.functions.quarter` + :meth:`pyspark.sql.functions.month` + :meth:`pyspark.sql.functions.day` + :meth:`pyspark.sql.functions.minute` + :meth:`pyspark.sql.functions.second` + :meth:`pyspark.sql.functions.extract` + :meth:`pyspark.sql.functions.datepart` + :meth:`pyspark.sql.functions.date_part` + Examples -------- Example 1: Extract the hours from a string column representing timestamp @@ -9728,6 +10192,18 @@ def minute(col: "ColumnOrName") -> Column: col : :class:`~pyspark.sql.Column` or column name target date/timestamp column to work on. + See Also + -------- + :meth:`pyspark.sql.functions.year` + :meth:`pyspark.sql.functions.quarter` + :meth:`pyspark.sql.functions.month` + :meth:`pyspark.sql.functions.day` + :meth:`pyspark.sql.functions.hour` + :meth:`pyspark.sql.functions.second` + :meth:`pyspark.sql.functions.extract` + :meth:`pyspark.sql.functions.datepart` + :meth:`pyspark.sql.functions.date_part` + Returns ------- :class:`~pyspark.sql.Column` @@ -9785,6 +10261,18 @@ def second(col: "ColumnOrName") -> Column: :class:`~pyspark.sql.Column` `seconds` part of the timestamp as integer. 
+ See Also + -------- + :meth:`pyspark.sql.functions.year` + :meth:`pyspark.sql.functions.quarter` + :meth:`pyspark.sql.functions.month` + :meth:`pyspark.sql.functions.day` + :meth:`pyspark.sql.functions.hour` + :meth:`pyspark.sql.functions.minute` + :meth:`pyspark.sql.functions.extract` + :meth:`pyspark.sql.functions.datepart` + :meth:`pyspark.sql.functions.date_part` + Examples -------- Example 1: Extract the seconds from a string column representing timestamp @@ -9839,6 +10327,10 @@ def weekofyear(col: "ColumnOrName") -> Column: :class:`~pyspark.sql.Column` `week` of the year for given date as integer. + See Also + -------- + :meth:`pyspark.sql.functions.weekday` + Examples -------- Example 1: Extract the week of the year from a string column representing dates @@ -9915,6 +10407,11 @@ def weekday(col: "ColumnOrName") -> Column: :class:`~pyspark.sql.Column` the day of the week for date/timestamp (0 = Monday, 1 = Tuesday, ..., 6 = Sunday). + See Also + -------- + :meth:`pyspark.sql.functions.day` + :meth:`pyspark.sql.functions.weekofyear` + Examples -------- Example 1: Extract the day of the week from a string column representing dates @@ -9991,6 +10488,11 @@ def monthname(col: "ColumnOrName") -> Column: :class:`~pyspark.sql.Column` the three-letter abbreviation of month name for date/timestamp (Jan, Feb, Mar...) + See Also + -------- + :meth:`pyspark.sql.functions.month` + :meth:`pyspark.sql.functions.dayname` + Examples -------- Example 1: Extract the month name from a string column representing dates @@ -10067,6 +10569,11 @@ def dayname(col: "ColumnOrName") -> Column: :class:`~pyspark.sql.Column` the three-letter abbreviation of day name for date/timestamp (Mon, Tue, Wed...) 
+ See Also + -------- + :meth:`pyspark.sql.functions.day` + :meth:`pyspark.sql.functions.monthname` + Examples -------- Example 1: Extract the weekday name from a string column representing dates @@ -10147,6 +10654,13 @@ def extract(field: Column, source: "ColumnOrName") -> Column: See Also -------- + :meth:`pyspark.sql.functions.year` + :meth:`pyspark.sql.functions.quarter` + :meth:`pyspark.sql.functions.month` + :meth:`pyspark.sql.functions.day` + :meth:`pyspark.sql.functions.hour` + :meth:`pyspark.sql.functions.minute` + :meth:`pyspark.sql.functions.second` :meth:`pyspark.sql.functions.datepart` :meth:`pyspark.sql.functions.date_part` @@ -10195,6 +10709,13 @@ def date_part(field: Column, source: "ColumnOrName") -> Column: See Also -------- + :meth:`pyspark.sql.functions.year` + :meth:`pyspark.sql.functions.quarter` + :meth:`pyspark.sql.functions.month` + :meth:`pyspark.sql.functions.day` + :meth:`pyspark.sql.functions.hour` + :meth:`pyspark.sql.functions.minute` + :meth:`pyspark.sql.functions.second` :meth:`pyspark.sql.functions.datepart` :meth:`pyspark.sql.functions.extract` @@ -10243,6 +10764,13 @@ def datepart(field: Column, source: "ColumnOrName") -> Column: See Also -------- + :meth:`pyspark.sql.functions.year` + :meth:`pyspark.sql.functions.quarter` + :meth:`pyspark.sql.functions.month` + :meth:`pyspark.sql.functions.day` + :meth:`pyspark.sql.functions.hour` + :meth:`pyspark.sql.functions.minute` + :meth:`pyspark.sql.functions.second` :meth:`pyspark.sql.functions.date_part` :meth:`pyspark.sql.functions.extract` @@ -10780,7 +11308,11 @@ def to_date(col: "ColumnOrName", format: Optional[str] = None) -> Column: See Also -------- :meth:`pyspark.sql.functions.to_timestamp` + :meth:`pyspark.sql.functions.to_timestamp_ltz` + :meth:`pyspark.sql.functions.to_timestamp_ntz` + :meth:`pyspark.sql.functions.to_utc_timestamp` :meth:`pyspark.sql.functions.try_to_timestamp` + :meth:`pyspark.sql.functions.date_format` Examples -------- @@ -11018,7 +11550,12 @@ def 
to_timestamp(col: "ColumnOrName", format: Optional[str] = None) -> Column: See Also -------- :meth:`pyspark.sql.functions.to_date` + :meth:`pyspark.sql.functions.to_timestamp_ltz` + :meth:`pyspark.sql.functions.to_timestamp_ntz` + :meth:`pyspark.sql.functions.to_utc_timestamp` + :meth:`pyspark.sql.functions.to_unix_timestamp` :meth:`pyspark.sql.functions.try_to_timestamp` + :meth:`pyspark.sql.functions.date_format` Examples -------- @@ -11072,6 +11609,8 @@ def try_to_timestamp(col: "ColumnOrName", format: Optional["ColumnOrName"] = Non -------- :meth:`pyspark.sql.functions.to_date` :meth:`pyspark.sql.functions.to_timestamp` + :meth:`pyspark.sql.functions.to_utc_timestamp` + :meth:`pyspark.sql.functions.date_format` Examples -------- @@ -11128,10 +11667,15 @@ def xpath(xml: "ColumnOrName", path: "ColumnOrName") -> Column: Examples -------- + >>> from pyspark.sql import functions as sf >>> df = spark.createDataFrame( ... [('<a><b>b1</b><b>b2</b><b>b3</b><c>c1</c><c>c2</c></a>',)], ['x']) - >>> df.select(xpath(df.x, lit('a/b/text()')).alias('r')).collect() - [Row(r=['b1', 'b2', 'b3'])] + >>> df.select(sf.xpath(df.x, sf.lit('a/b/text()'))).show() + +--------------------+ + |xpath(x, a/b/text())| + +--------------------+ + | [b1, b2, b3]| + +--------------------+ """ return _invoke_function_over_columns("xpath", xml, path) @@ -11145,9 +11689,14 @@ def xpath_boolean(xml: "ColumnOrName", path: "ColumnOrName") -> Column: Examples -------- + >>> from pyspark.sql import functions as sf >>> df = spark.createDataFrame([('<a><b>1</b></a>',)], ['x']) - >>> df.select(xpath_boolean(df.x, lit('a/b')).alias('r')).collect() - [Row(r=True)] + >>> df.select(sf.xpath_boolean(df.x, sf.lit('a/b'))).show() + +---------------------+ + |xpath_boolean(x, a/b)| + +---------------------+ + | true| + +---------------------+ """ return _invoke_function_over_columns("xpath_boolean", xml, path) @@ -11162,9 +11711,14 @@ def xpath_double(xml: "ColumnOrName", path: "ColumnOrName") -> Column: Examples -------- + >>> from pyspark.sql import functions as sf 
>>> df = spark.createDataFrame([('<a><b>1</b><b>2</b></a>',)], ['x']) - >>> df.select(xpath_double(df.x, lit('sum(a/b)')).alias('r')).collect() - [Row(r=3.0)] + >>> df.select(sf.xpath_double(df.x, sf.lit('sum(a/b)'))).show() + +-------------------------+ + |xpath_double(x, sum(a/b))| + +-------------------------+ + | 3.0| + +-------------------------+ """ return _invoke_function_over_columns("xpath_double", xml, path) @@ -11202,9 +11756,14 @@ def xpath_float(xml: "ColumnOrName", path: "ColumnOrName") -> Column: Examples -------- + >>> from pyspark.sql import functions as sf >>> df = spark.createDataFrame([('<a><b>1</b><b>2</b></a>',)], ['x']) - >>> df.select(xpath_float(df.x, lit('sum(a/b)')).alias('r')).collect() - [Row(r=3.0)] + >>> df.select(sf.xpath_float(df.x, sf.lit('sum(a/b)'))).show() + +------------------------+ + |xpath_float(x, sum(a/b))| + +------------------------+ + | 3.0| + +------------------------+ """ return _invoke_function_over_columns("xpath_float", xml, path) @@ -11219,9 +11778,14 @@ def xpath_int(xml: "ColumnOrName", path: "ColumnOrName") -> Column: Examples -------- + >>> from pyspark.sql import functions as sf >>> df = spark.createDataFrame([('<a><b>1</b><b>2</b></a>',)], ['x']) - >>> df.select(xpath_int(df.x, lit('sum(a/b)')).alias('r')).collect() - [Row(r=3)] + >>> df.select(sf.xpath_int(df.x, sf.lit('sum(a/b)'))).show() + +----------------------+ + |xpath_int(x, sum(a/b))| + +----------------------+ + | 3| + +----------------------+ """ return _invoke_function_over_columns("xpath_int", xml, path) @@ -11236,9 +11800,14 @@ def xpath_long(xml: "ColumnOrName", path: "ColumnOrName") -> Column: Examples -------- + >>> from pyspark.sql import functions as sf >>> df = spark.createDataFrame([('<a><b>1</b><b>2</b></a>',)], ['x']) - >>> df.select(xpath_long(df.x, lit('sum(a/b)')).alias('r')).collect() - [Row(r=3)] + >>> df.select(sf.xpath_long(df.x, sf.lit('sum(a/b)'))).show() + +-----------------------+ + |xpath_long(x, sum(a/b))| + +-----------------------+ + | 3| + +-----------------------+ """ return 
_invoke_function_over_columns("xpath_long", xml, path) @@ -11253,9 +11822,14 @@ def xpath_short(xml: "ColumnOrName", path: "ColumnOrName") -> Column: Examples -------- + >>> from pyspark.sql import functions as sf >>> df = spark.createDataFrame([('12',)], ['x']) - >>> df.select(xpath_short(df.x, lit('sum(a/b)')).alias('r')).collect() - [Row(r=3)] + >>> df.select(sf.xpath_short(df.x, sf.lit('sum(a/b)'))).show() + +------------------------+ + |xpath_short(x, sum(a/b))| + +------------------------+ + | 3| + +------------------------+ """ return _invoke_function_over_columns("xpath_short", xml, path) @@ -11269,9 +11843,14 @@ def xpath_string(xml: "ColumnOrName", path: "ColumnOrName") -> Column: Examples -------- + >>> from pyspark.sql import functions as sf >>> df = spark.createDataFrame([('bcc',)], ['x']) - >>> df.select(xpath_string(df.x, lit('a/c')).alias('r')).collect() - [Row(r='cc')] + >>> df.select(sf.xpath_string(df.x, sf.lit('a/c'))).show() + +--------------------+ + |xpath_string(x, a/c)| + +--------------------+ + | cc| + +--------------------+ """ return _invoke_function_over_columns("xpath_string", xml, path) @@ -11646,6 +12225,9 @@ def from_utc_timestamp(timestamp: "ColumnOrName", tz: Union[Column, str]) -> Col See Also -------- :meth:`pyspark.sql.functions.to_utc_timestamp` + :meth:`pyspark.sql.functions.to_timestamp` + :meth:`pyspark.sql.functions.to_timestamp_ltz` + :meth:`pyspark.sql.functions.to_timestamp_ntz` Examples -------- @@ -11712,6 +12294,9 @@ def to_utc_timestamp(timestamp: "ColumnOrName", tz: Union[Column, str]) -> Colum See Also -------- :meth:`pyspark.sql.functions.from_utc_timestamp` + :meth:`pyspark.sql.functions.to_timestamp` + :meth:`pyspark.sql.functions.to_timestamp_ltz` + :meth:`pyspark.sql.functions.to_timestamp_ntz` Examples -------- @@ -12034,22 +12619,22 @@ def window( Parameters ---------- - timeColumn : :class:`~pyspark.sql.Column` + timeColumn : :class:`~pyspark.sql.Column` or column name The column or the expression to use 
as the timestamp for windowing by time. The time column must be of TimestampType or TimestampNTZType. - windowDuration : str + windowDuration : literal string A string specifying the width of the window, e.g. `10 minutes`, `1 second`. Check `org.apache.spark.unsafe.types.CalendarInterval` for valid duration identifiers. Note that the duration is a fixed length of time, and does not vary over time according to a calendar. For example, `1 day` always means 86,400,000 milliseconds, not a calendar day. - slideDuration : str, optional + slideDuration : literal string, optional A new window will be generated every `slideDuration`. Must be less than or equal to the `windowDuration`. Check `org.apache.spark.unsafe.types.CalendarInterval` for valid duration identifiers. This duration is likewise absolute, and does not vary according to a calendar. - startTime : str, optional + startTime : literal string, optional The offset with respect to 1970-01-01 00:00:00 UTC with which to start window intervals. For example, in order to have hourly tumbling windows that start 15 minutes past the hour, e.g. 12:15-13:15, 13:15-14:15... provide @@ -12060,24 +12645,30 @@ def window( :class:`~pyspark.sql.Column` the column for computed results. + See Also + -------- + :meth:`pyspark.sql.functions.window_time` + :meth:`pyspark.sql.functions.session_window` + Examples -------- >>> import datetime >>> from pyspark.sql import functions as sf - >>> df = spark.createDataFrame( - ... [(datetime.datetime(2016, 3, 11, 9, 0, 7), 1)], - ... ).toDF("date", "val") - >>> w = df.groupBy(sf.window("date", "5 seconds")).agg(sf.sum("val").alias("sum")) - >>> w.select( - ... w.window.start.cast("string").alias("start"), - ... w.window.end.cast("string").alias("end"), - ... "sum" - ... 
).show() - +-------------------+-------------------+---+ - | start| end|sum| - +-------------------+-------------------+---+ - |2016-03-11 09:00:05|2016-03-11 09:00:10| 1| - +-------------------+-------------------+---+ + >>> df = spark.createDataFrame([(datetime.datetime(2016, 3, 11, 9, 0, 7), 1)], ['dt', 'v']) + >>> df2 = df.groupBy(sf.window('dt', '5 seconds')).agg(sf.sum('v')) + >>> df2.show(truncate=False) + +------------------------------------------+------+ + |window |sum(v)| + +------------------------------------------+------+ + |{2016-03-11 09:00:05, 2016-03-11 09:00:10}|1 | + +------------------------------------------+------+ + + >>> df2.printSchema() + root + |-- window: struct (nullable = false) + | |-- start: timestamp (nullable = true) + | |-- end: timestamp (nullable = true) + |-- sum(v): long (nullable = true) """ from pyspark.sql.classic.column import _to_java_column @@ -12123,7 +12714,7 @@ def window_time( Parameters ---------- - windowColumn : :class:`~pyspark.sql.Column` + windowColumn : :class:`~pyspark.sql.Column` or column name The window column of a window aggregate records. Returns @@ -12131,29 +12722,29 @@ def window_time( :class:`~pyspark.sql.Column` the column for computed results. - Notes - ----- - Supports Spark Connect. + See Also + -------- + :meth:`pyspark.sql.functions.window` + :meth:`pyspark.sql.functions.session_window` Examples -------- >>> import datetime - >>> df = spark.createDataFrame( - ... [(datetime.datetime(2016, 3, 11, 9, 0, 7), 1)], - ... ).toDF("date", "val") + >>> from pyspark.sql import functions as sf + >>> df = spark.createDataFrame([(datetime.datetime(2016, 3, 11, 9, 0, 7), 1)], ['dt', 'v']) Group the data into 5 second time windows and aggregate as sum. - >>> w = df.groupBy(window("date", "5 seconds")).agg(sum("val").alias("sum")) + >>> df2 = df.groupBy(sf.window('dt', '5 seconds')).agg(sf.sum('v')) Extract the window event time using the window_time function. - >>> w.select( - ... 
w.window.end.cast("string").alias("end"), - ... window_time(w.window).cast("string").alias("window_time"), - ... "sum" - ... ).collect() - [Row(end='2016-03-11 09:00:10', window_time='2016-03-11 09:00:09.999999', sum=1)] + >>> df2.select('*', sf.window_time('window')).show(truncate=False) + +------------------------------------------+------+--------------------------+ + |window |sum(v)|window_time(window) | + +------------------------------------------+------+--------------------------+ + |{2016-03-11 09:00:05, 2016-03-11 09:00:10}|1 |2016-03-11 09:00:09.999999| + +------------------------------------------+------+--------------------------+ """ from pyspark.sql.classic.column import _to_java_column @@ -12187,10 +12778,10 @@ def session_window(timeColumn: "ColumnOrName", gapDuration: Union[Column, str]) Parameters ---------- - timeColumn : :class:`~pyspark.sql.Column` or str + timeColumn : :class:`~pyspark.sql.Column` or column name The column name or column to use as the timestamp for windowing by time. The time column must be of TimestampType or TimestampNTZType. - gapDuration : :class:`~pyspark.sql.Column` or str + gapDuration : :class:`~pyspark.sql.Column` or literal string A Python string literal or column specifying the timeout of the session. It could be static value, e.g. `10 minutes`, `1 second`, or an expression/UDF that specifies gap duration dynamically based on the input row. @@ -12200,17 +12791,29 @@ def session_window(timeColumn: "ColumnOrName", gapDuration: Union[Column, str]) :class:`~pyspark.sql.Column` the column for computed results. + See Also + -------- + :meth:`pyspark.sql.functions.window` + :meth:`pyspark.sql.functions.window_time` + Examples -------- - >>> df = spark.createDataFrame([("2016-03-11 09:00:07", 1)]).toDF("date", "val") - >>> w = df.groupBy(session_window("date", "5 seconds")).agg(sum("val").alias("sum")) - >>> w.select(w.session_window.start.cast("string").alias("start"), - ... 
w.session_window.end.cast("string").alias("end"), "sum").collect() - [Row(start='2016-03-11 09:00:07', end='2016-03-11 09:00:12', sum=1)] - >>> w = df.groupBy(session_window("date", lit("5 seconds"))).agg(sum("val").alias("sum")) - >>> w.select(w.session_window.start.cast("string").alias("start"), - ... w.session_window.end.cast("string").alias("end"), "sum").collect() - [Row(start='2016-03-11 09:00:07', end='2016-03-11 09:00:12', sum=1)] + >>> from pyspark.sql import functions as sf + >>> df = spark.createDataFrame([('2016-03-11 09:00:07', 1)], ['dt', 'v']) + >>> df2 = df.groupBy(sf.session_window('dt', '5 seconds')).agg(sf.sum('v')) + >>> df2.show(truncate=False) + +------------------------------------------+------+ + |session_window |sum(v)| + +------------------------------------------+------+ + |{2016-03-11 09:00:07, 2016-03-11 09:00:12}|1 | + +------------------------------------------+------+ + + >>> df2.printSchema() + root + |-- session_window: struct (nullable = false) + | |-- start: timestamp (nullable = true) + | |-- end: timestamp (nullable = true) + |-- sum(v): long (nullable = true) """ from pyspark.sql.classic.column import _to_java_column @@ -12240,37 +12843,57 @@ def to_unix_timestamp( Parameters ---------- - timestamp : :class:`~pyspark.sql.Column` or str + timestamp : :class:`~pyspark.sql.Column` or column name Input column or strings. - format : :class:`~pyspark.sql.Column` or str, optional + format : :class:`~pyspark.sql.Column` or column name, optional format to use to convert UNIX timestamp values. + See Also + -------- + :meth:`pyspark.sql.functions.to_date` + :meth:`pyspark.sql.functions.to_timestamp` + :meth:`pyspark.sql.functions.to_timestamp_ltz` + :meth:`pyspark.sql.functions.to_timestamp_ntz` + :meth:`pyspark.sql.functions.to_utc_timestamp` + Examples -------- >>> spark.conf.set("spark.sql.session.timeZone", "America/Los_Angeles") - Example 1: Using default format 'yyyy-MM-dd HH:mm:ss' parses the timestamp string. 
+ Example 1: Using default format to parse the timestamp string. >>> import pyspark.sql.functions as sf - >>> time_df = spark.createDataFrame([('2015-04-08 12:12:12',)], ['dt']) - >>> time_df.select(sf.to_unix_timestamp('dt').alias('unix_time')).show() - +----------+ - | unix_time| - +----------+ - |1428520332| - +----------+ + >>> df = spark.createDataFrame([('2015-04-08 12:12:12',)], ['ts']) + >>> df.select('*', sf.to_unix_timestamp('ts')).show() + +-------------------+------------------------------------------+ + | ts|to_unix_timestamp(ts, yyyy-MM-dd HH:mm:ss)| + +-------------------+------------------------------------------+ + |2015-04-08 12:12:12| 1428520332| + +-------------------+------------------------------------------+ - Example 2: Using user-specified format 'yyyy-MM-dd' parses the timestamp string. + Example 2: Using user-specified format 'yyyy-MM-dd' to parse the date string. >>> import pyspark.sql.functions as sf - >>> time_df = spark.createDataFrame([('2015-04-08',)], ['dt']) - >>> time_df.select( - ... sf.to_unix_timestamp('dt', sf.lit('yyyy-MM-dd')).alias('unix_time')).show() - +----------+ - | unix_time| - +----------+ - |1428476400| - +----------+ + >>> df = spark.createDataFrame([('2015-04-08',)], ['dt']) + >>> df.select('*', sf.to_unix_timestamp(df.dt, sf.lit('yyyy-MM-dd'))).show() + +----------+---------------------------------+ + | dt|to_unix_timestamp(dt, yyyy-MM-dd)| + +----------+---------------------------------+ + |2015-04-08| 1428476400| + +----------+---------------------------------+ + + Example 3: Using a format column to represent different formats. + + >>> import pyspark.sql.functions as sf + >>> df = spark.createDataFrame( + ... 
[('2015-04-08', 'yyyy-MM-dd'), ('2025+01+09', 'yyyy+MM+dd')], ['dt', 'fmt']) + >>> df.select('*', sf.to_unix_timestamp('dt', 'fmt')).show() + +----------+----------+--------------------------+ + | dt| fmt|to_unix_timestamp(dt, fmt)| + +----------+----------+--------------------------+ + |2015-04-08|yyyy-MM-dd| 1428476400| + |2025+01+09|yyyy+MM+dd| 1736409600| + +----------+----------+--------------------------+ >>> spark.conf.unset("spark.sql.session.timeZone") """ @@ -12286,29 +12909,63 @@ def to_timestamp_ltz( format: Optional["ColumnOrName"] = None, ) -> Column: """ - Parses the `timestamp` with the `format` to a timestamp without time zone. + Parses the `timestamp` with the `format` to a timestamp with time zone. Returns null with invalid input. .. versionadded:: 3.5.0 Parameters ---------- - timestamp : :class:`~pyspark.sql.Column` or str + timestamp : :class:`~pyspark.sql.Column` or column name Input column or strings. - format : :class:`~pyspark.sql.Column` or str, optional + format : :class:`~pyspark.sql.Column` or column name, optional format to use to convert type `TimestampType` timestamp values. + See Also + -------- + :meth:`pyspark.sql.functions.to_date` + :meth:`pyspark.sql.functions.to_timestamp` + :meth:`pyspark.sql.functions.to_timestamp_ntz` + :meth:`pyspark.sql.functions.to_utc_timestamp` + :meth:`pyspark.sql.functions.to_unix_timestamp` + :meth:`pyspark.sql.functions.date_format` + Examples -------- - >>> df = spark.createDataFrame([("2016-12-31",)], ["e"]) - >>> df.select(to_timestamp_ltz(df.e, lit("yyyy-MM-dd")).alias('r')).collect() - ... # doctest: +SKIP - [Row(r=datetime.datetime(2016, 12, 31, 0, 0))] + Example 1: Using default format to parse the timestamp string. - >>> df = spark.createDataFrame([("2016-12-31",)], ["e"]) - >>> df.select(to_timestamp_ltz(df.e).alias('r')).collect() - ... 
# doctest: +SKIP - [Row(r=datetime.datetime(2016, 12, 31, 0, 0))] + >>> import pyspark.sql.functions as sf + >>> df = spark.createDataFrame([('2015-04-08 12:12:12',)], ['ts']) + >>> df.select('*', sf.to_timestamp_ltz('ts')).show() + +-------------------+--------------------+ + | ts|to_timestamp_ltz(ts)| + +-------------------+--------------------+ + |2015-04-08 12:12:12| 2015-04-08 12:12:12| + +-------------------+--------------------+ + + Example 2: Using user-specified format to parse the date string. + + >>> import pyspark.sql.functions as sf + >>> df = spark.createDataFrame([('2016-12-31',)], ['dt']) + >>> df.select('*', sf.to_timestamp_ltz(df.dt, sf.lit('yyyy-MM-dd'))).show() + +----------+--------------------------------+ + | dt|to_timestamp_ltz(dt, yyyy-MM-dd)| + +----------+--------------------------------+ + |2016-12-31| 2016-12-31 00:00:00| + +----------+--------------------------------+ + + Example 3: Using a format column to represent different formats. + + >>> import pyspark.sql.functions as sf + >>> df = spark.createDataFrame( + ... [('2015-04-08', 'yyyy-MM-dd'), ('2025+01+09', 'yyyy+MM+dd')], ['dt', 'fmt']) + >>> df.select('*', sf.to_timestamp_ltz('dt', 'fmt')).show() + +----------+----------+-------------------------+ + | dt| fmt|to_timestamp_ltz(dt, fmt)| + +----------+----------+-------------------------+ + |2015-04-08|yyyy-MM-dd| 2015-04-08 00:00:00| + |2025+01+09|yyyy+MM+dd| 2025-01-09 00:00:00| + +----------+----------+-------------------------+ """ if format is not None: return _invoke_function_over_columns("to_timestamp_ltz", timestamp, format) @@ -12329,22 +12986,56 @@ def to_timestamp_ntz( Parameters ---------- - timestamp : :class:`~pyspark.sql.Column` or str + timestamp : :class:`~pyspark.sql.Column` or column name Input column or strings. - format : :class:`~pyspark.sql.Column` or str, optional + format : :class:`~pyspark.sql.Column` or column name, optional format to use to convert type `TimestampNTZType` timestamp values. 
+ See Also + -------- + :meth:`pyspark.sql.functions.to_date` + :meth:`pyspark.sql.functions.to_timestamp` + :meth:`pyspark.sql.functions.to_timestamp_ltz` + :meth:`pyspark.sql.functions.to_utc_timestamp` + :meth:`pyspark.sql.functions.to_unix_timestamp` + :meth:`pyspark.sql.functions.date_format` + Examples -------- - >>> df = spark.createDataFrame([("2016-04-08",)], ["e"]) - >>> df.select(to_timestamp_ntz(df.e, lit("yyyy-MM-dd")).alias('r')).collect() - ... # doctest: +SKIP - [Row(r=datetime.datetime(2016, 4, 8, 0, 0))] + Example 1: Using default format to parse the timestamp string. - >>> df = spark.createDataFrame([("2016-04-08",)], ["e"]) - >>> df.select(to_timestamp_ntz(df.e).alias('r')).collect() - ... # doctest: +SKIP - [Row(r=datetime.datetime(2016, 4, 8, 0, 0))] + >>> import pyspark.sql.functions as sf + >>> df = spark.createDataFrame([('2015-04-08 12:12:12',)], ['ts']) + >>> df.select('*', sf.to_timestamp_ntz('ts')).show() + +-------------------+--------------------+ + | ts|to_timestamp_ntz(ts)| + +-------------------+--------------------+ + |2015-04-08 12:12:12| 2015-04-08 12:12:12| + +-------------------+--------------------+ + + Example 2: Using user-specified format 'yyyy-MM-dd' to parse the date string. + + >>> import pyspark.sql.functions as sf + >>> df = spark.createDataFrame([('2016-12-31',)], ['dt']) + >>> df.select('*', sf.to_timestamp_ntz(df.dt, sf.lit('yyyy-MM-dd'))).show() + +----------+--------------------------------+ + | dt|to_timestamp_ntz(dt, yyyy-MM-dd)| + +----------+--------------------------------+ + |2016-12-31| 2016-12-31 00:00:00| + +----------+--------------------------------+ + + Example 3: Using a format column to represent different formats. + + >>> import pyspark.sql.functions as sf + >>> df = spark.createDataFrame( + ... 
[('2015-04-08', 'yyyy-MM-dd'), ('2025+01+09', 'yyyy+MM+dd')], ['dt', 'fmt']) + >>> df.select('*', sf.to_timestamp_ntz('dt', 'fmt')).show() + +----------+----------+-------------------------+ + | dt| fmt|to_timestamp_ntz(dt, fmt)| + +----------+----------+-------------------------+ + |2015-04-08|yyyy-MM-dd| 2015-04-08 00:00:00| + |2025+01+09|yyyy+MM+dd| 2025-01-09 00:00:00| + +----------+----------+-------------------------+ """ if format is not None: return _invoke_function_over_columns("to_timestamp_ntz", timestamp, format) @@ -12361,9 +13052,15 @@ def current_catalog() -> Column: .. versionadded:: 3.5.0 + See Also + -------- + :meth:`pyspark.sql.functions.current_database` + :meth:`pyspark.sql.functions.current_schema` + Examples -------- - >>> spark.range(1).select(current_catalog()).show() + >>> import pyspark.sql.functions as sf + >>> spark.range(1).select(sf.current_catalog()).show() +-----------------+ |current_catalog()| +-----------------+ @@ -12379,9 +13076,15 @@ def current_database() -> Column: .. versionadded:: 3.5.0 + See Also + -------- + :meth:`pyspark.sql.functions.current_catalog` + :meth:`pyspark.sql.functions.current_schema` + Examples -------- - >>> spark.range(1).select(current_database()).show() + >>> import pyspark.sql.functions as sf + >>> spark.range(1).select(sf.current_database()).show() +----------------+ |current_schema()| +----------------+ @@ -12397,6 +13100,11 @@ def current_schema() -> Column: .. versionadded:: 3.5.0 + See Also + -------- + :meth:`pyspark.sql.functions.current_catalog` + :meth:`pyspark.sql.functions.current_database` + Examples -------- >>> import pyspark.sql.functions as sf @@ -12416,9 +13124,15 @@ def current_user() -> Column: .. 
versionadded:: 3.5.0 + See Also + -------- + :meth:`pyspark.sql.functions.user` + :meth:`pyspark.sql.functions.session_user` + Examples -------- - >>> spark.range(1).select(current_user()).show() # doctest: +SKIP + >>> import pyspark.sql.functions as sf + >>> spark.range(1).select(sf.current_user()).show() # doctest: +SKIP +--------------+ |current_user()| +--------------+ @@ -12434,6 +13148,11 @@ def user() -> Column: .. versionadded:: 3.5.0 + See Also + -------- + :meth:`pyspark.sql.functions.current_user` + :meth:`pyspark.sql.functions.session_user` + Examples -------- >>> import pyspark.sql.functions as sf @@ -12453,6 +13172,11 @@ def session_user() -> Column: .. versionadded:: 4.0.0 + See Also + -------- + :meth:`pyspark.sql.functions.user` + :meth:`pyspark.sql.functions.current_user` + Examples -------- >>> import pyspark.sql.functions as sf @@ -12469,7 +13193,7 @@ def session_user() -> Column: @_try_remote_functions def crc32(col: "ColumnOrName") -> Column: """ - Calculates the cyclic redundancy check value (CRC32) of a binary column and + Calculates the cyclic redundancy check value (CRC32) of a binary column and returns the value as a bigint. .. versionchanged:: 3.4.0 @@ -12477,7 +13201,7 @@ def crc32(col: "ColumnOrName") -> Column: Parameters ---------- - col : :class:`~pyspark.sql.Column` or str + col : :class:`~pyspark.sql.Column` or column name target column to compute on. 
Returns @@ -12489,8 +13213,14 @@ def crc32(col: "ColumnOrName") -> Column: Examples -------- - >>> spark.createDataFrame([('ABC',)], ['a']).select(crc32('a').alias('crc32')).collect() - [Row(crc32=2743272264)] + >>> import pyspark.sql.functions as sf + >>> df = spark.createDataFrame([('ABC',)], ['a']) + >>> df.select('*', sf.crc32('a')).show(truncate=False) + +---+----------+ + |a |crc32(a) | + +---+----------+ + |ABC|2743272264| + +---+----------+ """ return _invoke_function_over_columns("crc32", col) @@ -12506,7 +13236,7 @@ def md5(col: "ColumnOrName") -> Column: Parameters ---------- - col : :class:`~pyspark.sql.Column` or str + col : :class:`~pyspark.sql.Column` or column name target column to compute on. Returns @@ -12516,8 +13246,14 @@ def md5(col: "ColumnOrName") -> Column: Examples -------- - >>> spark.createDataFrame([('ABC',)], ['a']).select(md5('a').alias('hash')).collect() - [Row(hash='902fbdd2b1df0c4f70b4a5d23525e932')] + >>> import pyspark.sql.functions as sf + >>> df = spark.createDataFrame([('ABC',)], ['a']) + >>> df.select('*', sf.md5('a')).show(truncate=False) + +---+--------------------------------+ + |a |md5(a) | + +---+--------------------------------+ + |ABC|902fbdd2b1df0c4f70b4a5d23525e932| + +---+--------------------------------+ """ return _invoke_function_over_columns("md5", col) @@ -12533,7 +13269,7 @@ def sha1(col: "ColumnOrName") -> Column: Parameters ---------- - col : :class:`~pyspark.sql.Column` or str + col : :class:`~pyspark.sql.Column` or column name target column to compute on. Returns @@ -12541,10 +13277,21 @@ def sha1(col: "ColumnOrName") -> Column: :class:`~pyspark.sql.Column` the column for computed results. 
+ See Also + -------- + :meth:`pyspark.sql.functions.sha` + :meth:`pyspark.sql.functions.sha2` + Examples -------- - >>> spark.createDataFrame([('ABC',)], ['a']).select(sha1('a').alias('hash')).collect() - [Row(hash='3c01bdbb26f358bab27f267924aa2c9a03fcfdb8')] + >>> import pyspark.sql.functions as sf + >>> df = spark.createDataFrame([('ABC',)], ['a']) + >>> df.select('*', sf.sha1('a')).show(truncate=False) + +---+----------------------------------------+ + |a |sha1(a) | + +---+----------------------------------------+ + |ABC|3c01bdbb26f358bab27f267924aa2c9a03fcfdb8| + +---+----------------------------------------+ """ return _invoke_function_over_columns("sha1", col) @@ -12562,7 +13309,7 @@ def sha2(col: "ColumnOrName", numBits: int) -> Column: Parameters ---------- - col : :class:`~pyspark.sql.Column` or str + col : :class:`~pyspark.sql.Column` or column name target column to compute on. numBits : int the desired bit length of the result, which must have a @@ -12573,12 +13320,18 @@ def sha2(col: "ColumnOrName", numBits: int) -> Column: :class:`~pyspark.sql.Column` the column for computed results. 
+ See Also + -------- + :meth:`pyspark.sql.functions.sha` + :meth:`pyspark.sql.functions.sha1` + Examples -------- - >>> df = spark.createDataFrame([["Alice"], ["Bob"]], ["name"]) - >>> df.withColumn("sha2", sha2(df.name, 256)).show(truncate=False) + >>> import pyspark.sql.functions as sf + >>> df = spark.createDataFrame([['Alice'], ['Bob']], ['name']) + >>> df.select('*', sf.sha2('name', 256)).show(truncate=False) +-----+----------------------------------------------------------------+ - |name |sha2 | + |name |sha2(name, 256) | +-----+----------------------------------------------------------------+ |Alice|3bc51062973c458d5a6f2d8d64a023246354ad7e064b1e4e009ec8a0699a3043| |Bob |cd9fb1e148ccd8442e5aa74904cc73bf6fb54d1d54d333bd596aa9bb4bb4e961| @@ -12608,7 +13361,7 @@ def hash(*cols: "ColumnOrName") -> Column: Parameters ---------- - cols : :class:`~pyspark.sql.Column` or str + cols : :class:`~pyspark.sql.Column` or column name one or more columns to compute on. Returns @@ -12616,27 +13369,34 @@ def hash(*cols: "ColumnOrName") -> Column: :class:`~pyspark.sql.Column` hash value as int column. 
+ See Also + -------- + :meth:`pyspark.sql.functions.xxhash64` + Examples -------- + >>> import pyspark.sql.functions as sf >>> df = spark.createDataFrame([('ABC', 'DEF')], ['c1', 'c2']) + >>> df.select('*', sf.hash('c1')).show() + +---+---+----------+ + | c1| c2| hash(c1)| + +---+---+----------+ + |ABC|DEF|-757602832| + +---+---+----------+ - Hash for one column - - >>> df.select(hash('c1').alias('hash')).show() - +----------+ - | hash| - +----------+ - |-757602832| - +----------+ - - Two or more columns + >>> df.select('*', sf.hash('c1', df.c2)).show() + +---+---+------------+ + | c1| c2|hash(c1, c2)| + +---+---+------------+ + |ABC|DEF| 599895104| + +---+---+------------+ - >>> df.select(hash('c1', 'c2').alias('hash')).show() - +---------+ - | hash| - +---------+ - |599895104| - +---------+ + >>> df.select('*', sf.hash('*')).show() + +---+---+------------+ + | c1| c2|hash(c1, c2)| + +---+---+------------+ + |ABC|DEF| 599895104| + +---+---+------------+ """ return _invoke_function_over_seq_of_columns("hash", cols) @@ -12653,7 +13413,7 @@ def xxhash64(*cols: "ColumnOrName") -> Column: Parameters ---------- - cols : :class:`~pyspark.sql.Column` or str + cols : :class:`~pyspark.sql.Column` or column name one or more columns to compute on. Returns @@ -12661,27 +13421,34 @@ def xxhash64(*cols: "ColumnOrName") -> Column: :class:`~pyspark.sql.Column` hash value as long column. 
+ See Also + -------- + :meth:`pyspark.sql.functions.hash` + Examples -------- + >>> import pyspark.sql.functions as sf >>> df = spark.createDataFrame([('ABC', 'DEF')], ['c1', 'c2']) - - Hash for one column - - >>> df.select(xxhash64('c1').alias('hash')).show() - +-------------------+ - | hash| - +-------------------+ - |4105715581806190027| - +-------------------+ - - Two or more columns - - >>> df.select(xxhash64('c1', 'c2').alias('hash')).show() - +-------------------+ - | hash| - +-------------------+ - |3233247871021311208| - +-------------------+ + >>> df.select('*', sf.xxhash64('c1')).show() + +---+---+-------------------+ + | c1| c2| xxhash64(c1)| + +---+---+-------------------+ + |ABC|DEF|4105715581806190027| + +---+---+-------------------+ + + >>> df.select('*', sf.xxhash64('c1', df.c2)).show() + +---+---+-------------------+ + | c1| c2| xxhash64(c1, c2)| + +---+---+-------------------+ + |ABC|DEF|3233247871021311208| + +---+---+-------------------+ + + >>> df.select('*', sf.xxhash64('*')).show() + +---+---+-------------------+ + | c1| c2| xxhash64(c1, c2)| + +---+---+-------------------+ + |ABC|DEF|3233247871021311208| + +---+---+-------------------+ """ return _invoke_function_over_seq_of_columns("xxhash64", cols) @@ -12699,9 +13466,9 @@ def assert_true(col: "ColumnOrName", errMsg: Optional[Union[Column, str]] = None Parameters ---------- - col : :class:`~pyspark.sql.Column` or str + col : :class:`~pyspark.sql.Column` or column name column name or column that represents the input column to test - errMsg : :class:`~pyspark.sql.Column` or str, optional + errMsg : :class:`~pyspark.sql.Column` or literal string, optional A Python string literal or column containing the error message Returns @@ -12709,16 +13476,36 @@ def assert_true(col: "ColumnOrName", errMsg: Optional[Union[Column, str]] = None :class:`~pyspark.sql.Column` `null` if the input column is `true` otherwise throws an error with specified message. 
+ See Also + -------- + :meth:`pyspark.sql.functions.raise_error` + Examples -------- - >>> df = spark.createDataFrame([(0,1)], ['a', 'b']) - >>> df.select(assert_true(df.a < df.b).alias('r')).collect() - [Row(r=None)] - >>> df.select(assert_true(df.a < df.b, df.a).alias('r')).collect() - [Row(r=None)] - >>> df.select(assert_true(df.a < df.b, 'error').alias('r')).collect() - [Row(r=None)] - >>> df.select(assert_true(df.a > df.b, 'My error msg').alias('r')).collect() # doctest: +SKIP + >>> import pyspark.sql.functions as sf + >>> df = spark.createDataFrame([(0, 1)], ['a', 'b']) + >>> df.select('*', sf.assert_true(df.a < df.b)).show() # doctest: +SKIP + +------------------------------------------------------+ + |assert_true((a < b), '(a#788L < b#789L)' is not true!)| + +------------------------------------------------------+ + | NULL| + +------------------------------------------------------+ + + >>> df.select('*', sf.assert_true(df.a < df.b, df.a)).show() + +---+---+-----------------------+ + | a| b|assert_true((a < b), a)| + +---+---+-----------------------+ + | 0| 1| NULL| + +---+---+-----------------------+ + + >>> df.select('*', sf.assert_true(df.a < df.b, 'error')).show() + +---+---+---------------------------+ + | a| b|assert_true((a < b), error)| + +---+---+---------------------------+ + | 0| 1| NULL| + +---+---+---------------------------+ + + >>> df.select('*', sf.assert_true(df.a > df.b, 'My error msg')).show() # doctest: +SKIP ... java.lang.RuntimeException: My error msg ... @@ -12746,7 +13533,7 @@ def raise_error(errMsg: Union[Column, str]) -> Column: Parameters ---------- - errMsg : :class:`~pyspark.sql.Column` or str + errMsg : :class:`~pyspark.sql.Column` or literal string A Python string literal or column containing the error message Returns @@ -12754,10 +13541,14 @@ def raise_error(errMsg: Union[Column, str]) -> Column: :class:`~pyspark.sql.Column` throws an error with specified message. 
+ See Also + -------- + :meth:`pyspark.sql.functions.assert_true` + Examples -------- - >>> df = spark.range(1) - >>> df.select(raise_error("My error message")).show() # doctest: +SKIP + >>> import pyspark.sql.functions as sf + >>> spark.range(1).select(sf.raise_error("My error message")).show() # doctest: +SKIP ... java.lang.RuntimeException: My error message ... @@ -12786,7 +13577,7 @@ def upper(col: "ColumnOrName") -> Column: Parameters ---------- - col : :class:`~pyspark.sql.Column` or str + col : :class:`~pyspark.sql.Column` or column name target column to work on. Returns @@ -12794,17 +13585,22 @@ def upper(col: "ColumnOrName") -> Column: :class:`~pyspark.sql.Column` upper case values. + See Also + -------- + :meth:`pyspark.sql.functions.lower` + Examples -------- + >>> from pyspark.sql import functions as sf >>> df = spark.createDataFrame(["Spark", "PySpark", "Pandas API"], "STRING") - >>> df.select(upper("value")).show() - +------------+ - |upper(value)| - +------------+ - | SPARK| - | PYSPARK| - | PANDAS API| - +------------+ + >>> df.select("*", sf.upper("value")).show() + +----------+------------+ + | value|upper(value)| + +----------+------------+ + | Spark| SPARK| + | PySpark| PYSPARK| + |Pandas API| PANDAS API| + +----------+------------+ """ return _invoke_function_over_columns("upper", col) @@ -12821,7 +13617,7 @@ def lower(col: "ColumnOrName") -> Column: Parameters ---------- - col : :class:`~pyspark.sql.Column` or str + col : :class:`~pyspark.sql.Column` or column name target column to work on. Returns @@ -12829,17 +13625,22 @@ def lower(col: "ColumnOrName") -> Column: :class:`~pyspark.sql.Column` lower case values. 
+ See Also + -------- + :meth:`pyspark.sql.functions.upper` + Examples -------- + >>> from pyspark.sql import functions as sf >>> df = spark.createDataFrame(["Spark", "PySpark", "Pandas API"], "STRING") - >>> df.select(lower("value")).show() - +------------+ - |lower(value)| - +------------+ - | spark| - | pyspark| - | pandas api| - +------------+ + >>> df.select("*", sf.lower("value")).show() + +----------+------------+ + | value|lower(value)| + +----------+------------+ + | Spark| spark| + | PySpark| pyspark| + |Pandas API| pandas api| + +----------+------------+ """ return _invoke_function_over_columns("lower", col) @@ -12856,7 +13657,7 @@ def ascii(col: "ColumnOrName") -> Column: Parameters ---------- - col : :class:`~pyspark.sql.Column` or str + col : :class:`~pyspark.sql.Column` or column name target column to work on. Returns @@ -12866,15 +13667,16 @@ def ascii(col: "ColumnOrName") -> Column: Examples -------- + >>> from pyspark.sql import functions as sf >>> df = spark.createDataFrame(["Spark", "PySpark", "Pandas API"], "STRING") - >>> df.select(ascii("value")).show() - +------------+ - |ascii(value)| - +------------+ - | 83| - | 80| - | 80| - +------------+ + >>> df.select("*", sf.ascii("value")).show() + +----------+------------+ + | value|ascii(value)| + +----------+------------+ + | Spark| 83| + | PySpark| 80| + |Pandas API| 80| + +----------+------------+ """ return _invoke_function_over_columns("ascii", col) @@ -12891,7 +13693,7 @@ def base64(col: "ColumnOrName") -> Column: Parameters ---------- - col : :class:`~pyspark.sql.Column` or str + col : :class:`~pyspark.sql.Column` or column name target column to work on. Returns @@ -12899,17 +13701,22 @@ def base64(col: "ColumnOrName") -> Column: :class:`~pyspark.sql.Column` BASE64 encoding of string value. 
+ See Also + -------- + :meth:`pyspark.sql.functions.unbase64` + Examples -------- + >>> from pyspark.sql import functions as sf >>> df = spark.createDataFrame(["Spark", "PySpark", "Pandas API"], "STRING") - >>> df.select(base64("value")).show() - +----------------+ - | base64(value)| - +----------------+ - | U3Bhcms=| - | UHlTcGFyaw==| - |UGFuZGFzIEFQSQ==| - +----------------+ + >>> df.select("*", sf.base64("value")).show() + +----------+----------------+ + | value| base64(value)| + +----------+----------------+ + | Spark| U3Bhcms=| + | PySpark| UHlTcGFyaw==| + |Pandas API|UGFuZGFzIEFQSQ==| + +----------+----------------+ """ return _invoke_function_over_columns("base64", col) @@ -12926,7 +13733,7 @@ def unbase64(col: "ColumnOrName") -> Column: Parameters ---------- - col : :class:`~pyspark.sql.Column` or str + col : :class:`~pyspark.sql.Column` or column name target column to work on. Returns @@ -12934,19 +13741,22 @@ def unbase64(col: "ColumnOrName") -> Column: :class:`~pyspark.sql.Column` encoded string value. + See Also + -------- + :meth:`pyspark.sql.functions.base64` + Examples -------- - >>> df = spark.createDataFrame(["U3Bhcms=", - ... "UHlTcGFyaw==", - ... 
"UGFuZGFzIEFQSQ=="], "STRING") - >>> df.select(unbase64("value")).show() - +--------------------+ - | unbase64(value)| - +--------------------+ - | [53 70 61 72 6B]| - |[50 79 53 70 61 7...| - |[50 61 6E 64 61 7...| - +--------------------+ + >>> from pyspark.sql import functions as sf + >>> df = spark.createDataFrame(["U3Bhcms=", "UHlTcGFyaw==", "UGFuZGFzIEFQSQ=="], "STRING") + >>> df.select("*", sf.unbase64("value")).show(truncate=False) + +----------------+-------------------------------+ + |value |unbase64(value) | + +----------------+-------------------------------+ + |U3Bhcms= |[53 70 61 72 6B] | + |UHlTcGFyaw== |[50 79 53 70 61 72 6B] | + |UGFuZGFzIEFQSQ==|[50 61 6E 64 61 73 20 41 50 49]| + +----------------+-------------------------------+ """ return _invoke_function_over_columns("unbase64", col) @@ -12963,9 +13773,9 @@ def ltrim(col: "ColumnOrName", trim: Optional["ColumnOrName"] = None) -> Column: Parameters ---------- - col : :class:`~pyspark.sql.Column` or str + col : :class:`~pyspark.sql.Column` or column name target column to work on. - trim : :class:`~pyspark.sql.Column` or str, optional + trim : :class:`~pyspark.sql.Column` or column name, optional The trim string characters to trim, the default value is a single space .. versionadded:: 4.0.0 @@ -12975,6 +13785,11 @@ def ltrim(col: "ColumnOrName", trim: Optional["ColumnOrName"] = None) -> Column: :class:`~pyspark.sql.Column` left trimmed values. 
+ See Also + -------- + :meth:`pyspark.sql.functions.trim` + :meth:`pyspark.sql.functions.rtrim` + Examples -------- Example 1: Trim the spaces @@ -13002,6 +13817,18 @@ def ltrim(col: "ColumnOrName", trim: Optional["ColumnOrName"] = None) -> Column: | Spark**| Spark**| | *Spark| Spark| +--------+--------------------------+ + + Example 3: Trim a column containing different characters + + >>> from pyspark.sql import functions as sf + >>> df = spark.createDataFrame([("**Spark*", "*"), ("==Spark=", "=")], ["value", "t"]) + >>> df.select("*", sf.ltrim("value", "t")).show() + +--------+---+--------------------------+ + | value| t|TRIM(LEADING t FROM value)| + +--------+---+--------------------------+ + |**Spark*| *| Spark*| + |==Spark=| =| Spark=| + +--------+---+--------------------------+ """ if trim is not None: return _invoke_function_over_columns("ltrim", col, trim) @@ -13021,9 +13848,9 @@ def rtrim(col: "ColumnOrName", trim: Optional["ColumnOrName"] = None) -> Column: Parameters ---------- - col : :class:`~pyspark.sql.Column` or str + col : :class:`~pyspark.sql.Column` or column name target column to work on. - trim : :class:`~pyspark.sql.Column` or str, optional + trim : :class:`~pyspark.sql.Column` or column name, optional The trim string characters to trim, the default value is a single space .. versionadded:: 4.0.0 @@ -13033,6 +13860,11 @@ def rtrim(col: "ColumnOrName", trim: Optional["ColumnOrName"] = None) -> Column: :class:`~pyspark.sql.Column` right trimmed values. 
+ See Also + -------- + :meth:`pyspark.sql.functions.trim` + :meth:`pyspark.sql.functions.ltrim` + Examples -------- Example 1: Trim the spaces @@ -13060,6 +13892,18 @@ def rtrim(col: "ColumnOrName", trim: Optional["ColumnOrName"] = None) -> Column: | Spark**| Spark| | *Spark| *Spark| +--------+---------------------------+ + + Example 3: Trim a column containing different characters + + >>> from pyspark.sql import functions as sf + >>> df = spark.createDataFrame([("**Spark*", "*"), ("==Spark=", "=")], ["value", "t"]) + >>> df.select("*", sf.rtrim("value", "t")).show() + +--------+---+---------------------------+ + | value| t|TRIM(TRAILING t FROM value)| + +--------+---+---------------------------+ + |**Spark*| *| **Spark| + |==Spark=| =| ==Spark| + +--------+---+---------------------------+ """ if trim is not None: return _invoke_function_over_columns("rtrim", col, trim) @@ -13079,9 +13923,9 @@ def trim(col: "ColumnOrName", trim: Optional["ColumnOrName"] = None) -> Column: Parameters ---------- - col : :class:`~pyspark.sql.Column` or str + col : :class:`~pyspark.sql.Column` or column name target column to work on. - trim : :class:`~pyspark.sql.Column` or str, optional + trim : :class:`~pyspark.sql.Column` or column name, optional The trim string characters to trim, the default value is a single space .. versionadded:: 4.0.0 @@ -13091,6 +13935,11 @@ def trim(col: "ColumnOrName", trim: Optional["ColumnOrName"] = None) -> Column: :class:`~pyspark.sql.Column` trimmed values from both sides. 
+ See Also + -------- + :meth:`pyspark.sql.functions.ltrim` + :meth:`pyspark.sql.functions.rtrim` + Examples -------- Example 1: Trim the spaces @@ -13118,6 +13967,18 @@ def trim(col: "ColumnOrName", trim: Optional["ColumnOrName"] = None) -> Column: | Spark**| Spark| | *Spark| Spark| +--------+-----------------------+ + + Example 3: Trim a column containing different characters + + >>> from pyspark.sql import functions as sf + >>> df = spark.createDataFrame([("**Spark*", "*"), ("==Spark=", "=")], ["value", "t"]) + >>> df.select("*", sf.trim("value", "t")).show() + +--------+---+-----------------------+ + | value| t|TRIM(BOTH t FROM value)| + +--------+---+-----------------------+ + |**Spark*| *| Spark| + |==Spark=| =| Spark| + +--------+---+-----------------------+ """ if trim is not None: return _invoke_function_over_columns("trim", col, trim) @@ -13138,9 +13999,9 @@ def concat_ws(sep: str, *cols: "ColumnOrName") -> Column: Parameters ---------- - sep : str + sep : literal string words separator. - cols : :class:`~pyspark.sql.Column` or str + cols : :class:`~pyspark.sql.Column` or column name list of columns to work on. Returns @@ -13148,11 +14009,20 @@ def concat_ws(sep: str, *cols: "ColumnOrName") -> Column: :class:`~pyspark.sql.Column` string of concatenated words. 
+ See Also + -------- + :meth:`pyspark.sql.functions.concat` + Examples -------- - >>> df = spark.createDataFrame([('abcd','123')], ['s', 'd']) - >>> df.select(concat_ws('-', df.s, df.d).alias('s')).collect() - [Row(s='abcd-123')] + >>> from pyspark.sql import functions as sf + >>> df = spark.createDataFrame([("abcd", "123")], ["s", "d"]) + >>> df.select("*", sf.concat_ws("-", df.s, "d", sf.lit("xyz"))).show() + +----+---+-----------------------+ + | s| d|concat_ws(-, s, d, xyz)| + +----+---+-----------------------+ + |abcd|123| abcd-123-xyz| + +----+---+-----------------------+ """ from pyspark.sql.classic.column import _to_seq, _to_java_column @@ -13173,9 +14043,9 @@ def decode(col: "ColumnOrName", charset: str) -> Column: Parameters ---------- - col : :class:`~pyspark.sql.Column` or str + col : :class:`~pyspark.sql.Column` or column name target column to work on. - charset : str + charset : literal string charset to use to decode to. Returns @@ -13183,15 +14053,20 @@ def decode(col: "ColumnOrName", charset: str) -> Column: :class:`~pyspark.sql.Column` the column for computed results. + See Also + -------- + :meth:`pyspark.sql.functions.encode` + Examples -------- - >>> df = spark.createDataFrame([('abcd',)], ['a']) - >>> df.select(decode("a", "UTF-8")).show() - +----------------+ - |decode(a, UTF-8)| - +----------------+ - | abcd| - +----------------+ + >>> from pyspark.sql import functions as sf + >>> df = spark.createDataFrame([(b"\x61\x62\x63\x64",)], ["a"]) + >>> df.select("*", sf.decode("a", "UTF-8")).show() + +-------------+----------------+ + | a|decode(a, UTF-8)| + +-------------+----------------+ + |[61 62 63 64]| abcd| + +-------------+----------------+ """ from pyspark.sql.classic.column import _to_java_column @@ -13211,9 +14086,9 @@ def encode(col: "ColumnOrName", charset: str) -> Column: Parameters ---------- - col : :class:`~pyspark.sql.Column` or str + col : :class:`~pyspark.sql.Column` or column name target column to work on. 
- charset : str + charset : literal string charset to use to encode. Returns @@ -13221,15 +14096,20 @@ def encode(col: "ColumnOrName", charset: str) -> Column: :class:`~pyspark.sql.Column` the column for computed results. + See Also + -------- + :meth:`pyspark.sql.functions.decode` + Examples -------- - >>> df = spark.createDataFrame([('abcd',)], ['c']) - >>> df.select(encode("c", "UTF-8")).show() - +----------------+ - |encode(c, UTF-8)| - +----------------+ - | [61 62 63 64]| - +----------------+ + >>> from pyspark.sql import functions as sf + >>> df = spark.createDataFrame([("abcd",)], ["c"]) + >>> df.select("*", sf.encode("c", "UTF-8")).show() + +----+----------------+ + | c|encode(c, UTF-8)| + +----+----------------+ + |abcd| [61 62 63 64]| + +----+----------------+ """ from pyspark.sql.classic.column import _to_java_column @@ -13245,7 +14125,7 @@ def is_valid_utf8(str: "ColumnOrName") -> Column: Parameters ---------- - str : :class:`~pyspark.sql.Column` or str + str : :class:`~pyspark.sql.Column` or column name A column of strings, each representing a UTF-8 byte sequence. Returns @@ -13253,6 +14133,12 @@ def is_valid_utf8(str: "ColumnOrName") -> Column: :class:`~pyspark.sql.Column` whether the input string is a valid UTF-8 string. + See Also + -------- + :meth:`pyspark.sql.functions.make_valid_utf8` + :meth:`pyspark.sql.functions.validate_utf8` + :meth:`pyspark.sql.functions.try_validate_utf8` + Examples -------- >>> import pyspark.sql.functions as sf @@ -13276,7 +14162,7 @@ def make_valid_utf8(str: "ColumnOrName") -> Column: Parameters ---------- - str : :class:`~pyspark.sql.Column` or str + str : :class:`~pyspark.sql.Column` or column name A column of strings, each representing a UTF-8 byte sequence. Returns @@ -13284,6 +14170,12 @@ def make_valid_utf8(str: "ColumnOrName") -> Column: :class:`~pyspark.sql.Column` the valid UTF-8 version of the given input string. 
+ See Also + -------- + :meth:`pyspark.sql.functions.is_valid_utf8` + :meth:`pyspark.sql.functions.validate_utf8` + :meth:`pyspark.sql.functions.try_validate_utf8` + Examples -------- >>> import pyspark.sql.functions as sf @@ -13306,7 +14198,7 @@ def validate_utf8(str: "ColumnOrName") -> Column: Parameters ---------- - str : :class:`~pyspark.sql.Column` or str + str : :class:`~pyspark.sql.Column` or column name A column of strings, each representing a UTF-8 byte sequence. Returns @@ -13314,6 +14206,12 @@ def validate_utf8(str: "ColumnOrName") -> Column: :class:`~pyspark.sql.Column` the input string if it is a valid UTF-8 string, error otherwise. + See Also + -------- + :meth:`pyspark.sql.functions.is_valid_utf8` + :meth:`pyspark.sql.functions.make_valid_utf8` + :meth:`pyspark.sql.functions.try_validate_utf8` + Examples -------- >>> import pyspark.sql.functions as sf @@ -13336,7 +14234,7 @@ def try_validate_utf8(str: "ColumnOrName") -> Column: Parameters ---------- - str : :class:`~pyspark.sql.Column` or str + str : :class:`~pyspark.sql.Column` or column name A column of strings, each representing a UTF-8 byte sequence. Returns @@ -13344,6 +14242,12 @@ def try_validate_utf8(str: "ColumnOrName") -> Column: :class:`~pyspark.sql.Column` the input string if it is a valid UTF-8 string, null otherwise. 
+ See Also + -------- + :meth:`pyspark.sql.functions.is_valid_utf8` + :meth:`pyspark.sql.functions.make_valid_utf8` + :meth:`pyspark.sql.functions.validate_utf8` + Examples -------- >>> import pyspark.sql.functions as sf @@ -13370,7 +14274,7 @@ def format_number(col: "ColumnOrName", d: int) -> Column: Parameters ---------- - col : :class:`~pyspark.sql.Column` or str + col : :class:`~pyspark.sql.Column` or column name the column name of the numeric value to be formatted d : int the N decimal places @@ -13382,8 +14286,14 @@ def format_number(col: "ColumnOrName", d: int) -> Column: Examples -------- - >>> spark.createDataFrame([(5,)], ['a']).select(format_number('a', 4).alias('v')).collect() - [Row(v='5.0000')] + >>> import pyspark.sql.functions as sf + >>> df = spark.createDataFrame([(5,)], ["a"]) + >>> df.select("*", sf.format_number("a", 4), sf.format_number(df.a, 6)).show() + +---+-------------------+-------------------+ + | a|format_number(a, 4)|format_number(a, 6)| + +---+-------------------+-------------------+ + | 5| 5.0000| 5.000000| + +---+-------------------+-------------------+ """ from pyspark.sql.classic.column import _to_java_column @@ -13402,9 +14312,9 @@ def format_string(format: str, *cols: "ColumnOrName") -> Column: Parameters ---------- - format : str + format : literal string string that can contain embedded format tags and used as result column's value - cols : :class:`~pyspark.sql.Column` or str + cols : :class:`~pyspark.sql.Column` or column name column names or :class:`~pyspark.sql.Column`\\s to be used in formatting Returns @@ -13412,11 +14322,20 @@ def format_string(format: str, *cols: "ColumnOrName") -> Column: :class:`~pyspark.sql.Column` the column of formatted results. 
+ See Also + -------- + :meth:`pyspark.sql.functions.printf` + Examples -------- - >>> df = spark.createDataFrame([(5, "hello")], ['a', 'b']) - >>> df.select(format_string('%d %s', df.a, df.b).alias('v')).collect() - [Row(v='5 hello')] + >>> import pyspark.sql.functions as sf + >>> df = spark.createDataFrame([(5, "hello")], ["a", "b"]) + >>> df.select("*", sf.format_string('%d %s', "a", df.b)).show() + +---+-----+--------------------------+ + | a| b|format_string(%d %s, a, b)| + +---+-----+--------------------------+ + | 5|hello| 5 hello| + +---+-----+--------------------------+ """ from pyspark.sql.classic.column import _to_seq, _to_java_column @@ -13457,6 +14376,13 @@ def instr(str: "ColumnOrName", substr: Union[Column, str]) -> Column: :class:`~pyspark.sql.Column` location of the first occurrence of the substring as integer. + See Also + -------- + :meth:`pyspark.sql.functions.locate` + :meth:`pyspark.sql.functions.substr` + :meth:`pyspark.sql.functions.substring` + :meth:`pyspark.sql.functions.substring_index` + Examples -------- Example 1: Using a literal string as the 'substring' @@ -13504,14 +14430,14 @@ def overlay( Parameters ---------- - src : :class:`~pyspark.sql.Column` or str - column name or column containing the string that will be replaced - replace : :class:`~pyspark.sql.Column` or str - column name or column containing the substitution string - pos : :class:`~pyspark.sql.Column` or str or int - column name, column, or int containing the starting position in src - len : :class:`~pyspark.sql.Column` or str or int, optional - column name, column, or int containing the number of bytes to replace in src + src : :class:`~pyspark.sql.Column` or column name + the string that will be replaced + replace : :class:`~pyspark.sql.Column` or column name + the substitution string + pos : :class:`~pyspark.sql.Column` or column name or int + the starting position in src + len : :class:`~pyspark.sql.Column` or column name or int, optional + the number of bytes to 
replace in src string by 'replace' defaults to -1, which represents the length of the 'replace' string Returns @@ -13521,13 +14447,28 @@ def overlay( Examples -------- + >>> from pyspark.sql import functions as sf >>> df = spark.createDataFrame([("SPARK_SQL", "CORE")], ("x", "y")) - >>> df.select(overlay("x", "y", 7).alias("overlayed")).collect() - [Row(overlayed='SPARK_CORE')] - >>> df.select(overlay("x", "y", 7, 0).alias("overlayed")).collect() - [Row(overlayed='SPARK_CORESQL')] - >>> df.select(overlay("x", "y", 7, 2).alias("overlayed")).collect() - [Row(overlayed='SPARK_COREL')] + >>> df.select("*", sf.overlay("x", df.y, 7)).show() + +---------+----+--------------------+ + | x| y|overlay(x, y, 7, -1)| + +---------+----+--------------------+ + |SPARK_SQL|CORE| SPARK_CORE| + +---------+----+--------------------+ + + >>> df.select("*", sf.overlay("x", df.y, 7, 0)).show() + +---------+----+-------------------+ + | x| y|overlay(x, y, 7, 0)| + +---------+----+-------------------+ + |SPARK_SQL|CORE| SPARK_CORESQL| + +---------+----+-------------------+ + + >>> df.select("*", sf.overlay("x", "y", 7, 2)).show() + +---------+----+-------------------+ + | x| y|overlay(x, y, 7, 2)| + +---------+----+-------------------+ + |SPARK_SQL|CORE| SPARK_COREL| + +---------+----+-------------------+ """ pos = _enum_to_value(pos) if not isinstance(pos, (int, str, Column)): @@ -13581,11 +14522,11 @@ def sentences( Parameters ---------- - string : :class:`~pyspark.sql.Column` or str + string : :class:`~pyspark.sql.Column` or column name a string to be split - language : :class:`~pyspark.sql.Column` or str, optional + language : :class:`~pyspark.sql.Column` or column name, optional a language of the locale - country : :class:`~pyspark.sql.Column` or str, optional + country : :class:`~pyspark.sql.Column` or column name, optional a country of the locale Returns @@ -13593,28 +14534,35 @@ def sentences( :class:`~pyspark.sql.Column` arrays of split sentences. 
+ See Also + -------- + :meth:`pyspark.sql.functions.split` + :meth:`pyspark.sql.functions.split_part` + Examples -------- - >>> df = spark.createDataFrame([["This is an example sentence."]], ["string"]) - >>> df.select(sentences(df.string, lit("en"), lit("US"))).show(truncate=False) - +-----------------------------------+ - |sentences(string, en, US) | - +-----------------------------------+ - |[[This, is, an, example, sentence]]| - +-----------------------------------+ - >>> df.select(sentences(df.string, lit("en"))).show(truncate=False) - +-----------------------------------+ - |sentences(string, en, ) | - +-----------------------------------+ - |[[This, is, an, example, sentence]]| - +-----------------------------------+ - >>> df = spark.createDataFrame([["Hello world. How are you?"]], ["s"]) - >>> df.select(sentences("s")).show(truncate=False) - +---------------------------------+ - |sentences(s, , ) | - +---------------------------------+ - |[[Hello, world], [How, are, you]]| - +---------------------------------+ + >>> from pyspark.sql import functions as sf + >>> df = spark.createDataFrame([("This is an example sentence.", )], ["s"]) + >>> df.select("*", sf.sentences(df.s, sf.lit("en"), sf.lit("US"))).show(truncate=False) + +----------------------------+-----------------------------------+ + |s |sentences(s, en, US) | + +----------------------------+-----------------------------------+ + |This is an example sentence.|[[This, is, an, example, sentence]]| + +----------------------------+-----------------------------------+ + + >>> df.select("*", sf.sentences(df.s, sf.lit("en"))).show(truncate=False) + +----------------------------+-----------------------------------+ + |s |sentences(s, en, ) | + +----------------------------+-----------------------------------+ + |This is an example sentence.|[[This, is, an, example, sentence]]| + +----------------------------+-----------------------------------+ + + >>> df.select("*", sf.sentences(df.s)).show(truncate=False) 
+ +----------------------------+-----------------------------------+ + |s |sentences(s, , ) | + +----------------------------+-----------------------------------+ + |This is an example sentence.|[[This, is, an, example, sentence]]| + +----------------------------+-----------------------------------+ """ if language is None: language = lit("") @@ -13646,15 +14594,15 @@ def substring( Parameters ---------- - str : :class:`~pyspark.sql.Column` or str + str : :class:`~pyspark.sql.Column` or column name target column to work on. - pos : :class:`~pyspark.sql.Column` or str or int + pos : :class:`~pyspark.sql.Column` or column name or int starting position in str. .. versionchanged:: 4.0.0 `pos` now accepts column and column name. - len : :class:`~pyspark.sql.Column` or str or int + len : :class:`~pyspark.sql.Column` or column name or int length of chars. .. versionchanged:: 4.0.0 @@ -13665,6 +14613,14 @@ def substring( :class:`~pyspark.sql.Column` substring of given value. + See Also + -------- + :meth:`pyspark.sql.functions.instr` + :meth:`pyspark.sql.functions.locate` + :meth:`pyspark.sql.functions.substr` + :meth:`pyspark.sql.functions.substring_index` + :meth:`pyspark.sql.Column.substr` + Examples -------- Example 1: Using literal integers as arguments @@ -13743,9 +14699,9 @@ def substring_index(str: "ColumnOrName", delim: str, count: int) -> Column: Parameters ---------- - str : :class:`~pyspark.sql.Column` or str + str : :class:`~pyspark.sql.Column` or column name target column to work on. - delim : str + delim : literal string delimiter of values. count : int number of occurrences. @@ -13755,13 +14711,31 @@ def substring_index(str: "ColumnOrName", delim: str, count: int) -> Column: :class:`~pyspark.sql.Column` substring of given value. 
+ See Also + -------- + :meth:`pyspark.sql.functions.instr` + :meth:`pyspark.sql.functions.locate` + :meth:`pyspark.sql.functions.substr` + :meth:`pyspark.sql.functions.substring` + :meth:`pyspark.sql.Column.substr` + Examples -------- + >>> from pyspark.sql import functions as sf >>> df = spark.createDataFrame([('a.b.c.d',)], ['s']) - >>> df.select(substring_index(df.s, '.', 2).alias('s')).collect() - [Row(s='a.b')] - >>> df.select(substring_index(df.s, '.', -3).alias('s')).collect() - [Row(s='b.c.d')] + >>> df.select('*', sf.substring_index(df.s, '.', 2)).show() + +-------+------------------------+ + | s|substring_index(s, ., 2)| + +-------+------------------------+ + |a.b.c.d| a.b| + +-------+------------------------+ + + >>> df.select('*', sf.substring_index('s', '.', -3)).show() + +-------+-------------------------+ + | s|substring_index(s, ., -3)| + +-------+-------------------------+ + |a.b.c.d| b.c.d| + +-------+-------------------------+ """ from pyspark.sql.classic.column import _to_java_column @@ -13783,16 +14757,15 @@ def levenshtein( Parameters ---------- - left : :class:`~pyspark.sql.Column` or str + left : :class:`~pyspark.sql.Column` or column name first column value. - right : :class:`~pyspark.sql.Column` or str + right : :class:`~pyspark.sql.Column` or column name second column value. threshold : int, optional if set when the levenshtein distance of the two given strings less than or equal to a given threshold then return result distance, or -1 - .. versionchanged: 3.5.0 - Added ``threshold`` argument. + .. 
versionadded:: 3.5.0 Returns ------- @@ -13801,11 +14774,21 @@ Examples -------- - >>> df0 = spark.createDataFrame([('kitten', 'sitting',)], ['l', 'r']) - >>> df0.select(levenshtein('l', 'r').alias('d')).collect() - [Row(d=3)] - >>> df0.select(levenshtein('l', 'r', 2).alias('d')).collect() - [Row(d=-1)] + >>> from pyspark.sql import functions as sf + >>> df = spark.createDataFrame([('kitten', 'sitting',)], ['l', 'r']) + >>> df.select('*', sf.levenshtein('l', 'r')).show() + +------+-------+-----------------+ + | l| r|levenshtein(l, r)| + +------+-------+-----------------+ + |kitten|sitting| 3| + +------+-------+-----------------+ + + >>> df.select('*', sf.levenshtein(df.l, df.r, 2)).show() + +------+-------+--------------------+ + | l| r|levenshtein(l, r, 2)| + +------+-------+--------------------+ + |kitten|sitting| -1| + +------+-------+--------------------+ """ from pyspark.sql.classic.column import _to_java_column @@ -13829,9 +14812,9 @@ def locate(substr: str, str: "ColumnOrName", pos: int = 1) -> Column: Parameters ---------- - substr : str + substr : literal string a string - str : :class:`~pyspark.sql.Column` or str + str : :class:`~pyspark.sql.Column` or column name a Column of :class:`pyspark.sql.types.StringType` pos : int, optional start position (zero based) @@ -13846,11 +14829,31 @@ def locate(substr: str, str: "ColumnOrName", pos: int = 1) -> Column: The position is not zero based, but 1 based index. Returns 0 if substr could not be found in str.
+ See Also + -------- + :meth:`pyspark.sql.functions.instr` + :meth:`pyspark.sql.functions.substr` + :meth:`pyspark.sql.functions.substring` + :meth:`pyspark.sql.functions.substring_index` + :meth:`pyspark.sql.Column.substr` + Examples -------- + >>> from pyspark.sql import functions as sf >>> df = spark.createDataFrame([('abcd',)], ['s',]) - >>> df.select(locate('b', df.s, 1).alias('s')).collect() - [Row(s=2)] + >>> df.select('*', sf.locate('b', 's', 1)).show() + +----+---------------+ + | s|locate(b, s, 1)| + +----+---------------+ + |abcd| 2| + +----+---------------+ + + >>> df.select('*', sf.locate('b', df.s, 3)).show() + +----+---------------+ + | s|locate(b, s, 3)| + +----+---------------+ + |abcd| 0| + +----+---------------+ """ from pyspark.sql.classic.column import _to_java_column @@ -13894,6 +14897,10 @@ def lpad( :class:`~pyspark.sql.Column` left padded result. + See Also + -------- + :meth:`pyspark.sql.functions.rpad` + Examples -------- Example 1: Pad with a literal string @@ -13960,6 +14967,10 @@ def rpad( :class:`~pyspark.sql.Column` right padded result. + See Also + -------- + :meth:`pyspark.sql.functions.lpad` + Examples -------- Example 1: Pad with a literal string @@ -14003,9 +15014,9 @@ def repeat(col: "ColumnOrName", n: Union["ColumnOrName", int]) -> Column: Parameters ---------- - col : :class:`~pyspark.sql.Column` or str + col : :class:`~pyspark.sql.Column` or column name target column to work on. - n : :class:`~pyspark.sql.Column` or str or int + n : :class:`~pyspark.sql.Column` or column name or int number of times to repeat value. .. versionchanged:: 4.0.0 @@ -14018,35 +15029,35 @@ def repeat(col: "ColumnOrName", n: Union["ColumnOrName", int]) -> Column: Examples -------- - >>> import pyspark.sql.functions as sf - >>> spark.createDataFrame( - ... [('ab',)], ['s',] - ... 
).select(sf.repeat("s", 3)).show() - +------------+ - |repeat(s, 3)| - +------------+ - | ababab| - +------------+ + Example 1: Repeat with a constant number of times >>> import pyspark.sql.functions as sf - >>> spark.createDataFrame( - ... [('ab',)], ['s',] - ... ).select(sf.repeat("s", sf.lit(4))).show() - +------------+ - |repeat(s, 4)| - +------------+ - | abababab| - +------------+ + >>> df = spark.createDataFrame([('ab',)], ['s',]) + >>> df.select("*", sf.repeat("s", 3)).show() + +---+------------+ + | s|repeat(s, 3)| + +---+------------+ + | ab| ababab| + +---+------------+ + + >>> df.select("*", sf.repeat(df.s, sf.lit(4))).show() + +---+------------+ + | s|repeat(s, 4)| + +---+------------+ + | ab| abababab| + +---+------------+ + + Example 2: Repeat with a column containing different number of times >>> import pyspark.sql.functions as sf - >>> spark.createDataFrame( - ... [('ab', 5,)], ['s', 't'] - ... ).select(sf.repeat("s", 't')).show() - +------------+ - |repeat(s, t)| - +------------+ - | ababababab| - +------------+ + >>> df = spark.createDataFrame([('ab', 5,), ('abc', 6,)], ['s', 't']) + >>> df.select("*", sf.repeat("s", "t")).show() + +---+---+------------------+ + | s| t| repeat(s, t)| + +---+---+------------------+ + | ab| 5| ababababab| + |abc| 6|abcabcabcabcabcabc| + +---+---+------------------+ """ n = _enum_to_value(n) n = lit(n) if isinstance(n, int) else n @@ -14069,12 +15080,18 @@ def split( Parameters ---------- - str : :class:`~pyspark.sql.Column` or str + str : :class:`~pyspark.sql.Column` or column name a string expression to split - pattern : :class:`~pyspark.sql.Column` or str + pattern : :class:`~pyspark.sql.Column` or literal string a string representing a regular expression. The regex string should be a Java regular expression. - limit : :class:`~pyspark.sql.Column` or str or int + + .. versionchanged:: 4.0.0 + `pattern` now accepts column. 
Does not accept column name since string type remain + accepted as a regular expression representation, for backwards compatibility. + In addition to int, `limit` now accepts column and column name. + + limit : :class:`~pyspark.sql.Column` or column name or int an integer which controls the number of times `pattern` is applied. * ``limit > 0``: The resulting array's length will not be more than `limit`, and the @@ -14086,61 +15103,66 @@ def split( .. versionchanged:: 3.0 `split` now takes an optional `limit` field. If not provided, default limit value is -1. - .. versionchanged:: 4.0.0 - `pattern` now accepts column. Does not accept column name since string type remain - accepted as a regular expression representation, for backwards compatibility. - In addition to int, `limit` now accepts column and column name. - Returns ------- :class:`~pyspark.sql.Column` array of separated strings. + See Also + -------- + :meth:`pyspark.sql.functions.sentences` + :meth:`pyspark.sql.functions.split_part` + Examples -------- - >>> import pyspark.sql.functions as sf - >>> df = spark.createDataFrame([('oneAtwoBthreeC',)], ['s',]) - >>> df.select(sf.split(df.s, '[ABC]', 2).alias('s')).show() - +-----------------+ - | s| - +-----------------+ - |[one, twoBthreeC]| - +-----------------+ + Example 1: Split with a constant pattern >>> import pyspark.sql.functions as sf >>> df = spark.createDataFrame([('oneAtwoBthreeC',)], ['s',]) - >>> df.select(sf.split(df.s, '[ABC]', -1).alias('s')).show() - +-------------------+ - | s| - +-------------------+ - |[one, two, three, ]| - +-------------------+ + >>> df.select('*', sf.split(df.s, '[ABC]')).show() + +--------------+-------------------+ + | s|split(s, [ABC], -1)| + +--------------+-------------------+ + |oneAtwoBthreeC|[one, two, three, ]| + +--------------+-------------------+ + + >>> df.select('*', sf.split(df.s, '[ABC]', 2)).show() + +--------------+------------------+ + | s|split(s, [ABC], 2)| + +--------------+------------------+ + 
|oneAtwoBthreeC| [one, twoBthreeC]| + +--------------+------------------+ + + >>> df.select('*', sf.split('s', '[ABC]', -2)).show() + +--------------+-------------------+ + | s|split(s, [ABC], -2)| + +--------------+-------------------+ + |oneAtwoBthreeC|[one, two, three, ]| + +--------------+-------------------+ + + Example 2: Split with a column containing different patterns and limits >>> import pyspark.sql.functions as sf - >>> df = spark.createDataFrame( - ... [('oneAtwoBthreeC', '[ABC]'), ('1A2B3C', '[1-9]+'), ('aa2bb3cc4', '[1-9]+')], - ... ['s', 'pattern'] - ... ) - >>> df.select(sf.split(df.s, df.pattern).alias('s')).show() - +-------------------+ - | s| - +-------------------+ - |[one, two, three, ]| - | [, A, B, C]| - | [aa, bb, cc, ]| - +-------------------+ - - >>> import pyspark.sql.functions as sf - >>> df = spark.createDataFrame( - ... [('oneAtwoBthreeC', '[ABC]', 2), ('1A2B3C', '[1-9]+', -1)], - ... ['s', 'pattern', 'expected_parts'] - ... ) - >>> df.select(sf.split(df.s, df.pattern, df.expected_parts).alias('s')).show() + >>> df = spark.createDataFrame([ + ... ('oneAtwoBthreeC', '[ABC]', 2), + ... ('1A2B3C', '[1-9]+', 1), + ... 
('aa2bb3cc4', '[1-9]+', -1)], ['s', 'p', 'l']) + >>> df.select('*', sf.split(df.s, df.p)).show() + +--------------+------+---+-------------------+ + | s| p| l| split(s, p, -1)| + +--------------+------+---+-------------------+ + |oneAtwoBthreeC| [ABC]| 2|[one, two, three, ]| + | 1A2B3C|[1-9]+| 1| [, A, B, C]| + | aa2bb3cc4|[1-9]+| -1| [aa, bb, cc, ]| + +--------------+------+---+-------------------+ + + >>> df.select(sf.split('s', df.p, 'l')).show() +-----------------+ - | s| + | split(s, p, l)| +-----------------+ |[one, twoBthreeC]| - | [, A, B, C]| + | [1A2B3C]| + | [aa, bb, cc, ]| +-----------------+ """ limit = _enum_to_value(limit) @@ -14305,16 +15327,29 @@ def randstr(length: Union[Column, int], seed: Optional[Union[Column, int]] = Non :class:`~pyspark.sql.Column` The generated random string with the specified length. + See Also + -------- + :meth:`pyspark.sql.functions.rand` + :meth:`pyspark.sql.functions.randn` + Examples -------- - >>> spark.createDataFrame([('3',)], ['a']) \\ - ... .select(randstr(lit(5), lit(0)).alias('result')) \\ - ... .selectExpr("length(result) > 0").show() - +--------------------+ - |(length(result) > 0)| - +--------------------+ - | true| - +--------------------+ + >>> import pyspark.sql.functions as sf + >>> spark.range(0, 10, 1, 1).select(sf.randstr(16, 3)).show() + +----------------+ + | randstr(16, 3)| + +----------------+ + |nurJIpH4cmmMnsCG| + |fl9YtT5m01trZtIt| + |PD19rAgscTHS7qQZ| + |2CuAICF5UJOruVv4| + |kNZEs8nDpJEoz3Rl| + |OXiU0KN5eaXfjXFs| + |qfnTM1BZAHtN0gBV| + |1p8XiSKwg33KnRPK| + |od5y5MucayQq1bKK| + |tklYPmKmc5sIppWM| + +----------------+ """ length = _enum_to_value(length) length = lit(length) @@ -14335,9 +15370,9 @@ def regexp_count(str: "ColumnOrName", regexp: "ColumnOrName") -> Column: Parameters ---------- - str : :class:`~pyspark.sql.Column` or str + str : :class:`~pyspark.sql.Column` or column name target column to work on. 
- regexp : :class:`~pyspark.sql.Column` or str + regexp : :class:`~pyspark.sql.Column` or column name regex pattern to apply. Returns @@ -14347,13 +15382,35 @@ def regexp_count(str: "ColumnOrName", regexp: "ColumnOrName") -> Column: Examples -------- + >>> from pyspark.sql import functions as sf >>> df = spark.createDataFrame([("1a 2b 14m", r"\d+")], ["str", "regexp"]) - >>> df.select(regexp_count('str', lit(r'\d+')).alias('d')).collect() - [Row(d=3)] - >>> df.select(regexp_count('str', lit(r'mmm')).alias('d')).collect() - [Row(d=0)] - >>> df.select(regexp_count("str", col("regexp")).alias('d')).collect() - [Row(d=3)] + >>> df.select('*', sf.regexp_count('str', sf.lit(r'\d+'))).show() + +---------+------+----------------------+ + | str|regexp|regexp_count(str, \d+)| + +---------+------+----------------------+ + |1a 2b 14m| \d+| 3| + +---------+------+----------------------+ + + >>> df.select('*', sf.regexp_count('str', sf.lit(r'mmm'))).show() + +---------+------+----------------------+ + | str|regexp|regexp_count(str, mmm)| + +---------+------+----------------------+ + |1a 2b 14m| \d+| 0| + +---------+------+----------------------+ + + >>> df.select('*', sf.regexp_count("str", sf.col("regexp"))).show() + +---------+------+-------------------------+ + | str|regexp|regexp_count(str, regexp)| + +---------+------+-------------------------+ + |1a 2b 14m| \d+| 3| + +---------+------+-------------------------+ + + >>> df.select('*', sf.regexp_count(sf.col('str'), "regexp")).show() + +---------+------+-------------------------+ + | str|regexp|regexp_count(str, regexp)| + +---------+------+-------------------------+ + |1a 2b 14m| \d+| 3| + +---------+------+-------------------------+ """ return _invoke_function_over_columns("regexp_count", str, regexp) @@ -14370,7 +15427,7 @@ def regexp_extract(str: "ColumnOrName", pattern: str, idx: int) -> Column: Parameters ---------- - str : :class:`~pyspark.sql.Column` or str + str : :class:`~pyspark.sql.Column` or column name target 
column to work on. pattern : str regex pattern to apply. @@ -14382,17 +15439,36 @@ def regexp_extract(str: "ColumnOrName", pattern: str, idx: int) -> Column: :class:`~pyspark.sql.Column` matched value specified by `idx` group id. + See Also + -------- + :meth:`pyspark.sql.functions.regexp_extract_all` + Examples -------- + >>> from pyspark.sql import functions as sf >>> df = spark.createDataFrame([('100-200',)], ['str']) - >>> df.select(regexp_extract('str', r'(\d+)-(\d+)', 1).alias('d')).collect() - [Row(d='100')] + >>> df.select('*', sf.regexp_extract('str', r'(\d+)-(\d+)', 1)).show() + +-------+-----------------------------------+ + | str|regexp_extract(str, (\d+)-(\d+), 1)| + +-------+-----------------------------------+ + |100-200| 100| + +-------+-----------------------------------+ + >>> df = spark.createDataFrame([('foo',)], ['str']) - >>> df.select(regexp_extract('str', r'(\d+)', 1).alias('d')).collect() - [Row(d='')] + >>> df.select('*', sf.regexp_extract('str', r'(\d+)', 1)).show() + +---+-----------------------------+ + |str|regexp_extract(str, (\d+), 1)| + +---+-----------------------------+ + |foo| | + +---+-----------------------------+ + >>> df = spark.createDataFrame([('aaaac',)], ['str']) - >>> df.select(regexp_extract('str', '(a+)(b)?(c)', 2).alias('d')).collect() - [Row(d='')] + >>> df.select('*', sf.regexp_extract(sf.col('str'), '(a+)(b)?(c)', 2)).show() + +-----+-----------------------------------+ + | str|regexp_extract(str, (a+)(b)?(c), 2)| + +-----+-----------------------------------+ + |aaaac| | + +-----+-----------------------------------+ """ from pyspark.sql.classic.column import _to_java_column @@ -14412,11 +15488,11 @@ def regexp_extract_all( Parameters ---------- - str : :class:`~pyspark.sql.Column` or str + str : :class:`~pyspark.sql.Column` or column name target column to work on. - regexp : :class:`~pyspark.sql.Column` or str + regexp : :class:`~pyspark.sql.Column` or column name regex pattern to apply. 
- idx : int, optional + idx : :class:`~pyspark.sql.Column` or int, optional matched group id. Returns @@ -14424,17 +15500,48 @@ def regexp_extract_all( :class:`~pyspark.sql.Column` all strings in the `str` that match a Java regex and corresponding to the regex group index. + See Also + -------- + :meth:`pyspark.sql.functions.regexp_extract` + Examples -------- + >>> from pyspark.sql import functions as sf >>> df = spark.createDataFrame([("100-200, 300-400", r"(\d+)-(\d+)")], ["str", "regexp"]) - >>> df.select(regexp_extract_all('str', lit(r'(\d+)-(\d+)')).alias('d')).collect() - [Row(d=['100', '300'])] - >>> df.select(regexp_extract_all('str', lit(r'(\d+)-(\d+)'), 1).alias('d')).collect() - [Row(d=['100', '300'])] - >>> df.select(regexp_extract_all('str', lit(r'(\d+)-(\d+)'), 2).alias('d')).collect() - [Row(d=['200', '400'])] - >>> df.select(regexp_extract_all('str', col("regexp")).alias('d')).collect() - [Row(d=['100', '300'])] + >>> df.select('*', sf.regexp_extract_all('str', sf.lit(r'(\d+)-(\d+)'))).show() + +----------------+-----------+---------------------------------------+ + | str| regexp|regexp_extract_all(str, (\d+)-(\d+), 1)| + +----------------+-----------+---------------------------------------+ + |100-200, 300-400|(\d+)-(\d+)| [100, 300]| + +----------------+-----------+---------------------------------------+ + + >>> df.select('*', sf.regexp_extract_all('str', sf.lit(r'(\d+)-(\d+)'), sf.lit(1))).show() + +----------------+-----------+---------------------------------------+ + | str| regexp|regexp_extract_all(str, (\d+)-(\d+), 1)| + +----------------+-----------+---------------------------------------+ + |100-200, 300-400|(\d+)-(\d+)| [100, 300]| + +----------------+-----------+---------------------------------------+ + + >>> df.select('*', sf.regexp_extract_all('str', sf.lit(r'(\d+)-(\d+)'), 2)).show() + +----------------+-----------+---------------------------------------+ + | str| regexp|regexp_extract_all(str, (\d+)-(\d+), 2)| + 
+----------------+-----------+---------------------------------------+ + |100-200, 300-400|(\d+)-(\d+)| [200, 400]| + +----------------+-----------+---------------------------------------+ + + >>> df.select('*', sf.regexp_extract_all('str', sf.col("regexp"))).show() + +----------------+-----------+----------------------------------+ + | str| regexp|regexp_extract_all(str, regexp, 1)| + +----------------+-----------+----------------------------------+ + |100-200, 300-400|(\d+)-(\d+)| [100, 300]| + +----------------+-----------+----------------------------------+ + + >>> df.select('*', sf.regexp_extract_all(sf.col('str'), "regexp")).show() + +----------------+-----------+----------------------------------+ + | str| regexp|regexp_extract_all(str, regexp, 1)| + +----------------+-----------+----------------------------------+ + |100-200, 300-400|(\d+)-(\d+)| [100, 300]| + +----------------+-----------+----------------------------------+ """ if idx is None: return _invoke_function_over_columns("regexp_extract_all", str, regexp) @@ -14469,43 +15576,102 @@ def regexp_replace( Examples -------- - >>> df = spark.createDataFrame([("100-200", r"(\d+)", "--")], ["str", "pattern", "replacement"]) - >>> df.select(regexp_replace('str', r'(\d+)', '--').alias('d')).collect() - [Row(d='-----')] - >>> df.select(regexp_replace("str", col("pattern"), col("replacement")).alias('d')).collect() - [Row(d='-----')] + >>> from pyspark.sql import functions as sf + >>> df = spark.createDataFrame( + ... [("100-200", r"(\d+)", "--")], + ... ["str", "pattern", "replacement"] + ... ) + + Example 1: Replaces all the substrings in the `str` column name that + match the regex pattern `(\d+)` (one or more digits) with the replacement + string "--". 
+ + >>> df.select('*', sf.regexp_replace('str', r'(\d+)', '--')).show() + +-------+-------+-----------+---------------------------------+ + | str|pattern|replacement|regexp_replace(str, (\d+), --, 1)| + +-------+-------+-----------+---------------------------------+ + |100-200| (\d+)| --| -----| + +-------+-------+-----------+---------------------------------+ + + Example 2: Replaces all the substrings in the `str` Column that match + the regex pattern in the `pattern` Column with the string in the `replacement` + column. + + >>> df.select('*', \ + ... sf.regexp_replace(sf.col("str"), sf.col("pattern"), sf.col("replacement")) \ + ... ).show() + +-------+-------+-----------+--------------------------------------------+ + | str|pattern|replacement|regexp_replace(str, pattern, replacement, 1)| + +-------+-------+-----------+--------------------------------------------+ + |100-200| (\d+)| --| -----| + +-------+-------+-----------+--------------------------------------------+ """ return _invoke_function_over_columns("regexp_replace", string, lit(pattern), lit(replacement)) @_try_remote_functions def regexp_substr(str: "ColumnOrName", regexp: "ColumnOrName") -> Column: - r"""Returns the substring that matches the Java regex `regexp` within the string `str`. + r"""Returns the first substring that matches the Java regex `regexp` within the string `str`. If the regular expression is not found, the result is null. .. versionadded:: 3.5.0 Parameters ---------- - str : :class:`~pyspark.sql.Column` or str + str : :class:`~pyspark.sql.Column` or column name target column to work on. - regexp : :class:`~pyspark.sql.Column` or str + regexp : :class:`~pyspark.sql.Column` or column name regex pattern to apply. Returns ------- :class:`~pyspark.sql.Column` - the substring that matches a Java regex within the string `str`. + the first substring that matches a Java regex within the string `str`. 
Examples -------- + >>> from pyspark.sql import functions as sf >>> df = spark.createDataFrame([("1a 2b 14m", r"\d+")], ["str", "regexp"]) - >>> df.select(regexp_substr('str', lit(r'\d+')).alias('d')).collect() - [Row(d='1')] - >>> df.select(regexp_substr('str', lit(r'mmm')).alias('d')).collect() - [Row(d=None)] - >>> df.select(regexp_substr("str", col("regexp")).alias('d')).collect() - [Row(d='1')] + + Example 1: Returns the first substring in the `str` column name that + matches the regex pattern `(\d+)` (one or more digits). + + >>> df.select('*', sf.regexp_substr('str', sf.lit(r'\d+'))).show() + +---------+------+-----------------------+ + | str|regexp|regexp_substr(str, \d+)| + +---------+------+-----------------------+ + |1a 2b 14m| \d+| 1| + +---------+------+-----------------------+ + + Example 2: Returns the first substring in the `str` column name that + matches the regex pattern `(mmm)` (three consecutive 'm' characters) + + >>> df.select('*', sf.regexp_substr('str', sf.lit(r'mmm'))).show() + +---------+------+-----------------------+ + | str|regexp|regexp_substr(str, mmm)| + +---------+------+-----------------------+ + |1a 2b 14m| \d+| NULL| + +---------+------+-----------------------+ + + Example 3: Returns the first substring in the `str` column name that + matches the regex pattern in `regexp` Column. + + >>> df.select('*', sf.regexp_substr("str", sf.col("regexp"))).show() + +---------+------+--------------------------+ + | str|regexp|regexp_substr(str, regexp)| + +---------+------+--------------------------+ + |1a 2b 14m| \d+| 1| + +---------+------+--------------------------+ + + Example 4: Returns the first substring in the `str` Column that + matches the regex pattern in `regexp` column name. 
+ + >>> df.select('*', sf.regexp_substr(sf.col("str"), "regexp")).show() + +---------+------+--------------------------+ + | str|regexp|regexp_substr(str, regexp)| + +---------+------+--------------------------+ + |1a 2b 14m| \d+| 1| + +---------+------+--------------------------+ """ return _invoke_function_over_columns("regexp_substr", str, regexp) @@ -14514,36 +15680,70 @@ def regexp_substr(str: "ColumnOrName", regexp: "ColumnOrName") -> Column: def regexp_instr( str: "ColumnOrName", regexp: "ColumnOrName", idx: Optional[Union[int, Column]] = None ) -> Column: - r"""Extract all strings in the `str` that match the Java regex `regexp` + r"""Returns the position of the first substring in the `str` that match the Java regex `regexp` and corresponding to the regex group index. .. versionadded:: 3.5.0 Parameters ---------- - str : :class:`~pyspark.sql.Column` or str + str : :class:`~pyspark.sql.Column` or column name target column to work on. - regexp : :class:`~pyspark.sql.Column` or str + regexp : :class:`~pyspark.sql.Column` or column name regex pattern to apply. - idx : int, optional + idx : :class:`~pyspark.sql.Column` or int, optional matched group id. Returns ------- :class:`~pyspark.sql.Column` - all strings in the `str` that match a Java regex and corresponding to the regex group index. + the position of the first substring in the `str` that match a Java regex and corresponding + to the regex group index. 
Examples -------- + >>> from pyspark.sql import functions as sf >>> df = spark.createDataFrame([("1a 2b 14m", r"\d+(a|b|m)")], ["str", "regexp"]) - >>> df.select(regexp_instr('str', lit(r'\d+(a|b|m)')).alias('d')).collect() - [Row(d=1)] - >>> df.select(regexp_instr('str', lit(r'\d+(a|b|m)'), 1).alias('d')).collect() - [Row(d=1)] - >>> df.select(regexp_instr('str', lit(r'\d+(a|b|m)'), 2).alias('d')).collect() - [Row(d=1)] - >>> df.select(regexp_instr('str', col("regexp")).alias('d')).collect() - [Row(d=1)] + + Example 1: Returns the position of the first substring in the `str` column name that + match the regex pattern `(\d+(a|b|m))` (one or more digits followed by 'a', 'b', or 'm'). + + >>> df.select('*', sf.regexp_instr('str', sf.lit(r'\d+(a|b|m)'))).show() + +---------+----------+--------------------------------+ + | str| regexp|regexp_instr(str, \d+(a|b|m), 0)| + +---------+----------+--------------------------------+ + |1a 2b 14m|\d+(a|b|m)| 1| + +---------+----------+--------------------------------+ + + Example 2: Returns the position of the first substring in the `str` column name that + match the regex pattern `(\d+(a|b|m))` (one or more digits followed by 'a', 'b', or 'm'), + + >>> df.select('*', sf.regexp_instr('str', sf.lit(r'\d+(a|b|m)'), sf.lit(1))).show() + +---------+----------+--------------------------------+ + | str| regexp|regexp_instr(str, \d+(a|b|m), 1)| + +---------+----------+--------------------------------+ + |1a 2b 14m|\d+(a|b|m)| 1| + +---------+----------+--------------------------------+ + + Example 3: Returns the position of the first substring in the `str` column name that + match the regex pattern in `regexp` Column. 
+ + >>> df.select('*', sf.regexp_instr('str', sf.col("regexp"))).show() + +---------+----------+----------------------------+ + | str| regexp|regexp_instr(str, regexp, 0)| + +---------+----------+----------------------------+ + |1a 2b 14m|\d+(a|b|m)| 1| + +---------+----------+----------------------------+ + + Example 4: Returns the position of the first substring in the `str` Column that + match the regex pattern in `regexp` column name. + + >>> df.select('*', sf.regexp_instr(sf.col("str"), "regexp")).show() + +---------+----------+----------------------------+ + | str| regexp|regexp_instr(str, regexp, 0)| + +---------+----------+----------------------------+ + |1a 2b 14m|\d+(a|b|m)| 1| + +---------+----------+----------------------------+ """ if idx is None: return _invoke_function_over_columns("regexp_instr", str, regexp) @@ -14562,7 +15762,7 @@ def initcap(col: "ColumnOrName") -> Column: Parameters ---------- - col : :class:`~pyspark.sql.Column` or str + col : :class:`~pyspark.sql.Column` or column name target column to work on. Returns @@ -14572,8 +15772,14 @@ def initcap(col: "ColumnOrName") -> Column: Examples -------- - >>> spark.createDataFrame([('ab cd',)], ['a']).select(initcap("a").alias('v')).collect() - [Row(v='Ab Cd')] + >>> import pyspark.sql.functions as sf + >>> df = spark.createDataFrame([('ab cd',)], ['a']) + >>> df.select("*", sf.initcap("a")).show() + +-----+----------+ + | a|initcap(a)| + +-----+----------+ + |ab cd| Ab Cd| + +-----+----------+ """ return _invoke_function_over_columns("initcap", col) @@ -14590,7 +15796,7 @@ def soundex(col: "ColumnOrName") -> Column: Parameters ---------- - col : :class:`~pyspark.sql.Column` or str + col : :class:`~pyspark.sql.Column` or column name target column to work on. 
Returns @@ -14600,9 +15806,15 @@ def soundex(col: "ColumnOrName") -> Column: Examples -------- - >>> df = spark.createDataFrame([("Peters",),("Uhrbach",)], ['name']) - >>> df.select(soundex(df.name).alias("soundex")).collect() - [Row(soundex='P362'), Row(soundex='U612')] + >>> import pyspark.sql.functions as sf + >>> df = spark.createDataFrame([("Peters",),("Uhrbach",)], ["s"]) + >>> df.select("*", sf.soundex("s")).show() + +-------+----------+ + | s|soundex(s)| + +-------+----------+ + | Peters| P362| + |Uhrbach| U612| + +-------+----------+ """ return _invoke_function_over_columns("soundex", col) @@ -14618,7 +15830,7 @@ def bin(col: "ColumnOrName") -> Column: Parameters ---------- - col : :class:`~pyspark.sql.Column` or str + col : :class:`~pyspark.sql.Column` or column name target column to work on. Returns @@ -14628,9 +15840,22 @@ def bin(col: "ColumnOrName") -> Column: Examples -------- - >>> df = spark.createDataFrame([2,5], "INT") - >>> df.select(bin(df.value).alias('c')).collect() - [Row(c='10'), Row(c='101')] + >>> import pyspark.sql.functions as sf + >>> spark.range(10).select("*", sf.bin("id")).show() + +---+-------+ + | id|bin(id)| + +---+-------+ + | 0| 0| + | 1| 1| + | 2| 10| + | 3| 11| + | 4| 100| + | 5| 101| + | 6| 110| + | 7| 111| + | 8| 1000| + | 9| 1001| + +---+-------+ """ return _invoke_function_over_columns("bin", col) @@ -14648,9 +15873,13 @@ def hex(col: "ColumnOrName") -> Column: Parameters ---------- - col : :class:`~pyspark.sql.Column` or str + col : :class:`~pyspark.sql.Column` or column name target column to work on. 
+ See Also + -------- + :meth:`pyspark.sql.functions.unhex` + Returns ------- :class:`~pyspark.sql.Column` @@ -14658,8 +15887,14 @@ def hex(col: "ColumnOrName") -> Column: Examples -------- - >>> spark.createDataFrame([('ABC', 3)], ['a', 'b']).select(hex('a'), hex('b')).collect() - [Row(hex(a)='414243', hex(b)='3')] + >>> import pyspark.sql.functions as sf + >>> df = spark.createDataFrame([('ABC', 3)], ['a', 'b']) + >>> df.select('*', sf.hex('a'), sf.hex(df.b)).show() + +---+---+------+------+ + | a| b|hex(a)|hex(b)| + +---+---+------+------+ + |ABC| 3|414243| 3| + +---+---+------+------+ """ return _invoke_function_over_columns("hex", col) @@ -14676,9 +15911,13 @@ def unhex(col: "ColumnOrName") -> Column: Parameters ---------- - col : :class:`~pyspark.sql.Column` or str + col : :class:`~pyspark.sql.Column` or column name target column to work on. + See Also + -------- + :meth:`pyspark.sql.functions.hex` + Returns ------- :class:`~pyspark.sql.Column` @@ -14686,8 +15925,14 @@ def unhex(col: "ColumnOrName") -> Column: Examples -------- - >>> spark.createDataFrame([('414243',)], ['a']).select(unhex('a')).collect() - [Row(unhex(a)=bytearray(b'ABC'))] + >>> import pyspark.sql.functions as sf + >>> df = spark.createDataFrame([('414243',)], ['a']) + >>> df.select('*', sf.unhex('a')).show() + +------+----------+ + | a| unhex(a)| + +------+----------+ + |414243|[41 42 43]| + +------+----------+ """ return _invoke_function_over_columns("unhex", col) @@ -14722,14 +15967,22 @@ def uniform( Examples -------- - >>> spark.createDataFrame([('3',)], ['a']) \\ - ... .select(uniform(lit(0), lit(10), lit(0)).alias('result')) \\ - ... 
.selectExpr("result < 15").show() - +-------------+ - |(result < 15)| - +-------------+ - | true| - +-------------+ + >>> import pyspark.sql.functions as sf + >>> spark.range(0, 10, 1, 1).select(sf.uniform(5, 105, 3)).show() + +------------------+ + |uniform(5, 105, 3)| + +------------------+ + | 30| + | 71| + | 99| + | 77| + | 16| + | 25| + | 89| + | 80| + | 51| + | 83| + +------------------+ """ min = _enum_to_value(min) min = lit(min) @@ -15092,18 +16345,35 @@ def split_part(src: "ColumnOrName", delimiter: "ColumnOrName", partNum: "ColumnO Parameters ---------- - src : :class:`~pyspark.sql.Column` or str - A column of string to be splited. - delimiter : :class:`~pyspark.sql.Column` or str + src : :class:`~pyspark.sql.Column` or column name + A column of string to be split. + delimiter : :class:`~pyspark.sql.Column` or column name A column of string, the delimiter used for split. - partNum : :class:`~pyspark.sql.Column` or str + partNum : :class:`~pyspark.sql.Column` or column name A column of string, requested part of the split (1-based). 
+ See Also + -------- + :meth:`pyspark.sql.functions.sentences` + :meth:`pyspark.sql.functions.split` + Examples -------- + >>> from pyspark.sql import functions as sf >>> df = spark.createDataFrame([("11.12.13", ".", 3,)], ["a", "b", "c"]) - >>> df.select(split_part(df.a, df.b, df.c).alias('r')).collect() - [Row(r='13')] + >>> df.select("*", sf.split_part("a", "b", "c")).show() + +--------+---+---+-------------------+ + | a| b| c|split_part(a, b, c)| + +--------+---+---+-------------------+ + |11.12.13| .| 3| 13| + +--------+---+---+-------------------+ + + >>> df.select("*", sf.split_part(df.a, df.b, sf.lit(-2))).show() + +--------+---+---+--------------------+ + | a| b| c|split_part(a, b, -2)| + +--------+---+---+--------------------+ + |11.12.13| .| 3| 12| + +--------+---+---+--------------------+ """ return _invoke_function_over_columns("split_part", src, delimiter, partNum) @@ -15120,34 +16390,42 @@ def substr( Parameters ---------- - str : :class:`~pyspark.sql.Column` or str + str : :class:`~pyspark.sql.Column` or column name A column of string. - pos : :class:`~pyspark.sql.Column` or str + pos : :class:`~pyspark.sql.Column` or column name A column of string, the substring of `str` that starts at `pos`. - len : :class:`~pyspark.sql.Column` or str, optional + len : :class:`~pyspark.sql.Column` or column name, optional A column of string, the substring of `str` is of length `len`. + Returns + ------- + :class:`~pyspark.sql.Column` + substring of given value. + + See Also + -------- + :meth:`pyspark.sql.functions.instr` + :meth:`pyspark.sql.functions.substring` + :meth:`pyspark.sql.functions.substring_index` + :meth:`pyspark.sql.Column.substr` + Examples -------- - >>> import pyspark.sql.functions as sf - >>> spark.createDataFrame( - ... [("Spark SQL", 5, 1,)], ["a", "b", "c"] - ... 
).select(sf.substr("a", "b", "c")).show() - +---------------+ - |substr(a, b, c)| - +---------------+ - | k| - +---------------+ + >>> from pyspark.sql import functions as sf + >>> df = spark.createDataFrame([("Spark SQL", 5, 1,)], ["a", "b", "c"]) + >>> df.select("*", sf.substr("a", "b", "c")).show() + +---------+---+---+---------------+ + | a| b| c|substr(a, b, c)| + +---------+---+---+---------------+ + |Spark SQL| 5| 1| k| + +---------+---+---+---------------+ - >>> import pyspark.sql.functions as sf - >>> spark.createDataFrame( - ... [("Spark SQL", 5, 1,)], ["a", "b", "c"] - ... ).select(sf.substr("a", "b")).show() - +------------------------+ - |substr(a, b, 2147483647)| - +------------------------+ - | k SQL| - +------------------------+ + >>> df.select("*", sf.substr(df.a, df.b)).show() + +---------+---+---+------------------------+ + | a| b| c|substr(a, b, 2147483647)| + +---------+---+---+------------------------+ + |Spark SQL| 5| 1| k SQL| + +---------+---+---+------------------------+ """ if len is not None: return _invoke_function_over_columns("substr", str, pos, len) @@ -16326,12 +17604,12 @@ def collation(col: "ColumnOrName") -> Column: Examples -------- >>> df = spark.createDataFrame([('name',)], ['dt']) - >>> df.select(collation('dt').alias('collation')).show() - +-----------+ - | collation| - +-----------+ - |UTF8_BINARY| - +-----------+ + >>> df.select(collation('dt').alias('collation')).show(truncate=False) + +--------------------------+ + |collation | + +--------------------------+ + |SYSTEM.BUILTIN.UTF8_BINARY| + +--------------------------+ """ return _invoke_function_over_columns("collation", col) @@ -16974,6 +18252,7 @@ def concat(*cols: "ColumnOrName") -> Column: See Also -------- + :meth:`pyspark.sql.functions.concat_ws` :meth:`pyspark.sql.functions.array_join` : to concatenate string columns with delimiter Examples @@ -18280,7 +19559,7 @@ def explode(col: "ColumnOrName") -> Column: Parameters ---------- - col : 
:class:`~pyspark.sql.Column` or str + col : :class:`~pyspark.sql.Column` or column name Target column to work on. Returns @@ -18293,6 +19572,8 @@ def explode(col: "ColumnOrName") -> Column: :meth:`pyspark.sql.functions.posexplode` :meth:`pyspark.sql.functions.explode_outer` :meth:`pyspark.sql.functions.posexplode_outer` + :meth:`pyspark.sql.functions.inline` + :meth:`pyspark.sql.functions.inline_outer` Notes ----- @@ -18302,119 +19583,79 @@ def explode(col: "ColumnOrName") -> Column: -------- Example 1: Exploding an array column - >>> import pyspark.sql.functions as sf - >>> from pyspark.sql import Row - >>> df = spark.createDataFrame([Row(id=1, values=[1, 2, 3])]) - >>> df.select(sf.explode(df.values).alias("value")).show() - +-----+ - |value| - +-----+ - | 1| - | 2| - | 3| - +-----+ + >>> from pyspark.sql import functions as sf + >>> df = spark.sql('SELECT * FROM VALUES (1,ARRAY(1,2,3,NULL)), (2,ARRAY()), (3,NULL) AS t(i,a)') + >>> df.show() + +---+---------------+ + | i| a| + +---+---------------+ + | 1|[1, 2, 3, NULL]| + | 2| []| + | 3| NULL| + +---+---------------+ + + >>> df.select('*', sf.explode('a')).show() + +---+---------------+----+ + | i| a| col| + +---+---------------+----+ + | 1|[1, 2, 3, NULL]| 1| + | 1|[1, 2, 3, NULL]| 2| + | 1|[1, 2, 3, NULL]| 3| + | 1|[1, 2, 3, NULL]|NULL| + +---+---------------+----+ Example 2: Exploding a map column - >>> import pyspark.sql.functions as sf - >>> from pyspark.sql import Row - >>> df = spark.createDataFrame([Row(id=1, values={"a": "b", "c": "d"})]) - >>> df.select(sf.explode(df.values).alias("key", "value")).show() - +---+-----+ - |key|value| - +---+-----+ - | a| b| - | c| d| - +---+-----+ - - Example 3: Exploding an array column with multiple rows - - >>> import pyspark.sql.functions as sf - >>> from pyspark.sql import Row - >>> df = spark.createDataFrame( - ... 
[Row(id=1, values=[1, 2]), Row(id=2, values=[3, 4])]) - >>> df.select("id", sf.explode(df.values).alias("value")).show() - +---+-----+ - | id|value| - +---+-----+ - | 1| 1| - | 1| 2| - | 2| 3| - | 2| 4| - +---+-----+ - - Example 4: Exploding a map column with multiple rows - - >>> import pyspark.sql.functions as sf - >>> from pyspark.sql import Row - >>> df = spark.createDataFrame([ - ... Row(id=1, values={"a": "b", "c": "d"}), - ... Row(id=2, values={"e": "f", "g": "h"}) - ... ]) - >>> df.select("id", sf.explode(df.values).alias("key", "value")).show() - +---+---+-----+ - | id|key|value| - +---+---+-----+ - | 1| a| b| - | 1| c| d| - | 2| e| f| - | 2| g| h| - +---+---+-----+ - - Example 5: Exploding multiple array columns + >>> from pyspark.sql import functions as sf + >>> df = spark.sql('SELECT * FROM VALUES (1,MAP(1,2,3,4,5,NULL)), (2,MAP()), (3,NULL) AS t(i,m)') + >>> df.show(truncate=False) + +---+---------------------------+ + |i |m | + +---+---------------------------+ + |1 |{1 -> 2, 3 -> 4, 5 -> NULL}| + |2 |{} | + |3 |NULL | + +---+---------------------------+ - >>> import pyspark.sql.functions as sf - >>> from pyspark.sql import Row - >>> df = spark.createDataFrame([Row(a=1, list1=[1, 2], list2=[3, 4])]) - >>> df.select(sf.explode(df.list1).alias("list1"), "list2") \\ - ... 
.select("list1", sf.explode(df.list2).alias("list2")).show() - +-----+-----+ - |list1|list2| - +-----+-----+ - | 1| 3| - | 1| 4| - | 2| 3| - | 2| 4| - +-----+-----+ + >>> df.select('*', sf.explode('m')).show(truncate=False) + +---+---------------------------+---+-----+ + |i |m |key|value| + +---+---------------------------+---+-----+ + |1 |{1 -> 2, 3 -> 4, 5 -> NULL}|1 |2 | + |1 |{1 -> 2, 3 -> 4, 5 -> NULL}|3 |4 | + |1 |{1 -> 2, 3 -> 4, 5 -> NULL}|5 |NULL | + +---+---------------------------+---+-----+ - Example 6: Exploding an array of struct column + Example 3: Exploding multiple array columns >>> import pyspark.sql.functions as sf - >>> from pyspark.sql import Row - >>> df = spark.createDataFrame( - ... [(1, [(1, 2), (3, 4)])], - ... "id: int, structlist: array>") - >>> df = df.select(sf.explode(df.structlist).alias("struct")) - >>> df.select("struct.*").show() + >>> df = spark.sql('SELECT ARRAY(1,2) AS a1, ARRAY(3,4,5) AS a2') + >>> df.select( + ... '*', sf.explode('a1').alias('v1') + ... 
).select('*', sf.explode('a2').alias('v2')).show() + +------+---------+---+---+ + | a1| a2| v1| v2| + +------+---------+---+---+ + |[1, 2]|[3, 4, 5]| 1| 3| + |[1, 2]|[3, 4, 5]| 1| 4| + |[1, 2]|[3, 4, 5]| 1| 5| + |[1, 2]|[3, 4, 5]| 2| 3| + |[1, 2]|[3, 4, 5]| 2| 4| + |[1, 2]|[3, 4, 5]| 2| 5| + +------+---------+---+---+ + + Example 4: Exploding an array of struct column + + >>> import pyspark.sql.functions as sf + >>> df = spark.sql('SELECT ARRAY(NAMED_STRUCT("a",1,"b",2), NAMED_STRUCT("a",3,"b",4)) AS a') + >>> df.select(sf.explode('a').alias("s")).select("s.*").show() +---+---+ | a| b| +---+---+ | 1| 2| | 3| 4| +---+---+ - - Example 7: Exploding an empty array column - - >>> import pyspark.sql.functions as sf - >>> from pyspark.sql import Row - >>> df = spark.createDataFrame([(1, [])], "id: int, values: array") - >>> df.select(sf.explode(df.values).alias("value")).show() - +-----+ - |value| - +-----+ - +-----+ - - Example 8: Exploding an empty map column - - >>> import pyspark.sql.functions as sf - >>> from pyspark.sql import Row - >>> df = spark.createDataFrame([(1, {})], "id: int, values: map") - >>> df.select(sf.explode(df.values).alias("key", "value")).show() - +---+-----+ - |key|value| - +---+-----+ - +---+-----+ - """ + """ # noqa: E501 return _invoke_function_over_columns("explode", col) @@ -18432,7 +19673,7 @@ def posexplode(col: "ColumnOrName") -> Column: Parameters ---------- - col : :class:`~pyspark.sql.Column` or str + col : :class:`~pyspark.sql.Column` or column name target column to work on. Returns @@ -18440,20 +19681,61 @@ def posexplode(col: "ColumnOrName") -> Column: :class:`~pyspark.sql.Column` one row per array item or map key value including positions as a separate column. 
+ See Also + -------- + :meth:`pyspark.sql.functions.explode` + :meth:`pyspark.sql.functions.explode_outer` + :meth:`pyspark.sql.functions.posexplode_outer` + :meth:`pyspark.sql.functions.inline` + :meth:`pyspark.sql.functions.inline_outer` + Examples -------- - >>> from pyspark.sql import Row - >>> df = spark.createDataFrame([Row(a=1, intlist=[1,2,3], mapfield={"a": "b"})]) - >>> df.select(posexplode(df.intlist)).collect() - [Row(pos=0, col=1), Row(pos=1, col=2), Row(pos=2, col=3)] + Example 1: Exploding an array column - >>> df.select(posexplode(df.mapfield)).show() - +---+---+-----+ - |pos|key|value| - +---+---+-----+ - | 0| a| b| - +---+---+-----+ - """ + >>> from pyspark.sql import functions as sf + >>> df = spark.sql('SELECT * FROM VALUES (1,ARRAY(1,2,3,NULL)), (2,ARRAY()), (3,NULL) AS t(i,a)') + >>> df.show() + +---+---------------+ + | i| a| + +---+---------------+ + | 1|[1, 2, 3, NULL]| + | 2| []| + | 3| NULL| + +---+---------------+ + + >>> df.select('*', sf.posexplode('a')).show() + +---+---------------+---+----+ + | i| a|pos| col| + +---+---------------+---+----+ + | 1|[1, 2, 3, NULL]| 0| 1| + | 1|[1, 2, 3, NULL]| 1| 2| + | 1|[1, 2, 3, NULL]| 2| 3| + | 1|[1, 2, 3, NULL]| 3|NULL| + +---+---------------+---+----+ + + Example 2: Exploding a map column + + >>> from pyspark.sql import functions as sf + >>> df = spark.sql('SELECT * FROM VALUES (1,MAP(1,2,3,4,5,NULL)), (2,MAP()), (3,NULL) AS t(i,m)') + >>> df.show(truncate=False) + +---+---------------------------+ + |i |m | + +---+---------------------------+ + |1 |{1 -> 2, 3 -> 4, 5 -> NULL}| + |2 |{} | + |3 |NULL | + +---+---------------------------+ + + >>> df.select('*', sf.posexplode('m')).show(truncate=False) + +---+---------------------------+---+---+-----+ + |i |m |pos|key|value| + +---+---------------------------+---+---+-----+ + |1 |{1 -> 2, 3 -> 4, 5 -> NULL}|0 |1 |2 | + |1 |{1 -> 2, 3 -> 4, 5 -> NULL}|1 |3 |4 | + |1 |{1 -> 2, 3 -> 4, 5 -> NULL}|2 |5 |NULL | + 
+---+---------------------------+---+---+-----+ + """ # noqa: E501 return _invoke_function_over_columns("posexplode", col) @@ -18469,7 +19751,7 @@ def inline(col: "ColumnOrName") -> Column: Parameters ---------- - col : :class:`~pyspark.sql.Column` or str + col : :class:`~pyspark.sql.Column` or column name Input column of values to explode. Returns @@ -18480,6 +19762,9 @@ def inline(col: "ColumnOrName") -> Column: See Also -------- :meth:`pyspark.sql.functions.explode` + :meth:`pyspark.sql.functions.explode_outer` + :meth:`pyspark.sql.functions.posexplode` + :meth:`pyspark.sql.functions.posexplode_outer` :meth:`pyspark.sql.functions.inline_outer` Examples @@ -18487,102 +19772,89 @@ def inline(col: "ColumnOrName") -> Column: Example 1: Using inline with a single struct array column >>> import pyspark.sql.functions as sf - >>> from pyspark.sql import Row - >>> df = spark.createDataFrame([Row(structlist=[Row(a=1, b=2), Row(a=3, b=4)])]) - >>> df.select(sf.inline(df.structlist)).show() - +---+---+ - | a| b| - +---+---+ - | 1| 2| - | 3| 4| - +---+---+ + >>> df = spark.sql('SELECT ARRAY(NAMED_STRUCT("a",1,"b",2), NAMED_STRUCT("a",3,"b",4)) AS a') + >>> df.select('*', sf.inline(df.a)).show() + +----------------+---+---+ + | a| a| b| + +----------------+---+---+ + |[{1, 2}, {3, 4}]| 1| 2| + |[{1, 2}, {3, 4}]| 3| 4| + +----------------+---+---+ Example 2: Using inline with a column name >>> import pyspark.sql.functions as sf - >>> from pyspark.sql import Row - >>> df = spark.createDataFrame([Row(structlist=[Row(a=1, b=2), Row(a=3, b=4)])]) - >>> df.select(sf.inline("structlist")).show() - +---+---+ - | a| b| - +---+---+ - | 1| 2| - | 3| 4| - +---+---+ + >>> df = spark.sql('SELECT ARRAY(NAMED_STRUCT("a",1,"b",2), NAMED_STRUCT("a",3,"b",4)) AS a') + >>> df.select('*', sf.inline('a')).show() + +----------------+---+---+ + | a| a| b| + +----------------+---+---+ + |[{1, 2}, {3, 4}]| 1| 2| + |[{1, 2}, {3, 4}]| 3| 4| + +----------------+---+---+ Example 3: Using inline with an 
alias >>> import pyspark.sql.functions as sf - >>> from pyspark.sql import Row - >>> df = spark.createDataFrame([Row(structlist=[Row(a=1, b=2), Row(a=3, b=4)])]) - >>> df.select(sf.inline("structlist").alias("c1", "c2")).show() - +---+---+ - | c1| c2| - +---+---+ - | 1| 2| - | 3| 4| - +---+---+ + >>> df = spark.sql('SELECT ARRAY(NAMED_STRUCT("a",1,"b",2), NAMED_STRUCT("a",3,"b",4)) AS a') + >>> df.select('*', sf.inline('a').alias("c1", "c2")).show() + +----------------+---+---+ + | a| c1| c2| + +----------------+---+---+ + |[{1, 2}, {3, 4}]| 1| 2| + |[{1, 2}, {3, 4}]| 3| 4| + +----------------+---+---+ Example 4: Using inline with multiple struct array columns >>> import pyspark.sql.functions as sf - >>> from pyspark.sql import Row - >>> df = spark.createDataFrame([ - ... Row(structlist1=[Row(a=1, b=2), Row(a=3, b=4)], - ... structlist2=[Row(c=5, d=6), Row(c=7, d=8)]) - ... ]) - >>> df.select(sf.inline("structlist1"), "structlist2") \\ - ... .select("a", "b", sf.inline("structlist2")).show() - +---+---+---+---+ - | a| b| c| d| - +---+---+---+---+ - | 1| 2| 5| 6| - | 1| 2| 7| 8| - | 3| 4| 5| 6| - | 3| 4| 7| 8| - +---+---+---+---+ + >>> df = spark.sql('SELECT ARRAY(NAMED_STRUCT("a",1,"b",2), NAMED_STRUCT("a",3,"b",4)) AS a1, ARRAY(NAMED_STRUCT("c",5,"d",6), NAMED_STRUCT("c",7,"d",8)) AS a2') + >>> df.select( + ... '*', sf.inline('a1') + ... ).select('*', sf.inline('a2')).show() + +----------------+----------------+---+---+---+---+ + | a1| a2| a| b| c| d| + +----------------+----------------+---+---+---+---+ + |[{1, 2}, {3, 4}]|[{5, 6}, {7, 8}]| 1| 2| 5| 6| + |[{1, 2}, {3, 4}]|[{5, 6}, {7, 8}]| 1| 2| 7| 8| + |[{1, 2}, {3, 4}]|[{5, 6}, {7, 8}]| 3| 4| 5| 6| + |[{1, 2}, {3, 4}]|[{5, 6}, {7, 8}]| 3| 4| 7| 8| + +----------------+----------------+---+---+---+---+ Example 5: Using inline with a nested struct array column >>> import pyspark.sql.functions as sf - >>> from pyspark.sql import Row - >>> df = spark.createDataFrame([ - ... 
Row(structlist=Row(a=1, b=2, nested=[Row(c=3, d=4), Row(c=5, d=6)])) - ... ]) - >>> df.select(sf.inline("structlist.nested")).show() - +---+---+ - | c| d| - +---+---+ - | 3| 4| - | 5| 6| - +---+---+ - - Example 6: Using inline with an empty struct array column - - >>> import pyspark.sql.functions as sf - >>> from pyspark.sql import Row - >>> df = spark.createDataFrame( - ... [Row(structlist=[])], "structlist: array>") - >>> df.select(sf.inline(df.structlist)).show() - +---+---+ - | a| b| - +---+---+ - +---+---+ - - Example 7: Using inline with a struct array column containing null values - - >>> import pyspark.sql.functions as sf - >>> from pyspark.sql import Row - >>> df = spark.createDataFrame([Row(structlist=[Row(a=1, b=2), None, Row(a=3, b=4)])]) - >>> df.select(sf.inline(df.structlist)).show() - +----+----+ - | a| b| - +----+----+ - | 1| 2| - |NULL|NULL| - | 3| 4| - +----+----+ - """ + >>> df = spark.sql('SELECT NAMED_STRUCT("a",1,"b",2,"c",ARRAY(NAMED_STRUCT("c",3,"d",4), NAMED_STRUCT("c",5,"d",6))) AS s') + >>> df.select('*', sf.inline('s.c')).show(truncate=False) + +------------------------+---+---+ + |s |c |d | + +------------------------+---+---+ + |{1, 2, [{3, 4}, {5, 6}]}|3 |4 | + |{1, 2, [{3, 4}, {5, 6}]}|5 |6 | + +------------------------+---+---+ + + Example 6: Using inline with a column containing: array containing null, empty array and null + + >>> from pyspark.sql import functions as sf + >>> df = spark.sql('SELECT * FROM VALUES (1,ARRAY(NAMED_STRUCT("a",1,"b",2), NULL, NAMED_STRUCT("a",3,"b",4))), (2,ARRAY()), (3,NULL) AS t(i,s)') + >>> df.show(truncate=False) + +---+----------------------+ + |i |s | + +---+----------------------+ + |1 |[{1, 2}, NULL, {3, 4}]| + |2 |[] | + |3 |NULL | + +---+----------------------+ + + >>> df.select('*', sf.inline('s')).show(truncate=False) + +---+----------------------+----+----+ + |i |s |a |b | + +---+----------------------+----+----+ + |1 |[{1, 2}, NULL, {3, 4}]|1 |2 | + |1 |[{1, 2}, NULL, {3, 4}]|NULL|NULL| + 
|1 |[{1, 2}, NULL, {3, 4}]|3 |4 | + +---+----------------------+----+----+ + """ # noqa: E501 return _invoke_function_over_columns("inline", col) @@ -18601,7 +19873,7 @@ def explode_outer(col: "ColumnOrName") -> Column: Parameters ---------- - col : :class:`~pyspark.sql.Column` or str + col : :class:`~pyspark.sql.Column` or column name target column to work on. Returns @@ -18609,31 +19881,47 @@ def explode_outer(col: "ColumnOrName") -> Column: :class:`~pyspark.sql.Column` one row per array item or map key value. + See Also + -------- + :meth:`pyspark.sql.functions.explode` + :meth:`pyspark.sql.functions.posexplode` + :meth:`pyspark.sql.functions.posexplode_outer` + :meth:`pyspark.sql.functions.inline` + :meth:`pyspark.sql.functions.inline_outer` + Examples -------- - >>> df = spark.createDataFrame( - ... [(1, ["foo", "bar"], {"x": 1.0}), (2, [], {}), (3, None, None)], - ... ("id", "an_array", "a_map") - ... ) - >>> df.select("id", "an_array", explode_outer("a_map")).show() - +---+----------+----+-----+ - | id| an_array| key|value| - +---+----------+----+-----+ - | 1|[foo, bar]| x| 1.0| - | 2| []|NULL| NULL| - | 3| NULL|NULL| NULL| - +---+----------+----+-----+ - - >>> df.select("id", "a_map", explode_outer("an_array")).show() - +---+----------+----+ - | id| a_map| col| - +---+----------+----+ - | 1|{x -> 1.0}| foo| - | 1|{x -> 1.0}| bar| - | 2| {}|NULL| - | 3| NULL|NULL| - +---+----------+----+ - """ + Example 1: Using an array column + + >>> from pyspark.sql import functions as sf + >>> df = spark.sql('SELECT * FROM VALUES (1,ARRAY(1,2,3,NULL)), (2,ARRAY()), (3,NULL) AS t(i,a)') + >>> df.select('*', sf.explode_outer('a')).show() + +---+---------------+----+ + | i| a| col| + +---+---------------+----+ + | 1|[1, 2, 3, NULL]| 1| + | 1|[1, 2, 3, NULL]| 2| + | 1|[1, 2, 3, NULL]| 3| + | 1|[1, 2, 3, NULL]|NULL| + | 2| []|NULL| + | 3| NULL|NULL| + +---+---------------+----+ + + Example 2: Using a map column + + >>> from pyspark.sql import functions as sf + >>> df = 
spark.sql('SELECT * FROM VALUES (1,MAP(1,2,3,4,5,NULL)), (2,MAP()), (3,NULL) AS t(i,m)') + >>> df.select('*', sf.explode_outer('m')).show(truncate=False) + +---+---------------------------+----+-----+ + |i |m |key |value| + +---+---------------------------+----+-----+ + |1 |{1 -> 2, 3 -> 4, 5 -> NULL}|1 |2 | + |1 |{1 -> 2, 3 -> 4, 5 -> NULL}|3 |4 | + |1 |{1 -> 2, 3 -> 4, 5 -> NULL}|5 |NULL | + |2 |{} |NULL|NULL | + |3 |NULL |NULL|NULL | + +---+---------------------------+----+-----+ + """ # noqa: E501 return _invoke_function_over_columns("explode_outer", col) @@ -18652,7 +19940,7 @@ def posexplode_outer(col: "ColumnOrName") -> Column: Parameters ---------- - col : :class:`~pyspark.sql.Column` or str + col : :class:`~pyspark.sql.Column` or column name target column to work on. Returns @@ -18660,30 +19948,47 @@ def posexplode_outer(col: "ColumnOrName") -> Column: :class:`~pyspark.sql.Column` one row per array item or map key value including positions as a separate column. + See Also + -------- + :meth:`pyspark.sql.functions.explode` + :meth:`pyspark.sql.functions.explode_outer` + :meth:`pyspark.sql.functions.posexplode` + :meth:`pyspark.sql.functions.inline` + :meth:`pyspark.sql.functions.inline_outer` + Examples -------- - >>> df = spark.createDataFrame( - ... [(1, ["foo", "bar"], {"x": 1.0}), (2, [], {}), (3, None, None)], - ... ("id", "an_array", "a_map") - ... 
) - >>> df.select("id", "an_array", posexplode_outer("a_map")).show() - +---+----------+----+----+-----+ - | id| an_array| pos| key|value| - +---+----------+----+----+-----+ - | 1|[foo, bar]| 0| x| 1.0| - | 2| []|NULL|NULL| NULL| - | 3| NULL|NULL|NULL| NULL| - +---+----------+----+----+-----+ - >>> df.select("id", "a_map", posexplode_outer("an_array")).show() - +---+----------+----+----+ - | id| a_map| pos| col| - +---+----------+----+----+ - | 1|{x -> 1.0}| 0| foo| - | 1|{x -> 1.0}| 1| bar| - | 2| {}|NULL|NULL| - | 3| NULL|NULL|NULL| - +---+----------+----+----+ - """ + Example 1: Using an array column + + >>> from pyspark.sql import functions as sf + >>> df = spark.sql('SELECT * FROM VALUES (1,ARRAY(1,2,3,NULL)), (2,ARRAY()), (3,NULL) AS t(i,a)') + >>> df.select('*', sf.posexplode_outer('a')).show() + +---+---------------+----+----+ + | i| a| pos| col| + +---+---------------+----+----+ + | 1|[1, 2, 3, NULL]| 0| 1| + | 1|[1, 2, 3, NULL]| 1| 2| + | 1|[1, 2, 3, NULL]| 2| 3| + | 1|[1, 2, 3, NULL]| 3|NULL| + | 2| []|NULL|NULL| + | 3| NULL|NULL|NULL| + +---+---------------+----+----+ + + Example 2: Using a map column + + >>> from pyspark.sql import functions as sf + >>> df = spark.sql('SELECT * FROM VALUES (1,MAP(1,2,3,4,5,NULL)), (2,MAP()), (3,NULL) AS t(i,m)') + >>> df.select('*', sf.posexplode_outer('m')).show(truncate=False) + +---+---------------------------+----+----+-----+ + |i |m |pos |key |value| + +---+---------------------------+----+----+-----+ + |1 |{1 -> 2, 3 -> 4, 5 -> NULL}|0 |1 |2 | + |1 |{1 -> 2, 3 -> 4, 5 -> NULL}|1 |3 |4 | + |1 |{1 -> 2, 3 -> 4, 5 -> NULL}|2 |5 |NULL | + |2 |{} |NULL|NULL|NULL | + |3 |NULL |NULL|NULL|NULL | + +---+---------------------------+----+----+-----+ + """ # noqa: E501 return _invoke_function_over_columns("posexplode_outer", col) @@ -18697,7 +20002,7 @@ def inline_outer(col: "ColumnOrName") -> Column: Parameters ---------- - col : :class:`~pyspark.sql.Column` or str + col : :class:`~pyspark.sql.Column` or column name input 
column of values to explode. Returns @@ -18707,7 +20012,10 @@ def inline_outer(col: "ColumnOrName") -> Column: See Also -------- + :meth:`pyspark.sql.functions.explode` :meth:`pyspark.sql.functions.explode_outer` + :meth:`pyspark.sql.functions.posexplode` + :meth:`pyspark.sql.functions.posexplode_outer` :meth:`pyspark.sql.functions.inline` Notes @@ -18716,20 +20024,27 @@ def inline_outer(col: "ColumnOrName") -> Column: Examples -------- - >>> from pyspark.sql import Row - >>> df = spark.createDataFrame([ - ... Row(id=1, structlist=[Row(a=1, b=2), Row(a=3, b=4)]), - ... Row(id=2, structlist=[]) - ... ]) - >>> df.select('id', inline_outer(df.structlist)).show() - +---+----+----+ - | id| a| b| - +---+----+----+ - | 1| 1| 2| - | 1| 3| 4| - | 2|NULL|NULL| - +---+----+----+ - """ + >>> from pyspark.sql import functions as sf + >>> df = spark.sql('SELECT * FROM VALUES (1,ARRAY(NAMED_STRUCT("a",1,"b",2), NULL, NAMED_STRUCT("a",3,"b",4))), (2,ARRAY()), (3,NULL) AS t(i,s)') + >>> df.printSchema() + root + |-- i: integer (nullable = false) + |-- s: array (nullable = true) + | |-- element: struct (containsNull = true) + | | |-- a: integer (nullable = false) + | | |-- b: integer (nullable = false) + + >>> df.select('*', sf.inline_outer('s')).show(truncate=False) + +---+----------------------+----+----+ + |i |s |a |b | + +---+----------------------+----+----+ + |1 |[{1, 2}, NULL, {3, 4}]|1 |2 | + |1 |[{1, 2}, NULL, {3, 4}]|NULL|NULL| + |1 |[{1, 2}, NULL, {3, 4}]|3 |4 | + |2 |[] |NULL|NULL| + |3 |NULL |NULL|NULL| + +---+----------------------+----+----+ + """ # noqa: E501 return _invoke_function_over_columns("inline_outer", col) @@ -18817,7 +20132,7 @@ def from_json( """ Parses a column containing a JSON string into a :class:`MapType` with :class:`StringType` as keys type, :class:`StructType` or :class:`ArrayType` with - the specified schema. Returns `null`, in the case of an unparseable string. + the specified schema. Returns `null`, in the case of an unparsable string. .. 
versionadded:: 2.1.0 @@ -19429,7 +20744,7 @@ def from_xml( ) -> Column: """ Parses a column containing a XML string to a row with - the specified schema. Returns `null`, in the case of an unparseable string. + the specified schema. Returns `null`, in the case of an unparsable string. .. versionadded:: 4.0.0 @@ -21823,7 +23138,7 @@ def transform_keys(col: "ColumnOrName", f: Callable[[Column, Column], Column]) - Returns ------- :class:`~pyspark.sql.Column` - a new map of enties where new keys were calculated by applying given function to + a new map of entries where new keys were calculated by applying given function to each key value argument. Examples @@ -21863,7 +23178,7 @@ def transform_values(col: "ColumnOrName", f: Callable[[Column, Column], Column]) Returns ------- :class:`~pyspark.sql.Column` - a new map of enties where new values were calculated by applying given function to + a new map of entries where new values were calculated by applying given function to each key value argument. Examples @@ -22314,7 +23629,7 @@ def convert_timezone( the current session time zone is used as the source time zone. targetTz : :class:`~pyspark.sql.Column` The time zone to which the input timestamp should be converted. - sourceTs : :class:`~pyspark.sql.Column` + sourceTs : :class:`~pyspark.sql.Column` or column name A timestamp without time zone. Returns @@ -22322,35 +23637,43 @@ def convert_timezone( :class:`~pyspark.sql.Column` A new column that contains a timestamp for converted time zone. + See Also + -------- + :meth:`pyspark.sql.functions.current_timezone` + Examples -------- + >>> spark.conf.set("spark.sql.session.timeZone", "America/Los_Angeles") - Example 1: Converts the timestamp without time zone `sourceTs`, - the source time zone `sourceTz` is None. + Example 1: Converts the timestamp without time zone `sourceTs`. 
>>> import pyspark.sql.functions as sf - >>> df = spark.createDataFrame([('2015-04-08',)], ['dt']) - >>> df.select(sf.convert_timezone( # doctest: +SKIP - ... None, sf.lit('Asia/Hong_Kong'), 'dt') - ... ).show() - +--------------------------------------------------------+ - |convert_timezone(current_timezone(), Asia/Hong_Kong, dt)| - +--------------------------------------------------------+ - | 2015-04-08 00:00:00| - +--------------------------------------------------------+ + >>> df = spark.createDataFrame([('2015-04-08 00:00:00',)], ['ts']) + >>> df.select( + ... '*', + ... sf.convert_timezone(None, sf.lit('Asia/Hong_Kong'), 'ts') + ... ).show() # doctest: +SKIP + +-------------------+--------------------------------------------------------+ + | ts|convert_timezone(current_timezone(), Asia/Hong_Kong, ts)| + +-------------------+--------------------------------------------------------+ + |2015-04-08 00:00:00| 2015-04-08 15:00:00| + +-------------------+--------------------------------------------------------+ - Example 2: Converts the timestamp without time zone `sourceTs`. + Example 2: Converts the timestamp with time zone `sourceTs`. >>> import pyspark.sql.functions as sf - >>> df = spark.createDataFrame([('2015-04-08',)], ['dt']) - >>> df.select(sf.convert_timezone( - ... sf.lit('America/Los_Angeles'), sf.lit('Asia/Hong_Kong'), 'dt') + >>> df = spark.createDataFrame([('2015-04-08 15:00:00',)], ['ts']) + >>> df.select( + ... '*', + ... sf.convert_timezone(sf.lit('Asia/Hong_Kong'), sf.lit('America/Los_Angeles'), df.ts) ... 
).show() - +---------------------------------------------------------+ - |convert_timezone(America/Los_Angeles, Asia/Hong_Kong, dt)| - +---------------------------------------------------------+ - | 2015-04-08 15:00:00| - +---------------------------------------------------------+ + +-------------------+---------------------------------------------------------+ + | ts|convert_timezone(Asia/Hong_Kong, America/Los_Angeles, ts)| + +-------------------+---------------------------------------------------------+ + |2015-04-08 15:00:00| 2015-04-08 00:00:00| + +-------------------+---------------------------------------------------------+ + + >>> spark.conf.unset("spark.sql.session.timeZone") """ if sourceTz is None: return _invoke_function_over_columns("convert_timezone", targetTz, sourceTs) @@ -22372,13 +23695,13 @@ def make_dt_interval( Parameters ---------- - days : :class:`~pyspark.sql.Column` or str, optional + days : :class:`~pyspark.sql.Column` or column name, optional The number of days, positive or negative. - hours : :class:`~pyspark.sql.Column` or str, optional + hours : :class:`~pyspark.sql.Column` or column name, optional The number of hours, positive or negative. - mins : :class:`~pyspark.sql.Column` or str, optional + mins : :class:`~pyspark.sql.Column` or column name, optional The number of minutes, positive or negative. - secs : :class:`~pyspark.sql.Column` or str, optional + secs : :class:`~pyspark.sql.Column` or column name, optional The number of seconds with the fractional part in microsecond precision. Returns @@ -22386,63 +23709,62 @@ def make_dt_interval( :class:`~pyspark.sql.Column` A new column that contains a DayTimeIntervalType duration. - Examples + See Also -------- + :meth:`pyspark.sql.functions.make_interval` + :meth:`pyspark.sql.functions.make_ym_interval` + :meth:`pyspark.sql.functions.try_make_interval` + Examples + -------- Example 1: Make DayTimeIntervalType duration from days, hours, mins and secs. 
>>> import pyspark.sql.functions as sf - >>> df = spark.createDataFrame([[1, 12, 30, 01.001001]], - ... ["day", "hour", "min", "sec"]) - >>> df.select(sf.make_dt_interval(df.day, df.hour, df.min, df.sec)).show(truncate=False) - +------------------------------------------+ - |make_dt_interval(day, hour, min, sec) | - +------------------------------------------+ - |INTERVAL '1 12:30:01.001001' DAY TO SECOND| - +------------------------------------------+ + >>> df = spark.createDataFrame([[1, 12, 30, 01.001001]], ['day', 'hour', 'min', 'sec']) + >>> df.select('*', sf.make_dt_interval(df.day, df.hour, df.min, df.sec)).show(truncate=False) + +---+----+---+--------+------------------------------------------+ + |day|hour|min|sec |make_dt_interval(day, hour, min, sec) | + +---+----+---+--------+------------------------------------------+ + |1 |12 |30 |1.001001|INTERVAL '1 12:30:01.001001' DAY TO SECOND| + +---+----+---+--------+------------------------------------------+ Example 2: Make DayTimeIntervalType duration from days, hours and mins. >>> import pyspark.sql.functions as sf - >>> df = spark.createDataFrame([[1, 12, 30, 01.001001]], - ... 
["day", "hour", "min", "sec"]) - >>> df.select(sf.make_dt_interval(df.day, df.hour, df.min)).show(truncate=False) - +-----------------------------------+ - |make_dt_interval(day, hour, min, 0)| - +-----------------------------------+ - |INTERVAL '1 12:30:00' DAY TO SECOND| - +-----------------------------------+ + >>> df = spark.createDataFrame([[1, 12, 30, 01.001001]], ['day', 'hour', 'min', 'sec']) + >>> df.select('*', sf.make_dt_interval(df.day, 'hour', df.min)).show(truncate=False) + +---+----+---+--------+-----------------------------------+ + |day|hour|min|sec |make_dt_interval(day, hour, min, 0)| + +---+----+---+--------+-----------------------------------+ + |1 |12 |30 |1.001001|INTERVAL '1 12:30:00' DAY TO SECOND| + +---+----+---+--------+-----------------------------------+ Example 3: Make DayTimeIntervalType duration from days and hours. >>> import pyspark.sql.functions as sf - >>> df = spark.createDataFrame([[1, 12, 30, 01.001001]], - ... ["day", "hour", "min", "sec"]) - >>> df.select(sf.make_dt_interval(df.day, df.hour)).show(truncate=False) - +-----------------------------------+ - |make_dt_interval(day, hour, 0, 0) | - +-----------------------------------+ - |INTERVAL '1 12:00:00' DAY TO SECOND| - +-----------------------------------+ + >>> df = spark.createDataFrame([[1, 12, 30, 01.001001]], ['day', 'hour', 'min', 'sec']) + >>> df.select('*', sf.make_dt_interval(df.day, df.hour)).show(truncate=False) + +---+----+---+--------+-----------------------------------+ + |day|hour|min|sec |make_dt_interval(day, hour, 0, 0) | + +---+----+---+--------+-----------------------------------+ + |1 |12 |30 |1.001001|INTERVAL '1 12:00:00' DAY TO SECOND| + +---+----+---+--------+-----------------------------------+ Example 4: Make DayTimeIntervalType duration from days. >>> import pyspark.sql.functions as sf - >>> df = spark.createDataFrame([[1, 12, 30, 01.001001]], - ... 
["day", "hour", "min", "sec"]) - >>> df.select(sf.make_dt_interval(df.day)).show(truncate=False) - +-----------------------------------+ - |make_dt_interval(day, 0, 0, 0) | - +-----------------------------------+ - |INTERVAL '1 00:00:00' DAY TO SECOND| - +-----------------------------------+ + >>> df = spark.createDataFrame([[1, 12, 30, 01.001001]], ['day', 'hour', 'min', 'sec']) + >>> df.select('*', sf.make_dt_interval('day')).show(truncate=False) + +---+----+---+--------+-----------------------------------+ + |day|hour|min|sec |make_dt_interval(day, 0, 0, 0) | + +---+----+---+--------+-----------------------------------+ + |1 |12 |30 |1.001001|INTERVAL '1 00:00:00' DAY TO SECOND| + +---+----+---+--------+-----------------------------------+ - Example 5: Make DayTimeIntervalType duration. + Example 5: Make empty interval. >>> import pyspark.sql.functions as sf - >>> df = spark.createDataFrame([[1, 12, 30, 01.001001]], - ... ["day", "hour", "min", "sec"]) - >>> df.select(sf.make_dt_interval()).show(truncate=False) + >>> spark.range(1).select(sf.make_dt_interval()).show(truncate=False) +-----------------------------------+ |make_dt_interval(0, 0, 0, 0) | +-----------------------------------+ @@ -22474,19 +23796,19 @@ def try_make_interval( Parameters ---------- - years : :class:`~pyspark.sql.Column` or str, optional + years : :class:`~pyspark.sql.Column` or column name, optional The number of years, positive or negative. - months : :class:`~pyspark.sql.Column` or str, optional + months : :class:`~pyspark.sql.Column` or column name, optional The number of months, positive or negative. - weeks : :class:`~pyspark.sql.Column` or str, optional + weeks : :class:`~pyspark.sql.Column` or column name, optional The number of weeks, positive or negative. - days : :class:`~pyspark.sql.Column` or str, optional + days : :class:`~pyspark.sql.Column` or column name, optional The number of days, positive or negative. 
- hours : :class:`~pyspark.sql.Column` or str, optional + hours : :class:`~pyspark.sql.Column` or column name, optional The number of hours, positive or negative. - mins : :class:`~pyspark.sql.Column` or str, optional + mins : :class:`~pyspark.sql.Column` or column name, optional The number of minutes, positive or negative. - secs : :class:`~pyspark.sql.Column` or str, optional + secs : :class:`~pyspark.sql.Column` or column name, optional The number of seconds with the fractional part in microsecond precision. Returns @@ -22494,16 +23816,21 @@ def try_make_interval( :class:`~pyspark.sql.Column` A new column that contains an interval. - Examples + See Also -------- + :meth:`pyspark.sql.functions.make_interval` + :meth:`pyspark.sql.functions.make_dt_interval` + :meth:`pyspark.sql.functions.make_ym_interval` + Examples + -------- Example 1: Try make interval from years, months, weeks, days, hours, mins and secs. >>> import pyspark.sql.functions as sf >>> df = spark.createDataFrame([[100, 11, 1, 1, 12, 30, 01.001001]], - ... ["year", "month", "week", "day", "hour", "min", "sec"]) - >>> df.select(sf.try_make_interval( - ... df.year, df.month, df.week, df.day, df.hour, df.min, df.sec) + ... ['year', 'month', 'week', 'day', 'hour', 'min', 'sec']) + >>> df.select( + ... sf.try_make_interval(df.year, df.month, 'week', df.day, 'hour', df.min, df.sec) ... ).show(truncate=False) +---------------------------------------------------------------+ |try_make_interval(year, month, week, day, hour, min, sec) | @@ -22515,9 +23842,9 @@ def try_make_interval( >>> import pyspark.sql.functions as sf >>> df = spark.createDataFrame([[100, 11, 1, 1, 12, 30, 01.001001]], - ... ["year", "month", "week", "day", "hour", "min", "sec"]) - >>> df.select(sf.try_make_interval( - ... df.year, df.month, df.week, df.day, df.hour, df.min) + ... ['year', 'month', 'week', 'day', 'hour', 'min', 'sec']) + >>> df.select( + ... sf.try_make_interval(df.year, df.month, 'week', df.day, df.hour, df.min) ... 
).show(truncate=False) +-------------------------------------------------------+ |try_make_interval(year, month, week, day, hour, min, 0)| @@ -22529,9 +23856,9 @@ def try_make_interval( >>> import pyspark.sql.functions as sf >>> df = spark.createDataFrame([[100, 11, 1, 1, 12, 30, 01.001001]], - ... ["year", "month", "week", "day", "hour", "min", "sec"]) - >>> df.select(sf.try_make_interval( - ... df.year, df.month, df.week, df.day, df.hour) + ... ['year', 'month', 'week', 'day', 'hour', 'min', 'sec']) + >>> df.select( + ... sf.try_make_interval(df.year, df.month, 'week', df.day, df.hour) ... ).show(truncate=False) +-----------------------------------------------------+ |try_make_interval(year, month, week, day, hour, 0, 0)| @@ -22543,8 +23870,8 @@ def try_make_interval( >>> import pyspark.sql.functions as sf >>> df = spark.createDataFrame([[100, 11, 1, 1, 12, 30, 01.001001]], - ... ["year", "month", "week", "day", "hour", "min", "sec"]) - >>> df.select(sf.try_make_interval(df.year, df.month, df.week, df.day)).show(truncate=False) + ... ['year', 'month', 'week', 'day', 'hour', 'min', 'sec']) + >>> df.select(sf.try_make_interval(df.year, 'month', df.week, df.day)).show(truncate=False) +--------------------------------------------------+ |try_make_interval(year, month, week, day, 0, 0, 0)| +--------------------------------------------------+ @@ -22555,8 +23882,8 @@ def try_make_interval( >>> import pyspark.sql.functions as sf >>> df = spark.createDataFrame([[100, 11, 1, 1, 12, 30, 01.001001]], - ... ["year", "month", "week", "day", "hour", "min", "sec"]) - >>> df.select(sf.try_make_interval(df.year, df.month, df.week)).show(truncate=False) + ... 
['year', 'month', 'week', 'day', 'hour', 'min', 'sec']) + >>> df.select(sf.try_make_interval(df.year, 'month', df.week)).show(truncate=False) +------------------------------------------------+ |try_make_interval(year, month, week, 0, 0, 0, 0)| +------------------------------------------------+ @@ -22567,8 +23894,8 @@ def try_make_interval( >>> import pyspark.sql.functions as sf >>> df = spark.createDataFrame([[100, 11, 1, 1, 12, 30, 01.001001]], - ... ["year", "month", "week", "day", "hour", "min", "sec"]) - >>> df.select(sf.try_make_interval(df.year, df.month)).show(truncate=False) + ... ['year', 'month', 'week', 'day', 'hour', 'min', 'sec']) + >>> df.select(sf.try_make_interval(df.year, 'month')).show(truncate=False) +---------------------------------------------+ |try_make_interval(year, month, 0, 0, 0, 0, 0)| +---------------------------------------------+ @@ -22579,7 +23906,7 @@ def try_make_interval( >>> import pyspark.sql.functions as sf >>> df = spark.createDataFrame([[100, 11, 1, 1, 12, 30, 01.001001]], - ... ["year", "month", "week", "day", "hour", "min", "sec"]) + ... ['year', 'month', 'week', 'day', 'hour', 'min', 'sec']) >>> df.select(sf.try_make_interval(df.year)).show(truncate=False) +-----------------------------------------+ |try_make_interval(year, 0, 0, 0, 0, 0, 0)| @@ -22587,18 +23914,25 @@ def try_make_interval( |100 years | +-----------------------------------------+ - Example 8: Try make interval from years with overflow. + Example 8: Try make empty interval. >>> import pyspark.sql.functions as sf - >>> df = spark.createDataFrame([[2147483647, 11, 1, 1, 12, 30, 01.001001]], - ... 
["year", "month", "week", "day", "hour", "min", "sec"]) - >>> df.select(sf.try_make_interval(df.year)).show(truncate=False) - +-----------------------------------------+ - |try_make_interval(year, 0, 0, 0, 0, 0, 0)| - +-----------------------------------------+ - |NULL | - +-----------------------------------------+ + >>> spark.range(1).select(sf.try_make_interval()).show(truncate=False) + +--------------------------------------+ + |try_make_interval(0, 0, 0, 0, 0, 0, 0)| + +--------------------------------------+ + |0 seconds | + +--------------------------------------+ + + Example 9: Try make interval from years with overflow. + >>> import pyspark.sql.functions as sf + >>> spark.range(1).select(sf.try_make_interval(sf.lit(2147483647))).show(truncate=False) + +-----------------------------------------------+ + |try_make_interval(2147483647, 0, 0, 0, 0, 0, 0)| + +-----------------------------------------------+ + |NULL | + +-----------------------------------------------+ """ _years = lit(0) if years is None else years _months = lit(0) if months is None else months @@ -22629,19 +23963,19 @@ def make_interval( Parameters ---------- - years : :class:`~pyspark.sql.Column` or str, optional + years : :class:`~pyspark.sql.Column` or column name, optional The number of years, positive or negative. - months : :class:`~pyspark.sql.Column` or str, optional + months : :class:`~pyspark.sql.Column` or column name, optional The number of months, positive or negative. - weeks : :class:`~pyspark.sql.Column` or str, optional + weeks : :class:`~pyspark.sql.Column` or column name, optional The number of weeks, positive or negative. - days : :class:`~pyspark.sql.Column` or str, optional + days : :class:`~pyspark.sql.Column` or column name, optional The number of days, positive or negative. - hours : :class:`~pyspark.sql.Column` or str, optional + hours : :class:`~pyspark.sql.Column` or column name, optional The number of hours, positive or negative. 
- mins : :class:`~pyspark.sql.Column` or str, optional + mins : :class:`~pyspark.sql.Column` or column name, optional The number of minutes, positive or negative. - secs : :class:`~pyspark.sql.Column` or str, optional + secs : :class:`~pyspark.sql.Column` or column name, optional The number of seconds with the fractional part in microsecond precision. Returns @@ -22649,16 +23983,21 @@ def make_interval( :class:`~pyspark.sql.Column` A new column that contains an interval. - Examples + See Also -------- + :meth:`pyspark.sql.functions.make_dt_interval` + :meth:`pyspark.sql.functions.make_ym_interval` + :meth:`pyspark.sql.functions.try_make_interval` + Examples + -------- Example 1: Make interval from years, months, weeks, days, hours, mins and secs. >>> import pyspark.sql.functions as sf >>> df = spark.createDataFrame([[100, 11, 1, 1, 12, 30, 01.001001]], - ... ["year", "month", "week", "day", "hour", "min", "sec"]) - >>> df.select(sf.make_interval( - ... df.year, df.month, df.week, df.day, df.hour, df.min, df.sec) + ... ['year', 'month', 'week', 'day', 'hour', 'min', 'sec']) + >>> df.select( + ... sf.make_interval(df.year, df.month, 'week', df.day, df.hour, df.min, df.sec) ... ).show(truncate=False) +---------------------------------------------------------------+ |make_interval(year, month, week, day, hour, min, sec) | @@ -22670,9 +24009,9 @@ def make_interval( >>> import pyspark.sql.functions as sf >>> df = spark.createDataFrame([[100, 11, 1, 1, 12, 30, 01.001001]], - ... ["year", "month", "week", "day", "hour", "min", "sec"]) - >>> df.select(sf.make_interval( - ... df.year, df.month, df.week, df.day, df.hour, df.min) + ... ['year', 'month', 'week', 'day', 'hour', 'min', 'sec']) + >>> df.select( + ... sf.make_interval(df.year, df.month, 'week', df.day, df.hour, df.min) ... 
).show(truncate=False) +---------------------------------------------------+ |make_interval(year, month, week, day, hour, min, 0)| @@ -22684,9 +24023,9 @@ def make_interval( >>> import pyspark.sql.functions as sf >>> df = spark.createDataFrame([[100, 11, 1, 1, 12, 30, 01.001001]], - ... ["year", "month", "week", "day", "hour", "min", "sec"]) - >>> df.select(sf.make_interval( - ... df.year, df.month, df.week, df.day, df.hour) + ... ['year', 'month', 'week', 'day', 'hour', 'min', 'sec']) + >>> df.select( + ... sf.make_interval(df.year, df.month, 'week', df.day, df.hour) ... ).show(truncate=False) +-------------------------------------------------+ |make_interval(year, month, week, day, hour, 0, 0)| @@ -22698,8 +24037,8 @@ def make_interval( >>> import pyspark.sql.functions as sf >>> df = spark.createDataFrame([[100, 11, 1, 1, 12, 30, 01.001001]], - ... ["year", "month", "week", "day", "hour", "min", "sec"]) - >>> df.select(sf.make_interval(df.year, df.month, df.week, df.day)).show(truncate=False) + ... ['year', 'month', 'week', 'day', 'hour', 'min', 'sec']) + >>> df.select(sf.make_interval(df.year, df.month, 'week', df.day)).show(truncate=False) +----------------------------------------------+ |make_interval(year, month, week, day, 0, 0, 0)| +----------------------------------------------+ @@ -22710,8 +24049,8 @@ def make_interval( >>> import pyspark.sql.functions as sf >>> df = spark.createDataFrame([[100, 11, 1, 1, 12, 30, 01.001001]], - ... ["year", "month", "week", "day", "hour", "min", "sec"]) - >>> df.select(sf.make_interval(df.year, df.month, df.week)).show(truncate=False) + ... 
['year', 'month', 'week', 'day', 'hour', 'min', 'sec']) + >>> df.select(sf.make_interval(df.year, df.month, 'week')).show(truncate=False) +--------------------------------------------+ |make_interval(year, month, week, 0, 0, 0, 0)| +--------------------------------------------+ @@ -22722,7 +24061,7 @@ def make_interval( >>> import pyspark.sql.functions as sf >>> df = spark.createDataFrame([[100, 11, 1, 1, 12, 30, 01.001001]], - ... ["year", "month", "week", "day", "hour", "min", "sec"]) + ... ['year', 'month', 'week', 'day', 'hour', 'min', 'sec']) >>> df.select(sf.make_interval(df.year, df.month)).show(truncate=False) +-----------------------------------------+ |make_interval(year, month, 0, 0, 0, 0, 0)| @@ -22734,7 +24073,7 @@ def make_interval( >>> import pyspark.sql.functions as sf >>> df = spark.createDataFrame([[100, 11, 1, 1, 12, 30, 01.001001]], - ... ["year", "month", "week", "day", "hour", "min", "sec"]) + ... ['year', 'month', 'week', 'day', 'hour', 'min', 'sec']) >>> df.select(sf.make_interval(df.year)).show(truncate=False) +-------------------------------------+ |make_interval(year, 0, 0, 0, 0, 0, 0)| @@ -22742,12 +24081,10 @@ def make_interval( |100 years | +-------------------------------------+ - Example 8: Make interval. + Example 8: Make empty interval. >>> import pyspark.sql.functions as sf - >>> df = spark.createDataFrame([[100, 11, 1, 1, 12, 30, 01.001001]], - ... 
["year", "month", "week", "day", "hour", "min", "sec"]) - >>> df.select(sf.make_interval()).show(truncate=False) + >>> spark.range(1).select(sf.make_interval()).show(truncate=False) +----------------------------------+ |make_interval(0, 0, 0, 0, 0, 0, 0)| +----------------------------------+ @@ -22786,22 +24123,22 @@ def make_timestamp( Parameters ---------- - years : :class:`~pyspark.sql.Column` or str + years : :class:`~pyspark.sql.Column` or column name The year to represent, from 1 to 9999 - months : :class:`~pyspark.sql.Column` or str + months : :class:`~pyspark.sql.Column` or column name The month-of-year to represent, from 1 (January) to 12 (December) - days : :class:`~pyspark.sql.Column` or str + days : :class:`~pyspark.sql.Column` or column name The day-of-month to represent, from 1 to 31 - hours : :class:`~pyspark.sql.Column` or str + hours : :class:`~pyspark.sql.Column` or column name The hour-of-day to represent, from 0 to 23 - mins : :class:`~pyspark.sql.Column` or str + mins : :class:`~pyspark.sql.Column` or column name The minute-of-hour to represent, from 0 to 59 - secs : :class:`~pyspark.sql.Column` or str + secs : :class:`~pyspark.sql.Column` or column name The second-of-minute and its micro-fraction to represent, from 0 to 60. The value can be either an integer like 13 , or a fraction like 13.123. If the sec argument equals to 60, the seconds field is set to 0 and 1 minute is added to the final timestamp. - timezone : :class:`~pyspark.sql.Column` or str, optional + timezone : :class:`~pyspark.sql.Column` or column name, optional The time zone identifier. For example, CET, UTC and etc. Returns @@ -22809,38 +24146,48 @@ def make_timestamp( :class:`~pyspark.sql.Column` A new column that contains a timestamp. 
+ See Also + -------- + :meth:`pyspark.sql.functions.make_timestamp_ltz` + :meth:`pyspark.sql.functions.make_timestamp_ntz` + :meth:`pyspark.sql.functions.try_make_timestamp` + :meth:`pyspark.sql.functions.try_make_timestamp_ltz` + :meth:`pyspark.sql.functions.try_make_timestamp_ntz` + :meth:`pyspark.sql.functions.make_interval` + :meth:`pyspark.sql.functions.try_make_interval` + Examples -------- + >>> spark.conf.set("spark.sql.session.timeZone", "America/Los_Angeles") Example 1: Make timestamp from years, months, days, hours, mins and secs. >>> import pyspark.sql.functions as sf - >>> spark.conf.set("spark.sql.session.timeZone", "America/Los_Angeles") >>> df = spark.createDataFrame([[2014, 12, 28, 6, 30, 45.887, 'CET']], - ... ["year", "month", "day", "hour", "min", "sec", "timezone"]) - >>> df.select(sf.make_timestamp( - ... df.year, df.month, df.day, df.hour, df.min, df.sec, df.timezone) + ... ['year', 'month', 'day', 'hour', 'min', 'sec', 'tz']) + >>> df.select( + ... sf.make_timestamp(df.year, df.month, df.day, 'hour', df.min, df.sec, 'tz') ... ).show(truncate=False) - +----------------------------------------------------------+ - |make_timestamp(year, month, day, hour, min, sec, timezone)| - +----------------------------------------------------------+ - |2014-12-27 21:30:45.887 | - +----------------------------------------------------------+ + +----------------------------------------------------+ + |make_timestamp(year, month, day, hour, min, sec, tz)| + +----------------------------------------------------+ + |2014-12-27 21:30:45.887 | + +----------------------------------------------------+ Example 2: Make timestamp without timezone. >>> import pyspark.sql.functions as sf - >>> spark.conf.set("spark.sql.session.timeZone", "America/Los_Angeles") >>> df = spark.createDataFrame([[2014, 12, 28, 6, 30, 45.887, 'CET']], - ... ["year", "month", "day", "hour", "min", "sec", "timezone"]) - >>> df.select(sf.make_timestamp( - ... 
df.year, df.month, df.day, df.hour, df.min, df.sec) + ... ['year', 'month', 'day', 'hour', 'min', 'sec', 'tz']) + >>> df.select( + ... sf.make_timestamp(df.year, df.month, df.day, 'hour', df.min, df.sec) ... ).show(truncate=False) +------------------------------------------------+ |make_timestamp(year, month, day, hour, min, sec)| +------------------------------------------------+ |2014-12-28 06:30:45.887 | +------------------------------------------------+ + >>> spark.conf.unset("spark.sql.session.timeZone") """ if timezone is not None: @@ -22895,17 +24242,27 @@ def try_make_timestamp( :class:`~pyspark.sql.Column` A new column that contains a timestamp or NULL in case of an error. + See Also + -------- + :meth:`pyspark.sql.functions.make_timestamp` + :meth:`pyspark.sql.functions.make_timestamp_ltz` + :meth:`pyspark.sql.functions.make_timestamp_ntz` + :meth:`pyspark.sql.functions.try_make_timestamp_ltz` + :meth:`pyspark.sql.functions.try_make_timestamp_ntz` + :meth:`pyspark.sql.functions.make_interval` + :meth:`pyspark.sql.functions.try_make_interval` + Examples -------- + >>> spark.conf.set("spark.sql.session.timeZone", "America/Los_Angeles") Example 1: Make timestamp from years, months, days, hours, mins and secs. >>> import pyspark.sql.functions as sf - >>> spark.conf.set("spark.sql.session.timeZone", "America/Los_Angeles") >>> df = spark.createDataFrame([[2014, 12, 28, 6, 30, 45.887, 'CET']], - ... ["year", "month", "day", "hour", "min", "sec", "timezone"]) - >>> df.select(sf.try_make_timestamp( - ... df.year, df.month, df.day, df.hour, df.min, df.sec, df.timezone) + ... ['year', 'month', 'day', 'hour', 'min', 'sec', 'tz']) + >>> df.select( + ... sf.try_make_timestamp(df.year, df.month, df.day, 'hour', df.min, df.sec, 'tz') ... ).show(truncate=False) +----------------------------------------------------+ |try_make_timestamp(year, month, day, hour, min, sec)| @@ -22916,11 +24273,10 @@ def try_make_timestamp( Example 2: Make timestamp without timezone. 
>>> import pyspark.sql.functions as sf - >>> spark.conf.set("spark.sql.session.timeZone", "America/Los_Angeles") >>> df = spark.createDataFrame([[2014, 12, 28, 6, 30, 45.887, 'CET']], - ... ["year", "month", "day", "hour", "min", "sec", "timezone"]) - >>> df.select(sf.try_make_timestamp( - ... df.year, df.month, df.day, df.hour, df.min, df.sec) + ... ['year', 'month', 'day', 'hour', 'min', 'sec', 'tz']) + >>> df.select( + ... sf.try_make_timestamp(df.year, df.month, df.day, 'hour', df.min, df.sec) ... ).show(truncate=False) +----------------------------------------------------+ |try_make_timestamp(year, month, day, hour, min, sec)| @@ -22932,17 +24288,17 @@ def try_make_timestamp( Example 3: Make timestamp with invalid input. >>> import pyspark.sql.functions as sf - >>> spark.conf.set("spark.sql.session.timeZone", "America/Los_Angeles") >>> df = spark.createDataFrame([[2014, 13, 28, 6, 30, 45.887, 'CET']], - ... ["year", "month", "day", "hour", "min", "sec", "timezone"]) - >>> df.select(sf.try_make_timestamp( - ... df.year, df.month, df.day, df.hour, df.min, df.sec) + ... ['year', 'month', 'day', 'hour', 'min', 'sec', 'tz']) + >>> df.select( + ... sf.try_make_timestamp(df.year, df.month, df.day, 'hour', df.min, df.sec) ... ).show(truncate=False) +----------------------------------------------------+ |try_make_timestamp(year, month, day, hour, min, sec)| +----------------------------------------------------+ |NULL | +----------------------------------------------------+ + >>> spark.conf.unset("spark.sql.session.timeZone") """ if timezone is not None: @@ -22997,38 +24353,48 @@ def make_timestamp_ltz( :class:`~pyspark.sql.Column` A new column that contains a current timestamp. 
+ See Also + -------- + :meth:`pyspark.sql.functions.make_timestamp` + :meth:`pyspark.sql.functions.make_timestamp_ntz` + :meth:`pyspark.sql.functions.try_make_timestamp` + :meth:`pyspark.sql.functions.try_make_timestamp_ltz` + :meth:`pyspark.sql.functions.try_make_timestamp_ntz` + :meth:`pyspark.sql.functions.make_interval` + :meth:`pyspark.sql.functions.try_make_interval` + Examples -------- + >>> spark.conf.set("spark.sql.session.timeZone", "America/Los_Angeles") Example 1: Make the current timestamp from years, months, days, hours, mins and secs. >>> import pyspark.sql.functions as sf - >>> spark.conf.set("spark.sql.session.timeZone", "America/Los_Angeles") >>> df = spark.createDataFrame([[2014, 12, 28, 6, 30, 45.887, 'CET']], - ... ["year", "month", "day", "hour", "min", "sec", "timezone"]) - >>> df.select(sf.make_timestamp_ltz( - ... df.year, df.month, df.day, df.hour, df.min, df.sec, df.timezone) + ... ['year', 'month', 'day', 'hour', 'min', 'sec', 'tz']) + >>> df.select( + ... sf.make_timestamp_ltz(df.year, df.month, 'day', df.hour, df.min, df.sec, 'tz') ... ).show(truncate=False) - +--------------------------------------------------------------+ - |make_timestamp_ltz(year, month, day, hour, min, sec, timezone)| - +--------------------------------------------------------------+ - |2014-12-27 21:30:45.887 | - +--------------------------------------------------------------+ + +--------------------------------------------------------+ + |make_timestamp_ltz(year, month, day, hour, min, sec, tz)| + +--------------------------------------------------------+ + |2014-12-27 21:30:45.887 | + +--------------------------------------------------------+ Example 2: Make the current timestamp without timezone. >>> import pyspark.sql.functions as sf - >>> spark.conf.set("spark.sql.session.timeZone", "America/Los_Angeles") >>> df = spark.createDataFrame([[2014, 12, 28, 6, 30, 45.887, 'CET']], - ... 
["year", "month", "day", "hour", "min", "sec", "timezone"]) - >>> df.select(sf.make_timestamp_ltz( - ... df.year, df.month, df.day, df.hour, df.min, df.sec) + ... ['year', 'month', 'day', 'hour', 'min', 'sec', 'tz']) + >>> df.select( + ... sf.make_timestamp_ltz(df.year, df.month, 'day', df.hour, df.min, df.sec) ... ).show(truncate=False) +----------------------------------------------------+ |make_timestamp_ltz(year, month, day, hour, min, sec)| +----------------------------------------------------+ |2014-12-28 06:30:45.887 | +----------------------------------------------------+ + >>> spark.conf.unset("spark.sql.session.timeZone") """ if timezone is not None: @@ -23083,54 +24449,62 @@ def try_make_timestamp_ltz( :class:`~pyspark.sql.Column` A new column that contains a current timestamp, or NULL in case of an error. + See Also + -------- + :meth:`pyspark.sql.functions.make_timestamp` + :meth:`pyspark.sql.functions.make_timestamp_ltz` + :meth:`pyspark.sql.functions.make_timestamp_ntz` + :meth:`pyspark.sql.functions.try_make_timestamp` + :meth:`pyspark.sql.functions.try_make_timestamp_ntz` + :meth:`pyspark.sql.functions.make_interval` + :meth:`pyspark.sql.functions.try_make_interval` + Examples -------- + >>> spark.conf.set("spark.sql.session.timeZone", "America/Los_Angeles") Example 1: Make the current timestamp from years, months, days, hours, mins and secs. >>> import pyspark.sql.functions as sf - >>> spark.conf.set("spark.sql.session.timeZone", "America/Los_Angeles") >>> df = spark.createDataFrame([[2014, 12, 28, 6, 30, 45.887, 'CET']], - ... ["year", "month", "day", "hour", "min", "sec", "timezone"]) - >>> df.select(sf.try_make_timestamp_ltz( - ... df.year, df.month, df.day, df.hour, df.min, df.sec, df.timezone) + ... ['year', 'month', 'day', 'hour', 'min', 'sec', 'tz']) + >>> df.select( + ... sf.try_make_timestamp_ltz('year', 'month', df.day, df.hour, df.min, df.sec, 'tz') ... 
).show(truncate=False) - +------------------------------------------------------------------+ - |try_make_timestamp_ltz(year, month, day, hour, min, sec, timezone)| - +------------------------------------------------------------------+ - |2014-12-27 21:30:45.887 | - +------------------------------------------------------------------+ + +------------------------------------------------------------+ + |try_make_timestamp_ltz(year, month, day, hour, min, sec, tz)| + +------------------------------------------------------------+ + |2014-12-27 21:30:45.887 | + +------------------------------------------------------------+ Example 2: Make the current timestamp without timezone. >>> import pyspark.sql.functions as sf - >>> spark.conf.set("spark.sql.session.timeZone", "America/Los_Angeles") >>> df = spark.createDataFrame([[2014, 12, 28, 6, 30, 45.887, 'CET']], - ... ["year", "month", "day", "hour", "min", "sec", "timezone"]) - >>> df.select(sf.try_make_timestamp_ltz( - ... df.year, df.month, df.day, df.hour, df.min, df.sec) + ... ['year', 'month', 'day', 'hour', 'min', 'sec', 'tz']) + >>> df.select( + ... sf.try_make_timestamp_ltz('year', 'month', df.day, df.hour, df.min, df.sec) ... ).show(truncate=False) +--------------------------------------------------------+ |try_make_timestamp_ltz(year, month, day, hour, min, sec)| +--------------------------------------------------------+ |2014-12-28 06:30:45.887 | +--------------------------------------------------------+ - >>> spark.conf.unset("spark.sql.session.timeZone") Example 3: Make the current timestamp with invalid input. >>> import pyspark.sql.functions as sf - >>> spark.conf.set("spark.sql.session.timeZone", "America/Los_Angeles") >>> df = spark.createDataFrame([[2014, 13, 28, 6, 30, 45.887, 'CET']], - ... ["year", "month", "day", "hour", "min", "sec", "timezone"]) - >>> df.select(sf.try_make_timestamp_ltz( - ... df.year, df.month, df.day, df.hour, df.min, df.sec) + ... 
['year', 'month', 'day', 'hour', 'min', 'sec', 'tz']) + >>> df.select( + ... sf.try_make_timestamp_ltz('year', 'month', df.day, df.hour, df.min, df.sec) ... ).show(truncate=False) +--------------------------------------------------------+ |try_make_timestamp_ltz(year, month, day, hour, min, sec)| +--------------------------------------------------------+ |NULL | +--------------------------------------------------------+ + >>> spark.conf.unset("spark.sql.session.timeZone") """ if timezone is not None: @@ -23161,17 +24535,17 @@ def make_timestamp_ntz( Parameters ---------- - years : :class:`~pyspark.sql.Column` or str + years : :class:`~pyspark.sql.Column` or column name The year to represent, from 1 to 9999 - months : :class:`~pyspark.sql.Column` or str + months : :class:`~pyspark.sql.Column` or column name The month-of-year to represent, from 1 (January) to 12 (December) - days : :class:`~pyspark.sql.Column` or str + days : :class:`~pyspark.sql.Column` or column name The day-of-month to represent, from 1 to 31 - hours : :class:`~pyspark.sql.Column` or str + hours : :class:`~pyspark.sql.Column` or column name The hour-of-day to represent, from 0 to 23 - mins : :class:`~pyspark.sql.Column` or str + mins : :class:`~pyspark.sql.Column` or column name The minute-of-hour to represent, from 0 to 59 - secs : :class:`~pyspark.sql.Column` or str + secs : :class:`~pyspark.sql.Column` or column name The second-of-minute and its micro-fraction to represent, from 0 to 60. The value can be either an integer like 13 , or a fraction like 13.123. If the sec argument equals to 60, the seconds field is set @@ -23182,23 +24556,32 @@ def make_timestamp_ntz( :class:`~pyspark.sql.Column` A new column that contains a local date-time. 
- Examples + See Also -------- + :meth:`pyspark.sql.functions.make_timestamp` + :meth:`pyspark.sql.functions.make_timestamp_ltz` + :meth:`pyspark.sql.functions.try_make_timestamp` + :meth:`pyspark.sql.functions.try_make_timestamp_ltz` + :meth:`pyspark.sql.functions.try_make_timestamp_ntz` + :meth:`pyspark.sql.functions.make_interval` + :meth:`pyspark.sql.functions.try_make_interval` - Example 1: Make local date-time from years, months, days, hours, mins, secs. + Examples + -------- + >>> spark.conf.set("spark.sql.session.timeZone", "America/Los_Angeles") >>> import pyspark.sql.functions as sf - >>> spark.conf.set("spark.sql.session.timeZone", "America/Los_Angeles") >>> df = spark.createDataFrame([[2014, 12, 28, 6, 30, 45.887]], - ... ["year", "month", "day", "hour", "min", "sec"]) - >>> df.select(sf.make_timestamp_ntz( - ... df.year, df.month, df.day, df.hour, df.min, df.sec) + ... ['year', 'month', 'day', 'hour', 'min', 'sec']) + >>> df.select( + ... sf.make_timestamp_ntz('year', 'month', df.day, df.hour, df.min, df.sec) ... ).show(truncate=False) +----------------------------------------------------+ |make_timestamp_ntz(year, month, day, hour, min, sec)| +----------------------------------------------------+ |2014-12-28 06:30:45.887 | +----------------------------------------------------+ + >>> spark.conf.unset("spark.sql.session.timeZone") """ return _invoke_function_over_columns( @@ -23244,39 +24627,48 @@ def try_make_timestamp_ntz( :class:`~pyspark.sql.Column` A new column that contains a local date-time, or NULL in case of an error. 
+ See Also + -------- + :meth:`pyspark.sql.functions.make_timestamp` + :meth:`pyspark.sql.functions.make_timestamp_ltz` + :meth:`pyspark.sql.functions.make_timestamp_ntz` + :meth:`pyspark.sql.functions.try_make_timestamp` + :meth:`pyspark.sql.functions.try_make_timestamp_ltz` + :meth:`pyspark.sql.functions.make_interval` + :meth:`pyspark.sql.functions.try_make_interval` + Examples -------- + >>> spark.conf.set("spark.sql.session.timeZone", "America/Los_Angeles") Example 1: Make local date-time from years, months, days, hours, mins, secs. >>> import pyspark.sql.functions as sf - >>> spark.conf.set("spark.sql.session.timeZone", "America/Los_Angeles") >>> df = spark.createDataFrame([[2014, 12, 28, 6, 30, 45.887]], - ... ["year", "month", "day", "hour", "min", "sec"]) - >>> df.select(sf.try_make_timestamp_ntz( - ... df.year, df.month, df.day, df.hour, df.min, df.sec) + ... ['year', 'month', 'day', 'hour', 'min', 'sec']) + >>> df.select( + ... sf.try_make_timestamp_ntz('year', 'month', df.day, df.hour, df.min, df.sec) ... ).show(truncate=False) +--------------------------------------------------------+ |try_make_timestamp_ntz(year, month, day, hour, min, sec)| +--------------------------------------------------------+ |2014-12-28 06:30:45.887 | +--------------------------------------------------------+ - >>> spark.conf.unset("spark.sql.session.timeZone") Example 2: Make local date-time with invalid input >>> import pyspark.sql.functions as sf - >>> spark.conf.set("spark.sql.session.timeZone", "America/Los_Angeles") >>> df = spark.createDataFrame([[2014, 13, 28, 6, 30, 45.887]], - ... ["year", "month", "day", "hour", "min", "sec"]) - >>> df.select(sf.try_make_timestamp_ntz( - ... df.year, df.month, df.day, df.hour, df.min, df.sec) + ... ['year', 'month', 'day', 'hour', 'min', 'sec']) + >>> df.select( + ... sf.try_make_timestamp_ntz('year', 'month', df.day, df.hour, df.min, df.sec) ... 
).show(truncate=False) +--------------------------------------------------------+ |try_make_timestamp_ntz(year, month, day, hour, min, sec)| +--------------------------------------------------------+ |NULL | +--------------------------------------------------------+ + >>> spark.conf.unset("spark.sql.session.timeZone") """ return _invoke_function_over_columns( @@ -23296,9 +24688,9 @@ def make_ym_interval( Parameters ---------- - years : :class:`~pyspark.sql.Column` or str, optional + years : :class:`~pyspark.sql.Column` or column name, optional The number of years, positive or negative - months : :class:`~pyspark.sql.Column` or str, optional + months : :class:`~pyspark.sql.Column` or column name, optional The number of months, positive or negative Returns @@ -23306,44 +24698,48 @@ def make_ym_interval( :class:`~pyspark.sql.Column` A new column that contains a year-month interval. + See Also + -------- + :meth:`pyspark.sql.functions.make_interval` + :meth:`pyspark.sql.functions.make_dt_interval` + :meth:`pyspark.sql.functions.try_make_interval` + Examples -------- + >>> spark.conf.set("spark.sql.session.timeZone", "America/Los_Angeles") Example 1: Make year-month interval from years, months. 
>>> import pyspark.sql.functions as sf - >>> spark.conf.set("spark.sql.session.timeZone", "America/Los_Angeles") - >>> df = spark.createDataFrame([[2014, 12]], ["year", "month"]) - >>> df.select(sf.make_ym_interval(df.year, df.month)).show(truncate=False) - +-------------------------------+ - |make_ym_interval(year, month) | - +-------------------------------+ - |INTERVAL '2015-0' YEAR TO MONTH| - +-------------------------------+ + >>> df = spark.createDataFrame([[2014, 12]], ['year', 'month']) + >>> df.select('*', sf.make_ym_interval('year', df.month)).show(truncate=False) + +----+-----+-------------------------------+ + |year|month|make_ym_interval(year, month) | + +----+-----+-------------------------------+ + |2014|12 |INTERVAL '2015-0' YEAR TO MONTH| + +----+-----+-------------------------------+ Example 2: Make year-month interval from years. >>> import pyspark.sql.functions as sf - >>> spark.conf.set("spark.sql.session.timeZone", "America/Los_Angeles") - >>> df = spark.createDataFrame([[2014, 12]], ["year", "month"]) - >>> df.select(sf.make_ym_interval(df.year)).show(truncate=False) - +-------------------------------+ - |make_ym_interval(year, 0) | - +-------------------------------+ - |INTERVAL '2014-0' YEAR TO MONTH| - +-------------------------------+ + >>> df = spark.createDataFrame([[2014, 12]], ['year', 'month']) + >>> df.select('*', sf.make_ym_interval(df.year)).show(truncate=False) + +----+-----+-------------------------------+ + |year|month|make_ym_interval(year, 0) | + +----+-----+-------------------------------+ + |2014|12 |INTERVAL '2014-0' YEAR TO MONTH| + +----+-----+-------------------------------+ - Example 3: Make year-month interval. + Example 3: Make empty interval. 
>>> import pyspark.sql.functions as sf - >>> spark.conf.set("spark.sql.session.timeZone", "America/Los_Angeles") - >>> df = spark.createDataFrame([[2014, 12]], ["year", "month"]) - >>> df.select(sf.make_ym_interval()).show(truncate=False) + >>> spark.range(1).select(sf.make_ym_interval()).show(truncate=False) +----------------------------+ |make_ym_interval(0, 0) | +----------------------------+ |INTERVAL '0-0' YEAR TO MONTH| +----------------------------+ + >>> spark.conf.unset("spark.sql.session.timeZone") """ _years = lit(0) if years is None else years @@ -23948,21 +25344,21 @@ def aes_encrypt( Parameters ---------- - input : :class:`~pyspark.sql.Column` or str + input : :class:`~pyspark.sql.Column` or column name The binary value to encrypt. - key : :class:`~pyspark.sql.Column` or str + key : :class:`~pyspark.sql.Column` or column name The passphrase to use to encrypt the data. mode : :class:`~pyspark.sql.Column` or str, optional Specifies which block cipher mode should be used to encrypt messages. Valid modes: ECB, GCM, CBC. - padding : :class:`~pyspark.sql.Column` or str, optional + padding : :class:`~pyspark.sql.Column` or column name, optional Specifies how to pad messages whose length is not a multiple of the block size. Valid values: PKCS, NONE, DEFAULT. The DEFAULT padding means PKCS for ECB, NONE for GCM and PKCS for CBC. - iv : :class:`~pyspark.sql.Column` or str, optional + iv : :class:`~pyspark.sql.Column` or column name, optional Optional initialization vector. Only supported for CBC and GCM modes. Valid values: None or "". 16-byte array for CBC mode. 12-byte array for GCM mode. - aad : :class:`~pyspark.sql.Column` or str, optional + aad : :class:`~pyspark.sql.Column` or column name, optional Optional additional authenticated data. Only supported for GCM mode. This can be any free-form input and must be provided for both encryption and decryption. 
@@ -23971,6 +25367,11 @@ def aes_encrypt( :class:`~pyspark.sql.Column` A new column that contains an encrypted value. + See Also + -------- + :meth:`pyspark.sql.functions.aes_decrypt` + :meth:`pyspark.sql.functions.try_aes_decrypt` + Examples -------- @@ -23983,7 +25384,7 @@ def aes_encrypt( ... ["input", "key", "mode", "padding", "iv", "aad"] ... ) >>> df.select(sf.base64(sf.aes_encrypt( - ... df.input, df.key, df.mode, df.padding, sf.to_binary(df.iv, sf.lit("hex")), df.aad) + ... df.input, df.key, "mode", df.padding, sf.to_binary(df.iv, sf.lit("hex")), df.aad) ... )).show(truncate=False) +-----------------------------------------------------------------------+ |base64(aes_encrypt(input, key, mode, padding, to_binary(iv, hex), aad))| @@ -24000,7 +25401,7 @@ def aes_encrypt( ... ["input", "key", "mode", "padding", "iv", "aad"] ... ) >>> df.select(sf.base64(sf.aes_encrypt( - ... df.input, df.key, df.mode, df.padding, sf.to_binary(df.iv, sf.lit("hex"))) + ... df.input, df.key, "mode", df.padding, sf.to_binary(df.iv, sf.lit("hex"))) ... )).show(truncate=False) +--------------------------------------------------------------------+ |base64(aes_encrypt(input, key, mode, padding, to_binary(iv, hex), ))| @@ -24015,7 +25416,7 @@ def aes_encrypt( ... "Spark SQL", "1234567890abcdef", "ECB", "PKCS",)], ... ["input", "key", "mode", "padding"] ... ) - >>> df.select(sf.aes_decrypt(sf.aes_encrypt(df.input, df.key, df.mode, df.padding), + >>> df.select(sf.aes_decrypt(sf.aes_encrypt(df.input, df.key, "mode", df.padding), ... df.key, df.mode, df.padding ... ).cast("STRING")).show(truncate=False) +---------------------------------------------------------------------------------------------+ @@ -24031,7 +25432,7 @@ def aes_encrypt( ... "Spark SQL", "0000111122223333", "ECB",)], ... ["input", "key", "mode"] ... ) - >>> df.select(sf.aes_decrypt(sf.aes_encrypt(df.input, df.key, df.mode), + >>> df.select(sf.aes_decrypt(sf.aes_encrypt(df.input, df.key, "mode"), ... df.key, df.mode ... 
).cast("STRING")).show(truncate=False) +---------------------------------------------------------------------------------------------+ @@ -24082,18 +25483,18 @@ def aes_decrypt( Parameters ---------- - input : :class:`~pyspark.sql.Column` or str + input : :class:`~pyspark.sql.Column` or column name The binary value to decrypt. - key : :class:`~pyspark.sql.Column` or str + key : :class:`~pyspark.sql.Column` or column name The passphrase to use to decrypt the data. - mode : :class:`~pyspark.sql.Column` or str, optional + mode : :class:`~pyspark.sql.Column` or column name, optional Specifies which block cipher mode should be used to decrypt messages. Valid modes: ECB, GCM, CBC. - padding : :class:`~pyspark.sql.Column` or str, optional + padding : :class:`~pyspark.sql.Column` or column name, optional Specifies how to pad messages whose length is not a multiple of the block size. Valid values: PKCS, NONE, DEFAULT. The DEFAULT padding means PKCS for ECB, NONE for GCM and PKCS for CBC. - aad : :class:`~pyspark.sql.Column` or str, optional + aad : :class:`~pyspark.sql.Column` or column name, optional Optional additional authenticated data. Only supported for GCM mode. This can be any free-form input and must be provided for both encryption and decryption. @@ -24102,6 +25503,11 @@ def aes_decrypt( :class:`~pyspark.sql.Column` A new column that contains a decrypted value. + See Also + -------- + :meth:`pyspark.sql.functions.aes_encrypt` + :meth:`pyspark.sql.functions.try_aes_decrypt` + Examples -------- @@ -24115,7 +25521,7 @@ def aes_decrypt( ... ["input", "key", "mode", "padding", "aad"] ... ) >>> df.select(sf.aes_decrypt( - ... sf.unbase64(df.input), df.key, df.mode, df.padding, df.aad + ... sf.unbase64(df.input), df.key, "mode", df.padding, df.aad ... 
).cast("STRING")).show(truncate=False) +---------------------------------------------------------------------+ |CAST(aes_decrypt(unbase64(input), key, mode, padding, aad) AS STRING)| @@ -24132,7 +25538,7 @@ def aes_decrypt( ... ["input", "key", "mode", "padding"] ... ) >>> df.select(sf.aes_decrypt( - ... sf.unbase64(df.input), df.key, df.mode, df.padding + ... sf.unbase64(df.input), df.key, "mode", df.padding ... ).cast("STRING")).show(truncate=False) +------------------------------------------------------------------+ |CAST(aes_decrypt(unbase64(input), key, mode, padding, ) AS STRING)| @@ -24149,7 +25555,7 @@ def aes_decrypt( ... ["input", "key", "mode", "padding"] ... ) >>> df.select(sf.aes_decrypt( - ... sf.unbase64(df.input), df.key, df.mode + ... sf.unbase64(df.input), df.key, "mode" ... ).cast("STRING")).show(truncate=False) +------------------------------------------------------------------+ |CAST(aes_decrypt(unbase64(input), key, mode, DEFAULT, ) AS STRING)| @@ -24201,18 +25607,18 @@ def try_aes_decrypt( Parameters ---------- - input : :class:`~pyspark.sql.Column` or str + input : :class:`~pyspark.sql.Column` or column name The binary value to decrypt. - key : :class:`~pyspark.sql.Column` or str + key : :class:`~pyspark.sql.Column` or column name The passphrase to use to decrypt the data. - mode : :class:`~pyspark.sql.Column` or str, optional + mode : :class:`~pyspark.sql.Column` or column name, optional Specifies which block cipher mode should be used to decrypt messages. Valid modes: ECB, GCM, CBC. - padding : :class:`~pyspark.sql.Column` or str, optional + padding : :class:`~pyspark.sql.Column` or column name, optional Specifies how to pad messages whose length is not a multiple of the block size. Valid values: PKCS, NONE, DEFAULT. The DEFAULT padding means PKCS for ECB, NONE for GCM and PKCS for CBC. - aad : :class:`~pyspark.sql.Column` or str, optional + aad : :class:`~pyspark.sql.Column` or column name, optional Optional additional authenticated data. 
Only supported for GCM mode. This can be any free-form input and must be provided for both encryption and decryption. @@ -24221,6 +25627,11 @@ def try_aes_decrypt( :class:`~pyspark.sql.Column` A new column that contains a decrypted value or a NULL value. + See Also + -------- + :meth:`pyspark.sql.functions.aes_encrypt` + :meth:`pyspark.sql.functions.aes_decrypt` + Examples -------- @@ -24234,7 +25645,7 @@ def try_aes_decrypt( ... ["input", "key", "mode", "padding", "aad"] ... ) >>> df.select(sf.try_aes_decrypt( - ... sf.unbase64(df.input), df.key, df.mode, df.padding, df.aad + ... sf.unbase64(df.input), df.key, "mode", df.padding, df.aad ... ).cast("STRING")).show(truncate=False) +-------------------------------------------------------------------------+ |CAST(try_aes_decrypt(unbase64(input), key, mode, padding, aad) AS STRING)| @@ -24252,7 +25663,7 @@ def try_aes_decrypt( ... ["input", "key", "mode", "padding", "aad"] ... ) >>> df.select(sf.try_aes_decrypt( - ... sf.unbase64(df.input), df.key, df.mode, df.padding, df.aad + ... sf.unbase64(df.input), df.key, "mode", df.padding, df.aad ... ).cast("STRING")).show(truncate=False) +-------------------------------------------------------------------------+ |CAST(try_aes_decrypt(unbase64(input), key, mode, padding, aad) AS STRING)| @@ -24269,7 +25680,7 @@ def try_aes_decrypt( ... ["input", "key", "mode", "padding"] ... ) >>> df.select(sf.try_aes_decrypt( - ... sf.unbase64(df.input), df.key, df.mode, df.padding + ... sf.unbase64(df.input), df.key, "mode", df.padding ... ).cast("STRING")).show(truncate=False) +----------------------------------------------------------------------+ |CAST(try_aes_decrypt(unbase64(input), key, mode, padding, ) AS STRING)| @@ -24286,7 +25697,7 @@ def try_aes_decrypt( ... ["input", "key", "mode", "padding"] ... ) >>> df.select(sf.try_aes_decrypt( - ... sf.unbase64(df.input), df.key, df.mode + ... sf.unbase64(df.input), df.key, "mode" ... 
).cast("STRING")).show(truncate=False) +----------------------------------------------------------------------+ |CAST(try_aes_decrypt(unbase64(input), key, mode, DEFAULT, ) AS STRING)| @@ -24326,7 +25737,12 @@ def sha(col: "ColumnOrName") -> Column: Parameters ---------- - col : :class:`~pyspark.sql.Column` or str + col : :class:`~pyspark.sql.Column` or column name + + See Also + -------- + :meth:`pyspark.sql.functions.sha1` + :meth:`pyspark.sql.functions.sha2` Examples -------- @@ -24416,18 +25832,28 @@ def reflect(*cols: "ColumnOrName") -> Column: Parameters ---------- - cols : :class:`~pyspark.sql.Column` or str - the first element should be a literal string for the class name, - and the second element should be a literal string for the method name, - and the remaining are input arguments to the Java method. + cols : :class:`~pyspark.sql.Column` or column name + the first element should be a Column representing literal string for the class name, + and the second element should be a Column representing literal string for the method name, + and the remaining are input arguments (Columns or column names) to the Java method. + + See Also + -------- + :meth:`pyspark.sql.functions.java_method` + :meth:`pyspark.sql.functions.try_reflect` Examples -------- - >>> df = spark.createDataFrame([("a5cf6c42-0c85-418f-af6c-3e4e5b1328f2",)], ["a"]) + >>> import pyspark.sql.functions as sf + >>> df = spark.createDataFrame([('a5cf6c42-0c85-418f-af6c-3e4e5b1328f2',)], ['a']) >>> df.select( - ... reflect(lit("java.util.UUID"), lit("fromString"), df.a).alias('r') - ... ).collect() - [Row(r='a5cf6c42-0c85-418f-af6c-3e4e5b1328f2')] + ... sf.reflect(sf.lit('java.util.UUID'), sf.lit('fromString'), 'a') + ... 
).show(truncate=False) + +--------------------------------------+ + |reflect(java.util.UUID, fromString, a)| + +--------------------------------------+ + |a5cf6c42-0c85-418f-af6c-3e4e5b1328f2 | + +--------------------------------------+ """ return _invoke_function_over_seq_of_columns("reflect", cols) @@ -24441,13 +25867,20 @@ def java_method(*cols: "ColumnOrName") -> Column: Parameters ---------- - cols : :class:`~pyspark.sql.Column` or str - the first element should be a literal string for the class name, - and the second element should be a literal string for the method name, - and the remaining are input arguments to the Java method. + cols : :class:`~pyspark.sql.Column` or column name + the first element should be a Column representing literal string for the class name, + and the second element should be a Column representing literal string for the method name, + and the remaining are input arguments (Columns or column names) to the Java method. + + See Also + -------- + :meth:`pyspark.sql.functions.reflect` + :meth:`pyspark.sql.functions.try_reflect` Examples -------- + Example 1: Reflecting a method call with a column argument + >>> import pyspark.sql.functions as sf >>> spark.range(1).select( ... sf.java_method( @@ -24461,6 +25894,19 @@ def java_method(*cols: "ColumnOrName") -> Column: +-----------------------------------------------------------------------------+ |a5cf6c42-0c85-418f-af6c-3e4e5b1328f2 | +-----------------------------------------------------------------------------+ + + Example 2: Reflecting a method call with a column name argument + + >>> import pyspark.sql.functions as sf + >>> df = spark.createDataFrame([('a5cf6c42-0c85-418f-af6c-3e4e5b1328f2',)], ['a']) + >>> df.select( + ... sf.java_method(sf.lit('java.util.UUID'), sf.lit('fromString'), 'a') + ... 
).show(truncate=False) + +------------------------------------------+ + |java_method(java.util.UUID, fromString, a)| + +------------------------------------------+ + |a5cf6c42-0c85-418f-af6c-3e4e5b1328f2 | + +------------------------------------------+ """ return _invoke_function_over_seq_of_columns("java_method", cols) @@ -24476,10 +25922,15 @@ def try_reflect(*cols: "ColumnOrName") -> Column: Parameters ---------- - cols : :class:`~pyspark.sql.Column` or str - the first element should be a literal string for the class name, - and the second element should be a literal string for the method name, - and the remaining are input arguments to the Java method. + cols : :class:`~pyspark.sql.Column` or column name + the first element should be a Column representing literal string for the class name, + and the second element should be a Column representing literal string for the method name, + and the remaining are input arguments (Columns or column names) to the Java method. + + See Also + -------- + :meth:`pyspark.sql.functions.reflect` + :meth:`pyspark.sql.functions.java_method` Examples -------- @@ -24488,25 +25939,24 @@ def try_reflect(*cols: "ColumnOrName") -> Column: >>> from pyspark.sql import functions as sf >>> df = spark.createDataFrame([("a5cf6c42-0c85-418f-af6c-3e4e5b1328f2",)], ["a"]) >>> df.select( - ... sf.try_reflect(sf.lit("java.util.UUID"), sf.lit("fromString"), df.a) - ... ).show() + ... sf.try_reflect(sf.lit("java.util.UUID"), sf.lit("fromString"), "a") + ... ).show(truncate=False) +------------------------------------------+ |try_reflect(java.util.UUID, fromString, a)| +------------------------------------------+ - | a5cf6c42-0c85-418...| + |a5cf6c42-0c85-418f-af6c-3e4e5b1328f2 | +------------------------------------------+ Example 2: Exception in the reflection call, resulting in null >>> from pyspark.sql import functions as sf - >>> df = spark.range(1) - >>> df.select( + >>> spark.range(1).select( ... 
sf.try_reflect(sf.lit("scala.Predef"), sf.lit("require"), sf.lit(False)) - ... ).show() + ... ).show(truncate=False) +-----------------------------------------+ |try_reflect(scala.Predef, require, false)| +-----------------------------------------+ - | NULL| + |NULL | +-----------------------------------------+ """ return _invoke_function_over_seq_of_columns("try_reflect", cols) @@ -24522,12 +25972,12 @@ def version() -> Column: Examples -------- - >>> df = spark.range(1) - >>> df.select(version()).show(truncate=False) # doctest: +SKIP + >>> from pyspark.sql import functions as sf + >>> spark.range(1).select(sf.version()).show(truncate=False) # doctest: +SKIP +----------------------------------------------+ |version() | +----------------------------------------------+ - |3.5.0 cafbea5b13623276517a9d716f75745eff91f616| + |4.0.0 4f8d1f575e99aeef8990c63a9614af0fc5479330| +----------------------------------------------+ """ return _invoke_function_over_columns("version") @@ -24542,13 +25992,18 @@ def typeof(col: "ColumnOrName") -> Column: Parameters ---------- - col : :class:`~pyspark.sql.Column` or str + col : :class:`~pyspark.sql.Column` or column name Examples -------- - >>> df = spark.createDataFrame([(1,)], ["a"]) - >>> df.select(typeof(df.a).alias('r')).collect() - [Row(r='bigint')] + >>> from pyspark.sql import functions as sf + >>> df = spark.createDataFrame([(True, 1, 1.0, 'xyz',)], ['a', 'b', 'c', 'd']) + >>> df.select(sf.typeof(df.a), sf.typeof(df.b), sf.typeof('c'), sf.typeof('d')).show() + +---------+---------+---------+---------+ + |typeof(a)|typeof(b)|typeof(c)|typeof(d)| + +---------+---------+---------+---------+ + | boolean| bigint| double| string| + +---------+---------+---------+---------+ """ return _invoke_function_over_columns("typeof", col) @@ -24563,20 +26018,48 @@ def stack(*cols: "ColumnOrName") -> Column: Parameters ---------- - cols : :class:`~pyspark.sql.Column` or str + cols : :class:`~pyspark.sql.Column` or column name the first element 
should be a literal int for the number of rows to be separated, and the remaining are input elements to be separated. Examples -------- - >>> df = spark.createDataFrame([(1, 2, 3)], ["a", "b", "c"]) - >>> df.select(stack(lit(2), df.a, df.b, df.c)).show(truncate=False) - +----+----+ - |col0|col1| - +----+----+ - |1 |2 | - |3 |NULL| - +----+----+ + >>> from pyspark.sql import functions as sf + >>> df = spark.createDataFrame([(1, 2, 3)], ['a', 'b', 'c']) + >>> df.select('*', sf.stack(sf.lit(2), df.a, df.b, 'c')).show() + +---+---+---+----+----+ + | a| b| c|col0|col1| + +---+---+---+----+----+ + | 1| 2| 3| 1| 2| + | 1| 2| 3| 3|NULL| + +---+---+---+----+----+ + + >>> df.select('*', sf.stack(sf.lit(2), df.a, df.b, 'c').alias('x', 'y')).show() + +---+---+---+---+----+ + | a| b| c| x| y| + +---+---+---+---+----+ + | 1| 2| 3| 1| 2| + | 1| 2| 3| 3|NULL| + +---+---+---+---+----+ + + >>> df.select('*', sf.stack(sf.lit(3), df.a, df.b, 'c')).show() + +---+---+---+----+ + | a| b| c|col0| + +---+---+---+----+ + | 1| 2| 3| 1| + | 1| 2| 3| 2| + | 1| 2| 3| 3| + +---+---+---+----+ + + >>> df.select('*', sf.stack(sf.lit(4), df.a, df.b, 'c')).show() + +---+---+---+----+ + | a| b| c|col0| + +---+---+---+----+ + | 1| 2| 3| 1| + | 1| 2| 3| 2| + | 1| 2| 3| 3| + | 1| 2| 3|NULL| + +---+---+---+----+ """ return _invoke_function_over_seq_of_columns("stack", cols) @@ -24590,14 +26073,26 @@ def bitmap_bit_position(col: "ColumnOrName") -> Column: Parameters ---------- - col : :class:`~pyspark.sql.Column` or str + col : :class:`~pyspark.sql.Column` or column name The input column. 
+ See Also + -------- + :meth:`pyspark.sql.functions.bitmap_bucket_number` + :meth:`pyspark.sql.functions.bitmap_construct_agg` + :meth:`pyspark.sql.functions.bitmap_count` + :meth:`pyspark.sql.functions.bitmap_or_agg` + Examples -------- - >>> df = spark.createDataFrame([(123,)], ["a"]) - >>> df.select(bitmap_bit_position(df.a).alias("r")).collect() - [Row(r=122)] + >>> from pyspark.sql import functions as sf + >>> df = spark.createDataFrame([(123,)], ['a']) + >>> df.select('*', sf.bitmap_bit_position('a')).show() + +---+----------------------+ + | a|bitmap_bit_position(a)| + +---+----------------------+ + |123| 122| + +---+----------------------+ """ return _invoke_function_over_columns("bitmap_bit_position", col) @@ -24611,14 +26106,26 @@ def bitmap_bucket_number(col: "ColumnOrName") -> Column: Parameters ---------- - col : :class:`~pyspark.sql.Column` or str + col : :class:`~pyspark.sql.Column` or column name The input column. + See Also + -------- + :meth:`pyspark.sql.functions.bitmap_bit_position` + :meth:`pyspark.sql.functions.bitmap_construct_agg` + :meth:`pyspark.sql.functions.bitmap_count` + :meth:`pyspark.sql.functions.bitmap_or_agg` + Examples -------- - >>> df = spark.createDataFrame([(123,)], ["a"]) - >>> df.select(bitmap_bucket_number(df.a).alias("r")).collect() - [Row(r=1)] + >>> from pyspark.sql import functions as sf + >>> df = spark.createDataFrame([(123,)], ['a']) + >>> df.select('*', sf.bitmap_bucket_number('a')).show() + +---+-----------------------+ + | a|bitmap_bucket_number(a)| + +---+-----------------------+ + |123| 1| + +---+-----------------------+ """ return _invoke_function_over_columns("bitmap_bucket_number", col) @@ -24633,16 +26140,28 @@ def bitmap_construct_agg(col: "ColumnOrName") -> Column: Parameters ---------- - col : :class:`~pyspark.sql.Column` or str + col : :class:`~pyspark.sql.Column` or column name The input column will most likely be bitmap_bit_position(). 
+ See Also + -------- + :meth:`pyspark.sql.functions.bitmap_bit_position` + :meth:`pyspark.sql.functions.bitmap_bucket_number` + :meth:`pyspark.sql.functions.bitmap_count` + :meth:`pyspark.sql.functions.bitmap_or_agg` + Examples -------- + >>> from pyspark.sql import functions as sf >>> df = spark.createDataFrame([(1,),(2,),(3,)], ["a"]) - >>> df.select(substring(hex( - ... bitmap_construct_agg(bitmap_bit_position(df.a)) - ... ), 0, 6).alias("r")).collect() - [Row(r='070000')] + >>> df.select( + ... sf.bitmap_construct_agg(sf.bitmap_bit_position('a')) + ... ).show() + +--------------------------------------------+ + |bitmap_construct_agg(bitmap_bit_position(a))| + +--------------------------------------------+ + | [07 00 00 00 00 0...| + +--------------------------------------------+ """ return _invoke_function_over_columns("bitmap_construct_agg", col) @@ -24656,14 +26175,26 @@ def bitmap_count(col: "ColumnOrName") -> Column: Parameters ---------- - col : :class:`~pyspark.sql.Column` or str + col : :class:`~pyspark.sql.Column` or column name The input bitmap. + See Also + -------- + :meth:`pyspark.sql.functions.bitmap_bit_position` + :meth:`pyspark.sql.functions.bitmap_bucket_number` + :meth:`pyspark.sql.functions.bitmap_construct_agg` + :meth:`pyspark.sql.functions.bitmap_or_agg` + Examples -------- + >>> from pyspark.sql import functions as sf >>> df = spark.createDataFrame([("FFFF",)], ["a"]) - >>> df.select(bitmap_count(to_binary(df.a, lit("hex"))).alias('r')).collect() - [Row(r=16)] + >>> df.select(sf.bitmap_count(sf.to_binary(df.a, sf.lit("hex")))).show() + +-------------------------------+ + |bitmap_count(to_binary(a, hex))| + +-------------------------------+ + | 16| + +-------------------------------+ """ return _invoke_function_over_columns("bitmap_count", col) @@ -24676,18 +26207,28 @@ def bitmap_or_agg(col: "ColumnOrName") -> Column: .. 
versionadded:: 3.5.0 + See Also + -------- + :meth:`pyspark.sql.functions.bitmap_bit_position` + :meth:`pyspark.sql.functions.bitmap_bucket_number` + :meth:`pyspark.sql.functions.bitmap_construct_agg` + :meth:`pyspark.sql.functions.bitmap_count` + Parameters ---------- - col : :class:`~pyspark.sql.Column` or str + col : :class:`~pyspark.sql.Column` or column name The input column should be bitmaps created from bitmap_construct_agg(). Examples -------- + >>> from pyspark.sql import functions as sf >>> df = spark.createDataFrame([("10",),("20",),("40",)], ["a"]) - >>> df.select(substring(hex( - ... bitmap_or_agg(to_binary(df.a, lit("hex"))) - ... ), 0, 6).alias("r")).collect() - [Row(r='700000')] + >>> df.select(sf.bitmap_or_agg(sf.to_binary(df.a, sf.lit("hex")))).show() + +--------------------------------+ + |bitmap_or_agg(to_binary(a, hex))| + +--------------------------------+ + | [70 00 00 00 00 0...| + +--------------------------------+ """ return _invoke_function_over_columns("bitmap_or_agg", col) diff --git a/python/pyspark/sql/observation.py b/python/pyspark/sql/observation.py index 6ceb6bc90327a..09ae7a339a4cb 100644 --- a/python/pyspark/sql/observation.py +++ b/python/pyspark/sql/observation.py @@ -122,7 +122,7 @@ def _on(self, df: DataFrame, *exprs: Column) -> DataFrame: self._jvm = df._sc._jvm assert self._jvm is not None - cls = self._jvm.org.apache.spark.sql.Observation + cls = getattr(self._jvm, "org.apache.spark.sql.Observation") self._jo = cls(self._name) if self._name is not None else cls() observed_df = df._jdf.observe( self._jo, exprs[0]._jc, _to_seq(df._sc, [c._jc for c in exprs[1:]]) diff --git a/python/pyspark/sql/pandas/group_ops.py b/python/pyspark/sql/pandas/group_ops.py index 56efe0676c08f..343a68bf010bf 100644 --- a/python/pyspark/sql/pandas/group_ops.py +++ b/python/pyspark/sql/pandas/group_ops.py @@ -35,7 +35,8 @@ TimerValues, ) from pyspark.sql.streaming.stateful_processor import StatefulProcessor, StatefulProcessorHandle -from 
pyspark.sql.types import StructType, _parse_datatype_string +from pyspark.sql.streaming.stateful_processor_util import TransformWithStateInPandasFuncMode +from pyspark.sql.types import StructType if TYPE_CHECKING: from pyspark.sql.pandas._typing import ( @@ -347,9 +348,9 @@ def applyInPandasWithState( ] if isinstance(outputStructType, str): - outputStructType = cast(StructType, _parse_datatype_string(outputStructType)) + outputStructType = cast(StructType, self._df._session._parse_ddl(outputStructType)) if isinstance(stateStructType, str): - stateStructType = cast(StructType, _parse_datatype_string(stateStructType)) + stateStructType = cast(StructType, self._df._session._parse_ddl(stateStructType)) udf = pandas_udf( func, # type: ignore[call-overload] @@ -374,6 +375,7 @@ def transformWithStateInPandas( outputMode: str, timeMode: str, initialState: Optional["GroupedData"] = None, + eventTimeColumnName: str = "", ) -> DataFrame: """ Invokes methods defined in the stateful processor used in arbitrary state API v2. It @@ -500,63 +502,85 @@ def transformWithStateInPandas( if initialState is not None: assert isinstance(initialState, GroupedData) if isinstance(outputStructType, str): - outputStructType = cast(StructType, _parse_datatype_string(outputStructType)) + outputStructType = cast(StructType, self._df._session._parse_ddl(outputStructType)) - def handle_data_with_timers( + def handle_pre_init( + statefulProcessorApiClient: StatefulProcessorApiClient, + ) -> Iterator["PandasDataFrameLike"]: + # Driver handle is different from the handle used on executors; + # On JVM side, we will use `DriverStatefulProcessorHandleImpl` for driver handle which + # will only be used for handling init() and get the state schema on the driver. 
+ driver_handle = StatefulProcessorHandle(statefulProcessorApiClient) + statefulProcessorApiClient.set_handle_state(StatefulProcessorHandleState.PRE_INIT) + statefulProcessor.init(driver_handle) + + # This method is used for the driver-side stateful processor after we have collected + # all the necessary schemas. This instance of the DriverStatefulProcessorHandleImpl + # won't be used again on JVM. + statefulProcessor.close() + + # return a dummy results, no return value is needed for pre init + return iter([]) + + def handle_data_rows( statefulProcessorApiClient: StatefulProcessorApiClient, key: Any, - inputRows: Iterator["PandasDataFrameLike"], + inputRows: Optional[Iterator["PandasDataFrameLike"]] = None, ) -> Iterator["PandasDataFrameLike"]: statefulProcessorApiClient.set_implicit_key(key) - if timeMode != "none": - batch_timestamp = statefulProcessorApiClient.get_batch_timestamp() - watermark_timestamp = statefulProcessorApiClient.get_watermark_timestamp() + + batch_timestamp, watermark_timestamp = statefulProcessorApiClient.get_timestamps( + timeMode + ) + + # process with data rows + if inputRows is not None: + data_iter = statefulProcessor.handleInputRows( + key, inputRows, TimerValues(batch_timestamp, watermark_timestamp) + ) + return data_iter else: - batch_timestamp = -1 - watermark_timestamp = -1 - # process with invalid expiry timer info and emit data rows - data_iter = statefulProcessor.handleInputRows( - key, - inputRows, - TimerValues(batch_timestamp, watermark_timestamp), - ExpiredTimerInfo(False), + return iter([]) + + def handle_expired_timers( + statefulProcessorApiClient: StatefulProcessorApiClient, + ) -> Iterator["PandasDataFrameLike"]: + batch_timestamp, watermark_timestamp = statefulProcessorApiClient.get_timestamps( + timeMode ) - statefulProcessorApiClient.set_handle_state(StatefulProcessorHandleState.DATA_PROCESSED) - if timeMode == "processingtime": + if timeMode.lower() == "processingtime": expiry_list_iter = 
statefulProcessorApiClient.get_expiry_timers_iterator( batch_timestamp ) - elif timeMode == "eventtime": + elif timeMode.lower() == "eventtime": expiry_list_iter = statefulProcessorApiClient.get_expiry_timers_iterator( watermark_timestamp ) else: expiry_list_iter = iter([[]]) - result_iter_list = [data_iter] - # process with valid expiry time info and with empty input rows, - # only timer related rows will be emitted + # process with expiry timers, only timer related rows will be emitted for expiry_list in expiry_list_iter: for key_obj, expiry_timestamp in expiry_list: - result_iter_list.append( - statefulProcessor.handleInputRows( - key_obj, - iter([]), - TimerValues(batch_timestamp, watermark_timestamp), - ExpiredTimerInfo(True, expiry_timestamp), - ) - ) - # TODO(SPARK-49603) set the handle state in the lazily initialized iterator - - result = itertools.chain(*result_iter_list) - return result + statefulProcessorApiClient.set_implicit_key(key_obj) + for pd in statefulProcessor.handleExpiredTimer( + key=key_obj, + timer_values=TimerValues(batch_timestamp, watermark_timestamp), + expired_timer_info=ExpiredTimerInfo(expiry_timestamp), + ): + yield pd + statefulProcessorApiClient.delete_timer(expiry_timestamp) def transformWithStateUDF( statefulProcessorApiClient: StatefulProcessorApiClient, + mode: TransformWithStateInPandasFuncMode, key: Any, inputRows: Iterator["PandasDataFrameLike"], ) -> Iterator["PandasDataFrameLike"]: + if mode == TransformWithStateInPandasFuncMode.PRE_INIT: + return handle_pre_init(statefulProcessorApiClient) + handle = StatefulProcessorHandle(statefulProcessorApiClient) if statefulProcessorApiClient.handle_state == StatefulProcessorHandleState.CREATED: @@ -565,19 +589,28 @@ def transformWithStateUDF( StatefulProcessorHandleState.INITIALIZED ) - # Key is None when we have processed all the input data from the worker and ready to - # proceed with the cleanup steps. 
- if key is None: + if mode == TransformWithStateInPandasFuncMode.PROCESS_TIMER: + statefulProcessorApiClient.set_handle_state( + StatefulProcessorHandleState.DATA_PROCESSED + ) + result = handle_expired_timers(statefulProcessorApiClient) + return result + elif mode == TransformWithStateInPandasFuncMode.COMPLETE: + statefulProcessorApiClient.set_handle_state( + StatefulProcessorHandleState.TIMER_PROCESSED + ) statefulProcessorApiClient.remove_implicit_key() statefulProcessor.close() statefulProcessorApiClient.set_handle_state(StatefulProcessorHandleState.CLOSED) return iter([]) - - result = handle_data_with_timers(statefulProcessorApiClient, key, inputRows) - return result + else: + # mode == TransformWithStateInPandasFuncMode.PROCESS_DATA + result = handle_data_rows(statefulProcessorApiClient, key, inputRows) + return result def transformWithStateWithInitStateUDF( statefulProcessorApiClient: StatefulProcessorApiClient, + mode: TransformWithStateInPandasFuncMode, key: Any, inputRows: Iterator["PandasDataFrameLike"], initialStates: Optional[Iterator["PandasDataFrameLike"]] = None, @@ -594,6 +627,9 @@ def transformWithStateWithInitStateUDF( - `initialStates` is None, while `inputRows` is not empty. This is not first batch. `initialStates` is initialized to the positional value as None. """ + if mode == TransformWithStateInPandasFuncMode.PRE_INIT: + return handle_pre_init(statefulProcessorApiClient) + handle = StatefulProcessorHandle(statefulProcessorApiClient) if statefulProcessorApiClient.handle_state == StatefulProcessorHandleState.CREATED: @@ -602,20 +638,30 @@ def transformWithStateWithInitStateUDF( StatefulProcessorHandleState.INITIALIZED ) - # Key is None when we have processed all the input data from the worker and ready to - # proceed with the cleanup steps. 
- if key is None: + if mode == TransformWithStateInPandasFuncMode.PROCESS_TIMER: + statefulProcessorApiClient.set_handle_state( + StatefulProcessorHandleState.DATA_PROCESSED + ) + result = handle_expired_timers(statefulProcessorApiClient) + return result + elif mode == TransformWithStateInPandasFuncMode.COMPLETE: statefulProcessorApiClient.remove_implicit_key() statefulProcessor.close() statefulProcessorApiClient.set_handle_state(StatefulProcessorHandleState.CLOSED) return iter([]) + else: + # mode == TransformWithStateInPandasFuncMode.PROCESS_DATA + batch_timestamp, watermark_timestamp = statefulProcessorApiClient.get_timestamps( + timeMode + ) # only process initial state if first batch and initial state is not None if initialStates is not None: for cur_initial_state in initialStates: statefulProcessorApiClient.set_implicit_key(key) - # TODO(SPARK-50194) integration with new timer API with initial state - statefulProcessor.handleInitialState(key, cur_initial_state) + statefulProcessor.handleInitialState( + key, cur_initial_state, TimerValues(batch_timestamp, watermark_timestamp) + ) # if we don't have input rows for the given key but only have initial state # for the grouping key, the inputRows iterator could be empty @@ -628,14 +674,14 @@ def transformWithStateWithInitStateUDF( inputRows = itertools.chain([first], inputRows) if not input_rows_empty: - result = handle_data_with_timers(statefulProcessorApiClient, key, inputRows) + result = handle_data_rows(statefulProcessorApiClient, key, inputRows) else: result = iter([]) return result if isinstance(outputStructType, str): - outputStructType = cast(StructType, _parse_datatype_string(outputStructType)) + outputStructType = cast(StructType, self._df._session._parse_ddl(outputStructType)) df = self._df @@ -662,6 +708,7 @@ def transformWithStateWithInitStateUDF( outputMode, timeMode, initial_state_java_obj, + eventTimeColumnName, ) return DataFrame(jdf, self.session) diff --git a/python/pyspark/sql/pandas/map_ops.py 
b/python/pyspark/sql/pandas/map_ops.py index c11a8b9d8d4d2..424269035f7ee 100644 --- a/python/pyspark/sql/pandas/map_ops.py +++ b/python/pyspark/sql/pandas/map_ops.py @@ -94,7 +94,7 @@ def _build_java_profile( jvm = self.sparkSession.sparkContext._jvm assert jvm is not None - builder = jvm.org.apache.spark.resource.ResourceProfileBuilder() + builder = getattr(jvm, "org.apache.spark.resource.ResourceProfileBuilder")() ereqs = ExecutorResourceRequests(jvm, profile._executor_resource_requests) treqs = TaskResourceRequests(jvm, profile._task_resource_requests) builder.require(ereqs._java_executor_resource_requests) diff --git a/python/pyspark/sql/pandas/serializers.py b/python/pyspark/sql/pandas/serializers.py index 5bf07b87400fe..536bf7307065c 100644 --- a/python/pyspark/sql/pandas/serializers.py +++ b/python/pyspark/sql/pandas/serializers.py @@ -36,6 +36,7 @@ _create_converter_from_pandas, _create_converter_to_pandas, ) +from pyspark.sql.streaming.stateful_processor_util import TransformWithStateInPandasFuncMode from pyspark.sql.types import ( DataType, StringType, @@ -1197,7 +1198,11 @@ def generate_data_batches(batches): data_batches = generate_data_batches(_batches) for k, g in groupby(data_batches, key=lambda x: x[0]): - yield (k, g) + yield (TransformWithStateInPandasFuncMode.PROCESS_DATA, k, g) + + yield (TransformWithStateInPandasFuncMode.PROCESS_TIMER, None, None) + + yield (TransformWithStateInPandasFuncMode.COMPLETE, None, None) def dump_stream(self, iterator, stream): """ @@ -1281,4 +1286,8 @@ def flatten_columns(cur_batch, col_name): data_batches = generate_data_batches(_batches) for k, g in groupby(data_batches, key=lambda x: x[0]): - yield (k, g) + yield (TransformWithStateInPandasFuncMode.PROCESS_DATA, k, g) + + yield (TransformWithStateInPandasFuncMode.PROCESS_TIMER, None, None) + + yield (TransformWithStateInPandasFuncMode.COMPLETE, None, None) diff --git a/python/pyspark/sql/pandas/types.py b/python/pyspark/sql/pandas/types.py index 
648af21502864..d65126bb3db9e 100644 --- a/python/pyspark/sql/pandas/types.py +++ b/python/pyspark/sql/pandas/types.py @@ -53,14 +53,11 @@ ) from pyspark.errors import PySparkTypeError, UnsupportedOperationException, PySparkValueError from pyspark.loose_version import LooseVersion -from pyspark.sql.utils import has_numpy - -if has_numpy: - import numpy as np if TYPE_CHECKING: import pandas as pd import pyarrow as pa + import numpy as np from pyspark.sql.pandas._typing import SeriesLike as PandasSeriesLike from pyspark.sql.pandas._typing import DataFrameLike as PandasDataFrameLike diff --git a/python/pyspark/sql/pandas/utils.py b/python/pyspark/sql/pandas/utils.py index 5849ae0edd6d9..a351c13ff0a08 100644 --- a/python/pyspark/sql/pandas/utils.py +++ b/python/pyspark/sql/pandas/utils.py @@ -61,7 +61,7 @@ def require_minimum_pandas_version() -> None: def require_minimum_pyarrow_version() -> None: """Raise ImportError if minimum version of pyarrow is not installed""" # TODO(HyukjinKwon): Relocate and deduplicate the version specification. 
- minimum_pyarrow_version = "10.0.0" + minimum_pyarrow_version = "11.0.0" import os diff --git a/python/pyspark/sql/plot/core.py b/python/pyspark/sql/plot/core.py index f7133bdb70ed6..e565a5d1ebf32 100644 --- a/python/pyspark/sql/plot/core.py +++ b/python/pyspark/sql/plot/core.py @@ -19,11 +19,10 @@ from typing import Any, TYPE_CHECKING, List, Optional, Union, Sequence from types import ModuleType -from pyspark.errors import PySparkTypeError, PySparkValueError +from pyspark.errors import PySparkValueError from pyspark.sql import Column, functions as F from pyspark.sql.internal import InternalFunction as SF from pyspark.sql.pandas.utils import require_minimum_pandas_version -from pyspark.sql.types import NumericType from pyspark.sql.utils import NumpyHelper, require_minimum_plotly_version if TYPE_CHECKING: @@ -295,7 +294,7 @@ def area(self, x: str, y: Union[str, list[str]], **kwargs: Any) -> "Figure": """ return self(kind="area", x=x, y=y, **kwargs) - def pie(self, x: str, y: str, **kwargs: Any) -> "Figure": + def pie(self, x: str, y: Optional[str], **kwargs: Any) -> "Figure": """ Generate a pie plot. @@ -306,8 +305,8 @@ def pie(self, x: str, y: str, **kwargs: Any) -> "Figure": ---------- x : str Name of column to be used as the category labels for the pie plot. - y : str - Name of the column to plot. + y : str, optional + Name of the column to plot. If not provided, `subplots=True` must be passed at `kwargs`. **kwargs Additional keyword arguments. 
@@ -327,19 +326,8 @@ def pie(self, x: str, y: str, **kwargs: Any) -> "Figure": >>> columns = ["sales", "signups", "visits", "date"] >>> df = spark.createDataFrame(data, columns) >>> df.plot.pie(x='date', y='sales') # doctest: +SKIP + >>> df.plot.pie(x='date', subplots=True) # doctest: +SKIP """ - schema = self.data.schema - - # Check if 'y' is a numerical column - y_field = schema[y] if y in schema.names else None - if y_field is None or not isinstance(y_field.dataType, NumericType): - raise PySparkTypeError( - errorClass="PLOT_NOT_NUMERIC_COLUMN_ARGUMENT", - messageParameters={ - "arg_name": "y", - "arg_type": str(y_field.dataType.__class__.__name__) if y_field else "None", - }, - ) return self(kind="pie", x=x, y=y, **kwargs) def box(self, column: Optional[Union[str, List[str]]] = None, **kwargs: Any) -> "Figure": diff --git a/python/pyspark/sql/plot/plotly.py b/python/pyspark/sql/plot/plotly.py index 959562b43552a..526a36033e2fc 100644 --- a/python/pyspark/sql/plot/plotly.py +++ b/python/pyspark/sql/plot/plotly.py @@ -48,13 +48,34 @@ def plot_pyspark(data: "DataFrame", kind: str, **kwargs: Any) -> "Figure": def plot_pie(data: "DataFrame", **kwargs: Any) -> "Figure": - # TODO(SPARK-49530): Support pie subplots with plotly backend from plotly import express pdf = PySparkPlotAccessor.plot_data_map["pie"](data) x = kwargs.pop("x", None) y = kwargs.pop("y", None) - fig = express.pie(pdf, values=y, names=x, **kwargs) + subplots = kwargs.pop("subplots", False) + if y is None and not subplots: + raise PySparkValueError(errorClass="UNSUPPORTED_PIE_PLOT_PARAM", messageParameters={}) + + numeric_ys = process_column_param(y, data) + + if subplots: + # One pie chart per numeric column + from plotly.subplots import make_subplots + + fig = make_subplots( + rows=1, + cols=len(numeric_ys), + # To accommodate domain-based trace - pie chart + specs=[[{"type": "domain"}] * len(numeric_ys)], + ) + for i, y_col in enumerate(numeric_ys): + subplot_fig = express.pie(pdf, values=y_col, 
names=x, **kwargs) + fig.add_trace( + subplot_fig.data[0], row=1, col=i + 1 + ) # A single pie chart has only one trace + else: + fig = express.pie(pdf, values=numeric_ys[0], names=x, **kwargs) return fig @@ -130,7 +151,7 @@ def plot_box(data: "DataFrame", **kwargs: Any) -> "Figure": def plot_kde(data: "DataFrame", **kwargs: Any) -> "Figure": - from pyspark.sql.utils import has_numpy + from pyspark.testing.utils import have_numpy from pyspark.sql.pandas.utils import require_minimum_pandas_version require_minimum_pandas_version() @@ -145,7 +166,7 @@ def plot_kde(data: "DataFrame", **kwargs: Any) -> "Figure": colnames = process_column_param(kwargs.pop("column", None), data) ind = PySparkKdePlotBase.get_ind(data.select(*colnames), kwargs.pop("ind", None)) - if has_numpy: + if have_numpy: import numpy as np if isinstance(ind, np.ndarray): diff --git a/python/pyspark/sql/protobuf/functions.py b/python/pyspark/sql/protobuf/functions.py index 1e75874e75f9a..ece450a77f4f3 100644 --- a/python/pyspark/sql/protobuf/functions.py +++ b/python/pyspark/sql/protobuf/functions.py @@ -149,13 +149,13 @@ def from_protobuf( elif descFilePath is not None: binary_proto = _read_descriptor_set_file(descFilePath) if binary_proto is not None: - jc = cast(JVMView, sc._jvm).org.apache.spark.sql.protobuf.functions.from_protobuf( - _to_java_column(data), messageName, binary_proto, options or {} - ) + jc = getattr( + cast(JVMView, sc._jvm), "org.apache.spark.sql.protobuf.functions" + ).from_protobuf(_to_java_column(data), messageName, binary_proto, options or {}) else: - jc = cast(JVMView, sc._jvm).org.apache.spark.sql.protobuf.functions.from_protobuf( - _to_java_column(data), messageName, options or {} - ) + jc = getattr( + cast(JVMView, sc._jvm), "org.apache.spark.sql.protobuf.functions" + ).from_protobuf(_to_java_column(data), messageName, options or {}) except TypeError as e: if str(e) == "'JavaPackage' object is not callable": _print_missing_jar("Protobuf", "protobuf", "protobuf", 
sc.version) @@ -271,13 +271,13 @@ def to_protobuf( elif descFilePath is not None: binary_proto = _read_descriptor_set_file(descFilePath) if binary_proto is not None: - jc = cast(JVMView, sc._jvm).org.apache.spark.sql.protobuf.functions.to_protobuf( - _to_java_column(data), messageName, binary_proto, options or {} - ) + jc = getattr( + cast(JVMView, sc._jvm), "org.apache.spark.sql.protobuf.functions" + ).to_protobuf(_to_java_column(data), messageName, binary_proto, options or {}) else: - jc = cast(JVMView, sc._jvm).org.apache.spark.sql.protobuf.functions.to_protobuf( - _to_java_column(data), messageName, options or {} - ) + jc = getattr( + cast(JVMView, sc._jvm), "org.apache.spark.sql.protobuf.functions" + ).to_protobuf(_to_java_column(data), messageName, options or {}) except TypeError as e: if str(e) == "'JavaPackage' object is not callable": diff --git a/python/pyspark/sql/readwriter.py b/python/pyspark/sql/readwriter.py index 4744bdf861d37..96c8f8a475b26 100644 --- a/python/pyspark/sql/readwriter.py +++ b/python/pyspark/sql/readwriter.py @@ -215,7 +215,7 @@ def options(self, **options: "OptionalPrimitiveType") -> "DataFrameReader": Parameters ---------- **options : dict - The dictionary of string keys and prmitive-type values. + The dictionary of string keys and primitive-type values. 
Examples -------- @@ -1174,7 +1174,9 @@ def jdbc( if predicates is not None: gateway = self._spark._sc._gateway assert gateway is not None - jpredicates = utils.to_java_array(gateway, gateway.jvm.java.lang.String, predicates) + jpredicates = utils.to_java_array( + gateway, getattr(gateway.jvm, "java.lang.String"), predicates + ) return self._df(self._jreader.jdbc(url, table, jpredicates, jprop)) return self._df(self._jreader.jdbc(url, table, jprop)) diff --git a/python/pyspark/sql/session.py b/python/pyspark/sql/session.py index e97b844564100..f5bb269c23d6e 100644 --- a/python/pyspark/sql/session.py +++ b/python/pyspark/sql/session.py @@ -18,7 +18,7 @@ import sys import warnings from collections.abc import Sized -from functools import reduce +from functools import reduce, cached_property from threading import RLock from types import TracebackType from typing import ( @@ -58,7 +58,6 @@ _has_nulltype, _merge_type, _create_converter, - _parse_datatype_string, _from_numpy_type, ) from pyspark.errors.exceptions.captured import install_exception_handler @@ -90,12 +89,6 @@ from pyspark.sql.connect.client import SparkConnectClient from pyspark.sql.connect.shell.progress import ProgressHandler -try: - import memory_profiler # noqa: F401 - - has_memory_profiler = True -except Exception: - has_memory_profiler = False __all__ = ["SparkSession"] @@ -779,7 +772,7 @@ def sparkContext(self) -> "SparkContext": """ return self._sc - @property + @cached_property def version(self) -> str: """ The version of Spark on which this application is running. @@ -800,7 +793,7 @@ def version(self) -> str: """ return self._jsparkSession.version() - @property + @cached_property def conf(self) -> RuntimeConfig: """Runtime configuration interface for Spark. 
@@ -828,11 +821,9 @@ def conf(self) -> RuntimeConfig: >>> spark.conf.get("key") 'value' """ - if not hasattr(self, "_conf"): - self._conf = RuntimeConfig(self._jsparkSession.conf()) - return self._conf + return RuntimeConfig(self._jsparkSession.conf()) - @property + @cached_property def catalog(self) -> "Catalog": """Interface through which the user may create, drop, alter or query underlying databases, tables, functions, etc. @@ -860,9 +851,7 @@ def catalog(self) -> "Catalog": """ from pyspark.sql.catalog import Catalog - if not hasattr(self, "_catalog"): - self._catalog = Catalog(self) - return self._catalog + return Catalog(self) @property def udf(self) -> "UDFRegistration": @@ -1478,7 +1467,9 @@ def createDataFrame( # type: ignore[misc] +-----+---+ |Alice| 1| +-----+---+ - >>> spark.createDataFrame(pandas.DataFrame([[1, 2]])).collect() # doctest: +SKIP + + >>> pdf = pandas.DataFrame([[1, 2]]) # doctest: +SKIP + >>> spark.createDataFrame(pdf).show() # doctest: +SKIP +---+---+ | 0| 1| +---+---+ @@ -1493,8 +1484,9 @@ def createDataFrame( # type: ignore[misc] +-----+---+ |Alice| 1| +-----+---+ + >>> table = pyarrow.table({'0': [1], '1': [2]}) # doctest: +SKIP - >>> spark.createDataFrame(table).collect() # doctest: +SKIP + >>> spark.createDataFrame(table).show() # doctest: +SKIP +---+---+ | 0| 1| +---+---+ @@ -1511,7 +1503,7 @@ def createDataFrame( # type: ignore[misc] ) if isinstance(schema, str): - schema = cast(Union[AtomicType, StructType, str], _parse_datatype_string(schema)) + schema = cast(Union[AtomicType, StructType, str], self._parse_ddl(schema)) elif isinstance(schema, (list, tuple)): # Must re-encode any unicode strings to be consistent with StructField names schema = [x.encode("utf-8") if not isinstance(x, str) else x for x in schema] @@ -1913,7 +1905,7 @@ def readStream(self) -> DataStreamReader: """ return DataStreamReader(self) - @property + @cached_property def streams(self) -> "StreamingQueryManager": """Returns a :class:`StreamingQueryManager` 
that allows managing all the :class:`StreamingQuery` instances active on `this` context. @@ -1947,10 +1939,7 @@ def streams(self) -> "StreamingQueryManager": """ from pyspark.sql.streaming import StreamingQueryManager - if hasattr(self, "_sqm"): - return self._sqm - self._sqm: StreamingQueryManager = StreamingQueryManager(self._jsparkSession.streams()) - return self._sqm + return StreamingQueryManager(self._jsparkSession.streams()) @property def tvf(self) -> "TableValuedFunction": @@ -2211,13 +2200,15 @@ def copyFromLocalToFs(self, local_path: str, dest_path: str) -> None: messageParameters={"feature": "SparkSession.copyFromLocalToFs"}, ) - @remote_only def interruptAll(self) -> List[str]: """ Interrupt all operations of this session currently running on the connected server. .. versionadded:: 3.5.0 + .. versionchanged:: 4.0.0 + Supports Spark Classic. + Returns ------- list of str @@ -2227,18 +2218,25 @@ def interruptAll(self) -> List[str]: ----- There is still a possibility of operation finishing just as it is interrupted. """ - raise PySparkRuntimeError( - errorClass="ONLY_SUPPORTED_WITH_SPARK_CONNECT", - messageParameters={"feature": "SparkSession.interruptAll"}, - ) + java_list = self._jsparkSession.interruptAll() + python_list = list() + + # Use iterator to manually iterate through Java list + java_iterator = java_list.iterator() + while java_iterator.hasNext(): + python_list.append(str(java_iterator.next())) + + return python_list - @remote_only def interruptTag(self, tag: str) -> List[str]: """ Interrupt all operations of this session with the given operation tag. .. versionadded:: 3.5.0 + .. versionchanged:: 4.0.0 + Supports Spark Classic. + Returns ------- list of str @@ -2248,18 +2246,25 @@ def interruptTag(self, tag: str) -> List[str]: ----- There is still a possibility of operation finishing just as it is interrupted. 
""" - raise PySparkRuntimeError( - errorClass="ONLY_SUPPORTED_WITH_SPARK_CONNECT", - messageParameters={"feature": "SparkSession.interruptTag"}, - ) + java_list = self._jsparkSession.interruptTag(tag) + python_list = list() + + # Use iterator to manually iterate through Java list + java_iterator = java_list.iterator() + while java_iterator.hasNext(): + python_list.append(str(java_iterator.next())) + + return python_list - @remote_only def interruptOperation(self, op_id: str) -> List[str]: """ Interrupt an operation of this session with the given operationId. .. versionadded:: 3.5.0 + .. versionchanged:: 4.0.0 + Supports Spark Classic. + Returns ------- list of str @@ -2269,12 +2274,16 @@ def interruptOperation(self, op_id: str) -> List[str]: ----- There is still a possibility of operation finishing just as it is interrupted. """ - raise PySparkRuntimeError( - errorClass="ONLY_SUPPORTED_WITH_SPARK_CONNECT", - messageParameters={"feature": "SparkSession.interruptOperation"}, - ) + java_list = self._jsparkSession.interruptOperation(op_id) + python_list = list() + + # Use iterator to manually iterate through Java list + java_iterator = java_list.iterator() + while java_iterator.hasNext(): + python_list.append(str(java_iterator.next())) + + return python_list - @remote_only def addTag(self, tag: str) -> None: """ Add a tag to be assigned to all the operations started by this thread in this session. @@ -2289,17 +2298,16 @@ def addTag(self, tag: str) -> None: .. versionadded:: 3.5.0 + .. versionchanged:: 4.0.0 + Supports Spark Classic. + Parameters ---------- tag : str The tag to be added. Cannot contain ',' (comma) character or be an empty string. 
""" - raise PySparkRuntimeError( - errorClass="ONLY_SUPPORTED_WITH_SPARK_CONNECT", - messageParameters={"feature": "SparkSession.addTag"}, - ) + self._jsparkSession.addTag(tag) - @remote_only def removeTag(self, tag: str) -> None: """ Remove a tag previously added to be assigned to all the operations started by this thread in @@ -2307,17 +2315,16 @@ def removeTag(self, tag: str) -> None: .. versionadded:: 3.5.0 + .. versionchanged:: 4.0.0 + Supports Spark Classic. + Parameters ---------- tag : list of str The tag to be removed. Cannot contain ',' (comma) character or be an empty string. """ - raise PySparkRuntimeError( - errorClass="ONLY_SUPPORTED_WITH_SPARK_CONNECT", - messageParameters={"feature": "SparkSession.removeTag"}, - ) + self._jsparkSession.removeTag(tag) - @remote_only def getTags(self) -> Set[str]: """ Get the tags that are currently set to be assigned to all the operations started by this @@ -2325,27 +2332,40 @@ def getTags(self) -> Set[str]: .. versionadded:: 3.5.0 + .. versionchanged:: 4.0.0 + Supports Spark Classic. + Returns ------- set of str Set of tags of interrupted operations. """ - raise PySparkRuntimeError( - errorClass="ONLY_SUPPORTED_WITH_SPARK_CONNECT", - messageParameters={"feature": "SparkSession.getTags"}, - ) + java_set = self._jsparkSession.getTags() + python_set = set() + + # Use iterator to manually iterate through Java Set + java_iterator = java_set.iterator() + while java_iterator.hasNext(): + python_set.add(str(java_iterator.next())) + + return python_set - @remote_only def clearTags(self) -> None: """ Clear the current thread's operation tags. .. versionadded:: 3.5.0 + + .. versionchanged:: 4.0.0 + Supports Spark Classic. 
""" - raise PySparkRuntimeError( - errorClass="ONLY_SUPPORTED_WITH_SPARK_CONNECT", - messageParameters={"feature": "SparkSession.clearTags"}, - ) + self._jsparkSession.clearTags() + + def _to_ddl(self, struct: StructType) -> str: + return self._sc._to_ddl(struct) + + def _parse_ddl(self, ddl: str) -> DataType: + return self._sc._parse_ddl(ddl) def _test() -> None: diff --git a/python/pyspark/sql/streaming/list_state_client.py b/python/pyspark/sql/streaming/list_state_client.py index d2152842819a5..cb618d1a691b3 100644 --- a/python/pyspark/sql/streaming/list_state_client.py +++ b/python/pyspark/sql/streaming/list_state_client.py @@ -14,10 +14,10 @@ # See the License for the specific language governing permissions and # limitations under the License. # -from typing import Dict, Iterator, List, Union, cast, Tuple +from typing import Dict, Iterator, List, Union, Tuple from pyspark.sql.streaming.stateful_processor_api_client import StatefulProcessorApiClient -from pyspark.sql.types import StructType, TYPE_CHECKING, _parse_datatype_string +from pyspark.sql.types import StructType, TYPE_CHECKING from pyspark.errors import PySparkRuntimeError import uuid @@ -28,8 +28,16 @@ class ListStateClient: - def __init__(self, stateful_processor_api_client: StatefulProcessorApiClient) -> None: + def __init__( + self, + stateful_processor_api_client: StatefulProcessorApiClient, + schema: Union[StructType, str], + ) -> None: self._stateful_processor_api_client = stateful_processor_api_client + if isinstance(schema, str): + self.schema = self._stateful_processor_api_client._parse_string_schema(schema) + else: + self.schema = schema # A dictionary to store the mapping between list state name and a tuple of pandas DataFrame # and the index of the last row that was read. 
self.pandas_df_dict: Dict[str, Tuple["PandasDataFrameLike", int]] = {} @@ -105,12 +113,10 @@ def get(self, state_name: str, iterator_id: str) -> Tuple: pandas_row = pandas_df.iloc[index] return tuple(pandas_row) - def append_value(self, state_name: str, schema: Union[StructType, str], value: Tuple) -> None: + def append_value(self, state_name: str, value: Tuple) -> None: import pyspark.sql.streaming.proto.StateMessage_pb2 as stateMessage - if isinstance(schema, str): - schema = cast(StructType, _parse_datatype_string(schema)) - bytes = self._stateful_processor_api_client._serialize_to_bytes(schema, value) + bytes = self._stateful_processor_api_client._serialize_to_bytes(self.schema, value) append_value_call = stateMessage.AppendValue(value=bytes) list_state_call = stateMessage.ListStateCall( stateName=state_name, appendValue=append_value_call @@ -125,13 +131,9 @@ def append_value(self, state_name: str, schema: Union[StructType, str], value: T # TODO(SPARK-49233): Classify user facing errors. 
raise PySparkRuntimeError(f"Error updating value state: " f"{response_message[1]}") - def append_list( - self, state_name: str, schema: Union[StructType, str], values: List[Tuple] - ) -> None: + def append_list(self, state_name: str, values: List[Tuple]) -> None: import pyspark.sql.streaming.proto.StateMessage_pb2 as stateMessage - if isinstance(schema, str): - schema = cast(StructType, _parse_datatype_string(schema)) append_list_call = stateMessage.AppendList() list_state_call = stateMessage.ListStateCall( stateName=state_name, appendList=append_list_call @@ -141,18 +143,16 @@ def append_list( self._stateful_processor_api_client._send_proto_message(message.SerializeToString()) - self._stateful_processor_api_client._send_arrow_state(schema, values) + self._stateful_processor_api_client._send_arrow_state(self.schema, values) response_message = self._stateful_processor_api_client._receive_proto_message() status = response_message[0] if status != 0: # TODO(SPARK-49233): Classify user facing errors. 
raise PySparkRuntimeError(f"Error updating value state: " f"{response_message[1]}") - def put(self, state_name: str, schema: Union[StructType, str], values: List[Tuple]) -> None: + def put(self, state_name: str, values: List[Tuple]) -> None: import pyspark.sql.streaming.proto.StateMessage_pb2 as stateMessage - if isinstance(schema, str): - schema = cast(StructType, _parse_datatype_string(schema)) put_call = stateMessage.ListStatePut() list_state_call = stateMessage.ListStateCall(stateName=state_name, listStatePut=put_call) state_variable_request = stateMessage.StateVariableRequest(listStateCall=list_state_call) @@ -160,7 +160,7 @@ def put(self, state_name: str, schema: Union[StructType, str], values: List[Tupl self._stateful_processor_api_client._send_proto_message(message.SerializeToString()) - self._stateful_processor_api_client._send_arrow_state(schema, values) + self._stateful_processor_api_client._send_arrow_state(self.schema, values) response_message = self._stateful_processor_api_client._receive_proto_message() status = response_message[0] if status != 0: diff --git a/python/pyspark/sql/streaming/map_state_client.py b/python/pyspark/sql/streaming/map_state_client.py index 6ec7448b48634..c4761ddd48a16 100644 --- a/python/pyspark/sql/streaming/map_state_client.py +++ b/python/pyspark/sql/streaming/map_state_client.py @@ -14,10 +14,10 @@ # See the License for the specific language governing permissions and # limitations under the License. 
# -from typing import Dict, Iterator, Union, cast, Tuple, Optional +from typing import Dict, Iterator, Union, Tuple, Optional from pyspark.sql.streaming.stateful_processor_api_client import StatefulProcessorApiClient -from pyspark.sql.types import StructType, TYPE_CHECKING, _parse_datatype_string +from pyspark.sql.types import StructType, TYPE_CHECKING from pyspark.errors import PySparkRuntimeError import uuid @@ -36,11 +36,15 @@ def __init__( ) -> None: self._stateful_processor_api_client = stateful_processor_api_client if isinstance(user_key_schema, str): - self.user_key_schema = cast(StructType, _parse_datatype_string(user_key_schema)) + self.user_key_schema = self._stateful_processor_api_client._parse_string_schema( + user_key_schema + ) else: self.user_key_schema = user_key_schema if isinstance(value_schema, str): - self.value_schema = cast(StructType, _parse_datatype_string(value_schema)) + self.value_schema = self._stateful_processor_api_client._parse_string_schema( + value_schema + ) else: self.value_schema = value_schema # Dictionaries to store the mapping between iterator id and a tuple of pandas DataFrame diff --git a/python/pyspark/sql/streaming/proto/StateMessage_pb2.py b/python/pyspark/sql/streaming/proto/StateMessage_pb2.py index 0a54690513a39..20af541f307cd 100644 --- a/python/pyspark/sql/streaming/proto/StateMessage_pb2.py +++ b/python/pyspark/sql/streaming/proto/StateMessage_pb2.py @@ -40,7 +40,7 @@ DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile( - b'\n;org/apache/spark/sql/execution/streaming/StateMessage.proto\x12.org.apache.spark.sql.execution.streaming.state"\xa0\x04\n\x0cStateRequest\x12\x18\n\x07version\x18\x01 \x01(\x05R\x07version\x12}\n\x15statefulProcessorCall\x18\x02 \x01(\x0b\x32\x45.org.apache.spark.sql.execution.streaming.state.StatefulProcessorCallH\x00R\x15statefulProcessorCall\x12z\n\x14stateVariableRequest\x18\x03 
\x01(\x0b\x32\x44.org.apache.spark.sql.execution.streaming.state.StateVariableRequestH\x00R\x14stateVariableRequest\x12\x8c\x01\n\x1aimplicitGroupingKeyRequest\x18\x04 \x01(\x0b\x32J.org.apache.spark.sql.execution.streaming.state.ImplicitGroupingKeyRequestH\x00R\x1aimplicitGroupingKeyRequest\x12\x62\n\x0ctimerRequest\x18\x05 \x01(\x0b\x32<.org.apache.spark.sql.execution.streaming.state.TimerRequestH\x00R\x0ctimerRequestB\x08\n\x06method"i\n\rStateResponse\x12\x1e\n\nstatusCode\x18\x01 \x01(\x05R\nstatusCode\x12"\n\x0c\x65rrorMessage\x18\x02 \x01(\tR\x0c\x65rrorMessage\x12\x14\n\x05value\x18\x03 \x01(\x0cR\x05value"x\n\x1cStateResponseWithLongTypeVal\x12\x1e\n\nstatusCode\x18\x01 \x01(\x05R\nstatusCode\x12"\n\x0c\x65rrorMessage\x18\x02 \x01(\tR\x0c\x65rrorMessage\x12\x14\n\x05value\x18\x03 \x01(\x03R\x05value"\xa0\x05\n\x15StatefulProcessorCall\x12h\n\x0esetHandleState\x18\x01 \x01(\x0b\x32>.org.apache.spark.sql.execution.streaming.state.SetHandleStateH\x00R\x0esetHandleState\x12h\n\rgetValueState\x18\x02 \x01(\x0b\x32@.org.apache.spark.sql.execution.streaming.state.StateCallCommandH\x00R\rgetValueState\x12\x66\n\x0cgetListState\x18\x03 \x01(\x0b\x32@.org.apache.spark.sql.execution.streaming.state.StateCallCommandH\x00R\x0cgetListState\x12\x64\n\x0bgetMapState\x18\x04 \x01(\x0b\x32@.org.apache.spark.sql.execution.streaming.state.StateCallCommandH\x00R\x0bgetMapState\x12o\n\x0etimerStateCall\x18\x05 \x01(\x0b\x32\x45.org.apache.spark.sql.execution.streaming.state.TimerStateCallCommandH\x00R\x0etimerStateCall\x12j\n\x0e\x64\x65leteIfExists\x18\x06 \x01(\x0b\x32@.org.apache.spark.sql.execution.streaming.state.StateCallCommandH\x00R\x0e\x64\x65leteIfExistsB\x08\n\x06method"\xd5\x02\n\x14StateVariableRequest\x12h\n\x0evalueStateCall\x18\x01 \x01(\x0b\x32>.org.apache.spark.sql.execution.streaming.state.ValueStateCallH\x00R\x0evalueStateCall\x12\x65\n\rlistStateCall\x18\x02 
\x01(\x0b\x32=.org.apache.spark.sql.execution.streaming.state.ListStateCallH\x00R\rlistStateCall\x12\x62\n\x0cmapStateCall\x18\x03 \x01(\x0b\x32<.org.apache.spark.sql.execution.streaming.state.MapStateCallH\x00R\x0cmapStateCallB\x08\n\x06method"\x83\x02\n\x1aImplicitGroupingKeyRequest\x12h\n\x0esetImplicitKey\x18\x01 \x01(\x0b\x32>.org.apache.spark.sql.execution.streaming.state.SetImplicitKeyH\x00R\x0esetImplicitKey\x12q\n\x11removeImplicitKey\x18\x02 \x01(\x0b\x32\x41.org.apache.spark.sql.execution.streaming.state.RemoveImplicitKeyH\x00R\x11removeImplicitKeyB\x08\n\x06method"\x81\x02\n\x0cTimerRequest\x12q\n\x11timerValueRequest\x18\x01 \x01(\x0b\x32\x41.org.apache.spark.sql.execution.streaming.state.TimerValueRequestH\x00R\x11timerValueRequest\x12t\n\x12\x65xpiryTimerRequest\x18\x02 \x01(\x0b\x32\x42.org.apache.spark.sql.execution.streaming.state.ExpiryTimerRequestH\x00R\x12\x65xpiryTimerRequestB\x08\n\x06method"\xf6\x01\n\x11TimerValueRequest\x12s\n\x12getProcessingTimer\x18\x01 \x01(\x0b\x32\x41.org.apache.spark.sql.execution.streaming.state.GetProcessingTimeH\x00R\x12getProcessingTimer\x12\x62\n\x0cgetWatermark\x18\x02 \x01(\x0b\x32<.org.apache.spark.sql.execution.streaming.state.GetWatermarkH\x00R\x0cgetWatermarkB\x08\n\x06method"B\n\x12\x45xpiryTimerRequest\x12,\n\x11\x65xpiryTimestampMs\x18\x01 \x01(\x03R\x11\x65xpiryTimestampMs"\x13\n\x11GetProcessingTime"\x0e\n\x0cGetWatermark"\xc7\x01\n\x10StateCallCommand\x12\x1c\n\tstateName\x18\x01 \x01(\tR\tstateName\x12\x16\n\x06schema\x18\x02 \x01(\tR\x06schema\x12\x30\n\x13mapStateValueSchema\x18\x03 \x01(\tR\x13mapStateValueSchema\x12K\n\x03ttl\x18\x04 \x01(\x0b\x32\x39.org.apache.spark.sql.execution.streaming.state.TTLConfigR\x03ttl"\xa7\x02\n\x15TimerStateCallCommand\x12[\n\x08register\x18\x01 \x01(\x0b\x32=.org.apache.spark.sql.execution.streaming.state.RegisterTimerH\x00R\x08register\x12U\n\x06\x64\x65lete\x18\x02 
\x01(\x0b\x32;.org.apache.spark.sql.execution.streaming.state.DeleteTimerH\x00R\x06\x64\x65lete\x12P\n\x04list\x18\x03 \x01(\x0b\x32:.org.apache.spark.sql.execution.streaming.state.ListTimersH\x00R\x04listB\x08\n\x06method"\x92\x03\n\x0eValueStateCall\x12\x1c\n\tstateName\x18\x01 \x01(\tR\tstateName\x12P\n\x06\x65xists\x18\x02 \x01(\x0b\x32\x36.org.apache.spark.sql.execution.streaming.state.ExistsH\x00R\x06\x65xists\x12G\n\x03get\x18\x03 \x01(\x0b\x32\x33.org.apache.spark.sql.execution.streaming.state.GetH\x00R\x03get\x12n\n\x10valueStateUpdate\x18\x04 \x01(\x0b\x32@.org.apache.spark.sql.execution.streaming.state.ValueStateUpdateH\x00R\x10valueStateUpdate\x12M\n\x05\x63lear\x18\x05 \x01(\x0b\x32\x35.org.apache.spark.sql.execution.streaming.state.ClearH\x00R\x05\x63learB\x08\n\x06method"\xdf\x04\n\rListStateCall\x12\x1c\n\tstateName\x18\x01 \x01(\tR\tstateName\x12P\n\x06\x65xists\x18\x02 \x01(\x0b\x32\x36.org.apache.spark.sql.execution.streaming.state.ExistsH\x00R\x06\x65xists\x12\x62\n\x0clistStateGet\x18\x03 \x01(\x0b\x32<.org.apache.spark.sql.execution.streaming.state.ListStateGetH\x00R\x0clistStateGet\x12\x62\n\x0clistStatePut\x18\x04 \x01(\x0b\x32<.org.apache.spark.sql.execution.streaming.state.ListStatePutH\x00R\x0clistStatePut\x12_\n\x0b\x61ppendValue\x18\x05 \x01(\x0b\x32;.org.apache.spark.sql.execution.streaming.state.AppendValueH\x00R\x0b\x61ppendValue\x12\\\n\nappendList\x18\x06 \x01(\x0b\x32:.org.apache.spark.sql.execution.streaming.state.AppendListH\x00R\nappendList\x12M\n\x05\x63lear\x18\x07 \x01(\x0b\x32\x35.org.apache.spark.sql.execution.streaming.state.ClearH\x00R\x05\x63learB\x08\n\x06method"\xc2\x06\n\x0cMapStateCall\x12\x1c\n\tstateName\x18\x01 \x01(\tR\tstateName\x12P\n\x06\x65xists\x18\x02 \x01(\x0b\x32\x36.org.apache.spark.sql.execution.streaming.state.ExistsH\x00R\x06\x65xists\x12V\n\x08getValue\x18\x03 \x01(\x0b\x32\x38.org.apache.spark.sql.execution.streaming.state.GetValueH\x00R\x08getValue\x12_\n\x0b\x63ontainsKey\x18\x04 
\x01(\x0b\x32;.org.apache.spark.sql.execution.streaming.state.ContainsKeyH\x00R\x0b\x63ontainsKey\x12_\n\x0bupdateValue\x18\x05 \x01(\x0b\x32;.org.apache.spark.sql.execution.streaming.state.UpdateValueH\x00R\x0bupdateValue\x12V\n\x08iterator\x18\x06 \x01(\x0b\x32\x38.org.apache.spark.sql.execution.streaming.state.IteratorH\x00R\x08iterator\x12J\n\x04keys\x18\x07 \x01(\x0b\x32\x34.org.apache.spark.sql.execution.streaming.state.KeysH\x00R\x04keys\x12P\n\x06values\x18\x08 \x01(\x0b\x32\x36.org.apache.spark.sql.execution.streaming.state.ValuesH\x00R\x06values\x12Y\n\tremoveKey\x18\t \x01(\x0b\x32\x39.org.apache.spark.sql.execution.streaming.state.RemoveKeyH\x00R\tremoveKey\x12M\n\x05\x63lear\x18\n \x01(\x0b\x32\x35.org.apache.spark.sql.execution.streaming.state.ClearH\x00R\x05\x63learB\x08\n\x06method""\n\x0eSetImplicitKey\x12\x10\n\x03key\x18\x01 \x01(\x0cR\x03key"\x13\n\x11RemoveImplicitKey"\x08\n\x06\x45xists"\x05\n\x03Get"=\n\rRegisterTimer\x12,\n\x11\x65xpiryTimestampMs\x18\x01 \x01(\x03R\x11\x65xpiryTimestampMs";\n\x0b\x44\x65leteTimer\x12,\n\x11\x65xpiryTimestampMs\x18\x01 \x01(\x03R\x11\x65xpiryTimestampMs",\n\nListTimers\x12\x1e\n\niteratorId\x18\x01 \x01(\tR\niteratorId"(\n\x10ValueStateUpdate\x12\x14\n\x05value\x18\x01 \x01(\x0cR\x05value"\x07\n\x05\x43lear".\n\x0cListStateGet\x12\x1e\n\niteratorId\x18\x01 \x01(\tR\niteratorId"\x0e\n\x0cListStatePut"#\n\x0b\x41ppendValue\x12\x14\n\x05value\x18\x01 \x01(\x0cR\x05value"\x0c\n\nAppendList"$\n\x08GetValue\x12\x18\n\x07userKey\x18\x01 \x01(\x0cR\x07userKey"\'\n\x0b\x43ontainsKey\x12\x18\n\x07userKey\x18\x01 \x01(\x0cR\x07userKey"=\n\x0bUpdateValue\x12\x18\n\x07userKey\x18\x01 \x01(\x0cR\x07userKey\x12\x14\n\x05value\x18\x02 \x01(\x0cR\x05value"*\n\x08Iterator\x12\x1e\n\niteratorId\x18\x01 \x01(\tR\niteratorId"&\n\x04Keys\x12\x1e\n\niteratorId\x18\x01 \x01(\tR\niteratorId"(\n\x06Values\x12\x1e\n\niteratorId\x18\x01 \x01(\tR\niteratorId"%\n\tRemoveKey\x12\x18\n\x07userKey\x18\x01 
\x01(\x0cR\x07userKey"c\n\x0eSetHandleState\x12Q\n\x05state\x18\x01 \x01(\x0e\x32;.org.apache.spark.sql.execution.streaming.state.HandleStateR\x05state"+\n\tTTLConfig\x12\x1e\n\ndurationMs\x18\x01 \x01(\x05R\ndurationMs*`\n\x0bHandleState\x12\x0b\n\x07\x43REATED\x10\x00\x12\x0f\n\x0bINITIALIZED\x10\x01\x12\x12\n\x0e\x44\x41TA_PROCESSED\x10\x02\x12\x13\n\x0fTIMER_PROCESSED\x10\x03\x12\n\n\x06\x43LOSED\x10\x04\x62\x06proto3' + b'\n;org/apache/spark/sql/execution/streaming/StateMessage.proto\x12.org.apache.spark.sql.execution.streaming.state"\x84\x05\n\x0cStateRequest\x12\x18\n\x07version\x18\x01 \x01(\x05R\x07version\x12}\n\x15statefulProcessorCall\x18\x02 \x01(\x0b\x32\x45.org.apache.spark.sql.execution.streaming.state.StatefulProcessorCallH\x00R\x15statefulProcessorCall\x12z\n\x14stateVariableRequest\x18\x03 \x01(\x0b\x32\x44.org.apache.spark.sql.execution.streaming.state.StateVariableRequestH\x00R\x14stateVariableRequest\x12\x8c\x01\n\x1aimplicitGroupingKeyRequest\x18\x04 \x01(\x0b\x32J.org.apache.spark.sql.execution.streaming.state.ImplicitGroupingKeyRequestH\x00R\x1aimplicitGroupingKeyRequest\x12\x62\n\x0ctimerRequest\x18\x05 \x01(\x0b\x32<.org.apache.spark.sql.execution.streaming.state.TimerRequestH\x00R\x0ctimerRequest\x12\x62\n\x0cutilsRequest\x18\x06 \x01(\x0b\x32<.org.apache.spark.sql.execution.streaming.state.UtilsRequestH\x00R\x0cutilsRequestB\x08\n\x06method"i\n\rStateResponse\x12\x1e\n\nstatusCode\x18\x01 \x01(\x05R\nstatusCode\x12"\n\x0c\x65rrorMessage\x18\x02 \x01(\tR\x0c\x65rrorMessage\x12\x14\n\x05value\x18\x03 \x01(\x0cR\x05value"x\n\x1cStateResponseWithLongTypeVal\x12\x1e\n\nstatusCode\x18\x01 \x01(\x05R\nstatusCode\x12"\n\x0c\x65rrorMessage\x18\x02 \x01(\tR\x0c\x65rrorMessage\x12\x14\n\x05value\x18\x03 \x01(\x03R\x05value"z\n\x1eStateResponseWithStringTypeVal\x12\x1e\n\nstatusCode\x18\x01 \x01(\x05R\nstatusCode\x12"\n\x0c\x65rrorMessage\x18\x02 \x01(\tR\x0c\x65rrorMessage\x12\x14\n\x05value\x18\x03 
\x01(\tR\x05value"\xa0\x05\n\x15StatefulProcessorCall\x12h\n\x0esetHandleState\x18\x01 \x01(\x0b\x32>.org.apache.spark.sql.execution.streaming.state.SetHandleStateH\x00R\x0esetHandleState\x12h\n\rgetValueState\x18\x02 \x01(\x0b\x32@.org.apache.spark.sql.execution.streaming.state.StateCallCommandH\x00R\rgetValueState\x12\x66\n\x0cgetListState\x18\x03 \x01(\x0b\x32@.org.apache.spark.sql.execution.streaming.state.StateCallCommandH\x00R\x0cgetListState\x12\x64\n\x0bgetMapState\x18\x04 \x01(\x0b\x32@.org.apache.spark.sql.execution.streaming.state.StateCallCommandH\x00R\x0bgetMapState\x12o\n\x0etimerStateCall\x18\x05 \x01(\x0b\x32\x45.org.apache.spark.sql.execution.streaming.state.TimerStateCallCommandH\x00R\x0etimerStateCall\x12j\n\x0e\x64\x65leteIfExists\x18\x06 \x01(\x0b\x32@.org.apache.spark.sql.execution.streaming.state.StateCallCommandH\x00R\x0e\x64\x65leteIfExistsB\x08\n\x06method"\xd5\x02\n\x14StateVariableRequest\x12h\n\x0evalueStateCall\x18\x01 \x01(\x0b\x32>.org.apache.spark.sql.execution.streaming.state.ValueStateCallH\x00R\x0evalueStateCall\x12\x65\n\rlistStateCall\x18\x02 \x01(\x0b\x32=.org.apache.spark.sql.execution.streaming.state.ListStateCallH\x00R\rlistStateCall\x12\x62\n\x0cmapStateCall\x18\x03 \x01(\x0b\x32<.org.apache.spark.sql.execution.streaming.state.MapStateCallH\x00R\x0cmapStateCallB\x08\n\x06method"\x83\x02\n\x1aImplicitGroupingKeyRequest\x12h\n\x0esetImplicitKey\x18\x01 \x01(\x0b\x32>.org.apache.spark.sql.execution.streaming.state.SetImplicitKeyH\x00R\x0esetImplicitKey\x12q\n\x11removeImplicitKey\x18\x02 \x01(\x0b\x32\x41.org.apache.spark.sql.execution.streaming.state.RemoveImplicitKeyH\x00R\x11removeImplicitKeyB\x08\n\x06method"\x81\x02\n\x0cTimerRequest\x12q\n\x11timerValueRequest\x18\x01 \x01(\x0b\x32\x41.org.apache.spark.sql.execution.streaming.state.TimerValueRequestH\x00R\x11timerValueRequest\x12t\n\x12\x65xpiryTimerRequest\x18\x02 
\x01(\x0b\x32\x42.org.apache.spark.sql.execution.streaming.state.ExpiryTimerRequestH\x00R\x12\x65xpiryTimerRequestB\x08\n\x06method"\xf6\x01\n\x11TimerValueRequest\x12s\n\x12getProcessingTimer\x18\x01 \x01(\x0b\x32\x41.org.apache.spark.sql.execution.streaming.state.GetProcessingTimeH\x00R\x12getProcessingTimer\x12\x62\n\x0cgetWatermark\x18\x02 \x01(\x0b\x32<.org.apache.spark.sql.execution.streaming.state.GetWatermarkH\x00R\x0cgetWatermarkB\x08\n\x06method"B\n\x12\x45xpiryTimerRequest\x12,\n\x11\x65xpiryTimestampMs\x18\x01 \x01(\x03R\x11\x65xpiryTimestampMs"\x13\n\x11GetProcessingTime"\x0e\n\x0cGetWatermark"\x8b\x01\n\x0cUtilsRequest\x12q\n\x11parseStringSchema\x18\x01 \x01(\x0b\x32\x41.org.apache.spark.sql.execution.streaming.state.ParseStringSchemaH\x00R\x11parseStringSchemaB\x08\n\x06method"+\n\x11ParseStringSchema\x12\x16\n\x06schema\x18\x01 \x01(\tR\x06schema"\xc7\x01\n\x10StateCallCommand\x12\x1c\n\tstateName\x18\x01 \x01(\tR\tstateName\x12\x16\n\x06schema\x18\x02 \x01(\tR\x06schema\x12\x30\n\x13mapStateValueSchema\x18\x03 \x01(\tR\x13mapStateValueSchema\x12K\n\x03ttl\x18\x04 \x01(\x0b\x32\x39.org.apache.spark.sql.execution.streaming.state.TTLConfigR\x03ttl"\xa7\x02\n\x15TimerStateCallCommand\x12[\n\x08register\x18\x01 \x01(\x0b\x32=.org.apache.spark.sql.execution.streaming.state.RegisterTimerH\x00R\x08register\x12U\n\x06\x64\x65lete\x18\x02 \x01(\x0b\x32;.org.apache.spark.sql.execution.streaming.state.DeleteTimerH\x00R\x06\x64\x65lete\x12P\n\x04list\x18\x03 \x01(\x0b\x32:.org.apache.spark.sql.execution.streaming.state.ListTimersH\x00R\x04listB\x08\n\x06method"\x92\x03\n\x0eValueStateCall\x12\x1c\n\tstateName\x18\x01 \x01(\tR\tstateName\x12P\n\x06\x65xists\x18\x02 \x01(\x0b\x32\x36.org.apache.spark.sql.execution.streaming.state.ExistsH\x00R\x06\x65xists\x12G\n\x03get\x18\x03 \x01(\x0b\x32\x33.org.apache.spark.sql.execution.streaming.state.GetH\x00R\x03get\x12n\n\x10valueStateUpdate\x18\x04 
\x01(\x0b\x32@.org.apache.spark.sql.execution.streaming.state.ValueStateUpdateH\x00R\x10valueStateUpdate\x12M\n\x05\x63lear\x18\x05 \x01(\x0b\x32\x35.org.apache.spark.sql.execution.streaming.state.ClearH\x00R\x05\x63learB\x08\n\x06method"\xdf\x04\n\rListStateCall\x12\x1c\n\tstateName\x18\x01 \x01(\tR\tstateName\x12P\n\x06\x65xists\x18\x02 \x01(\x0b\x32\x36.org.apache.spark.sql.execution.streaming.state.ExistsH\x00R\x06\x65xists\x12\x62\n\x0clistStateGet\x18\x03 \x01(\x0b\x32<.org.apache.spark.sql.execution.streaming.state.ListStateGetH\x00R\x0clistStateGet\x12\x62\n\x0clistStatePut\x18\x04 \x01(\x0b\x32<.org.apache.spark.sql.execution.streaming.state.ListStatePutH\x00R\x0clistStatePut\x12_\n\x0b\x61ppendValue\x18\x05 \x01(\x0b\x32;.org.apache.spark.sql.execution.streaming.state.AppendValueH\x00R\x0b\x61ppendValue\x12\\\n\nappendList\x18\x06 \x01(\x0b\x32:.org.apache.spark.sql.execution.streaming.state.AppendListH\x00R\nappendList\x12M\n\x05\x63lear\x18\x07 \x01(\x0b\x32\x35.org.apache.spark.sql.execution.streaming.state.ClearH\x00R\x05\x63learB\x08\n\x06method"\xc2\x06\n\x0cMapStateCall\x12\x1c\n\tstateName\x18\x01 \x01(\tR\tstateName\x12P\n\x06\x65xists\x18\x02 \x01(\x0b\x32\x36.org.apache.spark.sql.execution.streaming.state.ExistsH\x00R\x06\x65xists\x12V\n\x08getValue\x18\x03 \x01(\x0b\x32\x38.org.apache.spark.sql.execution.streaming.state.GetValueH\x00R\x08getValue\x12_\n\x0b\x63ontainsKey\x18\x04 \x01(\x0b\x32;.org.apache.spark.sql.execution.streaming.state.ContainsKeyH\x00R\x0b\x63ontainsKey\x12_\n\x0bupdateValue\x18\x05 \x01(\x0b\x32;.org.apache.spark.sql.execution.streaming.state.UpdateValueH\x00R\x0bupdateValue\x12V\n\x08iterator\x18\x06 \x01(\x0b\x32\x38.org.apache.spark.sql.execution.streaming.state.IteratorH\x00R\x08iterator\x12J\n\x04keys\x18\x07 \x01(\x0b\x32\x34.org.apache.spark.sql.execution.streaming.state.KeysH\x00R\x04keys\x12P\n\x06values\x18\x08 
\x01(\x0b\x32\x36.org.apache.spark.sql.execution.streaming.state.ValuesH\x00R\x06values\x12Y\n\tremoveKey\x18\t \x01(\x0b\x32\x39.org.apache.spark.sql.execution.streaming.state.RemoveKeyH\x00R\tremoveKey\x12M\n\x05\x63lear\x18\n \x01(\x0b\x32\x35.org.apache.spark.sql.execution.streaming.state.ClearH\x00R\x05\x63learB\x08\n\x06method""\n\x0eSetImplicitKey\x12\x10\n\x03key\x18\x01 \x01(\x0cR\x03key"\x13\n\x11RemoveImplicitKey"\x08\n\x06\x45xists"\x05\n\x03Get"=\n\rRegisterTimer\x12,\n\x11\x65xpiryTimestampMs\x18\x01 \x01(\x03R\x11\x65xpiryTimestampMs";\n\x0b\x44\x65leteTimer\x12,\n\x11\x65xpiryTimestampMs\x18\x01 \x01(\x03R\x11\x65xpiryTimestampMs",\n\nListTimers\x12\x1e\n\niteratorId\x18\x01 \x01(\tR\niteratorId"(\n\x10ValueStateUpdate\x12\x14\n\x05value\x18\x01 \x01(\x0cR\x05value"\x07\n\x05\x43lear".\n\x0cListStateGet\x12\x1e\n\niteratorId\x18\x01 \x01(\tR\niteratorId"\x0e\n\x0cListStatePut"#\n\x0b\x41ppendValue\x12\x14\n\x05value\x18\x01 \x01(\x0cR\x05value"\x0c\n\nAppendList"$\n\x08GetValue\x12\x18\n\x07userKey\x18\x01 \x01(\x0cR\x07userKey"\'\n\x0b\x43ontainsKey\x12\x18\n\x07userKey\x18\x01 \x01(\x0cR\x07userKey"=\n\x0bUpdateValue\x12\x18\n\x07userKey\x18\x01 \x01(\x0cR\x07userKey\x12\x14\n\x05value\x18\x02 \x01(\x0cR\x05value"*\n\x08Iterator\x12\x1e\n\niteratorId\x18\x01 \x01(\tR\niteratorId"&\n\x04Keys\x12\x1e\n\niteratorId\x18\x01 \x01(\tR\niteratorId"(\n\x06Values\x12\x1e\n\niteratorId\x18\x01 \x01(\tR\niteratorId"%\n\tRemoveKey\x12\x18\n\x07userKey\x18\x01 \x01(\x0cR\x07userKey"c\n\x0eSetHandleState\x12Q\n\x05state\x18\x01 \x01(\x0e\x32;.org.apache.spark.sql.execution.streaming.state.HandleStateR\x05state"+\n\tTTLConfig\x12\x1e\n\ndurationMs\x18\x01 \x01(\x05R\ndurationMs*n\n\x0bHandleState\x12\x0c\n\x08PRE_INIT\x10\x00\x12\x0b\n\x07\x43REATED\x10\x01\x12\x0f\n\x0bINITIALIZED\x10\x02\x12\x12\n\x0e\x44\x41TA_PROCESSED\x10\x03\x12\x13\n\x0fTIMER_PROCESSED\x10\x04\x12\n\n\x06\x43LOSED\x10\x05\x62\x06proto3' ) _globals = globals() @@ -50,82 +50,88 @@ ) if not 
_descriptor._USE_C_DESCRIPTORS: DESCRIPTOR._loaded_options = None - _globals["_HANDLESTATE"]._serialized_start = 5997 - _globals["_HANDLESTATE"]._serialized_end = 6093 + _globals["_HANDLESTATE"]._serialized_start = 6408 + _globals["_HANDLESTATE"]._serialized_end = 6518 _globals["_STATEREQUEST"]._serialized_start = 112 - _globals["_STATEREQUEST"]._serialized_end = 656 - _globals["_STATERESPONSE"]._serialized_start = 658 - _globals["_STATERESPONSE"]._serialized_end = 763 - _globals["_STATERESPONSEWITHLONGTYPEVAL"]._serialized_start = 765 - _globals["_STATERESPONSEWITHLONGTYPEVAL"]._serialized_end = 885 - _globals["_STATEFULPROCESSORCALL"]._serialized_start = 888 - _globals["_STATEFULPROCESSORCALL"]._serialized_end = 1560 - _globals["_STATEVARIABLEREQUEST"]._serialized_start = 1563 - _globals["_STATEVARIABLEREQUEST"]._serialized_end = 1904 - _globals["_IMPLICITGROUPINGKEYREQUEST"]._serialized_start = 1907 - _globals["_IMPLICITGROUPINGKEYREQUEST"]._serialized_end = 2166 - _globals["_TIMERREQUEST"]._serialized_start = 2169 - _globals["_TIMERREQUEST"]._serialized_end = 2426 - _globals["_TIMERVALUEREQUEST"]._serialized_start = 2429 - _globals["_TIMERVALUEREQUEST"]._serialized_end = 2675 - _globals["_EXPIRYTIMERREQUEST"]._serialized_start = 2677 - _globals["_EXPIRYTIMERREQUEST"]._serialized_end = 2743 - _globals["_GETPROCESSINGTIME"]._serialized_start = 2745 - _globals["_GETPROCESSINGTIME"]._serialized_end = 2764 - _globals["_GETWATERMARK"]._serialized_start = 2766 - _globals["_GETWATERMARK"]._serialized_end = 2780 - _globals["_STATECALLCOMMAND"]._serialized_start = 2783 - _globals["_STATECALLCOMMAND"]._serialized_end = 2982 - _globals["_TIMERSTATECALLCOMMAND"]._serialized_start = 2985 - _globals["_TIMERSTATECALLCOMMAND"]._serialized_end = 3280 - _globals["_VALUESTATECALL"]._serialized_start = 3283 - _globals["_VALUESTATECALL"]._serialized_end = 3685 - _globals["_LISTSTATECALL"]._serialized_start = 3688 - _globals["_LISTSTATECALL"]._serialized_end = 4295 - 
_globals["_MAPSTATECALL"]._serialized_start = 4298 - _globals["_MAPSTATECALL"]._serialized_end = 5132 - _globals["_SETIMPLICITKEY"]._serialized_start = 5134 - _globals["_SETIMPLICITKEY"]._serialized_end = 5168 - _globals["_REMOVEIMPLICITKEY"]._serialized_start = 5170 - _globals["_REMOVEIMPLICITKEY"]._serialized_end = 5189 - _globals["_EXISTS"]._serialized_start = 5191 - _globals["_EXISTS"]._serialized_end = 5199 - _globals["_GET"]._serialized_start = 5201 - _globals["_GET"]._serialized_end = 5206 - _globals["_REGISTERTIMER"]._serialized_start = 5208 - _globals["_REGISTERTIMER"]._serialized_end = 5269 - _globals["_DELETETIMER"]._serialized_start = 5271 - _globals["_DELETETIMER"]._serialized_end = 5330 - _globals["_LISTTIMERS"]._serialized_start = 5332 - _globals["_LISTTIMERS"]._serialized_end = 5376 - _globals["_VALUESTATEUPDATE"]._serialized_start = 5378 - _globals["_VALUESTATEUPDATE"]._serialized_end = 5418 - _globals["_CLEAR"]._serialized_start = 5420 - _globals["_CLEAR"]._serialized_end = 5427 - _globals["_LISTSTATEGET"]._serialized_start = 5429 - _globals["_LISTSTATEGET"]._serialized_end = 5475 - _globals["_LISTSTATEPUT"]._serialized_start = 5477 - _globals["_LISTSTATEPUT"]._serialized_end = 5491 - _globals["_APPENDVALUE"]._serialized_start = 5493 - _globals["_APPENDVALUE"]._serialized_end = 5528 - _globals["_APPENDLIST"]._serialized_start = 5530 - _globals["_APPENDLIST"]._serialized_end = 5542 - _globals["_GETVALUE"]._serialized_start = 5544 - _globals["_GETVALUE"]._serialized_end = 5580 - _globals["_CONTAINSKEY"]._serialized_start = 5582 - _globals["_CONTAINSKEY"]._serialized_end = 5621 - _globals["_UPDATEVALUE"]._serialized_start = 5623 - _globals["_UPDATEVALUE"]._serialized_end = 5684 - _globals["_ITERATOR"]._serialized_start = 5686 - _globals["_ITERATOR"]._serialized_end = 5728 - _globals["_KEYS"]._serialized_start = 5730 - _globals["_KEYS"]._serialized_end = 5768 - _globals["_VALUES"]._serialized_start = 5770 - _globals["_VALUES"]._serialized_end = 5810 - 
_globals["_REMOVEKEY"]._serialized_start = 5812 - _globals["_REMOVEKEY"]._serialized_end = 5849 - _globals["_SETHANDLESTATE"]._serialized_start = 5851 - _globals["_SETHANDLESTATE"]._serialized_end = 5950 - _globals["_TTLCONFIG"]._serialized_start = 5952 - _globals["_TTLCONFIG"]._serialized_end = 5995 + _globals["_STATEREQUEST"]._serialized_end = 756 + _globals["_STATERESPONSE"]._serialized_start = 758 + _globals["_STATERESPONSE"]._serialized_end = 863 + _globals["_STATERESPONSEWITHLONGTYPEVAL"]._serialized_start = 865 + _globals["_STATERESPONSEWITHLONGTYPEVAL"]._serialized_end = 985 + _globals["_STATERESPONSEWITHSTRINGTYPEVAL"]._serialized_start = 987 + _globals["_STATERESPONSEWITHSTRINGTYPEVAL"]._serialized_end = 1109 + _globals["_STATEFULPROCESSORCALL"]._serialized_start = 1112 + _globals["_STATEFULPROCESSORCALL"]._serialized_end = 1784 + _globals["_STATEVARIABLEREQUEST"]._serialized_start = 1787 + _globals["_STATEVARIABLEREQUEST"]._serialized_end = 2128 + _globals["_IMPLICITGROUPINGKEYREQUEST"]._serialized_start = 2131 + _globals["_IMPLICITGROUPINGKEYREQUEST"]._serialized_end = 2390 + _globals["_TIMERREQUEST"]._serialized_start = 2393 + _globals["_TIMERREQUEST"]._serialized_end = 2650 + _globals["_TIMERVALUEREQUEST"]._serialized_start = 2653 + _globals["_TIMERVALUEREQUEST"]._serialized_end = 2899 + _globals["_EXPIRYTIMERREQUEST"]._serialized_start = 2901 + _globals["_EXPIRYTIMERREQUEST"]._serialized_end = 2967 + _globals["_GETPROCESSINGTIME"]._serialized_start = 2969 + _globals["_GETPROCESSINGTIME"]._serialized_end = 2988 + _globals["_GETWATERMARK"]._serialized_start = 2990 + _globals["_GETWATERMARK"]._serialized_end = 3004 + _globals["_UTILSREQUEST"]._serialized_start = 3007 + _globals["_UTILSREQUEST"]._serialized_end = 3146 + _globals["_PARSESTRINGSCHEMA"]._serialized_start = 3148 + _globals["_PARSESTRINGSCHEMA"]._serialized_end = 3191 + _globals["_STATECALLCOMMAND"]._serialized_start = 3194 + _globals["_STATECALLCOMMAND"]._serialized_end = 3393 + 
_globals["_TIMERSTATECALLCOMMAND"]._serialized_start = 3396 + _globals["_TIMERSTATECALLCOMMAND"]._serialized_end = 3691 + _globals["_VALUESTATECALL"]._serialized_start = 3694 + _globals["_VALUESTATECALL"]._serialized_end = 4096 + _globals["_LISTSTATECALL"]._serialized_start = 4099 + _globals["_LISTSTATECALL"]._serialized_end = 4706 + _globals["_MAPSTATECALL"]._serialized_start = 4709 + _globals["_MAPSTATECALL"]._serialized_end = 5543 + _globals["_SETIMPLICITKEY"]._serialized_start = 5545 + _globals["_SETIMPLICITKEY"]._serialized_end = 5579 + _globals["_REMOVEIMPLICITKEY"]._serialized_start = 5581 + _globals["_REMOVEIMPLICITKEY"]._serialized_end = 5600 + _globals["_EXISTS"]._serialized_start = 5602 + _globals["_EXISTS"]._serialized_end = 5610 + _globals["_GET"]._serialized_start = 5612 + _globals["_GET"]._serialized_end = 5617 + _globals["_REGISTERTIMER"]._serialized_start = 5619 + _globals["_REGISTERTIMER"]._serialized_end = 5680 + _globals["_DELETETIMER"]._serialized_start = 5682 + _globals["_DELETETIMER"]._serialized_end = 5741 + _globals["_LISTTIMERS"]._serialized_start = 5743 + _globals["_LISTTIMERS"]._serialized_end = 5787 + _globals["_VALUESTATEUPDATE"]._serialized_start = 5789 + _globals["_VALUESTATEUPDATE"]._serialized_end = 5829 + _globals["_CLEAR"]._serialized_start = 5831 + _globals["_CLEAR"]._serialized_end = 5838 + _globals["_LISTSTATEGET"]._serialized_start = 5840 + _globals["_LISTSTATEGET"]._serialized_end = 5886 + _globals["_LISTSTATEPUT"]._serialized_start = 5888 + _globals["_LISTSTATEPUT"]._serialized_end = 5902 + _globals["_APPENDVALUE"]._serialized_start = 5904 + _globals["_APPENDVALUE"]._serialized_end = 5939 + _globals["_APPENDLIST"]._serialized_start = 5941 + _globals["_APPENDLIST"]._serialized_end = 5953 + _globals["_GETVALUE"]._serialized_start = 5955 + _globals["_GETVALUE"]._serialized_end = 5991 + _globals["_CONTAINSKEY"]._serialized_start = 5993 + _globals["_CONTAINSKEY"]._serialized_end = 6032 + 
_globals["_UPDATEVALUE"]._serialized_start = 6034 + _globals["_UPDATEVALUE"]._serialized_end = 6095 + _globals["_ITERATOR"]._serialized_start = 6097 + _globals["_ITERATOR"]._serialized_end = 6139 + _globals["_KEYS"]._serialized_start = 6141 + _globals["_KEYS"]._serialized_end = 6179 + _globals["_VALUES"]._serialized_start = 6181 + _globals["_VALUES"]._serialized_end = 6221 + _globals["_REMOVEKEY"]._serialized_start = 6223 + _globals["_REMOVEKEY"]._serialized_end = 6260 + _globals["_SETHANDLESTATE"]._serialized_start = 6262 + _globals["_SETHANDLESTATE"]._serialized_end = 6361 + _globals["_TTLCONFIG"]._serialized_start = 6363 + _globals["_TTLCONFIG"]._serialized_end = 6406 # @@protoc_insertion_point(module_scope) diff --git a/python/pyspark/sql/streaming/proto/StateMessage_pb2.pyi b/python/pyspark/sql/streaming/proto/StateMessage_pb2.pyi index 52f66928294cb..ac4b03b820349 100644 --- a/python/pyspark/sql/streaming/proto/StateMessage_pb2.pyi +++ b/python/pyspark/sql/streaming/proto/StateMessage_pb2.pyi @@ -56,19 +56,21 @@ class _HandleStateEnumTypeWrapper( builtins.type, ): # noqa: F821 DESCRIPTOR: google.protobuf.descriptor.EnumDescriptor - CREATED: _HandleState.ValueType # 0 - INITIALIZED: _HandleState.ValueType # 1 - DATA_PROCESSED: _HandleState.ValueType # 2 - TIMER_PROCESSED: _HandleState.ValueType # 3 - CLOSED: _HandleState.ValueType # 4 + PRE_INIT: _HandleState.ValueType # 0 + CREATED: _HandleState.ValueType # 1 + INITIALIZED: _HandleState.ValueType # 2 + DATA_PROCESSED: _HandleState.ValueType # 3 + TIMER_PROCESSED: _HandleState.ValueType # 4 + CLOSED: _HandleState.ValueType # 5 class HandleState(_HandleState, metaclass=_HandleStateEnumTypeWrapper): ... 
-CREATED: HandleState.ValueType # 0 -INITIALIZED: HandleState.ValueType # 1 -DATA_PROCESSED: HandleState.ValueType # 2 -TIMER_PROCESSED: HandleState.ValueType # 3 -CLOSED: HandleState.ValueType # 4 +PRE_INIT: HandleState.ValueType # 0 +CREATED: HandleState.ValueType # 1 +INITIALIZED: HandleState.ValueType # 2 +DATA_PROCESSED: HandleState.ValueType # 3 +TIMER_PROCESSED: HandleState.ValueType # 4 +CLOSED: HandleState.ValueType # 5 global___HandleState = HandleState class StateRequest(google.protobuf.message.Message): @@ -79,6 +81,7 @@ class StateRequest(google.protobuf.message.Message): STATEVARIABLEREQUEST_FIELD_NUMBER: builtins.int IMPLICITGROUPINGKEYREQUEST_FIELD_NUMBER: builtins.int TIMERREQUEST_FIELD_NUMBER: builtins.int + UTILSREQUEST_FIELD_NUMBER: builtins.int version: builtins.int @property def statefulProcessorCall(self) -> global___StatefulProcessorCall: ... @@ -88,6 +91,8 @@ class StateRequest(google.protobuf.message.Message): def implicitGroupingKeyRequest(self) -> global___ImplicitGroupingKeyRequest: ... @property def timerRequest(self) -> global___TimerRequest: ... + @property + def utilsRequest(self) -> global___UtilsRequest: ... def __init__( self, *, @@ -96,6 +101,7 @@ class StateRequest(google.protobuf.message.Message): stateVariableRequest: global___StateVariableRequest | None = ..., implicitGroupingKeyRequest: global___ImplicitGroupingKeyRequest | None = ..., timerRequest: global___TimerRequest | None = ..., + utilsRequest: global___UtilsRequest | None = ..., ) -> None: ... def HasField( self, @@ -110,6 +116,8 @@ class StateRequest(google.protobuf.message.Message): b"statefulProcessorCall", "timerRequest", b"timerRequest", + "utilsRequest", + b"utilsRequest", ], ) -> builtins.bool: ... 
def ClearField( @@ -125,6 +133,8 @@ class StateRequest(google.protobuf.message.Message): b"statefulProcessorCall", "timerRequest", b"timerRequest", + "utilsRequest", + b"utilsRequest", "version", b"version", ], @@ -137,6 +147,7 @@ class StateRequest(google.protobuf.message.Message): "stateVariableRequest", "implicitGroupingKeyRequest", "timerRequest", + "utilsRequest", ] | None ): ... @@ -193,6 +204,31 @@ class StateResponseWithLongTypeVal(google.protobuf.message.Message): global___StateResponseWithLongTypeVal = StateResponseWithLongTypeVal +class StateResponseWithStringTypeVal(google.protobuf.message.Message): + DESCRIPTOR: google.protobuf.descriptor.Descriptor + + STATUSCODE_FIELD_NUMBER: builtins.int + ERRORMESSAGE_FIELD_NUMBER: builtins.int + VALUE_FIELD_NUMBER: builtins.int + statusCode: builtins.int + errorMessage: builtins.str + value: builtins.str + def __init__( + self, + *, + statusCode: builtins.int = ..., + errorMessage: builtins.str = ..., + value: builtins.str = ..., + ) -> None: ... + def ClearField( + self, + field_name: typing_extensions.Literal[ + "errorMessage", b"errorMessage", "statusCode", b"statusCode", "value", b"value" + ], + ) -> None: ... + +global___StateResponseWithStringTypeVal = StateResponseWithStringTypeVal + class StatefulProcessorCall(google.protobuf.message.Message): DESCRIPTOR: google.protobuf.descriptor.Descriptor @@ -492,6 +528,49 @@ class GetWatermark(google.protobuf.message.Message): global___GetWatermark = GetWatermark +class UtilsRequest(google.protobuf.message.Message): + DESCRIPTOR: google.protobuf.descriptor.Descriptor + + PARSESTRINGSCHEMA_FIELD_NUMBER: builtins.int + @property + def parseStringSchema(self) -> global___ParseStringSchema: ... + def __init__( + self, + *, + parseStringSchema: global___ParseStringSchema | None = ..., + ) -> None: ... + def HasField( + self, + field_name: typing_extensions.Literal[ + "method", b"method", "parseStringSchema", b"parseStringSchema" + ], + ) -> builtins.bool: ... 
+ def ClearField( + self, + field_name: typing_extensions.Literal[ + "method", b"method", "parseStringSchema", b"parseStringSchema" + ], + ) -> None: ... + def WhichOneof( + self, oneof_group: typing_extensions.Literal["method", b"method"] + ) -> typing_extensions.Literal["parseStringSchema"] | None: ... + +global___UtilsRequest = UtilsRequest + +class ParseStringSchema(google.protobuf.message.Message): + DESCRIPTOR: google.protobuf.descriptor.Descriptor + + SCHEMA_FIELD_NUMBER: builtins.int + schema: builtins.str + def __init__( + self, + *, + schema: builtins.str = ..., + ) -> None: ... + def ClearField(self, field_name: typing_extensions.Literal["schema", b"schema"]) -> None: ... + +global___ParseStringSchema = ParseStringSchema + class StateCallCommand(google.protobuf.message.Message): DESCRIPTOR: google.protobuf.descriptor.Descriptor diff --git a/python/pyspark/sql/streaming/readwriter.py b/python/pyspark/sql/streaming/readwriter.py index 6aa01d2f83a42..69282dce37afe 100644 --- a/python/pyspark/sql/streaming/readwriter.py +++ b/python/pyspark/sql/streaming/readwriter.py @@ -1317,9 +1317,9 @@ def trigger( }, ) interval = processingTime.strip() - jTrigger = self._spark._sc._jvm.org.apache.spark.sql.streaming.Trigger.ProcessingTime( - interval - ) + jTrigger = getattr( + self._spark._sc._jvm, "org.apache.spark.sql.streaming.Trigger" + ).ProcessingTime(interval) elif once is not None: if once is not True: @@ -1328,7 +1328,9 @@ def trigger( messageParameters={"arg_name": "once", "arg_value": str(once)}, ) - jTrigger = self._spark._sc._jvm.org.apache.spark.sql.streaming.Trigger.Once() + jTrigger = getattr( + self._spark._sc._jvm, "org.apache.spark.sql.streaming.Trigger" + ).Once() elif continuous is not None: if type(continuous) != str or len(continuous.strip()) == 0: @@ -1337,16 +1339,18 @@ def trigger( messageParameters={"arg_name": "continuous", "arg_value": str(continuous)}, ) interval = continuous.strip() - jTrigger = 
self._spark._sc._jvm.org.apache.spark.sql.streaming.Trigger.Continuous( - interval - ) + jTrigger = getattr( + self._spark._sc._jvm, "org.apache.spark.sql.streaming.Trigger" + ).Continuous(interval) else: if availableNow is not True: raise PySparkValueError( errorClass="VALUE_NOT_TRUE", messageParameters={"arg_name": "availableNow", "arg_value": str(availableNow)}, ) - jTrigger = self._spark._sc._jvm.org.apache.spark.sql.streaming.Trigger.AvailableNow() + jTrigger = getattr( + self._spark._sc._jvm, "org.apache.spark.sql.streaming.Trigger" + ).AvailableNow() self._jwrite = self._jwrite.trigger(jTrigger) return self @@ -1557,11 +1561,9 @@ def foreach(self, f: Union[Callable[[Row], None], "SupportsProcess"]) -> "DataSt serializer = AutoBatchedSerializer(CPickleSerializer()) wrapped_func = _wrap_function(self._spark._sc, func, serializer, serializer) assert self._spark._sc._jvm is not None - jForeachWriter = ( - self._spark._sc._jvm.org.apache.spark.sql.execution.python.PythonForeachWriter( - wrapped_func, self._df._jdf.schema() - ) - ) + jForeachWriter = getattr( + self._spark._sc._jvm, "org.apache.spark.sql.execution.python.PythonForeachWriter" + )(wrapped_func, self._df._jdf.schema()) self._jwrite.foreach(jForeachWriter) return self diff --git a/python/pyspark/sql/streaming/state.py b/python/pyspark/sql/streaming/state.py index 0ea5590ef2e65..cd067a8413e1c 100644 --- a/python/pyspark/sql/streaming/state.py +++ b/python/pyspark/sql/streaming/state.py @@ -19,7 +19,6 @@ from typing import Tuple, Optional from pyspark.sql.types import Row, StructType, TimestampType -from pyspark.sql.utils import has_numpy from pyspark.errors import PySparkTypeError, PySparkValueError, PySparkRuntimeError __all__ = ["GroupState", "GroupStateTimeout"] @@ -132,6 +131,8 @@ def update(self, newValue: Tuple) -> None: """ Update the value of the state. The value of the state cannot be null. 
""" + from pyspark.testing.utils import have_numpy + if newValue is None: raise PySparkTypeError( errorClass="CANNOT_BE_NONE", @@ -139,7 +140,7 @@ def update(self, newValue: Tuple) -> None: ) converted = [] - if has_numpy: + if have_numpy: import numpy as np # In order to convert NumPy types to Python primitive types. diff --git a/python/pyspark/sql/streaming/stateful_processor.py b/python/pyspark/sql/streaming/stateful_processor.py index 20078c215bace..b04bb955488ab 100644 --- a/python/pyspark/sql/streaming/stateful_processor.py +++ b/python/pyspark/sql/streaming/stateful_processor.py @@ -45,12 +45,9 @@ class ValueState: .. versionadded:: 4.0.0 """ - def __init__( - self, value_state_client: ValueStateClient, state_name: str, schema: Union[StructType, str] - ) -> None: + def __init__(self, value_state_client: ValueStateClient, state_name: str) -> None: self._value_state_client = value_state_client self._state_name = state_name - self.schema = schema def exists(self) -> bool: """ @@ -68,7 +65,7 @@ def update(self, new_value: Tuple) -> None: """ Update the value of the state. """ - self._value_state_client.update(self._state_name, self.schema, new_value) + self._value_state_client.update(self._state_name, new_value) def clear(self) -> None: """ @@ -105,21 +102,13 @@ def get_current_watermark_in_ms(self) -> int: class ExpiredTimerInfo: """ - Class used for arbitrary stateful operations with transformWithState to access expired timer - info. When is_valid is false, the expiry timestamp is invalid. + Class used to provide access to expired timer's expiry time. .. versionadded:: 4.0.0 """ - def __init__(self, is_valid: bool, expiry_time_in_ms: int = -1) -> None: - self._is_valid = is_valid + def __init__(self, expiry_time_in_ms: int = -1) -> None: self._expiry_time_in_ms = expiry_time_in_ms - def is_valid(self) -> bool: - """ - Whether the expiry info is valid. 
- """ - return self._is_valid - def get_expiry_time_in_ms(self) -> int: """ Get the timestamp for expired timer, return timestamp in millisecond. @@ -135,12 +124,9 @@ class ListState: .. versionadded:: 4.0.0 """ - def __init__( - self, list_state_client: ListStateClient, state_name: str, schema: Union[StructType, str] - ) -> None: + def __init__(self, list_state_client: ListStateClient, state_name: str) -> None: self._list_state_client = list_state_client self._state_name = state_name - self.schema = schema def exists(self) -> bool: """ @@ -158,19 +144,19 @@ def put(self, new_state: List[Tuple]) -> None: """ Update the values of the list state. """ - self._list_state_client.put(self._state_name, self.schema, new_state) + self._list_state_client.put(self._state_name, new_state) def append_value(self, new_state: Tuple) -> None: """ Append a new value to the list state. """ - self._list_state_client.append_value(self._state_name, self.schema, new_state) + self._list_state_client.append_value(self._state_name, new_state) def append_list(self, new_state: List[Tuple]) -> None: """ Append a list of new values to the list state. """ - self._list_state_client.append_list(self._state_name, self.schema, new_state) + self._list_state_client.append_list(self._state_name, new_state) def clear(self) -> None: """ @@ -283,7 +269,7 @@ def getValueState( If ttl is not specified the state will never expire. """ self.stateful_processor_api_client.get_value_state(state_name, schema, ttl_duration_ms) - return ValueState(ValueStateClient(self.stateful_processor_api_client), state_name, schema) + return ValueState(ValueStateClient(self.stateful_processor_api_client, schema), state_name) def getListState( self, state_name: str, schema: Union[StructType, str], ttl_duration_ms: Optional[int] = None @@ -307,7 +293,7 @@ def getListState( If ttl is not specified the state will never expire. 
""" self.stateful_processor_api_client.get_list_state(state_name, schema, ttl_duration_ms) - return ListState(ListStateClient(self.stateful_processor_api_client), state_name, schema) + return ListState(ListStateClient(self.stateful_processor_api_client, schema), state_name) def getMapState( self, @@ -398,7 +384,6 @@ def handleInputRows( key: Any, rows: Iterator["PandasDataFrameLike"], timer_values: TimerValues, - expired_timer_info: ExpiredTimerInfo, ) -> Iterator["PandasDataFrameLike"]: """ Function that will allow users to interact with input data rows along with the grouping key. @@ -420,11 +405,29 @@ def handleInputRows( timer_values: TimerValues Timer value for the current batch that process the input rows. Users can get the processing or event time timestamp from TimerValues. - expired_timer_info: ExpiredTimerInfo - Timestamp of expired timers on the grouping key. """ ... + def handleExpiredTimer( + self, key: Any, timer_values: TimerValues, expired_timer_info: ExpiredTimerInfo + ) -> Iterator["PandasDataFrameLike"]: + """ + Optional to implement. Will act return an empty iterator if not defined. + Function that will be invoked when a timer is fired for a given key. Users can choose to + evict state, register new timers and optionally provide output rows. + + Parameters + ---------- + key : Any + grouping key. + timer_values: TimerValues + Timer value for the current batch that process the input rows. + Users can get the processing or event time timestamp from TimerValues. + expired_timer_info: ExpiredTimerInfo + Instance of ExpiredTimerInfo that provides access to expired timer. + """ + return iter([]) + @abstractmethod def close(self) -> None: """ @@ -433,9 +436,21 @@ def close(self) -> None: """ ... - def handleInitialState(self, key: Any, initialState: "PandasDataFrameLike") -> None: + def handleInitialState( + self, key: Any, initialState: "PandasDataFrameLike", timer_values: TimerValues + ) -> None: """ Optional to implement. 
Will act as no-op if not defined or no initial state input. Function that will be invoked only in the first batch for users to process initial states. + + Parameters + ---------- + key : Any + grouping key. + initialState: :class:`pandas.DataFrame` + One dataframe in the initial state associated with the key. + timer_values: TimerValues + Timer value for the current batch that process the input rows. + Users can get the processing or event time timestamp from TimerValues. """ pass diff --git a/python/pyspark/sql/streaming/stateful_processor_api_client.py b/python/pyspark/sql/streaming/stateful_processor_api_client.py index 353f75e267962..6fd56481bc612 100644 --- a/python/pyspark/sql/streaming/stateful_processor_api_client.py +++ b/python/pyspark/sql/streaming/stateful_processor_api_client.py @@ -15,20 +15,19 @@ # limitations under the License. # from enum import Enum +import json import os import socket -from typing import Any, Dict, List, Union, Optional, cast, Tuple, Iterator +from typing import Any, Dict, List, Union, Optional, Tuple, Iterator from pyspark.serializers import write_int, read_int, UTF8Deserializer from pyspark.sql.pandas.serializers import ArrowStreamSerializer from pyspark.sql.types import ( StructType, TYPE_CHECKING, - _parse_datatype_string, Row, ) from pyspark.sql.pandas.types import convert_pandas_using_numpy_type -from pyspark.sql.utils import has_numpy from pyspark.serializers import CPickleSerializer from pyspark.errors import PySparkRuntimeError import uuid @@ -40,6 +39,7 @@ class StatefulProcessorHandleState(Enum): + PRE_INIT = 0 CREATED = 1 INITIALIZED = 2 DATA_PROCESSED = 3 @@ -48,25 +48,36 @@ class StatefulProcessorHandleState(Enum): class StatefulProcessorApiClient: - def __init__(self, state_server_port: int, key_schema: StructType) -> None: + def __init__( + self, state_server_port: int, key_schema: StructType, is_driver: bool = False + ) -> None: self.key_schema = key_schema self._client_socket = socket.socket() 
self._client_socket.connect(("localhost", state_server_port)) self.sockfile = self._client_socket.makefile( "rwb", int(os.environ.get("SPARK_BUFFER_SIZE", 65536)) ) - self.handle_state = StatefulProcessorHandleState.CREATED + if is_driver: + self.handle_state = StatefulProcessorHandleState.PRE_INIT + else: + self.handle_state = StatefulProcessorHandleState.CREATED self.utf8_deserializer = UTF8Deserializer() self.pickleSer = CPickleSerializer() self.serializer = ArrowStreamSerializer() # Dictionaries to store the mapping between iterator id and a tuple of pandas DataFrame # and the index of the last row that was read. self.list_timer_iterator_cursors: Dict[str, Tuple["PandasDataFrameLike", int]] = {} + # statefulProcessorApiClient is initialized per batch per partition, + # so we will have new timestamps for a new batch + self._batch_timestamp = -1 + self._watermark_timestamp = -1 def set_handle_state(self, state: StatefulProcessorHandleState) -> None: import pyspark.sql.streaming.proto.StateMessage_pb2 as stateMessage - if state == StatefulProcessorHandleState.CREATED: + if state == StatefulProcessorHandleState.PRE_INIT: + proto_state = stateMessage.PRE_INIT + elif state == StatefulProcessorHandleState.CREATED: proto_state = stateMessage.CREATED elif state == StatefulProcessorHandleState.INITIALIZED: proto_state = stateMessage.INITIALIZED @@ -125,7 +136,7 @@ def get_value_state( import pyspark.sql.streaming.proto.StateMessage_pb2 as stateMessage if isinstance(schema, str): - schema = cast(StructType, _parse_datatype_string(schema)) + schema = self._parse_string_schema(schema) state_call_command = stateMessage.StateCallCommand() state_call_command.stateName = state_name @@ -148,7 +159,7 @@ def get_list_state( import pyspark.sql.streaming.proto.StateMessage_pb2 as stateMessage if isinstance(schema, str): - schema = cast(StructType, _parse_datatype_string(schema)) + schema = self._parse_string_schema(schema) state_call_command = stateMessage.StateCallCommand() 
state_call_command.stateName = state_name @@ -266,47 +277,15 @@ def get_expiry_timers_iterator( # TODO(SPARK-49233): Classify user facing errors. raise PySparkRuntimeError(f"Error getting expiry timers: " f"{response_message[1]}") - def get_batch_timestamp(self) -> int: - import pyspark.sql.streaming.proto.StateMessage_pb2 as stateMessage - - get_processing_time_call = stateMessage.GetProcessingTime() - timer_value_call = stateMessage.TimerValueRequest( - getProcessingTimer=get_processing_time_call - ) - timer_request = stateMessage.TimerRequest(timerValueRequest=timer_value_call) - message = stateMessage.StateRequest(timerRequest=timer_request) - - self._send_proto_message(message.SerializeToString()) - response_message = self._receive_proto_message_with_long_value() - status = response_message[0] - if status != 0: - # TODO(SPARK-49233): Classify user facing errors. - raise PySparkRuntimeError( - f"Error getting processing timestamp: " f"{response_message[1]}" - ) + def get_timestamps(self, time_mode: str) -> Tuple[int, int]: + if time_mode.lower() == "none": + return -1, -1 else: - timestamp = response_message[2] - return timestamp - - def get_watermark_timestamp(self) -> int: - import pyspark.sql.streaming.proto.StateMessage_pb2 as stateMessage - - get_watermark_call = stateMessage.GetWatermark() - timer_value_call = stateMessage.TimerValueRequest(getWatermark=get_watermark_call) - timer_request = stateMessage.TimerRequest(timerValueRequest=timer_value_call) - message = stateMessage.StateRequest(timerRequest=timer_request) - - self._send_proto_message(message.SerializeToString()) - response_message = self._receive_proto_message_with_long_value() - status = response_message[0] - if status != 0: - # TODO(SPARK-49233): Classify user facing errors. 
- raise PySparkRuntimeError( - f"Error getting eventtime timestamp: " f"{response_message[1]}" - ) - else: - timestamp = response_message[2] - return timestamp + if self._batch_timestamp == -1: + self._batch_timestamp = self._get_batch_timestamp() + if self._watermark_timestamp == -1: + self._watermark_timestamp = self._get_watermark_timestamp() + return self._batch_timestamp, self._watermark_timestamp def get_map_state( self, @@ -318,9 +297,9 @@ def get_map_state( import pyspark.sql.streaming.proto.StateMessage_pb2 as stateMessage if isinstance(user_key_schema, str): - user_key_schema = cast(StructType, _parse_datatype_string(user_key_schema)) + user_key_schema = self._parse_string_schema(user_key_schema) if isinstance(value_schema, str): - value_schema = cast(StructType, _parse_datatype_string(value_schema)) + value_schema = self._parse_string_schema(value_schema) state_call_command = stateMessage.StateCallCommand() state_call_command.stateName = state_name @@ -353,6 +332,48 @@ def delete_if_exists(self, state_name: str) -> None: # TODO(SPARK-49233): Classify user facing errors. raise PySparkRuntimeError(f"Error deleting state: " f"{response_message[1]}") + def _get_batch_timestamp(self) -> int: + import pyspark.sql.streaming.proto.StateMessage_pb2 as stateMessage + + get_processing_time_call = stateMessage.GetProcessingTime() + timer_value_call = stateMessage.TimerValueRequest( + getProcessingTimer=get_processing_time_call + ) + timer_request = stateMessage.TimerRequest(timerValueRequest=timer_value_call) + message = stateMessage.StateRequest(timerRequest=timer_request) + + self._send_proto_message(message.SerializeToString()) + response_message = self._receive_proto_message_with_long_value() + status = response_message[0] + if status != 0: + # TODO(SPARK-49233): Classify user facing errors. 
+ raise PySparkRuntimeError( + f"Error getting processing timestamp: " f"{response_message[1]}" + ) + else: + timestamp = response_message[2] + return timestamp + + def _get_watermark_timestamp(self) -> int: + import pyspark.sql.streaming.proto.StateMessage_pb2 as stateMessage + + get_watermark_call = stateMessage.GetWatermark() + timer_value_call = stateMessage.TimerValueRequest(getWatermark=get_watermark_call) + timer_request = stateMessage.TimerRequest(timerValueRequest=timer_value_call) + message = stateMessage.StateRequest(timerRequest=timer_request) + + self._send_proto_message(message.SerializeToString()) + response_message = self._receive_proto_message_with_long_value() + status = response_message[0] + if status != 0: + # TODO(SPARK-49233): Classify user facing errors. + raise PySparkRuntimeError( + f"Error getting eventtime timestamp: " f"{response_message[1]}" + ) + else: + timestamp = response_message[2] + return timestamp + def _send_proto_message(self, message: bytes) -> None: # Writing zero here to indicate message version. This allows us to evolve the message # format or even changing the message protocol in the future. 
@@ -379,12 +400,24 @@ def _receive_proto_message_with_long_value(self) -> Tuple[int, str, int]: message.ParseFromString(bytes) return message.statusCode, message.errorMessage, message.value + def _receive_proto_message_with_string_value(self) -> Tuple[int, str, str]: + import pyspark.sql.streaming.proto.StateMessage_pb2 as stateMessage + + length = read_int(self.sockfile) + bytes = self.sockfile.read(length) + message = stateMessage.StateResponseWithStringTypeVal() + message.ParseFromString(bytes) + return message.statusCode, message.errorMessage, message.value + def _receive_str(self) -> str: return self.utf8_deserializer.loads(self.sockfile) def _serialize_to_bytes(self, schema: StructType, data: Tuple) -> bytes: + from pyspark.testing.utils import have_numpy + converted = [] - if has_numpy: + + if have_numpy: import numpy as np # In order to convert NumPy types to Python primitive types. @@ -422,6 +455,24 @@ def _send_arrow_state(self, schema: StructType, state: List[Tuple]) -> None: def _read_arrow_state(self) -> Any: return self.serializer.load_stream(self.sockfile) + # Parse a string schema into a StructType schema. This method will perform an API call to + # JVM side to parse the schema string. + def _parse_string_schema(self, schema: str) -> StructType: + import pyspark.sql.streaming.proto.StateMessage_pb2 as stateMessage + + parse_string_schema_call = stateMessage.ParseStringSchema(schema=schema) + utils_request = stateMessage.UtilsRequest(parseStringSchema=parse_string_schema_call) + message = stateMessage.StateRequest(utilsRequest=utils_request) + + self._send_proto_message(message.SerializeToString()) + response_message = self._receive_proto_message_with_string_value() + status = response_message[0] + if status != 0: + # TODO(SPARK-49233): Classify user facing errors. 
+ raise PySparkRuntimeError(f"Error parsing string schema: " f"{response_message[1]}") + else: + return StructType.fromJson(json.loads(response_message[2])) + class ListTimerIterator: def __init__(self, stateful_processor_api_client: StatefulProcessorApiClient): diff --git a/python/pyspark/sql/streaming/stateful_processor_util.py b/python/pyspark/sql/streaming/stateful_processor_util.py new file mode 100644 index 0000000000000..d69c1a943862c --- /dev/null +++ b/python/pyspark/sql/streaming/stateful_processor_util.py @@ -0,0 +1,28 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +from enum import Enum + +# This file places the utilities for transformWithStateInPandas; we have a separate file to avoid +# putting internal classes to the stateful_processor.py file which contains public APIs. 
+ + +class TransformWithStateInPandasFuncMode(Enum): + PROCESS_DATA = 1 + PROCESS_TIMER = 2 + COMPLETE = 3 + PRE_INIT = 4 diff --git a/python/pyspark/sql/streaming/transform_with_state_driver_worker.py b/python/pyspark/sql/streaming/transform_with_state_driver_worker.py new file mode 100644 index 0000000000000..99d386f07b5b6 --- /dev/null +++ b/python/pyspark/sql/streaming/transform_with_state_driver_worker.py @@ -0,0 +1,102 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +import os +import json +from typing import Any, Iterator, TYPE_CHECKING + +from pyspark.util import local_connect_and_auth +from pyspark.serializers import ( + write_int, + read_int, + UTF8Deserializer, + CPickleSerializer, +) +from pyspark import worker +from pyspark.util import handle_worker_exception +from typing import IO +from pyspark.worker_util import check_python_version +from pyspark.sql.streaming.stateful_processor_api_client import StatefulProcessorApiClient +from pyspark.sql.streaming.stateful_processor_util import TransformWithStateInPandasFuncMode +from pyspark.sql.types import StructType + +if TYPE_CHECKING: + from pyspark.sql.pandas._typing import ( + DataFrameLike as PandasDataFrameLike, + ) + +pickle_ser = CPickleSerializer() +utf8_deserializer = UTF8Deserializer() + + +def main(infile: IO, outfile: IO) -> None: + check_python_version(infile) + + log_name = "Streaming TransformWithStateInPandas Python worker" + print(f"Starting {log_name}.\n") + + def process( + processor: StatefulProcessorApiClient, + mode: TransformWithStateInPandasFuncMode, + key: Any, + input: Iterator["PandasDataFrameLike"], + ) -> None: + print(f"{log_name} Starting execution of UDF: {func}.\n") + func(processor, mode, key, input) + print(f"{log_name} Completed execution of UDF: {func}.\n") + + try: + func, return_type = worker.read_command(pickle_ser, infile) + print( + f"{log_name} finish init stage of Python runner. Received UDF from JVM: {func}, " + f"received return type of UDF: {return_type}.\n" + ) + # send signal for getting args + write_int(0, outfile) + outfile.flush() + + # This driver runner will only be used on the first batch of a query, + # and the following code block should be only run once for each query run + state_server_port = read_int(infile) + key_schema = StructType.fromJson(json.loads(utf8_deserializer.loads(infile))) + print( + f"{log_name} received parameters for UDF. 
State server port: {state_server_port}, " + f"key schema: {key_schema}.\n" + ) + + stateful_processor_api_client = StatefulProcessorApiClient(state_server_port, key_schema) + process( + stateful_processor_api_client, + TransformWithStateInPandasFuncMode.PRE_INIT, + None, + iter([]), + ) + write_int(0, outfile) + outfile.flush() + except Exception as e: + handle_worker_exception(e, outfile) + outfile.flush() + + +if __name__ == "__main__": + # Read information about how to connect back to the JVM from the environment. + java_port = int(os.environ["PYTHON_WORKER_FACTORY_PORT"]) + auth_secret = os.environ["PYTHON_WORKER_FACTORY_SECRET"] + (sock_file, sock) = local_connect_and_auth(java_port, auth_secret) + write_int(os.getpid(), sock_file) + sock_file.flush() + main(sock_file, sock_file) diff --git a/python/pyspark/sql/streaming/value_state_client.py b/python/pyspark/sql/streaming/value_state_client.py index fd783af7931da..532a89cf92d22 100644 --- a/python/pyspark/sql/streaming/value_state_client.py +++ b/python/pyspark/sql/streaming/value_state_client.py @@ -14,18 +14,26 @@ # See the License for the specific language governing permissions and # limitations under the License. 
# -from typing import Union, cast, Tuple, Optional +from typing import Union, Tuple, Optional from pyspark.sql.streaming.stateful_processor_api_client import StatefulProcessorApiClient -from pyspark.sql.types import StructType, _parse_datatype_string +from pyspark.sql.types import StructType from pyspark.errors import PySparkRuntimeError __all__ = ["ValueStateClient"] class ValueStateClient: - def __init__(self, stateful_processor_api_client: StatefulProcessorApiClient) -> None: + def __init__( + self, + stateful_processor_api_client: StatefulProcessorApiClient, + schema: Union[StructType, str], + ) -> None: self._stateful_processor_api_client = stateful_processor_api_client + if isinstance(schema, str): + self.schema = self._stateful_processor_api_client._parse_string_schema(schema) + else: + self.schema = schema def exists(self, state_name: str) -> bool: import pyspark.sql.streaming.proto.StateMessage_pb2 as stateMessage @@ -69,12 +77,10 @@ def get(self, state_name: str) -> Optional[Tuple]: # TODO(SPARK-49233): Classify user facing errors. 
raise PySparkRuntimeError(f"Error getting value state: " f"{response_message[1]}") - def update(self, state_name: str, schema: Union[StructType, str], value: Tuple) -> None: + def update(self, state_name: str, value: Tuple) -> None: import pyspark.sql.streaming.proto.StateMessage_pb2 as stateMessage - if isinstance(schema, str): - schema = cast(StructType, _parse_datatype_string(schema)) - bytes = self._stateful_processor_api_client._serialize_to_bytes(schema, value) + bytes = self._stateful_processor_api_client._serialize_to_bytes(self.schema, value) update_call = stateMessage.ValueStateUpdate(value=bytes) value_state_call = stateMessage.ValueStateCall( stateName=state_name, valueStateUpdate=update_call diff --git a/python/pyspark/sql/table_arg.py b/python/pyspark/sql/table_arg.py new file mode 100644 index 0000000000000..cacfd24b2f1ba --- /dev/null +++ b/python/pyspark/sql/table_arg.py @@ -0,0 +1,55 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +from typing import TYPE_CHECKING + +from pyspark.sql.tvf_argument import TableValuedFunctionArgument +from pyspark.sql.utils import get_active_spark_context + + +if TYPE_CHECKING: + from py4j.java_gateway import JavaObject + from pyspark.sql._typing import ColumnOrName + + +class TableArg(TableValuedFunctionArgument): + def __init__(self, j_table_arg: "JavaObject"): + self._j_table_arg = j_table_arg + + def partitionBy(self, *cols: "ColumnOrName") -> "TableArg": + from pyspark.sql.classic.column import _to_java_column, _to_seq + + sc = get_active_spark_context() + if len(cols) == 1 and isinstance(cols[0], list): + cols = cols[0] + j_cols = _to_seq(sc, cols, _to_java_column) + new_j_table_arg = self._j_table_arg.partitionBy(j_cols) + return TableArg(new_j_table_arg) + + def orderBy(self, *cols: "ColumnOrName") -> "TableArg": + from pyspark.sql.classic.column import _to_java_column, _to_seq + + sc = get_active_spark_context() + if len(cols) == 1 and isinstance(cols[0], list): + cols = cols[0] + j_cols = _to_seq(sc, cols, _to_java_column) + new_j_table_arg = self._j_table_arg.orderBy(j_cols) + return TableArg(new_j_table_arg) + + def withSinglePartition(self) -> "TableArg": + new_j_table_arg = self._j_table_arg.withSinglePartition() + return TableArg(new_j_table_arg) diff --git a/python/pyspark/sql/tests/arrow/__init__.py b/python/pyspark/sql/tests/arrow/__init__.py new file mode 100644 index 0000000000000..cce3acad34a49 --- /dev/null +++ b/python/pyspark/sql/tests/arrow/__init__.py @@ -0,0 +1,16 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# diff --git a/python/pyspark/sql/tests/test_arrow.py b/python/pyspark/sql/tests/arrow/test_arrow.py similarity index 99% rename from python/pyspark/sql/tests/test_arrow.py rename to python/pyspark/sql/tests/arrow/test_arrow.py index b71bdb1eece28..a2ee113b6386e 100644 --- a/python/pyspark/sql/tests/test_arrow.py +++ b/python/pyspark/sql/tests/arrow/test_arrow.py @@ -1778,7 +1778,7 @@ def conf(cls): if __name__ == "__main__": - from pyspark.sql.tests.test_arrow import * # noqa: F401 + from pyspark.sql.tests.arrow.test_arrow import * # noqa: F401 try: import xmlrunner # type: ignore diff --git a/python/pyspark/sql/tests/test_arrow_cogrouped_map.py b/python/pyspark/sql/tests/arrow/test_arrow_cogrouped_map.py similarity index 96% rename from python/pyspark/sql/tests/test_arrow_cogrouped_map.py rename to python/pyspark/sql/tests/arrow/test_arrow_cogrouped_map.py index a90574b7f1928..80b12d3a7798b 100644 --- a/python/pyspark/sql/tests/test_arrow_cogrouped_map.py +++ b/python/pyspark/sql/tests/arrow/test_arrow_cogrouped_map.py @@ -299,6 +299,16 @@ def summarize(left, right): "+---------+------------+----------+-------------+\n", ) + def test_self_join(self): + df = self.spark.createDataFrame([(1, 1)], ("k", "v")) + + def arrow_func(key, left, right): + return pa.Table.from_pydict({"x": [2], "y": [2]}) + + df2 = df.groupby("k").cogroup(df.groupby("k")).applyInArrow(arrow_func, "x long, y long") + + self.assertEqual(df2.join(df2).count(), 1) + class CogroupedMapInArrowTests(CogroupedMapInArrowTestsMixin, ReusedSQLTestCase): @classmethod @@ -324,7 +334,7 @@ def 
tearDownClass(cls): if __name__ == "__main__": - from pyspark.sql.tests.test_arrow_cogrouped_map import * # noqa: F401 + from pyspark.sql.tests.arrow.test_arrow_cogrouped_map import * # noqa: F401 try: import xmlrunner # type: ignore[import] diff --git a/python/pyspark/sql/tests/test_arrow_grouped_map.py b/python/pyspark/sql/tests/arrow/test_arrow_grouped_map.py similarity index 96% rename from python/pyspark/sql/tests/test_arrow_grouped_map.py rename to python/pyspark/sql/tests/arrow/test_arrow_grouped_map.py index f9947d0788b87..c9ad602edfd27 100644 --- a/python/pyspark/sql/tests/test_arrow_grouped_map.py +++ b/python/pyspark/sql/tests/arrow/test_arrow_grouped_map.py @@ -255,6 +255,16 @@ def foo(_): self.assertEqual(r.a, "hi") self.assertEqual(r.b, 1) + def test_self_join(self): + df = self.spark.createDataFrame([(1, 1)], ("k", "v")) + + def arrow_func(key, table): + return pa.Table.from_pydict({"x": [2], "y": [2]}) + + df2 = df.groupby("k").applyInArrow(arrow_func, schema="x long, y long") + + self.assertEqual(df2.join(df2).count(), 1) + class GroupedMapInArrowTests(GroupedMapInArrowTestsMixin, ReusedSQLTestCase): @classmethod @@ -280,7 +290,7 @@ def tearDownClass(cls): if __name__ == "__main__": - from pyspark.sql.tests.test_arrow_grouped_map import * # noqa: F401 + from pyspark.sql.tests.arrow.test_arrow_grouped_map import * # noqa: F401 try: import xmlrunner # type: ignore[import] diff --git a/python/pyspark/sql/tests/test_arrow_map.py b/python/pyspark/sql/tests/arrow/test_arrow_map.py similarity index 98% rename from python/pyspark/sql/tests/test_arrow_map.py rename to python/pyspark/sql/tests/arrow/test_arrow_map.py index 2e82869230db4..71bb36a902e3e 100644 --- a/python/pyspark/sql/tests/test_arrow_map.py +++ b/python/pyspark/sql/tests/arrow/test_arrow_map.py @@ -195,7 +195,7 @@ def tearDownClass(cls): if __name__ == "__main__": - from pyspark.sql.tests.test_arrow_map import * # noqa: F401 + from pyspark.sql.tests.arrow.test_arrow_map import * # noqa: F401 
try: import xmlrunner diff --git a/python/pyspark/sql/tests/test_arrow_python_udf.py b/python/pyspark/sql/tests/arrow/test_arrow_python_udf.py similarity index 94% rename from python/pyspark/sql/tests/test_arrow_python_udf.py rename to python/pyspark/sql/tests/arrow/test_arrow_python_udf.py index 095414334848b..1f430d1ab00d9 100644 --- a/python/pyspark/sql/tests/test_arrow_python_udf.py +++ b/python/pyspark/sql/tests/arrow/test_arrow_python_udf.py @@ -238,8 +238,22 @@ def tearDownClass(cls): super(PythonUDFArrowTests, cls).tearDownClass() +class AsyncPythonUDFArrowTests(PythonUDFArrowTests): + @classmethod + def setUpClass(cls): + super(AsyncPythonUDFArrowTests, cls).setUpClass() + cls.spark.conf.set("spark.sql.execution.pythonUDF.arrow.concurrency.level", "4") + + @classmethod + def tearDownClass(cls): + try: + cls.spark.conf.unset("spark.sql.execution.pythonUDF.arrow.concurrency.level") + finally: + super(AsyncPythonUDFArrowTests, cls).tearDownClass() + + if __name__ == "__main__": - from pyspark.sql.tests.test_arrow_python_udf import * # noqa: F401 + from pyspark.sql.tests.arrow.test_arrow_python_udf import * # noqa: F401 try: import xmlrunner diff --git a/python/pyspark/sql/tests/connect/arrow/__init__.py b/python/pyspark/sql/tests/connect/arrow/__init__.py new file mode 100644 index 0000000000000..cce3acad34a49 --- /dev/null +++ b/python/pyspark/sql/tests/connect/arrow/__init__.py @@ -0,0 +1,16 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# diff --git a/python/pyspark/sql/tests/connect/test_parity_arrow.py b/python/pyspark/sql/tests/connect/arrow/test_parity_arrow.py similarity index 97% rename from python/pyspark/sql/tests/connect/test_parity_arrow.py rename to python/pyspark/sql/tests/connect/arrow/test_parity_arrow.py index 885b3001b1db1..fa8cf286b9bd6 100644 --- a/python/pyspark/sql/tests/connect/test_parity_arrow.py +++ b/python/pyspark/sql/tests/connect/arrow/test_parity_arrow.py @@ -17,7 +17,7 @@ import unittest -from pyspark.sql.tests.test_arrow import ArrowTestsMixin +from pyspark.sql.tests.arrow.test_arrow import ArrowTestsMixin from pyspark.testing.connectutils import ReusedConnectTestCase from pyspark.testing.pandasutils import PandasOnSparkTestUtils @@ -139,7 +139,7 @@ def test_create_dataframe_namedtuples(self): if __name__ == "__main__": - from pyspark.sql.tests.connect.test_parity_arrow import * # noqa: F401 + from pyspark.sql.tests.connect.arrow.test_parity_arrow import * # noqa: F401 try: import xmlrunner # type: ignore[import] diff --git a/python/pyspark/sql/tests/connect/test_parity_arrow_cogrouped_map.py b/python/pyspark/sql/tests/connect/arrow/test_parity_arrow_cogrouped_map.py similarity index 86% rename from python/pyspark/sql/tests/connect/test_parity_arrow_cogrouped_map.py rename to python/pyspark/sql/tests/connect/arrow/test_parity_arrow_cogrouped_map.py index 90c5f2c9b0613..c14c69b5ed4ec 100644 --- a/python/pyspark/sql/tests/connect/test_parity_arrow_cogrouped_map.py +++ b/python/pyspark/sql/tests/connect/arrow/test_parity_arrow_cogrouped_map.py @@ -17,7 +17,7 @@ 
import unittest -from pyspark.sql.tests.test_arrow_cogrouped_map import CogroupedMapInArrowTestsMixin +from pyspark.sql.tests.arrow.test_arrow_cogrouped_map import CogroupedMapInArrowTestsMixin from pyspark.testing.connectutils import ReusedConnectTestCase @@ -26,7 +26,7 @@ class CogroupedMapInArrowParityTests(CogroupedMapInArrowTestsMixin, ReusedConnec if __name__ == "__main__": - from pyspark.sql.tests.connect.test_parity_arrow_cogrouped_map import * # noqa: F401 + from pyspark.sql.tests.connect.arrow.test_parity_arrow_cogrouped_map import * # noqa: F401 try: import xmlrunner diff --git a/python/pyspark/sql/tests/connect/test_parity_arrow_grouped_map.py b/python/pyspark/sql/tests/connect/arrow/test_parity_arrow_grouped_map.py similarity index 87% rename from python/pyspark/sql/tests/connect/test_parity_arrow_grouped_map.py rename to python/pyspark/sql/tests/connect/arrow/test_parity_arrow_grouped_map.py index 0fb96ba13b838..ca12a8b06fdbf 100644 --- a/python/pyspark/sql/tests/connect/test_parity_arrow_grouped_map.py +++ b/python/pyspark/sql/tests/connect/arrow/test_parity_arrow_grouped_map.py @@ -17,7 +17,7 @@ import unittest -from pyspark.sql.tests.test_arrow_grouped_map import GroupedMapInArrowTestsMixin +from pyspark.sql.tests.arrow.test_arrow_grouped_map import GroupedMapInArrowTestsMixin from pyspark.testing.connectutils import ReusedConnectTestCase @@ -26,7 +26,7 @@ class GroupedApplyInArrowParityTests(GroupedMapInArrowTestsMixin, ReusedConnectT if __name__ == "__main__": - from pyspark.sql.tests.connect.test_parity_arrow_grouped_map import * # noqa: F401 + from pyspark.sql.tests.connect.arrow.test_parity_arrow_grouped_map import * # noqa: F401 try: import xmlrunner diff --git a/python/pyspark/sql/tests/connect/test_parity_arrow_map.py b/python/pyspark/sql/tests/connect/arrow/test_parity_arrow_map.py similarity index 88% rename from python/pyspark/sql/tests/connect/test_parity_arrow_map.py rename to 
python/pyspark/sql/tests/connect/arrow/test_parity_arrow_map.py index ed51d0d3d1996..1da356f524f31 100644 --- a/python/pyspark/sql/tests/connect/test_parity_arrow_map.py +++ b/python/pyspark/sql/tests/connect/arrow/test_parity_arrow_map.py @@ -17,7 +17,7 @@ import unittest -from pyspark.sql.tests.test_arrow_map import MapInArrowTestsMixin +from pyspark.sql.tests.arrow.test_arrow_map import MapInArrowTestsMixin from pyspark.testing.connectutils import ReusedConnectTestCase @@ -26,7 +26,7 @@ class ArrowMapParityTests(MapInArrowTestsMixin, ReusedConnectTestCase): if __name__ == "__main__": - from pyspark.sql.tests.connect.test_parity_arrow_map import * # noqa: F401 + from pyspark.sql.tests.connect.arrow.test_parity_arrow_map import * # noqa: F401 try: import xmlrunner # type: ignore[import] diff --git a/python/pyspark/sql/tests/connect/test_parity_arrow_python_udf.py b/python/pyspark/sql/tests/connect/arrow/test_parity_arrow_python_udf.py similarity index 90% rename from python/pyspark/sql/tests/connect/test_parity_arrow_python_udf.py rename to python/pyspark/sql/tests/connect/arrow/test_parity_arrow_python_udf.py index 732008eb05a35..fe81513f005f9 100644 --- a/python/pyspark/sql/tests/connect/test_parity_arrow_python_udf.py +++ b/python/pyspark/sql/tests/connect/arrow/test_parity_arrow_python_udf.py @@ -16,7 +16,7 @@ # from pyspark.sql.tests.connect.test_parity_udf import UDFParityTests -from pyspark.sql.tests.test_arrow_python_udf import PythonUDFArrowTestsMixin +from pyspark.sql.tests.arrow.test_arrow_python_udf import PythonUDFArrowTestsMixin class ArrowPythonUDFParityTests(UDFParityTests, PythonUDFArrowTestsMixin): @@ -35,7 +35,7 @@ def tearDownClass(cls): if __name__ == "__main__": import unittest - from pyspark.sql.tests.connect.test_parity_arrow_python_udf import * # noqa: F401 + from pyspark.sql.tests.connect.arrow.test_parity_arrow_python_udf import * # noqa: F401 try: import xmlrunner # type: ignore[import] diff --git 
a/python/pyspark/sql/tests/connect/pandas/__init__.py b/python/pyspark/sql/tests/connect/pandas/__init__.py new file mode 100644 index 0000000000000..cce3acad34a49 --- /dev/null +++ b/python/pyspark/sql/tests/connect/pandas/__init__.py @@ -0,0 +1,16 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# diff --git a/python/pyspark/sql/tests/connect/test_parity_pandas_cogrouped_map.py b/python/pyspark/sql/tests/connect/pandas/test_parity_pandas_cogrouped_map.py similarity index 93% rename from python/pyspark/sql/tests/connect/test_parity_pandas_cogrouped_map.py rename to python/pyspark/sql/tests/connect/pandas/test_parity_pandas_cogrouped_map.py index 00d71bda2d938..a71e6369f5e2f 100644 --- a/python/pyspark/sql/tests/connect/test_parity_pandas_cogrouped_map.py +++ b/python/pyspark/sql/tests/connect/pandas/test_parity_pandas_cogrouped_map.py @@ -28,7 +28,7 @@ class CogroupedApplyInPandasTests( if __name__ == "__main__": - from pyspark.sql.tests.connect.test_parity_pandas_cogrouped_map import * # noqa: F401 + from pyspark.sql.tests.connect.pandas.test_parity_pandas_cogrouped_map import * # noqa: F401 try: import xmlrunner diff --git a/python/pyspark/sql/tests/connect/test_parity_pandas_grouped_map.py b/python/pyspark/sql/tests/connect/pandas/test_parity_pandas_grouped_map.py similarity index 94% rename from python/pyspark/sql/tests/connect/test_parity_pandas_grouped_map.py rename to python/pyspark/sql/tests/connect/pandas/test_parity_pandas_grouped_map.py index 8c76313c5c96b..52110718808be 100644 --- a/python/pyspark/sql/tests/connect/test_parity_pandas_grouped_map.py +++ b/python/pyspark/sql/tests/connect/pandas/test_parity_pandas_grouped_map.py @@ -28,7 +28,7 @@ def test_supported_types(self): if __name__ == "__main__": - from pyspark.sql.tests.connect.test_parity_pandas_grouped_map import * # noqa: F401 + from pyspark.sql.tests.connect.pandas.test_parity_pandas_grouped_map import * # noqa: F401 try: import xmlrunner diff --git a/python/pyspark/sql/tests/connect/test_parity_pandas_grouped_map_with_state.py b/python/pyspark/sql/tests/connect/pandas/test_parity_pandas_grouped_map_with_state.py similarity index 92% rename from python/pyspark/sql/tests/connect/test_parity_pandas_grouped_map_with_state.py rename to 
python/pyspark/sql/tests/connect/pandas/test_parity_pandas_grouped_map_with_state.py index 67d42a7c86138..2da8b4aa3be8a 100644 --- a/python/pyspark/sql/tests/connect/test_parity_pandas_grouped_map_with_state.py +++ b/python/pyspark/sql/tests/connect/pandas/test_parity_pandas_grouped_map_with_state.py @@ -29,7 +29,7 @@ class GroupedApplyInPandasWithStateTests( if __name__ == "__main__": - from pyspark.sql.tests.connect.test_parity_pandas_grouped_map_with_state import * # noqa: F401 + from pyspark.sql.tests.connect.pandas.test_parity_pandas_grouped_map_with_state import * # noqa: F401,E501 try: import xmlrunner diff --git a/python/pyspark/sql/tests/connect/test_parity_pandas_map.py b/python/pyspark/sql/tests/connect/pandas/test_parity_pandas_map.py similarity index 93% rename from python/pyspark/sql/tests/connect/test_parity_pandas_map.py rename to python/pyspark/sql/tests/connect/pandas/test_parity_pandas_map.py index 999afd24c6528..965ef5dcf8949 100644 --- a/python/pyspark/sql/tests/connect/test_parity_pandas_map.py +++ b/python/pyspark/sql/tests/connect/pandas/test_parity_pandas_map.py @@ -28,7 +28,7 @@ class MapInPandasParityTests( if __name__ == "__main__": import unittest - from pyspark.sql.tests.connect.test_parity_pandas_map import * # noqa: F401 + from pyspark.sql.tests.connect.pandas.test_parity_pandas_map import * # noqa: F401 try: import xmlrunner # type: ignore[import] diff --git a/python/pyspark/sql/tests/connect/test_parity_pandas_udf.py b/python/pyspark/sql/tests/connect/pandas/test_parity_pandas_udf.py similarity index 93% rename from python/pyspark/sql/tests/connect/test_parity_pandas_udf.py rename to python/pyspark/sql/tests/connect/pandas/test_parity_pandas_udf.py index 364e41716474b..aa2b4748ff3f7 100644 --- a/python/pyspark/sql/tests/connect/test_parity_pandas_udf.py +++ b/python/pyspark/sql/tests/connect/pandas/test_parity_pandas_udf.py @@ -25,7 +25,7 @@ class PandasUDFParityTests(PandasUDFTestsMixin, ReusedConnectTestCase): if __name__ == 
"__main__": import unittest - from pyspark.sql.tests.connect.test_parity_pandas_udf import * # noqa: F401 + from pyspark.sql.tests.connect.pandas.test_parity_pandas_udf import * # noqa: F401 try: import xmlrunner # type: ignore[import] diff --git a/python/pyspark/sql/tests/connect/test_parity_pandas_udf_grouped_agg.py b/python/pyspark/sql/tests/connect/pandas/test_parity_pandas_udf_grouped_agg.py similarity index 93% rename from python/pyspark/sql/tests/connect/test_parity_pandas_udf_grouped_agg.py rename to python/pyspark/sql/tests/connect/pandas/test_parity_pandas_udf_grouped_agg.py index fdb81bffbce12..dfcb2b94c1bcc 100644 --- a/python/pyspark/sql/tests/connect/test_parity_pandas_udf_grouped_agg.py +++ b/python/pyspark/sql/tests/connect/pandas/test_parity_pandas_udf_grouped_agg.py @@ -28,7 +28,7 @@ class PandasUDFGroupedAggParityTests( if __name__ == "__main__": - from pyspark.sql.tests.connect.test_parity_pandas_udf_grouped_agg import * # noqa: F401 + from pyspark.sql.tests.connect.pandas.test_parity_pandas_udf_grouped_agg import * # noqa: F401 try: import xmlrunner # type: ignore[import] diff --git a/python/pyspark/sql/tests/connect/test_parity_pandas_udf_scalar.py b/python/pyspark/sql/tests/connect/pandas/test_parity_pandas_udf_scalar.py similarity index 93% rename from python/pyspark/sql/tests/connect/test_parity_pandas_udf_scalar.py rename to python/pyspark/sql/tests/connect/pandas/test_parity_pandas_udf_scalar.py index 451f0f68d6ee5..9cab05f569d46 100644 --- a/python/pyspark/sql/tests/connect/test_parity_pandas_udf_scalar.py +++ b/python/pyspark/sql/tests/connect/pandas/test_parity_pandas_udf_scalar.py @@ -24,7 +24,7 @@ class PandasUDFScalarParityTests(ScalarPandasUDFTestsMixin, ReusedConnectTestCas if __name__ == "__main__": - from pyspark.sql.tests.connect.test_parity_pandas_udf_scalar import * # noqa: F401 + from pyspark.sql.tests.connect.pandas.test_parity_pandas_udf_scalar import * # noqa: F401 try: import xmlrunner # type: ignore[import] diff --git 
a/python/pyspark/sql/tests/connect/test_parity_pandas_udf_window.py b/python/pyspark/sql/tests/connect/pandas/test_parity_pandas_udf_window.py similarity index 93% rename from python/pyspark/sql/tests/connect/test_parity_pandas_udf_window.py rename to python/pyspark/sql/tests/connect/pandas/test_parity_pandas_udf_window.py index b2288c9d949e5..08da1b4648b22 100644 --- a/python/pyspark/sql/tests/connect/test_parity_pandas_udf_window.py +++ b/python/pyspark/sql/tests/connect/pandas/test_parity_pandas_udf_window.py @@ -28,7 +28,7 @@ class PandasUDFWindowParityTests( if __name__ == "__main__": - from pyspark.sql.tests.connect.test_parity_pandas_udf_window import * # noqa: F401 + from pyspark.sql.tests.connect.pandas.test_parity_pandas_udf_window import * # noqa: F401 try: import xmlrunner # type: ignore[import] diff --git a/python/pyspark/sql/tests/connect/test_connect_dataframe_property.py b/python/pyspark/sql/tests/connect/test_connect_dataframe_property.py index 1a8c7190e31a6..c4c10c963a48b 100644 --- a/python/pyspark/sql/tests/connect/test_connect_dataframe_property.py +++ b/python/pyspark/sql/tests/connect/test_connect_dataframe_property.py @@ -110,6 +110,12 @@ def func(iterator): cdf1 = cdf.mapInPandas(func, schema) self.assertEqual(cdf1._cached_schema, schema) + with self.temp_env({"SPARK_CONNECT_MODE_ENABLED": "1"}): + self.assertTrue(is_remote()) + cdf1 = cdf.mapInPandas(func, "a int, b string") + # Properly cache the parsed schema + self.assertEqual(cdf1._cached_schema, schema) + with self.temp_env({"SPARK_CONNECT_MODE_ENABLED": None}): # 'mapInPandas' depends on the method 'pandas_udf', which is dispatched # based on 'is_remote'. 
However, in SparkConnectSQLTestCase, the remote @@ -180,6 +186,12 @@ def normalize(pdf): cdf1 = cdf.groupby("id").applyInPandas(normalize, schema) self.assertEqual(cdf1._cached_schema, schema) + with self.temp_env({"SPARK_CONNECT_MODE_ENABLED": "1"}): + self.assertTrue(is_remote()) + cdf1 = cdf.groupby("id").applyInPandas(normalize, "id long, v double") + # Properly cache the parsed schema + self.assertEqual(cdf1._cached_schema, schema) + with self.temp_env({"SPARK_CONNECT_MODE_ENABLED": None}): self.assertFalse(is_remote()) sdf1 = sdf.groupby("id").applyInPandas(normalize, schema) diff --git a/python/pyspark/sql/tests/connect/test_connect_function.py b/python/pyspark/sql/tests/connect/test_connect_function.py index e29873173cc3a..d1e2558305291 100644 --- a/python/pyspark/sql/tests/connect/test_connect_function.py +++ b/python/pyspark/sql/tests/connect/test_connect_function.py @@ -54,7 +54,7 @@ def setUpClass(cls): # Disable the shared namespace so pyspark.sql.functions, etc point the regular # PySpark libraries. os.environ["PYSPARK_NO_NAMESPACE_SHARE"] = "1" - cls.connect = cls.spark # Switch Spark Connect session and regular PySpark sesion. + cls.connect = cls.spark # Switch Spark Connect session and regular PySpark session. 
cls.spark = PySparkSession._instantiatedSession assert cls.spark is not None @@ -590,6 +590,10 @@ def test_aggregation_functions(self): (CF.avg, SF.avg), (CF.collect_list, SF.collect_list), (CF.collect_set, SF.collect_set), + (CF.listagg, SF.listagg), + (CF.listagg_distinct, SF.listagg_distinct), + (CF.string_agg, SF.string_agg), + (CF.string_agg_distinct, SF.string_agg_distinct), (CF.count, SF.count), (CF.first, SF.first), (CF.kurtosis, SF.kurtosis), diff --git a/python/pyspark/sql/tests/connect/test_connect_readwriter.py b/python/pyspark/sql/tests/connect/test_connect_readwriter.py index db1e94cb6863e..06266b86de3ff 100644 --- a/python/pyspark/sql/tests/connect/test_connect_readwriter.py +++ b/python/pyspark/sql/tests/connect/test_connect_readwriter.py @@ -146,6 +146,16 @@ def test_parquet(self): self.connect.read.parquet(d).toPandas(), self.spark.read.parquet(d).toPandas() ) + def test_parquet_compression_option(self): + # SPARK-50537: Fix compression option being overwritten in df.write.parquet + with tempfile.TemporaryDirectory(prefix="test_parquet") as d: + self.connect.range(10).write.mode("overwrite").option("compression", "gzip").parquet(d) + self.assertTrue(any(file.endswith(".gz.parquet") for file in os.listdir(d))) + # Read the Parquet file as a DataFrame. + self.assert_eq( + self.connect.read.parquet(d).toPandas(), self.spark.read.parquet(d).toPandas() + ) + def test_text(self): # SPARK-41849: Implement DataFrameReader.text with tempfile.TemporaryDirectory(prefix="test_text") as d: diff --git a/python/pyspark/sql/tests/connect/test_parity_job_cancellation.py b/python/pyspark/sql/tests/connect/test_parity_job_cancellation.py new file mode 100644 index 0000000000000..ddb4554afa55a --- /dev/null +++ b/python/pyspark/sql/tests/connect/test_parity_job_cancellation.py @@ -0,0 +1,46 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. 
See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import threading + +from pyspark import inheritable_thread_target +from pyspark.sql.tests.test_job_cancellation import JobCancellationTestsMixin +from pyspark.testing.connectutils import ReusedConnectTestCase + + +class JobCancellationParityTests(JobCancellationTestsMixin, ReusedConnectTestCase): + def test_inheritable_tags_with_deco(self): + @inheritable_thread_target(self.spark) + def func(target): + return target() + + self.check_inheritable_tags( + create_thread=lambda target, session: threading.Thread(target=func, args=(target,)) + ) + + +if __name__ == "__main__": + import unittest + from pyspark.sql.tests.connect.test_parity_job_cancellation import * # noqa: F401 + + try: + import xmlrunner # type: ignore[import] + + testRunner = xmlrunner.XMLTestRunner(output="target/test-reports", verbosity=2) + except ImportError: + testRunner = None + unittest.main(testRunner=testRunner, verbosity=2) diff --git a/python/pyspark/sql/tests/connect/test_parity_subquery.py b/python/pyspark/sql/tests/connect/test_parity_subquery.py index 1cba3a7d49956..f3225fcb7f2dd 100644 --- a/python/pyspark/sql/tests/connect/test_parity_subquery.py +++ b/python/pyspark/sql/tests/connect/test_parity_subquery.py @@ -17,13 +17,33 @@ import unittest +from pyspark.sql import functions as sf from 
pyspark.sql.tests.test_subquery import SubqueryTestsMixin +from pyspark.testing import assertDataFrameEqual from pyspark.testing.connectutils import ReusedConnectTestCase -@unittest.skip("TODO(SPARK-50134): Support subquery in connect") class SubqueryParityTests(SubqueryTestsMixin, ReusedConnectTestCase): - pass + def test_scalar_subquery_with_missing_outer_reference(self): + with self.tempView("l", "r"): + self.df1.createOrReplaceTempView("l") + self.df2.createOrReplaceTempView("r") + + assertDataFrameEqual( + self.spark.table("l").select( + "a", + ( + self.spark.table("r") + .where(sf.col("c") == sf.col("a")) + .select(sf.sum("d")) + .scalar() + ), + ), + self.spark.sql("""SELECT a, (SELECT sum(d) FROM r WHERE c = a) FROM l"""), + ) + + def test_subquery_in_unpivot(self): + self.check_subquery_in_unpivot(None, None) if __name__ == "__main__": diff --git a/python/pyspark/sql/tests/connect/test_parity_udf_profiler.py b/python/pyspark/sql/tests/connect/test_parity_udf_profiler.py index 274364b181441..5c46130c5b50d 100644 --- a/python/pyspark/sql/tests/connect/test_parity_udf_profiler.py +++ b/python/pyspark/sql/tests/connect/test_parity_udf_profiler.py @@ -21,9 +21,9 @@ from pyspark.sql.tests.test_udf_profiler import ( UDFProfiler2TestsMixin, _do_computation, - has_flameprof, ) from pyspark.testing.connectutils import ReusedConnectTestCase +from pyspark.testing.utils import have_flameprof class UDFProfilerParityTests(UDFProfiler2TestsMixin, ReusedConnectTestCase): @@ -65,7 +65,7 @@ def action(df): io.getvalue(), f"10.*{os.path.basename(inspect.getfile(_do_computation))}" ) - if has_flameprof: + if have_flameprof: self.assertIn("svg", self.spark.profile.render(id)) diff --git a/python/pyspark/sql/tests/connect/test_parity_udtf.py b/python/pyspark/sql/tests/connect/test_parity_udtf.py index 6955e7377b4c4..6f4e4133335eb 100644 --- a/python/pyspark/sql/tests/connect/test_parity_udtf.py +++ b/python/pyspark/sql/tests/connect/test_parity_udtf.py @@ -76,6 +76,14 @@ def 
test_udtf_with_analyze_using_file(self): def test_udtf_access_spark_session(self): super().test_udtf_access_spark_session() + @unittest.skip("TODO(SPARK-50393): support df.asTable() in Spark Connect") + def test_df_asTable(self): + super().test_df_asTable() + + @unittest.skip("TODO(SPARK-50393): support df.asTable() in Spark Connect") + def test_df_asTable_chaining_methods(self): + super().test_df_asTable_chaining_methods() + def _add_pyfile(self, path): self.spark.addArtifacts(path, pyfile=True) diff --git a/python/pyspark/sql/tests/connect/test_session.py b/python/pyspark/sql/tests/connect/test_session.py index 6f0e4aaad3f89..e327c868895f4 100644 --- a/python/pyspark/sql/tests/connect/test_session.py +++ b/python/pyspark/sql/tests/connect/test_session.py @@ -14,18 +14,12 @@ # See the License for the specific language governing permissions and # limitations under the License. # -import threading -import time + import unittest from typing import Optional -from pyspark import InheritableThread, inheritable_thread_target from pyspark.sql.connect.client import DefaultChannelBuilder from pyspark.sql.connect.session import SparkSession as RemoteSparkSession -from pyspark.testing.connectutils import should_test_connect - -if should_test_connect: - from pyspark.testing.connectutils import ReusedConnectTestCase class CustomChannelBuilder(DefaultChannelBuilder): @@ -104,178 +98,3 @@ def test_default_session_expires_when_client_closes(self): s3 = RemoteSparkSession.builder.remote("sc://other").getOrCreate() self.assertIsNot(s1, s3) - - -class JobCancellationTests(ReusedConnectTestCase): - def test_tags(self): - self.spark.clearTags() - self.spark.addTag("a") - self.assertEqual(self.spark.getTags(), {"a"}) - self.spark.addTag("b") - self.spark.removeTag("a") - self.assertEqual(self.spark.getTags(), {"b"}) - self.spark.addTag("c") - self.spark.clearTags() - self.assertEqual(self.spark.getTags(), set()) - self.spark.clearTags() - - def test_tags_multithread(self): - output1 = 
None - output2 = None - - def tag1(): - nonlocal output1 - - self.spark.addTag("tag1") - output1 = self.spark.getTags() - - def tag2(): - nonlocal output2 - - self.spark.addTag("tag2") - output2 = self.spark.getTags() - - t1 = threading.Thread(target=tag1) - t1.start() - t1.join() - t2 = threading.Thread(target=tag2) - t2.start() - t2.join() - - self.assertIsNotNone(output1) - self.assertEquals(output1, {"tag1"}) - self.assertIsNotNone(output2) - self.assertEquals(output2, {"tag2"}) - - def test_interrupt_tag(self): - thread_ids = range(4) - self.check_job_cancellation( - lambda job_group: self.spark.addTag(job_group), - lambda job_group: self.spark.interruptTag(job_group), - thread_ids, - [i for i in thread_ids if i % 2 == 0], - [i for i in thread_ids if i % 2 != 0], - ) - self.spark.clearTags() - - def test_interrupt_all(self): - thread_ids = range(4) - self.check_job_cancellation( - lambda job_group: None, - lambda job_group: self.spark.interruptAll(), - thread_ids, - thread_ids, - [], - ) - self.spark.clearTags() - - def check_job_cancellation( - self, setter, canceller, thread_ids, thread_ids_to_cancel, thread_ids_to_run - ): - job_id_a = "job_ids_to_cancel" - job_id_b = "job_ids_to_run" - threads = [] - - # A list which records whether job is cancelled. - # The index of the array is the thread index which job run in. - is_job_cancelled = [False for _ in thread_ids] - - def run_job(job_id, index): - """ - Executes a job with the group ``job_group``. Each job waits for 3 seconds - and then exits. - """ - try: - setter(job_id) - - def func(itr): - for pdf in itr: - time.sleep(pdf._1.iloc[0]) - yield pdf - - self.spark.createDataFrame([[20]]).repartition(1).mapInPandas( - func, schema="_1 LONG" - ).collect() - is_job_cancelled[index] = False - except Exception: - # Assume that exception means job cancellation. - is_job_cancelled[index] = True - - # Test if job succeeded when not cancelled. 
- run_job(job_id_a, 0) - self.assertFalse(is_job_cancelled[0]) - self.spark.clearTags() - - # Run jobs - for i in thread_ids_to_cancel: - t = threading.Thread(target=run_job, args=(job_id_a, i)) - t.start() - threads.append(t) - - for i in thread_ids_to_run: - t = threading.Thread(target=run_job, args=(job_id_b, i)) - t.start() - threads.append(t) - - # Wait to make sure all jobs are executed. - time.sleep(10) - # And then, cancel one job group. - canceller(job_id_a) - - # Wait until all threads launching jobs are finished. - for t in threads: - t.join() - - for i in thread_ids_to_cancel: - self.assertTrue( - is_job_cancelled[i], "Thread {i}: Job in group A was not cancelled.".format(i=i) - ) - - for i in thread_ids_to_run: - self.assertFalse( - is_job_cancelled[i], "Thread {i}: Job in group B did not succeeded.".format(i=i) - ) - - def test_inheritable_tags(self): - self.check_inheritable_tags( - create_thread=lambda target, session: InheritableThread(target, session=session) - ) - self.check_inheritable_tags( - create_thread=lambda target, session: threading.Thread( - target=inheritable_thread_target(session)(target) - ) - ) - - # Test decorator usage - @inheritable_thread_target(self.spark) - def func(target): - return target() - - self.check_inheritable_tags( - create_thread=lambda target, session: threading.Thread(target=func, args=(target,)) - ) - - def check_inheritable_tags(self, create_thread): - spark = self.spark - spark.addTag("a") - first = set() - second = set() - - def get_inner_local_prop(): - spark.addTag("c") - second.update(spark.getTags()) - - def get_outer_local_prop(): - spark.addTag("b") - first.update(spark.getTags()) - t2 = create_thread(target=get_inner_local_prop, session=spark) - t2.start() - t2.join() - - t1 = create_thread(target=get_outer_local_prop, session=spark) - t1.start() - t1.join() - - self.assertEqual(spark.getTags(), {"a"}) - self.assertEqual(first, {"a", "b"}) - self.assertEqual(second, {"a", "b", "c"}) diff --git 
a/python/pyspark/sql/tests/pandas/test_pandas_cogrouped_map.py b/python/pyspark/sql/tests/pandas/test_pandas_cogrouped_map.py index f85a7b03eddab..1f9532352679a 100644 --- a/python/pyspark/sql/tests/pandas/test_pandas_cogrouped_map.py +++ b/python/pyspark/sql/tests/pandas/test_pandas_cogrouped_map.py @@ -154,7 +154,7 @@ def merge_pandas(lft, _): ): (left.groupby("id", "k").cogroup(right.groupby("id"))).applyInPandas( merge_pandas, "id long, k int, v int" - ).schema + ).count() def test_apply_in_pandas_not_returning_pandas_dataframe(self): with self.quiet(): diff --git a/python/pyspark/sql/tests/test_pandas_sqlmetrics.py b/python/pyspark/sql/tests/pandas/test_pandas_sqlmetrics.py similarity index 96% rename from python/pyspark/sql/tests/test_pandas_sqlmetrics.py rename to python/pyspark/sql/tests/pandas/test_pandas_sqlmetrics.py index 22a0e92e818db..cb2f2ff285684 100644 --- a/python/pyspark/sql/tests/test_pandas_sqlmetrics.py +++ b/python/pyspark/sql/tests/pandas/test_pandas_sqlmetrics.py @@ -57,7 +57,7 @@ def test_pandas(col1): if __name__ == "__main__": - from pyspark.sql.tests.test_pandas_sqlmetrics import * # noqa: F401 + from pyspark.sql.tests.pandas.test_pandas_sqlmetrics import * # noqa: F401 try: import xmlrunner diff --git a/python/pyspark/sql/tests/pandas/test_pandas_transform_with_state.py b/python/pyspark/sql/tests/pandas/test_pandas_transform_with_state.py index 8901f09e9272d..516a95a91a5e7 100644 --- a/python/pyspark/sql/tests/pandas/test_pandas_transform_with_state.py +++ b/python/pyspark/sql/tests/pandas/test_pandas_transform_with_state.py @@ -15,6 +15,7 @@ # limitations under the License. 
# +import json import os import time import tempfile @@ -26,14 +27,8 @@ from pyspark import SparkConf from pyspark.errors import PySparkRuntimeError -from pyspark.sql.functions import split -from pyspark.sql.types import ( - StringType, - StructType, - StructField, - Row, - IntegerType, -) +from pyspark.sql.functions import array_sort, col, explode, split +from pyspark.sql.types import StringType, StructType, StructField, Row, IntegerType, TimestampType from pyspark.testing import assertDataFrameEqual from pyspark.testing.sqlutils import ( ReusedSQLTestCase, @@ -61,6 +56,7 @@ def conf(cls): "org.apache.spark.sql.execution.streaming.state.RocksDBStateStoreProvider", ) cfg.set("spark.sql.execution.arrow.transformWithStateInPandas.maxRecordsPerBatch", "2") + cfg.set("spark.sql.session.timeZone", "UTC") return cfg def _prepare_input_data(self, input_path, col1, col2): @@ -104,9 +100,17 @@ def build_test_df_with_3_cols(self, input_path): return df_final def _test_transform_with_state_in_pandas_basic( - self, stateful_processor, check_results, single_batch=False, timeMode="None" + self, + stateful_processor, + check_results, + single_batch=False, + timeMode="None", + checkpoint_path=None, + initial_state=None, ): input_path = tempfile.mkdtemp() + if checkpoint_path is None: + checkpoint_path = tempfile.mkdtemp() self._prepare_test_resource1(input_path) if not single_batch: time.sleep(2) @@ -132,8 +136,10 @@ def _test_transform_with_state_in_pandas_basic( outputStructType=output_schema, outputMode="Update", timeMode=timeMode, + initialState=initial_state, ) .writeStream.queryName("this_query") + .option("checkpointLocation", checkpoint_path) .foreachBatch(check_results) .outputMode("update") .start() @@ -247,11 +253,15 @@ def check_results(batch_df, _): # test list state with ttl has the same behavior as list state when state doesn't expire. 
def test_transform_with_state_in_pandas_list_state_large_ttl(self): - def check_results(batch_df, _): - assert set(batch_df.sort("id").collect()) == { - Row(id="0", countAsString="2"), - Row(id="1", countAsString="2"), - } + def check_results(batch_df, batch_id): + if batch_id == 0: + assert set(batch_df.sort("id").collect()) == { + Row(id="0", countAsString="2"), + Row(id="1", countAsString="2"), + } + else: + for q in self.spark.streams.active: + q.stop() self._test_transform_with_state_in_pandas_basic( ListStateLargeTTLProcessor(), check_results, True, "processingTime" @@ -268,11 +278,15 @@ def check_results(batch_df, _): # test map state with ttl has the same behavior as map state when state doesn't expire. def test_transform_with_state_in_pandas_map_state_large_ttl(self): - def check_results(batch_df, _): - assert set(batch_df.sort("id").collect()) == { - Row(id="0", countAsString="2"), - Row(id="1", countAsString="2"), - } + def check_results(batch_df, batch_id): + if batch_id == 0: + assert set(batch_df.sort("id").collect()) == { + Row(id="0", countAsString="2"), + Row(id="1", countAsString="2"), + } + else: + for q in self.spark.streams.active: + q.stop() self._test_transform_with_state_in_pandas_basic( MapStateLargeTTLProcessor(), check_results, True, "processingTime" @@ -287,16 +301,22 @@ def check_results(batch_df, batch_id): Row(id="0", countAsString="2"), Row(id="1", countAsString="2"), } - else: + elif batch_id == 1: assert set(batch_df.sort("id").collect()) == { Row(id="0", countAsString="3"), Row(id="1", countAsString="2"), } + else: + for q in self.spark.streams.active: + q.stop() self._test_transform_with_state_in_pandas_basic( SimpleTTLStatefulProcessor(), check_results, False, "processingTime" ) + @unittest.skipIf( + "COVERAGE_PROCESS_START" in os.environ, "Flaky with coverage enabled, skipping for now." 
+ ) def test_value_state_ttl_expiration(self): def check_results(batch_df, batch_id): if batch_id == 0: @@ -348,8 +368,11 @@ def check_results(batch_df, batch_id): Row(id="ttl-map-state-count-1", count=3), ], ) + else: + for q in self.spark.streams.active: + q.stop() if batch_id == 0 or batch_id == 1: - time.sleep(6) + time.sleep(4) input_dir = tempfile.TemporaryDirectory() input_path = input_dir.name @@ -466,7 +489,7 @@ def check_results(batch_df, batch_id): ).first()["timeValues"] check_timestamp(batch_df) - else: + elif batch_id == 2: assert set(batch_df.sort("id").select("id", "countAsString").collect()) == { Row(id="0", countAsString="3"), Row(id="0", countAsString="-1"), @@ -480,6 +503,10 @@ def check_results(batch_df, batch_id): ).first()["timeValues"] assert current_batch_expired_timestamp > self.first_expired_timestamp + else: + for q in self.spark.streams.active: + q.stop() + self._test_transform_with_state_in_pandas_proc_timer( ProcTimeStatefulProcessor(), check_results ) @@ -546,31 +573,56 @@ def prepare_batch3(input_path): def test_transform_with_state_in_pandas_event_time(self): def check_results(batch_df, batch_id): if batch_id == 0: - assert set(batch_df.sort("id").collect()) == {Row(id="a", timestamp="20")} - elif batch_id == 1: + # watermark for late event = 0 + # watermark for eviction = 0 + # timer is registered with expiration time = 0, hence expired at the same batch assert set(batch_df.sort("id").collect()) == { Row(id="a", timestamp="20"), Row(id="a-expired", timestamp="0"), } + elif batch_id == 1: + # watermark for late event = 0 + # watermark for eviction = 10 (20 - 10) + # timer is registered with expiration time = 10, hence expired at the same batch + assert set(batch_df.sort("id").collect()) == { + Row(id="a", timestamp="4"), + Row(id="a-expired", timestamp="10000"), + } + elif batch_id == 2: + # watermark for late event = 10 + # watermark for eviction = 10 (unchanged as 4 < 10) + # timer is registered with expiration time = 10, hence 
expired at the same batch + assert set(batch_df.sort("id").collect()) == { + Row(id="a", timestamp="15"), + Row(id="a-expired", timestamp="10000"), + } else: - # watermark has not progressed, so timer registered in batch 1(watermark = 10) - # has not yet expired - assert set(batch_df.sort("id").collect()) == {Row(id="a", timestamp="15")} + for q in self.spark.streams.active: + q.stop() self._test_transform_with_state_in_pandas_event_time( EventTimeStatefulProcessor(), check_results ) - def _test_transform_with_state_init_state_in_pandas(self, stateful_processor, check_results): + def _test_transform_with_state_init_state_in_pandas( + self, + stateful_processor, + check_results, + time_mode="None", + checkpoint_path=None, + initial_state=None, + ): input_path = tempfile.mkdtemp() + if checkpoint_path is None: + checkpoint_path = tempfile.mkdtemp() self._prepare_test_resource1(input_path) time.sleep(2) self._prepare_input_data(input_path + "/text-test2.txt", [0, 3], [67, 12]) - df = self._build_test_df(input_path) - for q in self.spark.streams.active: q.stop() + + df = self._build_test_df(input_path) self.assertTrue(df.isStreaming) output_schema = StructType( @@ -580,8 +632,9 @@ def _test_transform_with_state_init_state_in_pandas(self, stateful_processor, ch ] ) - data = [("0", 789), ("3", 987)] - initial_state = self.spark.createDataFrame(data, "id string, initVal int").groupBy("id") + if initial_state is None: + data = [("0", 789), ("3", 987)] + initial_state = self.spark.createDataFrame(data, "id string, initVal int").groupBy("id") q = ( df.groupBy("id") @@ -589,10 +642,11 @@ def _test_transform_with_state_init_state_in_pandas(self, stateful_processor, ch statefulProcessor=stateful_processor, outputStructType=output_schema, outputMode="Update", - timeMode="None", + timeMode=time_mode, initialState=initial_state, ) .writeStream.queryName("this_query") + .option("checkpointLocation", checkpoint_path) .foreachBatch(check_results) .outputMode("update") .start() @@ 
-677,6 +731,9 @@ def check_results(batch_df, batch_id): Row(id1="0", id2="1", value=str(123 + 46)), Row(id1="1", id2="2", value=str(146 + 346)), } + else: + for q in self.spark.streams.active: + q.stop() self._test_transform_with_state_non_contiguous_grouping_cols( SimpleStatefulProcessorWithInitialState(), check_results @@ -690,6 +747,9 @@ def check_results(batch_df, batch_id): Row(id1="0", id2="1", value=str(789 + 123 + 46)), Row(id1="1", id2="2", value=str(146 + 346)), } + else: + for q in self.spark.streams.active: + q.stop() # grouping key of initial state is also not starting from the beginning of attributes data = [(789, "0", "1"), (987, "3", "2")] @@ -701,6 +761,539 @@ def check_results(batch_df, batch_id): SimpleStatefulProcessorWithInitialState(), check_results, initial_state ) + def _test_transform_with_state_in_pandas_chaining_ops( + self, stateful_processor, check_results, timeMode="None", grouping_cols=["outputTimestamp"] + ): + import pyspark.sql.functions as f + + input_path = tempfile.mkdtemp() + self._prepare_input_data(input_path + "/text-test3.txt", ["a", "b"], [10, 15]) + time.sleep(2) + self._prepare_input_data(input_path + "/text-test4.txt", ["a", "c"], [11, 25]) + time.sleep(2) + self._prepare_input_data(input_path + "/text-test1.txt", ["a"], [5]) + + df = self._build_test_df(input_path) + df = df.select( + "id", f.from_unixtime(f.col("temperature")).alias("eventTime").cast("timestamp") + ).withWatermark("eventTime", "5 seconds") + + for q in self.spark.streams.active: + q.stop() + self.assertTrue(df.isStreaming) + + output_schema = StructType( + [ + StructField("id", StringType(), True), + StructField("outputTimestamp", TimestampType(), True), + ] + ) + + q = ( + df.groupBy("id") + .transformWithStateInPandas( + statefulProcessor=stateful_processor, + outputStructType=output_schema, + outputMode="Append", + timeMode=timeMode, + eventTimeColumnName="outputTimestamp", + ) + .groupBy(grouping_cols) + .count() + 
.writeStream.queryName("chaining_ops_query") + .foreachBatch(check_results) + .outputMode("append") + .start() + ) + + self.assertEqual(q.name, "chaining_ops_query") + self.assertTrue(q.isActive) + q.processAllAvailable() + q.awaitTermination(10) + + def test_transform_with_state_in_pandas_chaining_ops(self): + def check_results(batch_df, batch_id): + import datetime + + if batch_id == 0: + assert batch_df.isEmpty() + elif batch_id == 1: + # eviction watermark = 15 - 5 = 10 (max event time from batch 0), + # late event watermark = 0 (eviction event time from batch 0) + assert set( + batch_df.sort("outputTimestamp").select("outputTimestamp", "count").collect() + ) == { + Row(outputTimestamp=datetime.datetime(1970, 1, 1, 0, 0, 10), count=1), + } + elif batch_id == 2: + # eviction watermark = 25 - 5 = 20, late event watermark = 10; + # row with watermark=5<10 is dropped so it does not show up in the results; + # row with eventTime<=20 are finalized and emitted + assert set( + batch_df.sort("outputTimestamp").select("outputTimestamp", "count").collect() + ) == { + Row(outputTimestamp=datetime.datetime(1970, 1, 1, 0, 0, 11), count=1), + Row(outputTimestamp=datetime.datetime(1970, 1, 1, 0, 0, 15), count=1), + } + + self._test_transform_with_state_in_pandas_chaining_ops( + StatefulProcessorChainingOps(), check_results, "eventTime" + ) + self._test_transform_with_state_in_pandas_chaining_ops( + StatefulProcessorChainingOps(), check_results, "eventTime", ["outputTimestamp", "id"] + ) + + def test_transform_with_state_init_state_with_timers(self): + def check_results(batch_df, batch_id): + if batch_id == 0: + # timers are registered and handled in the first batch for + # rows in initial state; For key=0 and key=3 which contains + # expired timers, both should be handled by handleExpiredTimers + # regardless of whether key exists in the data rows or not + expired_df = batch_df.filter(batch_df["id"].contains("expired")) + data_df = 
batch_df.filter(~batch_df["id"].contains("expired")) + assert set(expired_df.sort("id").select("id").collect()) == { + Row(id="0-expired"), + Row(id="3-expired"), + } + assert set(data_df.sort("id").collect()) == { + Row(id="0", value=str(789 + 123 + 46)), + Row(id="1", value=str(146 + 346)), + } + elif batch_id == 1: + # handleInitialState is only processed in the first batch, + # no more timer is registered so no more expired timers + assert set(batch_df.sort("id").collect()) == { + Row(id="0", value=str(789 + 123 + 46 + 67)), + Row(id="3", value=str(987 + 12)), + } + else: + for q in self.spark.streams.active: + q.stop() + + self._test_transform_with_state_init_state_in_pandas( + StatefulProcessorWithInitialStateTimers(), check_results, "processingTime" + ) + + def test_transform_with_state_in_pandas_batch_query(self): + data = [("0", 123), ("0", 46), ("1", 146), ("1", 346)] + df = self.spark.createDataFrame(data, "id string, temperature int") + + output_schema = StructType( + [ + StructField("id", StringType(), True), + StructField("countAsString", StringType(), True), + ] + ) + batch_result = df.groupBy("id").transformWithStateInPandas( + statefulProcessor=MapStateProcessor(), + outputStructType=output_schema, + outputMode="Update", + timeMode="None", + ) + assert set(batch_result.sort("id").collect()) == { + Row(id="0", countAsString="2"), + Row(id="1", countAsString="2"), + } + + def test_transform_with_state_in_pandas_batch_query_initial_state(self): + data = [("0", 123), ("0", 46), ("1", 146), ("1", 346)] + df = self.spark.createDataFrame(data, "id string, temperature int") + + init_data = [("0", 789), ("3", 987)] + initial_state = self.spark.createDataFrame(init_data, "id string, initVal int").groupBy( + "id" + ) + + output_schema = StructType( + [ + StructField("id", StringType(), True), + StructField("value", StringType(), True), + ] + ) + batch_result = df.groupBy("id").transformWithStateInPandas( + 
statefulProcessor=SimpleStatefulProcessorWithInitialState(), + outputStructType=output_schema, + outputMode="Update", + timeMode="None", + initialState=initial_state, + ) + assert set(batch_result.sort("id").collect()) == { + Row(id="0", value=str(789 + 123 + 46)), + Row(id="1", value=str(146 + 346)), + } + + # This test covers mapState with TTL, an empty state variable + # and additional test against initial state python runner + @unittest.skipIf( + "COVERAGE_PROCESS_START" in os.environ, "Flaky with coverage enabled, skipping for now." + ) + def test_transform_with_map_state_metadata(self): + checkpoint_path = tempfile.mktemp() + + def check_results(batch_df, batch_id): + if batch_id == 0: + assert set(batch_df.sort("id").collect()) == { + Row(id="0", countAsString="2"), + Row(id="1", countAsString="2"), + } + else: + # check for state metadata source + metadata_df = self.spark.read.format("state-metadata").load(checkpoint_path) + assert set( + metadata_df.select( + "operatorId", + "operatorName", + "stateStoreName", + "numPartitions", + "minBatchId", + "maxBatchId", + ).collect() + ) == { + Row( + operatorId=0, + operatorName="transformWithStateInPandasExec", + stateStoreName="default", + numPartitions=5, + minBatchId=0, + maxBatchId=0, + ) + } + operator_properties_json_obj = json.loads( + metadata_df.select("operatorProperties").collect()[0][0] + ) + assert operator_properties_json_obj["timeMode"] == "ProcessingTime" + assert operator_properties_json_obj["outputMode"] == "Update" + + state_var_list = operator_properties_json_obj["stateVariables"] + assert len(state_var_list) == 3 + for state_var in state_var_list: + if state_var["stateName"] == "mapState": + assert state_var["stateVariableType"] == "MapState" + assert state_var["ttlEnabled"] + elif state_var["stateName"] == "listState": + assert state_var["stateVariableType"] == "ListState" + assert not state_var["ttlEnabled"] + else: + assert state_var["stateName"] == "$procTimers_keyToTimestamp" + assert 
state_var["stateVariableType"] == "TimerState" + + # check for state data source + map_state_df = ( + self.spark.read.format("statestore") + .option("path", checkpoint_path) + .option("stateVarName", "mapState") + .load() + ) + assert map_state_df.selectExpr( + "key.id AS groupingKey", + "user_map_key.name AS mapKey", + "user_map_value.value.count AS mapValue", + ).sort("groupingKey").collect() == [ + Row(groupingKey="0", mapKey="key2", mapValue=2), + Row(groupingKey="1", mapKey="key2", mapValue=2), + ] + + # check for map state with flatten option + map_state_df_non_flatten = ( + self.spark.read.format("statestore") + .option("path", checkpoint_path) + .option("stateVarName", "mapState") + .option("flattenCollectionTypes", False) + .load() + ) + assert map_state_df_non_flatten.select( + "key.id", explode(col("map_value")).alias("map_key", "map_value") + ).selectExpr( + "id AS groupingKey", + "map_key.name AS mapKey", + "map_value.value.count AS mapValue", + ).sort( + "groupingKey" + ).collect() == [ + Row(groupingKey="0", mapKey="key2", mapValue=2), + Row(groupingKey="1", mapKey="key2", mapValue=2), + ] + + ttl_df = map_state_df.selectExpr( + "user_map_value.ttlExpirationMs AS TTLVal" + ).collect() + # check if there are two rows containing TTL value in map state dataframe + assert len(ttl_df) == 2 + # check if two rows are of the same TTL value + assert len(set(ttl_df)) == 1 + + list_state_df = ( + self.spark.read.format("statestore") + .option("path", checkpoint_path) + .option("stateVarName", "listState") + .load() + ) + assert list_state_df.isEmpty() + + for q in self.spark.streams.active: + q.stop() + + self._test_transform_with_state_in_pandas_basic( + MapStateLargeTTLProcessor(), + check_results, + True, + "processingTime", + checkpoint_path=checkpoint_path, + initial_state=None, + ) + + # run the same test suite again but with no-op initial state + # TWS with initial state is using a different python runner + init_data = [("0", 789), ("3", 987)] + 
initial_state = self.spark.createDataFrame(init_data, "id string, temperature int").groupBy( + "id" + ) + self._test_transform_with_state_in_pandas_basic( + MapStateLargeTTLProcessor(), + check_results, + True, + "processingTime", + checkpoint_path=checkpoint_path, + initial_state=initial_state, + ) + + # This test covers multiple list state variables and flatten option + def test_transform_with_list_state_metadata(self): + checkpoint_path = tempfile.mktemp() + + def check_results(batch_df, batch_id): + if batch_id == 0: + assert set(batch_df.sort("id").collect()) == { + Row(id="0", countAsString="2"), + Row(id="1", countAsString="2"), + } + else: + # check for state metadata source + metadata_df = self.spark.read.format("state-metadata").load(checkpoint_path) + operator_properties_json_obj = json.loads( + metadata_df.select("operatorProperties").collect()[0][0] + ) + state_var_list = operator_properties_json_obj["stateVariables"] + assert len(state_var_list) == 3 + for state_var in state_var_list: + if state_var["stateName"] in ["listState1", "listState2"]: + state_var["stateVariableType"] == "ListState" + else: + assert state_var["stateName"] == "$procTimers_keyToTimestamp" + assert state_var["stateVariableType"] == "TimerState" + + # check for state data source and flatten option + list_state_1_df = ( + self.spark.read.format("statestore") + .option("path", checkpoint_path) + .option("stateVarName", "listState1") + .option("flattenCollectionTypes", True) + .load() + ) + assert list_state_1_df.selectExpr( + "key.id AS groupingKey", + "list_element.temperature AS listElement", + ).sort("groupingKey", "listElement").collect() == [ + Row(groupingKey="0", listElement=20), + Row(groupingKey="0", listElement=20), + Row(groupingKey="0", listElement=111), + Row(groupingKey="0", listElement=120), + Row(groupingKey="0", listElement=120), + Row(groupingKey="1", listElement=20), + Row(groupingKey="1", listElement=20), + Row(groupingKey="1", listElement=111), + 
Row(groupingKey="1", listElement=120), + Row(groupingKey="1", listElement=120), + ] + + list_state_2_df = ( + self.spark.read.format("statestore") + .option("path", checkpoint_path) + .option("stateVarName", "listState2") + .option("flattenCollectionTypes", False) + .load() + ) + assert list_state_2_df.selectExpr( + "key.id AS groupingKey", "list_value.temperature AS valueList" + ).sort("groupingKey").withColumn( + "valueSortedList", array_sort(col("valueList")) + ).select( + "groupingKey", "valueSortedList" + ).collect() == [ + Row(groupingKey="0", valueSortedList=[20, 20, 120, 120, 222]), + Row(groupingKey="1", valueSortedList=[20, 20, 120, 120, 222]), + ] + + for q in self.spark.streams.active: + q.stop() + + self._test_transform_with_state_in_pandas_basic( + ListStateProcessor(), + check_results, + True, + "processingTime", + checkpoint_path=checkpoint_path, + initial_state=None, + ) + + # This test covers value state variable and read change feed, + # snapshotStartBatchId related options + def test_transform_with_value_state_metadata(self): + checkpoint_path = tempfile.mktemp() + + def check_results(batch_df, batch_id): + if batch_id == 0: + assert set(batch_df.sort("id").collect()) == { + Row(id="0", countAsString="2"), + Row(id="1", countAsString="2"), + } + else: + assert set(batch_df.sort("id").collect()) == { + Row(id="0", countAsString="3"), + Row(id="1", countAsString="2"), + } + + # check for state metadata source + metadata_df = self.spark.read.format("state-metadata").load(checkpoint_path) + operator_properties_json_obj = json.loads( + metadata_df.select("operatorProperties").collect()[0][0] + ) + state_var_list = operator_properties_json_obj["stateVariables"] + + assert len(state_var_list) == 3 + for state_var in state_var_list: + if state_var["stateName"] in ["numViolations", "tempState"]: + state_var["stateVariableType"] == "ValueState" + else: + assert state_var["stateName"] == "$procTimers_keyToTimestamp" + assert state_var["stateVariableType"] 
== "TimerState" + + # check for state data source and readChangeFeed + value_state_df = ( + self.spark.read.format("statestore") + .option("path", checkpoint_path) + .option("stateVarName", "numViolations") + .option("readChangeFeed", True) + .option("changeStartBatchId", 0) + .load() + ).selectExpr( + "change_type", "key.id AS groupingKey", "value.value AS value", "partition_id" + ) + + assert value_state_df.select("change_type", "groupingKey", "value").sort( + "groupingKey" + ).collect() == [ + Row(change_type="update", groupingKey="0", value=1), + Row(change_type="update", groupingKey="1", value=2), + ] + + partition_id_list = [ + row["partition_id"] for row in value_state_df.select("partition_id").collect() + ] + + for partition_id in partition_id_list: + # check for state data source and snapshotStartBatchId options + state_snapshot_df = ( + self.spark.read.format("statestore") + .option("path", checkpoint_path) + .option("stateVarName", "numViolations") + .option("snapshotPartitionId", partition_id) + .option("snapshotStartBatchId", 0) + .load() + ) + + assert ( + value_state_df.select("partition_id", "groupingKey", "value") + .filter(value_state_df["partition_id"] == partition_id) + .sort("groupingKey") + .collect() + == state_snapshot_df.selectExpr( + "partition_id", "key.id AS groupingKey", "value.value AS value" + ) + .sort("groupingKey") + .collect() + ) + + for q in self.spark.streams.active: + q.stop() + + with self.sql_conf( + {"spark.sql.streaming.stateStore.rocksdb.changelogCheckpointing.enabled": "true"} + ): + self._test_transform_with_state_in_pandas_basic( + SimpleStatefulProcessor(), + check_results, + False, + "processingTime", + checkpoint_path=checkpoint_path, + ) + + def test_transform_with_state_restart_with_multiple_rows_init_state(self): + def check_results(batch_df, _): + assert set(batch_df.sort("id").collect()) == { + Row(id="0", countAsString="2"), + Row(id="1", countAsString="2"), + } + + def check_results_for_new_query(batch_df, 
batch_id): + if batch_id == 0: + assert set(batch_df.sort("id").collect()) == { + Row(id="0", value=str(123 + 46)), + Row(id="1", value=str(146 + 346)), + } + else: + assert set(batch_df.sort("id").collect()) == { + Row(id="0", value=str(123 + 46 + 67)), + Row(id="3", value=str(12)), + } + # verify values in initial state is appended into list state for all keys + df = ( + self.spark.read.format("statestore") + .option("path", new_checkpoint_path) + .option("stateVarName", "list_state") + .load() + ).selectExpr("key.id AS id", "list_element.value AS value") + + def dataframe_to_value_list(output_df): + return [ + row["value"] for row in output_df.sort("value").select("value").collect() + ] + + assert dataframe_to_value_list(df.filter(df.id == "0")) == [20, 20, 111, 120, 120] + assert dataframe_to_value_list(df.filter(df.id == "1")) == [20, 20, 111, 120, 120] + + # run a tws query and read state data source dataframe from its checkpoint + checkpoint_path = tempfile.mkdtemp() + self._test_transform_with_state_in_pandas_basic( + ListStateProcessor(), check_results, True, checkpoint_path=checkpoint_path + ) + list_state_df = ( + self.spark.read.format("statestore") + .option("path", checkpoint_path) + .option("stateVarName", "listState1") + .load() + ).selectExpr("key.id AS id", "list_element.temperature AS initVal") + init_df = list_state_df.groupBy("id") + + # run a new tws query and pass state data source dataframe as initial state + # multiple rows exist in the initial state with the same grouping key + new_checkpoint_path = tempfile.mkdtemp() + self._test_transform_with_state_init_state_in_pandas( + StatefulProcessorWithListStateInitialState(), + check_results_for_new_query, + checkpoint_path=new_checkpoint_path, + initial_state=init_df, + ) + + # run the same test suites again but with single shuffle partition + def test_transform_with_state_with_timers_single_partition(self): + with self.sql_conf({"spark.sql.shuffle.partitions": "1"}): + 
self.test_transform_with_state_init_state_with_timers() + self.test_transform_with_state_in_pandas_event_time() + self.test_transform_with_state_in_pandas_proc_timer() + self.test_transform_with_state_restart_with_multiple_rows_init_state() + class SimpleStatefulProcessorWithInitialState(StatefulProcessor): # this dict is the same as input initial state dataframe @@ -709,10 +1302,9 @@ class SimpleStatefulProcessorWithInitialState(StatefulProcessor): def init(self, handle: StatefulProcessorHandle) -> None: state_schema = StructType([StructField("value", IntegerType(), True)]) self.value_state = handle.getValueState("value_state", state_schema) + self.handle = handle - def handleInputRows( - self, key, rows, timer_values, expired_timer_info - ) -> Iterator[pd.DataFrame]: + def handleInputRows(self, key, rows, timer_values) -> Iterator[pd.DataFrame]: exists = self.value_state.exists() if exists: value_row = self.value_state.get() @@ -735,7 +1327,7 @@ def handleInputRows( else: yield pd.DataFrame({"id": key, "value": str(accumulated_value)}) - def handleInitialState(self, key, initialState) -> None: + def handleInitialState(self, key, initialState, timer_values) -> None: init_val = initialState.at[0, "initVal"] self.value_state.update((init_val,)) if len(key) == 1: @@ -745,6 +1337,30 @@ def close(self) -> None: pass +class StatefulProcessorWithInitialStateTimers(SimpleStatefulProcessorWithInitialState): + def handleExpiredTimer(self, key, timer_values, expired_timer_info) -> Iterator[pd.DataFrame]: + self.handle.deleteTimer(expired_timer_info.get_expiry_time_in_ms()) + str_key = f"{str(key[0])}-expired" + yield pd.DataFrame( + {"id": (str_key,), "value": str(expired_timer_info.get_expiry_time_in_ms())} + ) + + def handleInitialState(self, key, initialState, timer_values) -> None: + super().handleInitialState(key, initialState, timer_values) + self.handle.registerTimer(timer_values.get_current_processing_time_in_ms() - 1) + + +class 
StatefulProcessorWithListStateInitialState(SimpleStatefulProcessorWithInitialState): + def init(self, handle: StatefulProcessorHandle) -> None: + super().init(handle) + list_ele_schema = StructType([StructField("value", IntegerType(), True)]) + self.list_state = handle.getListState("list_state", list_ele_schema) + + def handleInitialState(self, key, initialState, timer_values) -> None: + for val in initialState["initVal"].tolist(): + self.list_state.append_value((val,)) + + # A stateful processor that output the max event time it has seen. Register timer for # current watermark. Clear max state if timer expires. class EventTimeStatefulProcessor(StatefulProcessor): @@ -753,33 +1369,30 @@ def init(self, handle: StatefulProcessorHandle) -> None: self.handle = handle self.max_state = handle.getValueState("max_state", state_schema) - def handleInputRows( - self, key, rows, timer_values, expired_timer_info - ) -> Iterator[pd.DataFrame]: - if expired_timer_info.is_valid(): - self.max_state.clear() - self.handle.deleteTimer(expired_timer_info.get_expiry_time_in_ms()) - str_key = f"{str(key[0])}-expired" - yield pd.DataFrame( - {"id": (str_key,), "timestamp": str(expired_timer_info.get_expiry_time_in_ms())} - ) + def handleExpiredTimer(self, key, timer_values, expired_timer_info) -> Iterator[pd.DataFrame]: + self.max_state.clear() + self.handle.deleteTimer(expired_timer_info.get_expiry_time_in_ms()) + str_key = f"{str(key[0])}-expired" + yield pd.DataFrame( + {"id": (str_key,), "timestamp": str(expired_timer_info.get_expiry_time_in_ms())} + ) - else: - timestamp_list = [] - for pdf in rows: - # int64 will represent timestamp in nanosecond, restore to second - timestamp_list.extend((pdf["eventTime"].astype("int64") // 10**9).tolist()) + def handleInputRows(self, key, rows, timer_values) -> Iterator[pd.DataFrame]: + timestamp_list = [] + for pdf in rows: + # int64 will represent timestamp in nanosecond, restore to second + 
timestamp_list.extend((pdf["eventTime"].astype("int64") // 10**9).tolist()) - if self.max_state.exists(): - cur_max = int(self.max_state.get()[0]) - else: - cur_max = 0 - max_event_time = str(max(cur_max, max(timestamp_list))) + if self.max_state.exists(): + cur_max = int(self.max_state.get()[0]) + else: + cur_max = 0 + max_event_time = str(max(cur_max, max(timestamp_list))) - self.max_state.update((max_event_time,)) - self.handle.registerTimer(timer_values.get_current_watermark_in_ms()) + self.max_state.update((max_event_time,)) + self.handle.registerTimer(timer_values.get_current_watermark_in_ms()) - yield pd.DataFrame({"id": key, "timestamp": max_event_time}) + yield pd.DataFrame({"id": key, "timestamp": max_event_time}) def close(self) -> None: pass @@ -793,54 +1406,49 @@ def init(self, handle: StatefulProcessorHandle) -> None: self.handle = handle self.count_state = handle.getValueState("count_state", state_schema) - def handleInputRows( - self, key, rows, timer_values, expired_timer_info - ) -> Iterator[pd.DataFrame]: - if expired_timer_info.is_valid(): - # reset count state each time the timer is expired - timer_list_1 = [e for e in self.handle.listTimers()] - timer_list_2 = [] - idx = 0 - for e in self.handle.listTimers(): - timer_list_2.append(e) - # check multiple iterator on the same grouping key works - assert timer_list_2[idx] == timer_list_1[idx] - idx += 1 - - if len(timer_list_1) > 0: - # before deleting the expiring timers, there are 2 timers - - # one timer we just registered, and one that is going to be deleted - assert len(timer_list_1) == 2 - self.count_state.clear() - self.handle.deleteTimer(expired_timer_info.get_expiry_time_in_ms()) - yield pd.DataFrame( - { - "id": key, - "countAsString": str("-1"), - "timeValues": str(expired_timer_info.get_expiry_time_in_ms()), - } - ) + def handleExpiredTimer(self, key, timer_values, expired_timer_info) -> Iterator[pd.DataFrame]: + # reset count state each time the timer is expired + timer_list_1 = [e 
for e in self.handle.listTimers()] + timer_list_2 = [] + idx = 0 + for e in self.handle.listTimers(): + timer_list_2.append(e) + # check multiple iterator on the same grouping key works + assert timer_list_2[idx] == timer_list_1[idx] + idx += 1 + + if len(timer_list_1) > 0: + assert len(timer_list_1) == 2 + self.count_state.clear() + self.handle.deleteTimer(expired_timer_info.get_expiry_time_in_ms()) + yield pd.DataFrame( + { + "id": key, + "countAsString": str("-1"), + "timeValues": str(expired_timer_info.get_expiry_time_in_ms()), + } + ) + def handleInputRows(self, key, rows, timer_values) -> Iterator[pd.DataFrame]: + if not self.count_state.exists(): + count = 0 else: - if not self.count_state.exists(): - count = 0 - else: - count = int(self.count_state.get()[0]) + count = int(self.count_state.get()[0]) - if key == ("0",): - self.handle.registerTimer(timer_values.get_current_processing_time_in_ms()) + if key == ("0",): + self.handle.registerTimer(timer_values.get_current_processing_time_in_ms() + 1) - rows_count = 0 - for pdf in rows: - pdf_count = len(pdf) - rows_count += pdf_count + rows_count = 0 + for pdf in rows: + pdf_count = len(pdf) + rows_count += pdf_count - count = count + rows_count + count = count + rows_count - self.count_state.update((str(count),)) - timestamp = str(timer_values.get_current_processing_time_in_ms()) + self.count_state.update((str(count),)) + timestamp = str(timer_values.get_current_processing_time_in_ms()) - yield pd.DataFrame({"id": key, "countAsString": str(count), "timeValues": timestamp}) + yield pd.DataFrame({"id": key, "countAsString": str(count), "timeValues": timestamp}) def close(self) -> None: pass @@ -851,14 +1459,13 @@ class SimpleStatefulProcessor(StatefulProcessor, unittest.TestCase): batch_id = 0 def init(self, handle: StatefulProcessorHandle) -> None: + # Test both string type and struct type schemas + self.num_violations_state = handle.getValueState("numViolations", "value int") state_schema = 
StructType([StructField("value", IntegerType(), True)]) - self.num_violations_state = handle.getValueState("numViolations", state_schema) self.temp_state = handle.getValueState("tempState", state_schema) handle.deleteIfExists("tempState") - def handleInputRows( - self, key, rows, timer_values, expired_timer_info - ) -> Iterator[pd.DataFrame]: + def handleInputRows(self, key, rows, timer_values) -> Iterator[pd.DataFrame]: with self.assertRaisesRegex(PySparkRuntimeError, "Error checking value state exists"): self.temp_state.exists() new_violations = 0 @@ -886,6 +1493,19 @@ def close(self) -> None: pass +class StatefulProcessorChainingOps(StatefulProcessor): + def init(self, handle: StatefulProcessorHandle) -> None: + pass + + def handleInputRows(self, key, rows, timer_values) -> Iterator[pd.DataFrame]: + for pdf in rows: + timestamp_list = pdf["eventTime"].tolist() + yield pd.DataFrame({"id": key, "outputTimestamp": timestamp_list[0]}) + + def close(self) -> None: + pass + + # A stateful processor that inherit all behavior of SimpleStatefulProcessor except that it use # ttl state with a large timeout. 
class SimpleTTLStatefulProcessor(SimpleStatefulProcessor, unittest.TestCase): @@ -907,9 +1527,7 @@ def init(self, handle: StatefulProcessorHandle) -> None: "ttl-map-state", user_key_schema, state_schema, 10000 ) - def handleInputRows( - self, key, rows, timer_values, expired_timer_info - ) -> Iterator[pd.DataFrame]: + def handleInputRows(self, key, rows, timer_values) -> Iterator[pd.DataFrame]: count = 0 ttl_count = 0 ttl_list_state_count = 0 @@ -959,9 +1577,7 @@ def init(self, handle: StatefulProcessorHandle) -> None: state_schema = StructType([StructField("value", IntegerType(), True)]) self.num_violations_state = handle.getValueState("numViolations", state_schema) - def handleInputRows( - self, key, rows, timer_values, expired_timer_info - ) -> Iterator[pd.DataFrame]: + def handleInputRows(self, key, rows, timer_values) -> Iterator[pd.DataFrame]: count = 0 exists = self.num_violations_state.exists() assert not exists @@ -985,9 +1601,7 @@ def init(self, handle: StatefulProcessorHandle) -> None: self.list_state1 = handle.getListState("listState1", state_schema) self.list_state2 = handle.getListState("listState2", state_schema) - def handleInputRows( - self, key, rows, timer_values, expired_timer_info - ) -> Iterator[pd.DataFrame]: + def handleInputRows(self, key, rows, timer_values) -> Iterator[pd.DataFrame]: count = 0 for pdf in rows: list_state_rows = [(120,), (20,)] @@ -1038,13 +1652,10 @@ def init(self, handle: StatefulProcessorHandle) -> None: class MapStateProcessor(StatefulProcessor): def init(self, handle: StatefulProcessorHandle): - key_schema = StructType([StructField("name", StringType(), True)]) - value_schema = StructType([StructField("count", IntegerType(), True)]) - self.map_state = handle.getMapState("mapState", key_schema, value_schema) + # Test string type schemas + self.map_state = handle.getMapState("mapState", "name string", "count int") - def handleInputRows( - self, key, rows, timer_values, expired_timer_info - ) -> Iterator[pd.DataFrame]: + 
def handleInputRows(self, key, rows, timer_values) -> Iterator[pd.DataFrame]: count = 0 key1 = ("key1",) key2 = ("key2",) @@ -1084,6 +1695,7 @@ def init(self, handle: StatefulProcessorHandle) -> None: key_schema = StructType([StructField("name", StringType(), True)]) value_schema = StructType([StructField("count", IntegerType(), True)]) self.map_state = handle.getMapState("mapState", key_schema, value_schema, 30000) + self.list_state = handle.getListState("listState", key_schema) class TransformWithStateInPandasTests(TransformWithStateInPandasTestsMixin, ReusedSQLTestCase): diff --git a/python/pyspark/sql/tests/plot/test_frame_plot_plotly.py b/python/pyspark/sql/tests/plot/test_frame_plot_plotly.py index fd264c3488823..3dafd71c1a329 100644 --- a/python/pyspark/sql/tests/plot/test_frame_plot_plotly.py +++ b/python/pyspark/sql/tests/plot/test_frame_plot_plotly.py @@ -301,6 +301,7 @@ def test_area_plot(self): self._check_fig_data(fig["data"][2], **expected_fig_data) def test_pie_plot(self): + # single column as 'y' fig = self.sdf3.plot(kind="pie", x="date", y="sales") expected_x = [ datetime(2018, 1, 31, 0, 0), @@ -308,13 +309,39 @@ def test_pie_plot(self): datetime(2018, 3, 31, 0, 0), datetime(2018, 4, 30, 0, 0), ] - expected_fig_data = { + expected_fig_data_sales = { "name": "", "labels": expected_x, "values": [3, 2, 3, 9], "type": "pie", } - self._check_fig_data(fig["data"][0], **expected_fig_data) + self._check_fig_data(fig["data"][0], **expected_fig_data_sales) + + # all numeric columns as 'y' + expected_fig_data_signups = { + "name": "", + "labels": expected_x, + "values": [5, 5, 6, 12], + "type": "pie", + } + expected_fig_data_visits = { + "name": "", + "labels": expected_x, + "values": [20, 42, 28, 62], + "type": "pie", + } + fig = self.sdf3.plot(kind="pie", x="date", subplots=True) + self._check_fig_data(fig["data"][0], **expected_fig_data_sales) + self._check_fig_data(fig["data"][1], **expected_fig_data_signups) + self._check_fig_data(fig["data"][2], 
**expected_fig_data_visits) + + # not specify subplots + with self.assertRaises(PySparkValueError) as pe: + self.sdf3.plot(kind="pie", x="date") + + self.check_error( + exception=pe.exception, errorClass="UNSUPPORTED_PIE_PLOT_PARAM", messageParameters={} + ) # y is not a numerical column with self.assertRaises(PySparkTypeError) as pe: @@ -322,8 +349,12 @@ def test_pie_plot(self): self.check_error( exception=pe.exception, - errorClass="PLOT_NOT_NUMERIC_COLUMN_ARGUMENT", - messageParameters={"arg_name": "y", "arg_type": "StringType"}, + errorClass="PLOT_INVALID_TYPE_COLUMN", + messageParameters={ + "col_name": "category", + "valid_types": "NumericType", + "col_type": "StringType", + }, ) def test_box_plot(self): diff --git a/python/pyspark/sql/tests/streaming/test_streaming_foreach_batch.py b/python/pyspark/sql/tests/streaming/test_streaming_foreach_batch.py index de8f30baebca5..9db66aa252ee6 100644 --- a/python/pyspark/sql/tests/streaming/test_streaming_foreach_batch.py +++ b/python/pyspark/sql/tests/streaming/test_streaming_foreach_batch.py @@ -146,7 +146,7 @@ def func(df: DataFrame, batch_id: int): def my_test_function_2(): return 2 - def test_streaming_foreach_batch_fuction_calling(self): + def test_streaming_foreach_batch_function_calling(self): def my_test_function_3(): return 3 diff --git a/python/pyspark/sql/tests/test_connect_compatibility.py b/python/pyspark/sql/tests/test_connect_compatibility.py index 3d74e796cd7a0..4ac68292b4020 100644 --- a/python/pyspark/sql/tests/test_connect_compatibility.py +++ b/python/pyspark/sql/tests/test_connect_compatibility.py @@ -264,18 +264,11 @@ def test_spark_session_compatibility(self): expected_missing_connect_methods = { "addArtifact", "addArtifacts", - "addTag", "clearProgressHandlers", - "clearTags", "copyFromLocalToFs", - "getTags", - "interruptAll", - "interruptOperation", - "interruptTag", "newSession", "registerProgressHandler", "removeProgressHandler", - "removeTag", } expected_missing_classic_methods = set() 
self.check_compatibility( diff --git a/python/pyspark/sql/tests/test_dataframe.py b/python/pyspark/sql/tests/test_dataframe.py index cd6a57429cfa9..e85877cc87e09 100644 --- a/python/pyspark/sql/tests/test_dataframe.py +++ b/python/pyspark/sql/tests/test_dataframe.py @@ -1044,6 +1044,38 @@ def test_transpose(self): messageParameters={"dt1": '"STRING"', "dt2": '"BIGINT"'}, ) + def test_transpose_with_invalid_index_columns(self): + # SPARK-50602: invalid index columns + df = self.spark.createDataFrame([{"a": "x", "b": "y", "c": "z"}]) + + with self.assertRaises(AnalysisException) as pe: + df.transpose(col("a") + 1).collect() + self.check_error( + exception=pe.exception, + errorClass="TRANSPOSE_INVALID_INDEX_COLUMN", + messageParameters={"reason": "Index column must be an atomic attribute"}, + ) + + def test_metadata_column(self): + with self.sql_conf( + {"spark.sql.catalog.testcat": "org.apache.spark.sql.connector.catalog.InMemoryCatalog"} + ): + tbl = "testcat.t" + with self.table(tbl): + self.spark.sql( + f""" + CREATE TABLE {tbl} (index bigint, data string) + PARTITIONED BY (bucket(4, index), index) + """ + ) + self.spark.sql(f"""INSERT INTO {tbl} VALUES (1, 'a'), (2, 'b'), (3, 'c')""") + + df = self.spark.sql(f"""SELECT * FROM {tbl}""") + assertDataFrameEqual( + df.select(df.metadataColumn("index")), + [Row(0), Row(0), Row(0)], + ) + class DataFrameTests(DataFrameTestsMixin, ReusedSQLTestCase): def test_query_execution_unsupported_in_classic(self): diff --git a/python/pyspark/sql/tests/test_functions.py b/python/pyspark/sql/tests/test_functions.py index cf8f685ea4499..39db72b235bf9 100644 --- a/python/pyspark/sql/tests/test_functions.py +++ b/python/pyspark/sql/tests/test_functions.py @@ -30,8 +30,9 @@ from pyspark.sql.avro.functions import from_avro, to_avro from pyspark.sql.column import Column from pyspark.sql.functions.builtin import nullifzero, randstr, uniform, zeroifnull +from pyspark.sql.types import StructType, StructField, StringType from 
pyspark.testing.sqlutils import ReusedSQLTestCase, SQLTestUtils -from pyspark.testing.utils import have_numpy +from pyspark.testing.utils import have_numpy, assertDataFrameEqual class FunctionsTestsMixin: @@ -338,29 +339,29 @@ def test_try_parse_url(self): [("https://spark.apache.org/path?query=1", "QUERY", "query")], ["url", "part", "key"], ) - actual = df.select(F.try_parse_url(df.url, df.part, df.key)).collect() - self.assertEqual(actual, [Row("1")]) + actual = df.select(F.try_parse_url(df.url, df.part, df.key)) + assertDataFrameEqual(actual, [Row("1")]) df = self.spark.createDataFrame( [("inva lid://spark.apache.org/path?query=1", "QUERY", "query")], ["url", "part", "key"], ) - actual = df.select(F.try_parse_url(df.url, df.part, df.key)).collect() - self.assertEqual(actual, [Row(None)]) + actual = df.select(F.try_parse_url(df.url, df.part, df.key)) + assertDataFrameEqual(actual, [Row(None)]) def test_try_make_timestamp(self): data = [(2024, 5, 22, 10, 30, 0)] df = self.spark.createDataFrame(data, ["year", "month", "day", "hour", "minute", "second"]) actual = df.select( F.try_make_timestamp(df.year, df.month, df.day, df.hour, df.minute, df.second) - ).collect() - self.assertEqual(actual, [Row(datetime.datetime(2024, 5, 22, 10, 30))]) + ) + assertDataFrameEqual(actual, [Row(datetime.datetime(2024, 5, 22, 10, 30))]) data = [(2024, 13, 22, 10, 30, 0)] df = self.spark.createDataFrame(data, ["year", "month", "day", "hour", "minute", "second"]) actual = df.select( F.try_make_timestamp(df.year, df.month, df.day, df.hour, df.minute, df.second) - ).collect() - self.assertEqual(actual, [Row(None)]) + ) + assertDataFrameEqual(actual, [Row(None)]) def test_try_make_timestamp_ltz(self): # use local timezone here to avoid flakiness @@ -372,8 +373,8 @@ def test_try_make_timestamp_ltz(self): F.try_make_timestamp_ltz( df.year, df.month, df.day, df.hour, df.minute, df.second, df.timezone ) - ).collect() - self.assertEqual(actual, [Row(datetime.datetime(2024, 5, 22, 10, 30, 0))]) 
+ ) + assertDataFrameEqual(actual, [Row(datetime.datetime(2024, 5, 22, 10, 30, 0))]) # use local timezone here to avoid flakiness data = [(2024, 13, 22, 10, 30, 0, datetime.datetime.now().astimezone().tzinfo.__str__())] @@ -384,23 +385,23 @@ def test_try_make_timestamp_ltz(self): F.try_make_timestamp_ltz( df.year, df.month, df.day, df.hour, df.minute, df.second, df.timezone ) - ).collect() - self.assertEqual(actual, [Row(None)]) + ) + assertDataFrameEqual(actual, [Row(None)]) def test_try_make_timestamp_ntz(self): data = [(2024, 5, 22, 10, 30, 0)] df = self.spark.createDataFrame(data, ["year", "month", "day", "hour", "minute", "second"]) actual = df.select( F.try_make_timestamp_ntz(df.year, df.month, df.day, df.hour, df.minute, df.second) - ).collect() - self.assertEqual(actual, [Row(datetime.datetime(2024, 5, 22, 10, 30))]) + ) + assertDataFrameEqual(actual, [Row(datetime.datetime(2024, 5, 22, 10, 30))]) data = [(2024, 13, 22, 10, 30, 0)] df = self.spark.createDataFrame(data, ["year", "month", "day", "hour", "minute", "second"]) actual = df.select( F.try_make_timestamp_ntz(df.year, df.month, df.day, df.hour, df.minute, df.second) - ).collect() - self.assertEqual(actual, [Row(None)]) + ) + assertDataFrameEqual(actual, [Row(None)]) def test_string_functions(self): string_functions = [ @@ -442,51 +443,51 @@ def test_string_functions(self): ) for name in string_functions: - self.assertEqual( - df.select(getattr(F, name)("name")).first()[0], - df.select(getattr(F, name)(F.col("name"))).first()[0], + assertDataFrameEqual( + df.select(getattr(F, name)("name")), + df.select(getattr(F, name)(F.col("name"))), ) def test_collation(self): df = self.spark.createDataFrame([("a",), ("b",)], ["name"]) - actual = df.select(F.collation(F.collate("name", "UNICODE"))).distinct().collect() - self.assertEqual([Row("UNICODE")], actual) + actual = df.select(F.collation(F.collate("name", "UNICODE"))).distinct() + assertDataFrameEqual([Row("SYSTEM.BUILTIN.UNICODE")], actual) def 
test_try_make_interval(self): df = self.spark.createDataFrame([(2147483647,)], ["num"]) - actual = df.select(F.isnull(F.try_make_interval("num"))).collect() - self.assertEqual([Row(True)], actual) + actual = df.select(F.isnull(F.try_make_interval("num"))) + assertDataFrameEqual([Row(True)], actual) def test_octet_length_function(self): # SPARK-36751: add octet length api for python df = self.spark.createDataFrame([("cat",), ("\U0001F408",)], ["cat"]) - actual = df.select(F.octet_length("cat")).collect() - self.assertEqual([Row(3), Row(4)], actual) + actual = df.select(F.octet_length("cat")) + assertDataFrameEqual([Row(3), Row(4)], actual) def test_bit_length_function(self): # SPARK-36751: add bit length api for python df = self.spark.createDataFrame([("cat",), ("\U0001F408",)], ["cat"]) - actual = df.select(F.bit_length("cat")).collect() - self.assertEqual([Row(24), Row(32)], actual) + actual = df.select(F.bit_length("cat")) + assertDataFrameEqual([Row(24), Row(32)], actual) def test_array_contains_function(self): df = self.spark.createDataFrame([(["1", "2", "3"],), ([],)], ["data"]) - actual = df.select(F.array_contains(df.data, "1").alias("b")).collect() - self.assertEqual([Row(b=True), Row(b=False)], actual) + actual = df.select(F.array_contains(df.data, "1").alias("b")) + assertDataFrameEqual([Row(b=True), Row(b=False)], actual) def test_levenshtein_function(self): df = self.spark.createDataFrame([("kitten", "sitting")], ["l", "r"]) - actual_without_threshold = df.select(F.levenshtein(df.l, df.r).alias("b")).collect() - self.assertEqual([Row(b=3)], actual_without_threshold) - actual_with_threshold = df.select(F.levenshtein(df.l, df.r, 2).alias("b")).collect() - self.assertEqual([Row(b=-1)], actual_with_threshold) + actual_without_threshold = df.select(F.levenshtein(df.l, df.r).alias("b")) + assertDataFrameEqual([Row(b=3)], actual_without_threshold) + actual_with_threshold = df.select(F.levenshtein(df.l, df.r, 2).alias("b")) + assertDataFrameEqual([Row(b=-1)], 
actual_with_threshold) def test_between_function(self): df = self.spark.createDataFrame( [Row(a=1, b=2, c=3), Row(a=2, b=1, c=3), Row(a=4, b=1, c=4)] ) - self.assertEqual( - [Row(a=2, b=1, c=3), Row(a=4, b=1, c=4)], df.filter(df.a.between(df.b, df.c)).collect() + assertDataFrameEqual( + [Row(a=2, b=1, c=3), Row(a=4, b=1, c=4)], df.filter(df.a.between(df.b, df.c)) ) def test_dayofweek(self): @@ -602,7 +603,7 @@ def test_first_last_ignorenulls(self): F.last(df2.id, False).alias("c"), F.last(df2.id, True).alias("d"), ) - self.assertEqual([Row(a=None, b=1, c=None, d=98)], df3.collect()) + assertDataFrameEqual([Row(a=None, b=1, c=None, d=98)], df3) def test_approxQuantile(self): df = self.spark.createDataFrame([Row(a=i, b=i + 10) for i in range(10)]) @@ -660,20 +661,20 @@ def test_sort_with_nulls_order(self): df = self.spark.createDataFrame( [("Tom", 80), (None, 60), ("Alice", 50)], ["name", "height"] ) - self.assertEqual( - df.select(df.name).orderBy(F.asc_nulls_first("name")).collect(), + assertDataFrameEqual( + df.select(df.name).orderBy(F.asc_nulls_first("name")), [Row(name=None), Row(name="Alice"), Row(name="Tom")], ) - self.assertEqual( - df.select(df.name).orderBy(F.asc_nulls_last("name")).collect(), + assertDataFrameEqual( + df.select(df.name).orderBy(F.asc_nulls_last("name")), [Row(name="Alice"), Row(name="Tom"), Row(name=None)], ) - self.assertEqual( - df.select(df.name).orderBy(F.desc_nulls_first("name")).collect(), + assertDataFrameEqual( + df.select(df.name).orderBy(F.desc_nulls_first("name")), [Row(name=None), Row(name="Tom"), Row(name="Alice")], ) - self.assertEqual( - df.select(df.name).orderBy(F.desc_nulls_last("name")).collect(), + assertDataFrameEqual( + df.select(df.name).orderBy(F.desc_nulls_last("name")), [Row(name="Tom"), Row(name="Alice"), Row(name=None)], ) @@ -710,20 +711,16 @@ def test_slice(self): ) expected = [Row(sliced=[2, 3]), Row(sliced=[5])] - self.assertEqual(df.select(F.slice(df.x, 2, 2).alias("sliced")).collect(), expected) - 
self.assertEqual( - df.select(F.slice(df.x, F.lit(2), F.lit(2)).alias("sliced")).collect(), expected - ) - self.assertEqual( - df.select(F.slice("x", "index", "len").alias("sliced")).collect(), expected - ) + assertDataFrameEqual(df.select(F.slice(df.x, 2, 2).alias("sliced")), expected) + assertDataFrameEqual(df.select(F.slice(df.x, F.lit(2), F.lit(2)).alias("sliced")), expected) + assertDataFrameEqual(df.select(F.slice("x", "index", "len").alias("sliced")), expected) - self.assertEqual( - df.select(F.slice(df.x, F.size(df.x) - 1, F.lit(1)).alias("sliced")).collect(), + assertDataFrameEqual( + df.select(F.slice(df.x, F.size(df.x) - 1, F.lit(1)).alias("sliced")), [Row(sliced=[2]), Row(sliced=[4])], ) - self.assertEqual( - df.select(F.slice(df.x, F.lit(1), F.size(df.x) - 1).alias("sliced")).collect(), + assertDataFrameEqual( + df.select(F.slice(df.x, F.lit(1), F.size(df.x) - 1).alias("sliced")), [Row(sliced=[1, 2]), Row(sliced=[4])], ) @@ -732,11 +729,9 @@ def test_array_repeat(self): df = df.withColumn("repeat_n", F.lit(3)) expected = [Row(val=[0, 0, 0])] - self.assertEqual(df.select(F.array_repeat("id", 3).alias("val")).collect(), expected) - self.assertEqual(df.select(F.array_repeat("id", F.lit(3)).alias("val")).collect(), expected) - self.assertEqual( - df.select(F.array_repeat("id", "repeat_n").alias("val")).collect(), expected - ) + assertDataFrameEqual(df.select(F.array_repeat("id", 3).alias("val")), expected) + assertDataFrameEqual(df.select(F.array_repeat("id", F.lit(3)).alias("val")), expected) + assertDataFrameEqual(df.select(F.array_repeat("id", "repeat_n").alias("val")), expected) def test_input_file_name_udf(self): df = self.spark.read.text("python/test_support/hello/hello.txt") @@ -748,11 +743,11 @@ def test_least(self): df = self.spark.createDataFrame([(1, 4, 3)], ["a", "b", "c"]) expected = [Row(least=1)] - self.assertEqual(df.select(F.least(df.a, df.b, df.c).alias("least")).collect(), expected) - self.assertEqual( - df.select(F.least(F.lit(3), 
F.lit(5), F.lit(1)).alias("least")).collect(), expected + assertDataFrameEqual(df.select(F.least(df.a, df.b, df.c).alias("least")), expected) + assertDataFrameEqual( + df.select(F.least(F.lit(3), F.lit(5), F.lit(1)).alias("least")), expected ) - self.assertEqual(df.select(F.least("a", "b", "c").alias("least")).collect(), expected) + assertDataFrameEqual(df.select(F.least("a", "b", "c").alias("least")), expected) with self.assertRaises(PySparkValueError) as pe: df.select(F.least(df.a).alias("least")).collect() @@ -794,11 +789,9 @@ def test_overlay(self): df = self.spark.createDataFrame([("SPARK_SQL", "CORE", 7, 0)], ("x", "y", "pos", "len")) exp = [Row(ol="SPARK_CORESQL")] - self.assertEqual(df.select(F.overlay(df.x, df.y, 7, 0).alias("ol")).collect(), exp) - self.assertEqual( - df.select(F.overlay(df.x, df.y, F.lit(7), F.lit(0)).alias("ol")).collect(), exp - ) - self.assertEqual(df.select(F.overlay("x", "y", "pos", "len").alias("ol")).collect(), exp) + assertDataFrameEqual(df.select(F.overlay(df.x, df.y, 7, 0).alias("ol")), exp) + assertDataFrameEqual(df.select(F.overlay(df.x, df.y, F.lit(7), F.lit(0)).alias("ol")), exp) + assertDataFrameEqual(df.select(F.overlay("x", "y", "pos", "len").alias("ol")), exp) with self.assertRaises(PySparkTypeError) as pe: df.select(F.overlay(df.x, df.y, 7.5, 0).alias("ol")).collect() @@ -1147,6 +1140,70 @@ def test_collect_functions(self): ["1", "2", "2", "2"], ) + def test_listagg_functions(self): + df = self.spark.createDataFrame( + [(1, "1"), (2, "2"), (None, None), (1, "2")], ["key", "value"] + ) + df_with_bytes = self.spark.createDataFrame( + [(b"\x01",), (b"\x02",), (None,), (b"\x03",), (b"\x02",)], ["bytes"] + ) + df_with_nulls = self.spark.createDataFrame( + [(None,), (None,), (None,), (None,), (None,)], + StructType([StructField("nulls", StringType(), True)]), + ) + # listagg and string_agg are aliases + for listagg_ref in [F.listagg, F.string_agg]: + self.assertEqual(df.select(listagg_ref(df.key).alias("r")).collect()[0].r, 
"121") + self.assertEqual(df.select(listagg_ref(df.value).alias("r")).collect()[0].r, "122") + self.assertEqual( + df.select(listagg_ref(df.value, ",").alias("r")).collect()[0].r, "1,2,2" + ) + self.assertEqual( + df_with_bytes.select(listagg_ref(df_with_bytes.bytes, b"\x42").alias("r")) + .collect()[0] + .r, + b"\x01\x42\x02\x42\x03\x42\x02", + ) + self.assertEqual( + df_with_nulls.select(listagg_ref(df_with_nulls.nulls).alias("r")).collect()[0].r, + None, + ) + + def test_listagg_distinct_functions(self): + df = self.spark.createDataFrame( + [(1, "1"), (2, "2"), (None, None), (1, "2")], ["key", "value"] + ) + df_with_bytes = self.spark.createDataFrame( + [(b"\x01",), (b"\x02",), (None,), (b"\x03",), (b"\x02",)], ["bytes"] + ) + df_with_nulls = self.spark.createDataFrame( + [(None,), (None,), (None,), (None,), (None,)], + StructType([StructField("nulls", StringType(), True)]), + ) + # listagg_distinct and string_agg_distinct are aliases + for listagg_distinct_ref in [F.listagg_distinct, F.string_agg_distinct]: + self.assertEqual( + df.select(listagg_distinct_ref(df.key).alias("r")).collect()[0].r, "12" + ) + self.assertEqual( + df.select(listagg_distinct_ref(df.value).alias("r")).collect()[0].r, "12" + ) + self.assertEqual( + df.select(listagg_distinct_ref(df.value, ",").alias("r")).collect()[0].r, "1,2" + ) + self.assertEqual( + df_with_bytes.select(listagg_distinct_ref(df_with_bytes.bytes, b"\x42").alias("r")) + .collect()[0] + .r, + b"\x01\x42\x02\x42\x03", + ) + self.assertEqual( + df_with_nulls.select(listagg_distinct_ref(df_with_nulls.nulls).alias("r")) + .collect()[0] + .r, + None, + ) + def test_datetime_functions(self): df = self.spark.range(1).selectExpr("'2017-01-22' as dateCol") parse_result = df.select(F.to_date(F.col("dateCol"))).first() @@ -1158,8 +1215,8 @@ def test_assert_true(self): def check_assert_true(self, tpe): df = self.spark.range(3) - self.assertEqual( - df.select(F.assert_true(df.id < 3)).toDF("val").collect(), + assertDataFrameEqual( + 
df.select(F.assert_true(df.id < 3)).toDF("val"), [Row(val=None), Row(val=None), Row(val=None)], ) @@ -1296,17 +1353,17 @@ def test_np_scalar_input(self): df = self.spark.createDataFrame([([1, 2, 3],), ([],)], ["data"]) for dtype in [np.int8, np.int16, np.int32, np.int64]: - res = df.select(F.array_contains(df.data, dtype(1)).alias("b")).collect() - self.assertEqual([Row(b=True), Row(b=False)], res) - res = df.select(F.array_position(df.data, dtype(1)).alias("c")).collect() - self.assertEqual([Row(c=1), Row(c=0)], res) + res = df.select(F.array_contains(df.data, dtype(1)).alias("b")) + assertDataFrameEqual([Row(b=True), Row(b=False)], res) + res = df.select(F.array_position(df.data, dtype(1)).alias("c")) + assertDataFrameEqual([Row(c=1), Row(c=0)], res) df = self.spark.createDataFrame([([1.0, 2.0, 3.0],), ([],)], ["data"]) for dtype in [np.float32, np.float64]: - res = df.select(F.array_contains(df.data, dtype(1)).alias("b")).collect() - self.assertEqual([Row(b=True), Row(b=False)], res) - res = df.select(F.array_position(df.data, dtype(1)).alias("c")).collect() - self.assertEqual([Row(c=1), Row(c=0)], res) + res = df.select(F.array_contains(df.data, dtype(1)).alias("b")) + assertDataFrameEqual([Row(b=True), Row(b=False)], res) + res = df.select(F.array_position(df.data, dtype(1)).alias("c")) + assertDataFrameEqual([Row(c=1), Row(c=0)], res) @unittest.skipIf(not have_numpy, "NumPy not installed") def test_ndarray_input(self): @@ -1723,46 +1780,42 @@ class IntEnum(Enum): def test_nullifzero_zeroifnull(self): df = self.spark.createDataFrame([(0,), (1,)], ["a"]) - result = df.select(nullifzero(df.a).alias("r")).collect() - self.assertEqual([Row(r=None), Row(r=1)], result) + result = df.select(nullifzero(df.a).alias("r")) + assertDataFrameEqual([Row(r=None), Row(r=1)], result) df = self.spark.createDataFrame([(None,), (1,)], ["a"]) - result = df.select(zeroifnull(df.a).alias("r")).collect() - self.assertEqual([Row(r=0), Row(r=1)], result) + result = 
df.select(zeroifnull(df.a).alias("r")) + assertDataFrameEqual([Row(r=0), Row(r=1)], result) def test_randstr_uniform(self): df = self.spark.createDataFrame([(0,)], ["a"]) - result = df.select(randstr(F.lit(5), F.lit(0)).alias("x")).selectExpr("length(x)").collect() - self.assertEqual([Row(5)], result) + result = df.select(randstr(F.lit(5), F.lit(0)).alias("x")).selectExpr("length(x)") + assertDataFrameEqual([Row(5)], result) # The random seed is optional. - result = df.select(randstr(F.lit(5)).alias("x")).selectExpr("length(x)").collect() - self.assertEqual([Row(5)], result) + result = df.select(randstr(F.lit(5)).alias("x")).selectExpr("length(x)") + assertDataFrameEqual([Row(5)], result) df = self.spark.createDataFrame([(0,)], ["a"]) - result = ( - df.select(uniform(F.lit(10), F.lit(20), F.lit(0)).alias("x")) - .selectExpr("x > 5") - .collect() - ) - self.assertEqual([Row(True)], result) + result = df.select(uniform(F.lit(10), F.lit(20), F.lit(0)).alias("x")).selectExpr("x > 5") + assertDataFrameEqual([Row(True)], result) # The random seed is optional. 
- result = df.select(uniform(F.lit(10), F.lit(20)).alias("x")).selectExpr("x > 5").collect() - self.assertEqual([Row(True)], result) + result = df.select(uniform(F.lit(10), F.lit(20)).alias("x")).selectExpr("x > 5") + assertDataFrameEqual([Row(True)], result) def test_string_validation(self): df = self.spark.createDataFrame([("abc",)], ["a"]) # test is_valid_utf8 - result_is_valid_utf8 = df.select(F.is_valid_utf8(df.a).alias("r")).collect() - self.assertEqual([Row(r=True)], result_is_valid_utf8) + result_is_valid_utf8 = df.select(F.is_valid_utf8(df.a).alias("r")) + assertDataFrameEqual([Row(r=True)], result_is_valid_utf8) # test make_valid_utf8 - result_make_valid_utf8 = df.select(F.make_valid_utf8(df.a).alias("r")).collect() - self.assertEqual([Row(r="abc")], result_make_valid_utf8) + result_make_valid_utf8 = df.select(F.make_valid_utf8(df.a).alias("r")) + assertDataFrameEqual([Row(r="abc")], result_make_valid_utf8) # test validate_utf8 - result_validate_utf8 = df.select(F.validate_utf8(df.a).alias("r")).collect() - self.assertEqual([Row(r="abc")], result_validate_utf8) + result_validate_utf8 = df.select(F.validate_utf8(df.a).alias("r")) + assertDataFrameEqual([Row(r="abc")], result_validate_utf8) # test try_validate_utf8 - result_try_validate_utf8 = df.select(F.try_validate_utf8(df.a).alias("r")).collect() - self.assertEqual([Row(r="abc")], result_try_validate_utf8) + result_try_validate_utf8 = df.select(F.try_validate_utf8(df.a).alias("r")) + assertDataFrameEqual([Row(r="abc")], result_try_validate_utf8) class FunctionsTests(ReusedSQLTestCase, FunctionsTestsMixin): diff --git a/python/pyspark/sql/tests/test_group.py b/python/pyspark/sql/tests/test_group.py index 8e3d2d8d00033..bbc089b00c133 100644 --- a/python/pyspark/sql/tests/test_group.py +++ b/python/pyspark/sql/tests/test_group.py @@ -36,11 +36,11 @@ def test_agg_func(self): data = [Row(key=1, value=10), Row(key=1, value=20), Row(key=1, value=30)] df = self.spark.createDataFrame(data) g = df.groupBy("key") 
- self.assertEqual(g.max("value").collect(), [Row(**{"key": 1, "max(value)": 30})]) - self.assertEqual(g.min("value").collect(), [Row(**{"key": 1, "min(value)": 10})]) - self.assertEqual(g.sum("value").collect(), [Row(**{"key": 1, "sum(value)": 60})]) - self.assertEqual(g.count().collect(), [Row(key=1, count=3)]) - self.assertEqual(g.mean("value").collect(), [Row(**{"key": 1, "avg(value)": 20.0})]) + assertDataFrameEqual(g.max("value"), [Row(**{"key": 1, "max(value)": 30})]) + assertDataFrameEqual(g.min("value"), [Row(**{"key": 1, "min(value)": 10})]) + assertDataFrameEqual(g.sum("value"), [Row(**{"key": 1, "sum(value)": 60})]) + assertDataFrameEqual(g.count(), [Row(key=1, count=3)]) + assertDataFrameEqual(g.mean("value"), [Row(**{"key": 1, "avg(value)": 20.0})]) data = [ Row(electronic="Smartphone", year=2018, sales=150000), @@ -59,7 +59,7 @@ def test_aggregator(self): df = self.df g = df.groupBy() self.assertEqual([99, 100], sorted(g.agg({"key": "max", "value": "count"}).collect()[0])) - self.assertEqual([Row(**{"AVG(key#0)": 49.5})], g.mean().collect()) + assertDataFrameEqual([Row(**{"AVG(key#0)": 49.5})], g.mean().collect()) from pyspark.sql import functions diff --git a/python/pyspark/sql/tests/test_job_cancellation.py b/python/pyspark/sql/tests/test_job_cancellation.py new file mode 100644 index 0000000000000..3f30f78808892 --- /dev/null +++ b/python/pyspark/sql/tests/test_job_cancellation.py @@ -0,0 +1,205 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import unittest +import threading +import time + +from pyspark import InheritableThread, inheritable_thread_target +from pyspark.testing.sqlutils import ReusedSQLTestCase + + +class JobCancellationTestsMixin: + def test_tags(self): + self.spark.clearTags() + self.spark.addTag("a") + self.assertEqual(self.spark.getTags(), {"a"}) + self.spark.addTag("b") + self.spark.removeTag("a") + self.assertEqual(self.spark.getTags(), {"b"}) + self.spark.addTag("c") + self.spark.clearTags() + self.assertEqual(self.spark.getTags(), set()) + self.spark.clearTags() + + def test_tags_multithread(self): + output1 = None + output2 = None + + def tag1(): + nonlocal output1 + + self.spark.addTag("tag1") + output1 = self.spark.getTags() + + def tag2(): + nonlocal output2 + + self.spark.addTag("tag2") + output2 = self.spark.getTags() + + t1 = threading.Thread(target=tag1) + t1.start() + t1.join() + t2 = threading.Thread(target=tag2) + t2.start() + t2.join() + + self.assertIsNotNone(output1) + self.assertEquals(output1, {"tag1"}) + self.assertIsNotNone(output2) + self.assertEquals(output2, {"tag2"}) + + def check_job_cancellation( + self, setter, canceller, thread_ids, thread_ids_to_cancel, thread_ids_to_run + ): + job_id_a = "job_ids_to_cancel" + job_id_b = "job_ids_to_run" + threads = [] + + # A list which records whether job is cancelled. + # The index of the array is the thread index which job run in. + is_job_cancelled = [False for _ in thread_ids] + + def run_job(job_id, index): + """ + Executes a job with the group ``job_group``. 
Each job waits for 3 seconds + and then exits. + """ + try: + setter(job_id) + + def func(itr): + for pdf in itr: + time.sleep(pdf._1.iloc[0]) + yield pdf + + self.spark.createDataFrame([[20]]).repartition(1).mapInPandas( + func, schema="_1 LONG" + ).collect() + is_job_cancelled[index] = False + except Exception: + # Assume that exception means job cancellation. + is_job_cancelled[index] = True + + # Test if job succeeded when not cancelled. + run_job(job_id_a, 0) + self.assertFalse(is_job_cancelled[0]) + self.spark.clearTags() + + # Run jobs + for i in thread_ids_to_cancel: + t = threading.Thread(target=run_job, args=(job_id_a, i)) + t.start() + threads.append(t) + + for i in thread_ids_to_run: + t = threading.Thread(target=run_job, args=(job_id_b, i)) + t.start() + threads.append(t) + + # Wait to make sure all jobs are executed. + time.sleep(10) + # And then, cancel one job group. + canceller(job_id_a) + + # Wait until all threads launching jobs are finished. + for t in threads: + t.join() + + for i in thread_ids_to_cancel: + self.assertTrue( + is_job_cancelled[i], "Thread {i}: Job in group A was not cancelled.".format(i=i) + ) + + for i in thread_ids_to_run: + self.assertFalse( + is_job_cancelled[i], "Thread {i}: Job in group B did not succeeded.".format(i=i) + ) + + def test_inheritable_tags(self): + self.check_inheritable_tags( + create_thread=lambda target, session: InheritableThread(target, session=session) + ) + self.check_inheritable_tags( + create_thread=lambda target, session: threading.Thread( + target=inheritable_thread_target(session)(target) + ) + ) + + def check_inheritable_tags(self, create_thread): + spark = self.spark + spark.addTag("a") + first = set() + second = set() + + def get_inner_local_prop(): + spark.addTag("c") + second.update(spark.getTags()) + + def get_outer_local_prop(): + spark.addTag("b") + first.update(spark.getTags()) + t2 = create_thread(target=get_inner_local_prop, session=spark) + t2.start() + t2.join() + + t1 = 
create_thread(target=get_outer_local_prop, session=spark) + t1.start() + t1.join() + + self.assertEqual(spark.getTags(), {"a"}) + self.assertEqual(first, {"a", "b"}) + self.assertEqual(second, {"a", "b", "c"}) + + def test_interrupt_tag(self): + thread_ids = range(4) + self.check_job_cancellation( + lambda job_group: self.spark.addTag(job_group), + lambda job_group: self.spark.interruptTag(job_group), + thread_ids, + [i for i in thread_ids if i % 2 == 0], + [i for i in thread_ids if i % 2 != 0], + ) + self.spark.clearTags() + + def test_interrupt_all(self): + thread_ids = range(4) + self.check_job_cancellation( + lambda job_group: None, + lambda job_group: self.spark.interruptAll(), + thread_ids, + thread_ids, + [], + ) + self.spark.clearTags() + + +class JobCancellationTests(JobCancellationTestsMixin, ReusedSQLTestCase): + pass + + +if __name__ == "__main__": + from pyspark.sql.tests.test_job_cancellation import * # noqa: F401 + + try: + import xmlrunner + + testRunner = xmlrunner.XMLTestRunner(output="target/test-reports", verbosity=2) + except ImportError: + testRunner = None + unittest.main(testRunner=testRunner, verbosity=2) diff --git a/python/pyspark/sql/tests/test_python_datasource.py b/python/pyspark/sql/tests/test_python_datasource.py index 140c7680b181b..a636b852a1e50 100644 --- a/python/pyspark/sql/tests/test_python_datasource.py +++ b/python/pyspark/sql/tests/test_python_datasource.py @@ -25,6 +25,7 @@ DataSourceReader, InputPartition, DataSourceWriter, + DataSourceArrowWriter, WriterCommitMessage, CaseInsensitiveDict, ) @@ -277,7 +278,7 @@ def write(self, iterator): from pyspark import TaskContext context = TaskContext.get() - output_path = os.path.join(self.path, f"{context.partitionId}.json") + output_path = os.path.join(self.path, f"{context.partitionId()}.json") count = 0 with open(output_path, "w") as file: for row in iterator: @@ -436,6 +437,37 @@ def partitions(self): ): self.spark.read.format("arrowbatch").schema("key int, dummy 
string").load().show() + def test_arrow_batch_sink(self): + class TestDataSource(DataSource): + @classmethod + def name(cls): + return "arrow_sink" + + def writer(self, schema, overwrite): + return TestArrowWriter(self.options["path"]) + + class TestArrowWriter(DataSourceArrowWriter): + def __init__(self, path): + self.path = path + + def write(self, iterator): + from pyspark import TaskContext + + context = TaskContext.get() + output_path = os.path.join(self.path, f"{context.partitionId()}.json") + with open(output_path, "w") as file: + for batch in iterator: + df = batch.to_pandas() + df.to_json(file, orient="records", lines=True) + return WriterCommitMessage() + + self.spark.dataSource.register(TestDataSource) + df = self.spark.range(3) + with tempfile.TemporaryDirectory(prefix="test_arrow_batch_sink") as d: + df.write.format("arrow_sink").mode("append").save(d) + df2 = self.spark.read.format("json").load(d) + assertDataFrameEqual(df2, df) + def test_data_source_type_mismatch(self): class TestDataSource(DataSource): @classmethod diff --git a/python/pyspark/sql/tests/test_readwriter.py b/python/pyspark/sql/tests/test_readwriter.py index 2fca6b57decf9..683c925eefc23 100644 --- a/python/pyspark/sql/tests/test_readwriter.py +++ b/python/pyspark/sql/tests/test_readwriter.py @@ -23,6 +23,7 @@ from pyspark.sql.functions import col, lit from pyspark.sql.readwriter import DataFrameWriterV2 from pyspark.sql.types import StructType, StructField, StringType +from pyspark.testing import assertDataFrameEqual from pyspark.testing.sqlutils import ReusedSQLTestCase @@ -34,15 +35,15 @@ def test_save_and_load(self): try: df.write.json(tmpPath) actual = self.spark.read.json(tmpPath) - self.assertEqual(sorted(df.collect()), sorted(actual.collect())) + assertDataFrameEqual(df, actual) schema = StructType([StructField("value", StringType(), True)]) actual = self.spark.read.json(tmpPath, schema) - self.assertEqual(sorted(df.select("value").collect()), sorted(actual.collect())) + 
assertDataFrameEqual(df.select("value"), actual) df.write.json(tmpPath, "overwrite") actual = self.spark.read.json(tmpPath) - self.assertEqual(sorted(df.collect()), sorted(actual.collect())) + assertDataFrameEqual(df, actual) df.write.save( format="json", @@ -53,11 +54,11 @@ def test_save_and_load(self): actual = self.spark.read.load( format="json", path=tmpPath, noUse="this options will not be used in load." ) - self.assertEqual(sorted(df.collect()), sorted(actual.collect())) + assertDataFrameEqual(df, actual) with self.sql_conf({"spark.sql.sources.default": "org.apache.spark.sql.json"}): actual = self.spark.read.load(path=tmpPath) - self.assertEqual(sorted(df.collect()), sorted(actual.collect())) + assertDataFrameEqual(df, actual) csvpath = os.path.join(tempfile.mkdtemp(), "data") df.write.option("quote", None).format("csv").save(csvpath) @@ -71,15 +72,15 @@ def test_save_and_load_builder(self): try: df.write.json(tmpPath) actual = self.spark.read.json(tmpPath) - self.assertEqual(sorted(df.collect()), sorted(actual.collect())) + assertDataFrameEqual(df, actual) schema = StructType([StructField("value", StringType(), True)]) actual = self.spark.read.json(tmpPath, schema) - self.assertEqual(sorted(df.select("value").collect()), sorted(actual.collect())) + assertDataFrameEqual(df.select("value"), actual) df.write.mode("overwrite").json(tmpPath) actual = self.spark.read.json(tmpPath) - self.assertEqual(sorted(df.collect()), sorted(actual.collect())) + assertDataFrameEqual(df, actual) df.write.mode("overwrite").options( noUse="this options will not be used in save." @@ -89,11 +90,11 @@ def test_save_and_load_builder(self): actual = self.spark.read.format("json").load( path=tmpPath, noUse="this options will not be used in load." 
) - self.assertEqual(sorted(df.collect()), sorted(actual.collect())) + assertDataFrameEqual(df, actual) with self.sql_conf({"spark.sql.sources.default": "org.apache.spark.sql.json"}): actual = self.spark.read.load(path=tmpPath) - self.assertEqual(sorted(df.collect()), sorted(actual.collect())) + assertDataFrameEqual(df, actual) finally: shutil.rmtree(tmpPath) diff --git a/python/pyspark/sql/tests/test_session.py b/python/pyspark/sql/tests/test_session.py index de94b0d1882ff..c21247e3159c0 100644 --- a/python/pyspark/sql/tests/test_session.py +++ b/python/pyspark/sql/tests/test_session.py @@ -227,12 +227,6 @@ def test_unsupported_api(self): (lambda: session.client, "client"), (session.addArtifacts, "addArtifact(s)"), (lambda: session.copyFromLocalToFs("", ""), "copyFromLocalToFs"), - (lambda: session.interruptTag(""), "interruptTag"), - (lambda: session.interruptOperation(""), "interruptOperation"), - (lambda: session.addTag(""), "addTag"), - (lambda: session.removeTag(""), "removeTag"), - (session.getTags, "getTags"), - (session.clearTags, "clearTags"), ] for func, name in unsupported: diff --git a/python/pyspark/sql/tests/test_subquery.py b/python/pyspark/sql/tests/test_subquery.py index f58ff6364aed7..7c63ddb69458e 100644 --- a/python/pyspark/sql/tests/test_subquery.py +++ b/python/pyspark/sql/tests/test_subquery.py @@ -47,18 +47,21 @@ def df2(self): ["c", "d"], ) - def test_unanalyzable_expression(self): - sub = self.spark.range(1).where(sf.col("id") == sf.col("id").outer()) + def test_noop_outer(self): + assertDataFrameEqual( + self.spark.range(1).select(sf.col("id").outer()), + self.spark.range(1).select(sf.col("id")), + ) with self.assertRaises(AnalysisException) as pe: - sub.schema + self.spark.range(1).select(sf.col("outer_col").outer()).collect() self.check_error( exception=pe.exception, - errorClass="UNANALYZABLE_EXPRESSION", - messageParameters={"expr": '"outer(id)"'}, + errorClass="UNRESOLVED_COLUMN.WITH_SUGGESTION", + messageParameters={"objectName": 
"`outer_col`", "proposal": "`id`"}, query_context_type=QueryContextType.DataFrame, - fragment="outer", + fragment="col", ) def test_simple_uncorrelated_scalar_subquery(self): @@ -189,7 +192,7 @@ def test_scalar_subquery_against_local_relations(self): "c1", ( self.spark.table("t2") - .where(sf.col("c2").outer() == sf.col("c2")) + .where(sf.col("t1.c2").outer() == sf.col("t2.c2")) .select(sf.max("c1")) .scalar() ), @@ -205,45 +208,72 @@ def test_correlated_scalar_subquery(self): self.df2.createOrReplaceTempView("r") with self.subTest("in where"): - assertDataFrameEqual( - self.spark.table("l").where( - sf.col("b") - < ( - self.spark.table("r") - .where(sf.col("a").outer() == sf.col("c")) - .select(sf.max("d")) - .scalar() + for cond in [ + sf.col("a").outer() == sf.col("c"), + (sf.col("a") == sf.col("c")).outer(), + sf.expr("a = c").outer(), + ]: + with self.subTest(cond=cond): + assertDataFrameEqual( + self.spark.table("l").where( + sf.col("b") + < self.spark.table("r").where(cond).select(sf.max("d")).scalar() + ), + self.spark.sql( + """select * from l where b < (select max(d) from r where a = c)""" + ), ) - ), - self.spark.sql( - """select * from l where b < (select max(d) from r where a = c)""" - ), - ) with self.subTest("in select"): + df1 = self.spark.table("l").alias("t1") + df2 = self.spark.table("l").alias("t2") + + for cond in [ + sf.col("t1.a") == sf.col("t2.a").outer(), + (sf.col("t1.a") == sf.col("t2.a")).outer(), + sf.expr("t1.a = t2.a").outer(), + ]: + with self.subTest(cond=cond): + assertDataFrameEqual( + df1.select( + "a", + df2.where(cond).select(sf.sum("b")).scalar().alias("sum_b"), + ), + self.spark.sql( + """ + select + a, (select sum(b) from l t2 where t2.a = t1.a) sum_b + from l t1 + """ + ), + ) + + with self.subTest("without .outer()"): assertDataFrameEqual( self.spark.table("l").select( "a", ( - self.spark.table("l") - .where(sf.col("a") == sf.col("a").outer()) - .select(sf.sum("b")) + self.spark.table("r") + .where(sf.col("b") == 
sf.col("a").outer()) + .select(sf.sum("d")) .scalar() - .alias("sum_b") + .alias("sum_d") ), ), self.spark.sql( - """select a, (select sum(b) from l l2 where l2.a = l1.a) sum_b from l l1""" + """select a, (select sum(d) from r where b = l.a) sum_d from l""" ), ) with self.subTest("in select (null safe)"): + df1 = self.spark.table("l").alias("t1") + df2 = self.spark.table("l").alias("t2") + assertDataFrameEqual( - self.spark.table("l").select( + df1.select( "a", ( - self.spark.table("l") - .where(sf.col("a").eqNullSafe(sf.col("a").outer())) + df2.where(sf.col("t2.a").eqNullSafe(sf.col("t1.a").outer())) .select(sf.sum("b")) .scalar() .alias("sum_b") @@ -278,15 +308,13 @@ def test_correlated_scalar_subquery(self): ) with self.subTest("non-aggregated"): + df1 = self.spark.table("l").alias("t1") + df2 = self.spark.table("l").alias("t2") + with self.assertRaises(SparkRuntimeException) as pe: - self.spark.table("l").select( + df1.select( "a", - ( - self.spark.table("l") - .where(sf.col("a") == sf.col("a").outer()) - .select("b") - .scalar() - ), + df2.where(sf.col("t1.a") == sf.col("t2.a").outer()).select("b").scalar(), ).collect() self.check_error( @@ -296,19 +324,21 @@ def test_correlated_scalar_subquery(self): ) with self.subTest("non-equal"): + df1 = self.spark.table("l").alias("t1") + df2 = self.spark.table("l").alias("t2") + assertDataFrameEqual( - self.spark.table("l").select( + df1.select( "a", ( - self.spark.table("l") - .where(sf.col("a") < sf.col("a").outer()) + df2.where(sf.col("t2.a") < sf.col("t1.a").outer()) .select(sf.sum("b")) .scalar() .alias("sum_b") ), ), self.spark.sql( - """select a, (select sum(b) from l l2 where l2.a < l1.a) sum_b from l l1""" + """select a, (select sum(b) from l t2 where t2.a < t1.a) sum_b from l t1""" ), ) @@ -343,26 +373,30 @@ def test_exists_subquery(self): self.df2.createOrReplaceTempView("r") with self.subTest("EXISTS"): - assertDataFrameEqual( - self.spark.table("l").where( - self.spark.table("r").where(sf.col("a").outer() 
== sf.col("c")).exists() - ), - self.spark.sql( - """select * from l where exists (select * from r where l.a = r.c)""" - ), - ) + for cond in [ + sf.col("a").outer() == sf.col("c"), + (sf.col("a") == sf.col("c")).outer(), + sf.expr("a = c").outer(), + ]: + with self.subTest(cond=cond): + assertDataFrameEqual( + self.spark.table("l").where(self.spark.table("r").where(cond).exists()), + self.spark.sql( + """select * from l where exists (select * from r where l.a = r.c)""" + ), + ) - assertDataFrameEqual( - self.spark.table("l").where( - self.spark.table("r").where(sf.col("a").outer() == sf.col("c")).exists() - & (sf.col("a") <= sf.lit(2)) - ), - self.spark.sql( - """ + assertDataFrameEqual( + self.spark.table("l").where( + self.spark.table("r").where(cond).exists() + & (sf.col("a") <= sf.lit(2)) + ), + self.spark.sql( + """ select * from l where exists (select * from r where l.a = r.c) and l.a <= 2 """ - ), - ) + ), + ) with self.subTest("NOT EXISTS"): assertDataFrameEqual( @@ -425,70 +459,537 @@ def test_exists_subquery(self): ), ) - def test_scalar_subquery_with_outer_reference_errors(self): + def test_scalar_subquery_with_missing_outer_reference(self): with self.tempView("l", "r"): self.df1.createOrReplaceTempView("l") self.df2.createOrReplaceTempView("r") - with self.subTest("missing `outer()`"): - with self.assertRaises(AnalysisException) as pe: - self.spark.table("l").select( - "a", - ( - self.spark.table("r") - .where(sf.col("c") == sf.col("a")) - .select(sf.sum("d")) - .scalar() - ), - ).collect() + with self.assertRaises(AnalysisException) as pe: + self.spark.table("l").select( + "a", + ( + self.spark.table("r") + .where(sf.col("c") == sf.col("a")) + .select(sf.sum("d")) + .scalar() + ), + ).collect() + + self.check_error( + exception=pe.exception, + errorClass="UNRESOLVED_COLUMN.WITH_SUGGESTION", + messageParameters={"objectName": "`a`", "proposal": "`c`, `d`"}, + query_context_type=QueryContextType.DataFrame, + fragment="col", + ) - self.check_error( - 
exception=pe.exception, - errorClass="UNRESOLVED_COLUMN.WITH_SUGGESTION", - messageParameters={"objectName": "`a`", "proposal": "`c`, `d`"}, - query_context_type=QueryContextType.DataFrame, - fragment="col", - ) + def table1(self): + t1 = self.spark.sql("VALUES (0, 1), (1, 2) AS t1(c1, c2)") + t1.createOrReplaceTempView("t1") + return self.spark.table("t1") - with self.subTest("extra `outer()`"): - with self.assertRaises(AnalysisException) as pe: - self.spark.table("l").select( - "a", - ( - self.spark.table("r") - .where(sf.col("c").outer() == sf.col("a").outer()) - .select(sf.sum("d")) - .scalar() - ), - ).collect() + def table2(self): + t2 = self.spark.sql("VALUES (0, 2), (0, 3) AS t2(c1, c2)") + t2.createOrReplaceTempView("t2") + return self.spark.table("t2") - self.check_error( - exception=pe.exception, - errorClass="UNRESOLVED_COLUMN.WITH_SUGGESTION", - messageParameters={"objectName": "`c`", "proposal": "`a`, `b`"}, - query_context_type=QueryContextType.DataFrame, - fragment="outer", - ) + def table3(self): + t3 = self.spark.sql( + "VALUES (0, ARRAY(0, 1)), (1, ARRAY(2)), (2, ARRAY()), (null, ARRAY(4)) AS t3(c1, c2)" + ) + t3.createOrReplaceTempView("t3") + return self.spark.table("t3") - with self.subTest("missing `outer()` for another outer"): - with self.assertRaises(AnalysisException) as pe: - self.spark.table("l").select( - "a", - ( - self.spark.table("r") - .where(sf.col("b") == sf.col("a").outer()) - .select(sf.sum("d")) - .scalar() - ), - ).collect() + def test_lateral_join_with_single_column_select(self): + with self.tempView("t1", "t2"): + t1 = self.table1() + t2 = self.table2() - self.check_error( - exception=pe.exception, - errorClass="UNRESOLVED_COLUMN.WITH_SUGGESTION", - messageParameters={"objectName": "`b`", "proposal": "`c`, `d`"}, - query_context_type=QueryContextType.DataFrame, - fragment="col", + assertDataFrameEqual( + t1.lateralJoin(self.spark.range(1).select(sf.col("c1").outer())), + self.spark.sql("""SELECT * FROM t1, LATERAL (SELECT 
c1)"""), + ) + assertDataFrameEqual( + t1.lateralJoin(t2.select(sf.col("t1.c1").outer())), + self.spark.sql("""SELECT * FROM t1, LATERAL (SELECT t1.c1 FROM t2)"""), + ) + assertDataFrameEqual( + t1.lateralJoin(t2.select(sf.col("t1.c1").outer() + sf.col("t2.c1"))), + self.spark.sql("""SELECT * FROM t1, LATERAL (SELECT t1.c1 + t2.c1 FROM t2)"""), + ) + + def test_lateral_join_with_star_expansion(self): + with self.tempView("t1", "t2"): + t1 = self.table1() + t2 = self.table2() + + assertDataFrameEqual( + t1.lateralJoin(self.spark.range(1).select().select(sf.col("*"))), + self.spark.sql("""SELECT * FROM t1, LATERAL (SELECT *)"""), + ) + assertDataFrameEqual( + t1.lateralJoin(t2.select(sf.col("*"))), + self.spark.sql("""SELECT * FROM t1, LATERAL (SELECT * FROM t2)"""), + ) + assertDataFrameEqual( + t1.lateralJoin(t2.select(sf.col("t1.*").outer(), sf.col("t2.*"))), + self.spark.sql("""SELECT * FROM t1, LATERAL (SELECT t1.*, t2.* FROM t2)"""), + ) + assertDataFrameEqual( + t1.lateralJoin(t2.alias("t1").select(sf.col("t1.*"))), + self.spark.sql("""SELECT * FROM t1, LATERAL (SELECT t1.* FROM t2 AS t1)"""), + ) + + def test_lateral_join_with_different_join_types(self): + with self.tempView("t1"): + t1 = self.table1() + + assertDataFrameEqual( + t1.lateralJoin( + self.spark.range(1).select( + (sf.col("c1").outer() + sf.col("c2").outer()).alias("c3") + ), + sf.col("c2") == sf.col("c3"), + ), + self.spark.sql( + """SELECT * FROM t1 JOIN LATERAL (SELECT c1 + c2 AS c3) ON c2 = c3""" + ), + ) + assertDataFrameEqual( + t1.lateralJoin( + self.spark.range(1).select( + (sf.col("c1").outer() + sf.col("c2").outer()).alias("c3") + ), + sf.col("c2") == sf.col("c3"), + "left", + ), + self.spark.sql( + """SELECT * FROM t1 LEFT JOIN LATERAL (SELECT c1 + c2 AS c3) ON c2 = c3""" + ), + ) + assertDataFrameEqual( + t1.lateralJoin( + self.spark.range(1).select( + (sf.col("c1").outer() + sf.col("c2").outer()).alias("c3") + ), + how="cross", + ), + self.spark.sql("""SELECT * FROM t1 CROSS JOIN 
LATERAL (SELECT c1 + c2 AS c3)"""), + ) + + with self.assertRaises(AnalysisException) as pe: + t1.lateralJoin( + self.spark.range(1).select( + (sf.col("c1").outer() + sf.col("c2").outer()).alias("c3") + ), + how="right", + ).collect() + + self.check_error( + pe.exception, + errorClass="UNSUPPORTED_JOIN_TYPE", + messageParameters={ + "typ": "right", + "supported": "'inner', 'leftouter', 'left', 'left_outer', 'cross'", + }, + ) + + def test_lateral_join_with_subquery_alias(self): + with self.tempView("t1"): + t1 = self.table1() + + assertDataFrameEqual( + t1.lateralJoin( + self.spark.range(1) + .select(sf.col("c1").outer(), sf.col("c2").outer()) + .toDF("a", "b") + .alias("s") + ).select("a", "b"), + self.spark.sql("""SELECT a, b FROM t1, LATERAL (SELECT c1, c2) s(a, b)"""), + ) + + def test_lateral_join_with_correlated_predicates(self): + with self.tempView("t1", "t2"): + t1 = self.table1() + t2 = self.table2() + + assertDataFrameEqual( + t1.lateralJoin( + t2.where(sf.col("t1.c1").outer() == sf.col("t2.c1")).select(sf.col("c2")) + ), + self.spark.sql( + """SELECT * FROM t1, LATERAL (SELECT c2 FROM t2 WHERE t1.c1 = t2.c1)""" + ), + ) + assertDataFrameEqual( + t1.lateralJoin( + t2.where(sf.col("t1.c1").outer() < sf.col("t2.c1")).select(sf.col("c2")) + ), + self.spark.sql( + """SELECT * FROM t1, LATERAL (SELECT c2 FROM t2 WHERE t1.c1 < t2.c1)""" + ), + ) + + def test_lateral_join_with_aggregation_and_correlated_predicates(self): + with self.tempView("t1", "t2"): + t1 = self.table1() + t2 = self.table2() + + assertDataFrameEqual( + t1.lateralJoin( + t2.where(sf.col("t1.c2").outer() < sf.col("t2.c2")).select( + sf.max(sf.col("c2")).alias("m") + ) + ), + self.spark.sql( + """ + SELECT * FROM t1, LATERAL (SELECT max(c2) AS m FROM t2 WHERE t1.c2 < t2.c2) + """ + ), + ) + + def test_lateral_join_reference_preceding_from_clause_items(self): + with self.tempView("t1", "t2"): + t1 = self.table1() + t2 = self.table2() + + assertDataFrameEqual( + t1.join(t2).lateralJoin( + 
self.spark.range(1).select(sf.col("t1.c2").outer() + sf.col("t2.c2").outer()) + ), + self.spark.sql("""SELECT * FROM t1 JOIN t2 JOIN LATERAL (SELECT t1.c2 + t2.c2)"""), + ) + + def test_multiple_lateral_joins(self): + with self.tempView("t1"): + t1 = self.table1() + + assertDataFrameEqual( + t1.lateralJoin( + self.spark.range(1).select( + (sf.col("c1").outer() + sf.col("c2").outer()).alias("a") + ) + ) + .lateralJoin( + self.spark.range(1).select( + (sf.col("c1").outer() - sf.col("c2").outer()).alias("b") + ) ) + .lateralJoin( + self.spark.range(1).select( + (sf.col("a").outer() * sf.col("b").outer()).alias("c") + ) + ), + self.spark.sql( + """ + SELECT * FROM t1, + LATERAL (SELECT c1 + c2 AS a), + LATERAL (SELECT c1 - c2 AS b), + LATERAL (SELECT a * b AS c) + """ + ), + ) + + def test_lateral_join_in_between_regular_joins(self): + with self.tempView("t1", "t2"): + t1 = self.table1() + t2 = self.table2() + + assertDataFrameEqual( + t1.lateralJoin( + t2.where(sf.col("t1.c1").outer() == sf.col("t2.c1")) + .select(sf.col("c2")) + .alias("s"), + how="left", + ).join(t1.alias("t3"), sf.col("s.c2") == sf.col("t3.c2"), how="left"), + self.spark.sql( + """ + SELECT * FROM t1 + LEFT OUTER JOIN LATERAL (SELECT c2 FROM t2 WHERE t1.c1 = t2.c1) s + LEFT OUTER JOIN t1 t3 ON s.c2 = t3.c2 + """ + ), + ) + + def test_nested_lateral_joins(self): + with self.tempView("t1", "t2"): + t1 = self.table1() + t2 = self.table2() + + assertDataFrameEqual( + t1.lateralJoin(t2.lateralJoin(self.spark.range(1).select(sf.col("c1").outer()))), + self.spark.sql( + """SELECT * FROM t1, LATERAL (SELECT * FROM t2, LATERAL (SELECT c1))""" + ), + ) + assertDataFrameEqual( + t1.lateralJoin( + self.spark.range(1) + .select((sf.col("c1").outer() + sf.lit(1)).alias("c1")) + .lateralJoin(self.spark.range(1).select(sf.col("c1").outer())) + ), + self.spark.sql( + """ + SELECT * FROM t1, + LATERAL (SELECT * FROM (SELECT c1 + 1 AS c1), LATERAL (SELECT c1)) + """ + ), + ) + + def 
test_scalar_subquery_inside_lateral_join(self): + with self.tempView("t1", "t2"): + t1 = self.table1() + t2 = self.table2() + + assertDataFrameEqual( + t1.lateralJoin( + self.spark.range(1).select( + sf.col("c2").outer(), t2.select(sf.min(sf.col("c2"))).scalar() + ) + ), + self.spark.sql( + """SELECT * FROM t1, LATERAL (SELECT c2, (SELECT MIN(c2) FROM t2))""" + ), + ) + assertDataFrameEqual( + t1.lateralJoin( + self.spark.range(1) + .select(sf.col("c1").outer().alias("a")) + .select( + t2.where(sf.col("c1") == sf.col("a").outer()) + .select(sf.sum(sf.col("c2"))) + .scalar() + ) + ), + self.spark.sql( + """ + SELECT * FROM t1, LATERAL ( + SELECT (SELECT SUM(c2) FROM t2 WHERE c1 = a) FROM (SELECT c1 AS a) + ) + """ + ), + ) + + def test_lateral_join_inside_subquery(self): + with self.tempView("t1", "t2"): + t1 = self.table1() + t2 = self.table2() + + assertDataFrameEqual( + t1.where( + sf.col("c1") + == ( + t2.lateralJoin(self.spark.range(1).select(sf.col("c1").outer().alias("a"))) + .select(sf.min(sf.col("a"))) + .scalar() + ) + ), + self.spark.sql( + """ + SELECT * FROM t1 WHERE c1 = (SELECT MIN(a) FROM t2, LATERAL (SELECT c1 AS a)) + """ + ), + ) + assertDataFrameEqual( + t1.where( + sf.col("c1") + == ( + t2.lateralJoin(self.spark.range(1).select(sf.col("c1").outer().alias("a"))) + .where(sf.col("c1") == sf.col("t1.c1").outer()) + .select(sf.min(sf.col("a"))) + .scalar() + ) + ), + self.spark.sql( + """ + SELECT * FROM t1 + WHERE c1 = (SELECT MIN(a) FROM t2, LATERAL (SELECT c1 AS a) WHERE c1 = t1.c1) + """ + ), + ) + + def test_lateral_join_with_table_valued_functions(self): + with self.tempView("t1", "t3"): + t1 = self.table1() + t3 = self.table3() + + assertDataFrameEqual( + t1.lateralJoin(self.spark.tvf.range(3)), + self.spark.sql("""SELECT * FROM t1, LATERAL RANGE(3)"""), + ) + assertDataFrameEqual( + t1.lateralJoin( + self.spark.tvf.explode(sf.array(sf.col("c1").outer(), sf.col("c2").outer())) + ).toDF("c1", "c2", "c3"), + self.spark.sql("""SELECT * FROM t1, 
LATERAL EXPLODE(ARRAY(c1, c2)) t2(c3)"""), + ) + assertDataFrameEqual( + t3.lateralJoin(self.spark.tvf.explode_outer(sf.col("c2").outer())).toDF( + "c1", "c2", "v" + ), + self.spark.sql("""SELECT * FROM t3, LATERAL EXPLODE_OUTER(c2) t2(v)"""), + ) + assertDataFrameEqual( + self.spark.tvf.explode(sf.array(sf.lit(1), sf.lit(2))) + .toDF("v") + .lateralJoin(self.spark.range(1).select((sf.col("v").outer() + 1).alias("v"))), + self.spark.sql( + """SELECT * FROM EXPLODE(ARRAY(1, 2)) t(v), LATERAL (SELECT v + 1 AS v)""" + ), + ) + + def test_lateral_join_with_table_valued_functions_and_join_conditions(self): + with self.tempView("t1", "t3"): + t1 = self.table1() + t3 = self.table3() + + assertDataFrameEqual( + t1.lateralJoin( + self.spark.tvf.explode(sf.array(sf.col("c1").outer(), sf.col("c2").outer())), + sf.col("c1") == sf.col("col"), + ).toDF("c1", "c2", "c3"), + self.spark.sql( + """SELECT * FROM t1 JOIN LATERAL EXPLODE(ARRAY(c1, c2)) t(c3) ON t1.c1 = c3""" + ), + ) + assertDataFrameEqual( + t3.lateralJoin( + self.spark.tvf.explode(sf.col("c2").outer()), + sf.col("c1") == sf.col("col"), + ).toDF("c1", "c2", "c3"), + self.spark.sql("""SELECT * FROM t3 JOIN LATERAL EXPLODE(c2) t(c3) ON t3.c1 = c3"""), + ) + assertDataFrameEqual( + t3.lateralJoin( + self.spark.tvf.explode(sf.col("c2").outer()), + sf.col("c1") == sf.col("col"), + "left", + ).toDF("c1", "c2", "c3"), + self.spark.sql( + """SELECT * FROM t3 LEFT JOIN LATERAL EXPLODE(c2) t(c3) ON t3.c1 = c3""" + ), + ) + + def test_subquery_with_generator_and_tvf(self): + with self.tempView("t1"): + t1 = self.table1() + + assertDataFrameEqual( + self.spark.range(1).select(sf.explode(t1.select(sf.collect_list("c2")).scalar())), + self.spark.sql("""SELECT EXPLODE((SELECT COLLECT_LIST(c2) FROM t1))"""), + ) + assertDataFrameEqual( + self.spark.tvf.explode(t1.select(sf.collect_list("c2")).scalar()), + self.spark.sql("""SELECT * FROM EXPLODE((SELECT COLLECT_LIST(c2) FROM t1))"""), + ) + + def test_subquery_in_join_condition(self): 
+ with self.tempView("t1", "t2"): + t1 = self.table1() + t2 = self.table2() + + assertDataFrameEqual( + t1.join(t2, sf.col("t1.c1") == t1.select(sf.max("c1")).scalar()), + self.spark.sql("""SELECT * FROM t1 JOIN t2 ON t1.c1 = (SELECT MAX(c1) FROM t1)"""), + ) + + def test_subquery_in_unpivot(self): + self.check_subquery_in_unpivot(QueryContextType.DataFrame, "exists") + + def check_subquery_in_unpivot(self, query_context_type, fragment): + with self.tempView("t1", "t2"): + t1 = self.table1() + t2 = self.table2() + + with self.assertRaises(AnalysisException) as pe: + t1.unpivot("c1", t2.exists(), "c1", "c2").collect() + + self.check_error( + exception=pe.exception, + errorClass=( + "UNSUPPORTED_SUBQUERY_EXPRESSION_CATEGORY.UNSUPPORTED_IN_EXISTS_SUBQUERY" + ), + messageParameters={"treeNode": "Expand.*"}, + query_context_type=query_context_type, + fragment=fragment, + matchPVals=True, + ) + + def test_subquery_in_transpose(self): + with self.tempView("t1"): + t1 = self.table1() + + with self.assertRaises(AnalysisException) as pe: + t1.transpose(t1.select(sf.max("c1")).scalar()).collect() + + self.check_error( + exception=pe.exception, + errorClass="TRANSPOSE_INVALID_INDEX_COLUMN", + messageParameters={"reason": "Index column must be an atomic attribute"}, + ) + + def test_subquery_in_with_columns(self): + with self.tempView("t1"): + t1 = self.table1() + + assertDataFrameEqual( + t1.withColumn( + "scalar", + self.spark.range(1) + .select(sf.col("c1").outer() + sf.col("c2").outer()) + .scalar(), + ), + t1.select("*", (sf.col("c1") + sf.col("c2")).alias("scalar")), + ) + assertDataFrameEqual( + t1.withColumn( + "scalar", + self.spark.range(1) + .withColumn("c1", sf.col("c1").outer()) + .select(sf.col("c1") + sf.col("c2").outer()) + .scalar(), + ), + t1.select("*", (sf.col("c1") + sf.col("c2")).alias("scalar")), + ) + assertDataFrameEqual( + t1.withColumn( + "scalar", + self.spark.range(1) + .select(sf.col("c1").outer().alias("c1")) + .withColumn("c2", 
sf.col("c2").outer()) + .select(sf.col("c1") + sf.col("c2")) + .scalar(), + ), + t1.select("*", (sf.col("c1") + sf.col("c2")).alias("scalar")), + ) + + def test_subquery_in_with_columns_renamed(self): + with self.tempView("t1"): + t1 = self.table1() + + assertDataFrameEqual( + t1.withColumn( + "scalar", + self.spark.range(1) + .select(sf.col("c1").outer().alias("c1"), sf.col("c2").outer().alias("c2")) + .withColumnsRenamed({"c1": "x", "c2": "y"}) + .select(sf.col("x") + sf.col("y")) + .scalar(), + ), + t1.select("*", (sf.col("c1").alias("x") + sf.col("c2").alias("y")).alias("scalar")), + ) + + def test_subquery_in_drop(self): + with self.tempView("t1"): + t1 = self.table1() + + assertDataFrameEqual(t1.drop(self.spark.range(1).select(sf.lit("c1")).scalar()), t1) + + def test_subquery_in_repartition(self): + with self.tempView("t1"): + t1 = self.table1() + + assertDataFrameEqual(t1.repartition(self.spark.range(1).select(sf.lit(1)).scalar()), t1) class SubqueryTests(SubqueryTestsMixin, ReusedSQLTestCase): diff --git a/python/pyspark/sql/tests/test_tvf.py b/python/pyspark/sql/tests/test_tvf.py index 5c709437fc4db..c7274c0810cfb 100644 --- a/python/pyspark/sql/tests/test_tvf.py +++ b/python/pyspark/sql/tests/test_tvf.py @@ -52,6 +52,39 @@ def test_explode(self): expected = self.spark.sql("""SELECT * FROM explode(null :: map)""") assertDataFrameEqual(actual=actual, expected=expected) + def test_explode_with_lateral_join(self): + with self.tempView("t1", "t2"): + t1 = self.spark.sql("VALUES (0, 1), (1, 2) AS t1(c1, c2)") + t1.createOrReplaceTempView("t1") + t3 = self.spark.sql( + "VALUES (0, ARRAY(0, 1)), (1, ARRAY(2)), (2, ARRAY()), (null, ARRAY(4)) " + "AS t3(c1, c2)" + ) + t3.createOrReplaceTempView("t3") + + assertDataFrameEqual( + t1.lateralJoin( + self.spark.tvf.explode(sf.array(sf.col("c1").outer(), sf.col("c2").outer())) + .toDF("c3") + .alias("t2") + ), + self.spark.sql("""SELECT * FROM t1, LATERAL EXPLODE(ARRAY(c1, c2)) t2(c3)"""), + ) + assertDataFrameEqual( + 
t3.lateralJoin(self.spark.tvf.explode(sf.col("c2").outer()).toDF("v").alias("t2")), + self.spark.sql("""SELECT * FROM t3, LATERAL EXPLODE(c2) t2(v)"""), + ) + assertDataFrameEqual( + self.spark.tvf.explode(sf.array(sf.lit(1), sf.lit(2))) + .toDF("v") + .lateralJoin( + self.spark.range(1).select((sf.col("v").outer() + sf.lit(1)).alias("v2")) + ), + self.spark.sql( + """SELECT * FROM EXPLODE(ARRAY(1, 2)) t(v), LATERAL (SELECT v + 1 AS v2)""" + ), + ) + def test_explode_outer(self): actual = self.spark.tvf.explode_outer(sf.array(sf.lit(1), sf.lit(2))) expected = self.spark.sql("""SELECT * FROM explode_outer(array(1, 2))""") @@ -81,6 +114,45 @@ def test_explode_outer(self): expected = self.spark.sql("""SELECT * FROM explode_outer(null :: map)""") assertDataFrameEqual(actual=actual, expected=expected) + def test_explode_outer_with_lateral_join(self): + with self.tempView("t1", "t2"): + t1 = self.spark.sql("VALUES (0, 1), (1, 2) AS t1(c1, c2)") + t1.createOrReplaceTempView("t1") + t3 = self.spark.sql( + "VALUES (0, ARRAY(0, 1)), (1, ARRAY(2)), (2, ARRAY()), (null, ARRAY(4)) " + "AS t3(c1, c2)" + ) + t3.createOrReplaceTempView("t3") + + assertDataFrameEqual( + t1.lateralJoin( + self.spark.tvf.explode_outer( + sf.array(sf.col("c1").outer(), sf.col("c2").outer()) + ) + .toDF("c3") + .alias("t2") + ), + self.spark.sql("""SELECT * FROM t1, LATERAL EXPLODE_OUTER(ARRAY(c1, c2)) t2(c3)"""), + ) + assertDataFrameEqual( + t3.lateralJoin( + self.spark.tvf.explode_outer(sf.col("c2").outer()).toDF("v").alias("t2") + ), + self.spark.sql("""SELECT * FROM t3, LATERAL EXPLODE_OUTER(c2) t2(v)"""), + ) + assertDataFrameEqual( + self.spark.tvf.explode_outer(sf.array(sf.lit(1), sf.lit(2))) + .toDF("v") + .lateralJoin( + self.spark.range(1).select((sf.col("v").outer() + sf.lit(1)).alias("v2")) + ), + self.spark.sql( + """ + SELECT * FROM EXPLODE_OUTER(ARRAY(1, 2)) t(v), LATERAL (SELECT v + 1 AS v2) + """ + ), + ) + def test_inline(self): actual = self.spark.tvf.inline( 
sf.array(sf.struct(sf.lit(1), sf.lit("a")), sf.struct(sf.lit(2), sf.lit("b"))) @@ -107,6 +179,35 @@ def test_inline(self): ) assertDataFrameEqual(actual=actual, expected=expected) + def test_inline_with_lateral_join(self): + with self.tempView("array_struct"): + array_struct = self.spark.sql( + """ + VALUES + (1, ARRAY(STRUCT(1, 'a'), STRUCT(2, 'b'))), + (2, ARRAY()), + (3, ARRAY(STRUCT(3, 'c'))) AS array_struct(id, arr) + """ + ) + array_struct.createOrReplaceTempView("array_struct") + + assertDataFrameEqual( + array_struct.lateralJoin(self.spark.tvf.inline(sf.col("arr").outer())), + self.spark.sql("""SELECT * FROM array_struct JOIN LATERAL INLINE(arr)"""), + ) + assertDataFrameEqual( + array_struct.lateralJoin( + self.spark.tvf.inline(sf.col("arr").outer()).toDF("k", "v").alias("t"), + sf.col("id") == sf.col("k"), + "left", + ), + self.spark.sql( + """ + SELECT * FROM array_struct LEFT JOIN LATERAL INLINE(arr) t(k, v) ON id = k + """ + ), + ) + def test_inline_outer(self): actual = self.spark.tvf.inline_outer( sf.array(sf.struct(sf.lit(1), sf.lit("a")), sf.struct(sf.lit(2), sf.lit("b"))) @@ -137,6 +238,35 @@ def test_inline_outer(self): ) assertDataFrameEqual(actual=actual, expected=expected) + def test_inline_outer_with_lateral_join(self): + with self.tempView("array_struct"): + array_struct = self.spark.sql( + """ + VALUES + (1, ARRAY(STRUCT(1, 'a'), STRUCT(2, 'b'))), + (2, ARRAY()), + (3, ARRAY(STRUCT(3, 'c'))) AS array_struct(id, arr) + """ + ) + array_struct.createOrReplaceTempView("array_struct") + + assertDataFrameEqual( + array_struct.lateralJoin(self.spark.tvf.inline_outer(sf.col("arr").outer())), + self.spark.sql("""SELECT * FROM array_struct JOIN LATERAL INLINE_OUTER(arr)"""), + ) + assertDataFrameEqual( + array_struct.lateralJoin( + self.spark.tvf.inline_outer(sf.col("arr").outer()).toDF("k", "v").alias("t"), + sf.col("id") == sf.col("k"), + "left", + ), + self.spark.sql( + """ + SELECT * FROM array_struct LEFT JOIN LATERAL INLINE_OUTER(arr) t(k, v) 
ON id = k + """ + ), + ) + def test_json_tuple(self): actual = self.spark.tvf.json_tuple(sf.lit('{"a":1, "b":2}'), sf.lit("a"), sf.lit("b")) expected = self.spark.sql("""SELECT json_tuple('{"a":1, "b":2}', 'a', 'b')""") @@ -151,6 +281,64 @@ def test_json_tuple(self): messageParameters={"item": "field"}, ) + def test_json_tuple_with_lateral_join(self): + with self.tempView("json_table"): + json_table = self.spark.sql( + """ + VALUES + ('1', '{"f1": "1", "f2": "2", "f3": 3, "f5": 5.23}'), + ('2', '{"f1": "1", "f3": "3", "f2": 2, "f4": 4.01}'), + ('3', '{"f1": 3, "f4": "4", "f3": "3", "f2": 2, "f5": 5.01}'), + ('4', cast(null as string)), + ('5', '{"f1": null, "f5": ""}'), + ('6', '[invalid JSON string]') AS json_table(key, jstring) + """ + ) + json_table.createOrReplaceTempView("json_table") + + assertDataFrameEqual( + json_table.alias("t1") + .lateralJoin( + self.spark.tvf.json_tuple( + sf.col("jstring").outer(), + sf.lit("f1"), + sf.lit("f2"), + sf.lit("f3"), + sf.lit("f4"), + sf.lit("f5"), + ).alias("t2") + ) + .select("t1.key", "t2.*"), + self.spark.sql( + """ + SELECT t1.key, t2.* FROM json_table t1, + LATERAL json_tuple(t1.jstring, 'f1', 'f2', 'f3', 'f4', 'f5') t2 + """ + ), + ) + assertDataFrameEqual( + json_table.alias("t1") + .lateralJoin( + self.spark.tvf.json_tuple( + sf.col("jstring").outer(), + sf.lit("f1"), + sf.lit("f2"), + sf.lit("f3"), + sf.lit("f4"), + sf.lit("f5"), + ).alias("t2") + ) + .where(sf.col("t2.c0").isNotNull()) + .select("t1.key", "t2.*"), + self.spark.sql( + """ + SELECT t1.key, t2.* FROM json_table t1, + LATERAL json_tuple(t1.jstring, 'f1', 'f2', 'f3', 'f4', 'f5') t2 + WHERE t2.c0 IS NOT NULL + """ + ), + ) + def test_posexplode(self): actual = self.spark.tvf.posexplode(sf.array(sf.lit(1), sf.lit(2))) expected = self.spark.sql("""SELECT * FROM posexplode(array(1, 2))""") @@ -180,6 +368,39 @@ def test_posexplode(self): expected = self.spark.sql("""SELECT * FROM posexplode(null :: map)""") assertDataFrameEqual(actual=actual, 
expected=expected) + def test_posexplode_with_lateral_join(self): + with self.tempView("t1", "t2"): + t1 = self.spark.sql("VALUES (0, 1), (1, 2) AS t1(c1, c2)") + t1.createOrReplaceTempView("t1") + t3 = self.spark.sql( + "VALUES (0, ARRAY(0, 1)), (1, ARRAY(2)), (2, ARRAY()), (null, ARRAY(4)) " + "AS t3(c1, c2)" + ) + t3.createOrReplaceTempView("t3") + + assertDataFrameEqual( + t1.lateralJoin( + self.spark.tvf.posexplode(sf.array(sf.col("c1").outer(), sf.col("c2").outer())) + ), + self.spark.sql("""SELECT * FROM t1, LATERAL POSEXPLODE(ARRAY(c1, c2))"""), + ) + assertDataFrameEqual( + t3.lateralJoin(self.spark.tvf.posexplode(sf.col("c2").outer())), + self.spark.sql("""SELECT * FROM t3, LATERAL POSEXPLODE(c2)"""), + ) + assertDataFrameEqual( + self.spark.tvf.posexplode(sf.array(sf.lit(1), sf.lit(2))) + .toDF("p", "v") + .lateralJoin( + self.spark.range(1).select((sf.col("v").outer() + sf.lit(1)).alias("v2")) + ), + self.spark.sql( + """ + SELECT * FROM POSEXPLODE(ARRAY(1, 2)) t(p, v), LATERAL (SELECT v + 1 AS v2) + """ + ), + ) + def test_posexplode_outer(self): actual = self.spark.tvf.posexplode_outer(sf.array(sf.lit(1), sf.lit(2))) expected = self.spark.sql("""SELECT * FROM posexplode_outer(array(1, 2))""") @@ -209,11 +430,95 @@ def test_posexplode_outer(self): expected = self.spark.sql("""SELECT * FROM posexplode_outer(null :: map)""") assertDataFrameEqual(actual=actual, expected=expected) + def test_posexplode_outer_with_lateral_join(self): + with self.tempView("t1", "t2"): + t1 = self.spark.sql("VALUES (0, 1), (1, 2) AS t1(c1, c2)") + t1.createOrReplaceTempView("t1") + t3 = self.spark.sql( + "VALUES (0, ARRAY(0, 1)), (1, ARRAY(2)), (2, ARRAY()), (null, ARRAY(4)) " + "AS t3(c1, c2)" + ) + t3.createOrReplaceTempView("t3") + + assertDataFrameEqual( + t1.lateralJoin( + self.spark.tvf.posexplode_outer( + sf.array(sf.col("c1").outer(), sf.col("c2").outer()) + ) + ), + self.spark.sql("""SELECT * FROM t1, LATERAL POSEXPLODE_OUTER(ARRAY(c1, c2))"""), + ) + 
assertDataFrameEqual( + t3.lateralJoin(self.spark.tvf.posexplode_outer(sf.col("c2").outer())), + self.spark.sql("""SELECT * FROM t3, LATERAL POSEXPLODE_OUTER(c2)"""), + ) + assertDataFrameEqual( + self.spark.tvf.posexplode_outer(sf.array(sf.lit(1), sf.lit(2))) + .toDF("p", "v") + .lateralJoin( + self.spark.range(1).select((sf.col("v").outer() + sf.lit(1)).alias("v2")) + ), + self.spark.sql( + """ + SELECT * FROM POSEXPLODE_OUTER(ARRAY(1, 2)) t(p, v), + LATERAL (SELECT v + 1 AS v2) + """ + ), + ) + def test_stack(self): actual = self.spark.tvf.stack(sf.lit(2), sf.lit(1), sf.lit(2), sf.lit(3)) expected = self.spark.sql("""SELECT * FROM stack(2, 1, 2, 3)""") assertDataFrameEqual(actual=actual, expected=expected) + def test_stack_with_lateral_join(self): + with self.tempView("t1", "t3"): + t1 = self.spark.sql("VALUES (0, 1), (1, 2) AS t1(c1, c2)") + t1.createOrReplaceTempView("t1") + t3 = self.spark.sql( + "VALUES (0, ARRAY(0, 1)), (1, ARRAY(2)), (2, ARRAY()), (null, ARRAY(4)) " + "AS t3(c1, c2)" + ) + t3.createOrReplaceTempView("t3") + + assertDataFrameEqual( + t1.lateralJoin( + self.spark.tvf.stack( + sf.lit(2), + sf.lit("Key"), + sf.col("c1").outer(), + sf.lit("Value"), + sf.col("c2").outer(), + ).alias("t") + ).select("t.*"), + self.spark.sql( + """SELECT t.* FROM t1, LATERAL stack(2, 'Key', c1, 'Value', c2) t""" + ), + ) + assertDataFrameEqual( + t1.lateralJoin( + self.spark.tvf.stack(sf.lit(1), sf.col("c1").outer(), sf.col("c2").outer()) + .toDF("x", "y") + .alias("t") + ).select("t.*"), + self.spark.sql("""SELECT t.* FROM t1 JOIN LATERAL stack(1, c1, c2) t(x, y)"""), + ) + assertDataFrameEqual( + t1.join(t3, sf.col("t1.c1") == sf.col("t3.c1")) + .lateralJoin( + self.spark.tvf.stack( + sf.lit(1), sf.col("t1.c2").outer(), sf.col("t3.c2").outer() + ).alias("t") + ) + .select("t.*"), + self.spark.sql( + """ + SELECT t.* FROM t1 JOIN t3 ON t1.c1 = t3.c1 + JOIN LATERAL stack(1, t1.c2, t3.c2) t + """ + ), + ) + def test_collations(self): actual = 
self.spark.tvf.collations() expected = self.spark.sql("""SELECT * FROM collations()""") @@ -256,6 +561,31 @@ def test_variant_explode(self): expected = self.spark.sql("""SELECT * FROM variant_explode(parse_json('1'))""") assertDataFrameEqual(actual=actual, expected=expected) + def test_variant_explode_with_lateral_join(self): + with self.tempView("variant_table"): + variant_table = self.spark.sql( + """ + SELECT id, parse_json(v) AS v FROM VALUES + (0, '["hello", "world"]'), (1, '{"a": true, "b": 3.14}'), + (2, '[]'), (3, '{}'), + (4, NULL), (5, '1') + AS t(id, v) + """ + ) + variant_table.createOrReplaceTempView("variant_table") + + assertDataFrameEqual( + variant_table.alias("t1") + .lateralJoin(self.spark.tvf.variant_explode(sf.col("v").outer()).alias("t")) + .select("t1.id", "t.*"), + self.spark.sql( + """ + SELECT t1.id, t.* FROM variant_table AS t1, + LATERAL variant_explode(v) AS t + """ + ), + ) + def test_variant_explode_outer(self): actual = self.spark.tvf.variant_explode_outer(sf.parse_json(sf.lit('["hello", "world"]'))) expected = self.spark.sql( @@ -290,6 +620,31 @@ def test_variant_explode_outer(self): expected = self.spark.sql("""SELECT * FROM variant_explode_outer(parse_json('1'))""") assertDataFrameEqual(actual=actual, expected=expected) + def test_variant_explode_outer_with_lateral_join(self): + with self.tempView("variant_table"): + variant_table = self.spark.sql( + """ + SELECT id, parse_json(v) AS v FROM VALUES + (0, '["hello", "world"]'), (1, '{"a": true, "b": 3.14}'), + (2, '[]'), (3, '{}'), + (4, NULL), (5, '1') + AS t(id, v) + """ + ) + variant_table.createOrReplaceTempView("variant_table") + + assertDataFrameEqual( + variant_table.alias("t1") + .lateralJoin(self.spark.tvf.variant_explode_outer(sf.col("v").outer()).alias("t")) + .select("t1.id", "t.*"), + self.spark.sql( + """ + SELECT t1.id, t.* FROM variant_table AS t1, + LATERAL variant_explode_outer(v) AS t + """ + ), + ) + class TVFTests(TVFTestsMixin, ReusedSQLTestCase): pass diff 
--git a/python/pyspark/sql/tests/test_types.py b/python/pyspark/sql/tests/test_types.py index dcc383b7add5a..ab05502ad229d 100644 --- a/python/pyspark/sql/tests/test_types.py +++ b/python/pyspark/sql/tests/test_types.py @@ -2240,6 +2240,60 @@ def test_variant_type(self): PySparkValueError, lambda: str(VariantVal(bytes([32, 10, 1, 0, 0, 0]), metadata)) ) + # check parse_json + for key, json, obj in expected_values: + self.assertEqual(VariantVal.parseJson(json).toJson(), json) + self.assertEqual(VariantVal.parseJson(json).toPython(), obj) + + # compare the parse_json in Spark vs python. `json_str` contains all of `expected_values`. + parse_json_spark_output = variants[0] + parse_json_python_output = VariantVal.parseJson(json_str) + self.assertEqual(parse_json_spark_output.value, parse_json_python_output.value) + self.assertEqual(parse_json_spark_output.metadata, parse_json_python_output.metadata) + + # Test createDataFrame + create_df_variants = self.spark.createDataFrame( + [ + ( + VariantVal.parseJson("2"), + [VariantVal.parseJson("3")], + {"v": VariantVal.parseJson("4")}, + {"v": VariantVal.parseJson("5")}, + ), + (None, [None], {"v": None}, {"v": None}), + (None, None, None, None), + ], + "v variant, a array, s struct, m map", + ).collect() + self.assertEqual(create_df_variants[0][0].toJson(), "2") + self.assertEqual(create_df_variants[0][1][0].toJson(), "3") + self.assertEqual(create_df_variants[0][2][0].toJson(), "4") + self.assertEqual(create_df_variants[0][3]["v"].toJson(), "5") + self.assertEqual(create_df_variants[1][0], None) + self.assertEqual(create_df_variants[1][1][0], None) + self.assertEqual(create_df_variants[1][2][0], None) + self.assertEqual(create_df_variants[1][3]["v"], None) + self.assertEqual(create_df_variants[2][0], None) + self.assertEqual(create_df_variants[2][1], None) + self.assertEqual(create_df_variants[2][2], None) + self.assertEqual(create_df_variants[2][3], None) + + def test_to_ddl(self): + schema = StructType().add("a", 
NullType()).add("b", BooleanType()).add("c", BinaryType()) + self.assertEqual(schema.toDDL(), "a VOID,b BOOLEAN,c BINARY") + + schema = StructType().add("a", IntegerType()).add("b", StringType()) + self.assertEqual(schema.toDDL(), "a INT,b STRING") + + schema = StructType().add("a", FloatType()).add("b", LongType(), False) + self.assertEqual(schema.toDDL(), "a FLOAT,b BIGINT NOT NULL") + + schema = StructType().add("a", ArrayType(DoubleType()), False).add("b", DateType()) + self.assertEqual(schema.toDDL(), "a ARRAY NOT NULL,b DATE") + + schema = StructType().add("a", TimestampType()).add("b", TimestampNTZType()) + self.assertEqual(schema.toDDL(), "a TIMESTAMP,b TIMESTAMP_NTZ") + def test_from_ddl(self): self.assertEqual(DataType.fromDDL("long"), LongType()) self.assertEqual( diff --git a/python/pyspark/sql/tests/test_udf.py b/python/pyspark/sql/tests/test_udf.py index 78aa2546128a1..67d243cd29244 100644 --- a/python/pyspark/sql/tests/test_udf.py +++ b/python/pyspark/sql/tests/test_udf.py @@ -220,7 +220,7 @@ def test_udf_in_filter_on_top_of_outer_join(self): right = self.spark.createDataFrame([Row(a=1)]) df = left.join(right, on="a", how="left_outer") df = df.withColumn("b", udf(lambda x: "x")(df.a)) - self.assertEqual(df.filter('b = "x"').collect(), [Row(a=1, b="x")]) + assertDataFrameEqual(df.filter('b = "x"'), [Row(a=1, b="x")]) def test_udf_in_filter_on_top_of_join(self): # regression test for SPARK-18589 @@ -228,7 +228,7 @@ def test_udf_in_filter_on_top_of_join(self): right = self.spark.createDataFrame([Row(b=1)]) f = udf(lambda a, b: a == b, BooleanType()) df = left.crossJoin(right).filter(f("a", "b")) - self.assertEqual(df.collect(), [Row(a=1, b=1)]) + assertDataFrameEqual(df, [Row(a=1, b=1)]) def test_udf_in_join_condition(self): # regression test for SPARK-25314 @@ -243,7 +243,7 @@ def test_udf_in_join_condition(self): df.collect() with self.sql_conf({"spark.sql.crossJoin.enabled": True}): df = left.join(right, f("a", "b")) - self.assertEqual(df.collect(), 
[Row(a=1, b=1)]) + assertDataFrameEqual(df, [Row(a=1, b=1)]) def test_udf_in_left_outer_join_condition(self): # regression test for SPARK-26147 @@ -256,7 +256,7 @@ def test_udf_in_left_outer_join_condition(self): # The Python UDF only refer to attributes from one side, so it's evaluable. df = left.join(right, f("a") == col("b").cast("string"), how="left_outer") with self.sql_conf({"spark.sql.crossJoin.enabled": True}): - self.assertEqual(df.collect(), [Row(a=1, b=1)]) + assertDataFrameEqual(df, [Row(a=1, b=1)]) def test_udf_and_common_filter_in_join_condition(self): # regression test for SPARK-25314 @@ -266,7 +266,7 @@ def test_udf_and_common_filter_in_join_condition(self): f = udf(lambda a, b: a == b, BooleanType()) df = left.join(right, [f("a", "b"), left.a1 == right.b1]) # do not need spark.sql.crossJoin.enabled=true for udf is not the only join condition. - self.assertEqual(df.collect(), [Row(a=1, a1=1, a2=1, b=1, b1=1, b2=1)]) + assertDataFrameEqual(df, [Row(a=1, a1=1, a2=1, b=1, b1=1, b2=1)]) def test_udf_not_supported_in_join_condition(self): # regression test for SPARK-25314 @@ -294,7 +294,7 @@ def test_udf_as_join_condition(self): f = udf(lambda a: a, IntegerType()) df = left.join(right, [f("a") == f("b"), left.a1 == right.b1]) - self.assertEqual(df.collect(), [Row(a=1, a1=1, a2=1, b=1, b1=1, b2=1)]) + assertDataFrameEqual(df, [Row(a=1, a1=1, a2=1, b=1, b1=1, b2=1)]) def test_udf_without_arguments(self): self.spark.catalog.registerFunction("foo", lambda: "bar") @@ -331,7 +331,7 @@ def test_udf_with_filter_function(self): my_filter = udf(lambda a: a < 2, BooleanType()) sel = df.select(col("key"), col("value")).filter((my_filter(col("key"))) & (df.value < "2")) - self.assertEqual(sel.collect(), [Row(key=1, value="1")]) + assertDataFrameEqual(sel, [Row(key=1, value="1")]) def test_udf_with_variant_input(self): df = self.spark.range(0, 10).selectExpr("parse_json(cast(id as string)) v") @@ -461,7 +461,7 @@ def test_udf_with_aggregate_function(self): my_filter = 
udf(lambda a: a == 1, BooleanType()) sel = df.select(col("key")).distinct().filter(my_filter(col("key"))) - self.assertEqual(sel.collect(), [Row(key=1)]) + assertDataFrameEqual(sel, [Row(key=1)]) my_copy = udf(lambda x: x, IntegerType()) my_add = udf(lambda a, b: int(a + b), IntegerType()) @@ -471,7 +471,7 @@ def test_udf_with_aggregate_function(self): .agg(sum(my_strlen(col("value"))).alias("s")) .select(my_add(col("k"), col("s")).alias("t")) ) - self.assertEqual(sel.collect(), [Row(t=4), Row(t=3)]) + assertDataFrameEqual(sel, [Row(t=4), Row(t=3)]) def test_udf_in_generate(self): from pyspark.sql.functions import explode @@ -505,7 +505,7 @@ def test_udf_with_order_by_and_limit(self): my_copy = udf(lambda x: x, IntegerType()) df = self.spark.range(10).orderBy("id") res = df.select(df.id, my_copy(df.id).alias("copy")).limit(1) - self.assertEqual(res.collect(), [Row(id=0, copy=0)]) + assertDataFrameEqual(res, [Row(id=0, copy=0)]) def test_udf_registration_returns_udf(self): df = self.spark.range(10) @@ -838,12 +838,12 @@ def test_datasource_with_udf(self): for df in [filesource_df, datasource_df, datasource_v2_df]: result = df.withColumn("c", c1) expected = df.withColumn("c", lit(2)) - self.assertEqual(expected.collect(), result.collect()) + assertDataFrameEqual(expected, result) for df in [filesource_df, datasource_df, datasource_v2_df]: result = df.withColumn("c", c2) expected = df.withColumn("c", col("i") + 1) - self.assertEqual(expected.collect(), result.collect()) + assertDataFrameEqual(expected, result) for df in [filesource_df, datasource_df, datasource_v2_df]: for f in [f1, f2]: @@ -902,7 +902,7 @@ def test_udf_in_subquery(self): result = self.spark.sql( "select i from values(0L) as data(i) where i in (select id from v)" ) - self.assertEqual(result.collect(), [Row(i=0)]) + assertDataFrameEqual(result, [Row(i=0)]) def test_udf_globals_not_overwritten(self): @udf("string") @@ -1229,6 +1229,33 @@ def setUpClass(cls): super(BaseUDFTestsMixin, cls).setUpClass() 
cls.spark.conf.set("spark.sql.execution.pythonUDF.arrow.enabled", "false") + # We cannot check whether the batch size is effective or not. We just run the query with + # various batch size and see whether the query runs successfully, and the output is + # consistent across different batch sizes. + def test_udf_with_various_batch_size(self): + self.spark.catalog.registerFunction("twoArgs", lambda x, y: len(x) + y, IntegerType()) + for batch_size in [1, 33, 1000, 2000]: + with self.sql_conf({"spark.sql.execution.python.udf.maxRecordsPerBatch": batch_size}): + df = self.spark.range(1000).selectExpr("twoArgs('test', id) AS ret").orderBy("ret") + rets = [x["ret"] for x in df.collect()] + self.assertEqual(rets, list(range(4, 1004))) + + # We cannot check whether the buffer size is effective or not. We just run the query with + # various buffer size and see whether the query runs successfully, and the output is + # consistent across different batch sizes. + def test_udf_with_various_buffer_size(self): + self.spark.catalog.registerFunction("twoArgs", lambda x, y: len(x) + y, IntegerType()) + for batch_size in [1, 33, 10000]: + with self.sql_conf({"spark.sql.execution.python.udf.buffer.size": batch_size}): + df = ( + self.spark.range(1000) + .repartition(1) + .selectExpr("twoArgs('test', id) AS ret") + .orderBy("ret") + ) + rets = [x["ret"] for x in df.collect()] + self.assertEqual(rets, list(range(4, 1004))) + class UDFInitializationTests(unittest.TestCase): def tearDown(self): diff --git a/python/pyspark/sql/tests/test_udf_profiler.py b/python/pyspark/sql/tests/test_udf_profiler.py index 7e752b2edca1f..7c741bce51f77 100644 --- a/python/pyspark/sql/tests/test_udf_profiler.py +++ b/python/pyspark/sql/tests/test_udf_profiler.py @@ -31,21 +31,15 @@ from pyspark.sql.functions import col, pandas_udf, udf from pyspark.sql.window import Window from pyspark.profiler import UDFBasicProfiler -from pyspark.testing.sqlutils import ( - ReusedSQLTestCase, +from pyspark.testing.sqlutils 
import ReusedSQLTestCase +from pyspark.testing.utils import ( have_pandas, have_pyarrow, + have_flameprof, pandas_requirement_message, pyarrow_requirement_message, ) -try: - import flameprof # noqa: F401 - - has_flameprof = True -except ImportError: - has_flameprof = False - def _do_computation(spark, *, action=lambda df: df.collect(), use_arrow=False): @udf("long", useArrow=use_arrow) @@ -208,7 +202,7 @@ def test_perf_profiler_udf(self): ) self.assertTrue(f"udf_{id}_perf.pstats" in os.listdir(d)) - if has_flameprof: + if have_flameprof: self.assertIn("svg", self.spark.profile.render(id)) @unittest.skipIf( @@ -230,7 +224,7 @@ def test_perf_profiler_udf_with_arrow(self): io.getvalue(), f"10.*{os.path.basename(inspect.getfile(_do_computation))}" ) - if has_flameprof: + if have_flameprof: self.assertIn("svg", self.spark.profile.render(id)) def test_perf_profiler_udf_multiple_actions(self): @@ -252,7 +246,7 @@ def action(df): io.getvalue(), f"20.*{os.path.basename(inspect.getfile(_do_computation))}" ) - if has_flameprof: + if have_flameprof: self.assertIn("svg", self.spark.profile.render(id)) def test_perf_profiler_udf_registered(self): @@ -276,7 +270,7 @@ def add1(x): io.getvalue(), f"10.*{os.path.basename(inspect.getfile(_do_computation))}" ) - if has_flameprof: + if have_flameprof: self.assertIn("svg", self.spark.profile.render(id)) @unittest.skipIf( @@ -309,7 +303,7 @@ def add2(x): io.getvalue(), f"2.*{os.path.basename(inspect.getfile(_do_computation))}" ) - if has_flameprof: + if have_flameprof: self.assertIn("svg", self.spark.profile.render(id)) @unittest.skipIf( @@ -345,7 +339,7 @@ def add2(iter: Iterator[pd.Series]) -> Iterator[pd.Series]: io.getvalue(), f"2.*{os.path.basename(inspect.getfile(_do_computation))}" ) - if has_flameprof: + if have_flameprof: self.assertIn("svg", self.spark.profile.render(id)) @unittest.skipIf( @@ -395,7 +389,7 @@ def mean_udf(v: pd.Series) -> float: io.getvalue(), f"5.*{os.path.basename(inspect.getfile(_do_computation))}" ) - if 
has_flameprof: + if have_flameprof: self.assertIn("svg", self.spark.profile.render(id)) @unittest.skipIf( @@ -427,7 +421,7 @@ def min_udf(v: pd.Series) -> float: io.getvalue(), f"2.*{os.path.basename(inspect.getfile(_do_computation))}" ) - if has_flameprof: + if have_flameprof: self.assertIn("svg", self.spark.profile.render(id)) @unittest.skipIf( @@ -458,7 +452,7 @@ def normalize(pdf): io.getvalue(), f"2.*{os.path.basename(inspect.getfile(_do_computation))}" ) - if has_flameprof: + if have_flameprof: self.assertIn("svg", self.spark.profile.render(id)) @unittest.skipIf( @@ -496,7 +490,7 @@ def asof_join(left, right): io.getvalue(), f"2.*{os.path.basename(inspect.getfile(_do_computation))}" ) - if has_flameprof: + if have_flameprof: self.assertIn("svg", self.spark.profile.render(id)) @unittest.skipIf( @@ -530,7 +524,7 @@ def normalize(table): io.getvalue(), f"2.*{os.path.basename(inspect.getfile(_do_computation))}" ) - if has_flameprof: + if have_flameprof: self.assertIn("svg", self.spark.profile.render(id)) @unittest.skipIf( @@ -562,7 +556,7 @@ def summarize(left, right): io.getvalue(), f"2.*{os.path.basename(inspect.getfile(_do_computation))}" ) - if has_flameprof: + if have_flameprof: self.assertIn("svg", self.spark.profile.render(id)) def test_perf_profiler_render(self): @@ -572,7 +566,7 @@ def test_perf_profiler_render(self): id = list(self.profile_results.keys())[0] - if has_flameprof: + if have_flameprof: self.assertIn("svg", self.spark.profile.render(id)) self.assertIn("svg", self.spark.profile.render(id, type="perf")) self.assertIn("svg", self.spark.profile.render(id, renderer="flameprof")) diff --git a/python/pyspark/sql/tests/test_udtf.py b/python/pyspark/sql/tests/test_udtf.py index f3f993fc6a787..eca3ab0013b92 100644 --- a/python/pyspark/sql/tests/test_udtf.py +++ b/python/pyspark/sql/tests/test_udtf.py @@ -27,10 +27,12 @@ PySparkTypeError, AnalysisException, PySparkPicklingError, + IllegalArgumentException, ) from pyspark.util import PythonEvalType from 
pyspark.sql.functions import ( array, + col, create_map, array, lit, @@ -74,8 +76,7 @@ def eval(self): yield "hello", "world" func = udtf(TestUDTF, returnType="c1: string, c2: string") - rows = func().collect() - self.assertEqual(rows, [Row(c1="hello", c2="world")]) + assertDataFrameEqual(func(), [Row(c1="hello", c2="world")]) def test_udtf_yield_single_row_col(self): class TestUDTF: @@ -83,8 +84,7 @@ def eval(self, a: int): yield a, func = udtf(TestUDTF, returnType="a: int") - rows = func(lit(1)).collect() - self.assertEqual(rows, [Row(a=1)]) + assertDataFrameEqual(func(lit(1)), [Row(a=1)]) def test_udtf_yield_multi_cols(self): class TestUDTF: @@ -92,8 +92,7 @@ def eval(self, a: int): yield a, a + 1 func = udtf(TestUDTF, returnType="a: int, b: int") - rows = func(lit(1)).collect() - self.assertEqual(rows, [Row(a=1, b=2)]) + assertDataFrameEqual(func(lit(1)), [Row(a=1, b=2)]) def test_udtf_yield_multi_rows(self): class TestUDTF: @@ -102,8 +101,7 @@ def eval(self, a: int): yield a + 1, func = udtf(TestUDTF, returnType="a: int") - rows = func(lit(1)).collect() - self.assertEqual(rows, [Row(a=1), Row(a=2)]) + assertDataFrameEqual(func(lit(1)), [Row(a=1), Row(a=2)]) def test_udtf_yield_multi_row_col(self): class TestUDTF: @@ -113,8 +111,8 @@ def eval(self, a: int, b: int): yield a, b, b - a func = udtf(TestUDTF, returnType="a: int, b: int, c: int") - rows = func(lit(1), lit(2)).collect() - self.assertEqual(rows, [Row(a=1, b=2, c=3), Row(a=1, b=2, c=-1), Row(a=1, b=2, c=1)]) + res = func(lit(1), lit(2)) + assertDataFrameEqual(res, [Row(a=1, b=2, c=3), Row(a=1, b=2, c=-1), Row(a=1, b=2, c=1)]) def test_udtf_decorator(self): @udtf(returnType="a: int, b: int") @@ -122,8 +120,7 @@ class TestUDTF: def eval(self, a: int): yield a, a + 1 - rows = TestUDTF(lit(1)).collect() - self.assertEqual(rows, [Row(a=1, b=2)]) + assertDataFrameEqual(TestUDTF(lit(1)), [Row(a=1, b=2)]) def test_udtf_registration(self): class TestUDTF: @@ -135,9 +132,7 @@ def eval(self, a: int, b: int): func 
= udtf(TestUDTF, returnType="a: int, b: int, c: int") self.spark.udtf.register("testUDTF", func) df = self.spark.sql("SELECT * FROM testUDTF(1, 2)") - self.assertEqual( - df.collect(), [Row(a=1, b=2, c=3), Row(a=1, b=2, c=-1), Row(a=1, b=2, c=1)] - ) + assertDataFrameEqual(df, [Row(a=1, b=2, c=3), Row(a=1, b=2, c=-1), Row(a=1, b=2, c=1)]) def test_udtf_with_lateral_join(self): class TestUDTF: @@ -150,10 +145,33 @@ def eval(self, a: int, b: int) -> Iterator: df = self.spark.sql( "SELECT f.* FROM values (0, 1), (1, 2) t(a, b), LATERAL testUDTF(a, b) f" ) + schema = StructType( + [ + StructField("a", IntegerType(), True), + StructField("b", IntegerType(), True), + StructField("c", IntegerType(), True), + ] + ) expected = self.spark.createDataFrame( - [(0, 1, 1), (0, 1, -1), (1, 2, 3), (1, 2, -1)], schema=["a", "b", "c"] + [(0, 1, 1), (0, 1, -1), (1, 2, 3), (1, 2, -1)], schema=schema + ) + assertDataFrameEqual(df, expected) + + def test_udtf_with_lateral_join_dataframe(self): + @udtf(returnType="a: int, b: int, c: int") + class TestUDTF: + def eval(self, a: int, b: int) -> Iterator: + yield a, b, a + b + yield a, b, a - b + + self.spark.udtf.register("testUDTF", TestUDTF) + + assertDataFrameEqual( + self.spark.sql("values (0, 1), (1, 2) t(a, b)").lateralJoin( + TestUDTF(col("a").outer(), col("b").outer()) + ), + self.spark.sql("SELECT * FROM values (0, 1), (1, 2) t(a, b), LATERAL testUDTF(a, b)"), ) - self.assertEqual(df.collect(), expected.collect()) def test_udtf_eval_with_return_stmt(self): class TestUDTF: @@ -161,8 +179,8 @@ def eval(self, a: int, b: int): return [(a, a + 1), (b, b + 1)] func = udtf(TestUDTF, returnType="a: int, b: int") - rows = func(lit(1), lit(2)).collect() - self.assertEqual(rows, [Row(a=1, b=2), Row(a=2, b=3)]) + res = func(lit(1), lit(2)) + assertDataFrameEqual(res, [Row(a=1, b=2), Row(a=2, b=3)]) def test_udtf_eval_returning_non_tuple(self): @udtf(returnType="a: int") @@ -217,14 +235,14 @@ class TestUDTF: def eval(self, a: int): ... 
- self.assertEqual(TestUDTF(lit(1)).collect(), []) + assertDataFrameEqual(TestUDTF(lit(1)), []) @udtf(returnType="a: int") class TestUDTF: def eval(self, a: int): return - self.assertEqual(TestUDTF(lit(1)).collect(), []) + assertDataFrameEqual(TestUDTF(lit(1)), []) def test_udtf_with_conditional_return(self): class TestUDTF: @@ -234,11 +252,25 @@ def eval(self, a: int): func = udtf(TestUDTF, returnType="a: int") self.spark.udtf.register("test_udtf", func) - self.assertEqual( - self.spark.sql("SELECT * FROM range(0, 8) JOIN LATERAL test_udtf(id)").collect(), + assertDataFrameEqual( + self.spark.sql("SELECT * FROM range(0, 8) JOIN LATERAL test_udtf(id)"), [Row(id=6, a=6), Row(id=7, a=7)], ) + def test_udtf_with_conditional_return_dataframe(self): + @udtf(returnType="a: int") + class TestUDTF: + def eval(self, a: int): + if a > 5: + yield a, + + self.spark.udtf.register("test_udtf", TestUDTF) + + assertDataFrameEqual( + self.spark.range(8).lateralJoin(TestUDTF(col("id").outer())), + self.spark.sql("SELECT * FROM range(0, 8) JOIN LATERAL test_udtf(id)"), + ) + def test_udtf_with_empty_yield(self): @udtf(returnType="a: int") class TestUDTF: @@ -254,9 +286,9 @@ def eval(self, a: int): yield a, yield None, - self.assertEqual(TestUDTF(lit(1)).collect(), [Row(a=1), Row(a=None)]) + assertDataFrameEqual(TestUDTF(lit(1)), [Row(a=1), Row(a=None)]) df = self.spark.createDataFrame([(0, 1), (1, 2)], schema=["a", "b"]) - self.assertEqual(TestUDTF(lit(1)).join(df, "a", "inner").collect(), [Row(a=1, b=2)]) + assertDataFrameEqual(TestUDTF(lit(1)).join(df, "a", "inner"), [Row(a=1, b=2)]) assertDataFrameEqual( TestUDTF(lit(1)).join(df, "a", "left"), [Row(a=None, b=None), Row(a=1, b=2)] ) @@ -267,10 +299,10 @@ class TestUDTF: def eval(self, a: int): yield a, - self.assertEqual(TestUDTF(lit(None)).collect(), [Row(a=None)]) + assertDataFrameEqual(TestUDTF(lit(None)), [Row(a=None)]) self.spark.udtf.register("testUDTF", TestUDTF) df = self.spark.sql("SELECT * FROM testUDTF(null)") - 
self.assertEqual(df.collect(), [Row(a=None)]) + assertDataFrameEqual(df, [Row(a=None)]) # These are expected error message substrings to be used in test cases below. tooManyPositionalArguments = "too many positional arguments" @@ -366,8 +398,8 @@ def __init__(self): def eval(self, a: int): yield a, a + 1, self.key - rows = TestUDTF(lit(1)).collect() - self.assertEqual(rows, [Row(a=1, b=2, c="test")]) + res = TestUDTF(lit(1)) + assertDataFrameEqual(res, [Row(a=1, b=2, c="test")]) def test_udtf_terminate(self): @udtf(returnType="key: string, value: float") @@ -385,8 +417,8 @@ def terminate(self): yield "count", float(self._count) yield "avg", self._sum / self._count - self.assertEqual( - TestUDTF(lit(1)).collect(), + assertDataFrameEqual( + TestUDTF(lit(1)), [Row(key="input", value=1), Row(key="count", value=1.0), Row(key="avg", value=1.0)], ) @@ -395,8 +427,8 @@ def terminate(self): "SELECT id, key, value FROM range(0, 10, 1, 2), " "LATERAL test_udtf(id) WHERE key != 'input'" ) - self.assertEqual( - df.collect(), + assertDataFrameEqual( + df, [ Row(id=4, key="count", value=5.0), Row(id=4, key="avg", value=2.0), @@ -608,10 +640,8 @@ def eval(self, person): yield f"{person.name}: {person.age}", self.spark.udtf.register("test_udtf", TestUDTF) - self.assertEqual( - self.spark.sql( - "select * from test_udtf(named_struct('name', 'Alice', 'age', 1))" - ).collect(), + assertDataFrameEqual( + self.spark.sql("select * from test_udtf(named_struct('name', 'Alice', 'age', 1))"), [Row(x="Alice: 1")], ) @@ -634,8 +664,8 @@ def eval(self, m): yield str(m), self.spark.udtf.register("test_udtf", TestUDTF) - self.assertEqual( - self.spark.sql("select * from test_udtf(map('key', 'value'))").collect(), + assertDataFrameEqual( + self.spark.sql("select * from test_udtf(map('key', 'value'))"), [Row(x="{'key': 'value'}")], ) @@ -645,7 +675,7 @@ class TestUDTF: def eval(self, x: int): yield {"a": x, "b": x + 1}, - self.assertEqual(TestUDTF(lit(1)).collect(), [Row(x=Row(a=1, b=2))]) + 
assertDataFrameEqual(TestUDTF(lit(1)), [Row(x=Row(a=1, b=2))]) def test_udtf_with_array_output_types(self): @udtf(returnType="x: array") @@ -653,7 +683,7 @@ class TestUDTF: def eval(self, x: int): yield [x, x + 1, x + 2], - self.assertEqual(TestUDTF(lit(1)).collect(), [Row(x=[1, 2, 3])]) + assertDataFrameEqual(TestUDTF(lit(1)), [Row(x=[1, 2, 3])]) def test_udtf_with_map_output_types(self): @udtf(returnType="x: map") @@ -661,7 +691,7 @@ class TestUDTF: def eval(self, x: int): yield {x: str(x)}, - self.assertEqual(TestUDTF(lit(1)).collect(), [Row(x={1: "1"})]) + assertDataFrameEqual(TestUDTF(lit(1)), [Row(x={1: "1"})]) def test_udtf_with_empty_output_types(self): @udtf(returnType=StructType()) @@ -1019,18 +1049,133 @@ def test_udtf(a: int): ) def test_udtf_with_table_argument_query(self): + func = self.udtf_for_table_argument() + self.spark.udtf.register("test_udtf", func) + assertDataFrameEqual( + self.spark.sql("SELECT * FROM test_udtf(TABLE (SELECT id FROM range(0, 8)))"), + [Row(a=6), Row(a=7)], + ) + + def test_df_asTable(self): + func = self.udtf_for_table_argument() + self.spark.udtf.register("test_udtf", func) + df = self.spark.range(8) + assertDataFrameEqual( + func(df.asTable()), + self.spark.sql("SELECT * FROM test_udtf(TABLE (SELECT id FROM range(0, 8)))"), + ) + + def udtf_for_table_argument(self): class TestUDTF: def eval(self, row: Row): if row["id"] > 5: yield row["id"], func = udtf(TestUDTF, returnType="a: int") - self.spark.udtf.register("test_udtf", func) - self.assertEqual( - self.spark.sql("SELECT * FROM test_udtf(TABLE (SELECT id FROM range(0, 8)))").collect(), - [Row(a=6), Row(a=7)], + return func + + def test_df_asTable_chaining_methods(self): + class TestUDTF: + def eval(self, row: Row): + yield row["key"], row["value"] + + func = udtf(TestUDTF, returnType="key: int, value: string") + df = self.spark.createDataFrame( + [(1, "a", 3), (1, "b", 3), (2, "c", 4), (2, "d", 4)], ["key", "value", "number"] + ) + assertDataFrameEqual( + 
func(df.asTable().partitionBy("key").orderBy(df.value)), + [ + Row(key=1, value="a"), + Row(key=1, value="b"), + Row(key=2, value="c"), + Row(key=2, value="d"), + ], + checkRowOrder=True, + ) + assertDataFrameEqual( + func(df.asTable().partitionBy(["key", "number"]).orderBy(df.value)), + [ + Row(key=1, value="a"), + Row(key=1, value="b"), + Row(key=2, value="c"), + Row(key=2, value="d"), + ], + checkRowOrder=True, + ) + assertDataFrameEqual( + func(df.asTable().partitionBy("key").orderBy(df.value.desc())), + [ + Row(key=1, value="b"), + Row(key=1, value="a"), + Row(key=2, value="d"), + Row(key=2, value="c"), + ], + checkRowOrder=True, + ) + assertDataFrameEqual( + func(df.asTable().partitionBy("key").orderBy(["number", "value"])), + [ + Row(key=1, value="a"), + Row(key=1, value="b"), + Row(key=2, value="c"), + Row(key=2, value="d"), + ], + checkRowOrder=True, + ) + assertDataFrameEqual( + func(df.asTable().withSinglePartition()), + [ + Row(key=1, value="a"), + Row(key=1, value="b"), + Row(key=2, value="c"), + Row(key=2, value="d"), + ], + ) + + assertDataFrameEqual( + func(df.asTable().withSinglePartition().orderBy("value")), + [ + Row(key=1, value="a"), + Row(key=1, value="b"), + Row(key=2, value="c"), + Row(key=2, value="d"), + ], ) + with self.assertRaisesRegex( + IllegalArgumentException, + r"Cannot call withSinglePartition\(\) after partitionBy\(\)" + r" or withSinglePartition\(\) has been called", + ): + df.asTable().partitionBy(df.key).withSinglePartition() + + with self.assertRaisesRegex( + IllegalArgumentException, + r"Cannot call partitionBy\(\) after partitionBy\(\)" + r" or withSinglePartition\(\) has been called", + ): + df.asTable().withSinglePartition().partitionBy(df.key) + + with self.assertRaisesRegex( + IllegalArgumentException, + r"Please call partitionBy\(\) or withSinglePartition\(\) before orderBy\(\)", + ): + df.asTable().orderBy(df.key) + + with self.assertRaisesRegex( + IllegalArgumentException, + r"Please call partitionBy\(\) or 
withSinglePartition\(\) before orderBy\(\)", + ): + df.asTable().partitionBy().orderBy(df.key) + + with self.assertRaisesRegex( + IllegalArgumentException, + r"Cannot call partitionBy\(\) after partitionBy\(\)" + r" or withSinglePartition\(\) has been called", + ): + df.asTable().partitionBy(df.key).partitionBy() + def test_udtf_with_int_and_table_argument_query(self): class TestUDTF: def eval(self, i: int, row: Row): @@ -1039,26 +1184,19 @@ def eval(self, i: int, row: Row): func = udtf(TestUDTF, returnType="a: int") self.spark.udtf.register("test_udtf", func) - self.assertEqual( - self.spark.sql( - "SELECT * FROM test_udtf(5, TABLE (SELECT id FROM range(0, 8)))" - ).collect(), + assertDataFrameEqual( + self.spark.sql("SELECT * FROM test_udtf(5, TABLE (SELECT id FROM range(0, 8)))"), [Row(a=6), Row(a=7)], ) def test_udtf_with_table_argument_identifier(self): - class TestUDTF: - def eval(self, row: Row): - if row["id"] > 5: - yield row["id"], - - func = udtf(TestUDTF, returnType="a: int") + func = self.udtf_for_table_argument() self.spark.udtf.register("test_udtf", func) with self.tempView("v"): self.spark.sql("CREATE OR REPLACE TEMPORARY VIEW v as SELECT id FROM range(0, 8)") - self.assertEqual( - self.spark.sql("SELECT * FROM test_udtf(TABLE (v))").collect(), + assertDataFrameEqual( + self.spark.sql("SELECT * FROM test_udtf(TABLE (v))"), [Row(a=6), Row(a=7)], ) @@ -1073,44 +1211,29 @@ def eval(self, i: int, row: Row): with self.tempView("v"): self.spark.sql("CREATE OR REPLACE TEMPORARY VIEW v as SELECT id FROM range(0, 8)") - self.assertEqual( - self.spark.sql("SELECT * FROM test_udtf(5, TABLE (v))").collect(), + assertDataFrameEqual( + self.spark.sql("SELECT * FROM test_udtf(5, TABLE (v))"), [Row(a=6), Row(a=7)], ) def test_udtf_with_table_argument_unknown_identifier(self): - class TestUDTF: - def eval(self, row: Row): - if row["id"] > 5: - yield row["id"], - - func = udtf(TestUDTF, returnType="a: int") + func = self.udtf_for_table_argument() 
self.spark.udtf.register("test_udtf", func) with self.assertRaisesRegex(AnalysisException, "TABLE_OR_VIEW_NOT_FOUND"): self.spark.sql("SELECT * FROM test_udtf(TABLE (v))").collect() def test_udtf_with_table_argument_malformed_query(self): - class TestUDTF: - def eval(self, row: Row): - if row["id"] > 5: - yield row["id"], - - func = udtf(TestUDTF, returnType="a: int") + func = self.udtf_for_table_argument() self.spark.udtf.register("test_udtf", func) with self.assertRaisesRegex(AnalysisException, "TABLE_OR_VIEW_NOT_FOUND"): self.spark.sql("SELECT * FROM test_udtf(TABLE (SELECT * FROM v))").collect() def test_udtf_with_table_argument_cte_inside(self): - class TestUDTF: - def eval(self, row: Row): - if row["id"] > 5: - yield row["id"], - - func = udtf(TestUDTF, returnType="a: int") + func = self.udtf_for_table_argument() self.spark.udtf.register("test_udtf", func) - self.assertEqual( + assertDataFrameEqual( self.spark.sql( """ SELECT * FROM test_udtf(TABLE ( @@ -1120,19 +1243,14 @@ def eval(self, row: Row): SELECT * FROM t )) """ - ).collect(), + ), [Row(a=6), Row(a=7)], ) def test_udtf_with_table_argument_cte_outside(self): - class TestUDTF: - def eval(self, row: Row): - if row["id"] > 5: - yield row["id"], - - func = udtf(TestUDTF, returnType="a: int") + func = self.udtf_for_table_argument() self.spark.udtf.register("test_udtf", func) - self.assertEqual( + assertDataFrameEqual( self.spark.sql( """ WITH t AS ( @@ -1140,11 +1258,11 @@ def eval(self, row: Row): ) SELECT * FROM test_udtf(TABLE (SELECT id FROM t)) """ - ).collect(), + ), [Row(a=6), Row(a=7)], ) - self.assertEqual( + assertDataFrameEqual( self.spark.sql( """ WITH t AS ( @@ -1152,28 +1270,23 @@ def eval(self, row: Row): ) SELECT * FROM test_udtf(TABLE (t)) """ - ).collect(), + ), [Row(a=6), Row(a=7)], ) # TODO(SPARK-44233): Fix the subquery resolution. 
@unittest.skip("Fails to resolve the subquery.") def test_udtf_with_table_argument_lateral_join(self): - class TestUDTF: - def eval(self, row: Row): - if row["id"] > 5: - yield row["id"], - - func = udtf(TestUDTF, returnType="a: int") + func = self.udtf_for_table_argument() self.spark.udtf.register("test_udtf", func) - self.assertEqual( + assertDataFrameEqual( self.spark.sql( """ SELECT * FROM range(0, 8) AS t, LATERAL test_udtf(TABLE (t)) """ - ).collect(), + ), [Row(a=6), Row(a=7)], ) @@ -1198,8 +1311,8 @@ def eval(self, a: Row, b: Row): self.spark.sql(query).collect() with self.sql_conf({"spark.sql.tvf.allowMultipleTableArguments.enabled": True}): - self.assertEqual( - self.spark.sql(query).collect(), + assertDataFrameEqual( + self.spark.sql(query), [ Row(a=0, b=0), Row(a=1, b=0), @@ -1375,7 +1488,7 @@ def eval(self, a, b): assertSchemaEqual(df.schema, expected_schema) assertDataFrameEqual(df, expected_results) - def test_udtf_with_analyze_arbitary_number_arguments(self): + def test_udtf_with_analyze_arbitrary_number_arguments(self): class TestUDTF: @staticmethod def analyze(*args: AnalyzeArgument) -> AnalyzeResult: @@ -2197,6 +2310,17 @@ def terminate(self): ], ) + def test_udtf_with_table_argument_and_partition_by_no_terminate(self): + func = self.udtf_for_table_argument() # a udtf with no terminate method defined + self.spark.udtf.register("test_udtf", func) + + assertDataFrameEqual( + self.spark.sql( + "SELECT * FROM test_udtf(TABLE (SELECT id FROM range(0, 8)) PARTITION BY id)" + ), + [Row(a=6), Row(a=7)], + ) + def test_udtf_with_table_argument_and_partition_by_and_order_by(self): class TestUDTF: def __init__(self): @@ -2539,8 +2663,10 @@ def eval(self, v): yield i, v.toJson() self.spark.udtf.register("test_udtf", TestUDTF) - rows = self.spark.sql('select i, s from test_udtf(parse_json(\'{"a":"b"}\'))').collect() - self.assertEqual(rows, [Row(i=n, s='{"a":"b"}') for n in range(10)]) + assertDataFrameEqual( + self.spark.sql('select i, s from 
test_udtf(parse_json(\'{"a":"b"}\'))'), + [Row(i=n, s='{"a":"b"}') for n in range(10)], + ) def test_udtf_with_nested_variant_input(self): # struct @@ -2551,10 +2677,10 @@ def eval(self, v): yield i, v["v"].toJson() self.spark.udtf.register("test_udtf_struct", TestUDTFStruct) - rows = self.spark.sql( + res = self.spark.sql( "select i, s from test_udtf_struct(named_struct('v', parse_json('{\"a\":\"c\"}')))" - ).collect() - self.assertEqual(rows, [Row(i=n, s='{"a":"c"}') for n in range(10)]) + ) + assertDataFrameEqual(res, [Row(i=n, s='{"a":"c"}') for n in range(10)]) # array @udtf(returnType="i int, s: string") @@ -2564,10 +2690,8 @@ def eval(self, v): yield i, v[0].toJson() self.spark.udtf.register("test_udtf_array", TestUDTFArray) - rows = self.spark.sql( - 'select i, s from test_udtf_array(array(parse_json(\'{"a":"d"}\')))' - ).collect() - self.assertEqual(rows, [Row(i=n, s='{"a":"d"}') for n in range(10)]) + res = self.spark.sql('select i, s from test_udtf_array(array(parse_json(\'{"a":"d"}\')))') + assertDataFrameEqual(res, [Row(i=n, s='{"a":"d"}') for n in range(10)]) # map @udtf(returnType="i int, s: string") @@ -2577,10 +2701,10 @@ def eval(self, v): yield i, v["v"].toJson() self.spark.udtf.register("test_udtf_map", TestUDTFMap) - rows = self.spark.sql( + res = self.spark.sql( "select i, s from test_udtf_map(map('v', parse_json('{\"a\":\"e\"}')))" - ).collect() - self.assertEqual(rows, [Row(i=n, s='{"a":"e"}') for n in range(10)]) + ) + assertDataFrameEqual(res, [Row(i=n, s='{"a":"e"}') for n in range(10)]) def test_udtf_with_variant_output(self): @udtf(returnType="i int, v: variant") @@ -2591,8 +2715,8 @@ def eval(self, n): yield i, VariantVal(bytes([2, 1, 0, 0, 2, 5, 97 + i]), bytes([1, 1, 0, 1, 97])) self.spark.udtf.register("test_udtf", TestUDTF) - rows = self.spark.sql("select i, to_json(v) from test_udtf(8)").collect() - self.assertEqual(rows, [Row(i=n, s=f'{{"a":"{chr(97 + n)}"}}') for n in range(8)]) + res = self.spark.sql("select i, to_json(v) from 
test_udtf(8)") + assertDataFrameEqual(res, [Row(i=n, s=f'{{"a":"{chr(97 + n)}"}}') for n in range(8)]) def test_udtf_with_nested_variant_output(self): # struct @@ -2606,8 +2730,8 @@ def eval(self, n): } self.spark.udtf.register("test_udtf_struct", TestUDTFStruct) - rows = self.spark.sql("select i, to_json(v.v1) from test_udtf_struct(8)").collect() - self.assertEqual(rows, [Row(i=n, s=f'{{"a":"{chr(97 + n)}"}}') for n in range(8)]) + res = self.spark.sql("select i, to_json(v.v1) from test_udtf_struct(8)") + assertDataFrameEqual(res, [Row(i=n, s=f'{{"a":"{chr(97 + n)}"}}') for n in range(8)]) # array @udtf(returnType="i int, v: array") @@ -2620,8 +2744,8 @@ def eval(self, n): ] self.spark.udtf.register("test_udtf_array", TestUDTFArray) - rows = self.spark.sql("select i, to_json(v[0]) from test_udtf_array(8)").collect() - self.assertEqual(rows, [Row(i=n, s=f'{{"a":"{chr(98 + n)}"}}') for n in range(8)]) + res = self.spark.sql("select i, to_json(v[0]) from test_udtf_array(8)") + assertDataFrameEqual(res, [Row(i=n, s=f'{{"a":"{chr(98 + n)}"}}') for n in range(8)]) # map @udtf(returnType="i int, v: map") @@ -2634,8 +2758,8 @@ def eval(self, n): } self.spark.udtf.register("test_udtf_struct", TestUDTFStruct) - rows = self.spark.sql("select i, to_json(v['v1']) from test_udtf_struct(8)").collect() - self.assertEqual(rows, [Row(i=n, s=f'{{"a":"{chr(99 + n)}"}}') for n in range(8)]) + res = self.spark.sql("select i, to_json(v['v1']) from test_udtf_struct(8)") + assertDataFrameEqual(res, [Row(i=n, s=f'{{"a":"{chr(99 + n)}"}}') for n in range(8)]) class UDTFTests(BaseUDTFTestsMixin, ReusedSQLTestCase): diff --git a/python/pyspark/sql/tvf_argument.py b/python/pyspark/sql/tvf_argument.py new file mode 100644 index 0000000000000..cb373d35d9ec2 --- /dev/null +++ b/python/pyspark/sql/tvf_argument.py @@ -0,0 +1,27 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. 
See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + + +class TableValuedFunctionArgument: + """ + Base class for arguments passed to Table Valued Functions. + + This class is intentionally left empty and serves as a marker to achieve + parity with the Scala `TableValuedFunctionArgument` trait. + """ + + pass diff --git a/python/pyspark/sql/types.py b/python/pyspark/sql/types.py index 03227c8c8760f..7c3b97f951d67 100644 --- a/python/pyspark/sql/types.py +++ b/python/pyspark/sql/types.py @@ -49,7 +49,6 @@ from pyspark.util import is_remote_only, JVM_INT_MAX from pyspark.serializers import CloudPickleSerializer from pyspark.sql.utils import ( - has_numpy, get_active_spark_context, escape_meta_characters, StringConcat, @@ -65,9 +64,6 @@ PySparkKeyError, ) -if has_numpy: - import numpy as np - if TYPE_CHECKING: import numpy as np from py4j.java_gateway import GatewayClient, JavaGateway, JavaClass @@ -1555,6 +1551,18 @@ def treeString(self, maxDepth: int = JVM_INT_MAX) -> str: field._build_formatted_string(prefix, stringConcat, depth) return stringConcat.toString() + def toDDL(self) -> str: + from pyspark.sql.utils import is_remote + + if is_remote(): + from pyspark.sql.connect.session import SparkSession + + session = SparkSession.getActiveSession() + assert session is not None + return session._to_ddl(self) + else: + return 
get_active_spark_context()._to_ddl(self) + class VariantType(AtomicType): """ @@ -1572,6 +1580,8 @@ def fromInternal(self, obj: Dict) -> Optional["VariantVal"]: return VariantVal(obj["value"], obj["metadata"]) def toInternal(self, variant: Any) -> Any: + if variant is None: + return None assert isinstance(variant, VariantVal) return {"value": variant.value, "metadata": variant.metadata} @@ -1758,6 +1768,15 @@ def toJson(self, zone_id: str = "UTC") -> str: """ return VariantUtils.to_json(self.value, self.metadata, zone_id) + @classmethod + def parseJson(cls, json_str: str) -> "VariantVal": + """ + Parses the given JSON string and constructs the corresponding VariantVal. + :return: a VariantVal instance representing the parsed JSON string + """ + (value, metadata) = VariantUtils.parse_json(json_str) + return VariantVal(value, metadata) + _atomic_types: List[Type[DataType]] = [ StringType, @@ -1888,43 +1907,9 @@ def _parse_datatype_string(s: str) -> DataType: if is_remote(): from pyspark.sql.connect.session import SparkSession - return cast( - DataType, - SparkSession.active()._client._analyze(method="ddl_parse", ddl_string=s).parsed, - ) - + return SparkSession.active()._parse_ddl(s) else: - from py4j.java_gateway import JVMView - - sc = get_active_spark_context() - - def from_ddl_schema(type_str: str) -> DataType: - return _parse_datatype_json_string( - cast(JVMView, sc._jvm) - .org.apache.spark.sql.types.StructType.fromDDL(type_str) - .json() - ) - - def from_ddl_datatype(type_str: str) -> DataType: - return _parse_datatype_json_string( - cast(JVMView, sc._jvm) - .org.apache.spark.sql.api.python.PythonSQLUtils.parseDataType(type_str) - .json() - ) - - try: - # DDL format, "fieldname datatype, fieldname datatype". - return from_ddl_schema(s) - except Exception as e: - try: - # For backwards compatibility, "integer", "struct" and etc. 
- return from_ddl_datatype(s) - except BaseException: - try: - # For backwards compatibility, "fieldname: datatype, fieldname: datatype" case. - return from_ddl_datatype("struct<%s>" % s.strip()) - except BaseException: - raise e + return get_active_spark_context()._parse_ddl(s) def _parse_datatype_json_string(json_string: str) -> DataType: @@ -3250,7 +3235,13 @@ def convert(self, obj: datetime.timedelta, gateway_client: "GatewayClient") -> " class NumpyScalarConverter: def can_convert(self, obj: Any) -> bool: - return has_numpy and isinstance(obj, np.generic) + from pyspark.testing.utils import have_numpy + + if have_numpy: + import numpy as np + + return isinstance(obj, np.generic) + return False def convert(self, obj: "np.generic", gateway_client: "GatewayClient") -> Any: return obj.item() @@ -3261,6 +3252,8 @@ def _from_numpy_type_to_java_type( self, nt: "np.dtype", gateway: "JavaGateway" ) -> Optional["JavaClass"]: """Convert NumPy type to Py4J Java type.""" + import numpy as np + if nt in [np.dtype("int8"), np.dtype("int16")]: # Mapping int8 to gateway.jvm.byte causes # TypeError: 'bytes' object does not support item assignment @@ -3281,7 +3274,13 @@ def _from_numpy_type_to_java_type( return None def can_convert(self, obj: Any) -> bool: - return has_numpy and isinstance(obj, np.ndarray) and obj.ndim == 1 + from pyspark.testing.utils import have_numpy + + if have_numpy: + import numpy as np + + return isinstance(obj, np.ndarray) and obj.ndim == 1 + return False def convert(self, obj: "np.ndarray", gateway_client: "GatewayClient") -> "JavaGateway": from pyspark import SparkContext diff --git a/python/pyspark/sql/udf.py b/python/pyspark/sql/udf.py index 9cf93938528f8..cf093bd936437 100644 --- a/python/pyspark/sql/udf.py +++ b/python/pyspark/sql/udf.py @@ -391,7 +391,7 @@ def _create_judf(self, func: Callable[..., Any]) -> "JavaObject": wrapped_func = _wrap_function(sc, func, self.returnType) jdt = spark._jsparkSession.parseDataType(self.returnType.json()) assert 
sc._jvm is not None - judf = sc._jvm.org.apache.spark.sql.execution.python.UserDefinedPythonFunction( + judf = getattr(sc._jvm, "org.apache.spark.sql.execution.python.UserDefinedPythonFunction")( self._name, wrapped_func, jdt, self.evalType, self.deterministic ) return judf diff --git a/python/pyspark/sql/udtf.py b/python/pyspark/sql/udtf.py index 5ce3e2dfd2a9e..3ea32349d81d2 100644 --- a/python/pyspark/sql/udtf.py +++ b/python/pyspark/sql/udtf.py @@ -32,7 +32,7 @@ if TYPE_CHECKING: from py4j.java_gateway import JavaObject - from pyspark.sql._typing import ColumnOrName + from pyspark.sql._typing import TVFArgumentOrName from pyspark.sql.dataframe import DataFrame from pyspark.sql.session import SparkSession @@ -148,7 +148,7 @@ class AnalyzeResult: The schema that the Python UDTF will return. withSinglePartition: bool If true, the UDTF is specifying for Catalyst to repartition all rows of the input TABLE - argument to one collection for consumption by exactly one instance of the correpsonding + argument to one collection for consumption by exactly one instance of the corresponding UDTF class. 
partitionBy: sequence of :class:`PartitioningColumn` If non-empty, this is a sequence of expressions that the UDTF is specifying for Catalyst to @@ -362,32 +362,51 @@ def _create_judtf(self, func: Type) -> "JavaObject": assert sc._jvm is not None if self.returnType is None: - judtf = sc._jvm.org.apache.spark.sql.execution.python.UserDefinedPythonTableFunction( - self._name, wrapped_func, self.evalType, self.deterministic - ) + judtf = getattr( + sc._jvm, "org.apache.spark.sql.execution.python.UserDefinedPythonTableFunction" + )(self._name, wrapped_func, self.evalType, self.deterministic) else: jdt = spark._jsparkSession.parseDataType(self.returnType.json()) - judtf = sc._jvm.org.apache.spark.sql.execution.python.UserDefinedPythonTableFunction( - self._name, wrapped_func, jdt, self.evalType, self.deterministic - ) + judtf = getattr( + sc._jvm, "org.apache.spark.sql.execution.python.UserDefinedPythonTableFunction" + )(self._name, wrapped_func, jdt, self.evalType, self.deterministic) return judtf - def __call__(self, *args: "ColumnOrName", **kwargs: "ColumnOrName") -> "DataFrame": + def __call__(self, *args: "TVFArgumentOrName", **kwargs: "TVFArgumentOrName") -> "DataFrame": from pyspark.sql.classic.column import _to_java_column, _to_seq from pyspark.sql import DataFrame, SparkSession + from pyspark.sql.table_arg import TableArg spark = SparkSession._getActiveSessionOrCreate() sc = spark.sparkContext assert sc._jvm is not None - jcols = [_to_java_column(arg) for arg in args] + [ - sc._jvm.PythonSQLUtils.namedArgumentExpression(key, _to_java_column(value)) - for key, value in kwargs.items() - ] + # Process positional arguments + jargs = [] + for arg in args: + if isinstance(arg, TableArg): + # If the argument is a TableArg, get the Java TableArg object + jargs.append(arg._j_table_arg) + else: + # Otherwise, convert it to a Java column + jargs.append(_to_java_column(arg)) # type: ignore[arg-type] + + # Process keyword arguments + jkwargs = [] + for key, value in 
kwargs.items(): + if isinstance(value, TableArg): + # If the value is a TableArg, get the Java TableArg object + j_arg = value._j_table_arg + else: + # Otherwise, convert it to a Java column + j_arg = _to_java_column(value) # type: ignore[arg-type] + # Create a named argument expression + j_named_arg = sc._jvm.PythonSQLUtils.namedArgumentExpression(key, j_arg) + jkwargs.append(j_named_arg) judtf = self._judtf - jPythonUDTF = judtf.apply(spark._jsparkSession, _to_seq(sc, jcols)) + jPythonUDTF = judtf.apply(spark._jsparkSession, _to_seq(sc, jargs + jkwargs)) return DataFrame(jPythonUDTF, spark) def asDeterministic(self) -> "UserDefinedTableFunction": diff --git a/python/pyspark/sql/utils.py b/python/pyspark/sql/utils.py index 3cacc5b9d021a..855496ff3b7ca 100644 --- a/python/pyspark/sql/utils.py +++ b/python/pyspark/sql/utils.py @@ -62,14 +62,6 @@ from pyspark.sql.dataframe import DataFrame from pyspark.pandas._typing import IndexOpsLike, SeriesOrIndex -has_numpy: bool = False -try: - import numpy as np # noqa: F401 - - has_numpy = True -except ImportError: - pass - FuncT = TypeVar("FuncT", bound=Callable[..., Any]) @@ -118,18 +110,34 @@ def require_test_compiled() -> None: def require_minimum_plotly_version() -> None: """Raise ImportError if plotly is not installed""" + from pyspark.loose_version import LooseVersion + minimum_plotly_version = "4.8" try: - import plotly # noqa: F401 + import plotly + + have_plotly = True except ImportError as error: + have_plotly = False + raised_error = error + if not have_plotly: raise PySparkImportError( errorClass="PACKAGE_NOT_INSTALLED", messageParameters={ - "package_name": "plotly", + "package_name": "Plotly", "minimum_version": str(minimum_plotly_version), }, - ) from error + ) from raised_error + if LooseVersion(plotly.__version__) < LooseVersion(minimum_plotly_version): + raise PySparkImportError( + errorClass="UNSUPPORTED_PACKAGE_VERSION", + messageParameters={ + "package_name": "Plotly", + "minimum_version": 
str(minimum_plotly_version), + "current_version": str(plotly.__version__), + }, + ) class ForeachBatchFunction: diff --git a/python/pyspark/sql/variant_utils.py b/python/pyspark/sql/variant_utils.py index 40cc69c1f0961..3025523064e1d 100644 --- a/python/pyspark/sql/variant_utils.py +++ b/python/pyspark/sql/variant_utils.py @@ -21,7 +21,7 @@ import json import struct from array import array -from typing import Any, Callable, Dict, List, Tuple +from typing import Any, Callable, Dict, List, NamedTuple, Tuple from pyspark.errors import PySparkValueError from zoneinfo import ZoneInfo @@ -108,8 +108,25 @@ class VariantUtils: # string size) + (size bytes of string content). LONG_STR = 16 + VERSION = 1 + # The lower 4 bits of the first metadata byte contain the version. + VERSION_MASK = 0x0F + + U8_MAX = 0xFF + U16_MAX = 0xFFFF + U24_MAX = 0xFFFFFF + U24_SIZE = 3 U32_SIZE = 4 + I8_MAX = 0x7F + I8_MIN = -0x80 + I16_MAX = 0x7FFF + I16_MIN = -0x8000 + I32_MAX = 0x7FFFFFFF + I32_MIN = -0x80000000 + I64_MAX = 0x7FFFFFFFFFFFFFFF + I64_MIN = -0x8000000000000000 + EPOCH = datetime.datetime( year=1970, month=1, day=1, hour=0, minute=0, second=0, tzinfo=datetime.timezone.utc ) @@ -140,6 +157,15 @@ def to_python(cls, value: bytes, metadata: bytes) -> str: """ return cls._to_python(value, metadata, 0) + @classmethod + def parse_json(cls, json_str: str) -> Tuple[bytes, bytes]: + """ + Parses the JSON string and creates the Variant binary (value, metadata) + :return: tuple of 2 binary values (value, metadata) + """ + builder = VariantBuilder() + return builder.build(json_str) + @classmethod def _read_long(cls, data: bytes, pos: int, num_bytes: int, signed: bool) -> int: cls._check_index(pos, len(data)) @@ -468,7 +494,10 @@ def _handle_object( value, offset_start + offset_size * i, offset_size, signed=False ) value_pos = data_start + offset - key_value_pos_list.append((cls._get_metadata_key(metadata, id), value_pos)) + if metadata is not None: + 
key_value_pos_list.append((cls._get_metadata_key(metadata, id), value_pos)) + else: + key_value_pos_list.append(("", value_pos)) return func(key_value_pos_list) @classmethod @@ -496,3 +525,297 @@ def _handle_array(cls, value: bytes, pos: int, func: Callable[[List[int]], Any]) element_pos = data_start + offset value_pos_list.append(element_pos) return func(value_pos_list) + + +class FieldEntry(NamedTuple): + """ + Info about an object field + """ + + key: str + id: int + offset: int + + +class VariantBuilder: + """ + A utility class for building VariantVal. + """ + + DEFAULT_SIZE_LIMIT = 16 * 1024 * 1024 + + def __init__(self, size_limit: int = DEFAULT_SIZE_LIMIT): + self.value = bytearray() + self.dictionary = dict[str, int]() + self.dictionary_keys = list[bytes]() + self.size_limit = size_limit + + def build(self, json_str: str) -> Tuple[bytes, bytes]: + parsed = json.loads(json_str, parse_float=self._handle_float) + self._process_parsed_json(parsed) + + num_keys = len(self.dictionary_keys) + dictionary_string_size = sum(len(key) for key in self.dictionary_keys) + + # Determine the number of bytes required per offset entry. + # The largest offset is the one-past-the-end value, which is total string size. It's very + # unlikely that the number of keys could be larger, but incorporate that into the + # calculation in case of pathological data. 
+ max_size = max(dictionary_string_size, num_keys) + if max_size > self.size_limit: + raise PySparkValueError(errorClass="VARIANT_SIZE_LIMIT_EXCEEDED", messageParameters={}) + offset_size = self._get_integer_size(max_size) + + offset_start = 1 + offset_size + string_start = offset_start + (num_keys + 1) * offset_size + metadata_size = string_start + dictionary_string_size + if metadata_size > self.size_limit: + raise PySparkValueError(errorClass="VARIANT_SIZE_LIMIT_EXCEEDED", messageParameters={}) + + metadata = bytearray() + header_byte = VariantUtils.VERSION | ((offset_size - 1) << 6) + metadata.extend(header_byte.to_bytes(1, byteorder="little")) + metadata.extend(num_keys.to_bytes(offset_size, byteorder="little")) + # write offsets + current_offset = 0 + for key in self.dictionary_keys: + metadata.extend(current_offset.to_bytes(offset_size, byteorder="little")) + current_offset += len(key) + metadata.extend(current_offset.to_bytes(offset_size, byteorder="little")) + # write key data + for key in self.dictionary_keys: + metadata.extend(key) + return (bytes(self.value), bytes(metadata)) + + def _process_parsed_json(self, parsed: Any) -> None: + if type(parsed) is dict: + fields = list[FieldEntry]() + start = len(self.value) + for key, value in parsed.items(): + id = self._add_key(key) + fields.append(FieldEntry(key, id, len(self.value) - start)) + self._process_parsed_json(value) + self._finish_writing_object(start, fields) + elif type(parsed) is list: + offsets = [] + start = len(self.value) + for elem in parsed: + offsets.append(len(self.value) - start) + self._process_parsed_json(elem) + self._finish_writing_array(start, offsets) + elif type(parsed) is str: + self._append_string(parsed) + elif type(parsed) is int: + if not self._append_int(parsed): + self._process_parsed_json(self._handle_float(str(parsed))) + elif type(parsed) is float: + self._append_float(parsed) + elif type(parsed) is decimal.Decimal: + self._append_decimal(parsed) + elif type(parsed) is 
bool: + self._append_boolean(parsed) + elif parsed is None: + self._append_null() + else: + raise PySparkValueError(errorClass="MALFORMED_VARIANT", messageParameters={}) + + # Choose the smallest unsigned integer type that can store `value`. It must be within + # [0, U24_MAX]. + def _get_integer_size(self, value: int) -> int: + if value <= VariantUtils.U8_MAX: + return 1 + if value <= VariantUtils.U16_MAX: + return 2 + return VariantUtils.U24_SIZE + + def _check_capacity(self, additional: int) -> None: + required = len(self.value) + additional + if required > self.size_limit: + raise PySparkValueError(errorClass="VARIANT_SIZE_LIMIT_EXCEEDED", messageParameters={}) + + def _primitive_header(self, type: int) -> bytes: + return bytes([(type << 2) | VariantUtils.PRIMITIVE]) + + def _short_string_header(self, size: int) -> bytes: + return bytes([size << 2 | VariantUtils.SHORT_STR]) + + def _array_header(self, large_size: bool, offset_size: int) -> bytes: + return bytes( + [ + ( + (large_size << (VariantUtils.BASIC_TYPE_BITS + 2)) + | ((offset_size - 1) << VariantUtils.BASIC_TYPE_BITS) + | VariantUtils.ARRAY + ) + ] + ) + + def _object_header(self, large_size: bool, id_size: int, offset_size: int) -> bytes: + return bytes( + [ + ( + (large_size << (VariantUtils.BASIC_TYPE_BITS + 4)) + | ((id_size - 1) << (VariantUtils.BASIC_TYPE_BITS + 2)) + | ((offset_size - 1) << VariantUtils.BASIC_TYPE_BITS) + | VariantUtils.OBJECT + ) + ] + ) + + # Add a key to the variant dictionary. If the key already exists, the dictionary is + # not modified. In either case, return the id of the key. + def _add_key(self, key: str) -> int: + if key in self.dictionary: + return self.dictionary[key] + id = len(self.dictionary_keys) + self.dictionary[key] = id + self.dictionary_keys.append(key.encode("utf-8")) + return id + + def _handle_float(self, num_str: str) -> Any: + # a float can be a decimal if it only contains digits, '-', or '.'. + if all([ch.isdecimal() or ch == "-" or ch == "." 
for ch in num_str]): + dec = decimal.Decimal(num_str) + precision = len(dec.as_tuple().digits) + scale = -int(dec.as_tuple().exponent) + + if ( + scale <= VariantUtils.MAX_DECIMAL16_PRECISION + and precision <= VariantUtils.MAX_DECIMAL16_PRECISION + ): + return dec + return float(num_str) + + def _append_boolean(self, b: bool) -> None: + self._check_capacity(1) + self.value.extend(self._primitive_header(VariantUtils.TRUE if b else VariantUtils.FALSE)) + + def _append_null(self) -> None: + self._check_capacity(1) + self.value.extend(self._primitive_header(VariantUtils.NULL)) + + def _append_string(self, s: str) -> None: + text = s.encode("utf-8") + long_str = len(text) > VariantUtils.MAX_SHORT_STR_SIZE + additional = (1 + VariantUtils.U32_SIZE) if long_str else 1 + self._check_capacity(additional + len(text)) + if long_str: + self.value.extend(self._primitive_header(VariantUtils.LONG_STR)) + self.value.extend(len(text).to_bytes(VariantUtils.U32_SIZE, byteorder="little")) + else: + self.value.extend(self._short_string_header(len(text))) + self.value.extend(text) + + def _append_int(self, i: int) -> bool: + self._check_capacity(1 + 8) + if i >= VariantUtils.I8_MIN and i <= VariantUtils.I8_MAX: + self.value.extend(self._primitive_header(VariantUtils.INT1)) + self.value.extend(i.to_bytes(1, byteorder="little", signed=True)) + elif i >= VariantUtils.I16_MIN and i <= VariantUtils.I16_MAX: + self.value.extend(self._primitive_header(VariantUtils.INT2)) + self.value.extend(i.to_bytes(2, byteorder="little", signed=True)) + elif i >= VariantUtils.I32_MIN and i <= VariantUtils.I32_MAX: + self.value.extend(self._primitive_header(VariantUtils.INT4)) + self.value.extend(i.to_bytes(4, byteorder="little", signed=True)) + elif i >= VariantUtils.I64_MIN and i <= VariantUtils.I64_MAX: + self.value.extend(self._primitive_header(VariantUtils.INT8)) + self.value.extend(i.to_bytes(8, byteorder="little", signed=True)) + else: + return False + return True + + # Append a decimal value to the 
variant builder. The caller should guarantee that its precision + # and scale fit into `MAX_DECIMAL16_PRECISION`. + def _append_decimal(self, d: decimal.Decimal) -> None: + self._check_capacity(2 + 16) + precision = len(d.as_tuple().digits) + scale = -int(d.as_tuple().exponent) + unscaled = int("".join(map(str, d.as_tuple().digits))) + unscaled = -unscaled if d < 0 else unscaled + if ( + scale <= VariantUtils.MAX_DECIMAL4_PRECISION + and precision <= VariantUtils.MAX_DECIMAL4_PRECISION + ): + self.value.extend(self._primitive_header(VariantUtils.DECIMAL4)) + self.value.extend(scale.to_bytes(1, byteorder="little")) + self.value.extend(unscaled.to_bytes(4, byteorder="little", signed=True)) + elif ( + scale <= VariantUtils.MAX_DECIMAL8_PRECISION + and precision <= VariantUtils.MAX_DECIMAL8_PRECISION + ): + self.value.extend(self._primitive_header(VariantUtils.DECIMAL8)) + self.value.extend(scale.to_bytes(1, byteorder="little")) + self.value.extend(unscaled.to_bytes(8, byteorder="little", signed=True)) + else: + assert ( + scale <= VariantUtils.MAX_DECIMAL16_PRECISION + and precision <= VariantUtils.MAX_DECIMAL16_PRECISION + ) + self.value.extend(self._primitive_header(VariantUtils.DECIMAL16)) + self.value.extend(scale.to_bytes(1, byteorder="little")) + self.value.extend(unscaled.to_bytes(16, byteorder="little", signed=True)) + + def _append_float(self, f: float) -> None: + self._check_capacity(1 + 8) + self.value.extend(self._primitive_header(VariantUtils.DOUBLE)) + self.value.extend(struct.pack(" None: + data_size = len(self.value) - start + num_offsets = len(offsets) + large_size = num_offsets > VariantUtils.U8_MAX + size_bytes = VariantUtils.U32_SIZE if large_size else 1 + offset_size = self._get_integer_size(data_size) + # The space for header byte, object size, and offset list. 
+ header_size = 1 + size_bytes + (num_offsets + 1) * offset_size + self._check_capacity(header_size) + self.value.extend(bytearray(header_size)) + # Shift the just-written element data to make room for the header section. + self.value[start + header_size :] = bytes(self.value[start : start + data_size]) + # Write the header byte, num offsets + offset_start = start + 1 + size_bytes + self.value[start : start + 1] = self._array_header(large_size, offset_size) + self.value[start + 1 : offset_start] = num_offsets.to_bytes(size_bytes, byteorder="little") + # write offset list + offset_list = bytearray() + for offset in offsets: + offset_list.extend(offset.to_bytes(offset_size, byteorder="little")) + offset_list.extend(data_size.to_bytes(offset_size, byteorder="little")) + self.value[offset_start : offset_start + len(offset_list)] = offset_list + + # Finish writing a variant object after all of its fields have already been written. + def _finish_writing_object(self, start: int, fields: List[FieldEntry]) -> None: + num_fields = len(fields) + # object fields are from a python dictionary, so keys are already distinct + fields.sort(key=lambda f: f.key) + max_id = 0 + for field in fields: + max_id = max(max_id, field.id) + + data_size = len(self.value) - start + large_size = num_fields > VariantUtils.U8_MAX + size_bytes = VariantUtils.U32_SIZE if large_size else 1 + id_size = self._get_integer_size(max_id) + offset_size = self._get_integer_size(data_size) + # The space for header byte, object size, id list, and offset list. + header_size = 1 + size_bytes + num_fields * id_size + (num_fields + 1) * offset_size + self._check_capacity(header_size) + self.value.extend(bytearray(header_size)) + # Shift the just-written field data to make room for the object header section. 
+ self.value[start + header_size :] = self.value[start : start + data_size] + # Write the header byte, num fields, id list, offset list + self.value[start : start + 1] = self._object_header(large_size, id_size, offset_size) + self.value[start + 1 : start + 1 + size_bytes] = num_fields.to_bytes( + size_bytes, byteorder="little" + ) + id_start = start + 1 + size_bytes + offset_start = id_start + num_fields * id_size + id_list = bytearray() + offset_list = bytearray() + for field in fields: + id_list.extend(field.id.to_bytes(id_size, byteorder="little")) + offset_list.extend(field.offset.to_bytes(offset_size, byteorder="little")) + offset_list.extend(data_size.to_bytes(offset_size, byteorder="little")) + self.value[id_start : id_start + len(id_list)] = id_list + self.value[offset_start : offset_start + len(offset_list)] = offset_list diff --git a/python/pyspark/sql/worker/create_data_source.py b/python/pyspark/sql/worker/create_data_source.py index ef70876fc32c5..f74c1555e6e9e 100644 --- a/python/pyspark/sql/worker/create_data_source.py +++ b/python/pyspark/sql/worker/create_data_source.py @@ -20,7 +20,7 @@ from typing import IO from pyspark.accumulators import _accumulatorRegistry -from pyspark.errors import PySparkAssertionError, PySparkRuntimeError, PySparkTypeError +from pyspark.errors import PySparkAssertionError, PySparkTypeError from pyspark.serializers import ( read_bool, read_int, @@ -127,13 +127,7 @@ def main(infile: IO, outfile: IO) -> None: options[key] = value # Instantiate a data source. - try: - data_source = data_source_cls(options=options) # type: ignore - except Exception as e: - raise PySparkRuntimeError( - errorClass="DATA_SOURCE_CREATE_ERROR", - messageParameters={"error": str(e)}, - ) + data_source = data_source_cls(options=options) # type: ignore # Get the schema of the data source. # If user_specified_schema is not None, use user_specified_schema. 
@@ -141,17 +135,11 @@ def main(infile: IO, outfile: IO) -> None: # Throw exception if the data source does not implement schema(). is_ddl_string = False if user_specified_schema is None: - try: - schema = data_source.schema() - if isinstance(schema, str): - # Here we cannot use _parse_datatype_string to parse the DDL string schema. - # as it requires an active Spark session. - is_ddl_string = True - except NotImplementedError: - raise PySparkRuntimeError( - errorClass="NOT_IMPLEMENTED", - messageParameters={"feature": "DataSource.schema"}, - ) + schema = data_source.schema() + if isinstance(schema, str): + # Here we cannot use _parse_datatype_string to parse the DDL string schema. + # as it requires an active Spark session. + is_ddl_string = True else: schema = user_specified_schema # type: ignore diff --git a/python/pyspark/sql/worker/write_into_data_source.py b/python/pyspark/sql/worker/write_into_data_source.py index a114a3facc467..91a1f4d3b1b34 100644 --- a/python/pyspark/sql/worker/write_into_data_source.py +++ b/python/pyspark/sql/worker/write_into_data_source.py @@ -32,6 +32,7 @@ from pyspark.sql.datasource import ( DataSource, DataSourceWriter, + DataSourceArrowWriter, WriterCommitMessage, CaseInsensitiveDict, ) @@ -194,7 +195,10 @@ def batch_to_rows() -> Iterator[Row]: ] yield _create_row(fields=fields, values=values) - res = writer.write(batch_to_rows()) + if isinstance(writer, DataSourceArrowWriter): + res = writer.write(iterator) + else: + res = writer.write(batch_to_rows()) # Check the commit message has the right type. 
if not isinstance(res, WriterCommitMessage): diff --git a/python/pyspark/streaming/tests/test_dstream.py b/python/pyspark/streaming/tests/test_dstream.py index 046247763c0b3..4c9633db311a6 100644 --- a/python/pyspark/streaming/tests/test_dstream.py +++ b/python/pyspark/streaming/tests/test_dstream.py @@ -403,7 +403,7 @@ def failed_func(rdd1, rdd2): self.fail("a failed func should throw an error") - def test_failed_func_with_reseting_failure(self): + def test_failed_func_with_resetting_failure(self): input = [self.sc.parallelize([d], 1) for d in range(4)] input_stream = self.ssc.queueStream(input) diff --git a/python/pyspark/testing/utils.py b/python/pyspark/testing/utils.py index 1dd15666382f6..76f5b48ff9bb0 100644 --- a/python/pyspark/testing/utils.py +++ b/python/pyspark/testing/utils.py @@ -52,13 +52,9 @@ def have_package(name: str) -> bool: - try: - import importlib + import importlib - importlib.import_module(name) - return True - except Exception: - return False + return importlib.util.find_spec(name) is not None have_numpy = have_package("numpy") @@ -91,6 +87,14 @@ def have_package(name: str) -> bool: have_graphviz = have_package("graphviz") graphviz_requirement_message = None if have_graphviz else "No module named 'graphviz'" +have_flameprof = have_package("flameprof") +flameprof_requirement_message = None if have_flameprof else "No module named 'flameprof'" + +have_jinja2 = have_package("jinja2") +jinja2_requirement_message = None if have_jinja2 else "No module named 'jinja2'" + +have_openpyxl = have_package("openpyxl") +openpyxl_requirement_message = None if have_openpyxl else "No module named 'openpyxl'" pandas_requirement_message = None try: @@ -336,6 +340,7 @@ def check_error( messageParameters: Optional[Dict[str, str]] = None, query_context_type: Optional[QueryContextType] = None, fragment: Optional[str] = None, + matchPVals: bool = False, ): query_context = exception.getQueryContext() assert bool(query_context) == (query_context_type is not None), ( 
@@ -359,9 +364,30 @@ def check_error( # Test message parameters expected = messageParameters actual = exception.getMessageParameters() - self.assertEqual( - expected, actual, f"Expected message parameters was '{expected}', got '{actual}'" - ) + if matchPVals: + self.assertEqual( + len(expected), + len(actual), + "Expected message parameters count does not match actual message parameters count" + f": {len(expected)}, {len(actual)}.", + ) + for key, value in expected.items(): + self.assertIn( + key, + actual, + f"Expected message parameter key '{key}' was not found " + "in actual message parameters.", + ) + self.assertRegex( + actual[key], + value, + f"Expected message parameter value '{value}' does not match actual message " + f"parameter value '{actual[key]}'.", + ), + else: + self.assertEqual( + expected, actual, f"Expected message parameters was '{expected}', got '{actual}'" + ) # Test query context if query_context: diff --git a/python/pyspark/util.py b/python/pyspark/util.py index 3b38b8b72c615..3e9a68ccfe2e5 100644 --- a/python/pyspark/util.py +++ b/python/pyspark/util.py @@ -27,6 +27,7 @@ import traceback import typing import socket +import warnings from types import TracebackType from typing import Any, Callable, IO, Iterator, List, Optional, TextIO, Tuple, Union @@ -366,7 +367,8 @@ def inheritable_thread_target(f: Optional[Union[Callable, "SparkSession"]] = Non >>> Thread(target=inheritable_thread_target(target_func)).start() # doctest: +SKIP - If you're using Spark Connect, you should explicitly provide Spark session as follows: + If you're using Spark Connect or if you want to inherit the tags properly, + you should explicitly provide Spark session as follows: >>> @inheritable_thread_target(session) # doctest: +SKIP ... 
def target_func(): @@ -406,13 +408,41 @@ def inner(*args: Any, **kwargs: Any) -> Any: return outer - # Non Spark Connect + # Non Spark Connect with SparkSession or Callable + from pyspark.sql import SparkSession from pyspark import SparkContext from py4j.clientserver import ClientServer if isinstance(SparkContext._gateway, ClientServer): # Here's when the pinned-thread mode (PYSPARK_PIN_THREAD) is on. + if isinstance(f, SparkSession): + session = f + assert session is not None + tags = set(session.getTags()) + # Local properties are copied when wrapping the function. + assert SparkContext._active_spark_context is not None + properties = SparkContext._active_spark_context._jsc.sc().getLocalProperties().clone() + + def outer(ff: Callable) -> Callable: + @functools.wraps(ff) + def wrapped(*args: Any, **kwargs: Any) -> Any: + # Apply properties and tags in the child thread. + assert SparkContext._active_spark_context is not None + SparkContext._active_spark_context._jsc.sc().setLocalProperties(properties) + for tag in tags: + session.addTag(tag) # type: ignore[union-attr] + return ff(*args, **kwargs) + + return wrapped + + return outer + + warnings.warn( + "Spark session is not provided. Tags will not be inherited.", + UserWarning, + ) + # NOTICE the internal difference vs `InheritableThread`. `InheritableThread` # copies local properties when the thread starts but `inheritable_thread_target` # copies when the function is wrapped. @@ -506,11 +536,15 @@ def copy_local_properties(*a: Any, **k: Any) -> Any: from pyspark import SparkContext from py4j.clientserver import ClientServer + self._session = session # type: ignore[assignment] if isinstance(SparkContext._gateway, ClientServer): # Here's when the pinned-thread mode (PYSPARK_PIN_THREAD) is on. def copy_local_properties(*a: Any, **k: Any) -> Any: # self._props is set before starting the thread to match the behavior with JVM. 
assert hasattr(self, "_props") + if hasattr(self, "_tags"): + for tag in self._tags: # type: ignore[has-type] + self._session.addTag(tag) assert SparkContext._active_spark_context is not None SparkContext._active_spark_context._jsc.sc().setLocalProperties(self._props) return target(*a, **k) @@ -546,6 +580,9 @@ def start(self) -> None: self._props = ( SparkContext._active_spark_context._jsc.sc().getLocalProperties().clone() ) + if self._session is not None: + self._tags = self._session.getTags() + return super(InheritableThread, self).start() diff --git a/python/pyspark/worker.py b/python/pyspark/worker.py index 04f95e9f52648..e799498cdd80b 100644 --- a/python/pyspark/worker.py +++ b/python/pyspark/worker.py @@ -34,6 +34,7 @@ _deserialize_accumulator, ) from pyspark.sql.streaming.stateful_processor_api_client import StatefulProcessorApiClient +from pyspark.sql.streaming.stateful_processor_util import TransformWithStateInPandasFuncMode from pyspark.taskcontext import BarrierTaskContext, TaskContext from pyspark.resource import ResourceInformation from pyspark.util import PythonEvalType, local_connect_and_auth @@ -153,7 +154,7 @@ def verify_result_length(result, length): ) -def wrap_arrow_batch_udf(f, args_offsets, kwargs_offsets, return_type): +def wrap_arrow_batch_udf(f, args_offsets, kwargs_offsets, return_type, runner_conf): import pandas as pd func, args_kwargs_offsets = wrap_kwargs_support(f, args_offsets, kwargs_offsets) @@ -171,9 +172,21 @@ def wrap_arrow_batch_udf(f, args_offsets, kwargs_offsets, return_type): elif type(return_type) == BinaryType: result_func = lambda r: bytes(r) if r is not None else r # noqa: E731 - @fail_on_stopiteration - def evaluate(*args: pd.Series) -> pd.Series: - return pd.Series([result_func(func(*row)) for row in zip(*args)]) + if "spark.sql.execution.pythonUDF.arrow.concurrency.level" in runner_conf: + from concurrent.futures import ThreadPoolExecutor + + c = 
int(runner_conf["spark.sql.execution.pythonUDF.arrow.concurrency.level"]) + + @fail_on_stopiteration + def evaluate(*args: pd.Series) -> pd.Series: + with ThreadPoolExecutor(max_workers=c) as pool: + return pd.Series(list(pool.map(lambda row: result_func(func(*row)), zip(*args)))) + + else: + + @fail_on_stopiteration + def evaluate(*args: pd.Series) -> pd.Series: + return pd.Series([result_func(func(*row)) for row in zip(*args)]) def verify_result_length(result, length): if len(result) != length: @@ -493,36 +506,36 @@ def wrapped(key_series, value_series): def wrap_grouped_transform_with_state_pandas_udf(f, return_type, runner_conf): - def wrapped(stateful_processor_api_client, key, value_series_gen): + def wrapped(stateful_processor_api_client, mode, key, value_series_gen): import pandas as pd values = (pd.concat(x, axis=1) for x in value_series_gen) - result_iter = f(stateful_processor_api_client, key, values) + result_iter = f(stateful_processor_api_client, mode, key, values) # TODO(SPARK-49100): add verification that elements in result_iter are # indeed of type pd.DataFrame and confirm to assigned cols return result_iter - return lambda p, k, v: [(wrapped(p, k, v), to_arrow_type(return_type))] + return lambda p, m, k, v: [(wrapped(p, m, k, v), to_arrow_type(return_type))] def wrap_grouped_transform_with_state_pandas_init_state_udf(f, return_type, runner_conf): - def wrapped(stateful_processor_api_client, key, value_series_gen): + def wrapped(stateful_processor_api_client, mode, key, value_series_gen): import pandas as pd state_values_gen, init_states_gen = itertools.tee(value_series_gen, 2) state_values = (df for x, _ in state_values_gen if not (df := pd.concat(x, axis=1)).empty) init_states = (df for _, x in init_states_gen if not (df := pd.concat(x, axis=1)).empty) - result_iter = f(stateful_processor_api_client, key, state_values, init_states) + result_iter = f(stateful_processor_api_client, mode, key, state_values, init_states) # TODO(SPARK-49100): add 
verification that elements in result_iter are # indeed of type pd.DataFrame and confirm to assigned cols return result_iter - return lambda p, k, v: [(wrapped(p, k, v), to_arrow_type(return_type))] + return lambda p, m, k, v: [(wrapped(p, m, k, v), to_arrow_type(return_type))] def wrap_grouped_map_pandas_udf_with_state(f, return_type): @@ -854,7 +867,7 @@ def read_single_udf(pickleSer, infile, eval_type, runner_conf, udf_index, profil if eval_type == PythonEvalType.SQL_SCALAR_PANDAS_UDF: return wrap_scalar_pandas_udf(func, args_offsets, kwargs_offsets, return_type) elif eval_type == PythonEvalType.SQL_ARROW_BATCHED_UDF: - return wrap_arrow_batch_udf(func, args_offsets, kwargs_offsets, return_type) + return wrap_arrow_batch_udf(func, args_offsets, kwargs_offsets, return_type, runner_conf) elif eval_type == PythonEvalType.SQL_SCALAR_PANDAS_ITER_UDF: return args_offsets, wrap_pandas_batch_iter_udf(func, return_type) elif eval_type == PythonEvalType.SQL_MAP_PANDAS_ITER_UDF: @@ -1036,7 +1049,7 @@ def eval(self, *args, **kwargs) -> Iterator: list(args) + list(kwargs.values()) ) if changed_partitions: - if self._udtf.terminate is not None: + if hasattr(self._udtf, "terminate"): result = self._udtf.terminate() if result is not None: for row in result: @@ -1062,7 +1075,7 @@ def eval(self, *args, **kwargs) -> Iterator: self._eval_raised_skip_rest_of_input_table = True def terminate(self) -> Iterator: - if self._udtf.terminate is not None: + if hasattr(self._udtf, "terminate"): return self._udtf.terminate() return iter(()) @@ -1556,7 +1569,8 @@ def read_udfs(pickleSer, infile, eval_type): arrow_cast, ) else: - ser = BatchedSerializer(CPickleSerializer(), 100) + batch_size = int(os.environ.get("PYTHON_UDF_BATCH_SIZE", "100")) + ser = BatchedSerializer(CPickleSerializer(), batch_size) is_profiling = read_bool(infile) if is_profiling: @@ -1697,18 +1711,22 @@ def mapper(a): ser.key_offsets = parsed_offsets[0][0] stateful_processor_api_client = 
StatefulProcessorApiClient(state_server_port, key_schema) - # Create function like this: - # mapper a: f([a[0]], [a[0], a[1]]) def mapper(a): - key = a[0] + mode = a[0] - def values_gen(): - for x in a[1]: - retVal = [x[1][o] for o in parsed_offsets[0][1]] - yield retVal + if mode == TransformWithStateInPandasFuncMode.PROCESS_DATA: + key = a[1] - # This must be generator comprehension - do not materialize. - return f(stateful_processor_api_client, key, values_gen()) + def values_gen(): + for x in a[2]: + retVal = [x[1][o] for o in parsed_offsets[0][1]] + yield retVal + + # This must be generator comprehension - do not materialize. + return f(stateful_processor_api_client, mode, key, values_gen()) + else: + # mode == PROCESS_TIMER or mode == COMPLETE + return f(stateful_processor_api_client, mode, None, iter([])) elif eval_type == PythonEvalType.SQL_TRANSFORM_WITH_STATE_PANDAS_INIT_STATE_UDF: # We assume there is only one UDF here because grouped map doesn't @@ -1731,16 +1749,22 @@ def values_gen(): stateful_processor_api_client = StatefulProcessorApiClient(state_server_port, key_schema) def mapper(a): - key = a[0] + mode = a[0] - def values_gen(): - for x in a[1]: - retVal = [x[1][o] for o in parsed_offsets[0][1]] - initVal = [x[2][o] for o in parsed_offsets[1][1]] - yield retVal, initVal + if mode == TransformWithStateInPandasFuncMode.PROCESS_DATA: + key = a[1] - # This must be generator comprehension - do not materialize. - return f(stateful_processor_api_client, key, values_gen()) + def values_gen(): + for x in a[2]: + retVal = [x[1][o] for o in parsed_offsets[0][1]] + initVal = [x[2][o] for o in parsed_offsets[1][1]] + yield retVal, initVal + + # This must be generator comprehension - do not materialize. 
+ return f(stateful_processor_api_client, mode, key, values_gen()) + else: + # mode == PROCESS_TIMER or mode == COMPLETE + return f(stateful_processor_api_client, mode, None, iter([])) elif eval_type == PythonEvalType.SQL_GROUPED_MAP_ARROW_UDF: import pyarrow as pa @@ -1958,17 +1982,6 @@ def process(): try: serializer.dump_stream(out_iter, outfile) finally: - # Sending a signal to TransformWithState UDF to perform proper cleanup steps. - if ( - eval_type == PythonEvalType.SQL_TRANSFORM_WITH_STATE_PANDAS_UDF - or eval_type == PythonEvalType.SQL_TRANSFORM_WITH_STATE_PANDAS_INIT_STATE_UDF - ): - # Sending key as None to indicate that process() has finished. - end_iter = func(split_index, iter([(None, None)])) - # Need to materialize the iterator to trigger the cleanup steps, nothing needs - # to be done here. - for _ in end_iter: - pass if hasattr(out_iter, "close"): out_iter.close() diff --git a/python/pyspark/worker_util.py b/python/pyspark/worker_util.py index 81c05ce94eb65..5c758d3f83fe6 100644 --- a/python/pyspark/worker_util.py +++ b/python/pyspark/worker_util.py @@ -107,8 +107,8 @@ def setup_memory_limits(memory_limit_mb: int) -> None: except (resource.error, OSError, ValueError) as e: # not all systems support resource limits, so warn instead of failing - curent = currentframe() - lineno = getframeinfo(curent).lineno + 1 if curent is not None else 0 + current = currentframe() + lineno = getframeinfo(current).lineno + 1 if current is not None else 0 if "__file__" in globals(): print( warnings.formatwarning( diff --git a/resource-managers/kubernetes/core/pom.xml b/resource-managers/kubernetes/core/pom.xml index 211c6c93b9674..814a3e1c595b8 100644 --- a/resource-managers/kubernetes/core/pom.xml +++ b/resource-managers/kubernetes/core/pom.xml @@ -37,7 +37,7 @@ io.fabric8 - volcano-model-v1beta1 + volcano-model ${kubernetes-client.version} @@ -105,11 +105,6 @@ test - - io.fabric8 - kubernetes-httpclient-okhttp - ${kubernetes-client.version} - io.fabric8 
kubernetes-client diff --git a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/SparkKubernetesClientFactory.scala b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/SparkKubernetesClientFactory.scala index 2c28dc380046c..557bf01cbdbae 100644 --- a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/SparkKubernetesClientFactory.scala +++ b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/SparkKubernetesClientFactory.scala @@ -24,10 +24,7 @@ import com.google.common.io.Files import io.fabric8.kubernetes.client.{ConfigBuilder, KubernetesClient, KubernetesClientBuilder} import io.fabric8.kubernetes.client.Config.KUBERNETES_REQUEST_RETRY_BACKOFFLIMIT_SYSTEM_PROPERTY import io.fabric8.kubernetes.client.Config.autoConfigure -import io.fabric8.kubernetes.client.okhttp.OkHttpClientFactory import io.fabric8.kubernetes.client.utils.Utils.getSystemPropertyOrEnvVar -import okhttp3.Dispatcher -import okhttp3.OkHttpClient import org.apache.spark.SparkConf import org.apache.spark.annotation.{DeveloperApi, Since, Stable} @@ -35,7 +32,6 @@ import org.apache.spark.deploy.k8s.Config._ import org.apache.spark.internal.{Logging, MDC} import org.apache.spark.internal.LogKeys.K8S_CONTEXT import org.apache.spark.internal.config.ConfigEntry -import org.apache.spark.util.ThreadUtils /** * :: DeveloperApi :: @@ -78,10 +74,6 @@ object SparkKubernetesClientFactory extends Logging { .getOption(s"$kubernetesAuthConfPrefix.$CLIENT_KEY_FILE_CONF_SUFFIX") val clientCertFile = sparkConf .getOption(s"$kubernetesAuthConfPrefix.$CLIENT_CERT_FILE_CONF_SUFFIX") - // TODO(SPARK-37687): clean up direct usage of OkHttpClient, see also: - // https://github.com/fabric8io/kubernetes-client/issues/3547 - val dispatcher = new Dispatcher( - ThreadUtils.newDaemonCachedThreadPool("kubernetes-dispatcher")) // Allow for specifying a context used to auto-configure from the users K8S config file val kubeContext 
= sparkConf.get(KUBERNETES_CONTEXT).filter(_.nonEmpty) @@ -117,17 +109,9 @@ object SparkKubernetesClientFactory extends Logging { }.withOption(namespace) { (ns, configBuilder) => configBuilder.withNamespace(ns) }.build() - val factoryWithCustomDispatcher = new OkHttpClientFactory() { - override protected def additionalConfig(builder: OkHttpClient.Builder): Unit = { - builder.dispatcher(dispatcher) - } - } logDebug("Kubernetes client config: " + new ObjectMapper().writerWithDefaultPrettyPrinter().writeValueAsString(config)) - new KubernetesClientBuilder() - .withHttpClientFactory(factoryWithCustomDispatcher) - .withConfig(config) - .build() + new KubernetesClientBuilder().withConfig(config).build() } private implicit class OptionConfigurableConfigBuilder(val configBuilder: ConfigBuilder) diff --git a/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/features/ExecutorKubernetesCredentialsFeatureStepSuite.scala b/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/features/ExecutorKubernetesCredentialsFeatureStepSuite.scala index 59cc7ac91d1ab..6a14711071b80 100644 --- a/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/features/ExecutorKubernetesCredentialsFeatureStepSuite.scala +++ b/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/features/ExecutorKubernetesCredentialsFeatureStepSuite.scala @@ -16,10 +16,11 @@ */ package org.apache.spark.deploy.k8s.features +import io.fabric8.kubernetes.api.model.PodSpec import org.scalatest.BeforeAndAfter import org.apache.spark.{SparkConf, SparkFunSuite} -import org.apache.spark.deploy.k8s.{KubernetesExecutorConf, KubernetesTestConf, SparkPod} +import org.apache.spark.deploy.k8s.{KubernetesTestConf, SparkPod} import org.apache.spark.deploy.k8s.Config._ class ExecutorKubernetesCredentialsFeatureStepSuite extends SparkFunSuite with BeforeAndAfter { @@ -30,58 +31,40 @@ class ExecutorKubernetesCredentialsFeatureStepSuite extends 
SparkFunSuite with B baseConf = new SparkConf(false) } - private def newExecutorConf(environment: Map[String, String] = Map.empty): - KubernetesExecutorConf = { - KubernetesTestConf.createExecutorConf( - sparkConf = baseConf, - environment = environment) - } - test("configure spark pod with executor service account") { baseConf.set(KUBERNETES_EXECUTOR_SERVICE_ACCOUNT_NAME, "executor-name") - val step = new ExecutorKubernetesCredentialsFeatureStep(newExecutorConf()) - val spec = step - .configurePod(SparkPod.initialPod()) - .pod - .getSpec - - val serviceAccountName = spec.getServiceAccountName - val accountName = spec.getServiceAccount - assertSAName(serviceAccountName, accountName) + val spec = evaluateStep() + assertSAName("executor-name", spec) } test("configure spark pod with with driver service account " + "and without executor service account") { baseConf.set(KUBERNETES_DRIVER_SERVICE_ACCOUNT_NAME, "driver-name") - val step = new ExecutorKubernetesCredentialsFeatureStep(newExecutorConf()) - val spec = step - .configurePod(SparkPod.initialPod()) - .pod - .getSpec - - val serviceAccountName = spec.getServiceAccountName - val accountName = spec.getServiceAccount - assertSAName(serviceAccountName, accountName) + val spec = evaluateStep() + assertSAName("driver-name", spec) } test("configure spark pod with with driver service account " + "and with executor service account") { baseConf.set(KUBERNETES_DRIVER_SERVICE_ACCOUNT_NAME, "driver-name") baseConf.set(KUBERNETES_EXECUTOR_SERVICE_ACCOUNT_NAME, "executor-name") + val spec = evaluateStep() + assertSAName("executor-name", spec) + } + + private def assertSAName(expectedServiceAccountName: String, + spec: PodSpec): Unit = { + assert(spec.getServiceAccountName.equals(expectedServiceAccountName)) + assert(spec.getServiceAccount.equals(expectedServiceAccountName)) + } - val step = new ExecutorKubernetesCredentialsFeatureStep(newExecutorConf()) - val spec = step + private def evaluateStep(): PodSpec = { + val 
executorConf = KubernetesTestConf.createExecutorConf( + sparkConf = baseConf) + val step = new ExecutorKubernetesCredentialsFeatureStep(executorConf) + step .configurePod(SparkPod.initialPod()) .pod .getSpec - - val serviceAccountName = spec.getServiceAccountName - val accountName = spec.getServiceAccount - assertSAName(serviceAccountName, accountName) - } - - def assertSAName(serviceAccountName: String, accountName: String): Unit = { - assert(serviceAccountName.equals(serviceAccountName)) - assert(accountName.equals(accountName)) } } diff --git a/resource-managers/kubernetes/core/volcano/src/main/scala/org/apache/spark/deploy/k8s/features/VolcanoFeatureStep.scala b/resource-managers/kubernetes/core/volcano/src/main/scala/org/apache/spark/deploy/k8s/features/VolcanoFeatureStep.scala index 314550713ef16..046d268df4e29 100644 --- a/resource-managers/kubernetes/core/volcano/src/main/scala/org/apache/spark/deploy/k8s/features/VolcanoFeatureStep.scala +++ b/resource-managers/kubernetes/core/volcano/src/main/scala/org/apache/spark/deploy/k8s/features/VolcanoFeatureStep.scala @@ -17,8 +17,8 @@ package org.apache.spark.deploy.k8s.features import io.fabric8.kubernetes.api.model._ +import io.fabric8.volcano.api.model.scheduling.v1beta1.{PodGroup, PodGroupSpec} import io.fabric8.volcano.client.DefaultVolcanoClient -import io.fabric8.volcano.scheduling.v1beta1.{PodGroup, PodGroupSpec} import org.apache.spark.deploy.k8s.{KubernetesConf, KubernetesDriverConf, KubernetesExecutorConf, SparkPod} import org.apache.spark.internal.Logging diff --git a/resource-managers/kubernetes/core/volcano/src/test/scala/org/apache/spark/deploy/k8s/features/VolcanoFeatureStepSuite.scala b/resource-managers/kubernetes/core/volcano/src/test/scala/org/apache/spark/deploy/k8s/features/VolcanoFeatureStepSuite.scala index dab414e0e19e7..c92164993ef80 100644 --- a/resource-managers/kubernetes/core/volcano/src/test/scala/org/apache/spark/deploy/k8s/features/VolcanoFeatureStepSuite.scala +++ 
b/resource-managers/kubernetes/core/volcano/src/test/scala/org/apache/spark/deploy/k8s/features/VolcanoFeatureStepSuite.scala @@ -18,7 +18,7 @@ package org.apache.spark.deploy.k8s.features import java.io.File -import io.fabric8.volcano.scheduling.v1beta1.PodGroup +import io.fabric8.volcano.api.model.scheduling.v1beta1.PodGroup import org.apache.spark.{SparkConf, SparkFunSuite} import org.apache.spark.deploy.k8s._ diff --git a/resource-managers/kubernetes/integration-tests/README.md b/resource-managers/kubernetes/integration-tests/README.md index 5b80fe10596c1..36848cba0609e 100644 --- a/resource-managers/kubernetes/integration-tests/README.md +++ b/resource-managers/kubernetes/integration-tests/README.md @@ -330,11 +330,11 @@ You can also specify your specific dockerfile to build JVM/Python/R based image ## Requirements - A minimum of 6 CPUs and 9G of memory is required to complete all Volcano test cases. -- Volcano v1.9.0. +- Volcano v1.10.0. ## Installation - kubectl apply -f https://raw.githubusercontent.com/volcano-sh/volcano/v1.9.0/installer/volcano-development.yaml + kubectl apply -f https://raw.githubusercontent.com/volcano-sh/volcano/v1.10.0/installer/volcano-development.yaml ## Run tests @@ -355,5 +355,5 @@ You can also specify `volcano` tag to only run Volcano test: ## Cleanup Volcano - kubectl delete -f https://raw.githubusercontent.com/volcano-sh/volcano/v1.9.0/installer/volcano-development.yaml + kubectl delete -f https://raw.githubusercontent.com/volcano-sh/volcano/v1.10.0/installer/volcano-development.yaml diff --git a/resource-managers/kubernetes/integration-tests/volcano/src/test/scala/org/apache/spark/deploy/k8s/integrationtest/VolcanoTestsSuite.scala b/resource-managers/kubernetes/integration-tests/volcano/src/test/scala/org/apache/spark/deploy/k8s/integrationtest/VolcanoTestsSuite.scala index 2f414b72ee195..4a9dc135ecfdc 100644 --- 
a/resource-managers/kubernetes/integration-tests/volcano/src/test/scala/org/apache/spark/deploy/k8s/integrationtest/VolcanoTestsSuite.scala +++ b/resource-managers/kubernetes/integration-tests/volcano/src/test/scala/org/apache/spark/deploy/k8s/integrationtest/VolcanoTestsSuite.scala @@ -28,8 +28,8 @@ import scala.concurrent.Future import scala.jdk.CollectionConverters._ import io.fabric8.kubernetes.api.model.{HasMetadata, Pod, Quantity} +import io.fabric8.volcano.api.model.scheduling.v1beta1.{Queue, QueueBuilder} import io.fabric8.volcano.client.VolcanoClient -import io.fabric8.volcano.scheduling.v1beta1.{Queue, QueueBuilder} import org.scalatest.BeforeAndAfterEach import org.scalatest.concurrent.Eventually diff --git a/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnAllocator.scala b/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnAllocator.scala index c86195d0ef31e..911ce2e27f96d 100644 --- a/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnAllocator.scala +++ b/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnAllocator.scala @@ -529,7 +529,7 @@ private[yarn] class YarnAllocator( log"${MDC(LogKeys.MEMORY_SIZE, resource.getMemorySize)} MB memory." 
if (resource.getResources.nonEmpty) { requestContainerMessage = requestContainerMessage + - log" with custom resources: ${MDC(LogKeys.RESOURCE, resource)}" + log" with custom resources: ${MDC(LogKeys.YARN_RESOURCE, resource)}" } logInfo(requestContainerMessage) } @@ -820,6 +820,7 @@ private[yarn] class YarnAllocator( logInfo(log"Skip launching executorRunnable as running executors count: " + log"${MDC(LogKeys.COUNT, rpRunningExecs)} reached target executors count: " + log"${MDC(LogKeys.NUM_EXECUTOR_TARGET, getOrUpdateTargetNumExecutorsForRPId(rpId))}.") + internalReleaseContainer(container) } } } diff --git a/resource-managers/yarn/src/test/scala/org/apache/spark/deploy/yarn/YarnClusterSuite.scala b/resource-managers/yarn/src/test/scala/org/apache/spark/deploy/yarn/YarnClusterSuite.scala index 92d9f2d62d1c1..71843b7f90b1f 100644 --- a/resource-managers/yarn/src/test/scala/org/apache/spark/deploy/yarn/YarnClusterSuite.scala +++ b/resource-managers/yarn/src/test/scala/org/apache/spark/deploy/yarn/YarnClusterSuite.scala @@ -293,7 +293,7 @@ class YarnClusterSuite extends BaseYarnClusterSuite { } test("running Spark in yarn-cluster mode displays driver log links") { - val log4jConf = new File(tempDir, "log4j.properties") + val log4jConf = new File(tempDir, "log4j2.properties") val logOutFile = new File(tempDir, "logs") Files.asCharSink(log4jConf, StandardCharsets.UTF_8).write( s"""rootLogger.level = debug diff --git a/sbin/spark-config.sh b/sbin/spark-config.sh index 0bea4a45040ed..814e17f147129 100755 --- a/sbin/spark-config.sh +++ b/sbin/spark-config.sh @@ -28,6 +28,6 @@ export SPARK_CONF_DIR="${SPARK_CONF_DIR:-"${SPARK_HOME}/conf"}" # Add the PySpark classes to the PYTHONPATH: if [ -z "${PYSPARK_PYTHONPATH_SET}" ]; then export PYTHONPATH="${SPARK_HOME}/python:${PYTHONPATH}" - export PYTHONPATH="${SPARK_HOME}/python/lib/py4j-0.10.9.7-src.zip:${PYTHONPATH}" + export PYTHONPATH="${SPARK_HOME}/python/lib/py4j-0.10.9.9-src.zip:${PYTHONPATH}" export 
PYSPARK_PYTHONPATH_SET=1 fi diff --git a/sbin/start-connect-server.sh b/sbin/start-connect-server.sh index 668423bad1cbb..7f0c430a468a9 100755 --- a/sbin/start-connect-server.sh +++ b/sbin/start-connect-server.sh @@ -33,7 +33,7 @@ if [[ "$@" = *--help ]] || [[ "$@" = *-h ]]; then echo "Usage: ./sbin/start-connect-server.sh [--wait] [options]" "${SPARK_HOME}"/bin/spark-submit --help 2>&1 | grep -v Usage 1>&2 - exit 1 + exit 0 fi . "${SPARK_HOME}/bin/load-spark-env.sh" diff --git a/sbin/start-history-server.sh b/sbin/start-history-server.sh index 71dace47767cb..a99c8e557885b 100755 --- a/sbin/start-history-server.sh +++ b/sbin/start-history-server.sh @@ -40,7 +40,7 @@ if [[ "$@" = *--help ]] || [[ "$@" = *-h ]]; then pattern+="\|Registered signal handler for" "${SPARK_HOME}"/bin/spark-class $CLASS --help 2>&1 | grep -v "$pattern" 1>&2 - exit 1 + exit 0 fi . "${SPARK_HOME}/sbin/spark-config.sh" diff --git a/sbin/start-master.sh b/sbin/start-master.sh index 36fe4b4abeb91..25e739132f0d5 100755 --- a/sbin/start-master.sh +++ b/sbin/start-master.sh @@ -35,7 +35,7 @@ if [[ "$@" = *--help ]] || [[ "$@" = *-h ]]; then pattern+="\|Registered signal handler for" "${SPARK_HOME}"/bin/spark-class $CLASS --help 2>&1 | grep -v "$pattern" 1>&2 - exit 1 + exit 0 fi ORIGINAL_ARGS="$@" diff --git a/sbin/start-thriftserver.sh b/sbin/start-thriftserver.sh index b1d38713218b7..a457526979341 100755 --- a/sbin/start-thriftserver.sh +++ b/sbin/start-thriftserver.sh @@ -52,7 +52,7 @@ function usage { if [[ "$@" = *--help ]] || [[ "$@" = *-h ]]; then usage - exit 1 + exit 0 fi export SUBMIT_USAGE_FUNCTION=usage diff --git a/sbin/start-worker.sh b/sbin/start-worker.sh index fd58f01bac2eb..c0147a51b3f2c 100755 --- a/sbin/start-worker.sh +++ b/sbin/start-worker.sh @@ -47,7 +47,7 @@ if [[ $# -lt 1 ]] || [[ "$@" = *--help ]] || [[ "$@" = *-h ]]; then pattern+="\|Registered signal handler for" "${SPARK_HOME}"/bin/spark-class $CLASS --help 2>&1 | grep -v "$pattern" 1>&2 - exit 1 + [[ $# -lt 1 ]] && 
exit 1 || exit 0 fi . "${SPARK_HOME}/sbin/spark-config.sh" diff --git a/scalastyle-config.xml b/scalastyle-config.xml index 05b3f6a268985..7e64dc9be6731 100644 --- a/scalastyle-config.xml +++ b/scalastyle-config.xml @@ -460,33 +460,6 @@ This file is divided into 3 sections: -1,0,1,2,3 - - Objects.toStringHelper - Avoid using Object.toStringHelper. Use ToStringBuilder instead. - - - - Files\.createTempDir\( - Avoid using com.google.common.io.Files.createTempDir due to CVE-2020-8908. - Use org.apache.spark.util.Utils.createTempDir instead. - - - - - FileBackedOutputStream - Avoid using FileBackedOutputStream due to CVE-2023-2976. - - - - AtomicDoubleArray - Avoid using AtomicDoubleArray due to CVE-2018-10237. - - - - CompoundOrdering - Avoid using CompoundOrdering due to CVE-2018-10237. - - byteCountToDisplaySize Use Utils.bytesToString instead of byteCountToDisplaySize for consistency. diff --git a/sql/api/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBaseLexer.g4 b/sql/api/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBaseLexer.g4 index eeebe89de8ff1..dafeed48aef11 100644 --- a/sql/api/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBaseLexer.g4 +++ b/sql/api/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBaseLexer.g4 @@ -283,6 +283,7 @@ IS: 'IS'; ITEMS: 'ITEMS'; ITERATE: 'ITERATE'; JOIN: 'JOIN'; +JSON: 'JSON'; KEYS: 'KEYS'; LANGUAGE: 'LANGUAGE'; LAST: 'LAST'; @@ -365,6 +366,7 @@ REAL: 'REAL'; RECORDREADER: 'RECORDREADER'; RECORDWRITER: 'RECORDWRITER'; RECOVER: 'RECOVER'; +RECURSIVE: 'RECURSIVE'; REDUCE: 'REDUCE'; REFERENCES: 'REFERENCES'; REFRESH: 'REFRESH'; diff --git a/sql/api/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBaseParser.g4 b/sql/api/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBaseParser.g4 index cdee8c906054d..667d200268cf8 100644 --- a/sql/api/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBaseParser.g4 +++ 
b/sql/api/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBaseParser.g4 @@ -48,15 +48,15 @@ compoundOrSingleStatement ; singleCompoundStatement - : BEGIN compoundBody END SEMICOLON? EOF + : BEGIN compoundBody? END SEMICOLON? EOF ; beginEndCompoundBlock - : beginLabel? BEGIN compoundBody END endLabel? + : beginLabel? BEGIN compoundBody? END endLabel? ; compoundBody - : (compoundStatements+=compoundStatement SEMICOLON)* + : (compoundStatements+=compoundStatement SEMICOLON)+ ; compoundStatement @@ -70,6 +70,7 @@ compoundStatement | leaveStatement | iterateStatement | loopStatement + | forStatement ; setStatementWithOptionalVarKeyword @@ -111,6 +112,10 @@ loopStatement : beginLabel? LOOP compoundBody END LOOP endLabel? ; +forStatement + : beginLabel? FOR (multipartIdentifier AS)? query DO compoundBody END FOR endLabel? + ; + singleStatement : (statement|setResetStatement) SEMICOLON* EOF ; @@ -231,6 +236,7 @@ statement | ALTER TABLE identifierReference RECOVER PARTITIONS #recoverPartitions | ALTER TABLE identifierReference (clusterBySpec | CLUSTER BY NONE) #alterClusterBy + | ALTER TABLE identifierReference collationSpec #alterTableCollation | DROP TABLE (IF EXISTS)? identifierReference PURGE? #dropTable | DROP VIEW (IF EXISTS)? identifierReference #dropView | CREATE (OR REPLACE)? (GLOBAL? TEMPORARY)? @@ -238,6 +244,7 @@ statement identifierCommentList? (commentSpec | schemaBinding | + collationSpec | (PARTITIONED ON identifierList) | (TBLPROPERTIES propertyList))* AS query #createView @@ -280,7 +287,7 @@ statement | (DESC | DESCRIBE) namespace EXTENDED? identifierReference #describeNamespace | (DESC | DESCRIBE) TABLE? option=(EXTENDED | FORMATTED)? - identifierReference partitionSpec? describeColName? #describeRelation + identifierReference partitionSpec? describeColName? (AS JSON)? #describeRelation | (DESC | DESCRIBE) QUERY? 
query #describeQuery | COMMENT ON namespace identifierReference IS comment #commentNamespace @@ -502,7 +509,7 @@ describeColName ; ctes - : WITH namedQuery (COMMA namedQuery)* + : WITH RECURSIVE? namedQuery (COMMA namedQuery)* ; namedQuery @@ -523,6 +530,7 @@ createTableClauses createFileFormat | locationSpec | commentSpec | + collationSpec | (TBLPROPERTIES tableProps=propertyList))* ; @@ -643,7 +651,7 @@ sortItem ; fromStatement - : fromClause fromStatementBody+ + : fromClause fromStatementBody* ; fromStatementBody @@ -1227,8 +1235,12 @@ colPosition : position=FIRST | position=AFTER afterCol=errorCapturingIdentifier ; +collationSpec + : DEFAULT COLLATION collationName=identifier + ; + collateClause - : COLLATE collationName=identifier + : COLLATE collationName=multipartIdentifier ; type @@ -1504,6 +1516,9 @@ version operatorPipeRightSide : selectClause windowClause? | EXTEND extendList=namedExpressionSeq + | SET operatorPipeSetAssignmentSeq + | DROP identifierSeq + | AS errorCapturingIdentifier // Note that the WINDOW clause is not allowed in the WHERE pipe operator, but we add it here in // the grammar simply for purposes of catching this invalid syntax and throwing a specific // dedicated error message. @@ -1515,11 +1530,20 @@ operatorPipeRightSide | unpivotClause pivotClause? | sample | joinRelation - | operator=(UNION | EXCEPT | SETMINUS | INTERSECT) setQuantifier? right=queryTerm + | operator=(UNION | EXCEPT | SETMINUS | INTERSECT) setQuantifier? right=queryPrimary | queryOrganization | AGGREGATE namedExpressionSeq? aggregationClause? ; +operatorPipeSetAssignmentSeq + : ident+=errorCapturingIdentifier + (DOT errorCapturingIdentifier)* // This is invalid syntax; we just capture it here. + EQ expression + (COMMA ident+=errorCapturingIdentifier + (DOT errorCapturingIdentifier)* // This is invalid syntax; we just capture it here. + EQ expression)* + ; + // When `SQL_standard_keyword_behavior=true`, there are 2 kinds of keywords in Spark SQL. 
// - Reserved keywords: // Keywords that are reserved and can't be used as identifiers for table, view, column, @@ -1656,6 +1680,7 @@ ansiNonReserved | INVOKER | ITEMS | ITERATE + | JSON | KEYS | LANGUAGE | LAST @@ -2015,6 +2040,7 @@ nonReserved | IS | ITEMS | ITERATE + | JSON | KEYS | LANGUAGE | LAST @@ -2094,6 +2120,7 @@ nonReserved | RECORDREADER | RECORDWRITER | RECOVER + | RECURSIVE | REDUCE | REFERENCES | REFRESH diff --git a/sql/api/src/main/scala/org/apache/spark/sql/Column.scala b/sql/api/src/main/scala/org/apache/spark/sql/Column.scala index 8498ae04d9a2a..f13b340e5e9c8 100644 --- a/sql/api/src/main/scala/org/apache/spark/sql/Column.scala +++ b/sql/api/src/main/scala/org/apache/spark/sql/Column.scala @@ -26,7 +26,7 @@ import org.apache.spark.sql.catalyst.parser.DataTypeParser import org.apache.spark.sql.catalyst.trees.CurrentOrigin.withOrigin import org.apache.spark.sql.expressions.Window import org.apache.spark.sql.functions.{lit, map} -import org.apache.spark.sql.internal.{ColumnNode, LazyOuterReference, UnresolvedAttribute} +import org.apache.spark.sql.internal.ColumnNode import org.apache.spark.sql.types._ import org.apache.spark.util.ArrayImplicits._ @@ -137,7 +137,7 @@ class TypedColumn[-T, U](node: ColumnNode, private[sql] val encoder: Encoder[U]) * @since 1.3.0 */ @Stable -class Column(val node: ColumnNode) extends Logging { +class Column(val node: ColumnNode) extends Logging with TableValuedFunctionArgument { private[sql] def this(name: String, planId: Option[Long]) = this(withOrigin { name match { case "*" => internal.UnresolvedStar(None, planId) @@ -1383,20 +1383,27 @@ class Column(val node: ColumnNode) extends Logging { def over(): Column = over(Window.spec) /** - * Marks this column reference as an outer reference for subqueries. + * Mark this column as an outer column if its expression refers to columns from an outer query. 
+ * This is used to trigger lazy analysis of Spark Classic DataFrame, so that we can use it to + * build subquery expressions. Spark Connect DataFrame is always lazily analyzed and does not + * need to use this function. * - * @group subquery + * {{{ + * // Spark can't analyze this `df` now as it doesn't know how to resolve `t1.col`. + * val df = spark.table("t2").where($"t2.col" === $"t1.col".outer()) + * + * // Since this `df` is lazily analyzed, you won't see any error until you try to execute it. + * df.collect() // Fails with UNRESOLVED_COLUMN error. + * + * // Now Spark can resolve `t1.col` with the outer plan `spark.table("t1")`. + * spark.table("t1").where(df.exists()) + * }}} + * + * @group expr_ops * @since 4.0.0 */ - def outer(): Column = withOrigin { - node match { - case attr: UnresolvedAttribute if !attr.isMetadataColumn => - Column(LazyOuterReference(attr.nameParts, attr.planId)) - case _ => - throw new IllegalArgumentException( - "Only unresolved attributes can be used as outer references") - } - } + def outer(): Column = Column(internal.LazyExpression(node)) + } /** diff --git a/sql/api/src/main/scala/org/apache/spark/sql/Encoders.scala b/sql/api/src/main/scala/org/apache/spark/sql/Encoders.scala index 9976b34f7a01f..4957d76af9a29 100644 --- a/sql/api/src/main/scala/org/apache/spark/sql/Encoders.scala +++ b/sql/api/src/main/scala/org/apache/spark/sql/Encoders.scala @@ -81,6 +81,20 @@ object Encoders { */ def DOUBLE: Encoder[java.lang.Double] = BoxedDoubleEncoder + /** + * An encoder for nullable char type. + * + * @since 4.0.0 + */ + def CHAR(length: Int): Encoder[java.lang.String] = CharEncoder(length) + + /** + * An encoder for nullable varchar type. + * + * @since 4.0.0 + */ + def VARCHAR(length: Int): Encoder[java.lang.String] = VarcharEncoder(length) + /** * An encoder for nullable string type. 
* diff --git a/sql/api/src/main/scala/org/apache/spark/sql/RuntimeConfig.scala b/sql/api/src/main/scala/org/apache/spark/sql/RuntimeConfig.scala index 9e6e0e97f0302..091fbf20a0a7f 100644 --- a/sql/api/src/main/scala/org/apache/spark/sql/RuntimeConfig.scala +++ b/sql/api/src/main/scala/org/apache/spark/sql/RuntimeConfig.scala @@ -17,6 +17,7 @@ package org.apache.spark.sql import org.apache.spark.annotation.Stable +import org.apache.spark.internal.config.{ConfigEntry, OptionalConfigEntry} /** * Runtime configuration interface for Spark. To access this, use `SparkSession.conf`. @@ -53,6 +54,11 @@ abstract class RuntimeConfig { set(key, value.toString) } + /** + * Sets the given Spark runtime configuration property. + */ + private[sql] def set[T](entry: ConfigEntry[T], value: T): Unit + /** * Returns the value of Spark runtime configuration property for the given key. If the key is * not set yet, return its default value if possible, otherwise `NoSuchElementException` will be @@ -74,6 +80,25 @@ abstract class RuntimeConfig { */ def get(key: String, default: String): String + /** + * Returns the value of Spark runtime configuration property for the given key. If the key is + * not set yet, return `defaultValue` in [[ConfigEntry]]. + */ + @throws[NoSuchElementException]("if the key is not set") + private[sql] def get[T](entry: ConfigEntry[T]): T + + /** + * Returns the value of Spark runtime configuration property for the given key. If the key is + * not set yet, return None. + */ + private[sql] def get[T](entry: OptionalConfigEntry[T]): Option[T] + + /** + * Returns the value of Spark runtime configuration property for the given key. If the key is + * not set yet, return the user given `default`. + */ + private[sql] def get[T](entry: ConfigEntry[T], default: T): T + /** * Returns all properties set in this conf. 
* diff --git a/sql/api/src/main/scala/org/apache/spark/sql/TableValuedFunctionArgument.scala b/sql/api/src/main/scala/org/apache/spark/sql/TableValuedFunctionArgument.scala new file mode 100644 index 0000000000000..f99c4ecd48554 --- /dev/null +++ b/sql/api/src/main/scala/org/apache/spark/sql/TableValuedFunctionArgument.scala @@ -0,0 +1,20 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql + +trait TableValuedFunctionArgument diff --git a/sql/api/src/main/scala/org/apache/spark/sql/api/Dataset.scala b/sql/api/src/main/scala/org/apache/spark/sql/api/Dataset.scala index 9d41998f11dc6..20c181e7b9cf6 100644 --- a/sql/api/src/main/scala/org/apache/spark/sql/api/Dataset.scala +++ b/sql/api/src/main/scala/org/apache/spark/sql/api/Dataset.scala @@ -859,6 +859,60 @@ abstract class Dataset[T] extends Serializable { joinWith(other, condition, "inner") } + + /** + * Lateral join with another `DataFrame`. + * + * Behaves as a JOIN LATERAL. + * + * @param right + * Right side of the join operation. + * @group untypedrel + * @since 4.0.0 + */ + def lateralJoin(right: DS[_]): Dataset[Row] + + /** + * Lateral join with another `DataFrame`. + * + * Behaves as a JOIN LATERAL. 
+ * + * @param right + * Right side of the join operation. + * @param joinExprs + * Join expression. + * @group untypedrel + * @since 4.0.0 + */ + def lateralJoin(right: DS[_], joinExprs: Column): Dataset[Row] + + /** + * Lateral join with another `DataFrame`. + * + * @param right + * Right side of the join operation. + * @param joinType + * Type of join to perform. Default `inner`. Must be one of: `inner`, `cross`, `left`, + * `leftouter`, `left_outer`. + * @group untypedrel + * @since 4.0.0 + */ + def lateralJoin(right: DS[_], joinType: String): Dataset[Row] + + /** + * Lateral join with another `DataFrame`. + * + * @param right + * Right side of the join operation. + * @param joinExprs + * Join expression. + * @param joinType + * Type of join to perform. Default `inner`. Must be one of: `inner`, `cross`, `left`, + * `leftouter`, `left_outer`. + * @group untypedrel + * @since 4.0.0 + */ + def lateralJoin(right: DS[_], joinExprs: Column, joinType: String): Dataset[Row] + protected def sortInternal(global: Boolean, sortExprs: Seq[Column]): Dataset[T] /** diff --git a/sql/api/src/main/scala/org/apache/spark/sql/api/SQLContext.scala b/sql/api/src/main/scala/org/apache/spark/sql/api/SQLContext.scala new file mode 100644 index 0000000000000..50590fffa1521 --- /dev/null +++ b/sql/api/src/main/scala/org/apache/spark/sql/api/SQLContext.scala @@ -0,0 +1,1022 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.api + +import scala.collection.immutable +import scala.reflect.runtime.universe.TypeTag + +import _root_.java.util.{List => JList, Map => JMap, Properties} + +import org.apache.spark.SparkContext +import org.apache.spark.annotation.{DeveloperApi, Experimental, Stable, Unstable} +import org.apache.spark.api.java.JavaRDD +import org.apache.spark.internal.Logging +import org.apache.spark.rdd.RDD +import org.apache.spark.sql.{Encoder, Encoders, ExperimentalMethods, Row} +import org.apache.spark.sql.api.SQLImplicits +import org.apache.spark.sql.catalog.Table +import org.apache.spark.sql.functions.{array_size, coalesce, col, lit, when} +import org.apache.spark.sql.sources.BaseRelation +import org.apache.spark.sql.types._ +import org.apache.spark.sql.util.ExecutionListenerManager + +/** + * The entry point for working with structured data (rows and columns) in Spark 1.x. + * + * As of Spark 2.0, this is replaced by [[SparkSession]]. However, we are keeping the class here + * for backward compatibility. 
+ * + * @groupname basic Basic Operations + * @groupname ddl_ops Persistent Catalog DDL + * @groupname cachemgmt Cached Table Management + * @groupname genericdata Generic Data Sources + * @groupname specificdata Specific Data Sources + * @groupname config Configuration + * @groupname dataframes Custom DataFrame Creation + * @groupname dataset Custom Dataset Creation + * @groupname Ungrouped Support functions for language integrated queries + * @since 1.0.0 + */ +@Stable +abstract class SQLContext private[sql] (val sparkSession: SparkSession) + extends Logging + with Serializable { + + // Note: Since Spark 2.0 this class has become a wrapper of SparkSession, where the + // real functionality resides. This class remains mainly for backward compatibility. + + def sparkContext: SparkContext = sparkSession.sparkContext + + /** + * Returns a [[SQLContext]] as new session, with separated SQL configurations, temporary tables, + * registered functions, but sharing the same `SparkContext`, cached data and other things. + * + * @since 1.6.0 + */ + def newSession(): SQLContext + + /** + * An interface to register custom QueryExecutionListener that listen for execution metrics. + */ + def listenerManager: ExecutionListenerManager + + /** + * Set Spark SQL configuration properties. + * + * @group config + * @since 1.0.0 + */ + def setConf(props: Properties): Unit + + /** + * Set the given Spark SQL configuration property. + * + * @group config + * @since 1.0.0 + */ + def setConf(key: String, value: String): Unit = { + sparkSession.conf.set(key, value) + } + + /** + * Return the value of Spark SQL configuration property for the given key. + * + * @group config + * @since 1.0.0 + */ + def getConf(key: String): String = { + sparkSession.conf.get(key) + } + + /** + * Return the value of Spark SQL configuration property for the given key. If the key is not set + * yet, return `defaultValue`. 
+ * + * @group config + * @since 1.0.0 + */ + def getConf(key: String, defaultValue: String): String = { + sparkSession.conf.get(key, defaultValue) + } + + /** + * Return all the configuration properties that have been set (i.e. not the default). This + * creates a new copy of the config properties in the form of a Map. + * + * @group config + * @since 1.0.0 + */ + def getAllConfs: immutable.Map[String, String] = { + sparkSession.conf.getAll + } + + /** + * :: Experimental :: A collection of methods that are considered experimental, but can be used + * to hook into the query planner for advanced functionality. + * + * @group basic + * @since 1.3.0 + */ + @Experimental + @transient + @Unstable + def experimental: ExperimentalMethods + + /** + * Returns a `DataFrame` with no rows or columns. + * + * @group basic + * @since 1.3.0 + */ + def emptyDataFrame: Dataset[Row] = sparkSession.emptyDataFrame + + /** + * A collection of methods for registering user-defined functions (UDF). + * + * The following example registers a Scala closure as UDF: + * {{{ + * sqlContext.udf.register("myUDF", (arg1: Int, arg2: String) => arg2 + arg1) + * }}} + * + * The following example registers a UDF in Java: + * {{{ + * sqlContext.udf().register("myUDF", + * (Integer arg1, String arg2) -> arg2 + arg1, + * DataTypes.StringType); + * }}} + * + * @note + * The user-defined functions must be deterministic. Due to optimization, duplicate + * invocations may be eliminated or the function may even be invoked more times than it is + * present in the query. + * + * @group basic + * @since 1.3.0 + */ + def udf: UDFRegistration + + /** + * (Scala-specific) Implicit methods available in Scala for converting common Scala objects into + * `DataFrame`s. + * + * {{{ + * val sqlContext = new SQLContext(sc) + * import sqlContext.implicits._ + * }}} + * + * @group basic + * @since 1.3.0 + */ + val implicits: SQLImplicits + + /** + * Returns true if the table is currently cached in-memory. 
+ * @group cachemgmt + * @since 1.3.0 + */ + def isCached(tableName: String): Boolean = { + sparkSession.catalog.isCached(tableName) + } + + /** + * Caches the specified table in-memory. + * @group cachemgmt + * @since 1.3.0 + */ + def cacheTable(tableName: String): Unit = { + sparkSession.catalog.cacheTable(tableName) + } + + /** + * Removes the specified table from the in-memory cache. + * @group cachemgmt + * @since 1.3.0 + */ + def uncacheTable(tableName: String): Unit = { + sparkSession.catalog.uncacheTable(tableName) + } + + /** + * Removes all cached tables from the in-memory cache. + * @since 1.3.0 + */ + def clearCache(): Unit = { + sparkSession.catalog.clearCache() + } + + /** + * Creates a DataFrame from an RDD of Product (e.g. case classes, tuples). + * + * @group dataframes + * @since 1.3.0 + */ + def createDataFrame[A <: Product: TypeTag](rdd: RDD[A]): Dataset[Row] = { + sparkSession.createDataFrame(rdd) + } + + /** + * Creates a DataFrame from a local Seq of Product. + * + * @group dataframes + * @since 1.3.0 + */ + def createDataFrame[A <: Product: TypeTag](data: Seq[A]): Dataset[Row] = { + sparkSession.createDataFrame(data) + } + + /** + * Convert a `BaseRelation` created for external data sources into a `DataFrame`. + * + * @group dataframes + * @since 1.3.0 + */ + def baseRelationToDataFrame(baseRelation: BaseRelation): Dataset[Row] = { + sparkSession.baseRelationToDataFrame(baseRelation) + } + + /** + * :: DeveloperApi :: Creates a `DataFrame` from an `RDD` containing + * [[org.apache.spark.sql.Row Row]]s using the given schema. It is important to make sure that + * the structure of every [[org.apache.spark.sql.Row Row]] of the provided RDD matches the + * provided schema. Otherwise, there will be runtime exception. 
Example: + * {{{ + * import org.apache.spark.sql._ + * import org.apache.spark.sql.types._ + * val sqlContext = new org.apache.spark.sql.SQLContext(sc) + * + * val schema = + * StructType( + * StructField("name", StringType, false) :: + * StructField("age", IntegerType, true) :: Nil) + * + * val people = + * sc.textFile("examples/src/main/resources/people.txt").map( + * _.split(",")).map(p => Row(p(0), p(1).trim.toInt)) + * val dataFrame = sqlContext.createDataFrame(people, schema) + * dataFrame.printSchema + * // root + * // |-- name: string (nullable = false) + * // |-- age: integer (nullable = true) + * + * dataFrame.createOrReplaceTempView("people") + * sqlContext.sql("select name from people").collect.foreach(println) + * }}} + * + * @group dataframes + * @since 1.3.0 + */ + @DeveloperApi + def createDataFrame(rowRDD: RDD[Row], schema: StructType): Dataset[Row] = { + sparkSession.createDataFrame(rowRDD, schema) + } + + /** + * Creates a [[Dataset]] from a local Seq of data of a given type. This method requires an + * encoder (to convert a JVM object of type `T` to and from the internal Spark SQL + * representation) that is generally created automatically through implicits from a + * `SparkSession`, or can be created explicitly by calling static methods on + * [[org.apache.spark.sql.Encoders Encoders]]. + * + * ==Example== + * + * {{{ + * + * import spark.implicits._ + * case class Person(name: String, age: Long) + * val data = Seq(Person("Michael", 29), Person("Andy", 30), Person("Justin", 19)) + * val ds = spark.createDataset(data) + * + * ds.show() + * // +-------+---+ + * // | name|age| + * // +-------+---+ + * // |Michael| 29| + * // | Andy| 30| + * // | Justin| 19| + * // +-------+---+ + * }}} + * + * @since 2.0.0 + * @group dataset + */ + def createDataset[T: Encoder](data: Seq[T]): Dataset[T] = { + sparkSession.createDataset(data) + } + + /** + * Creates a [[Dataset]] from an RDD of a given type. 
This method requires an encoder (to + * convert a JVM object of type `T` to and from the internal Spark SQL representation) that is + * generally created automatically through implicits from a `SparkSession`, or can be created + * explicitly by calling static methods on [[org.apache.spark.sql.Encoders Encoders]]. + * + * @since 2.0.0 + * @group dataset + */ + def createDataset[T: Encoder](data: RDD[T]): Dataset[T] = { + sparkSession.createDataset(data) + } + + /** + * Creates a [[Dataset]] from a `JList` of a given type. This method requires an encoder (to + * convert a JVM object of type `T` to and from the internal Spark SQL representation) that is + * generally created automatically through implicits from a `SparkSession`, or can be created + * explicitly by calling static methods on [[org.apache.spark.sql.Encoders Encoders]]. + * + * ==Java Example== + * + * {{{ + * List data = Arrays.asList("hello", "world"); + * Dataset ds = spark.createDataset(data, Encoders.STRING()); + * }}} + * + * @since 2.0.0 + * @group dataset + */ + def createDataset[T: Encoder](data: JList[T]): Dataset[T] = { + sparkSession.createDataset(data) + } + + /** + * :: DeveloperApi :: Creates a `DataFrame` from a `JavaRDD` containing + * [[org.apache.spark.sql.Row Row]]s using the given schema. It is important to make sure that + * the structure of every [[org.apache.spark.sql.Row Row]] of the provided RDD matches the + * provided schema. Otherwise, there will be runtime exception. + * + * @group dataframes + * @since 1.3.0 + */ + @DeveloperApi + def createDataFrame(rowRDD: JavaRDD[Row], schema: StructType): Dataset[Row] = { + sparkSession.createDataFrame(rowRDD, schema) + } + + /** + * :: DeveloperApi :: Creates a `DataFrame` from a `JList` containing + * [[org.apache.spark.sql.Row Row]]s using the given schema. It is important to make sure that + * the structure of every [[org.apache.spark.sql.Row Row]] of the provided List matches the + * provided schema. 
Otherwise, there will be runtime exception. + * + * @group dataframes + * @since 1.6.0 + */ + @DeveloperApi + def createDataFrame(rows: JList[Row], schema: StructType): Dataset[Row] = { + sparkSession.createDataFrame(rows, schema) + } + + /** + * Applies a schema to an RDD of Java Beans. + * + * WARNING: Since there is no guaranteed ordering for fields in a Java Bean, SELECT * queries + * will return the columns in an undefined order. + * @group dataframes + * @since 1.3.0 + */ + def createDataFrame(rdd: RDD[_], beanClass: Class[_]): Dataset[Row] = { + sparkSession.createDataFrame(rdd, beanClass) + } + + /** + * Applies a schema to an RDD of Java Beans. + * + * WARNING: Since there is no guaranteed ordering for fields in a Java Bean, SELECT * queries + * will return the columns in an undefined order. + * @group dataframes + * @since 1.3.0 + */ + def createDataFrame(rdd: JavaRDD[_], beanClass: Class[_]): Dataset[Row] = { + sparkSession.createDataFrame(rdd, beanClass) + } + + /** + * Applies a schema to a List of Java Beans. + * + * WARNING: Since there is no guaranteed ordering for fields in a Java Bean, SELECT * queries + * will return the columns in an undefined order. + * @group dataframes + * @since 1.6.0 + */ + def createDataFrame(data: JList[_], beanClass: Class[_]): Dataset[Row] = { + sparkSession.createDataFrame(data, beanClass) + } + + /** + * Returns a [[DataFrameReader]] that can be used to read non-streaming data in as a + * `DataFrame`. + * {{{ + * sqlContext.read.parquet("/path/to/file.parquet") + * sqlContext.read.schema(schema).json("/path/to/file.json") + * }}} + * + * @group genericdata + * @since 1.4.0 + */ + def read: DataFrameReader + + /** + * Returns a `DataStreamReader` that can be used to read streaming data in as a `DataFrame`. 
+ * {{{ + * sparkSession.readStream.parquet("/path/to/directory/of/parquet/files") + * sparkSession.readStream.schema(schema).json("/path/to/directory/of/json/files") + * }}} + * + * @since 2.0.0 + */ + def readStream: DataStreamReader + + /** + * Creates an external table from the given path and returns the corresponding DataFrame. It + * will use the default data source configured by spark.sql.sources.default. + * + * @group ddl_ops + * @since 1.3.0 + */ + @deprecated("use sparkSession.catalog.createTable instead.", "2.2.0") + def createExternalTable(tableName: String, path: String): Dataset[Row] = { + sparkSession.catalog.createTable(tableName, path) + } + + /** + * Creates an external table from the given path based on a data source and returns the + * corresponding DataFrame. + * + * @group ddl_ops + * @since 1.3.0 + */ + @deprecated("use sparkSession.catalog.createTable instead.", "2.2.0") + def createExternalTable(tableName: String, path: String, source: String): Dataset[Row] = { + sparkSession.catalog.createTable(tableName, path, source) + } + + /** + * Creates an external table from the given path based on a data source and a set of options. + * Then, returns the corresponding DataFrame. + * + * @group ddl_ops + * @since 1.3.0 + */ + @deprecated("use sparkSession.catalog.createTable instead.", "2.2.0") + def createExternalTable( + tableName: String, + source: String, + options: JMap[String, String]): Dataset[Row] = { + sparkSession.catalog.createTable(tableName, source, options) + } + + /** + * (Scala-specific) Creates an external table from the given path based on a data source and a + * set of options. Then, returns the corresponding DataFrame. 
+ * + * @group ddl_ops + * @since 1.3.0 + */ + @deprecated("use sparkSession.catalog.createTable instead.", "2.2.0") + def createExternalTable( + tableName: String, + source: String, + options: Map[String, String]): Dataset[Row] = { + sparkSession.catalog.createTable(tableName, source, options) + } + + /** + * Create an external table from the given path based on a data source, a schema and a set of + * options. Then, returns the corresponding DataFrame. + * + * @group ddl_ops + * @since 1.3.0 + */ + @deprecated("use sparkSession.catalog.createTable instead.", "2.2.0") + def createExternalTable( + tableName: String, + source: String, + schema: StructType, + options: JMap[String, String]): Dataset[Row] = { + sparkSession.catalog.createTable(tableName, source, schema, options) + } + + /** + * (Scala-specific) Create an external table from the given path based on a data source, a + * schema and a set of options. Then, returns the corresponding DataFrame. + * + * @group ddl_ops + * @since 1.3.0 + */ + @deprecated("use sparkSession.catalog.createTable instead.", "2.2.0") + def createExternalTable( + tableName: String, + source: String, + schema: StructType, + options: Map[String, String]): Dataset[Row] = { + sparkSession.catalog.createTable(tableName, source, schema, options) + } + + /** + * Drops the temporary table with the given table name in the catalog. If the table has been + * cached/persisted before, it's also unpersisted. + * + * @param tableName + * the name of the table to be unregistered. + * @group basic + * @since 1.3.0 + */ + def dropTempTable(tableName: String): Unit = { + sparkSession.catalog.dropTempView(tableName) + } + + /** + * Creates a `DataFrame` with a single `LongType` column named `id`, containing elements in a + * range from 0 to `end` (exclusive) with step value 1. 
+ * + * @since 1.4.1 + * @group dataframe + */ + def range(end: Long): Dataset[Row] = sparkSession.range(end).toDF() + + /** + * Creates a `DataFrame` with a single `LongType` column named `id`, containing elements in a + * range from `start` to `end` (exclusive) with step value 1. + * + * @since 1.4.0 + * @group dataframe + */ + def range(start: Long, end: Long): Dataset[Row] = sparkSession.range(start, end).toDF() + + /** + * Creates a `DataFrame` with a single `LongType` column named `id`, containing elements in a + * range from `start` to `end` (exclusive) with a step value. + * + * @since 2.0.0 + * @group dataframe + */ + def range(start: Long, end: Long, step: Long): Dataset[Row] = { + sparkSession.range(start, end, step).toDF() + } + + /** + * Creates a `DataFrame` with a single `LongType` column named `id`, containing elements in a + * range from `start` to `end` (exclusive) with a step value, with partition number specified. + * + * @since 1.4.0 + * @group dataframe + */ + def range(start: Long, end: Long, step: Long, numPartitions: Int): Dataset[Row] = { + sparkSession.range(start, end, step, numPartitions).toDF() + } + + /** + * Executes a SQL query using Spark, returning the result as a `DataFrame`. This API eagerly + * runs DDL/DML commands, but not for SELECT queries. + * + * @group basic + * @since 1.3.0 + */ + def sql(sqlText: String): Dataset[Row] = sparkSession.sql(sqlText) + + /** + * Returns the specified table as a `DataFrame`. + * + * @group ddl_ops + * @since 1.3.0 + */ + def table(tableName: String): Dataset[Row] = { + sparkSession.table(tableName) + } + + /** + * Returns a `DataFrame` containing names of existing tables in the current database. The + * returned DataFrame has three columns, database, tableName and isTemporary (a Boolean + * indicating if a table is a temporary one or not). 
+ * + * @group ddl_ops + * @since 1.3.0 + */ + def tables(): Dataset[Row] = { + mapTableDatasetOutput(sparkSession.catalog.listTables()) + } + + /** + * Returns a `DataFrame` containing names of existing tables in the given database. The returned + * DataFrame has three columns, database, tableName and isTemporary (a Boolean indicating if a + * table is a temporary one or not). + * + * @group ddl_ops + * @since 1.3.0 + */ + def tables(databaseName: String): Dataset[Row] = { + mapTableDatasetOutput(sparkSession.catalog.listTables(databaseName)) + } + + private def mapTableDatasetOutput(tables: Dataset[Table]): Dataset[Row] = { + tables + .select( + // Re-implement `org.apache.spark.sql.catalog.Table.database` method. + // Abusing `coalesce` to tell Spark all these columns are not nullable. + when( + coalesce(array_size(col("namespace")), lit(0)).equalTo(lit(1)), + coalesce(col("namespace")(0), lit(""))) + .otherwise(lit("")) + .as("namespace"), + coalesce(col("name"), lit("")).as("tableName"), + col("isTemporary")) + } + + /** + * Returns a `StreamingQueryManager` that allows managing all the + * [[org.apache.spark.sql.api.StreamingQuery StreamingQueries]] active on `this` context. + * + * @since 2.0.0 + */ + def streams: StreamingQueryManager + + /** + * Returns the names of tables in the current database as an array. + * + * @group ddl_ops + * @since 1.3.0 + */ + def tableNames(): Array[String] = { + tableNames(sparkSession.catalog.currentDatabase) + } + + /** + * Returns the names of tables in the given database as an array. 
+ * + * @group ddl_ops + * @since 1.3.0 + */ + def tableNames(databaseName: String): Array[String] = { + sparkSession.catalog + .listTables(databaseName) + .select(col("name")) + .as(Encoders.STRING) + .collect() + } + + //////////////////////////////////////////////////////////////////////////// + //////////////////////////////////////////////////////////////////////////// + // Deprecated methods + //////////////////////////////////////////////////////////////////////////// + //////////////////////////////////////////////////////////////////////////// + + /** + * @deprecated + * As of 1.3.0, replaced by `createDataFrame()`. + */ + @deprecated("Use createDataFrame instead.", "1.3.0") + def applySchema(rowRDD: RDD[Row], schema: StructType): Dataset[Row] = { + createDataFrame(rowRDD, schema) + } + + /** + * @deprecated + * As of 1.3.0, replaced by `createDataFrame()`. + */ + @deprecated("Use createDataFrame instead.", "1.3.0") + def applySchema(rowRDD: JavaRDD[Row], schema: StructType): Dataset[Row] = { + createDataFrame(rowRDD, schema) + } + + /** + * @deprecated + * As of 1.3.0, replaced by `createDataFrame()`. + */ + @deprecated("Use createDataFrame instead.", "1.3.0") + def applySchema(rdd: RDD[_], beanClass: Class[_]): Dataset[Row] = { + createDataFrame(rdd, beanClass) + } + + /** + * @deprecated + * As of 1.3.0, replaced by `createDataFrame()`. + */ + @deprecated("Use createDataFrame instead.", "1.3.0") + def applySchema(rdd: JavaRDD[_], beanClass: Class[_]): Dataset[Row] = { + createDataFrame(rdd, beanClass) + } + + /** + * Loads a Parquet file, returning the result as a `DataFrame`. This function returns an empty + * `DataFrame` if no paths are passed in. + * + * @group specificdata + * @deprecated + * As of 1.4.0, replaced by `read().parquet()`. 
+ */ + @deprecated("Use read.parquet() instead.", "1.4.0") + @scala.annotation.varargs + def parquetFile(paths: String*): Dataset[Row] = { + if (paths.isEmpty) { + emptyDataFrame + } else { + read.parquet(paths: _*) + } + } + + /** + * Loads a JSON file (one object per line), returning the result as a `DataFrame`. It goes + * through the entire dataset once to determine the schema. + * + * @group specificdata + * @deprecated + * As of 1.4.0, replaced by `read().json()`. + */ + @deprecated("Use read.json() instead.", "1.4.0") + def jsonFile(path: String): Dataset[Row] = { + read.json(path) + } + + /** + * Loads a JSON file (one object per line) and applies the given schema, returning the result as + * a `DataFrame`. + * + * @group specificdata + * @deprecated + * As of 1.4.0, replaced by `read().json()`. + */ + @deprecated("Use read.json() instead.", "1.4.0") + def jsonFile(path: String, schema: StructType): Dataset[Row] = { + read.schema(schema).json(path) + } + + /** + * @group specificdata + * @deprecated + * As of 1.4.0, replaced by `read().json()`. + */ + @deprecated("Use read.json() instead.", "1.4.0") + def jsonFile(path: String, samplingRatio: Double): Dataset[Row] = { + read.option("samplingRatio", samplingRatio.toString).json(path) + } + + /** + * Loads an RDD[String] storing JSON objects (one object per record), returning the result as a + * `DataFrame`. It goes through the entire dataset once to determine the schema. + * + * @group specificdata + * @deprecated + * As of 1.4.0, replaced by `read().json()`. + */ + @deprecated("Use read.json() instead.", "1.4.0") + def jsonRDD(json: RDD[String]): Dataset[Row] = read.json(json) + + /** + * Loads an RDD[String] storing JSON objects (one object per record), returning the result as a + * `DataFrame`. It goes through the entire dataset once to determine the schema. + * + * @group specificdata + * @deprecated + * As of 1.4.0, replaced by `read().json()`. 
+ */ + @deprecated("Use read.json() instead.", "1.4.0") + def jsonRDD(json: JavaRDD[String]): Dataset[Row] = read.json(json) + + /** + * Loads an RDD[String] storing JSON objects (one object per record) and applies the given + * schema, returning the result as a `DataFrame`. + * + * @group specificdata + * @deprecated + * As of 1.4.0, replaced by `read().json()`. + */ + @deprecated("Use read.json() instead.", "1.4.0") + def jsonRDD(json: RDD[String], schema: StructType): Dataset[Row] = { + read.schema(schema).json(json) + } + + /** + * Loads an JavaRDD[String] storing JSON objects (one object per record) and applies the given + * schema, returning the result as a `DataFrame`. + * + * @group specificdata + * @deprecated + * As of 1.4.0, replaced by `read().json()`. + */ + @deprecated("Use read.json() instead.", "1.4.0") + def jsonRDD(json: JavaRDD[String], schema: StructType): Dataset[Row] = { + read.schema(schema).json(json) + } + + /** + * Loads an RDD[String] storing JSON objects (one object per record) inferring the schema, + * returning the result as a `DataFrame`. + * + * @group specificdata + * @deprecated + * As of 1.4.0, replaced by `read().json()`. + */ + @deprecated("Use read.json() instead.", "1.4.0") + def jsonRDD(json: RDD[String], samplingRatio: Double): Dataset[Row] = { + read.option("samplingRatio", samplingRatio.toString).json(json) + } + + /** + * Loads a JavaRDD[String] storing JSON objects (one object per record) inferring the schema, + * returning the result as a `DataFrame`. + * + * @group specificdata + * @deprecated + * As of 1.4.0, replaced by `read().json()`. + */ + @deprecated("Use read.json() instead.", "1.4.0") + def jsonRDD(json: JavaRDD[String], samplingRatio: Double): Dataset[Row] = { + read.option("samplingRatio", samplingRatio.toString).json(json) + } + + /** + * Returns the dataset stored at path as a DataFrame, using the default data source configured + * by spark.sql.sources.default. 
+ * + * @group genericdata + * @deprecated + * As of 1.4.0, replaced by `read().load(path)`. + */ + @deprecated("Use read.load(path) instead.", "1.4.0") + def load(path: String): Dataset[Row] = { + read.load(path) + } + + /** + * Returns the dataset stored at path as a DataFrame, using the given data source. + * + * @group genericdata + * @deprecated + * As of 1.4.0, replaced by `read().format(source).load(path)`. + */ + @deprecated("Use read.format(source).load(path) instead.", "1.4.0") + def load(path: String, source: String): Dataset[Row] = { + read.format(source).load(path) + } + + /** + * (Java-specific) Returns the dataset specified by the given data source and a set of options + * as a DataFrame. + * + * @group genericdata + * @deprecated + * As of 1.4.0, replaced by `read().format(source).options(options).load()`. + */ + @deprecated("Use read.format(source).options(options).load() instead.", "1.4.0") + def load(source: String, options: JMap[String, String]): Dataset[Row] = { + read.options(options).format(source).load() + } + + /** + * (Scala-specific) Returns the dataset specified by the given data source and a set of options + * as a DataFrame. + * + * @group genericdata + * @deprecated + * As of 1.4.0, replaced by `read().format(source).options(options).load()`. + */ + @deprecated("Use read.format(source).options(options).load() instead.", "1.4.0") + def load(source: String, options: Map[String, String]): Dataset[Row] = { + read.options(options).format(source).load() + } + + /** + * (Java-specific) Returns the dataset specified by the given data source and a set of options + * as a DataFrame, using the given schema as the schema of the DataFrame. + * + * @group genericdata + * @deprecated + * As of 1.4.0, replaced by `read().format(source).schema(schema).options(options).load()`. 
+ */ + @deprecated("Use read.format(source).schema(schema).options(options).load() instead.", "1.4.0") + def load(source: String, schema: StructType, options: JMap[String, String]): Dataset[Row] = { + read.format(source).schema(schema).options(options).load() + } + + /** + * (Scala-specific) Returns the dataset specified by the given data source and a set of options + * as a DataFrame, using the given schema as the schema of the DataFrame. + * + * @group genericdata + * @deprecated + * As of 1.4.0, replaced by `read().format(source).schema(schema).options(options).load()`. + */ + @deprecated("Use read.format(source).schema(schema).options(options).load() instead.", "1.4.0") + def load(source: String, schema: StructType, options: Map[String, String]): Dataset[Row] = { + read.format(source).schema(schema).options(options).load() + } + + /** + * Construct a `DataFrame` representing the database table accessible via JDBC URL url named + * table. + * + * @group specificdata + * @deprecated + * As of 1.4.0, replaced by `read().jdbc()`. + */ + @deprecated("Use read.jdbc() instead.", "1.4.0") + def jdbc(url: String, table: String): Dataset[Row] = { + read.jdbc(url, table, new Properties) + } + + /** + * Construct a `DataFrame` representing the database table accessible via JDBC URL url named + * table. Partitions of the table will be retrieved in parallel based on the parameters passed + * to this function. + * + * @param columnName + * the name of a column of integral type that will be used for partitioning. + * @param lowerBound + * the minimum value of `columnName` used to decide partition stride + * @param upperBound + * the maximum value of `columnName` used to decide partition stride + * @param numPartitions + * the number of partitions. the range `minValue`-`maxValue` will be split evenly into this + * many partitions + * @group specificdata + * @deprecated + * As of 1.4.0, replaced by `read().jdbc()`. 
+ */ + @deprecated("Use read.jdbc() instead.", "1.4.0") + def jdbc( + url: String, + table: String, + columnName: String, + lowerBound: Long, + upperBound: Long, + numPartitions: Int): Dataset[Row] = { + read.jdbc(url, table, columnName, lowerBound, upperBound, numPartitions, new Properties) + } + + /** + * Construct a `DataFrame` representing the database table accessible via JDBC URL url named + * table. The theParts parameter gives a list expressions suitable for inclusion in WHERE + * clauses; each one defines one partition of the `DataFrame`. + * + * @group specificdata + * @deprecated + * As of 1.4.0, replaced by `read().jdbc()`. + */ + @deprecated("Use read.jdbc() instead.", "1.4.0") + def jdbc(url: String, table: String, theParts: Array[String]): Dataset[Row] = { + read.jdbc(url, table, theParts, new Properties) + } +} + +/** + * This SQLContext object contains utility functions to create a singleton SQLContext instance, or + * to get the created SQLContext instance. + * + * It also provides utility functions to support preference for threads in multiple sessions + * scenario, setActive could set a SQLContext for current thread, which will be returned by + * getOrCreate instead of the global one. + */ +trait SQLContextCompanion { + private[sql] type SQLContextImpl <: SQLContext + private[sql] type SparkContextImpl <: SparkContext + + /** + * Get the singleton SQLContext if it exists or create a new one using the given SparkContext. + * + * This function can be used to create a singleton SQLContext object that can be shared across + * the JVM. + * + * If there is an active SQLContext for current thread, it will be returned instead of the + * global one. + * + * @since 1.5.0 + */ + @deprecated("Use SparkSession.builder instead", "2.0.0") + def getOrCreate(sparkContext: SparkContextImpl): SQLContextImpl + + /** + * Changes the SQLContext that will be returned in this thread and its children when + * SQLContext.getOrCreate() is called. 
This can be used to ensure that a given thread receives a + * SQLContext with an isolated session, instead of the global (first created) context. + * + * @since 1.6.0 + */ + @deprecated("Use SparkSession.setActiveSession instead", "2.0.0") + def setActive(sqlContext: SQLContextImpl): Unit = { + SparkSession.setActiveSession(sqlContext.sparkSession) + } + + /** + * Clears the active SQLContext for current thread. Subsequent calls to getOrCreate will return + * the first created context instead of a thread-local override. + * + * @since 1.6.0 + */ + @deprecated("Use SparkSession.clearActiveSession instead", "2.0.0") + def clearActive(): Unit = { + SparkSession.clearActiveSession() + } +} diff --git a/sql/api/src/main/scala/org/apache/spark/sql/api/SQLImplicits.scala b/sql/api/src/main/scala/org/apache/spark/sql/api/SQLImplicits.scala index 5e022570d3ca7..200e913b5412e 100644 --- a/sql/api/src/main/scala/org/apache/spark/sql/api/SQLImplicits.scala +++ b/sql/api/src/main/scala/org/apache/spark/sql/api/SQLImplicits.scala @@ -35,7 +35,7 @@ import org.apache.spark.sql.catalyst.encoders.AgnosticEncoders.{ArrayEncoder, DE * * @since 1.6.0 */ -abstract class SQLImplicits extends LowPrioritySQLImplicits with Serializable { +abstract class SQLImplicits extends EncoderImplicits with Serializable { type DS[U] <: Dataset[U] protected def session: SparkSession @@ -51,8 +51,35 @@ abstract class SQLImplicits extends LowPrioritySQLImplicits with Serializable { } } - // Primitives + /** + * Creates a [[Dataset]] from a local Seq. + * @since 1.6.0 + */ + implicit def localSeqToDatasetHolder[T: Encoder](s: Seq[T]): DatasetHolder[T, DS] = { + new DatasetHolder(session.createDataset(s).asInstanceOf[DS[T]]) + } + + /** + * Creates a [[Dataset]] from an RDD. 
+ * + * @since 1.6.0 + */ + implicit def rddToDatasetHolder[T: Encoder](rdd: RDD[T]): DatasetHolder[T, DS] = + new DatasetHolder(session.createDataset(rdd).asInstanceOf[DS[T]]) + + /** + * An implicit conversion that turns a Scala `Symbol` into a [[org.apache.spark.sql.Column]]. + * @since 1.3.0 + */ + implicit def symbolToColumn(s: Symbol): ColumnName = new ColumnName(s.name) +} +/** + * EncoderImplicits used to implicitly generate SQL Encoders. Note that these functions don't rely + * on or expose `SparkSession`. + */ +trait EncoderImplicits extends LowPrioritySQLImplicits with Serializable { + // Primitives /** @since 1.6.0 */ implicit def newIntEncoder: Encoder[Int] = Encoders.scalaInt @@ -270,28 +297,6 @@ abstract class SQLImplicits extends LowPrioritySQLImplicits with Serializable { /** @since 1.6.1 */ implicit def newProductArrayEncoder[A <: Product: TypeTag]: Encoder[Array[A]] = newArrayEncoder(ScalaReflection.encoderFor[A]) - - /** - * Creates a [[Dataset]] from a local Seq. - * @since 1.6.0 - */ - implicit def localSeqToDatasetHolder[T: Encoder](s: Seq[T]): DatasetHolder[T, DS] = { - new DatasetHolder(session.createDataset(s).asInstanceOf[DS[T]]) - } - - /** - * Creates a [[Dataset]] from an RDD. - * - * @since 1.6.0 - */ - implicit def rddToDatasetHolder[T: Encoder](rdd: RDD[T]): DatasetHolder[T, DS] = - new DatasetHolder(session.createDataset(rdd).asInstanceOf[DS[T]]) - - /** - * An implicit conversion that turns a Scala `Symbol` into a [[org.apache.spark.sql.Column]]. 
- * @since 1.3.0 - */ - implicit def symbolToColumn(s: Symbol): ColumnName = new ColumnName(s.name) } /** diff --git a/sql/api/src/main/scala/org/apache/spark/sql/api/SparkSession.scala b/sql/api/src/main/scala/org/apache/spark/sql/api/SparkSession.scala index 64b0a87c573d3..af2144cb9eb41 100644 --- a/sql/api/src/main/scala/org/apache/spark/sql/api/SparkSession.scala +++ b/sql/api/src/main/scala/org/apache/spark/sql/api/SparkSession.scala @@ -30,7 +30,7 @@ import org.apache.spark.{SparkConf, SparkContext, SparkException} import org.apache.spark.annotation.{DeveloperApi, Experimental, Stable, Unstable} import org.apache.spark.api.java.JavaRDD import org.apache.spark.rdd.RDD -import org.apache.spark.sql.{Encoder, ExperimentalMethods, Row, RuntimeConfig, SparkSessionExtensions, SQLContext} +import org.apache.spark.sql.{Encoder, ExperimentalMethods, Row, RuntimeConfig, SparkSessionExtensions} import org.apache.spark.sql.internal.{SessionState, SharedState} import org.apache.spark.sql.sources.BaseRelation import org.apache.spark.sql.types.StructType @@ -470,7 +470,6 @@ abstract class SparkSession extends Serializable with Closeable { * is. * @since 3.5.0 */ - @Experimental def sql(sqlText: String, args: Array[_]): Dataset[Row] /** @@ -488,7 +487,6 @@ abstract class SparkSession extends Serializable with Closeable { * `array()`, `struct()`, in that case it is taken as is. * @since 3.4.0 */ - @Experimental def sql(sqlText: String, args: Map[String, Any]): Dataset[Row] /** @@ -506,7 +504,6 @@ abstract class SparkSession extends Serializable with Closeable { * `array()`, `struct()`, in that case it is taken as is. 
* @since 3.4.0 */ - @Experimental def sql(sqlText: String, args: util.Map[String, Any]): Dataset[Row] = { sql(sqlText, args.asScala.toMap) } diff --git a/sql/api/src/main/scala/org/apache/spark/sql/avro/functions.scala b/sql/api/src/main/scala/org/apache/spark/sql/avro/functions.scala index fffad557aca5e..e30a9e7c2ba01 100644 --- a/sql/api/src/main/scala/org/apache/spark/sql/avro/functions.scala +++ b/sql/api/src/main/scala/org/apache/spark/sql/avro/functions.scala @@ -94,4 +94,32 @@ object functions { def to_avro(data: Column, jsonFormatSchema: String): Column = { Column.fn("to_avro", data, lit(jsonFormatSchema)) } + + /** + * Returns schema in the DDL format of the avro schema in JSON string format. + * + * @param jsonFormatSchema + * the avro schema in JSON string format. + * + * @since 4.0.0 + */ + @Experimental + def schema_of_avro(jsonFormatSchema: String): Column = { + Column.fn("schema_of_avro", lit(jsonFormatSchema)) + } + + /** + * Returns schema in the DDL format of the avro schema in JSON string format. + * + * @param jsonFormatSchema + * the avro schema in JSON string format. + * @param options + * options to control how the Avro record is parsed. 
+ * + * @since 4.0.0 + */ + @Experimental + def schema_of_avro(jsonFormatSchema: String, options: java.util.Map[String, String]): Column = { + Column.fnWithOptions("schema_of_avro", options.asScala.iterator, lit(jsonFormatSchema)) + } } diff --git a/sql/api/src/main/scala/org/apache/spark/sql/catalyst/encoders/AgnosticEncoder.scala b/sql/api/src/main/scala/org/apache/spark/sql/catalyst/encoders/AgnosticEncoder.scala index 9ae7de97abf58..d998502ac1b25 100644 --- a/sql/api/src/main/scala/org/apache/spark/sql/catalyst/encoders/AgnosticEncoder.scala +++ b/sql/api/src/main/scala/org/apache/spark/sql/catalyst/encoders/AgnosticEncoder.scala @@ -231,6 +231,8 @@ object AgnosticEncoders { // Nullable leaf encoders case object NullEncoder extends LeafEncoder[java.lang.Void](NullType) case object StringEncoder extends LeafEncoder[String](StringType) + case class CharEncoder(length: Int) extends LeafEncoder[String](CharType(length)) + case class VarcharEncoder(length: Int) extends LeafEncoder[String](VarcharType(length)) case object BinaryEncoder extends LeafEncoder[Array[Byte]](BinaryType) case object ScalaBigIntEncoder extends LeafEncoder[BigInt](DecimalType.BigIntDecimal) case object JavaBigIntEncoder extends LeafEncoder[JBigInt](DecimalType.BigIntDecimal) diff --git a/sql/api/src/main/scala/org/apache/spark/sql/catalyst/encoders/RowEncoder.scala b/sql/api/src/main/scala/org/apache/spark/sql/catalyst/encoders/RowEncoder.scala index 8b6da805a6e87..7260ff8f9fefd 100644 --- a/sql/api/src/main/scala/org/apache/spark/sql/catalyst/encoders/RowEncoder.scala +++ b/sql/api/src/main/scala/org/apache/spark/sql/catalyst/encoders/RowEncoder.scala @@ -21,7 +21,7 @@ import scala.collection.mutable import scala.reflect.classTag import org.apache.spark.sql.{AnalysisException, Row} -import org.apache.spark.sql.catalyst.encoders.AgnosticEncoders.{BinaryEncoder, BoxedBooleanEncoder, BoxedByteEncoder, BoxedDoubleEncoder, BoxedFloatEncoder, BoxedIntEncoder, BoxedLongEncoder, BoxedShortEncoder, 
CalendarIntervalEncoder, DateEncoder, DayTimeIntervalEncoder, EncoderField, InstantEncoder, IterableEncoder, JavaDecimalEncoder, LocalDateEncoder, LocalDateTimeEncoder, MapEncoder, NullEncoder, RowEncoder => AgnosticRowEncoder, StringEncoder, TimestampEncoder, UDTEncoder, VariantEncoder, YearMonthIntervalEncoder} +import org.apache.spark.sql.catalyst.encoders.AgnosticEncoders.{BinaryEncoder, BoxedBooleanEncoder, BoxedByteEncoder, BoxedDoubleEncoder, BoxedFloatEncoder, BoxedIntEncoder, BoxedLongEncoder, BoxedShortEncoder, CalendarIntervalEncoder, CharEncoder, DateEncoder, DayTimeIntervalEncoder, EncoderField, InstantEncoder, IterableEncoder, JavaDecimalEncoder, LocalDateEncoder, LocalDateTimeEncoder, MapEncoder, NullEncoder, RowEncoder => AgnosticRowEncoder, StringEncoder, TimestampEncoder, UDTEncoder, VarcharEncoder, VariantEncoder, YearMonthIntervalEncoder} import org.apache.spark.sql.errors.{DataTypeErrorsBase, ExecutionErrors} import org.apache.spark.sql.internal.SqlApiConf import org.apache.spark.sql.types._ @@ -80,7 +80,11 @@ object RowEncoder extends DataTypeErrorsBase { case DoubleType => BoxedDoubleEncoder case dt: DecimalType => JavaDecimalEncoder(dt, lenientSerialization = true) case BinaryType => BinaryEncoder - case _: StringType => StringEncoder + case CharType(length) if SqlApiConf.get.preserveCharVarcharTypeInfo => + CharEncoder(length) + case VarcharType(length) if SqlApiConf.get.preserveCharVarcharTypeInfo => + VarcharEncoder(length) + case s: StringType if StringHelper.isPlainString(s) => StringEncoder case TimestampType if SqlApiConf.get.datetimeJava8ApiEnabled => InstantEncoder(lenient) case TimestampType => TimestampEncoder(lenient) case TimestampNTZType => LocalDateTimeEncoder diff --git a/sql/api/src/main/scala/org/apache/spark/sql/catalyst/parser/DataTypeAstBuilder.scala b/sql/api/src/main/scala/org/apache/spark/sql/catalyst/parser/DataTypeAstBuilder.scala index 71e8517a4164e..94e014fb77f1b 100644 --- 
a/sql/api/src/main/scala/org/apache/spark/sql/catalyst/parser/DataTypeAstBuilder.scala +++ b/sql/api/src/main/scala/org/apache/spark/sql/catalyst/parser/DataTypeAstBuilder.scala @@ -57,6 +57,14 @@ class DataTypeAstBuilder extends SqlBaseParserBaseVisitor[AnyRef] { } } + /** + * Create a multi-part identifier. + */ + override def visitMultipartIdentifier(ctx: MultipartIdentifierContext): Seq[String] = + withOrigin(ctx) { + ctx.parts.asScala.map(_.getText).toSeq + } + /** * Resolve/create a primitive type. */ @@ -76,10 +84,11 @@ class DataTypeAstBuilder extends SqlBaseParserBaseVisitor[AnyRef] { case (TIMESTAMP_LTZ, Nil) => TimestampType case (STRING, Nil) => typeCtx.children.asScala.toSeq match { - case Seq(_) => SqlApiConf.get.defaultStringType + case Seq(_) => StringType case Seq(_, ctx: CollateClauseContext) => - val collationName = visitCollateClause(ctx) - val collationId = CollationFactory.collationNameToId(collationName) + val collationNameParts = visitCollateClause(ctx).toArray + val collationId = CollationFactory.collationNameToId( + CollationFactory.resolveFullyQualifiedName(collationNameParts)) StringType(collationId) } case (CHARACTER | CHAR, length :: Nil) => CharType(length.getText.toInt) @@ -219,8 +228,8 @@ class DataTypeAstBuilder extends SqlBaseParserBaseVisitor[AnyRef] { /** * Returns a collation name. 
*/ - override def visitCollateClause(ctx: CollateClauseContext): String = withOrigin(ctx) { - ctx.identifier.getText + override def visitCollateClause(ctx: CollateClauseContext): Seq[String] = withOrigin(ctx) { + visitMultipartIdentifier(ctx.collationName) } /** diff --git a/sql/api/src/main/scala/org/apache/spark/sql/catalyst/util/RebaseDateTime.scala b/sql/api/src/main/scala/org/apache/spark/sql/catalyst/util/RebaseDateTime.scala index 9c043320dc812..8dff1ceccfcfe 100644 --- a/sql/api/src/main/scala/org/apache/spark/sql/catalyst/util/RebaseDateTime.scala +++ b/sql/api/src/main/scala/org/apache/spark/sql/catalyst/util/RebaseDateTime.scala @@ -22,7 +22,7 @@ import java.time.temporal.ChronoField import java.util.{Calendar, TimeZone} import java.util.Calendar.{DAY_OF_MONTH, DST_OFFSET, ERA, HOUR_OF_DAY, MINUTE, MONTH, SECOND, YEAR, ZONE_OFFSET} -import scala.collection.mutable.AnyRefMap +import scala.collection.mutable.HashMap import com.fasterxml.jackson.databind.ObjectMapper import com.fasterxml.jackson.module.scala.{ClassTagExtensions, DefaultScalaModule} @@ -285,12 +285,12 @@ object RebaseDateTime { } // Loads rebasing info from an JSON file. JSON records in the files should conform to - // `JsonRebaseRecord`. AnyRefMap is used here instead of Scala's immutable map because - // it is 2 times faster in DateTimeRebaseBenchmark. - private[sql] def loadRebaseRecords(fileName: String): AnyRefMap[String, RebaseInfo] = { + // `JsonRebaseRecord`. Mutable HashMap is used here instead of AnyRefMap due to SPARK-49491. 
+ private[sql] def loadRebaseRecords(fileName: String): HashMap[String, RebaseInfo] = { val file = SparkClassUtils.getSparkClassLoader.getResource(fileName) val jsonRebaseRecords = mapper.readValue[Seq[JsonRebaseRecord]](file) - val anyRefMap = new AnyRefMap[String, RebaseInfo]((3 * jsonRebaseRecords.size) / 2) + val hashMap = new HashMap[String, RebaseInfo] + hashMap.sizeHint(jsonRebaseRecords.size) jsonRebaseRecords.foreach { jsonRecord => val rebaseInfo = RebaseInfo(jsonRecord.switches, jsonRecord.diffs) var i = 0 @@ -299,9 +299,9 @@ object RebaseDateTime { rebaseInfo.diffs(i) = rebaseInfo.diffs(i) * MICROS_PER_SECOND i += 1 } - anyRefMap.update(jsonRecord.tz, rebaseInfo) + hashMap.update(jsonRecord.tz, rebaseInfo) } - anyRefMap + hashMap } /** @@ -313,7 +313,7 @@ object RebaseDateTime { */ private val gregJulianRebaseMap = loadRebaseRecords("gregorian-julian-rebase-micros.json") - private def getLastSwitchTs(rebaseMap: AnyRefMap[String, RebaseInfo]): Long = { + private def getLastSwitchTs(rebaseMap: HashMap[String, RebaseInfo]): Long = { val latestTs = rebaseMap.values.map(_.switches.last).max require( rebaseMap.values.forall(_.diffs.last == 0), @@ -404,7 +404,7 @@ object RebaseDateTime { if (micros >= lastSwitchGregorianTs) { micros } else { - val rebaseRecord = gregJulianRebaseMap.getOrNull(timeZoneId) + val rebaseRecord = gregJulianRebaseMap.get(timeZoneId).orNull if (rebaseRecord == null || micros < rebaseRecord.switches(0)) { rebaseGregorianToJulianMicros(TimeZone.getTimeZone(timeZoneId), micros) } else { @@ -526,7 +526,7 @@ object RebaseDateTime { if (micros >= lastSwitchJulianTs) { micros } else { - val rebaseRecord = julianGregRebaseMap.getOrNull(timeZoneId) + val rebaseRecord = julianGregRebaseMap.get(timeZoneId).orNull if (rebaseRecord == null || micros < rebaseRecord.switches(0)) { rebaseJulianToGregorianMicros(TimeZone.getTimeZone(timeZoneId), micros) } else { diff --git 
a/sql/api/src/main/scala/org/apache/spark/sql/catalyst/util/SparkCharVarcharUtils.scala b/sql/api/src/main/scala/org/apache/spark/sql/catalyst/util/SparkCharVarcharUtils.scala index 2a26c079e8d4d..51b2c40f9bf2e 100644 --- a/sql/api/src/main/scala/org/apache/spark/sql/catalyst/util/SparkCharVarcharUtils.scala +++ b/sql/api/src/main/scala/org/apache/spark/sql/catalyst/util/SparkCharVarcharUtils.scala @@ -54,8 +54,7 @@ trait SparkCharVarcharUtils { StructType(fields.map { field => field.copy(dataType = replaceCharVarcharWithString(field.dataType)) }) - case _: CharType => StringType - case _: VarcharType => StringType + case CharType(_) | VarcharType(_) if !SqlApiConf.get.preserveCharVarcharTypeInfo => StringType case _ => dt } } diff --git a/sql/api/src/main/scala/org/apache/spark/sql/catalyst/util/StringUtils.scala b/sql/api/src/main/scala/org/apache/spark/sql/catalyst/util/StringUtils.scala index 0608322be13b3..e8c50be9f5513 100644 --- a/sql/api/src/main/scala/org/apache/spark/sql/catalyst/util/StringUtils.scala +++ b/sql/api/src/main/scala/org/apache/spark/sql/catalyst/util/StringUtils.scala @@ -73,7 +73,7 @@ object SparkStringUtils extends Logging { /** * Format a sequence with semantics similar to calling .mkString(). Any elements beyond - * maxNumToStringFields will be dropped and replaced by a "... N more fields" placeholder. + * `maxFields` will be dropped and replaced by a "... N more fields" placeholder. * * @return * the trimmed and formatted string. @@ -90,10 +90,11 @@ object SparkStringUtils extends Logging { "Truncated the string representation of a plan since it was too large. This " + s"behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.") } - val numFields = math.max(0, maxFields - 1) - seq - .take(numFields) - .mkString(start, sep, sep + "... 
" + (seq.length - numFields) + " more fields" + end) + val numFields = math.max(0, maxFields) + val restNum = seq.length - numFields + val ending = (if (numFields == 0) "" else sep) + + (if (restNum == 0) "" else s"... $restNum more fields") + end + seq.take(numFields).mkString(start, sep, ending) } else { seq.mkString(start, sep, end) } diff --git a/sql/api/src/main/scala/org/apache/spark/sql/errors/CompilationErrors.scala b/sql/api/src/main/scala/org/apache/spark/sql/errors/CompilationErrors.scala index 3e63b8281f739..617cab4b2a39b 100644 --- a/sql/api/src/main/scala/org/apache/spark/sql/errors/CompilationErrors.scala +++ b/sql/api/src/main/scala/org/apache/spark/sql/errors/CompilationErrors.scala @@ -41,6 +41,18 @@ private[sql] trait CompilationErrors extends DataTypeErrorsBase { cause = Option(cause)) } + def describeJsonNotExtendedError(tableName: String): AnalysisException = { + new AnalysisException( + errorClass = "DESCRIBE_JSON_NOT_EXTENDED", + messageParameters = Map("tableName" -> tableName)) + } + + def describeColJsonUnsupportedError(): AnalysisException = { + new AnalysisException( + errorClass = "UNSUPPORTED_FEATURE.DESC_TABLE_COLUMN_JSON", + messageParameters = Map.empty) + } + def cannotFindDescriptorFileError(filePath: String, cause: Throwable): AnalysisException = { new AnalysisException( errorClass = "PROTOBUF_DESCRIPTOR_FILE_NOT_FOUND", diff --git a/sql/api/src/main/scala/org/apache/spark/sql/functions.scala b/sql/api/src/main/scala/org/apache/spark/sql/functions.scala index 2a04212ee2585..9f509fa843a2b 100644 --- a/sql/api/src/main/scala/org/apache/spark/sql/functions.scala +++ b/sql/api/src/main/scala/org/apache/spark/sql/functions.scala @@ -1147,6 +1147,77 @@ object functions { */ def sum_distinct(e: Column): Column = Column.fn("sum", isDistinct = true, e) + /** + * Aggregate function: returns the concatenation of non-null input values. 
+ * + * @group agg_funcs + * @since 4.0.0 + */ + def listagg(e: Column): Column = Column.fn("listagg", e) + + /** + * Aggregate function: returns the concatenation of non-null input values, separated by the + * delimiter. + * + * @group agg_funcs + * @since 4.0.0 + */ + def listagg(e: Column, delimiter: Column): Column = Column.fn("listagg", e, delimiter) + + /** + * Aggregate function: returns the concatenation of distinct non-null input values. + * + * @group agg_funcs + * @since 4.0.0 + */ + def listagg_distinct(e: Column): Column = Column.fn("listagg", isDistinct = true, e) + + /** + * Aggregate function: returns the concatenation of distinct non-null input values, separated by + * the delimiter. + * + * @group agg_funcs + * @since 4.0.0 + */ + def listagg_distinct(e: Column, delimiter: Column): Column = + Column.fn("listagg", isDistinct = true, e, delimiter) + + /** + * Aggregate function: returns the concatenation of non-null input values. Alias for `listagg`. + * + * @group agg_funcs + * @since 4.0.0 + */ + def string_agg(e: Column): Column = Column.fn("string_agg", e) + + /** + * Aggregate function: returns the concatenation of non-null input values, separated by the + * delimiter. Alias for `listagg`. + * + * @group agg_funcs + * @since 4.0.0 + */ + def string_agg(e: Column, delimiter: Column): Column = Column.fn("string_agg", e, delimiter) + + /** + * Aggregate function: returns the concatenation of distinct non-null input values. Alias for + * `listagg`. + * + * @group agg_funcs + * @since 4.0.0 + */ + def string_agg_distinct(e: Column): Column = Column.fn("string_agg", isDistinct = true, e) + + /** + * Aggregate function: returns the concatenation of distinct non-null input values, separated by + * the delimiter. Alias for `listagg`. + * + * @group agg_funcs + * @since 4.0.0 + */ + def string_agg_distinct(e: Column, delimiter: Column): Column = + Column.fn("string_agg", isDistinct = true, e, delimiter) + /** * Aggregate function: alias for `var_samp`. 
* diff --git a/sql/api/src/main/scala/org/apache/spark/sql/internal/SqlApiConf.scala b/sql/api/src/main/scala/org/apache/spark/sql/internal/SqlApiConf.scala index 773494f418659..76cd436b39b58 100644 --- a/sql/api/src/main/scala/org/apache/spark/sql/internal/SqlApiConf.scala +++ b/sql/api/src/main/scala/org/apache/spark/sql/internal/SqlApiConf.scala @@ -40,6 +40,7 @@ private[sql] trait SqlApiConf { def timestampType: AtomicType def allowNegativeScaleOfDecimalEnabled: Boolean def charVarcharAsString: Boolean + def preserveCharVarcharTypeInfo: Boolean def datetimeJava8ApiEnabled: Boolean def sessionLocalTimeZone: String def legacyTimeParserPolicy: LegacyBehaviorPolicy.Value @@ -47,7 +48,6 @@ private[sql] trait SqlApiConf { def stackTracesInDataFrameContext: Int def dataFrameQueryContextEnabled: Boolean def legacyAllowUntypedScalaUDFs: Boolean - def allowReadingUnknownCollations: Boolean } private[sql] object SqlApiConf { @@ -60,7 +60,6 @@ private[sql] object SqlApiConf { SqlApiConfHelper.LOCAL_RELATION_CACHE_THRESHOLD_KEY } val DEFAULT_COLLATION: String = SqlApiConfHelper.DEFAULT_COLLATION - val ALLOW_READING_UNKNOWN_COLLATIONS: String = SqlApiConfHelper.ALLOW_READING_UNKNOWN_COLLATIONS def get: SqlApiConf = SqlApiConfHelper.getConfGetter.get()() @@ -82,6 +81,7 @@ private[sql] object DefaultSqlApiConf extends SqlApiConf { override def timestampType: AtomicType = TimestampType override def allowNegativeScaleOfDecimalEnabled: Boolean = false override def charVarcharAsString: Boolean = false + override def preserveCharVarcharTypeInfo: Boolean = false override def datetimeJava8ApiEnabled: Boolean = false override def sessionLocalTimeZone: String = TimeZone.getDefault.getID override def legacyTimeParserPolicy: LegacyBehaviorPolicy.Value = LegacyBehaviorPolicy.CORRECTED @@ -89,5 +89,4 @@ private[sql] object DefaultSqlApiConf extends SqlApiConf { override def stackTracesInDataFrameContext: Int = 1 override def dataFrameQueryContextEnabled: Boolean = true override def 
legacyAllowUntypedScalaUDFs: Boolean = false - override def allowReadingUnknownCollations: Boolean = false } diff --git a/sql/api/src/main/scala/org/apache/spark/sql/internal/SqlApiConfHelper.scala b/sql/api/src/main/scala/org/apache/spark/sql/internal/SqlApiConfHelper.scala index c8d6f395d4506..13ef13e5894e0 100644 --- a/sql/api/src/main/scala/org/apache/spark/sql/internal/SqlApiConfHelper.scala +++ b/sql/api/src/main/scala/org/apache/spark/sql/internal/SqlApiConfHelper.scala @@ -33,8 +33,6 @@ private[sql] object SqlApiConfHelper { val SESSION_LOCAL_TIMEZONE_KEY: String = "spark.sql.session.timeZone" val LOCAL_RELATION_CACHE_THRESHOLD_KEY: String = "spark.sql.session.localRelationCacheThreshold" val DEFAULT_COLLATION: String = "spark.sql.session.collation.default" - val ALLOW_READING_UNKNOWN_COLLATIONS: String = - "spark.sql.collation.allowReadingUnknownCollations" val confGetter: AtomicReference[() => SqlApiConf] = { new AtomicReference[() => SqlApiConf](() => DefaultSqlApiConf) diff --git a/sql/api/src/main/scala/org/apache/spark/sql/internal/columnNodes.scala b/sql/api/src/main/scala/org/apache/spark/sql/internal/columnNodes.scala index e3cc320a8b00f..ef4bdb8d5bdff 100644 --- a/sql/api/src/main/scala/org/apache/spark/sql/internal/columnNodes.scala +++ b/sql/api/src/main/scala/org/apache/spark/sql/internal/columnNodes.scala @@ -70,6 +70,19 @@ private[sql] trait ColumnNode extends ColumnNodeLike { trait ColumnNodeLike { private[internal] def normalize(): ColumnNodeLike = this private[internal] def sql: String + private[internal] def children: Seq[ColumnNodeLike] + + private[sql] def foreach(f: ColumnNodeLike => Unit): Unit = { + f(this) + children.foreach(_.foreach(f)) + } + + private[sql] def collect[A](pf: PartialFunction[ColumnNodeLike, A]): Seq[A] = { + val ret = new collection.mutable.ArrayBuffer[A]() + val lifted = pf.lift + foreach(node => lifted(node).foreach(ret.+=)) + ret.toSeq + } } private[internal] object ColumnNode { @@ -118,6 +131,8 @@ private[sql] 
case class Literal( case v: Short => toSQLValue(v) case _ => value.toString } + + override private[internal] def children: Seq[ColumnNodeLike] = Seq.empty } /** @@ -141,6 +156,8 @@ private[sql] case class UnresolvedAttribute( copy(planId = None, origin = NO_ORIGIN) override def sql: String = nameParts.map(n => if (n.contains(".")) s"`$n`" else n).mkString(".") + + override private[internal] def children: Seq[ColumnNodeLike] = Seq.empty } private[sql] object UnresolvedAttribute { @@ -167,24 +184,6 @@ private[sql] object UnresolvedAttribute { apply(unparsedIdentifier, None, false, CurrentOrigin.get) } -/** - * Reference to an attribute in the outer context, used for Subqueries. - * - * @param nameParts - * name of the attribute. - * @param planId - * id of the plan (Dataframe) that produces the attribute. - */ -private[sql] case class LazyOuterReference( - nameParts: Seq[String], - planId: Option[Long] = None, - override val origin: Origin = CurrentOrigin.get) - extends ColumnNode { - override private[internal] def normalize(): LazyOuterReference = - copy(planId = None, origin = NO_ORIGIN) - override def sql: String = nameParts.map(n => if (n.contains(".")) s"`$n`" else n).mkString(".") -} - /** * Reference to all columns in a namespace (global, a Dataframe, or a nested struct). 
* @@ -201,6 +200,7 @@ private[sql] case class UnresolvedStar( override private[internal] def normalize(): UnresolvedStar = copy(planId = None, origin = NO_ORIGIN) override def sql: String = unparsedTarget.map(_ + ".*").getOrElse("*") + override private[internal] def children: Seq[ColumnNodeLike] = Seq.empty } /** @@ -226,6 +226,8 @@ private[sql] case class UnresolvedFunction( copy(arguments = ColumnNode.normalize(arguments), origin = NO_ORIGIN) override def sql: String = functionName + argumentsToSql(arguments) + + override private[internal] def children: Seq[ColumnNodeLike] = arguments } /** @@ -240,6 +242,7 @@ private[sql] case class SqlExpression( extends ColumnNode { override private[internal] def normalize(): SqlExpression = copy(origin = NO_ORIGIN) override def sql: String = expression + override private[internal] def children: Seq[ColumnNodeLike] = Seq.empty } /** @@ -268,6 +271,8 @@ private[sql] case class Alias( } s"${child.sql} AS $alias" } + + override private[internal] def children: Seq[ColumnNodeLike] = Seq(child) } /** @@ -293,10 +298,14 @@ private[sql] case class Cast( override def sql: String = { s"${optionToSql(evalMode)}CAST(${child.sql} AS ${dataType.sql})" } + + override private[internal] def children: Seq[ColumnNodeLike] = Seq(child) ++ evalMode } private[sql] object Cast { - sealed abstract class EvalMode(override val sql: String = "") extends ColumnNodeLike + sealed abstract class EvalMode(override val sql: String = "") extends ColumnNodeLike { + override private[internal] def children: Seq[ColumnNodeLike] = Seq.empty + } object Legacy extends EvalMode object Ansi extends EvalMode object Try extends EvalMode("TRY_") @@ -318,6 +327,7 @@ private[sql] case class UnresolvedRegex( override private[internal] def normalize(): UnresolvedRegex = copy(planId = None, origin = NO_ORIGIN) override def sql: String = regex + override private[internal] def children: Seq[ColumnNodeLike] = Seq.empty } /** @@ -340,13 +350,19 @@ private[sql] case class 
SortOrder( copy(child = child.normalize(), origin = NO_ORIGIN) override def sql: String = s"${child.sql} ${sortDirection.sql} ${nullOrdering.sql}" + + override def children: Seq[ColumnNodeLike] = Seq(child, sortDirection, nullOrdering) } private[sql] object SortOrder { - sealed abstract class SortDirection(override val sql: String) extends ColumnNodeLike + sealed abstract class SortDirection(override val sql: String) extends ColumnNodeLike { + override private[internal] def children: Seq[ColumnNodeLike] = Seq.empty + } object Ascending extends SortDirection("ASC") object Descending extends SortDirection("DESC") - sealed abstract class NullOrdering(override val sql: String) extends ColumnNodeLike + sealed abstract class NullOrdering(override val sql: String) extends ColumnNodeLike { + override private[internal] def children: Seq[ColumnNodeLike] = Seq.empty + } object NullsFirst extends NullOrdering("NULLS FIRST") object NullsLast extends NullOrdering("NULLS LAST") } @@ -370,6 +386,8 @@ private[sql] case class Window( origin = NO_ORIGIN) override def sql: String = s"${windowFunction.sql} OVER (${windowSpec.sql})" + + override private[internal] def children: Seq[ColumnNodeLike] = Seq(windowFunction, windowSpec) } private[sql] case class WindowSpec( @@ -388,6 +406,9 @@ private[sql] case class WindowSpec( optionToSql(frame)) parts.filter(_.nonEmpty).mkString(" ") } + override private[internal] def children: Seq[ColumnNodeLike] = { + partitionColumns ++ sortColumns ++ frame + } } private[sql] case class WindowFrame( @@ -399,15 +420,19 @@ private[sql] case class WindowFrame( copy(lower = lower.normalize(), upper = upper.normalize()) override private[internal] def sql: String = s"${frameType.sql} BETWEEN ${lower.sql} AND ${upper.sql}" + override private[internal] def children: Seq[ColumnNodeLike] = Seq(frameType, lower, upper) } private[sql] object WindowFrame { - sealed abstract class FrameType(override val sql: String) extends ColumnNodeLike + sealed abstract class 
FrameType(override val sql: String) extends ColumnNodeLike { + override private[internal] def children: Seq[ColumnNodeLike] = Seq.empty + } object Row extends FrameType("ROWS") object Range extends FrameType("RANGE") sealed abstract class FrameBoundary extends ColumnNodeLike { override private[internal] def normalize(): FrameBoundary = this + override private[internal] def children: Seq[ColumnNodeLike] = Seq.empty } object CurrentRow extends FrameBoundary { override private[internal] def sql = "CURRENT ROW" @@ -421,6 +446,7 @@ private[sql] object WindowFrame { case class Value(value: ColumnNode) extends FrameBoundary { override private[internal] def normalize(): Value = copy(value.normalize()) override private[internal] def sql: String = value.sql + override private[internal] def children: Seq[ColumnNodeLike] = Seq(value) } def value(i: Int): Value = Value(Literal(i, Some(IntegerType))) def value(l: Long): Value = Value(Literal(l, Some(LongType))) @@ -452,6 +478,8 @@ private[sql] case class LambdaFunction( } argumentsSql + " -> " + function.sql } + + override private[internal] def children: Seq[ColumnNodeLike] = function +: arguments } object LambdaFunction { @@ -473,6 +501,8 @@ private[sql] case class UnresolvedNamedLambdaVariable( copy(origin = NO_ORIGIN) override def sql: String = name + + override private[internal] def children: Seq[ColumnNodeLike] = Seq.empty } object UnresolvedNamedLambdaVariable { @@ -513,6 +543,8 @@ private[sql] case class UnresolvedExtractValue( copy(child = child.normalize(), extraction = extraction.normalize(), origin = NO_ORIGIN) override def sql: String = s"${child.sql}[${extraction.sql}]" + + override private[internal] def children: Seq[ColumnNodeLike] = Seq(child, extraction) } /** @@ -539,6 +571,9 @@ private[sql] case class UpdateFields( case Some(value) => s"update_field(${structExpression.sql}, $fieldName, ${value.sql})" case None => s"drop_field(${structExpression.sql}, $fieldName)" } + override private[internal] def children: 
Seq[ColumnNodeLike] = { + structExpression +: valueExpression.toSeq + } } /** @@ -567,6 +602,11 @@ private[sql] case class CaseWhenOtherwise( branches.map(cv => s" WHEN ${cv._1.sql} THEN ${cv._2.sql}").mkString + otherwise.map(o => s" ELSE ${o.sql}").getOrElse("") + " END" + + override private[internal] def children: Seq[ColumnNodeLike] = { + val branchChildren = branches.flatMap { case (condition, value) => Seq(condition, value) } + branchChildren ++ otherwise + } } /** @@ -588,8 +628,26 @@ private[sql] case class InvokeInlineUserDefinedFunction( override def sql: String = function.name + argumentsToSql(arguments) + + override private[internal] def children: Seq[ColumnNodeLike] = arguments } private[sql] trait UserDefinedFunctionLike { def name: String = SparkClassUtils.getFormattedClassName(this) } + +/** + * A marker node to trigger Spark Classic DataFrame lazy analysis. + * + * @param child + * that needs to be lazily analyzed in Spark Classic DataFrame. + */ +private[sql] case class LazyExpression( + child: ColumnNode, + override val origin: Origin = CurrentOrigin.get) + extends ColumnNode { + override private[internal] def normalize(): ColumnNode = + copy(child = child.normalize(), origin = NO_ORIGIN) + override def sql: String = "lazy" + argumentsToSql(Seq(child)) + override private[internal] def children: Seq[ColumnNodeLike] = Seq(child) +} diff --git a/sql/api/src/main/scala/org/apache/spark/sql/internal/types/AbstractStringType.scala b/sql/api/src/main/scala/org/apache/spark/sql/internal/types/AbstractStringType.scala index 49d8bf9e001ab..6dcb8a876b7a2 100644 --- a/sql/api/src/main/scala/org/apache/spark/sql/internal/types/AbstractStringType.scala +++ b/sql/api/src/main/scala/org/apache/spark/sql/internal/types/AbstractStringType.scala @@ -17,7 +17,6 @@ package org.apache.spark.sql.internal.types -import org.apache.spark.sql.internal.SqlApiConf import org.apache.spark.sql.types.{AbstractDataType, DataType, StringType} /** @@ -26,7 +25,7 @@ import 
org.apache.spark.sql.types.{AbstractDataType, DataType, StringType} abstract class AbstractStringType(supportsTrimCollation: Boolean = false) extends AbstractDataType with Serializable { - override private[sql] def defaultConcreteType: DataType = SqlApiConf.get.defaultStringType + override private[sql] def defaultConcreteType: DataType = StringType override private[sql] def simpleString: String = "string" override private[sql] def acceptsType(other: DataType): Boolean = other match { diff --git a/sql/api/src/main/scala/org/apache/spark/sql/streaming/StatefulProcessor.scala b/sql/api/src/main/scala/org/apache/spark/sql/streaming/StatefulProcessor.scala index 55477b4dda0c9..b47629cb54396 100644 --- a/sql/api/src/main/scala/org/apache/spark/sql/streaming/StatefulProcessor.scala +++ b/sql/api/src/main/scala/org/apache/spark/sql/streaming/StatefulProcessor.scala @@ -20,16 +20,25 @@ package org.apache.spark.sql.streaming import java.io.Serializable import org.apache.spark.annotation.{Evolving, Experimental} +import org.apache.spark.sql.api.EncoderImplicits import org.apache.spark.sql.errors.ExecutionErrors /** * Represents the arbitrary stateful logic that needs to be provided by the user to perform * stateful manipulations on keyed streams. + * + * Users can also explicitly use `import implicits._` to access the EncoderImplicits and use the + * state variable APIs relying on implicit encoders. */ @Experimental @Evolving private[sql] abstract class StatefulProcessor[K, I, O] extends Serializable { + // scalastyle:off + // Disable style checker so "implicits" object can start with lowercase i + object implicits extends EncoderImplicits + // scalastyle:on + /** * Handle to the stateful processor that provides access to the state store and other stateful * processing related APIs. 
diff --git a/sql/api/src/main/scala/org/apache/spark/sql/types/ArrayType.scala b/sql/api/src/main/scala/org/apache/spark/sql/types/ArrayType.scala index fc32248b4baf3..53dfc5e9b2828 100644 --- a/sql/api/src/main/scala/org/apache/spark/sql/types/ArrayType.scala +++ b/sql/api/src/main/scala/org/apache/spark/sql/types/ArrayType.scala @@ -110,4 +110,13 @@ case class ArrayType(elementType: DataType, containsNull: Boolean) extends DataT override private[spark] def existsRecursively(f: (DataType) => Boolean): Boolean = { f(this) || elementType.existsRecursively(f) } + + override private[spark] def transformRecursively( + f: PartialFunction[DataType, DataType]): DataType = { + if (f.isDefinedAt(this)) { + f(this) + } else { + ArrayType(elementType.transformRecursively(f), containsNull) + } + } } diff --git a/sql/api/src/main/scala/org/apache/spark/sql/types/CharType.scala b/sql/api/src/main/scala/org/apache/spark/sql/types/CharType.scala index 5e30ff6e52a14..68dad6c87c01e 100644 --- a/sql/api/src/main/scala/org/apache/spark/sql/types/CharType.scala +++ b/sql/api/src/main/scala/org/apache/spark/sql/types/CharType.scala @@ -17,14 +17,19 @@ package org.apache.spark.sql.types +import org.json4s.JsonAST.{JString, JValue} + import org.apache.spark.annotation.Experimental +import org.apache.spark.sql.catalyst.util.CollationFactory @Experimental -case class CharType(length: Int) extends AtomicType { +case class CharType(length: Int) + extends StringType(CollationFactory.UTF8_BINARY_COLLATION_ID, FixedLength(length)) { require(length >= 0, "The length of char type cannot be negative.") override def defaultSize: Int = length override def typeName: String = s"char($length)" + override def jsonValue: JValue = JString(typeName) override def toString: String = s"CharType($length)" private[spark] override def asNullable: CharType = this } diff --git a/sql/api/src/main/scala/org/apache/spark/sql/types/DataType.scala b/sql/api/src/main/scala/org/apache/spark/sql/types/DataType.scala index 
4cf7d8efb96a5..db7e7c0ae1885 100644 --- a/sql/api/src/main/scala/org/apache/spark/sql/types/DataType.scala +++ b/sql/api/src/main/scala/org/apache/spark/sql/types/DataType.scala @@ -27,7 +27,7 @@ import org.json4s.JsonAST.JValue import org.json4s.JsonDSL._ import org.json4s.jackson.JsonMethods._ -import org.apache.spark.{SparkException, SparkIllegalArgumentException, SparkThrowable} +import org.apache.spark.{SparkIllegalArgumentException, SparkThrowable} import org.apache.spark.annotation.Stable import org.apache.spark.sql.catalyst.analysis.SqlApiAnalysis import org.apache.spark.sql.catalyst.parser.DataTypeParser @@ -105,6 +105,13 @@ abstract class DataType extends AbstractDataType { */ private[spark] def existsRecursively(f: (DataType) => Boolean): Boolean = f(this) + /** + * Recursively applies the provided partial function `f` to transform this DataType tree. + */ + private[spark] def transformRecursively(f: PartialFunction[DataType, DataType]): DataType = { + if (f.isDefinedAt(this)) f(this) else this + } + final override private[sql] def defaultConcreteType: DataType = this override private[sql] def acceptsType(other: DataType): Boolean = sameType(other) @@ -340,17 +347,8 @@ object DataType { fields.collect { case (fieldPath, JString(collation)) => collation.split("\\.", 2) match { case Array(provider: String, collationName: String) => - try { - CollationFactory.assertValidProvider(provider) - fieldPath -> collationName - } catch { - case e: SparkException - if e.getCondition == "COLLATION_INVALID_PROVIDER" && - SqlApiConf.get.allowReadingUnknownCollations => - // If the collation provider is unknown and the config for reading such - // collations is enabled, return the UTF8_BINARY collation. 
- fieldPath -> "UTF8_BINARY" - } + CollationFactory.assertValidProvider(provider) + fieldPath -> collationName } }.toMap @@ -359,16 +357,7 @@ object DataType { } private def stringTypeWithCollation(collationName: String): StringType = { - try { - StringType(CollationFactory.collationNameToId(collationName)) - } catch { - case e: SparkException - if e.getCondition == "COLLATION_INVALID_NAME" && - SqlApiConf.get.allowReadingUnknownCollations => - // If the collation name is unknown and the config for reading such collations is enabled, - // return the UTF8_BINARY collation. - StringType(CollationFactory.UTF8_BINARY_COLLATION_ID) - } + StringType(CollationFactory.collationNameToId(collationName)) } protected[types] def buildFormattedString( @@ -458,7 +447,7 @@ object DataType { private[sql] def equalsIgnoreCompatibleCollation(from: DataType, to: DataType): Boolean = { (from, to) match { // String types with possibly different collations are compatible. - case (_: StringType, _: StringType) => true + case (a: StringType, b: StringType) => a.constraint == b.constraint case (fromDataType, toDataType) => fromDataType == toDataType } diff --git a/sql/api/src/main/scala/org/apache/spark/sql/types/MapType.scala b/sql/api/src/main/scala/org/apache/spark/sql/types/MapType.scala index 1dfb9aaf9e29b..de656c13ca4bf 100644 --- a/sql/api/src/main/scala/org/apache/spark/sql/types/MapType.scala +++ b/sql/api/src/main/scala/org/apache/spark/sql/types/MapType.scala @@ -89,6 +89,18 @@ case class MapType(keyType: DataType, valueType: DataType, valueContainsNull: Bo override private[spark] def existsRecursively(f: (DataType) => Boolean): Boolean = { f(this) || keyType.existsRecursively(f) || valueType.existsRecursively(f) } + + override private[spark] def transformRecursively( + f: PartialFunction[DataType, DataType]): DataType = { + if (f.isDefinedAt(this)) { + f(this) + } else { + MapType( + keyType.transformRecursively(f), + valueType.transformRecursively(f), + valueContainsNull) + } + 
} } /** diff --git a/sql/api/src/main/scala/org/apache/spark/sql/types/StringType.scala b/sql/api/src/main/scala/org/apache/spark/sql/types/StringType.scala index 1eb645e37c4aa..cd3182ab2dcde 100644 --- a/sql/api/src/main/scala/org/apache/spark/sql/types/StringType.scala +++ b/sql/api/src/main/scala/org/apache/spark/sql/types/StringType.scala @@ -21,6 +21,7 @@ import org.json4s.JsonAST.{JString, JValue} import org.apache.spark.annotation.Stable import org.apache.spark.sql.catalyst.util.CollationFactory +import org.apache.spark.sql.internal.SqlApiConf /** * The data type representing `String` values. Please use the singleton `DataTypes.StringType`. @@ -30,7 +31,11 @@ import org.apache.spark.sql.catalyst.util.CollationFactory * The id of collation for this StringType. */ @Stable -class StringType private (val collationId: Int) extends AtomicType with Serializable { +class StringType private[sql] ( + val collationId: Int, + val constraint: StringConstraint = NoConstraint) + extends AtomicType + with Serializable { /** * Support for Binary Equality implies that strings are considered equal only if they are byte @@ -39,7 +44,8 @@ class StringType private (val collationId: Int) extends AtomicType with Serializ * equality and hashing). 
*/ private[sql] def supportsBinaryEquality: Boolean = - CollationFactory.fetchCollation(collationId).supportsBinaryEquality + collationId == CollationFactory.UTF8_BINARY_COLLATION_ID || + CollationFactory.fetchCollation(collationId).supportsBinaryEquality private[sql] def supportsLowercaseEquality: Boolean = CollationFactory.fetchCollation(collationId).supportsLowercaseEquality @@ -75,15 +81,26 @@ class StringType private (val collationId: Int) extends AtomicType with Serializ */ override def typeName: String = if (isUTF8BinaryCollation) "string" - else s"string collate ${CollationFactory.fetchCollation(collationId).collationName}" + else s"string collate $collationName" + + override def toString: String = + if (isUTF8BinaryCollation) "StringType" + else s"StringType($collationName)" + + private[sql] def collationName: String = + CollationFactory.fetchCollation(collationId).collationName // Due to backwards compatibility and compatibility with other readers // all string types are serialized in json as regular strings and // the collation information is written to struct field metadata override def jsonValue: JValue = JString("string") - override def equals(obj: Any): Boolean = - obj.isInstanceOf[StringType] && obj.asInstanceOf[StringType].collationId == collationId + override def equals(obj: Any): Boolean = { + obj match { + case s: StringType => s.collationId == collationId && s.constraint == constraint + case _ => false + } + } override def hashCode(): Int = collationId.hashCode() @@ -101,7 +118,8 @@ class StringType private (val collationId: Int) extends AtomicType with Serializ * @since 1.3.0 */ @Stable -case object StringType extends StringType(0) { +case object StringType + extends StringType(CollationFactory.UTF8_BINARY_COLLATION_ID, NoConstraint) { private[spark] def apply(collationId: Int): StringType = new StringType(collationId) def apply(collation: String): StringType = { @@ -109,3 +127,65 @@ case object StringType extends StringType(0) { new 
StringType(collationId) } } + +sealed trait StringConstraint + +case object StringHelper extends PartialOrdering[StringConstraint] { + override def tryCompare(x: StringConstraint, y: StringConstraint): Option[Int] = { + (x, y) match { + case (NoConstraint, NoConstraint) => Some(0) + case (NoConstraint, _) => Some(-1) + case (_, NoConstraint) => Some(1) + case (FixedLength(l1), FixedLength(l2)) => Some(l2.compareTo(l1)) + case (FixedLength(l1), MaxLength(l2)) if l1 <= l2 => Some(1) + case (MaxLength(l1), FixedLength(l2)) if l1 >= l2 => Some(-1) + case (MaxLength(l1), MaxLength(l2)) => Some(l2.compareTo(l1)) + case _ => None + } + } + + override def lteq(x: StringConstraint, y: StringConstraint): Boolean = { + tryCompare(x, y).exists(_ <= 0) + } + + override def gteq(x: StringConstraint, y: StringConstraint): Boolean = { + tryCompare(x, y).exists(_ >= 0) + } + + override def equiv(x: StringConstraint, y: StringConstraint): Boolean = { + tryCompare(x, y).contains(0) + } + + def isPlainString(s: StringType): Boolean = s.constraint == NoConstraint + + def isMoreConstrained(a: StringType, b: StringType): Boolean = + gteq(a.constraint, b.constraint) + + def tightestCommonString(s1: StringType, s2: StringType): Option[StringType] = { + if (s1.collationId != s2.collationId) { + return None + } + if (!SqlApiConf.get.preserveCharVarcharTypeInfo) { + return Some(StringType(s1.collationId)) + } + Some((s1.constraint, s2.constraint) match { + case (FixedLength(l1), FixedLength(l2)) => CharType(l1.max(l2)) + case (MaxLength(l1), FixedLength(l2)) => VarcharType(l1.max(l2)) + case (FixedLength(l1), MaxLength(l2)) => VarcharType(l1.max(l2)) + case (MaxLength(l1), MaxLength(l2)) => VarcharType(l1.max(l2)) + case _ => StringType(s1.collationId) + }) + } + + def removeCollation(s: StringType): StringType = s match { + case CharType(length) => CharType(length) + case VarcharType(length) => VarcharType(length) + case _: StringType => StringType + } +} + +case object NoConstraint extends 
StringConstraint + +case class FixedLength(length: Int) extends StringConstraint + +case class MaxLength(length: Int) extends StringConstraint diff --git a/sql/api/src/main/scala/org/apache/spark/sql/types/StructField.scala b/sql/api/src/main/scala/org/apache/spark/sql/types/StructField.scala index d4e590629921c..f33a49e686a59 100644 --- a/sql/api/src/main/scala/org/apache/spark/sql/types/StructField.scala +++ b/sql/api/src/main/scala/org/apache/spark/sql/types/StructField.scala @@ -147,6 +147,18 @@ case class StructField( if (metadata.contains("comment")) Option(metadata.getString("comment")) else None } + /** + * Return the default value of this StructField. This is used for storing the default value of a + * function parameter. + */ + private[sql] def getDefault(): Option[String] = { + if (metadata.contains("default")) { + Option(metadata.getString("default")) + } else { + None + } + } + /** * Updates the StructField with a new current default value. */ diff --git a/sql/api/src/main/scala/org/apache/spark/sql/types/StructType.scala b/sql/api/src/main/scala/org/apache/spark/sql/types/StructType.scala index 07f6b50bd4a7a..cc95d8ee94b02 100644 --- a/sql/api/src/main/scala/org/apache/spark/sql/types/StructType.scala +++ b/sql/api/src/main/scala/org/apache/spark/sql/types/StructType.scala @@ -502,6 +502,18 @@ case class StructType(fields: Array[StructField]) extends DataType with Seq[Stru override private[spark] def existsRecursively(f: (DataType) => Boolean): Boolean = { f(this) || fields.exists(field => field.dataType.existsRecursively(f)) } + + override private[spark] def transformRecursively( + f: PartialFunction[DataType, DataType]): DataType = { + if (f.isDefinedAt(this)) { + return f(this) + } + + val newFields = fields.map { field => + field.copy(dataType = field.dataType.transformRecursively(f)) + } + StructType(newFields) + } } /** diff --git a/sql/api/src/main/scala/org/apache/spark/sql/types/UpCastRule.scala 
b/sql/api/src/main/scala/org/apache/spark/sql/types/UpCastRule.scala index 4993e249b3059..6272cb03bd797 100644 --- a/sql/api/src/main/scala/org/apache/spark/sql/types/UpCastRule.scala +++ b/sql/api/src/main/scala/org/apache/spark/sql/types/UpCastRule.scala @@ -40,8 +40,11 @@ private[sql] object UpCastRule { case (DateType, TimestampNTZType) => true case (TimestampNTZType, TimestampType) => true case (TimestampType, TimestampNTZType) => true - case (_: AtomicType, StringType) => true - case (_: CalendarIntervalType, StringType) => true + + case (s1: StringType, s2: StringType) => StringHelper.isMoreConstrained(s1, s2) + // TODO: allow upcast from int/double/decimal to char/varchar of sufficient length + case (_: AtomicType, s: StringType) => StringHelper.isPlainString(s) + case (_: CalendarIntervalType, s: StringType) => StringHelper.isPlainString(s) case (NullType, _) => true // Spark supports casting between long and timestamp, please see `longToTimestamp` and @@ -69,7 +72,7 @@ private[sql] object UpCastRule { case _ => false } - private def legalNumericPrecedence(from: DataType, to: DataType): Boolean = { + def legalNumericPrecedence(from: DataType, to: DataType): Boolean = { val fromPrecedence = numericPrecedence.indexOf(from) val toPrecedence = numericPrecedence.indexOf(to) fromPrecedence >= 0 && fromPrecedence < toPrecedence diff --git a/sql/api/src/main/scala/org/apache/spark/sql/types/VarcharType.scala b/sql/api/src/main/scala/org/apache/spark/sql/types/VarcharType.scala index 3d21e2e65804e..22f7947b25037 100644 --- a/sql/api/src/main/scala/org/apache/spark/sql/types/VarcharType.scala +++ b/sql/api/src/main/scala/org/apache/spark/sql/types/VarcharType.scala @@ -16,14 +16,19 @@ */ package org.apache.spark.sql.types +import org.json4s.JsonAST.{JString, JValue} + import org.apache.spark.annotation.Experimental +import org.apache.spark.sql.catalyst.util.CollationFactory @Experimental -case class VarcharType(length: Int) extends AtomicType { +case class 
VarcharType(length: Int) + extends StringType(CollationFactory.UTF8_BINARY_COLLATION_ID, MaxLength(length)) { require(length >= 0, "The length of varchar type cannot be negative.") override def defaultSize: Int = length override def typeName: String = s"varchar($length)" + override def jsonValue: JValue = JString(typeName) override def toString: String = s"VarcharType($length)" private[spark] override def asNullable: VarcharType = this } diff --git a/sql/catalyst/benchmarks/CalendarIntervalBenchmark-jdk21-results.txt b/sql/catalyst/benchmarks/CalendarIntervalBenchmark-jdk21-results.txt index 290568730a22c..24bc5a5efcaae 100644 --- a/sql/catalyst/benchmarks/CalendarIntervalBenchmark-jdk21-results.txt +++ b/sql/catalyst/benchmarks/CalendarIntervalBenchmark-jdk21-results.txt @@ -2,10 +2,10 @@ CalendarInterval ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor CalendarInterval: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Call setInterval & getInterval 1028 1028 0 130.5 7.7 1.0X +Call setInterval & getInterval 1040 1051 16 129.1 7.7 1.0X diff --git a/sql/catalyst/benchmarks/CalendarIntervalBenchmark-results.txt b/sql/catalyst/benchmarks/CalendarIntervalBenchmark-results.txt index 526008a3fced1..a5bd7ce02cc8f 100644 --- a/sql/catalyst/benchmarks/CalendarIntervalBenchmark-results.txt +++ b/sql/catalyst/benchmarks/CalendarIntervalBenchmark-results.txt @@ -2,10 +2,10 @@ CalendarInterval ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 
64-Core Processor CalendarInterval: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Call setInterval & getInterval 1018 1021 4 131.8 7.6 1.0X +Call setInterval & getInterval 1030 1030 0 130.3 7.7 1.0X diff --git a/sql/catalyst/benchmarks/EnumTypeSetBenchmark-jdk21-results.txt b/sql/catalyst/benchmarks/EnumTypeSetBenchmark-jdk21-results.txt index b5635dcb20d33..8b9dd199d9df8 100644 --- a/sql/catalyst/benchmarks/EnumTypeSetBenchmark-jdk21-results.txt +++ b/sql/catalyst/benchmarks/EnumTypeSetBenchmark-jdk21-results.txt @@ -1,105 +1,105 @@ -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Test contains use empty Set: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Use HashSet 1 1 0 1390.0 0.7 1.0X -Use EnumSet 2 2 0 441.2 2.3 0.3X +Use HashSet 3 3 0 291.9 3.4 1.0X +Use EnumSet 4 4 0 227.7 4.4 0.8X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Test contains use 1 item Set: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Use HashSet 2 2 0 485.8 2.1 1.0X -Use EnumSet 2 2 0 544.4 1.8 1.1X +Use HashSet 7 8 1 138.0 7.2 1.0X +Use EnumSet 5 5 0 185.8 5.4 1.3X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Test contains use 3 items Set: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
------------------------------------------------------------------------------------------------------------------------ -Use HashSet 2 2 0 493.1 2.0 1.0X -Use EnumSet 2 2 0 575.2 1.7 1.2X +Use HashSet 14 14 0 71.9 13.9 1.0X +Use EnumSet 5 5 0 186.1 5.4 2.6X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Test contains use 5 items Set: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Use HashSet 9 9 0 107.2 9.3 1.0X -Use EnumSet 2 2 0 534.9 1.9 5.0X +Use HashSet 11 11 1 91.4 10.9 1.0X +Use EnumSet 5 5 0 186.1 5.4 2.0X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Test contains use 10 items Set: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Use HashSet 10 10 0 98.5 10.1 1.0X -Use EnumSet 2 2 0 534.9 1.9 5.4X +Use HashSet 12 13 0 80.5 12.4 1.0X +Use EnumSet 5 5 0 188.4 5.3 2.3X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Test create empty Set: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Use HashSet 0 0 0 408.8 2.4 1.0X -Use EnumSet 1 1 0 136.6 7.3 0.3X +Use HashSet 0 0 0 397.1 2.5 1.0X +Use EnumSet 0 0 0 291.5 3.4 0.7X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Test create 1 item 
Set: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Use HashSet 1 1 0 102.5 9.8 1.0X -Use EnumSet 0 0 0 291.4 3.4 2.8X +Use HashSet 2 2 0 49.9 20.0 1.0X +Use EnumSet 0 0 0 291.2 3.4 5.8X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Test create 3 items Set: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Use HashSet 7 7 0 14.6 68.6 1.0X -Use EnumSet 1 1 0 132.3 7.6 9.1X +Use HashSet 6 6 0 16.0 62.4 1.0X +Use EnumSet 1 1 0 132.4 7.6 8.3X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Test create 5 items Set: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Use HashSet 34 35 0 2.9 342.4 1.0X -Use EnumSet 1 1 0 150.1 6.7 51.4X +Use HashSet 29 30 1 3.4 292.1 1.0X +Use EnumSet 1 1 0 150.4 6.7 43.9X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Test create 10 items Set: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Use HashSet 63 63 1 1.6 627.2 1.0X -Use EnumSet 1 1 0 138.3 7.2 86.8X +Use HashSet 56 56 1 1.8 557.0 1.0X +Use EnumSet 1 1 0 138.6 7.2 77.2X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 
6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Test create and contains use empty Set: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Use HashSet 4 4 0 227.4 4.4 1.0X -Use EnumSet 5 5 0 187.2 5.3 0.8X +Use HashSet 4 4 0 265.6 3.8 1.0X +Use EnumSet 5 5 0 196.1 5.1 0.7X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Test create and contains use 1 item Set: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Use HashSet 14 14 0 70.4 14.2 1.0X -Use EnumSet 7 7 0 150.5 6.6 2.1X +Use HashSet 9 10 0 110.6 9.0 1.0X +Use EnumSet 6 6 0 160.0 6.3 1.4X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Test create and contains use 3 items Set: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Use HashSet 27 28 0 36.6 27.3 1.0X -Use EnumSet 7 7 0 151.3 6.6 4.1X +Use HashSet 22 22 0 45.4 22.0 1.0X +Use EnumSet 6 6 0 163.3 6.1 3.6X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Test create and contains use 5 items Set: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Use HashSet 54 55 1 18.4 54.4 1.0X -Use EnumSet 7 7 0 147.6 6.8 8.0X +Use HashSet 49 49 0 20.4 49.1 1.0X +Use EnumSet 6 6 0 
158.7 6.3 7.8X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Test create and contains use 10 items Set: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------- -Use HashSet 84 85 0 11.9 83.9 1.0X -Use EnumSet 7 7 0 137.2 7.3 11.5X +Use HashSet 76 77 1 13.1 76.2 1.0X +Use EnumSet 6 7 0 159.0 6.3 12.1X diff --git a/sql/catalyst/benchmarks/EnumTypeSetBenchmark-results.txt b/sql/catalyst/benchmarks/EnumTypeSetBenchmark-results.txt index 1794f82b64b11..4b1c3dce2b115 100644 --- a/sql/catalyst/benchmarks/EnumTypeSetBenchmark-results.txt +++ b/sql/catalyst/benchmarks/EnumTypeSetBenchmark-results.txt @@ -1,105 +1,105 @@ -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Test contains use empty Set: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Use HashSet 1 1 0 1441.2 0.7 1.0X -Use EnumSet 2 2 0 563.7 1.8 0.4X +Use HashSet 4 4 0 279.7 3.6 1.0X +Use EnumSet 4 4 0 225.9 4.4 0.8X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Test contains use 1 item Set: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Use HashSet 2 2 0 445.8 2.2 1.0X -Use EnumSet 2 2 0 554.4 1.8 1.2X +Use HashSet 9 9 1 110.9 9.0 1.0X +Use EnumSet 5 5 0 185.6 5.4 1.7X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server 
VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Test contains use 3 items Set: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Use HashSet 2 2 0 547.8 1.8 1.0X -Use EnumSet 2 2 0 561.3 1.8 1.0X +Use HashSet 14 14 2 74.0 13.5 1.0X +Use EnumSet 5 5 0 185.6 5.4 2.5X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Test contains use 5 items Set: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Use HashSet 8 8 0 121.9 8.2 1.0X -Use EnumSet 2 2 0 545.1 1.8 4.5X +Use HashSet 14 14 1 71.1 14.1 1.0X +Use EnumSet 5 5 0 185.7 5.4 2.6X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Test contains use 10 items Set: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Use HashSet 9 9 0 110.1 9.1 1.0X -Use EnumSet 2 2 0 545.0 1.8 5.0X +Use HashSet 15 15 0 68.2 14.7 1.0X +Use EnumSet 5 5 0 185.7 5.4 2.7X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Test create empty Set: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Use HashSet 0 0 0 409.8 2.4 1.0X -Use EnumSet 1 1 0 127.6 7.8 0.3X +Use HashSet 0 0 0 407.9 2.5 1.0X +Use EnumSet 0 0 0 225.2 4.4 0.6X 
-OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Test create 1 item Set: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Use HashSet 2 2 0 60.0 16.7 1.0X -Use EnumSet 1 1 0 145.0 6.9 2.4X +Use HashSet 2 2 0 48.3 20.7 1.0X +Use EnumSet 1 1 0 87.6 11.4 1.8X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Test create 3 items Set: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Use HashSet 10 10 0 10.2 97.7 1.0X -Use EnumSet 1 1 0 137.8 7.3 13.5X +Use HashSet 10 11 1 9.6 103.8 1.0X +Use EnumSet 1 1 0 103.2 9.7 10.7X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Test create 5 items Set: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Use HashSet 33 33 1 3.1 327.8 1.0X -Use EnumSet 1 1 0 137.9 7.3 45.2X +Use HashSet 40 40 0 2.5 395.0 1.0X +Use EnumSet 1 1 0 99.3 10.1 39.2X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Test create 10 items Set: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Use HashSet 60 60 0 1.7 596.5 1.0X -Use EnumSet 1 1 0 131.7 
7.6 78.6X +Use HashSet 64 64 1 1.6 639.0 1.0X +Use EnumSet 1 1 0 108.8 9.2 69.5X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Test create and contains use empty Set: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Use HashSet 4 4 0 250.2 4.0 1.0X -Use EnumSet 5 5 0 190.5 5.2 0.8X +Use HashSet 5 5 0 215.6 4.6 1.0X +Use EnumSet 5 5 0 194.3 5.1 0.9X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Test create and contains use 1 item Set: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Use HashSet 13 14 0 74.9 13.3 1.0X -Use EnumSet 7 7 0 148.9 6.7 2.0X +Use HashSet 10 11 1 98.4 10.2 1.0X +Use EnumSet 6 6 0 159.7 6.3 1.6X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Test create and contains use 3 items Set: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Use HashSet 32 33 1 31.6 31.7 1.0X -Use EnumSet 7 7 0 150.4 6.7 4.8X +Use HashSet 25 26 1 40.4 24.7 1.0X +Use EnumSet 6 6 0 158.8 6.3 3.9X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Test create and contains use 5 items Set: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
------------------------------------------------------------------------------------------------------------------------ -Use HashSet 55 58 9 18.2 55.1 1.0X -Use EnumSet 7 7 0 146.6 6.8 8.1X +Use HashSet 54 55 1 18.4 54.3 1.0X +Use EnumSet 6 7 0 155.6 6.4 8.4X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Test create and contains use 10 items Set: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------- -Use HashSet 82 83 1 12.1 82.3 1.0X -Use EnumSet 7 7 0 145.2 6.9 12.0X +Use HashSet 80 82 1 12.4 80.3 1.0X +Use EnumSet 6 7 0 156.7 6.4 12.6X diff --git a/sql/catalyst/benchmarks/EscapePathBenchmark-jdk21-results.txt b/sql/catalyst/benchmarks/EscapePathBenchmark-jdk21-results.txt index 73f125fc87862..fa12bcbaa3c38 100644 --- a/sql/catalyst/benchmarks/EscapePathBenchmark-jdk21-results.txt +++ b/sql/catalyst/benchmarks/EscapePathBenchmark-jdk21-results.txt @@ -2,23 +2,23 @@ Escape ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Escape Tests: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Legacy 7441 7453 11 0.1 7440.7 1.0X -New 768 770 1 1.3 768.3 9.7X +Legacy 9203 9215 8 0.1 9203.3 1.0X +New 813 816 2 1.2 813.1 11.3X ================================================================================================ Unescape ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 
6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Unescape Tests: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Legacy 4446 4454 5 0.2 4446.4 1.0X -New 605 611 3 1.7 605.1 7.3X +Legacy 4679 4687 5 0.2 4678.5 1.0X +New 590 595 5 1.7 589.7 7.9X diff --git a/sql/catalyst/benchmarks/EscapePathBenchmark-results.txt b/sql/catalyst/benchmarks/EscapePathBenchmark-results.txt index 87f5177d28715..dcdef85ea89d8 100644 --- a/sql/catalyst/benchmarks/EscapePathBenchmark-results.txt +++ b/sql/catalyst/benchmarks/EscapePathBenchmark-results.txt @@ -2,23 +2,23 @@ Escape ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Escape Tests: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Legacy 6851 6867 7 0.1 6850.8 1.0X -New 741 755 38 1.3 741.0 9.2X +Legacy 8620 8633 11 0.1 8620.5 1.0X +New 779 786 4 1.3 779.3 11.1X ================================================================================================ Unescape ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Unescape Tests: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Legacy 5810 5823 15 0.2 5809.8 1.0X -New 597 602 5 1.7 
596.6 9.7X +Legacy 5714 5728 8 0.2 5714.0 1.0X +New 593 597 3 1.7 592.5 9.6X diff --git a/sql/catalyst/benchmarks/GenericArrayDataBenchmark-jdk21-results.txt b/sql/catalyst/benchmarks/GenericArrayDataBenchmark-jdk21-results.txt index b09cc75270118..edf44bac9a395 100644 --- a/sql/catalyst/benchmarks/GenericArrayDataBenchmark-jdk21-results.txt +++ b/sql/catalyst/benchmarks/GenericArrayDataBenchmark-jdk21-results.txt @@ -1,10 +1,10 @@ -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor constructor: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -arrayOfAny 3 3 0 3243.9 0.3 1.0X -arrayOfAnyAsObject 3 3 0 3243.9 0.3 1.0X -arrayOfAnyAsSeq 225 230 6 44.5 22.5 0.0X -arrayOfInt 273 278 5 36.7 27.3 0.0X -arrayOfIntAsObject 274 278 3 36.5 27.4 0.0X +arrayOfAny 6 6 0 1611.9 0.6 1.0X +arrayOfAnyAsObject 6 6 0 1611.9 0.6 1.0X +arrayOfAnyAsSeq 175 175 1 57.3 17.5 0.0X +arrayOfInt 271 272 0 36.8 27.1 0.0X +arrayOfIntAsObject 250 251 1 40.0 25.0 0.0X diff --git a/sql/catalyst/benchmarks/GenericArrayDataBenchmark-results.txt b/sql/catalyst/benchmarks/GenericArrayDataBenchmark-results.txt index 56d0a136c2933..fae20f2b0ac35 100644 --- a/sql/catalyst/benchmarks/GenericArrayDataBenchmark-results.txt +++ b/sql/catalyst/benchmarks/GenericArrayDataBenchmark-results.txt @@ -1,10 +1,10 @@ -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor constructor: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -arrayOfAny 6 6 0 1619.9 0.6 1.0X -arrayOfAnyAsObject 6 6 0 1619.9 0.6 1.0X -arrayOfAnyAsSeq 157 
158 1 63.5 15.7 0.0X -arrayOfInt 252 254 4 39.6 25.2 0.0X -arrayOfIntAsObject 252 253 2 39.6 25.2 0.0X +arrayOfAny 6 6 0 1611.8 0.6 1.0X +arrayOfAnyAsObject 6 6 0 1611.9 0.6 1.0X +arrayOfAnyAsSeq 157 157 2 63.8 15.7 0.0X +arrayOfInt 253 254 0 39.5 25.3 0.0X +arrayOfIntAsObject 253 254 1 39.5 25.3 0.0X diff --git a/sql/catalyst/benchmarks/HashBenchmark-jdk21-results.txt b/sql/catalyst/benchmarks/HashBenchmark-jdk21-results.txt index d246505fc26ca..829099b78e422 100644 --- a/sql/catalyst/benchmarks/HashBenchmark-jdk21-results.txt +++ b/sql/catalyst/benchmarks/HashBenchmark-jdk21-results.txt @@ -2,69 +2,69 @@ single ints ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Hash For single ints: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -interpreted version 2151 2154 4 249.6 4.0 1.0X -codegen version 3580 3597 23 149.9 6.7 0.6X -codegen version 64-bit 3385 3408 33 158.6 6.3 0.6X -codegen HiveHash version 2884 2886 2 186.1 5.4 0.7X +interpreted version 2089 2090 2 257.0 3.9 1.0X +codegen version 3541 3544 4 151.6 6.6 0.6X +codegen version 64-bit 3238 3269 44 165.8 6.0 0.6X +codegen HiveHash version 2563 2568 8 209.5 4.8 0.8X ================================================================================================ single longs ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Hash For single longs: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
------------------------------------------------------------------------------------------------------------------------ -interpreted version 2710 2711 1 198.1 5.0 1.0X -codegen version 5082 5083 0 105.6 9.5 0.5X -codegen version 64-bit 3962 3964 3 135.5 7.4 0.7X -codegen HiveHash version 3309 3310 1 162.2 6.2 0.8X +interpreted version 2718 2719 1 197.5 5.1 1.0X +codegen version 4520 4525 8 118.8 8.4 0.6X +codegen version 64-bit 3863 3874 15 139.0 7.2 0.7X +codegen HiveHash version 3158 3161 4 170.0 5.9 0.9X ================================================================================================ normal ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Hash For normal: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -interpreted version 1425 1425 1 1.5 679.5 1.0X -codegen version 1648 1648 1 1.3 785.8 0.9X -codegen version 64-bit 725 726 2 2.9 345.7 2.0X -codegen HiveHash version 3675 3677 3 0.6 1752.2 0.4X +interpreted version 1335 1341 8 1.6 636.6 1.0X +codegen version 1803 1803 0 1.2 859.9 0.7X +codegen version 64-bit 735 735 0 2.9 350.3 1.8X +codegen HiveHash version 3635 3639 6 0.6 1733.2 0.4X ================================================================================================ array ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Hash For array: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
------------------------------------------------------------------------------------------------------------------------ -interpreted version 1018 1019 1 0.1 7768.0 1.0X -codegen version 3632 3633 2 0.0 27706.7 0.3X -codegen version 64-bit 2340 2342 3 0.1 17849.7 0.4X -codegen HiveHash version 750 751 1 0.2 5721.5 1.4X +interpreted version 958 959 2 0.1 7308.1 1.0X +codegen version 3436 3441 7 0.0 26216.6 0.3X +codegen version 64-bit 2352 2353 1 0.1 17945.7 0.4X +codegen HiveHash version 685 689 5 0.2 5227.3 1.4X ================================================================================================ map ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Hash For map: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -interpreted version 0 0 0 97.3 10.3 1.0X -codegen version 249 249 1 0.0 60732.6 0.0X -codegen version 64-bit 169 170 1 0.0 41356.6 0.0X -codegen HiveHash version 27 28 0 0.1 6709.4 0.0X +interpreted version 0 0 0 90.4 11.1 1.0X +codegen version 271 271 0 0.0 66159.1 0.0X +codegen version 64-bit 185 185 0 0.0 45145.5 0.0X +codegen HiveHash version 30 30 0 0.1 7378.6 0.0X diff --git a/sql/catalyst/benchmarks/HashBenchmark-results.txt b/sql/catalyst/benchmarks/HashBenchmark-results.txt index 571a8a1d82881..30934fe57cd42 100644 --- a/sql/catalyst/benchmarks/HashBenchmark-results.txt +++ b/sql/catalyst/benchmarks/HashBenchmark-results.txt @@ -2,69 +2,69 @@ single ints ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 
64-Core Processor Hash For single ints: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -interpreted version 2276 2276 0 235.9 4.2 1.0X -codegen version 3664 3669 7 146.5 6.8 0.6X -codegen version 64-bit 3478 3483 6 154.3 6.5 0.7X -codegen HiveHash version 3008 3010 3 178.5 5.6 0.8X +interpreted version 2157 2161 6 248.9 4.0 1.0X +codegen version 3655 3660 6 146.9 6.8 0.6X +codegen version 64-bit 3509 3510 1 153.0 6.5 0.6X +codegen HiveHash version 2857 2859 3 187.9 5.3 0.8X ================================================================================================ single longs ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Hash For single longs: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -interpreted version 3006 3007 2 178.6 5.6 1.0X -codegen version 5317 5322 7 101.0 9.9 0.6X -codegen version 64-bit 3761 3765 6 142.8 7.0 0.8X -codegen HiveHash version 3401 3429 41 157.9 6.3 0.9X +interpreted version 3009 3011 4 178.4 5.6 1.0X +codegen version 5332 5336 6 100.7 9.9 0.6X +codegen version 64-bit 3997 3999 2 134.3 7.4 0.8X +codegen HiveHash version 3310 3310 1 162.2 6.2 0.9X ================================================================================================ normal ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Hash For normal: Best Time(ms) Avg 
Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -interpreted version 2549 2549 0 0.8 1215.4 1.0X -codegen version 2291 2292 1 0.9 1092.5 1.1X -codegen version 64-bit 724 726 2 2.9 345.4 3.5X -codegen HiveHash version 3719 3726 10 0.6 1773.2 0.7X +interpreted version 2557 2557 0 0.8 1219.1 1.0X +codegen version 2217 2218 1 0.9 1057.3 1.2X +codegen version 64-bit 703 704 1 3.0 335.3 3.6X +codegen HiveHash version 3734 3741 10 0.6 1780.7 0.7X ================================================================================================ array ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Hash For array: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -interpreted version 990 992 4 0.1 7549.7 1.0X -codegen version 3619 3619 0 0.0 27611.8 0.3X -codegen version 64-bit 2385 2386 0 0.1 18199.3 0.4X -codegen HiveHash version 727 727 0 0.2 5543.0 1.4X +interpreted version 951 954 3 0.1 7252.8 1.0X +codegen version 3450 3452 3 0.0 26319.1 0.3X +codegen version 64-bit 2296 2296 0 0.1 17516.4 0.4X +codegen HiveHash version 703 704 1 0.2 5360.9 1.4X ================================================================================================ map ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Hash For map: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
------------------------------------------------------------------------------------------------------------------------ -interpreted version 0 0 0 101.8 9.8 1.0X -codegen version 240 240 0 0.0 58478.2 0.0X -codegen version 64-bit 169 170 0 0.0 41373.6 0.0X -codegen HiveHash version 29 29 0 0.1 7006.4 0.0X +interpreted version 0 0 0 97.4 10.3 1.0X +codegen version 223 223 0 0.0 54377.2 0.0X +codegen version 64-bit 152 152 0 0.0 37102.0 0.0X +codegen HiveHash version 26 26 0 0.2 6290.5 0.0X diff --git a/sql/catalyst/benchmarks/HashByteArrayBenchmark-jdk21-results.txt b/sql/catalyst/benchmarks/HashByteArrayBenchmark-jdk21-results.txt index 4e4d54c6da6fe..e1cfa115e2730 100644 --- a/sql/catalyst/benchmarks/HashByteArrayBenchmark-jdk21-results.txt +++ b/sql/catalyst/benchmarks/HashByteArrayBenchmark-jdk21-results.txt @@ -2,76 +2,76 @@ Benchmark for MurMurHash 3 and xxHash64 ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Hash byte arrays with length 8: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Murmur3_x86_32 10 10 0 205.6 4.9 1.0X -xxHash 64-bit 11 11 0 190.0 5.3 0.9X -HiveHasher 14 14 0 152.3 6.6 0.7X +Murmur3_x86_32 10 10 0 205.7 4.9 1.0X +xxHash 64-bit 10 10 0 200.6 5.0 1.0X +HiveHasher 14 14 1 151.9 6.6 0.7X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Hash byte arrays with length 16: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Murmur3_x86_32 14 15 0 
145.8 6.9 1.0X -xxHash 64-bit 13 13 0 161.0 6.2 1.1X -HiveHasher 23 23 1 92.4 10.8 0.6X +Murmur3_x86_32 14 15 0 145.5 6.9 1.0X +xxHash 64-bit 12 13 2 168.2 5.9 1.2X +HiveHasher 23 23 1 90.9 11.0 0.6X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Hash byte arrays with length 24: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Murmur3_x86_32 20 20 0 104.9 9.5 1.0X -xxHash 64-bit 15 15 0 139.7 7.2 1.3X -HiveHasher 34 34 0 61.9 16.1 0.6X +Murmur3_x86_32 20 20 0 104.0 9.6 1.0X +xxHash 64-bit 15 15 0 143.1 7.0 1.4X +HiveHasher 34 34 0 62.5 16.0 0.6X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Hash byte arrays with length 31: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Murmur3_x86_32 31 31 0 68.5 14.6 1.0X -xxHash 64-bit 26 26 0 80.0 12.5 1.2X -HiveHasher 45 45 1 46.9 21.3 0.7X +Murmur3_x86_32 32 32 0 65.9 15.2 1.0X +xxHash 64-bit 27 28 0 76.3 13.1 1.2X +HiveHasher 44 44 0 48.1 20.8 0.7X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Hash byte arrays with length 95: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Murmur3_x86_32 68 70 3 30.7 32.6 1.0X -xxHash 64-bit 57 57 0 36.8 27.2 1.2X -HiveHasher 158 158 0 13.3 75.5 0.4X +Murmur3_x86_32 70 71 0 29.8 33.6 1.0X +xxHash 64-bit 58 58 0 36.4 27.5 1.2X 
+HiveHasher 157 157 0 13.4 74.8 0.4X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Hash byte arrays with length 287: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Murmur3_x86_32 208 210 3 10.1 99.2 1.0X -xxHash 64-bit 102 103 0 20.5 48.8 2.0X -HiveHasher 531 532 0 3.9 253.4 0.4X +Murmur3_x86_32 198 198 0 10.6 94.5 1.0X +xxHash 64-bit 102 102 0 20.6 48.6 1.9X +HiveHasher 533 533 0 3.9 254.0 0.4X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Hash byte arrays with length 1055: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Murmur3_x86_32 711 718 12 3.0 338.9 1.0X -xxHash 64-bit 296 298 4 7.1 141.0 2.4X -HiveHasher 2031 2032 2 1.0 968.4 0.4X +Murmur3_x86_32 709 717 13 3.0 338.3 1.0X +xxHash 64-bit 293 294 1 7.2 139.8 2.4X +HiveHasher 2042 2043 1 1.0 973.9 0.3X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Hash byte arrays with length 2079: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Murmur3_x86_32 1398 1401 5 1.5 666.7 1.0X -xxHash 64-bit 552 553 1 3.8 263.3 2.5X -HiveHasher 4026 4026 0 0.5 1919.5 0.3X +Murmur3_x86_32 1388 1388 1 1.5 661.7 1.0X +xxHash 64-bit 550 550 1 3.8 262.2 2.5X +HiveHasher 4050 4052 3 0.5 1931.1 0.3X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 
64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Hash byte arrays with length 8223: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Murmur3_x86_32 5707 5709 3 0.4 2721.3 1.0X -xxHash 64-bit 2074 2074 1 1.0 988.7 2.8X -HiveHasher 15993 15993 0 0.1 7626.2 0.4X +Murmur3_x86_32 5726 5727 1 0.4 2730.6 1.0X +xxHash 64-bit 2068 2069 2 1.0 986.1 2.8X +HiveHasher 16089 16098 12 0.1 7671.8 0.4X diff --git a/sql/catalyst/benchmarks/HashByteArrayBenchmark-results.txt b/sql/catalyst/benchmarks/HashByteArrayBenchmark-results.txt index 236b9e5b404d4..d49fd90cd3707 100644 --- a/sql/catalyst/benchmarks/HashByteArrayBenchmark-results.txt +++ b/sql/catalyst/benchmarks/HashByteArrayBenchmark-results.txt @@ -2,76 +2,76 @@ Benchmark for MurMurHash 3 and xxHash64 ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Hash byte arrays with length 8: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Murmur3_x86_32 11 11 0 184.1 5.4 1.0X -xxHash 64-bit 10 10 0 214.5 4.7 1.2X -HiveHasher 14 14 0 150.9 6.6 0.8X +Murmur3_x86_32 11 12 0 183.2 5.5 1.0X +xxHash 64-bit 10 10 0 213.3 4.7 1.2X +HiveHasher 14 14 0 149.8 6.7 0.8X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Hash byte arrays with length 16: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
------------------------------------------------------------------------------------------------------------------------ -Murmur3_x86_32 17 17 0 123.5 8.1 1.0X -xxHash 64-bit 12 12 0 176.1 5.7 1.4X -HiveHasher 22 23 0 93.3 10.7 0.8X +Murmur3_x86_32 17 17 0 123.2 8.1 1.0X +xxHash 64-bit 12 12 0 175.5 5.7 1.4X +HiveHasher 23 23 0 92.6 10.8 0.8X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Hash byte arrays with length 24: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Murmur3_x86_32 23 24 0 89.4 11.2 1.0X -xxHash 64-bit 14 14 0 145.9 6.9 1.6X -HiveHasher 33 33 0 63.2 15.8 0.7X +Murmur3_x86_32 24 24 0 89.0 11.2 1.0X +xxHash 64-bit 14 15 0 145.4 6.9 1.6X +HiveHasher 33 33 0 62.9 15.9 0.7X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Hash byte arrays with length 31: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Murmur3_x86_32 36 36 0 58.7 17.0 1.0X -xxHash 64-bit 27 28 0 76.4 13.1 1.3X -HiveHasher 42 44 5 49.4 20.2 0.8X +Murmur3_x86_32 36 36 0 59.1 16.9 1.0X +xxHash 64-bit 28 28 0 76.0 13.2 1.3X +HiveHasher 43 44 4 49.2 20.3 0.8X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Hash byte arrays with length 95: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Murmur3_x86_32 87 87 0 24.2 41.3 1.0X 
-xxHash 64-bit 61 62 0 34.1 29.3 1.4X -HiveHasher 158 158 0 13.3 75.2 0.5X +Murmur3_x86_32 87 87 0 24.0 41.6 1.0X +xxHash 64-bit 64 64 0 32.9 30.4 1.4X +HiveHasher 159 159 0 13.2 75.6 0.6X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Hash byte arrays with length 287: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Murmur3_x86_32 244 244 0 8.6 116.3 1.0X -xxHash 64-bit 117 117 1 18.0 55.6 2.1X -HiveHasher 531 531 0 3.9 253.4 0.5X +Murmur3_x86_32 244 245 1 8.6 116.4 1.0X +xxHash 64-bit 123 123 0 17.1 58.5 2.0X +HiveHasher 534 535 0 3.9 254.8 0.5X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Hash byte arrays with length 1055: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Murmur3_x86_32 873 873 0 2.4 416.1 1.0X -xxHash 64-bit 387 388 1 5.4 184.6 2.3X -HiveHasher 2032 2032 0 1.0 968.7 0.4X +Murmur3_x86_32 878 879 0 2.4 418.9 1.0X +xxHash 64-bit 400 401 1 5.2 190.9 2.2X +HiveHasher 2045 2045 0 1.0 974.9 0.4X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Hash byte arrays with length 2079: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Murmur3_x86_32 1704 1704 0 1.2 812.5 1.0X -xxHash 64-bit 762 763 1 2.8 363.2 2.2X -HiveHasher 4024 4024 0 0.5 1918.7 0.4X +Murmur3_x86_32 1715 1715 0 1.2 817.9 1.0X 
+xxHash 64-bit 782 782 0 2.7 372.9 2.2X +HiveHasher 4050 4066 22 0.5 1931.3 0.4X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Hash byte arrays with length 8223: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Murmur3_x86_32 6702 6703 1 0.3 3195.8 1.0X -xxHash 64-bit 2999 3003 6 0.7 1429.8 2.2X -HiveHasher 15981 15981 1 0.1 7620.1 0.4X +Murmur3_x86_32 6744 6747 3 0.3 3216.0 1.0X +xxHash 64-bit 3043 3044 1 0.7 1451.2 2.2X +HiveHasher 16085 16085 0 0.1 7669.8 0.4X diff --git a/sql/catalyst/benchmarks/HexBenchmark-jdk21-results.txt b/sql/catalyst/benchmarks/HexBenchmark-jdk21-results.txt index c1b127d9e7884..88eee350370b9 100644 --- a/sql/catalyst/benchmarks/HexBenchmark-jdk21-results.txt +++ b/sql/catalyst/benchmarks/HexBenchmark-jdk21-results.txt @@ -2,13 +2,13 @@ UnHex Comparison ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Cardinality 1000000: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Common Codecs 4444 4451 11 0.2 4444.1 1.0X -Java 5500 5533 41 0.2 5500.5 0.8X -Spark 3466 3472 6 0.3 3466.0 1.3X -Spark Binary 2625 2627 2 0.4 2625.3 1.7X +Common Codecs 4912 4952 35 0.2 4912.5 1.0X +Java 5772 5781 14 0.2 5772.1 0.9X +Spark 3482 3488 10 0.3 3482.0 1.4X +Spark Binary 2638 2639 0 0.4 2638.3 1.9X diff --git a/sql/catalyst/benchmarks/HexBenchmark-results.txt b/sql/catalyst/benchmarks/HexBenchmark-results.txt index c544346c34d33..adc459ceb8c7c 100644 --- 
a/sql/catalyst/benchmarks/HexBenchmark-results.txt +++ b/sql/catalyst/benchmarks/HexBenchmark-results.txt @@ -2,13 +2,13 @@ UnHex Comparison ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Cardinality 1000000: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Common Codecs 4794 4800 6 0.2 4793.6 1.0X -Java 4247 4262 16 0.2 4247.2 1.1X -Spark 3957 3963 8 0.3 3957.5 1.2X -Spark Binary 2743 2745 2 0.4 2743.4 1.7X +Common Codecs 4900 4906 5 0.2 4900.0 1.0X +Java 4133 4143 10 0.2 4133.2 1.2X +Spark 3987 3988 1 0.3 3986.6 1.2X +Spark Binary 2762 2766 3 0.4 2761.6 1.8X diff --git a/sql/catalyst/benchmarks/InternalRowComparableWrapperBenchmark-jdk21-results.txt b/sql/catalyst/benchmarks/InternalRowComparableWrapperBenchmark-jdk21-results.txt index 1cdf1d8e42753..e852e1f715ba4 100644 --- a/sql/catalyst/benchmarks/InternalRowComparableWrapperBenchmark-jdk21-results.txt +++ b/sql/catalyst/benchmarks/InternalRowComparableWrapperBenchmark-jdk21-results.txt @@ -1,7 +1,7 @@ -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor internal row comparable wrapper: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -toSet 100 102 3 2.0 500.5 1.0X -mergePartitions 183 185 2 1.1 913.5 0.5X +toSet 100 102 3 2.0 501.6 1.0X +mergePartitions 180 182 2 1.1 900.1 0.6X diff --git a/sql/catalyst/benchmarks/InternalRowComparableWrapperBenchmark-results.txt 
b/sql/catalyst/benchmarks/InternalRowComparableWrapperBenchmark-results.txt index b920e5255016e..705c53b53bb3a 100644 --- a/sql/catalyst/benchmarks/InternalRowComparableWrapperBenchmark-results.txt +++ b/sql/catalyst/benchmarks/InternalRowComparableWrapperBenchmark-results.txt @@ -1,7 +1,7 @@ -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor internal row comparable wrapper: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -toSet 113 115 3 1.8 566.2 1.0X -mergePartitions 206 208 2 1.0 1030.5 0.5X +toSet 114 116 2 1.8 570.9 1.0X +mergePartitions 208 209 1 1.0 1040.6 0.5X diff --git a/sql/catalyst/benchmarks/UnsafeProjectionBenchmark-jdk21-results.txt b/sql/catalyst/benchmarks/UnsafeProjectionBenchmark-jdk21-results.txt index 384cce30b67aa..c58763bed6876 100644 --- a/sql/catalyst/benchmarks/UnsafeProjectionBenchmark-jdk21-results.txt +++ b/sql/catalyst/benchmarks/UnsafeProjectionBenchmark-jdk21-results.txt @@ -2,13 +2,13 @@ unsafe projection ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor unsafe projection: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -single long 1327 1327 0 202.4 4.9 1.0X -single nullable long 2362 2377 22 113.6 8.8 0.6X -7 primitive types 7062 7064 2 38.0 26.3 0.2X -7 nullable primitive types 10610 10625 21 25.3 39.5 0.1X +single long 1330 1330 0 201.8 5.0 1.0X +single nullable long 2375 2389 20 113.0 8.8 0.6X +7 primitive types 7116 7120 6 37.7 
26.5 0.2X +7 nullable primitive types 10688 10694 8 25.1 39.8 0.1X diff --git a/sql/catalyst/benchmarks/UnsafeProjectionBenchmark-results.txt b/sql/catalyst/benchmarks/UnsafeProjectionBenchmark-results.txt index 60c49d2917eb5..06fb444c3e730 100644 --- a/sql/catalyst/benchmarks/UnsafeProjectionBenchmark-results.txt +++ b/sql/catalyst/benchmarks/UnsafeProjectionBenchmark-results.txt @@ -2,13 +2,13 @@ unsafe projection ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor unsafe projection: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -single long 1363 1363 1 197.0 5.1 1.0X -single nullable long 2454 2456 2 109.4 9.1 0.6X -7 primitive types 6944 6946 2 38.7 25.9 0.2X -7 nullable primitive types 10300 10314 19 26.1 38.4 0.1X +single long 1380 1382 3 194.5 5.1 1.0X +single nullable long 2449 2450 2 109.6 9.1 0.6X +7 primitive types 7002 7003 2 38.3 26.1 0.2X +7 nullable primitive types 10355 10370 20 25.9 38.6 0.1X diff --git a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/ExpressionInfo.java b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/ExpressionInfo.java index 4200619d3c5f9..310d18ddb3486 100644 --- a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/ExpressionInfo.java +++ b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/ExpressionInfo.java @@ -51,7 +51,7 @@ public class ExpressionInfo { "window_funcs", "xml_funcs", "table_funcs", "url_funcs", "variant_funcs")); private static final Set validSources = - new HashSet<>(Arrays.asList("built-in", "hive", "python_udf", "scala_udf", + new HashSet<>(Arrays.asList("built-in", "hive", 
"python_udf", "scala_udf", "sql_udf", "java_udf", "python_udtf", "internal")); public String getClassName() { diff --git a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/RowBasedKeyValueBatch.java b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/RowBasedKeyValueBatch.java index c057c36ca8204..5074348a1fd6a 100644 --- a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/RowBasedKeyValueBatch.java +++ b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/RowBasedKeyValueBatch.java @@ -174,7 +174,7 @@ public final UnsafeRow getValueRow(int rowId) { */ @Override public final long spill(long size, MemoryConsumer trigger) throws IOException { - logger.warn("Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0."); + logger.debug("Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0."); return 0; } diff --git a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/json/JsonExpressionUtils.java b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/json/JsonExpressionUtils.java index 2bad67d426af6..38bdcbec2069d 100644 --- a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/json/JsonExpressionUtils.java +++ b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/json/JsonExpressionUtils.java @@ -24,7 +24,6 @@ import com.fasterxml.jackson.core.JsonParser; import com.fasterxml.jackson.core.JsonToken; -import org.apache.spark.sql.catalyst.expressions.SharedFactory; import org.apache.spark.sql.catalyst.json.CreateJacksonParser; import org.apache.spark.sql.catalyst.util.GenericArrayData; import org.apache.spark.unsafe.types.UTF8String; diff --git a/sql/catalyst/src/main/java/org/apache/spark/sql/connector/catalog/StagedTable.java b/sql/catalyst/src/main/java/org/apache/spark/sql/connector/catalog/StagedTable.java index 60b250adb41ef..cbaea8cad8582 100644 --- 
a/sql/catalyst/src/main/java/org/apache/spark/sql/connector/catalog/StagedTable.java +++ b/sql/catalyst/src/main/java/org/apache/spark/sql/connector/catalog/StagedTable.java @@ -21,7 +21,9 @@ import org.apache.spark.annotation.Evolving; import org.apache.spark.sql.connector.expressions.Transform; +import org.apache.spark.sql.connector.metric.CustomTaskMetric; import org.apache.spark.sql.connector.write.LogicalWriteInfo; +import org.apache.spark.sql.connector.write.Write; import org.apache.spark.sql.types.StructType; /** @@ -52,4 +54,16 @@ public interface StagedTable extends Table { * table's writers. */ void abortStagedChanges(); + + /** + * Retrieve driver metrics after a commit. This is analogous + * to {@link Write#reportDriverMetrics()}. Note that these metrics must be included in the + * supported custom metrics reported by `supportedCustomMetrics` of the + * {@link StagingTableCatalog} that returned the staged table. + * + * @return an Array of commit metric values. Throws if the table has not been committed yet. 
+ */ + default CustomTaskMetric[] reportDriverMetrics() throws RuntimeException { + return new CustomTaskMetric[0]; + } } diff --git a/sql/catalyst/src/main/java/org/apache/spark/sql/connector/catalog/StagingTableCatalog.java b/sql/catalyst/src/main/java/org/apache/spark/sql/connector/catalog/StagingTableCatalog.java index eead1ade40791..f457a4a3d7863 100644 --- a/sql/catalyst/src/main/java/org/apache/spark/sql/connector/catalog/StagingTableCatalog.java +++ b/sql/catalyst/src/main/java/org/apache/spark/sql/connector/catalog/StagingTableCatalog.java @@ -21,11 +21,13 @@ import org.apache.spark.annotation.Evolving; import org.apache.spark.sql.connector.expressions.Transform; +import org.apache.spark.sql.connector.metric.CustomMetric; import org.apache.spark.sql.connector.write.LogicalWriteInfo; import org.apache.spark.sql.catalyst.analysis.NoSuchNamespaceException; import org.apache.spark.sql.catalyst.analysis.NoSuchTableException; import org.apache.spark.sql.catalyst.analysis.TableAlreadyExistsException; import org.apache.spark.sql.connector.write.BatchWrite; +import org.apache.spark.sql.connector.write.Write; import org.apache.spark.sql.connector.write.WriterCommitMessage; import org.apache.spark.sql.errors.QueryCompilationErrors; import org.apache.spark.sql.types.StructType; @@ -200,4 +202,14 @@ default StagedTable stageCreateOrReplace( return stageCreateOrReplace( ident, CatalogV2Util.v2ColumnsToStructType(columns), partitions, properties); } + + /** + * @return An Array of commit metrics that are supported by the catalog. This is analogous to + * {@link Write#supportedCustomMetrics()}. The corresponding + * {@link StagedTable#reportDriverMetrics()} method must be called to + * retrieve the actual metric values after a commit. The methods are not in the same class + * because the supported metrics are required before the staged table object is created + * and only the staged table object can capture the write metrics during the commit. 
+ */ + default CustomMetric[] supportedCustomMetrics() { return new CustomMetric[0]; } } diff --git a/sql/catalyst/src/main/java/org/apache/spark/sql/connector/catalog/TableCatalog.java b/sql/catalyst/src/main/java/org/apache/spark/sql/connector/catalog/TableCatalog.java index ba3470f85338c..77dbaa7687b41 100644 --- a/sql/catalyst/src/main/java/org/apache/spark/sql/connector/catalog/TableCatalog.java +++ b/sql/catalyst/src/main/java/org/apache/spark/sql/connector/catalog/TableCatalog.java @@ -67,6 +67,11 @@ public interface TableCatalog extends CatalogPlugin { */ String PROP_COMMENT = "comment"; + /** + * A reserved property to specify the collation of the table. + */ + String PROP_COLLATION = "collation"; + /** * A reserved property to specify the provider of the table. */ diff --git a/sql/catalyst/src/main/java/org/apache/spark/sql/connector/util/V2ExpressionSQLBuilder.java b/sql/catalyst/src/main/java/org/apache/spark/sql/connector/util/V2ExpressionSQLBuilder.java index bd2dec9e27be0..49afcd5ebcd50 100644 --- a/sql/catalyst/src/main/java/org/apache/spark/sql/connector/util/V2ExpressionSQLBuilder.java +++ b/sql/catalyst/src/main/java/org/apache/spark/sql/connector/util/V2ExpressionSQLBuilder.java @@ -118,7 +118,7 @@ yield visitBinaryArithmetic( "COT", "ASIN", "ASINH", "ACOS", "ACOSH", "ATAN", "ATANH", "ATAN2", "CBRT", "DEGREES", "RADIANS", "SIGN", "WIDTH_BUCKET", "SUBSTRING", "UPPER", "LOWER", "TRANSLATE", "DATE_ADD", "DATE_DIFF", "TRUNC", "AES_ENCRYPT", "AES_DECRYPT", "SHA1", "SHA2", "MD5", - "CRC32", "BIT_LENGTH", "CHAR_LENGTH", "CONCAT" -> + "CRC32", "BIT_LENGTH", "CHAR_LENGTH", "CONCAT", "RPAD", "LPAD" -> visitSQLFunction(name, expressionsToStringArray(e.children())); case "CASE_WHEN" -> visitCaseWhen(expressionsToStringArray(e.children())); case "TRIM" -> visitTrim("BOTH", expressionsToStringArray(e.children())); diff --git a/sql/catalyst/src/main/java/org/apache/spark/sql/vectorized/ColumnVector.java 
b/sql/catalyst/src/main/java/org/apache/spark/sql/vectorized/ColumnVector.java index bfb1833b731a7..54b62c00283fa 100644 --- a/sql/catalyst/src/main/java/org/apache/spark/sql/vectorized/ColumnVector.java +++ b/sql/catalyst/src/main/java/org/apache/spark/sql/vectorized/ColumnVector.java @@ -69,14 +69,14 @@ public abstract class ColumnVector implements AutoCloseable { public abstract void close(); /** - * Cleans up memory for this column vector if it's not writable. The column vector is not usable - * after this. + * Cleans up memory for this column vector if it's resources are freeable between batches. + * The column vector is not usable after this. * - * If this is a writable column vector, it is a no-op. + * If this is a writable column vector or constant column vector, it is a no-op. */ - public void closeIfNotWritable() { - // By default, we just call close() for all column vectors. If a column vector is writable, it - // should override this method and do nothing. + public void closeIfFreeable() { + // By default, we just call close() for all column vectors. If a column vector is writable or + // constant, it should override this method and do nothing. close(); } diff --git a/sql/catalyst/src/main/java/org/apache/spark/sql/vectorized/ColumnarBatch.java b/sql/catalyst/src/main/java/org/apache/spark/sql/vectorized/ColumnarBatch.java index 52e4115af336a..7ef570a212292 100644 --- a/sql/catalyst/src/main/java/org/apache/spark/sql/vectorized/ColumnarBatch.java +++ b/sql/catalyst/src/main/java/org/apache/spark/sql/vectorized/ColumnarBatch.java @@ -46,12 +46,12 @@ public void close() { } /** - * Called to close all the columns if they are not writable. This is used to clean up memory - * allocated during columnar processing. + * Called to close all the columns if their resources are freeable between batches. + * This is used to clean up memory allocated during columnar processing. 
*/ - public void closeIfNotWritable() { + public void closeIfFreeable() { for (ColumnVector c: columns) { - c.closeIfNotWritable(); + c.closeIfFreeable(); } } diff --git a/sql/catalyst/src/main/java/org/apache/spark/sql/vectorized/ColumnarRow.java b/sql/catalyst/src/main/java/org/apache/spark/sql/vectorized/ColumnarRow.java index aaac980bb332a..ac05981da5a24 100644 --- a/sql/catalyst/src/main/java/org/apache/spark/sql/vectorized/ColumnarRow.java +++ b/sql/catalyst/src/main/java/org/apache/spark/sql/vectorized/ColumnarRow.java @@ -188,6 +188,8 @@ public Object get(int ordinal, DataType dataType) { return getInt(ordinal); } else if (dataType instanceof TimestampType) { return getLong(ordinal); + } else if (dataType instanceof TimestampNTZType) { + return getLong(ordinal); } else if (dataType instanceof ArrayType) { return getArray(ordinal); } else if (dataType instanceof StructType) { diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/CatalystTypeConverters.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/CatalystTypeConverters.scala index 2b2a186f76d9d..fab65251ed51b 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/CatalystTypeConverters.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/CatalystTypeConverters.scala @@ -66,6 +66,8 @@ object CatalystTypeConverters { case arrayType: ArrayType => ArrayConverter(arrayType.elementType) case mapType: MapType => MapConverter(mapType.keyType, mapType.valueType) case structType: StructType => StructConverter(structType) + case CharType(length) => new CharConverter(length) + case VarcharType(length) => new VarcharConverter(length) case _: StringType => StringConverter case DateType if SQLConf.get.datetimeJava8ApiEnabled => LocalDateConverter case DateType => DateConverter @@ -296,6 +298,33 @@ object CatalystTypeConverters { toScala(row.getStruct(column, structType.size)) } + private class CharConverter(length: Int) extends CatalystTypeConverter[Any, 
String, UTF8String] { + override def toCatalystImpl(scalaValue: Any): UTF8String = + CharVarcharCodegenUtils.charTypeWriteSideCheck( + StringConverter.toCatalystImpl(scalaValue), length) + override def toScala(catalystValue: UTF8String): String = if (catalystValue == null) { + null + } else { + CharVarcharCodegenUtils.charTypeWriteSideCheck(catalystValue, length).toString + } + override def toScalaImpl(row: InternalRow, column: Int): String = + CharVarcharCodegenUtils.charTypeWriteSideCheck(row.getUTF8String(column), length).toString + } + + private class VarcharConverter(length: Int) + extends CatalystTypeConverter[Any, String, UTF8String] { + override def toCatalystImpl(scalaValue: Any): UTF8String = + CharVarcharCodegenUtils.varcharTypeWriteSideCheck( + StringConverter.toCatalystImpl(scalaValue), length) + override def toScala(catalystValue: UTF8String): String = if (catalystValue == null) { + null + } else { + CharVarcharCodegenUtils.varcharTypeWriteSideCheck(catalystValue, length).toString + } + override def toScalaImpl(row: InternalRow, column: Int): String = + CharVarcharCodegenUtils.varcharTypeWriteSideCheck(row.getUTF8String(column), length).toString + } + private object StringConverter extends CatalystTypeConverter[Any, String, UTF8String] { override def toCatalystImpl(scalaValue: Any): UTF8String = scalaValue match { case str: String => UTF8String.fromString(str) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/DeserializerBuildHelper.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/DeserializerBuildHelper.scala index 4752434015375..55613b2b20134 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/DeserializerBuildHelper.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/DeserializerBuildHelper.scala @@ -20,11 +20,11 @@ package org.apache.spark.sql.catalyst import org.apache.spark.sql.catalyst.{expressions => exprs} import 
org.apache.spark.sql.catalyst.analysis.{GetColumnByOrdinal, UnresolvedExtractValue} import org.apache.spark.sql.catalyst.encoders.{AgnosticEncoder, AgnosticEncoders, Codec, JavaSerializationCodec, KryoSerializationCodec} -import org.apache.spark.sql.catalyst.encoders.AgnosticEncoders.{ArrayEncoder, BoxedLeafEncoder, DateEncoder, DayTimeIntervalEncoder, InstantEncoder, IterableEncoder, JavaBeanEncoder, JavaBigIntEncoder, JavaDecimalEncoder, JavaEnumEncoder, LocalDateEncoder, LocalDateTimeEncoder, MapEncoder, OptionEncoder, PrimitiveBooleanEncoder, PrimitiveByteEncoder, PrimitiveDoubleEncoder, PrimitiveFloatEncoder, PrimitiveIntEncoder, PrimitiveLongEncoder, PrimitiveShortEncoder, ProductEncoder, ScalaBigIntEncoder, ScalaDecimalEncoder, ScalaEnumEncoder, StringEncoder, TimestampEncoder, TransformingEncoder, UDTEncoder, YearMonthIntervalEncoder} +import org.apache.spark.sql.catalyst.encoders.AgnosticEncoders.{ArrayEncoder, BoxedLeafEncoder, CharEncoder, DateEncoder, DayTimeIntervalEncoder, InstantEncoder, IterableEncoder, JavaBeanEncoder, JavaBigIntEncoder, JavaDecimalEncoder, JavaEnumEncoder, LocalDateEncoder, LocalDateTimeEncoder, MapEncoder, OptionEncoder, PrimitiveBooleanEncoder, PrimitiveByteEncoder, PrimitiveDoubleEncoder, PrimitiveFloatEncoder, PrimitiveIntEncoder, PrimitiveLongEncoder, PrimitiveShortEncoder, ProductEncoder, ScalaBigIntEncoder, ScalaDecimalEncoder, ScalaEnumEncoder, StringEncoder, TimestampEncoder, TransformingEncoder, UDTEncoder, VarcharEncoder, YearMonthIntervalEncoder} import org.apache.spark.sql.catalyst.encoders.EncoderUtils.{externalDataTypeFor, isNativeEncoder} import org.apache.spark.sql.catalyst.expressions.{Expression, GetStructField, IsNull, Literal, MapKeys, MapValues, UpCast} import org.apache.spark.sql.catalyst.expressions.objects.{AssertNotNull, CreateExternalRow, DecodeUsingSerializer, InitializeJavaBean, Invoke, NewInstance, StaticInvoke, UnresolvedCatalystToExternalMap, UnresolvedMapObjects, WrapOption} -import 
org.apache.spark.sql.catalyst.util.{ArrayBasedMapData, DateTimeUtils, IntervalUtils} +import org.apache.spark.sql.catalyst.util.{ArrayBasedMapData, CharVarcharCodegenUtils, DateTimeUtils, IntervalUtils} import org.apache.spark.sql.types._ object DeserializerBuildHelper { @@ -80,6 +80,32 @@ object DeserializerBuildHelper { returnNullable = false) } + def createDeserializerForChar( + path: Expression, + returnNullable: Boolean, + length: Int): Expression = { + val expr = StaticInvoke( + classOf[CharVarcharCodegenUtils], + StringType, + "charTypeWriteSideCheck", + path :: Literal(length) :: Nil, + returnNullable = returnNullable) + createDeserializerForString(expr, returnNullable) + } + + def createDeserializerForVarchar( + path: Expression, + returnNullable: Boolean, + length: Int): Expression = { + val expr = StaticInvoke( + classOf[CharVarcharCodegenUtils], + StringType, + "varcharTypeWriteSideCheck", + path :: Literal(length) :: Nil, + returnNullable = returnNullable) + createDeserializerForString(expr, returnNullable) + } + def createDeserializerForString(path: Expression, returnNullable: Boolean): Expression = { Invoke(path, "toString", ObjectType(classOf[java.lang.String]), returnNullable = returnNullable) @@ -258,6 +284,10 @@ object DeserializerBuildHelper { "withName", createDeserializerForString(path, returnNullable = false) :: Nil, returnNullable = false) + case CharEncoder(length) => + createDeserializerForChar(path, returnNullable = false, length) + case VarcharEncoder(length) => + createDeserializerForVarchar(path, returnNullable = false, length) case StringEncoder => createDeserializerForString(path, returnNullable = false) case _: ScalaDecimalEncoder => diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/QueryPlanningTracker.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/QueryPlanningTracker.scala index 2e14c09bc8193..d1007404158f0 100644 --- 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/QueryPlanningTracker.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/QueryPlanningTracker.scala @@ -94,6 +94,16 @@ object QueryPlanningTracker { * Callbacks after planning phase completion. */ abstract class QueryPlanningTrackerCallback { + /** + * Called when query fails analysis + * + * @param tracker tracker that triggered the callback. + * @param parsedPlan The plan prior to analysis + * see @org.apache.spark.sql.catalyst.analysis.Analyzer + */ + def analysisFailed(tracker: QueryPlanningTracker, parsedPlan: LogicalPlan): Unit = { + // Noop by default for backward compatibility + } /** * Called when query has been analyzed. * @@ -147,6 +157,17 @@ class QueryPlanningTracker( ret } + /** + * Set when the query has been parsed but failed to be analyzed. + * Can be called multiple times upon plan change. + * + * @param parsedPlan The plan prior analysis + * see @org.apache.spark.sql.catalyst.analysis.Analyzer + */ + private[sql] def setAnalysisFailed(parsedPlan: LogicalPlan): Unit = { + trackerCallback.foreach(_.analysisFailed(this, parsedPlan)) + } + /** * Set when the query has been analysed. * Can be called multiple times upon plan change. 
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/SerializerBuildHelper.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/SerializerBuildHelper.scala index daebe15c298f6..089d463ecacbb 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/SerializerBuildHelper.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/SerializerBuildHelper.scala @@ -22,11 +22,11 @@ import scala.language.existentials import org.apache.spark.sql.catalyst.{expressions => exprs} import org.apache.spark.sql.catalyst.DeserializerBuildHelper.expressionWithNullSafety import org.apache.spark.sql.catalyst.encoders.{AgnosticEncoder, AgnosticEncoders, Codec, JavaSerializationCodec, KryoSerializationCodec} -import org.apache.spark.sql.catalyst.encoders.AgnosticEncoders.{ArrayEncoder, BoxedBooleanEncoder, BoxedByteEncoder, BoxedDoubleEncoder, BoxedFloatEncoder, BoxedIntEncoder, BoxedLeafEncoder, BoxedLongEncoder, BoxedShortEncoder, DateEncoder, DayTimeIntervalEncoder, InstantEncoder, IterableEncoder, JavaBeanEncoder, JavaBigIntEncoder, JavaDecimalEncoder, JavaEnumEncoder, LocalDateEncoder, LocalDateTimeEncoder, MapEncoder, OptionEncoder, PrimitiveLeafEncoder, ProductEncoder, ScalaBigIntEncoder, ScalaDecimalEncoder, ScalaEnumEncoder, StringEncoder, TimestampEncoder, TransformingEncoder, UDTEncoder, YearMonthIntervalEncoder} +import org.apache.spark.sql.catalyst.encoders.AgnosticEncoders.{ArrayEncoder, BoxedBooleanEncoder, BoxedByteEncoder, BoxedDoubleEncoder, BoxedFloatEncoder, BoxedIntEncoder, BoxedLeafEncoder, BoxedLongEncoder, BoxedShortEncoder, CharEncoder, DateEncoder, DayTimeIntervalEncoder, InstantEncoder, IterableEncoder, JavaBeanEncoder, JavaBigIntEncoder, JavaDecimalEncoder, JavaEnumEncoder, LocalDateEncoder, LocalDateTimeEncoder, MapEncoder, OptionEncoder, PrimitiveLeafEncoder, ProductEncoder, ScalaBigIntEncoder, ScalaDecimalEncoder, ScalaEnumEncoder, StringEncoder, TimestampEncoder, TransformingEncoder, UDTEncoder, 
VarcharEncoder, YearMonthIntervalEncoder} import org.apache.spark.sql.catalyst.encoders.EncoderUtils.{externalDataTypeFor, isNativeEncoder, lenientExternalDataTypeFor} import org.apache.spark.sql.catalyst.expressions.{BoundReference, CheckOverflow, CreateNamedStruct, Expression, IsNull, KnownNotNull, Literal, UnsafeArrayData} import org.apache.spark.sql.catalyst.expressions.objects._ -import org.apache.spark.sql.catalyst.util.{ArrayData, DateTimeUtils, GenericArrayData, IntervalUtils} +import org.apache.spark.sql.catalyst.util.{ArrayData, CharVarcharCodegenUtils, DateTimeUtils, GenericArrayData, IntervalUtils} import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types._ import org.apache.spark.unsafe.types.UTF8String @@ -63,6 +63,24 @@ object SerializerBuildHelper { Invoke(inputObject, "doubleValue", DoubleType) } + def createSerializerForChar(inputObject: Expression, length: Int): Expression = { + StaticInvoke( + classOf[CharVarcharCodegenUtils], + CharType(length), + "charTypeWriteSideCheck", + createSerializerForString(inputObject) :: Literal(length) :: Nil, + returnNullable = false) + } + + def createSerializerForVarchar(inputObject: Expression, length: Int): Expression = { + StaticInvoke( + classOf[CharVarcharCodegenUtils], + VarcharType(length), + "varcharTypeWriteSideCheck", + createSerializerForString(inputObject) :: Literal(length) :: Nil, + returnNullable = false) + } + def createSerializerForString(inputObject: Expression): Expression = { StaticInvoke( classOf[UTF8String], @@ -298,6 +316,8 @@ object SerializerBuildHelper { case BoxedDoubleEncoder => createSerializerForDouble(input) case JavaEnumEncoder(_) => createSerializerForJavaEnum(input) case ScalaEnumEncoder(_, _) => createSerializerForScalaEnum(input) + case CharEncoder(length) => createSerializerForChar(input, length) + case VarcharEncoder(length) => createSerializerForVarchar(input, length) case StringEncoder => createSerializerForString(input) case ScalaDecimalEncoder(dt) => 
createSerializerForBigDecimal(input, dt) case JavaDecimalEncoder(dt, false) => createSerializerForBigDecimal(input, dt) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala index bed7bea61597f..92cfc4119dd0c 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala @@ -28,6 +28,13 @@ import scala.util.{Failure, Random, Success, Try} import org.apache.spark.{SparkException, SparkThrowable, SparkUnsupportedOperationException} import org.apache.spark.sql.AnalysisException import org.apache.spark.sql.catalyst._ +import org.apache.spark.sql.catalyst.analysis.resolver.{ + AnalyzerBridgeState, + HybridAnalyzer, + Resolver => OperatorResolver, + ResolverExtension, + ResolverGuard +} import org.apache.spark.sql.catalyst.catalog._ import org.apache.spark.sql.catalyst.encoders.OuterScopes import org.apache.spark.sql.catalyst.expressions._ @@ -146,7 +153,26 @@ case class AnalysisContext( // lookup a temporary function. And export to the view metadata. referredTempFunctionNames: mutable.Set[String] = mutable.Set.empty, referredTempVariableNames: Seq[Seq[String]] = Seq.empty, - outerPlan: Option[LogicalPlan] = None) + outerPlan: Option[LogicalPlan] = None, + + /** + * This is a bridge state between this fixed-point [[Analyzer]] and a single-pass [[Resolver]]. + * It's managed ([[setSinglePassResolverBridgeState]] method) by the [[HybridAnalyzer]] - the + * goal is to preserve it correctly between the fixed-point and single-pass runs. + * [[AnalysisContext.reset]] simply propagates it to prevent it from being reset in + * [[Analyzer.execute]]. Normally it's always [[None]], unless + * [[ANALYZER_DUAL_RUN_LEGACY_AND_SINGLE_PASS_RESOLVER]] is set to [[true]]. + * + * See [[AnalyzerBridgeState]] and [[HybridAnalyzer]] for more info. 
+ */ + private var singlePassResolverBridgeState: Option[AnalyzerBridgeState] = None) { + + def setSinglePassResolverBridgeState(bridgeState: Option[AnalyzerBridgeState]): Unit = + singlePassResolverBridgeState = bridgeState + + def getSinglePassResolverBridgeState: Option[AnalyzerBridgeState] = + singlePassResolverBridgeState +} object AnalysisContext { private val value = new ThreadLocal[AnalysisContext]() { @@ -154,7 +180,16 @@ object AnalysisContext { } def get: AnalysisContext = value.get() - def reset(): Unit = value.remove() + + def reset(): Unit = { + // We need to preserve the single-pass resolver bridge state here, since it's managed by the + // [[HybridAnalyzer]] (set or reset to `None`) to avoid it being reset in [[execute]]. + // It acts as a bridge between the single-pass and fixed-point analyzers in the absence of any + // other explicit state. + val prevSinglePassResolverBridgeState = value.get.getSinglePassResolverBridgeState + value.remove() + value.get.setSinglePassResolverBridgeState(prevSinglePassResolverBridgeState) + } private def set(context: AnalysisContext): Unit = value.set(context) @@ -219,9 +254,15 @@ class Analyzer(override val catalogManager: CatalogManager) extends RuleExecutor def executeAndCheck(plan: LogicalPlan, tracker: QueryPlanningTracker): LogicalPlan = { if (plan.analyzed) return plan AnalysisHelper.markInAnalyzer { - val analyzed = executeAndTrack(plan, tracker) - checkAnalysis(analyzed) - analyzed + new HybridAnalyzer( + this, + new ResolverGuard(catalogManager), + new OperatorResolver( + catalogManager, + singlePassResolverExtensions, + singlePassMetadataResolverExtensions + ) + ).apply(plan, tracker) } } @@ -245,6 +286,20 @@ class Analyzer(override val catalogManager: CatalogManager) extends RuleExecutor errorOnExceed = true, maxIterationsSetting = SQLConf.ANALYZER_MAX_ITERATIONS.key) + /** + * Extensions for the single-pass analyzer. + * + * See [[ResolverExtension]] for more info. 
+ */ + val singlePassResolverExtensions: Seq[ResolverExtension] = Nil + + /** + * Extensions used for early resolution of the single-pass analyzer. + * + * See [[ResolverExtension]] for more info. + */ + val singlePassMetadataResolverExtensions: Seq[ResolverExtension] = Nil + /** * Override to provide additional rules for the "Resolution" batch. */ @@ -279,7 +334,8 @@ class Analyzer(override val catalogManager: CatalogManager) extends RuleExecutor CTESubstitution, WindowsSubstitution, EliminateUnions, - SubstituteUnresolvedOrdinals), + SubstituteUnresolvedOrdinals, + EliminateLazyExpression), Batch("Disable Hints", Once, new ResolveHints.DisableHints), Batch("Hints", fixedPoint, @@ -298,6 +354,7 @@ class Analyzer(override val catalogManager: CatalogManager) extends RuleExecutor ResolveFieldNameAndPosition :: AddMetadataColumns :: DeduplicateRelations :: + ResolveCollationName :: new ResolveReferences(catalogManager) :: // Please do not insert any other rules in between. See the TODO comments in rule // ResolveLateralColumnAliasReference for more details. @@ -316,9 +373,12 @@ class Analyzer(override val catalogManager: CatalogManager) extends RuleExecutor ResolveProcedures :: BindProcedures :: ResolveTableSpec :: + ValidateAndStripPipeExpressions :: + ResolveSQLFunctions :: ResolveAliases :: ResolveSubquery :: ResolveSubqueryColumnAliases :: + ResolveDefaultStringTypes :: ResolveWindowOrder :: ResolveWindowFrame :: ResolveNaturalAndUsingJoin :: @@ -978,26 +1038,9 @@ class Analyzer(override val catalogManager: CatalogManager) extends RuleExecutor // If `AnalysisContext.catalogAndNamespace` is non-empty, analyzer will expand single-part names // with it, instead of current catalog and namespace. private def resolveViews(plan: LogicalPlan): LogicalPlan = plan match { - // The view's child should be a logical plan parsed from the `desc.viewText`, the variable - // `viewText` should be defined, or else we throw an error on the generation of the View - // operator. 
- case view @ View(desc, isTempView, child) if !child.resolved => - // Resolve all the UnresolvedRelations and Views in the child. - val newChild = AnalysisContext.withAnalysisContext(desc) { - val nestedViewDepth = AnalysisContext.get.nestedViewDepth - val maxNestedViewDepth = AnalysisContext.get.maxNestedViewDepth - if (nestedViewDepth > maxNestedViewDepth) { - throw QueryCompilationErrors.viewDepthExceedsMaxResolutionDepthError( - desc.identifier, maxNestedViewDepth, view) - } - SQLConf.withExistingConf(View.effectiveSQLConf(desc.viewSQLConfigs, isTempView)) { - executeSameContext(child) - } - } - // Fail the analysis eagerly because outside AnalysisContext, the unresolved operators - // inside a view maybe resolved incorrectly. - checkAnalysis(newChild) - view.copy(child = newChild) + case view: View if !view.child.resolved => + ViewResolution + .resolve(view, resolveChild = executeSameContext, checkAnalysis = checkAnalysis) case p @ SubqueryAlias(_, view: View) => p.copy(child = resolveViews(view)) case _ => plan @@ -1015,7 +1058,7 @@ class Analyzer(override val catalogManager: CatalogManager) extends RuleExecutor case i @ InsertIntoStatement(table, _, _, _, _, _, _) => val relation = table match { case u: UnresolvedRelation if !u.isStreaming => - relationResolution.resolveRelation(u).getOrElse(u) + resolveRelation(u).getOrElse(u) case other => other } @@ -1032,7 +1075,7 @@ class Analyzer(override val catalogManager: CatalogManager) extends RuleExecutor case write: V2WriteCommand => write.table match { case u: UnresolvedRelation if !u.isStreaming => - relationResolution.resolveRelation(u).map(unwrapRelationPlan).map { + resolveRelation(u).map(unwrapRelationPlan).map { case v: View => throw QueryCompilationErrors.writeIntoViewNotAllowedError( v.desc.identifier, write) case r: DataSourceV2Relation => write.withNewTable(r) @@ -1047,12 +1090,12 @@ class Analyzer(override val catalogManager: CatalogManager) extends RuleExecutor } case u: UnresolvedRelation => - 
relationResolution.resolveRelation(u).map(resolveViews).getOrElse(u) + resolveRelation(u).map(resolveViews).getOrElse(u) case r @ RelationTimeTravel(u: UnresolvedRelation, timestamp, version) if timestamp.forall(ts => ts.resolved && !SubqueryExpression.hasSubquery(ts)) => val timeTravelSpec = TimeTravelSpec.create(timestamp, version, conf.sessionLocalTimeZone) - relationResolution.resolveRelation(u, timeTravelSpec).getOrElse(r) + resolveRelation(u, timeTravelSpec).getOrElse(r) case u @ UnresolvedTable(identifier, cmd, suggestAlternative) => lookupTableOrView(identifier).map { @@ -1116,6 +1159,25 @@ class Analyzer(override val catalogManager: CatalogManager) extends RuleExecutor } } } + + def resolveRelation( + unresolvedRelation: UnresolvedRelation, + timeTravelSpec: Option[TimeTravelSpec] = None): Option[LogicalPlan] = { + relationResolution + .resolveRelation( + unresolvedRelation, + timeTravelSpec + ) + .map { relation => + // We put the synchronously resolved relation into the [[AnalyzerBridgeState]] for + // it to be later reused by the single-pass [[Resolver]] to avoid resolving the relation + // metadata twice. 
+ AnalysisContext.get.getSinglePassResolverBridgeState.map { bridgeState => + bridgeState.relationsWithResolvedMetadata.put(unresolvedRelation, relation) + } + relation + } + } } /** Handle INSERT INTO for DSv2 */ @@ -1608,9 +1670,6 @@ class Analyzer(override val catalogManager: CatalogManager) extends RuleExecutor case s: Sort if !s.resolved || s.missingInput.nonEmpty => resolveReferencesInSort(s) - case u: UnresolvedWithCTERelations => - UnresolvedWithCTERelations(this.apply(u.unresolvedPlan), u.cteRelations) - case q: LogicalPlan => logTrace(s"Attempting to resolve ${q.simpleString(conf.maxToStringFields)}") q.mapExpressions(resolveExpressionByPlanChildren(_, q, includeLastResort = true)) @@ -1830,10 +1889,14 @@ class Analyzer(override val catalogManager: CatalogManager) extends RuleExecutor // Replace the index with the corresponding expression in aggregateExpressions. The index is // a 1-base position of aggregateExpressions, which is output columns (select expression) - case Aggregate(groups, aggs, child, hint) if aggs.forall(_.resolved) && + case Aggregate(groups, aggs, child, hint) + if aggs + .filter(!containUnresolvedPipeAggregateOrdinal(_)) + .forall(_.resolved) && groups.exists(containUnresolvedOrdinal) => - val newGroups = groups.map(resolveGroupByExpressionOrdinal(_, aggs)) - Aggregate(newGroups, aggs, child, hint) + val newAggs = aggs.map(resolvePipeAggregateExpressionOrdinal(_, child.output)) + val newGroups = groups.map(resolveGroupByExpressionOrdinal(_, newAggs)) + Aggregate(newGroups, newAggs, child, hint) } private def containUnresolvedOrdinal(e: Expression): Boolean = e match { @@ -1842,6 +1905,11 @@ class Analyzer(override val catalogManager: CatalogManager) extends RuleExecutor case _ => false } + private def containUnresolvedPipeAggregateOrdinal(e: Expression): Boolean = e match { + case UnresolvedAlias(_: UnresolvedPipeAggregateOrdinal, _) => true + case _ => false + } + private def resolveGroupByExpressionOrdinal( expr: Expression, aggs: 
Seq[Expression]): Expression = expr match { @@ -1877,6 +1945,17 @@ class Analyzer(override val catalogManager: CatalogManager) extends RuleExecutor } } + private def resolvePipeAggregateExpressionOrdinal( + expr: NamedExpression, + inputs: Seq[Attribute]): NamedExpression = expr match { + case UnresolvedAlias(UnresolvedPipeAggregateOrdinal(index), _) => + // In this case, the user applied the SQL pipe aggregate operator ("|> AGGREGATE") and used + // ordinals in its GROUP BY clause. This expression then refers to the i-th attribute of the + // child operator (one-based). Here we resolve the ordinal to the corresponding attribute. + inputs(index - 1) + case other => + other + } /** * Checks whether a function identifier referenced by an [[UnresolvedFunction]] is defined in the @@ -2190,23 +2269,12 @@ class Analyzer(override val catalogManager: CatalogManager) extends RuleExecutor * can resolve outer references. * * Outer references of the subquery are updated as children of Subquery expression. - * - * If hasExplicitOuterRefs is true, the subquery should have an explicit outer reference, - * instead of common `UnresolvedAttribute`s. In this case, tries to resolve inner and outer - * references separately. 
*/ private def resolveSubQuery( e: SubqueryExpression, - outer: LogicalPlan, - hasExplicitOuterRefs: Boolean = false)( + outer: LogicalPlan)( f: (LogicalPlan, Seq[Expression]) => SubqueryExpression): SubqueryExpression = { - val newSubqueryPlan = if (hasExplicitOuterRefs) { - executeSameContext(e.plan).transformAllExpressionsWithPruning( - _.containsPattern(UNRESOLVED_OUTER_REFERENCE)) { - case u: UnresolvedOuterReference => - resolveOuterReference(u.nameParts, outer).getOrElse(u) - } - } else AnalysisContext.withOuterPlan(outer) { + val newSubqueryPlan = AnalysisContext.withOuterPlan(outer) { executeSameContext(e.plan) } @@ -2231,11 +2299,10 @@ class Analyzer(override val catalogManager: CatalogManager) extends RuleExecutor */ private def resolveSubQueries(plan: LogicalPlan, outer: LogicalPlan): LogicalPlan = { plan.transformAllExpressionsWithPruning(_.containsPattern(PLAN_EXPRESSION), ruleId) { - case s @ ScalarSubquery(sub, _, exprId, _, _, _, _, hasExplicitOuterRefs) - if !sub.resolved => - resolveSubQuery(s, outer, hasExplicitOuterRefs)(ScalarSubquery(_, _, exprId)) - case e @ Exists(sub, _, exprId, _, _, hasExplicitOuterRefs) if !sub.resolved => - resolveSubQuery(e, outer, hasExplicitOuterRefs)(Exists(_, _, exprId)) + case s @ ScalarSubquery(sub, _, exprId, _, _, _, _) if !sub.resolved => + resolveSubQuery(s, outer)(ScalarSubquery(_, _, exprId)) + case e @ Exists(sub, _, exprId, _, _) if !sub.resolved => + resolveSubQuery(e, outer)(Exists(_, _, exprId)) case InSubquery(values, l @ ListQuery(_, _, exprId, _, _, _)) if values.forall(_.resolved) && !l.resolved => val expr = resolveSubQuery(l, outer)((plan, exprs) => { @@ -2298,6 +2365,277 @@ class Analyzer(override val catalogManager: CatalogManager) extends RuleExecutor } } + /** + * This rule resolves SQL function expressions. It pulls out function inputs and place them + * in a separate [[Project]] node below the operator and replace the SQL function with its + * actual function body. 
SQL function expressions in [[Aggregate]] are handled in a special + * way. Non-aggregated SQL functions in the aggregate expressions of an Aggregate need to be + * pulled out into a Project above the Aggregate before replacing the SQL function expressions + * with actual function bodies. For example: + * + * Before: + * Aggregate [c1] [foo(c1), foo(max(c2)), sum(foo(c2)) AS sum] + * +- Relation [c1, c2] + * + * After: + * Project [foo(c1), foo(max_c2), sum] + * +- Aggregate [c1] [c1, max(c2) AS max_c2, sum(foo(c2)) AS sum] + * +- Relation [c1, c2] + */ + object ResolveSQLFunctions extends Rule[LogicalPlan] { + + private def hasSQLFunctionExpression(exprs: Seq[Expression]): Boolean = { + exprs.exists(_.find(_.isInstanceOf[SQLFunctionExpression]).nonEmpty) + } + + /** + * Check if the function input contains aggregate expressions. + */ + private def checkFunctionInput(f: SQLFunctionExpression): Unit = { + if (f.inputs.exists(AggregateExpression.containsAggregate)) { + // The input of a SQL function should not contain aggregate functions after + // `extractAndRewrite`. If there are aggregate functions, it means they are + // nested in another aggregate function, which is not allowed. + // For example: SELECT sum(foo(sum(c1))) FROM t + // We have to throw the error here because otherwise the query plan after + // resolving the SQL function will not be valid. + throw new AnalysisException( + errorClass = "NESTED_AGGREGATE_FUNCTION", + messageParameters = Map.empty) + } + } + + /** + * Resolve a SQL function expression as a logical plan check if it can be analyzed. + */ + private def resolve(f: SQLFunctionExpression): LogicalPlan = { + // Validate the SQL function input. + checkFunctionInput(f) + val plan = v1SessionCatalog.makeSQLFunctionPlan(f.name, f.function, f.inputs) + val resolved = SQLFunctionContext.withSQLFunction { + // Resolve the SQL function plan using its context. 
+ val conf = new SQLConf() + f.function.getSQLConfigs.foreach { case (k, v) => conf.settings.put(k, v) } + SQLConf.withExistingConf(conf) { + executeSameContext(plan) + } + } + // Fail the analysis eagerly if a SQL function cannot be resolved using its input. + SimpleAnalyzer.checkAnalysis(resolved) + resolved + } + + /** + * Rewrite SQL function expressions into actual resolved function bodies and extract + * function inputs into the given project list. + */ + private def rewriteSQLFunctions[E <: Expression]( + expression: E, + projectList: ArrayBuffer[NamedExpression]): E = { + val newExpr = expression match { + case f: SQLFunctionExpression if !hasSQLFunctionExpression(f.inputs) && + // Make sure LateralColumnAliasReference in parameters is resolved and eliminated first. + // Otherwise, the projectList can contain the LateralColumnAliasReference, which will be + // pushed down to a Project without the 'referenced' alias by LCA present, leaving it + // unresolved. + !f.inputs.exists(_.containsPattern(LATERAL_COLUMN_ALIAS_REFERENCE)) => + withPosition(f) { + val plan = resolve(f) + // Extract the function input project list from the SQL function plan and + // inline the SQL function expression. + plan match { + case Project(body :: Nil, Project(aliases, _: LocalRelation)) => + projectList ++= aliases + SQLScalarFunction(f.function, aliases.map(_.toAttribute), body) + case o => + throw new AnalysisException( + errorClass = "INVALID_SQL_FUNCTION_PLAN_STRUCTURE", + messageParameters = Map("plan" -> o.toString)) + } + } + case o => o.mapChildren(rewriteSQLFunctions(_, projectList)) + } + newExpr.asInstanceOf[E] + } + + /** + * Check if the given expression contains expressions that should be extracted, + * i.e. non-aggregated SQL functions with non-foldable inputs. + */ + private def shouldExtract(e: Expression): Boolean = e match { + // Return false if the expression is already an aggregate expression. 
+ case _: AggregateExpression => false + case _: SQLFunctionExpression => true + case _: LeafExpression => false + case o => o.children.exists(shouldExtract) + } + + /** + * Extract aggregate expressions from the given expression and replace + * them with attribute references. + * Example: + * Before: foo(c1) + foo(max(c2)) + max(foo(c2)) + * After: foo(c1) + foo(max_c2) + max_foo_c2 + * Extracted expressions: [c1, max(c2) AS max_c2, max(foo(c2)) AS max_foo_c2] + */ + private def extractAndRewrite[T <: Expression]( + expression: T, + extractedExprs: ArrayBuffer[NamedExpression]): T = { + val newExpr = expression match { + case e if !shouldExtract(e) => + val exprToAdd: NamedExpression = e match { + case o: OuterReference => Alias(o, toPrettySQL(o.e))() + case ne: NamedExpression => ne + case o => Alias(o, toPrettySQL(o))() + } + extractedExprs += exprToAdd + exprToAdd.toAttribute + case f: SQLFunctionExpression => + val newInputs = f.inputs.map(extractAndRewrite(_, extractedExprs)) + f.copy(inputs = newInputs) + case o => o.mapChildren(extractAndRewrite(_, extractedExprs)) + } + newExpr.asInstanceOf[T] + } + + /** + * Replace all [[SQLFunctionExpression]]s in an expression with attribute references + * from the aliasMap. + */ + private def replaceSQLFunctionWithAttr[T <: Expression]( + expr: T, + aliasMap: mutable.HashMap[Expression, Alias]): T = { + expr.transform { + case f: SQLFunctionExpression if aliasMap.contains(f.canonicalized) => + aliasMap(f.canonicalized).toAttribute + }.asInstanceOf[T] + } + + private def rewrite(plan: LogicalPlan): LogicalPlan = plan match { + // Return if a sub-tree does not contain SQLFunctionExpression. 
+ case p: LogicalPlan if !p.containsPattern(SQL_FUNCTION_EXPRESSION) => p + + case f @ Filter(cond, a: Aggregate) + if !f.resolved || AggregateExpression.containsAggregate(cond) || + ResolveGroupingAnalytics.hasGroupingFunction(cond) || + cond.containsPattern(TEMP_RESOLVED_COLUMN) => + // If the filter's condition contains aggregate expressions or grouping expressions or temp + // resolved column, we cannot rewrite both the filter and the aggregate until they are + // resolved by ResolveAggregateFunctions or ResolveGroupingAnalytics, because rewriting SQL + // functions in aggregate can add an additional project on top of the aggregate + // which breaks the pattern matching in those rules. + f.copy(child = a.copy(child = rewrite(a.child))) + + case h @ UnresolvedHaving(_, a: Aggregate) => + // Similarly UnresolvedHaving should be resolved by ResolveAggregateFunctions first + // before rewriting aggregate. + h.copy(child = a.copy(child = rewrite(a.child))) + + case a: Aggregate if a.resolved && hasSQLFunctionExpression(a.expressions) => + val child = rewrite(a.child) + // Extract SQL functions in the grouping expressions and place them in a project list + // below the current aggregate. Also update their appearances in the aggregate expressions. + val bottomProjectList = ArrayBuffer.empty[NamedExpression] + val aliasMap = mutable.HashMap.empty[Expression, Alias] + val newGrouping = a.groupingExpressions.map { expr => + expr.transformDown { + case f: SQLFunctionExpression => + val alias = aliasMap.getOrElseUpdate(f.canonicalized, Alias(f, f.name)()) + bottomProjectList += alias + alias.toAttribute + } + } + val aggregateExpressions = a.aggregateExpressions.map( + replaceSQLFunctionWithAttr(_, aliasMap)) + + // Rewrite SQL functions in the aggregate expressions that are not wrapped in + // aggregate functions. They need to be extracted into a project list above the + // current aggregate. 
+ val aggExprs = ArrayBuffer.empty[NamedExpression] + val topProjectList = aggregateExpressions.map(extractAndRewrite(_, aggExprs)) + + // Rewrite SQL functions in the new aggregate expressions that are wrapped inside + // aggregate functions. + val newAggExprs = aggExprs.map(rewriteSQLFunctions(_, bottomProjectList)) + + val bottomProject = if (bottomProjectList.nonEmpty) { + Project(child.output ++ bottomProjectList, child) + } else { + child + } + val newAgg = if (newGrouping.nonEmpty || newAggExprs.nonEmpty) { + a.copy( + groupingExpressions = newGrouping, + aggregateExpressions = newAggExprs.toSeq, + child = bottomProject) + } else { + bottomProject + } + if (topProjectList.nonEmpty) Project(topProjectList, newAgg) else newAgg + + case p: Project if p.resolved && hasSQLFunctionExpression(p.expressions) => + val newChild = rewrite(p.child) + val projectList = ArrayBuffer.empty[NamedExpression] + val newPList = p.projectList.map(rewriteSQLFunctions(_, projectList)) + if (newPList != newChild.output) { + p.copy(newPList, Project(newChild.output ++ projectList, newChild)) + } else { + assert(projectList.isEmpty) + p.copy(child = newChild) + } + + case f: Filter if f.resolved && hasSQLFunctionExpression(f.expressions) => + val newChild = rewrite(f.child) + val projectList = ArrayBuffer.empty[NamedExpression] + val newCond = rewriteSQLFunctions(f.condition, projectList) + if (newCond != f.condition) { + Project(f.output, Filter(newCond, Project(newChild.output ++ projectList, newChild))) + } else { + assert(projectList.isEmpty) + f.copy(child = newChild) + } + + case j: Join if j.resolved && hasSQLFunctionExpression(j.expressions) => + val newLeft = rewrite(j.left) + val newRight = rewrite(j.right) + val projectList = ArrayBuffer.empty[NamedExpression] + val joinCond = j.condition.map(rewriteSQLFunctions(_, projectList)) + if (joinCond != j.condition) { + // Join condition cannot have non-deterministic expressions. 
We can safely + // replace the aliases with the original SQL function input expressions. + val aliasMap = projectList.collect { case a: Alias => a.toAttribute -> a.child }.toMap + val newJoinCond = joinCond.map(_.transform { + case a: Attribute => aliasMap.getOrElse(a, a) + }) + j.copy(left = newLeft, right = newRight, condition = newJoinCond) + } else { + assert(projectList.isEmpty) + j.copy(left = newLeft, right = newRight) + } + + case o: LogicalPlan if o.resolved && hasSQLFunctionExpression(o.expressions) => + o.transformExpressionsWithPruning(_.containsPattern(SQL_FUNCTION_EXPRESSION)) { + case f: SQLFunctionExpression => + f.failAnalysis( + errorClass = "UNSUPPORTED_SQL_UDF_USAGE", + messageParameters = Map( + "functionName" -> toSQLId(f.function.name.nameParts), + "nodeName" -> o.nodeName.toString)) + } + + case p: LogicalPlan => p.mapChildren(rewrite) + } + + def apply(plan: LogicalPlan): LogicalPlan = { + // Only rewrite SQL functions when they are not in nested function calls. + if (SQLFunctionContext.get.nestedSQLFunctionDepth > 0) { + plan + } else { + rewrite(plan) + } + } + } + /** * Turns projections that contain aggregate expressions into aggregations. */ @@ -2782,6 +3120,9 @@ class Analyzer(override val catalogManager: CatalogManager) extends RuleExecutor ne case e: Expression if e.foldable => e // No need to create an attribute reference if it will be evaluated as a Literal. + case e: SortOrder => + // For SortOder just recursively extract the from child expression. + e.copy(child = extractExpr(e.child)) case e: NamedArgumentExpression => // For NamedArgumentExpression, we extract the value and replace it with // an AttributeReference (with an internal column name, e.g. "_w0"). @@ -3653,7 +3994,6 @@ object CleanupAliases extends Rule[LogicalPlan] with AliasHelper { /** * Ignore event time watermark in batch query, which is only supported in Structured Streaming. - * TODO: add this rule into analyzer rule list. 
*/ object EliminateEventTimeWatermark extends Rule[LogicalPlan] { override def apply(plan: LogicalPlan): LogicalPlan = plan.resolveOperatorsWithPruning( diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/AnsiStringPromotionTypeCoercion.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/AnsiStringPromotionTypeCoercion.scala index 8345a4b9637e2..e7be95bc645ea 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/AnsiStringPromotionTypeCoercion.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/AnsiStringPromotionTypeCoercion.scala @@ -42,6 +42,7 @@ import org.apache.spark.sql.types.{ IntegralType, LongType, NullType, + StringHelper, StringType, StringTypeExpression, TimestampType @@ -99,7 +100,7 @@ object AnsiStringPromotionTypeCoercion { case (_: StringType, _: AnsiIntervalType) => None // [SPARK-50060] If a binary operation contains two collated string types with different // collation IDs, we can't decide which collation ID the result should have. 
- case (st1: StringType, st2: StringType) if st1.collationId != st2.collationId => None + case (st1: StringType, st2: StringType) => StringHelper.tightestCommonString(st1, st2) case (_: StringType, a: AtomicType) => Some(a) case (other, st: StringType) if !other.isInstanceOf[StringType] => findWiderTypeForString(st, other) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/AnsiTypeCoercion.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/AnsiTypeCoercion.scala index 3ba17c8b8e1a3..aa977b240007b 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/AnsiTypeCoercion.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/AnsiTypeCoercion.scala @@ -102,6 +102,8 @@ object AnsiTypeCoercion extends TypeCoercionBase { case (NullType, t1) => Some(t1) case (t1, NullType) => Some(t1) + case(s1: StringType, s2: StringType) => StringHelper.tightestCommonString(s1, s2) + case (t1: IntegralType, t2: DecimalType) if t2.isWiderThan(t1) => Some(t2) case (t1: DecimalType, t2: IntegralType) if t1.isWiderThan(t2) => @@ -168,7 +170,12 @@ object AnsiTypeCoercion extends TypeCoercionBase { // If a function expects a StringType, no StringType instance should be implicitly cast to // StringType with a collation that's not accepted (aka. lockdown unsupported collations). - case (_: StringType, _: StringType) => None + case (s1: StringType, s2: StringType) => + if (s1.collationId == s2.collationId && StringHelper.isMoreConstrained(s1, s2)) { + Some(s2) + } else { + None + } case (_: StringType, _: AbstractStringType) => None // If a function expects integral type, fractional input is not allowed. 
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CTESubstitution.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CTESubstitution.scala index ff0dbcd7ef153..50f149bb28064 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CTESubstitution.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CTESubstitution.scala @@ -123,7 +123,7 @@ object CTESubstitution extends Rule[LogicalPlan] { startOfQuery: Boolean = true): Unit = { val resolver = conf.resolver plan match { - case UnresolvedWith(child, relations) => + case UnresolvedWith(child, relations, _) => val newNames = ArrayBuffer.empty[String] newNames ++= outerCTERelationNames relations.foreach { @@ -149,10 +149,15 @@ object CTESubstitution extends Rule[LogicalPlan] { plan: LogicalPlan, cteDefs: ArrayBuffer[CTERelationDef]): LogicalPlan = { plan.resolveOperatorsUp { - case UnresolvedWith(child, relations) => - val resolvedCTERelations = - resolveCTERelations(relations, isLegacy = true, forceInline = false, Seq.empty, cteDefs) - substituteCTE(child, alwaysInline = true, resolvedCTERelations) + case cte @ UnresolvedWith(child, relations, allowRecursion) => + if (allowRecursion) { + cte.failAnalysis( + errorClass = "RECURSIVE_CTE_IN_LEGACY_MODE", + messageParameters = Map.empty) + } + val resolvedCTERelations = resolveCTERelations(relations, isLegacy = true, + forceInline = false, Seq.empty, cteDefs, allowRecursion) + substituteCTE(child, alwaysInline = true, resolvedCTERelations, None) } } @@ -202,14 +207,21 @@ object CTESubstitution extends Rule[LogicalPlan] { var firstSubstituted: Option[LogicalPlan] = None val newPlan = plan.resolveOperatorsDownWithPruning( _.containsAnyPattern(UNRESOLVED_WITH, PLAN_EXPRESSION)) { - case UnresolvedWith(child: LogicalPlan, relations) => + // allowRecursion flag is set to `True` by the parser if the `RECURSIVE` keyword is used. 
+ case cte @ UnresolvedWith(child: LogicalPlan, relations, allowRecursion) => + if (allowRecursion && forceInline) { + cte.failAnalysis( + errorClass = "RECURSIVE_CTE_WHEN_INLINING_IS_FORCED", + messageParameters = Map.empty) + } val resolvedCTERelations = - resolveCTERelations(relations, isLegacy = false, forceInline, outerCTEDefs, cteDefs) ++ - outerCTEDefs + resolveCTERelations(relations, isLegacy = false, forceInline, outerCTEDefs, cteDefs, + allowRecursion) ++ outerCTEDefs val substituted = substituteCTE( traverseAndSubstituteCTE(child, forceInline, resolvedCTERelations, cteDefs)._1, forceInline, - resolvedCTERelations) + resolvedCTERelations, + None) if (firstSubstituted.isEmpty) { firstSubstituted = Some(substituted) } @@ -228,7 +240,8 @@ object CTESubstitution extends Rule[LogicalPlan] { isLegacy: Boolean, forceInline: Boolean, outerCTEDefs: Seq[(String, CTERelationDef)], - cteDefs: ArrayBuffer[CTERelationDef]): Seq[(String, CTERelationDef)] = { + cteDefs: ArrayBuffer[CTERelationDef], + allowRecursion: Boolean): Seq[(String, CTERelationDef)] = { val alwaysInline = isLegacy || forceInline var resolvedCTERelations = if (alwaysInline) { Seq.empty @@ -247,30 +260,116 @@ object CTESubstitution extends Rule[LogicalPlan] { // NOTE: we must call `traverseAndSubstituteCTE` before `substituteCTE`, as the relations // in the inner CTE have higher priority over the relations in the outer CTE when resolving // inner CTE relations. For example: - // WITH t1 AS (SELECT 1) - // t2 AS ( - // WITH t1 AS (SELECT 2) - // WITH t3 AS (SELECT * FROM t1) - // ) - // t3 should resolve the t1 to `SELECT 2` instead of `SELECT 1`. - traverseAndSubstituteCTE(relation, forceInline, resolvedCTERelations, cteDefs)._1 + // WITH + // t1 AS (SELECT 1), + // t2 AS ( + // WITH + // t1 AS (SELECT 2), + // t3 AS (SELECT * FROM t1) + // SELECT * FROM t1 + // ) + // SELECT * FROM t2 + // t3 should resolve the t1 to `SELECT 2` ("inner" t1) instead of `SELECT 1`. 
+ // + // When recursion allowed (RECURSIVE keyword used): + // Consider following example: + // WITH + // t1 AS (SELECT 1), + // t2 AS ( + // WITH RECURSIVE + // t1 AS ( + // SELECT 1 AS level + // UNION ( + // WITH t3 AS (SELECT level + 1 FROM t1 WHERE level < 10) + // SELECT * FROM t3 + // ) + // ) + // SELECT * FROM t1 + // ) + // SELECT * FROM t2 + // t1 reference within t3 would initially resolve to outer `t1` (SELECT 1), as the inner t1 + // is not yet known. Therefore, we need to remove definitions that conflict with current + // relation `name` from the list of `outerCTEDefs` entering `traverseAndSubstituteCTE()`. + // NOTE: It will be recognized later in the code that this is actually a self-reference + // (reference to the inner t1). + val nonConflictingCTERelations = if (allowRecursion) { + resolvedCTERelations.filterNot { + case (cteName, cteDef) => cteDef.conf.resolver(cteName, name) + } + } else { + resolvedCTERelations + } + traverseAndSubstituteCTE(relation, forceInline, nonConflictingCTERelations, cteDefs)._1 } - // CTE definition can reference a previous one - val substituted = substituteCTE(innerCTEResolved, alwaysInline, resolvedCTERelations) + + // If recursion is allowed (RECURSIVE keyword specified) + // then it has higher priority than outer or previous relations. + // Therefore, we construct a `CTERelationDef` for the current relation. + // Later if we encounter unresolved relation which we need to find which CTE Def it is + // referencing to, we first check if it is a reference to this one. If yes, then we set the + // reference as being recursive. + val recursiveCTERelation = if (allowRecursion) { + Some(name -> CTERelationDef(relation)) + } else { + None + } + // CTE definition can reference a previous one or itself if recursion allowed. 
+ val substituted = substituteCTE(innerCTEResolved, alwaysInline, + resolvedCTERelations, recursiveCTERelation) val cteRelation = CTERelationDef(substituted) if (!alwaysInline) { cteDefs += cteRelation } + // Prepending new CTEs makes sure that those have higher priority over outer ones. resolvedCTERelations +:= (name -> cteRelation) } resolvedCTERelations } + /** + * This function is called from `substituteCTE` to actually substitute unresolved relations + * with CTE references. + */ + private def resolveWithCTERelations( + table: String, + alwaysInline: Boolean, + cteRelations: Seq[(String, CTERelationDef)], + recursiveCTERelation: Option[(String, CTERelationDef)], + unresolvedRelation: UnresolvedRelation): LogicalPlan = { + if (recursiveCTERelation.isDefined && conf.resolver(recursiveCTERelation.get._1, table)) { + // self-reference is found + recursiveCTERelation.map { + case (_, d) => + SubqueryAlias(table, + CTERelationRef(d.id, d.resolved, d.output, d.isStreaming, recursive = true)) + }.get + } else { + cteRelations + .find(r => conf.resolver(r._1, table)) + .map { + case (_, d) => + if (alwaysInline) { + d.child + } else { + // Add a `SubqueryAlias` for hint-resolving rules to match relation names. + // This is a non-recursive reference, recursive parameter is by default set to false + SubqueryAlias(table, + CTERelationRef(d.id, d.resolved, d.output, d.isStreaming)) + } + } + .getOrElse(unresolvedRelation) + } + } + + /** + * Substitute unresolved relations in the plan with CTE references (CTERelationRef). 
+ */ private def substituteCTE( plan: LogicalPlan, alwaysInline: Boolean, - cteRelations: Seq[(String, CTERelationDef)]): LogicalPlan = { + cteRelations: Seq[(String, CTERelationDef)], + recursiveCTERelation: Option[(String, CTERelationDef)]): LogicalPlan = { plan.resolveOperatorsUpWithPruning( _.containsAnyPattern(RELATION_TIME_TRAVEL, UNRESOLVED_RELATION, PLAN_EXPRESSION, UNRESOLVED_IDENTIFIER)) { @@ -279,28 +378,29 @@ object CTESubstitution extends Rule[LogicalPlan] { throw QueryCompilationErrors.timeTravelUnsupportedError(toSQLId(table)) case u @ UnresolvedRelation(Seq(table), _, _) => - cteRelations.find(r => plan.conf.resolver(r._1, table)).map { case (_, d) => - if (alwaysInline) { - d.child - } else { - // Add a `SubqueryAlias` for hint-resolving rules to match relation names. - SubqueryAlias(table, CTERelationRef(d.id, d.resolved, d.output, d.isStreaming)) - } - }.getOrElse(u) + resolveWithCTERelations(table, alwaysInline, cteRelations, + recursiveCTERelation, u) case p: PlanWithUnresolvedIdentifier => // We must look up CTE relations first when resolving `UnresolvedRelation`s, // but we can't do it here as `PlanWithUnresolvedIdentifier` is a leaf node - // and may produce `UnresolvedRelation` later. - // Here we wrap it with `UnresolvedWithCTERelations` so that we can - // delay the CTE relations lookup after `PlanWithUnresolvedIdentifier` is resolved. - UnresolvedWithCTERelations(p, cteRelations) + // and may produce `UnresolvedRelation` later. Instead, we delay CTE resolution + // by moving it to the planBuilder of the corresponding `PlanWithUnresolvedIdentifier`. + p.copy(planBuilder = (nameParts, children) => { + p.planBuilder.apply(nameParts, children) match { + case u @ UnresolvedRelation(Seq(table), _, _) => + resolveWithCTERelations(table, alwaysInline, cteRelations, + recursiveCTERelation, u) + case other => other + } + }) case other => // This cannot be done in ResolveSubquery because ResolveSubquery does not know the CTE. 
other.transformExpressionsWithPruning(_.containsPattern(PLAN_EXPRESSION)) { case e: SubqueryExpression => - e.withNewPlan(apply(substituteCTE(e.plan, alwaysInline, cteRelations))) + e.withNewPlan( + apply(substituteCTE(e.plan, alwaysInline, cteRelations, None))) } } } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala index 586a0312e1507..0a68524c31241 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala @@ -24,7 +24,7 @@ import org.apache.spark.sql.AnalysisException import org.apache.spark.sql.catalyst.ExtendedAnalysisException import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.expressions.SubExprUtils._ -import org.apache.spark.sql.catalyst.expressions.aggregate.{AggregateExpression, AggregateFunction, Median, PercentileCont, PercentileDisc} +import org.apache.spark.sql.catalyst.expressions.aggregate.{AggregateExpression, AggregateFunction, ListAgg, Median, PercentileCont, PercentileDisc} import org.apache.spark.sql.catalyst.optimizer.{BooleanSimplification, DecorrelateInnerQuery, InlineCTE} import org.apache.spark.sql.catalyst.plans._ import org.apache.spark.sql.catalyst.plans.logical._ @@ -76,6 +76,10 @@ trait CheckAnalysis extends PredicateHelper with LookupCatalog with QueryErrorsB dt.existsRecursively(_.isInstanceOf[MapType]) } + protected def hasVariantType(dt: DataType): Boolean = { + dt.existsRecursively(_.isInstanceOf[VariantType]) + } + protected def mapColumnInSetOperation(plan: LogicalPlan): Option[Attribute] = plan match { case _: Intersect | _: Except | _: Distinct => plan.output.find(a => hasMapType(a.dataType)) @@ -84,6 +88,21 @@ trait CheckAnalysis extends PredicateHelper with LookupCatalog with QueryErrorsB case _ => None } + protected 
def variantColumnInSetOperation(plan: LogicalPlan): Option[Attribute] = plan match { + case _: Intersect | _: Except | _: Distinct => + plan.output.find(a => hasVariantType(a.dataType)) + case d: Deduplicate => + d.keys.find(a => hasVariantType(a.dataType)) + case _ => None + } + + protected def variantExprInPartitionExpression(plan: LogicalPlan): Option[Expression] = + plan match { + case r: RepartitionByExpression => + r.partitionExpressions.find(e => hasVariantType(e.dataType)) + case _ => None + } + private def checkLimitLikeClause(name: String, limitExpr: Expression): Unit = { limitExpr match { case e if !e.foldable => limitExpr.failAnalysis( @@ -173,6 +192,15 @@ trait CheckAnalysis extends PredicateHelper with LookupCatalog with QueryErrorsB ) } + private def containsUnsupportedLCA(e: Expression, operator: LogicalPlan): Boolean = { + e.containsPattern(LATERAL_COLUMN_ALIAS_REFERENCE) && operator.expressions.exists { + case a: Alias + if e.collect { case l: LateralColumnAliasReference => l.nameParts.head }.contains(a.name) => + a.exists(_.isInstanceOf[Generator]) + case _ => false + } + } + /** * Checks for errors in a `SELECT` clause, such as a trailing comma or an empty select list. 
* @@ -255,9 +283,11 @@ trait CheckAnalysis extends PredicateHelper with LookupCatalog with QueryErrorsB plan.foreachUp { case p if p.analyzed => // Skip already analyzed sub-plans - case leaf: LeafNode if leaf.output.map(_.dataType).exists(CharVarcharUtils.hasCharVarchar) => + case leaf: LeafNode if !SQLConf.get.preserveCharVarcharTypeInfo && + leaf.output.map(_.dataType).exists(CharVarcharUtils.hasCharVarchar) => throw SparkException.internalError( - "Logical plan should not have output of char/varchar type: " + leaf) + s"Logical plan should not have output of char/varchar type when " + + s"${SQLConf.PRESERVE_CHAR_VARCHAR_TYPE_INFO.key} is false: " + leaf) case u: UnresolvedNamespace => u.schemaNotFound(u.multipartIdentifier) @@ -340,6 +370,14 @@ trait CheckAnalysis extends PredicateHelper with LookupCatalog with QueryErrorsB // surrounded with single quotes, or there is a typo in the attribute name. case GetMapValue(map, key: Attribute) if isMapWithStringKey(map) && !key.resolved => failUnresolvedAttribute(operator, key, "UNRESOLVED_MAP_KEY") + + case e: Expression if containsUnsupportedLCA(e, operator) => + val lcaRefNames = + e.collect { case lcaRef: LateralColumnAliasReference => lcaRef.name }.distinct + failAnalysis( + errorClass = "UNSUPPORTED_FEATURE.LATERAL_COLUMN_ALIAS_IN_GENERATOR", + messageParameters = + Map("lca" -> toSQLId(lcaRefNames), "generatorExpr" -> toSQLExpr(e))) } // Fail if we still have an unresolved all in group by. 
This needs to run before the @@ -423,10 +461,23 @@ trait CheckAnalysis extends PredicateHelper with LookupCatalog with QueryErrorsB "funcName" -> toSQLExpr(wf), "windowExpr" -> toSQLExpr(w))) + case agg @ AggregateExpression(listAgg: ListAgg, _, _, _, _) + if agg.isDistinct && listAgg.needSaveOrderValue => + throw QueryCompilationErrors.functionAndOrderExpressionMismatchError( + listAgg.prettyName, listAgg.child, listAgg.orderExpressions) + case w: WindowExpression => // Only allow window functions with an aggregate expression or an offset window // function or a Pandas window UDF. w.windowFunction match { + case agg @ AggregateExpression(fun: ListAgg, _, _, _, _) + // listagg(...) WITHIN GROUP (ORDER BY ...) OVER (ORDER BY ...) is unsupported + if fun.orderingFilled && (w.windowSpec.orderSpec.nonEmpty || + w.windowSpec.frameSpecification != + SpecifiedWindowFrame(RowFrame, UnboundedPreceding, UnboundedFollowing)) => + agg.failAnalysis( + errorClass = "INVALID_WINDOW_SPEC_FOR_AGGREGATION_FUNC", + messageParameters = Map("aggFunc" -> toSQLExpr(agg.aggregateFunction))) case agg @ AggregateExpression( _: PercentileCont | _: PercentileDisc | _: Median, _, _, _, _) if w.windowSpec.orderSpec.nonEmpty || w.windowSpec.frameSpecification != @@ -457,11 +508,10 @@ trait CheckAnalysis extends PredicateHelper with LookupCatalog with QueryErrorsB errorClass = "UNBOUND_SQL_PARAMETER", messageParameters = Map("name" -> p.name)) - case l: LazyAnalysisExpression => - l.failAnalysis( - errorClass = "UNANALYZABLE_EXPRESSION", - messageParameters = Map("expr" -> toSQLExpr(l))) - + case ma @ MultiAlias(child, names) if child.resolved && !child.isInstanceOf[Generator] => + ma.failAnalysis( + errorClass = "MULTI_ALIAS_WITHOUT_GENERATOR", + messageParameters = Map("expr" -> toSQLExpr(child), "names" -> names.mkString(", "))) case _ => }) @@ -654,13 +704,13 @@ trait CheckAnalysis extends PredicateHelper with LookupCatalog with QueryErrorsB operator.children.tail.zipWithIndex.foreach { case 
(child, ti) => // Check the number of columns if (child.output.length != ref.length) { - e.failAnalysis( - errorClass = "NUM_COLUMNS_MISMATCH", - messageParameters = Map( - "operator" -> toSQLStmt(operator.nodeName), - "firstNumColumns" -> ref.length.toString, - "invalidOrdinalNum" -> ordinalNumber(ti + 1), - "invalidNumColumns" -> child.output.length.toString)) + throw QueryCompilationErrors.numColumnsMismatch( + operator = operator.nodeName, + firstNumColumns = ref.length, + invalidOrdinalNum = ti + 1, + invalidNumColumns = child.output.length, + origin = operator.origin + ) } val dataTypesAreCompatibleFn = getDataTypesAreCompatibleFn(operator) @@ -668,15 +718,15 @@ trait CheckAnalysis extends PredicateHelper with LookupCatalog with QueryErrorsB dataTypes(child).zip(ref).zipWithIndex.foreach { case ((dt1, dt2), ci) => // SPARK-18058: we shall not care about the nullability of columns if (!dataTypesAreCompatibleFn(dt1, dt2)) { - e.failAnalysis( - errorClass = "INCOMPATIBLE_COLUMN_TYPE", - messageParameters = Map( - "operator" -> toSQLStmt(operator.nodeName), - "columnOrdinalNumber" -> ordinalNumber(ci), - "tableOrdinalNumber" -> ordinalNumber(ti + 1), - "dataType1" -> toSQLType(dt1), - "dataType2" -> toSQLType(dt2), - "hint" -> extraHintForAnsiTypeCoercionPlan(operator))) + throw QueryCompilationErrors.incompatibleColumnTypeError( + operator = operator.nodeName, + columnOrdinalNumber = ci, + tableOrdinalNumber = ti + 1, + dataType1 = dt1, + dataType2 = dt2, + hint = extraHintForAnsiTypeCoercionPlan(operator), + origin = operator.origin + ) } } } @@ -820,6 +870,23 @@ trait CheckAnalysis extends PredicateHelper with LookupCatalog with QueryErrorsB "colName" -> toSQLId(mapCol.name), "dataType" -> toSQLType(mapCol.dataType))) + // TODO: Remove this type check once we support Variant ordering + case o if variantColumnInSetOperation(o).isDefined => + val variantCol = variantColumnInSetOperation(o).get + o.failAnalysis( + errorClass = 
"UNSUPPORTED_FEATURE.SET_OPERATION_ON_VARIANT_TYPE", + messageParameters = Map( + "colName" -> toSQLId(variantCol.name), + "dataType" -> toSQLType(variantCol.dataType))) + + case o if variantExprInPartitionExpression(o).isDefined => + val variantExpr = variantExprInPartitionExpression(o).get + o.failAnalysis( + errorClass = "UNSUPPORTED_FEATURE.PARTITION_BY_VARIANT", + messageParameters = Map( + "expr" -> toSQLExpr(variantExpr), + "dataType" -> toSQLType(variantExpr.dataType))) + case o if o.expressions.exists(!_.deterministic) && !operatorAllowsNonDeterministicExpressions(o) && !o.isInstanceOf[Project] && @@ -1039,6 +1106,8 @@ trait CheckAnalysis extends PredicateHelper with LookupCatalog with QueryErrorsB @scala.annotation.tailrec def cleanQueryInScalarSubquery(p: LogicalPlan): LogicalPlan = p match { case s: SubqueryAlias => cleanQueryInScalarSubquery(s.child) + // Skip SQL function node added by the Analyzer + case s: SQLFunctionNode => cleanQueryInScalarSubquery(s.child) case p: Project => cleanQueryInScalarSubquery(p.child) case h: ResolvedHint => cleanQueryInScalarSubquery(h.child) case child => child @@ -1067,20 +1136,6 @@ trait CheckAnalysis extends PredicateHelper with LookupCatalog with QueryErrorsB case _ => } - def checkUnresolvedOuterReference(p: LogicalPlan, expr: SubqueryExpression): Unit = { - expr.plan.foreachUp(_.expressions.foreach(_.foreachUp { - case o: UnresolvedOuterReference => - val cols = p.inputSet.toSeq.map(attr => toSQLId(attr.name)).mkString(", ") - o.failAnalysis( - errorClass = "UNRESOLVED_COLUMN.WITH_SUGGESTION", - messageParameters = Map("objectName" -> toSQLId(o.name), "proposal" -> cols)) - case _ => - })) - } - - // Check if there is unresolved outer attribute in the subquery plan. - checkUnresolvedOuterReference(plan, expr) - // Validate the subquery plan. 
checkAnalysis0(expr.plan) @@ -1088,7 +1143,7 @@ trait CheckAnalysis extends PredicateHelper with LookupCatalog with QueryErrorsB checkOuterReference(plan, expr) expr match { - case ScalarSubquery(query, outerAttrs, _, _, _, _, _, _) => + case ScalarSubquery(query, outerAttrs, _, _, _, _, _) => // Scalar subquery must return one column as output. if (query.output.size != 1) { throw QueryCompilationErrors.subqueryReturnMoreThanOneColumn(query.output.size, @@ -1545,15 +1600,23 @@ trait CheckAnalysis extends PredicateHelper with LookupCatalog with QueryErrorsB alter.conf.resolver) } + def checkNoCollationsInMapKeys(colsToAdd: Seq[QualifiedColType]): Unit = { + if (!alter.conf.allowCollationsInMapKeys) { + colsToAdd.foreach(col => SchemaUtils.checkNoCollationsInMapKeys(col.dataType)) + } + } + alter match { case AddColumns(table: ResolvedTable, colsToAdd) => colsToAdd.foreach { colToAdd => checkColumnNotExists("add", colToAdd.name, table.schema) } checkColumnNameDuplication(colsToAdd) + checkNoCollationsInMapKeys(colsToAdd) case ReplaceColumns(_: ResolvedTable, colsToAdd) => checkColumnNameDuplication(colsToAdd) + checkNoCollationsInMapKeys(colsToAdd) case RenameColumn(table: ResolvedTable, col: ResolvedFieldName, newName) => checkColumnNotExists("rename", col.path :+ newName, table.schema) @@ -1592,9 +1655,7 @@ trait CheckAnalysis extends PredicateHelper with LookupCatalog with QueryErrorsB case (CharType(l1), CharType(l2)) => l1 == l2 case (CharType(l1), VarcharType(l2)) => l1 <= l2 case (VarcharType(l1), VarcharType(l2)) => l1 <= l2 - case _ => - Cast.canUpCast(from, to) || - DataType.equalsIgnoreCompatibleCollation(field.dataType, newDataType) + case _ => Cast.canUpCast(from, to) } if (!canAlterColumnType(field.dataType, newDataType)) { alter.failAnalysis( diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CollationTypeCoercion.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CollationTypeCoercion.scala index 
532e5e0d0a066..168eadbd65cd6 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CollationTypeCoercion.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CollationTypeCoercion.scala @@ -17,14 +17,13 @@ package org.apache.spark.sql.catalyst.analysis -import scala.annotation.tailrec - import org.apache.spark.sql.catalyst.analysis.CollationStrength.{Default, Explicit, Implicit} -import org.apache.spark.sql.catalyst.analysis.TypeCoercion.{hasStringType, haveSameType} +import org.apache.spark.sql.catalyst.analysis.TypeCoercion.haveSameType import org.apache.spark.sql.catalyst.expressions._ +import org.apache.spark.sql.catalyst.plans.logical.{Aggregate, Project} import org.apache.spark.sql.catalyst.trees.TreeNodeTag import org.apache.spark.sql.errors.QueryCompilationErrors -import org.apache.spark.sql.types.{ArrayType, DataType, MapType, StringType} +import org.apache.spark.sql.types.{ArrayType, DataType, MapType, StringType, StructType} import org.apache.spark.sql.util.SchemaUtils /** @@ -32,16 +31,13 @@ import org.apache.spark.sql.util.SchemaUtils */ object CollationTypeCoercion { - private val COLLATION_CONTEXT_TAG = new TreeNodeTag[CollationContext]("collationContext") + private val COLLATION_CONTEXT_TAG = new TreeNodeTag[DataType]("collationContext") private def hasCollationContextTag(expr: Expression): Boolean = { expr.getTagValue(COLLATION_CONTEXT_TAG).isDefined } def apply(expression: Expression): Expression = expression match { - case cast: Cast if shouldRemoveCast(cast) => - cast.child - case ifExpr: If => ifExpr.withNewChildren( ifExpr.predicate +: collateToSingleType(Seq(ifExpr.trueValue, ifExpr.falseValue)) @@ -53,10 +49,10 @@ object CollationTypeCoercion { outputStringType match { case Some(st) => val newBranches = caseWhenExpr.branches.map { case (condition, value) => - (condition, castStringType(value, st)) + (condition, changeType(value, st)) } val newElseValue = - caseWhenExpr.elseValue.map(e => 
castStringType(e, st)) + caseWhenExpr.elseValue.map(e => changeType(e, st)) CaseWhen(newBranches, newElseValue) case _ => @@ -93,13 +89,6 @@ object CollationTypeCoercion { val Seq(newStr, newPad) = collateToSingleType(Seq(str, pad)) stringPadExpr.withNewChildren(Seq(newStr, len, newPad)) - case raiseError: RaiseError => - val newErrorParams = raiseError.errorParms.dataType match { - case MapType(StringType, StringType, _) => raiseError.errorParms - case _ => Cast(raiseError.errorParms, MapType(StringType, StringType)) - } - raiseError.withNewChildren(Seq(raiseError.errorClass, newErrorParams)) - case framelessOffsetWindow @ (_: Lag | _: Lead) => val Seq(input, offset, default) = framelessOffsetWindow.children val Seq(newInput, newDefault) = collateToSingleType(Seq(input, default)) @@ -112,11 +101,9 @@ object CollationTypeCoercion { val newValues = collateToSingleType(mapCreate.values) mapCreate.withNewChildren(newKeys.zip(newValues).flatMap(pair => Seq(pair._1, pair._2))) - case namedStruct: CreateNamedStruct if namedStruct.children.size % 2 == 0 => - val newNames = collateToSingleType(namedStruct.nameExprs) - val newValues = collateToSingleType(namedStruct.valExprs) - val interleaved = newNames.zip(newValues).flatMap(pair => Seq(pair._1, pair._2)) - namedStruct.withNewChildren(interleaved) + case namedStruct: CreateNamedStruct => + // since each child is separate we should not coerce them at all + namedStruct case splitPart: SplitPart => val Seq(str, delimiter, partNum) = splitPart.children @@ -156,46 +143,123 @@ object CollationTypeCoercion { } /** - * If childType is collated and target is UTF8_BINARY, the collation of the output - * should be that of the childType. + * Returns true if the given data type has any StringType in it. 
*/ - private def shouldRemoveCast(cast: Cast): Boolean = { - val isUserDefined = cast.getTagValue(Cast.USER_SPECIFIED_CAST).isDefined - val isChildTypeCollatedString = cast.child.dataType match { - case st: StringType => !st.isUTF8BinaryCollation - case _ => false - } - val targetType = cast.dataType + private def hasStringType(dt: DataType): Boolean = dt.existsRecursively { + case _: StringType => true + case _ => false + } - isUserDefined && isChildTypeCollatedString && targetType == StringType + /** + * Changes the data type of the expression to the given `newType`. + */ + private def changeType(expr: Expression, newType: DataType): Expression = { + mergeTypes(expr.dataType, newType) match { + case Some(newDataType) if newDataType != expr.dataType => + assert(!newDataType.existsRecursively(_.isInstanceOf[StringTypeWithContext])) + + expr match { + case lit: Literal => lit.copy(dataType = newDataType) + case cast: Cast => cast.copy(dataType = newDataType) + case subquery: SubqueryExpression => + changeTypeInSubquery(subquery, newType) + + case _ => Cast(expr, newDataType) + } + + case _ => + expr + } } /** - * Extracts StringTypes from filtered hasStringType + * Changes the data type of the expression in the subquery to the given `newType`. + * Currently only supports subqueries with [[Project]] and [[Aggregate]] plan. 
*/ - @tailrec - private def extractStringType(dt: DataType): Option[StringType] = dt match { - case st: StringType => Some(st) - case ArrayType(et, _) => extractStringType(et) - case _ => None + private def changeTypeInSubquery( + subqueryExpression: SubqueryExpression, + newType: DataType): SubqueryExpression = { + + def transformNamedExpressions(ex: NamedExpression): NamedExpression = { + changeType(ex, newType) match { + case named: NamedExpression => named + case other => Alias(other, ex.name)() + } + } + + val newPlan = subqueryExpression.plan match { + case project: Project => + val newProjectList = project.projectList.map(transformNamedExpressions) + project.copy(projectList = newProjectList) + + case agg: Aggregate => + val newAggregateExpressions = agg.aggregateExpressions.map(transformNamedExpressions) + agg.copy(aggregateExpressions = newAggregateExpressions) + + case other => other + } + + subqueryExpression.withNewPlan(newPlan) } /** - * Casts given expression to collated StringType with id equal to collationId only - * if expression has StringType in the first place. + * If possible, returns the new data type from `inType` by applying + * the collation of `castType`. */ - def castStringType(expr: Expression, st: StringType): Expression = { - castStringType(expr.dataType, st) - .map(dt => Cast(expr, dt)) - .getOrElse(expr) + private def mergeTypes(inType: DataType, castType: DataType): Option[DataType] = { + val outType = mergeStructurally(inType, castType) { + case (_: StringType, right: StringTypeWithContext) => + right.stringType + } + + outType } - private def castStringType(inType: DataType, castType: StringType): Option[DataType] = { - inType match { - case st: StringType if st.collationId != castType.collationId => - Some(castType) - case ArrayType(arrType, nullable) => - castStringType(arrType, castType).map(ArrayType(_, nullable)) + /** + * Merges two data types structurally according to the given base case. 
+ */ + private def mergeStructurally( + leftType: DataType, + rightType: DataType) + (baseCase: PartialFunction[(DataType, DataType), DataType]): Option[DataType] = { + (leftType, rightType) match { + + // handle the base cases first + case _ if baseCase.isDefinedAt((leftType, rightType)) => + Option(baseCase(leftType, rightType)) + + case _ if leftType == rightType => + Some(leftType) + + case (ArrayType(leftElemType, nullable), ArrayType(rightElemType, _)) => + mergeStructurally(leftElemType, rightElemType)(baseCase).map(ArrayType(_, nullable)) + + case (MapType(leftKey, leftValue, nullable), MapType(rightKey, rightValue, _)) => + for { + newKeyType <- mergeStructurally(leftKey, rightKey)(baseCase) + newValueType <- mergeStructurally(leftValue, rightValue)(baseCase) + } yield MapType(newKeyType, newValueType, nullable) + + case (ArrayType(elementType, nullable), right) => + mergeStructurally(elementType, right)(baseCase).map(ArrayType(_, nullable)) + + case (left, ArrayType(elementType, _)) => + mergeStructurally(left, elementType)(baseCase) + + case (StructType(leftFields), StructType(rightFields)) => + if (leftFields.length != rightFields.length) { + return None + } + val newFields = leftFields.zip(rightFields).map { + case (leftField, rightField) => + val newType = mergeStructurally(leftField.dataType, rightField.dataType)(baseCase) + if (newType.isEmpty) { + return None + } + leftField.copy(dataType = newType.get) + } + Some(StructType(newFields)) + case _ => None } } @@ -208,7 +272,7 @@ object CollationTypeCoercion { lctOpt match { case Some(lct) => - expressions.map(e => castStringType(e, lct)) + expressions.map(e => changeType(e, lct)) case _ => expressions } @@ -217,70 +281,83 @@ object CollationTypeCoercion { /** * Tries to find the least common StringType among the given expressions. 
*/ - private def findLeastCommonStringType(expressions: Seq[Expression]): Option[StringType] = { + private def findLeastCommonStringType(expressions: Seq[Expression]): Option[DataType] = { if (!expressions.exists(e => SchemaUtils.hasNonUTF8BinaryCollation(e.dataType))) { + // if there are no collated types we don't need to do anything + return None + } else if (ResolveDefaultStringTypes.needsResolution(expressions)) { + // if any of the strings types are still not resolved + // we need to wait for them to be resolved first return None } val collationContextWinner = expressions.foldLeft(findCollationContext(expressions.head)) { case (Some(left), right) => findCollationContext(right).flatMap { ctx => - collationPrecedenceWinner(left, ctx) + mergeWinner(left, ctx) } - case (None, _) => return None - } - - collationContextWinner.flatMap { cc => - extractStringType(cc.dataType) + case (None, _) => None } + collationContextWinner } /** - * Tries to find the collation context for the given expression. + * Tries to find the data type with the collation context for the given expression. * If found, it will also set the [[COLLATION_CONTEXT_TAG]] on the expression, * so that the context can be reused later. 
*/ - private def findCollationContext(expr: Expression): Option[CollationContext] = { + private def findCollationContext(expr: Expression): Option[DataType] = { val contextOpt = expr match { - case _ if hasCollationContextTag(expr) => - Some(expr.getTagValue(COLLATION_CONTEXT_TAG).get) - - // if `expr` doesn't have a string in its dataType then it doesn't - // have the collation context either - case _ if !expr.dataType.existsRecursively(_.isInstanceOf[StringType]) => - None - case collate: Collate => - Some(CollationContext(collate.dataType, Explicit)) + case _ if collationStrengthBaseCases.isDefinedAt(expr) => + collationStrengthBaseCases(expr) - case _: Alias | _: SubqueryExpression | _: AttributeReference | _: VariableReference => - Some(CollationContext(expr.dataType, Implicit)) + case getStruct: GetStructField => + val childContext = findCollationContext(getStruct.child) + childContext match { + case Some(struct: StructType) => + val field = struct.fields(getStruct.ordinal) + Some(field.dataType) + case _ => None + } - case _: Literal => - Some(CollationContext(expr.dataType, Default)) + case getMapValue: GetMapValue => + findCollationContext(getMapValue.child) match { + case Some(MapType(_, valueType, _)) => + mergeWinner(getMapValue.dataType, valueType) + case _ => + None + } - // if it does have a string type but none of its children do - // then the collation context strength is default - case _ if !expr.children.exists(_.dataType.existsRecursively(_.isInstanceOf[StringType])) => - Some(CollationContext(expr.dataType, Default)) + case struct: CreateNamedStruct => + val childrenContexts = struct.valExprs.map(findCollationContext) + if (childrenContexts.isEmpty) { + return None + } + val newFields = struct.dataType.fields.zip(childrenContexts).map { + case (field, Some(context)) => + field.copy(dataType = context) + case (field, None) => field + } + Some(StructType(newFields)) - case _ => - val contextWinnerOpt = getContextRelevantChildren(expr) - 
.flatMap(findCollationContext) - .foldLeft(Option.empty[CollationContext]) { - case (Some(left), right) => - collationPrecedenceWinner(left, right) - case (None, right) => - Some(right) - } + case map: CreateMap => + val keyContexts = map.keys.flatMap(findCollationContext) + val valueContexts = map.values.flatMap(findCollationContext) + if (keyContexts.length + valueContexts.length != map.children.length) { + return None + } - contextWinnerOpt.map { context => - if (hasStringType(expr.dataType)) { - CollationContext(expr.dataType, context.strength) - } else { - context - } + val keyContextWinner = mergeWinners(map.dataType.keyType, keyContexts) + val valueContextWinner = mergeWinners(map.dataType.valueType, valueContexts) + if (keyContextWinner.isEmpty || valueContextWinner.isEmpty) { + return None } + Some(MapType(keyContextWinner.get, valueContextWinner.get)) + + case _ => + val childContexts = expr.children.flatMap(findCollationContext) + mergeWinners(expr.dataType, childContexts) } contextOpt.foreach(expr.setTagValue(COLLATION_CONTEXT_TAG, _)) @@ -288,69 +365,100 @@ object CollationTypeCoercion { } /** - * Returns the children of the given expression that should be used for calculating the - * winning collation context. + * Base cases for determining the strength of the collation. 
*/ - private def getContextRelevantChildren(expression: Expression): Seq[Expression] = { - expression match { - // collation context for named struct should be calculated based on its values only - case createStruct: CreateNamedStruct => - createStruct.valExprs + private def collationStrengthBaseCases: PartialFunction[Expression, Option[DataType]] = { + case expr if hasCollationContextTag(expr) => + Some(expr.getTagValue(COLLATION_CONTEXT_TAG).get) + + // if `expr` doesn't have a string in its dataType then it doesn't + // have the collation context either + case expr if !expr.dataType.existsRecursively(_.isInstanceOf[StringType]) => + None + + case collate: Collate => + Some(addContextToStringType(collate.dataType, Explicit)) + + case cast: Cast => + val castStrength = if (hasStringType(cast.child.dataType)) { + Implicit + } else { + Default + } - // collation context does not depend on the key for extracting the value - case extract: ExtractValue => - Seq(extract.child) + Some(addContextToStringType(cast.dataType, castStrength)) - // we currently don't support collation precedence for maps, - // as this would involve calculating them for keys and values separately - case _: CreateMap => - Seq.empty + case expr @ (_: NamedExpression | _: SubqueryExpression | _: VariableReference) => + Some(addContextToStringType(expr.dataType, Implicit)) - case _ => - expression.children + case lit: Literal => + Some(addContextToStringType(lit.dataType, Default)) + + // if it does have a string type but none of its children do + // then the collation context strength is default + case expr if !expr.children.exists(_.dataType.existsRecursively(_.isInstanceOf[StringType])) => + Some(addContextToStringType(expr.dataType, Default)) + } + + /** + * Adds collation context to the given string type so we can know its strength. 
+ */ + private def addContextToStringType(dt: DataType, strength: CollationStrength): DataType = { + dt.transformRecursively { + case st: StringType => StringTypeWithContext(st, strength) } } /** - * Returns the collation context that wins in precedence between left and right. + * Merges multiple data types structurally according to strength of the collations into the + * data type of the `start`. + * + * If any of the data types cannot be merged, it returns None. */ - private def collationPrecedenceWinner( - left: CollationContext, - right: CollationContext): Option[CollationContext] = { - - val (leftStringType, rightStringType) = - (extractStringType(left.dataType), extractStringType(right.dataType)) match { - case (Some(l), Some(r)) => - (l, r) - case (None, None) => - return None - case (Some(_), None) => - return Some(left) - case (None, Some(_)) => - return Some(right) - } + private def mergeWinners(start: DataType, rest: Seq[DataType]): Option[DataType] = { + rest.foldLeft(Option(start)) { + case (Some(acc), childContext) => + mergeWinner(acc, childContext) + case (None, _) => + None + } + } - (left.strength, right.strength) match { - case (Explicit, Explicit) if leftStringType != rightStringType => - throw QueryCompilationErrors.explicitCollationMismatchError( - Seq(leftStringType, rightStringType)) + /** + * Merges two data types structurally according to strength of the collations. + */ + private def mergeWinner(left: DataType, right: DataType): Option[DataType] = { + mergeStructurally(left, right) { + case (left: StringTypeWithContext, right: StringTypeWithContext) => + getWinningStringType(left, right) - case (Explicit, _) => Some(left) - case (_, Explicit) => Some(right) + case (_: StringType, right: StringTypeWithContext) => + right + } + } - case (Implicit, Implicit) if leftStringType != rightStringType => + /** Determines the winning StringTypeWithContext based on the strength of the collation. 
*/ + private def getWinningStringType( + left: StringTypeWithContext, + right: StringTypeWithContext): StringTypeWithContext = { + def handleMismatch(): Nothing = { + if (left.strength == Explicit) { + throw QueryCompilationErrors.explicitCollationMismatchError( + Seq(left.stringType, right.stringType)) + } else { throw QueryCompilationErrors.implicitCollationMismatchError( - Seq(leftStringType, rightStringType)) - - case (Implicit, _) => Some(left) - case (_, Implicit) => Some(right) + Seq(left.stringType, right.stringType)) + } + } - case (Default, Default) if leftStringType != rightStringType => - throw QueryCompilationErrors.implicitCollationMismatchError( - Seq(leftStringType, rightStringType)) + (left.strength.priority, right.strength.priority) match { + case (leftPriority, rightPriority) if leftPriority == rightPriority => + if (left.sameType(right)) left + else handleMismatch() - case _ => - Some(left) + case (leftPriority, rightPriority) => + if (leftPriority < rightPriority) left + else right } } } @@ -358,18 +466,32 @@ object CollationTypeCoercion { /** * Represents the strength of collation used for determining precedence in collation resolution. */ -private sealed trait CollationStrength {} +private sealed trait CollationStrength { + val priority: Int +} private object CollationStrength { - case object Explicit extends CollationStrength {} - case object Implicit extends CollationStrength {} - case object Default extends CollationStrength {} + case object Explicit extends CollationStrength { + override val priority: Int = 0 + } + case object Implicit extends CollationStrength { + override val priority: Int = 1 + } + case object Default extends CollationStrength { + override val priority: Int = 2 + } } /** * Encapsulates the context for collation, including data type and strength. * - * @param dataType The data type associated with this collation context. + * @param stringType StringType. 
* @param strength The strength level of the collation, which determines its precedence. */ -private case class CollationContext(dataType: DataType, strength: CollationStrength) {} +private case class StringTypeWithContext(stringType: StringType, strength: CollationStrength) + extends DataType { + + override def defaultSize: Int = stringType.defaultSize + + override private[spark] def asNullable: DataType = this +} diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ColumnResolutionHelper.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ColumnResolutionHelper.scala index e869cb281ce05..56b2103c555db 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ColumnResolutionHelper.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ColumnResolutionHelper.scala @@ -53,9 +53,10 @@ trait ColumnResolutionHelper extends Logging with DataTypeErrorsBase { (exprs, plan) } else { plan match { - // For `Distinct` and `SubqueryAlias`, we can't recursively resolve and add attributes - // via its children. - case u: UnaryNode if !u.isInstanceOf[Distinct] && !u.isInstanceOf[SubqueryAlias] => + // For `Distinct` and `SubqueryAlias` and `PipeOperator`, we can't recursively resolve and + // add attributes via its children. + case u: UnaryNode if !u.isInstanceOf[Distinct] && !u.isInstanceOf[SubqueryAlias] + && !u.isInstanceOf[PipeOperator] => val (newExprs, newChild) = { // Resolving expressions against current plan. 
val maybeResolvedExprs = exprs.map(resolveExpressionByPlanOutput(_, u)) @@ -221,35 +222,35 @@ trait ColumnResolutionHelper extends Logging with DataTypeErrorsBase { val outerPlan = AnalysisContext.get.outerPlan if (outerPlan.isEmpty) return e - e.transformWithPruning(_.containsAnyPattern(UNRESOLVED_ATTRIBUTE, TEMP_RESOLVED_COLUMN)) { + def resolve(nameParts: Seq[String]): Option[Expression] = try { + outerPlan.get match { + // Subqueries in UnresolvedHaving can host grouping expressions and aggregate functions. + // We should resolve columns with `agg.output` and the rule `ResolveAggregateFunctions` will + // push them down to Aggregate later. This is similar to what we do in `resolveColumns`. + case u @ UnresolvedHaving(_, agg: Aggregate) => + agg.resolveChildren(nameParts, conf.resolver) + .orElse(u.resolveChildren(nameParts, conf.resolver)) + .map(wrapOuterReference) + case other => + other.resolveChildren(nameParts, conf.resolver).map(wrapOuterReference) + } + } catch { + case ae: AnalysisException => + logDebug(ae.getMessage) + None + } + + e.transformWithPruning( + _.containsAnyPattern(UNRESOLVED_ATTRIBUTE, TEMP_RESOLVED_COLUMN)) { case u: UnresolvedAttribute => - resolveOuterReference(u.nameParts, outerPlan.get).getOrElse(u) + resolve(u.nameParts).getOrElse(u) // Re-resolves `TempResolvedColumn` as outer references if it has tried to be resolved with // Aggregate but failed. case t: TempResolvedColumn if t.hasTried => - resolveOuterReference(t.nameParts, outerPlan.get).getOrElse(t) + resolve(t.nameParts).getOrElse(t) } } - protected def resolveOuterReference( - nameParts: Seq[String], outerPlan: LogicalPlan): Option[Expression] = try { - outerPlan match { - // Subqueries in UnresolvedHaving can host grouping expressions and aggregate functions. - // We should resolve columns with `agg.output` and the rule `ResolveAggregateFunctions` will - // push them down to Aggregate later. This is similar to what we do in `resolveColumns`. 
- case u @ UnresolvedHaving(_, agg: Aggregate) => - agg.resolveChildren(nameParts, conf.resolver) - .orElse(u.resolveChildren(nameParts, conf.resolver)) - .map(wrapOuterReference) - case other => - other.resolveChildren(nameParts, conf.resolver).map(wrapOuterReference) - } - } catch { - case ae: AnalysisException => - logDebug(ae.getMessage) - None - } - def lookupVariable(nameParts: Seq[String]): Option[VariableReference] = { // The temp variables live in `SYSTEM.SESSION`, and the name can be qualified or not. def maybeTempVariableName(nameParts: Seq[String]): Boolean = { diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/DeduplicateRelations.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/DeduplicateRelations.scala index c1535343d7686..8398fb8d1e830 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/DeduplicateRelations.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/DeduplicateRelations.scala @@ -132,6 +132,13 @@ object DeduplicateRelations extends Rule[LogicalPlan] { _.output.map(_.exprId.id), newFlatMap => newFlatMap.copy(output = newFlatMap.output.map(_.newInstance()))) + case f: FlatMapGroupsInArrow => + deduplicateAndRenew[FlatMapGroupsInArrow]( + existingRelations, + f, + _.output.map(_.exprId.id), + newFlatMap => newFlatMap.copy(output = newFlatMap.output.map(_.newInstance()))) + case f: FlatMapCoGroupsInPandas => deduplicateAndRenew[FlatMapCoGroupsInPandas]( existingRelations, @@ -139,6 +146,13 @@ object DeduplicateRelations extends Rule[LogicalPlan] { _.output.map(_.exprId.id), newFlatMap => newFlatMap.copy(output = newFlatMap.output.map(_.newInstance()))) + case f: FlatMapCoGroupsInArrow => + deduplicateAndRenew[FlatMapCoGroupsInArrow]( + existingRelations, + f, + _.output.map(_.exprId.id), + newFlatMap => newFlatMap.copy(output = newFlatMap.output.map(_.newInstance()))) + case m: MapInPandas => deduplicateAndRenew[MapInPandas]( 
existingRelations, @@ -378,12 +392,24 @@ object DeduplicateRelations extends Rule[LogicalPlan] { newVersion.copyTagsFrom(oldVersion) Seq((oldVersion, newVersion)) + case oldVersion @ FlatMapGroupsInArrow(_, _, output, _) + if oldVersion.outputSet.intersect(conflictingAttributes).nonEmpty => + val newVersion = oldVersion.copy(output = output.map(_.newInstance())) + newVersion.copyTagsFrom(oldVersion) + Seq((oldVersion, newVersion)) + case oldVersion @ FlatMapCoGroupsInPandas(_, _, _, output, _, _) if oldVersion.outputSet.intersect(conflictingAttributes).nonEmpty => val newVersion = oldVersion.copy(output = output.map(_.newInstance())) newVersion.copyTagsFrom(oldVersion) Seq((oldVersion, newVersion)) + case oldVersion @ FlatMapCoGroupsInArrow(_, _, _, output, _, _) + if oldVersion.outputSet.intersect(conflictingAttributes).nonEmpty => + val newVersion = oldVersion.copy(output = output.map(_.newInstance())) + newVersion.copyTagsFrom(oldVersion) + Seq((oldVersion, newVersion)) + case oldVersion @ MapInPandas(_, output, _, _, _) if oldVersion.outputSet.intersect(conflictingAttributes).nonEmpty => val newVersion = oldVersion.copy(output = output.map(_.newInstance())) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/EliminateLazyExpression.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/EliminateLazyExpression.scala new file mode 100644 index 0000000000000..68f3f90e193b6 --- /dev/null +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/EliminateLazyExpression.scala @@ -0,0 +1,34 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.catalyst.analysis + +import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan +import org.apache.spark.sql.catalyst.rules.Rule +import org.apache.spark.sql.catalyst.trees.TreePattern.LAZY_EXPRESSION + +/** + * `LazyExpression` is a marker node to trigger lazy analysis in DataFrames. It's useless when + * entering the analyzer and this rule removes it. + */ +object EliminateLazyExpression extends Rule[LogicalPlan] { + override def apply(plan: LogicalPlan): LogicalPlan = { + plan.resolveExpressionsUpWithPruning(_.containsPattern(LAZY_EXPRESSION)) { + case l: LazyExpression => l.child + } + } +} diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala index 5103f8048856a..54f6820d2091f 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala @@ -506,6 +506,8 @@ object FunctionRegistry { expression[CollectList]("collect_list"), expression[CollectList]("array_agg", true, Some("3.3.0")), expression[CollectSet]("collect_set"), + expression[ListAgg]("listagg"), + expression[ListAgg]("string_agg", setAlias = true), expressionBuilder("count_min_sketch", CountMinSketchAggExpressionBuilder), expression[BoolAnd]("every", true), expression[BoolAnd]("bool_and"), @@ -882,6 +884,7 @@ object FunctionRegistry { // Avro expression[FromAvro]("from_avro"), 
expression[ToAvro]("to_avro"), + expression[SchemaOfAvro]("schema_of_avro"), // Protobuf expression[FromProtobuf]("from_protobuf"), diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionResolution.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionResolution.scala index 5a27a72190325..800126e0030e8 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionResolution.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionResolution.scala @@ -128,18 +128,15 @@ class FunctionResolution( numArgs: Int, u: UnresolvedFunction): Expression = { func match { - case owg: SupportsOrderingWithinGroup if u.isDistinct => - throw QueryCompilationErrors.distinctInverseDistributionFunctionUnsupportedError( - owg.prettyName - ) + case owg: SupportsOrderingWithinGroup if !owg.isDistinctSupported && u.isDistinct => + throw QueryCompilationErrors.distinctWithOrderingFunctionUnsupportedError(owg.prettyName) case owg: SupportsOrderingWithinGroup - if !owg.orderingFilled && u.orderingWithinGroup.isEmpty => - throw QueryCompilationErrors.inverseDistributionFunctionMissingWithinGroupError( - owg.prettyName - ) + if owg.isOrderingMandatory && !owg.orderingFilled && u.orderingWithinGroup.isEmpty => + throw QueryCompilationErrors.functionMissingWithinGroupError(owg.prettyName) case owg: SupportsOrderingWithinGroup if owg.orderingFilled && u.orderingWithinGroup.nonEmpty => - throw QueryCompilationErrors.wrongNumOrderingsForInverseDistributionFunctionError( + // e.g mode(expr1) within group (order by expr2) is not supported + throw QueryCompilationErrors.wrongNumOrderingsForFunctionError( owg.prettyName, 0, u.orderingWithinGroup.length @@ -198,7 +195,7 @@ class FunctionResolution( case agg: AggregateFunction => // Note: PythonUDAF does not support these advanced clauses. 
if (agg.isInstanceOf[PythonUDAF]) checkUnsupportedAggregateClause(agg, u) - // After parse, the inverse distribution functions not set the ordering within group yet. + // After parse, the functions not set the ordering within group yet. val newAgg = agg match { case owg: SupportsOrderingWithinGroup if !owg.orderingFilled && u.orderingWithinGroup.nonEmpty => diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveCollationName.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveCollationName.scala new file mode 100644 index 0000000000000..50f36f78a4724 --- /dev/null +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveCollationName.scala @@ -0,0 +1,36 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.catalyst.analysis + +import org.apache.spark.sql.catalyst.expressions._ +import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan +import org.apache.spark.sql.catalyst.rules.Rule +import org.apache.spark.sql.catalyst.trees.TreePattern.UNRESOLVED_COLLATION +import org.apache.spark.sql.catalyst.util.CollationFactory + +/** + * Resolves fully qualified collation name and replaces [[UnresolvedCollation]] with + * [[ResolvedCollation]]. + */ +object ResolveCollationName extends Rule[LogicalPlan] { + def apply(plan: LogicalPlan): LogicalPlan = + plan.resolveExpressionsWithPruning(_.containsPattern(UNRESOLVED_COLLATION), ruleId) { + case UnresolvedCollation(collationName) => + ResolvedCollation(CollationFactory.resolveFullyQualifiedName(collationName.toArray)) + } +} diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveDefaultStringTypes.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveDefaultStringTypes.scala new file mode 100644 index 0000000000000..75958ff3e1177 --- /dev/null +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveDefaultStringTypes.scala @@ -0,0 +1,188 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.catalyst.analysis + +import org.apache.spark.sql.catalyst.expressions.{Cast, Expression, Literal} +import org.apache.spark.sql.catalyst.plans.logical.{AddColumns, AlterColumn, AlterViewAs, ColumnDefinition, CreateView, LogicalPlan, QualifiedColType, ReplaceColumns, V1CreateTablePlan, V2CreateTablePlan} +import org.apache.spark.sql.catalyst.rules.{Rule, RuleExecutor} +import org.apache.spark.sql.types.{DataType, StringType} + +/** + * Resolves default string types in queries and commands. For queries, the default string type is + * determined by the session's default string type. For DDL, the default string type is the + * default type of the object (table -> schema -> catalog). However, this is not implemented yet. + * So, we will just use UTF8_BINARY for now. + */ +object ResolveDefaultStringTypes extends Rule[LogicalPlan] { + def apply(plan: LogicalPlan): LogicalPlan = { + val newPlan = apply0(plan) + if (plan.ne(newPlan)) { + // Due to how tree transformations work and StringType object being equal to + // StringType("UTF8_BINARY"), we need to transform the plan twice + // to ensure the correct results for occurrences of default string type. + val finalPlan = apply0(newPlan) + RuleExecutor.forceAdditionalIteration(finalPlan) + finalPlan + } else { + newPlan + } + } + + private def apply0(plan: LogicalPlan): LogicalPlan = { + if (isDDLCommand(plan)) { + transformDDL(plan) + } else { + transformPlan(plan, sessionDefaultStringType) + } + } + + /** + * Returns whether any of the given `plan` needs to have its + * default string type resolved. 
+ */ + def needsResolution(plan: LogicalPlan): Boolean = { + if (!isDDLCommand(plan) && isDefaultSessionCollationUsed) { + return false + } + + plan.exists(node => needsResolution(node.expressions)) + } + + /** + * Returns whether any of the given `expressions` needs to have its + * default string type resolved. + */ + def needsResolution(expressions: Seq[Expression]): Boolean = { + expressions.exists(needsResolution) + } + + /** + * Returns whether the given `expression` needs to have its + * default string type resolved. + */ + def needsResolution(expression: Expression): Boolean = { + expression.exists(e => transformExpression.isDefinedAt(e)) + } + + private def isDefaultSessionCollationUsed: Boolean = conf.defaultStringType == StringType + + /** + * Returns the default string type that should be used in a given DDL command (for now always + * UTF8_BINARY). + */ + private def stringTypeForDDLCommand(table: LogicalPlan): StringType = + StringType("UTF8_BINARY") + + /** Returns the session default string type */ + private def sessionDefaultStringType: StringType = + StringType(conf.defaultStringType.collationId) + + private def isDDLCommand(plan: LogicalPlan): Boolean = plan exists { + case _: AddColumns | _: ReplaceColumns | _: AlterColumn => true + case _ => isCreateOrAlterPlan(plan) + } + + private def isCreateOrAlterPlan(plan: LogicalPlan): Boolean = plan match { + case _: V1CreateTablePlan | _: V2CreateTablePlan | _: CreateView | _: AlterViewAs => true + case _ => false + } + + private def transformDDL(plan: LogicalPlan): LogicalPlan = { + val newType = stringTypeForDDLCommand(plan) + + plan resolveOperators { + case p if isCreateOrAlterPlan(p) => + transformPlan(p, newType) + + case addCols: AddColumns => + addCols.copy(columnsToAdd = replaceColumnTypes(addCols.columnsToAdd, newType)) + + case replaceCols: ReplaceColumns => + replaceCols.copy(columnsToAdd = replaceColumnTypes(replaceCols.columnsToAdd, newType)) + + case alter: AlterColumn + if 
alter.dataType.isDefined && hasDefaultStringType(alter.dataType.get) => + alter.copy(dataType = Some(replaceDefaultStringType(alter.dataType.get, newType))) + } + } + + /** + * Transforms the given plan, by transforming all expressions in its operators to use the given + * new type instead of the default string type. + */ + private def transformPlan(plan: LogicalPlan, newType: StringType): LogicalPlan = { + plan resolveExpressionsUp { expression => + transformExpression + .andThen(_.apply(newType)) + .applyOrElse(expression, identity[Expression]) + } + } + + /** + * Transforms the given expression, by changing all default string types to the given new type. + */ + private def transformExpression: PartialFunction[Expression, StringType => Expression] = { + case columnDef: ColumnDefinition if hasDefaultStringType(columnDef.dataType) => + newType => columnDef.copy(dataType = replaceDefaultStringType(columnDef.dataType, newType)) + + case cast: Cast if hasDefaultStringType(cast.dataType) => + newType => cast.copy(dataType = replaceDefaultStringType(cast.dataType, newType)) + + case Literal(value, dt) if hasDefaultStringType(dt) => + newType => Literal(value, replaceDefaultStringType(dt, newType)) + } + + private def hasDefaultStringType(dataType: DataType): Boolean = + dataType.existsRecursively(isDefaultStringType) + + private def isDefaultStringType(dataType: DataType): Boolean = { + dataType match { + case st: StringType => + // should only return true for StringType object and not StringType("UTF8_BINARY") + st.eq(StringType) || st.isInstanceOf[TemporaryStringType] + case _ => false + } + } + + private def replaceDefaultStringType(dataType: DataType, newType: StringType): DataType = { + dataType.transformRecursively { + case currentType: StringType if isDefaultStringType(currentType) => + if (currentType == newType) { + TemporaryStringType() + } else { + newType + } + } + } + + private def replaceColumnTypes( + colTypes: Seq[QualifiedColType], + newType: 
StringType): Seq[QualifiedColType] = { + colTypes.map { + case colWithDefault if hasDefaultStringType(colWithDefault.dataType) => + val replaced = replaceDefaultStringType(colWithDefault.dataType, newType) + colWithDefault.copy(dataType = replaced) + + case col => col + } + } +} + +case class TemporaryStringType() extends StringType(1) { + override def toString: String = s"TemporaryStringType($collationId)" +} diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveIdentifierClause.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveIdentifierClause.scala index 0e1e71a658c8b..2cf3c6390d5fb 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveIdentifierClause.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveIdentifierClause.scala @@ -19,9 +19,9 @@ package org.apache.spark.sql.catalyst.analysis import org.apache.spark.sql.catalyst.expressions.{AliasHelper, EvalHelper, Expression} import org.apache.spark.sql.catalyst.parser.CatalystSqlParser -import org.apache.spark.sql.catalyst.plans.logical.{CTERelationRef, LogicalPlan, SubqueryAlias} +import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.catalyst.rules.Rule -import org.apache.spark.sql.catalyst.trees.TreePattern.{UNRESOLVED_IDENTIFIER, UNRESOLVED_IDENTIFIER_WITH_CTE} +import org.apache.spark.sql.catalyst.trees.TreePattern.UNRESOLVED_IDENTIFIER import org.apache.spark.sql.types.StringType /** @@ -30,18 +30,9 @@ import org.apache.spark.sql.types.StringType object ResolveIdentifierClause extends Rule[LogicalPlan] with AliasHelper with EvalHelper { override def apply(plan: LogicalPlan): LogicalPlan = plan.resolveOperatorsUpWithPruning( - _.containsAnyPattern(UNRESOLVED_IDENTIFIER, UNRESOLVED_IDENTIFIER_WITH_CTE)) { + _.containsPattern(UNRESOLVED_IDENTIFIER)) { case p: PlanWithUnresolvedIdentifier if p.identifierExpr.resolved && p.childrenResolved => 
p.planBuilder.apply(evalIdentifierExpr(p.identifierExpr), p.children) - case u @ UnresolvedWithCTERelations(p, cteRelations) => - this.apply(p) match { - case u @ UnresolvedRelation(Seq(table), _, _) => - cteRelations.find(r => plan.conf.resolver(r._1, table)).map { case (_, d) => - // Add a `SubqueryAlias` for hint-resolving rules to match relation names. - SubqueryAlias(table, CTERelationRef(d.id, d.resolved, d.output, d.isStreaming)) - }.getOrElse(u) - case other => other - } case other => other.transformExpressionsWithPruning(_.containsAnyPattern(UNRESOLVED_IDENTIFIER)) { case e: ExpressionWithUnresolvedIdentifier if e.identifierExpr.resolved => diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveInlineTables.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveInlineTables.scala index 62f3997491c07..b9e9e49a39647 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveInlineTables.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveInlineTables.scala @@ -29,8 +29,12 @@ import org.apache.spark.sql.catalyst.trees.AlwaysProcess object ResolveInlineTables extends Rule[LogicalPlan] with EvalHelper { override def apply(plan: LogicalPlan): LogicalPlan = { plan.resolveOperatorsWithPruning(AlwaysProcess.fn, ruleId) { - case table: UnresolvedInlineTable if table.expressionsResolved => + case table: UnresolvedInlineTable if canResolveTable(table) => EvaluateUnresolvedInlineTable.evaluateUnresolvedInlineTable(table) } } + + private def canResolveTable(table: UnresolvedInlineTable): Boolean = { + table.expressionsResolved && !ResolveDefaultStringTypes.needsResolution(table) + } } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveLateralColumnAliasReference.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveLateralColumnAliasReference.scala index da8065eab606d..cb26820a0c79d 
100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveLateralColumnAliasReference.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveLateralColumnAliasReference.scala @@ -17,6 +17,10 @@ package org.apache.spark.sql.catalyst.analysis +import java.util.LinkedHashSet + +import scala.jdk.CollectionConverters._ + import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.expressions.WindowExpression.hasWindowExpression import org.apache.spark.sql.catalyst.expressions.aggregate.AggregateExpression @@ -147,7 +151,7 @@ object ResolveLateralColumnAliasReference extends Rule[LogicalPlan] { && pOriginal.projectList.exists(_.containsPattern(LATERAL_COLUMN_ALIAS_REFERENCE)) => val p @ Project(projectList, child) = pOriginal.mapChildren(apply0) var aliasMap = AttributeMap.empty[AliasEntry] - val referencedAliases = collection.mutable.Set.empty[AliasEntry] + val referencedAliases = new LinkedHashSet[AliasEntry] def unwrapLCAReference(e: NamedExpression): NamedExpression = { e.transformWithPruning(_.containsPattern(LATERAL_COLUMN_ALIAS_REFERENCE)) { case lcaRef: LateralColumnAliasReference if aliasMap.contains(lcaRef.a) => @@ -156,7 +160,7 @@ object ResolveLateralColumnAliasReference extends Rule[LogicalPlan] { // and unwrap the LateralColumnAliasReference to the NamedExpression inside // If there is chaining, don't resolve and save to future rounds if (!aliasEntry.alias.containsPattern(LATERAL_COLUMN_ALIAS_REFERENCE)) { - referencedAliases += aliasEntry + referencedAliases.add(aliasEntry) lcaRef.ne } else { lcaRef @@ -182,7 +186,7 @@ object ResolveLateralColumnAliasReference extends Rule[LogicalPlan] { val outerProjectList = collection.mutable.Seq(newProjectList: _*) val innerProjectList = collection.mutable.ArrayBuffer(child.output.map(_.asInstanceOf[NamedExpression]): _*) - referencedAliases.foreach { case AliasEntry(alias: Alias, idx) => + referencedAliases.forEach { case 
AliasEntry(alias: Alias, idx) => outerProjectList.update(idx, alias.toAttribute) innerProjectList += alias } @@ -222,7 +226,7 @@ object ResolveLateralColumnAliasReference extends Rule[LogicalPlan] { if (!aggregateExpressions.forall(eligibleToLiftUp)) { agg } else { - val newAggExprs = collection.mutable.Set.empty[NamedExpression] + val newAggExprs = new LinkedHashSet[NamedExpression] val expressionMap = collection.mutable.LinkedHashMap.empty[Expression, NamedExpression] // Extract the expressions to keep in the Aggregate. Return the transformed expression // fully substituted with the attribute reference to the extracted expressions. @@ -249,11 +253,11 @@ object ResolveLateralColumnAliasReference extends Rule[LogicalPlan] { } } val ne = expressionMap.getOrElseUpdate(aggExpr.canonicalized, assignAlias(aggExpr)) - newAggExprs += ne + newAggExprs.add(ne) ne.toAttribute case e if groupingExpressions.exists(_.semanticEquals(e)) => val ne = expressionMap.getOrElseUpdate(e.canonicalized, assignAlias(e)) - newAggExprs += ne + newAggExprs.add(ne) ne.toAttribute case e => e.mapChildren(extractExpressions) } @@ -262,7 +266,7 @@ object ResolveLateralColumnAliasReference extends Rule[LogicalPlan] { extractExpressions(_).asInstanceOf[NamedExpression]) Project( projectList = projectExprs, - child = agg.copy(aggregateExpressions = newAggExprs.toSeq) + child = agg.copy(aggregateExpressions = newAggExprs.asScala.toSeq) ) } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveTableSpec.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveTableSpec.scala index cc9979ad4c5e5..05158fbee3de6 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveTableSpec.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveTableSpec.scala @@ -92,6 +92,7 @@ object ResolveTableSpec extends Rule[LogicalPlan] { options = newOptions.toMap, location = u.location, comment = u.comment, 
+ collation = u.collation, serde = u.serde, external = u.external) withNewSpec(newTableSpec) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/SQLFunctionExpression.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/SQLFunctionExpression.scala new file mode 100644 index 0000000000000..37981f47287da --- /dev/null +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/SQLFunctionExpression.scala @@ -0,0 +1,90 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.catalyst.analysis + +import org.apache.spark.sql.catalyst.catalog.SQLFunction +import org.apache.spark.sql.catalyst.expressions.{Expression, UnaryExpression, Unevaluable} +import org.apache.spark.sql.catalyst.trees.TreePattern.{SQL_FUNCTION_EXPRESSION, SQL_SCALAR_FUNCTION, TreePattern} +import org.apache.spark.sql.types.DataType + +/** + * Represent a SQL function expression resolved from the catalog SQL function builder. 
+ */ +case class SQLFunctionExpression( + name: String, + function: SQLFunction, + inputs: Seq[Expression], + returnType: Option[DataType]) extends Expression with Unevaluable { + override def children: Seq[Expression] = inputs + override def dataType: DataType = returnType.get + override def nullable: Boolean = true + override def prettyName: String = name + override def toString: String = s"$name(${children.mkString(", ")})" + override protected def withNewChildrenInternal( + newChildren: IndexedSeq[Expression]): SQLFunctionExpression = copy(inputs = newChildren) + final override val nodePatterns: Seq[TreePattern] = Seq(SQL_FUNCTION_EXPRESSION) +} + +/** + * A wrapper node for a SQL scalar function expression. + */ +case class SQLScalarFunction(function: SQLFunction, inputs: Seq[Expression], child: Expression) + extends UnaryExpression with Unevaluable { + override def dataType: DataType = child.dataType + override def toString: String = s"${function.name}(${inputs.mkString(", ")})" + override def sql: String = s"${function.name}(${inputs.map(_.sql).mkString(", ")})" + override protected def withNewChildInternal(newChild: Expression): SQLScalarFunction = { + copy(child = newChild) + } + final override val nodePatterns: Seq[TreePattern] = Seq(SQL_SCALAR_FUNCTION) + // The `inputs` is for display only and does not matter in execution. + override lazy val canonicalized: Expression = copy(inputs = Nil, child = child.canonicalized) + override lazy val deterministic: Boolean = { + function.deterministic.getOrElse(true) && children.forall(_.deterministic) + } +} + +/** + * Provide a way to keep state during analysis for resolving nested SQL functions. + * + * @param nestedSQLFunctionDepth The nested depth in the SQL function resolution. A SQL function + * expression should only be expanded as a [[SQLScalarFunction]] if + * the nested depth is 0. 
+ */ +case class SQLFunctionContext(nestedSQLFunctionDepth: Int = 0) + +object SQLFunctionContext { + + private val value = new ThreadLocal[SQLFunctionContext]() { + override def initialValue: SQLFunctionContext = SQLFunctionContext() + } + + def get: SQLFunctionContext = value.get() + + def reset(): Unit = value.remove() + + private def set(context: SQLFunctionContext): Unit = value.set(context) + + def withSQLFunction[A](f: => A): A = { + val originContext = value.get() + val context = originContext.copy( + nestedSQLFunctionDepth = originContext.nestedSQLFunctionDepth + 1) + set(context) + try f finally { set(originContext) } + } +} diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/SQLFunctionNode.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/SQLFunctionNode.scala new file mode 100644 index 0000000000000..38059d9810a7b --- /dev/null +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/SQLFunctionNode.scala @@ -0,0 +1,64 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.catalyst.analysis + +import org.apache.spark.sql.catalyst.catalog.SQLFunction +import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression} +import org.apache.spark.sql.catalyst.plans.logical.{LeafNode, LogicalPlan, UnaryNode} +import org.apache.spark.sql.catalyst.trees.TreePattern.{FUNCTION_TABLE_RELATION_ARGUMENT_EXPRESSION, SQL_TABLE_FUNCTION, TreePattern} +import org.apache.spark.sql.errors.DataTypeErrors.toSQLId +import org.apache.spark.sql.errors.QueryCompilationErrors + +/** + * A container for holding a SQL function query plan and its function identifier. + * + * @param function: the SQL function that this node represents. + * @param child: the SQL function body. + */ +case class SQLFunctionNode( + function: SQLFunction, + child: LogicalPlan) extends UnaryNode { + override def output: Seq[Attribute] = child.output + override def stringArgs: Iterator[Any] = Iterator(function.name, child) + override protected def withNewChildInternal(newChild: LogicalPlan): SQLFunctionNode = + copy(child = newChild) + + // Throw a reasonable error message when trying to call a SQL UDF with TABLE argument(s). + if (child.containsPattern(FUNCTION_TABLE_RELATION_ARGUMENT_EXPRESSION)) { + throw QueryCompilationErrors + .tableValuedArgumentsNotYetImplementedForSqlFunctions("call", toSQLId(function.name.funcName)) + } +} + +/** + * Represent a SQL table function plan resolved from the catalog SQL table function builder. + */ +case class SQLTableFunction( + name: String, + function: SQLFunction, + inputs: Seq[Expression], + override val output: Seq[Attribute]) extends LeafNode { + final override val nodePatterns: Seq[TreePattern] = Seq(SQL_TABLE_FUNCTION) + + // Throw a reasonable error message when trying to call a SQL UDF with TABLE argument(s) because + // this functionality is not implemented yet. 
+ if (inputs.exists(_.containsPattern(FUNCTION_TABLE_RELATION_ARGUMENT_EXPRESSION))) { + throw QueryCompilationErrors + .tableValuedArgumentsNotYetImplementedForSqlFunctions("call", toSQLId(name)) + } +} diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/TypeCoercion.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/TypeCoercion.scala index c30aa9bf91a1d..4769970b51421 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/TypeCoercion.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/TypeCoercion.scala @@ -77,6 +77,8 @@ object TypeCoercion extends TypeCoercionBase { case (NullType, t1) => Some(t1) case (t1, NullType) => Some(t1) + case(s1: StringType, s2: StringType) => StringHelper.tightestCommonString(s1, s2) + case (t1: IntegralType, t2: DecimalType) if t2.isWiderThan(t1) => Some(t2) case (t1: DecimalType, t2: IntegralType) if t1.isWiderThan(t2) => @@ -149,6 +151,7 @@ object TypeCoercion extends TypeCoercionBase { case (DecimalType.Fixed(_, s), _: StringType) if s > 0 => Some(DoubleType) case (_: StringType, DecimalType.Fixed(_, s)) if s > 0 => Some(DoubleType) + case (s1: StringType, s2: StringType) => StringHelper.tightestCommonString(s1, s2) case (l: StringType, r: AtomicType) if canPromoteAsInBinaryComparison(r) => Some(r) case (l: AtomicType, r: StringType) if canPromoteAsInBinaryComparison(l) => Some(l) case (l, r) => None @@ -190,6 +193,12 @@ object TypeCoercion extends TypeCoercionBase { // Cast null type (usually from null literals) into target types case (NullType, target) => target.defaultConcreteType + case (s1: StringType, s2: StringType) => + if (s1.collationId == s2.collationId && StringHelper.isMoreConstrained(s1, s2)) { + s2 + } else { + null + } // If the function accepts any numeric type and the input is a string, we follow the hive // convention and cast that input into a double case (_: StringType, NumericType) => 
NumericType.defaultConcreteType diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/TypeCoercionHelper.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/TypeCoercionHelper.scala index 5b4d76a2a73ed..3b3cf748014b7 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/TypeCoercionHelper.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/TypeCoercionHelper.scala @@ -44,6 +44,7 @@ import org.apache.spark.sql.catalyst.expressions.{ MapConcat, MapZipWith, NaNvl, + RandStr, RangeFrame, ScalaUDF, Sequence, @@ -318,7 +319,8 @@ abstract class TypeCoercionHelper { } case aj @ ArrayJoin(arr, d, nr) - if !AbstractArrayType(StringTypeWithCollation).acceptsType(arr.dataType) && + if !AbstractArrayType(StringTypeWithCollation(supportsTrimCollation = true)). + acceptsType(arr.dataType) && ArrayType.acceptsType(arr.dataType) => val containsNull = arr.dataType.asInstanceOf[ArrayType].containsNull implicitCast(arr, ArrayType(StringType, containsNull)) match { @@ -399,6 +401,11 @@ abstract class TypeCoercionHelper { NaNvl(Cast(l, DoubleType), r) case NaNvl(l, r) if r.dataType == NullType => NaNvl(l, Cast(r, l.dataType)) + case r: RandStr if r.length.dataType != IntegerType => + implicitCast(r.length, IntegerType).map { casted => + r.copy(length = casted) + }.getOrElse(r) + case other => other } } @@ -415,7 +422,7 @@ abstract class TypeCoercionHelper { if conf.concatBinaryAsString || !children.map(_.dataType).forall(_ == BinaryType) => val newChildren = c.children.map { e => - implicitCast(e, SQLConf.get.defaultStringType).getOrElse(e) + implicitCast(e, StringType).getOrElse(e) } c.copy(children = newChildren) case other => other @@ -465,7 +472,7 @@ abstract class TypeCoercionHelper { if (conf.eltOutputAsString || !children.tail.map(_.dataType).forall(_ == BinaryType)) { children.tail.map { e => - implicitCast(e, SQLConf.get.defaultStringType).getOrElse(e) + implicitCast(e, 
StringType).getOrElse(e) } } else { children.tail diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/UnsupportedOperationChecker.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/UnsupportedOperationChecker.scala index 4f33c26d5c3c3..f7ab41bd6f96c 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/UnsupportedOperationChecker.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/UnsupportedOperationChecker.scala @@ -17,6 +17,8 @@ package org.apache.spark.sql.catalyst.analysis +import java.util.Locale + import org.apache.spark.internal.{Logging, MDC} import org.apache.spark.internal.LogKeys.{ANALYSIS_ERROR, QUERY_PLAN} import org.apache.spark.sql.AnalysisException @@ -103,6 +105,7 @@ object UnsupportedOperationChecker extends Logging { case d: Deduplicate if d.isStreaming && d.keys.exists(hasEventTimeCol) => true case d: DeduplicateWithinWatermark if d.isStreaming => true case t: TransformWithState if t.isStreaming => true + case t: TransformWithStateInPandas if t.isStreaming => true case _ => false } @@ -139,6 +142,38 @@ object UnsupportedOperationChecker extends Logging { } } + private def checkAvroSupportForStatefulOperator(p: LogicalPlan): Option[String] = p match { + // TODO: remove operators from this list as support for avro encoding is added + case s: Aggregate if s.isStreaming => Some("aggregation") + // Since the Distinct node will be replaced to Aggregate in the optimizer rule + // [[ReplaceDistinctWithAggregate]], here we also need to check all Distinct node by + // assuming it as Aggregate. 
+ case d @ Distinct(_: LogicalPlan) if d.isStreaming => Some("distinct") + case _ @ Join(left, right, _, _, _) if left.isStreaming && right.isStreaming => Some("join") + case f: FlatMapGroupsWithState if f.isStreaming => Some("flatMapGroupsWithState") + case f: FlatMapGroupsInPandasWithState if f.isStreaming => + Some("applyInPandasWithState") + case d: Deduplicate if d.isStreaming => Some("dropDuplicates") + case d: DeduplicateWithinWatermark if d.isStreaming => Some("dropDuplicatesWithinWatermark") + case _ => None + } + + // Rule to check that avro encoding format is not supported in case any + // non-transformWithState stateful streaming operators are present in the query. + def checkSupportedStoreEncodingFormats(plan: LogicalPlan): Unit = { + val storeEncodingFormat = SQLConf.get.stateStoreEncodingFormat + if (storeEncodingFormat.toLowerCase(Locale.ROOT) == "avro") { + plan.foreach { subPlan => + val operatorOpt = checkAvroSupportForStatefulOperator(subPlan) + if (operatorOpt.isDefined) { + val errorMsg = "State store encoding format as avro is not supported for " + + s"operator=${operatorOpt.get} used within the query" + throwError(errorMsg)(plan) + } + } + } + } + def checkForStreaming(plan: LogicalPlan, outputMode: OutputMode): Unit = { if (!plan.isStreaming) { throwError( @@ -198,6 +233,11 @@ object UnsupportedOperationChecker extends Logging { "DataFrames/Datasets")(plan) } + // check to see that if store encoding format is set to true, then we have no stateful + // operators in the query or only variants of operators that support avro encoding such as + // transformWithState. 
+ checkSupportedStoreEncodingFormats(plan) + val aggregates = collectStreamingAggregates(plan) // Disallow some output mode outputMode match { diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ViewResolution.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ViewResolution.scala new file mode 100644 index 0000000000000..89ef29ddaaf1c --- /dev/null +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ViewResolution.scala @@ -0,0 +1,55 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.catalyst.analysis + +import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, View} +import org.apache.spark.sql.errors.QueryCompilationErrors +import org.apache.spark.sql.internal.SQLConf + +object ViewResolution { + def resolve( + view: View, + resolveChild: LogicalPlan => LogicalPlan, + checkAnalysis: LogicalPlan => Unit): View = { + // The view's child should be a logical plan parsed from the `desc.viewText`, the variable + // `viewText` should be defined, or else we throw an error on the generation of the View + // operator. + + // Resolve all the UnresolvedRelations and Views in the child. 
+ val newChild = AnalysisContext.withAnalysisContext(view.desc) { + val nestedViewDepth = AnalysisContext.get.nestedViewDepth + val maxNestedViewDepth = AnalysisContext.get.maxNestedViewDepth + if (nestedViewDepth > maxNestedViewDepth) { + throw QueryCompilationErrors.viewDepthExceedsMaxResolutionDepthError( + view.desc.identifier, + maxNestedViewDepth, + view + ) + } + SQLConf.withExistingConf(View.effectiveSQLConf(view.desc.viewSQLConfigs, view.isTempView)) { + resolveChild(view.child) + } + } + + // Fail the analysis eagerly because outside AnalysisContext, the unresolved operators + // inside a view maybe resolved incorrectly. + checkAnalysis(newChild) + + view.copy(child = newChild) + } +} diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/executeImmediate.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/executeImmediate.scala index c92171ec5c750..b452ca15bed58 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/executeImmediate.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/executeImmediate.scala @@ -54,15 +54,18 @@ class SubstituteExecuteImmediate(val catalogManager: CatalogManager) def resolveVariable(e: Expression): Expression = { /** - * We know that the expression is either UnresolvedAttribute or Alias, as passed from the - * parser. If it is an UnresolvedAttribute, we look it up in the catalog and return it. If it - * is an Alias, we resolve the child and return an Alias with the same name. + * We know that the expression is either UnresolvedAttribute, Alias or Parameter, as passed from + * the parser. If it is an UnresolvedAttribute, we look it up in the catalog and return it. If + * it is an Alias, we resolve the child and return an Alias with the same name. If it is + * a Parameter, we leave it as is because the parameter belongs to another parameterized + * query and should be resolved later. 
*/ e match { case u: UnresolvedAttribute => getVariableReference(u, u.nameParts) case a: Alias => Alias(resolveVariable(a.child), a.name)() + case p: Parameter => p case other => throw QueryCompilationErrors.unsupportedParameterExpression(other) } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/parameters.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/parameters.scala index f24227abbb651..2cfc2a8c90dc5 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/parameters.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/parameters.scala @@ -21,7 +21,7 @@ import org.apache.spark.SparkException import org.apache.spark.sql.catalyst.expressions.{Alias, CreateArray, CreateMap, CreateNamedStruct, Expression, LeafExpression, Literal, MapFromArrays, MapFromEntries, SubqueryExpression, Unevaluable, VariableReference} import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, SupervisingCommand} import org.apache.spark.sql.catalyst.rules.Rule -import org.apache.spark.sql.catalyst.trees.TreePattern.{COMMAND, PARAMETER, PARAMETERIZED_QUERY, TreePattern, UNRESOLVED_IDENTIFIER_WITH_CTE, UNRESOLVED_WITH} +import org.apache.spark.sql.catalyst.trees.TreePattern.{COMMAND, PARAMETER, PARAMETERIZED_QUERY, TreePattern, UNRESOLVED_WITH} import org.apache.spark.sql.errors.QueryErrorsBase import org.apache.spark.sql.types.DataType @@ -104,18 +104,6 @@ case class PosParameterizedQuery(child: LogicalPlan, args: Seq[Expression]) copy(child = newChild) } -/** - * Base class for rules that process parameterized queries. 
- */ -abstract class ParameterizedQueryProcessor extends Rule[LogicalPlan] { - def assertUnresolvedPlanHasSingleParameterizedQuery(plan: LogicalPlan): Unit = { - if (plan.containsPattern(PARAMETERIZED_QUERY)) { - val parameterizedQueries = plan.collect { case p: ParameterizedQuery => p } - assert(parameterizedQueries.length == 1) - } - } -} - /** * Moves `ParameterizedQuery` inside `SupervisingCommand` for their supervised plans to be * resolved later by the analyzer. @@ -127,10 +115,8 @@ abstract class ParameterizedQueryProcessor extends Rule[LogicalPlan] { * `PosParameterizedQuery(ExplainCommand(ExplainCommand(SomeQuery(...))))` => * `ExplainCommand(ExplainCommand(PosParameterizedQuery(SomeQuery(...))))` */ -object MoveParameterizedQueriesDown extends ParameterizedQueryProcessor { +object MoveParameterizedQueriesDown extends Rule[LogicalPlan] { override def apply(plan: LogicalPlan): LogicalPlan = { - assertUnresolvedPlanHasSingleParameterizedQuery(plan) - plan.resolveOperatorsWithPruning(_.containsPattern(PARAMETERIZED_QUERY)) { case pq: ParameterizedQuery if pq.exists(isSupervisingCommand) => moveParameterizedQueryIntoSupervisingCommand(pq) @@ -161,7 +147,7 @@ object MoveParameterizedQueriesDown extends ParameterizedQueryProcessor { * by collection constructor functions such as `map()`, `array()`, `struct()` * from the user-specified arguments. 
*/ -object BindParameters extends ParameterizedQueryProcessor with QueryErrorsBase { +object BindParameters extends Rule[LogicalPlan] with QueryErrorsBase { private def checkArgs(args: Iterable[(String, Expression)]): Unit = { def isNotAllowed(expr: Expression): Boolean = expr.exists { case _: Literal | _: CreateArray | _: CreateNamedStruct | @@ -176,20 +162,23 @@ object BindParameters extends ParameterizedQueryProcessor with QueryErrorsBase { } } - private def bind(p: LogicalPlan)(f: PartialFunction[Expression, Expression]): LogicalPlan = { - p.resolveExpressionsWithPruning(_.containsPattern(PARAMETER)) (f orElse { - case sub: SubqueryExpression => sub.withNewPlan(bind(sub.plan)(f)) - }) + private def bind(p0: LogicalPlan)(f: PartialFunction[Expression, Expression]): LogicalPlan = { + var stop = false + p0.resolveOperatorsDownWithPruning(_.containsPattern(PARAMETER) && !stop) { + case p1 => + stop = p1.isInstanceOf[ParameterizedQuery] + p1.transformExpressionsWithPruning(_.containsPattern(PARAMETER)) (f orElse { + case sub: SubqueryExpression => sub.withNewPlan(bind(sub.plan)(f)) + }) + } } override def apply(plan: LogicalPlan): LogicalPlan = { - assertUnresolvedPlanHasSingleParameterizedQuery(plan) - plan.resolveOperatorsWithPruning(_.containsPattern(PARAMETERIZED_QUERY)) { // We should wait for `CTESubstitution` to resolve CTE before binding parameters, as CTE // relations are not children of `UnresolvedWith`. 
case NameParameterizedQuery(child, argNames, argValues) - if !child.containsAnyPattern(UNRESOLVED_WITH, UNRESOLVED_IDENTIFIER_WITH_CTE) && + if !child.containsPattern(UNRESOLVED_WITH) && argValues.forall(_.resolved) => if (argNames.length != argValues.length) { throw SparkException.internalError(s"The number of argument names ${argNames.length} " + @@ -200,7 +189,7 @@ object BindParameters extends ParameterizedQueryProcessor with QueryErrorsBase { bind(child) { case NamedParameter(name) if args.contains(name) => args(name) } case PosParameterizedQuery(child, args) - if !child.containsAnyPattern(UNRESOLVED_WITH, UNRESOLVED_IDENTIFIER_WITH_CTE) && + if !child.containsPattern(UNRESOLVED_WITH) && args.forall(_.resolved) => val indexedArgs = args.zipWithIndex checkArgs(indexedArgs.map(arg => (s"_${arg._2}", arg._1))) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/resolver/AliasResolver.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/resolver/AliasResolver.scala new file mode 100644 index 0000000000000..7b652437dbd8b --- /dev/null +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/resolver/AliasResolver.scala @@ -0,0 +1,142 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.catalyst.analysis.resolver + +import org.apache.spark.sql.catalyst.analysis.{AliasResolution, UnresolvedAlias} +import org.apache.spark.sql.catalyst.expressions.{ + Alias, + Cast, + CreateNamedStruct, + Expression, + NamedExpression +} + +/** + * Resolver class that resolves unresolved aliases and handles user-specified aliases. + */ +class AliasResolver(expressionResolver: ExpressionResolver, scopes: NameScopeStack) + extends TreeNodeResolver[UnresolvedAlias, Expression] + with ResolvesExpressionChildren { + + /** + * Resolves [[UnresolvedAlias]] by handling two specific cases: + * - Alias(CreateNamedStruct(...)) - instead of calling [[CreateNamedStructResolver]] which will + * clean up its inner aliases, we manually resolve [[CreateNamedStruct]]'s children, because we + * need to preserve inner aliases until after the alias name is computed. This is a hack because + * fixed-point analyzer computes [[Alias]] name before removing inner aliases. + * - Alias(...) - recursively call [[ExpressionResolver]] to resolve the child expression. + * + * After the children are resolved, call [[AliasResolution]] to compute the alias name. Finally, + * clean up inner aliases from [[CreateNamedStruct]]. + */ + override def resolve(unresolvedAlias: UnresolvedAlias): NamedExpression = { + val aliasWithResolvedChildren = withResolvedChildren( + unresolvedAlias, { + case createNamedStruct: CreateNamedStruct => + withResolvedChildren(createNamedStruct, expressionResolver.resolve) + case other => expressionResolver.resolve(other) + } + ) + + val resolvedAlias = + AliasResolution.resolve(aliasWithResolvedChildren).asInstanceOf[NamedExpression] + + scopes.top.addAlias(resolvedAlias.name) + AliasResolver.cleanupAliases(resolvedAlias) + } + + /** + * Handle already resolved [[Alias]] nodes, i.e. user-specified aliases. 
We disallow stacking + * of [[Alias]] nodes by collapsing them so that only the top node remains. + * + * For an example query like: + * + * {{{ SELECT 1 AS a }}} + * + * parsed plan will be: + * + * Project [Alias(1, a)] + * +- OneRowRelation + * + */ + def handleResolvedAlias(alias: Alias): Alias = { + val aliasWithResolvedChildren = withResolvedChildren(alias, expressionResolver.resolve) + scopes.top.addAlias(aliasWithResolvedChildren.name) + AliasResolver.collapseAlias(aliasWithResolvedChildren) + } +} + +object AliasResolver { + + /** + * For a query like: + * + * {{{ SELECT STRUCT(1 AS a, 2 AS b) AS st }}} + * + * After resolving [[CreateNamedStruct]] the plan will be: + * CreateNamedStruct(Seq("a", Alias(1, "a"), "b", Alias(2, "b"))) + * + * For a query like: + * + * {{{ df.select($"col1".cast("int").cast("double")) }}} + * + * After resolving top-most [[Alias]] the plan will be: + * Alias(Cast(Alias(Cast(col1, int), col1)), double), col1) + * + * Both examples contain inner aliases that are not expected in the analyzed logical plan, + * therefore need to be removed. However, in both examples inner aliases are necessary in order + * for the outer alias to compute its name. To achieve this, we delay removal of inner aliases + * until after the outer alias name is computed. + * + * For cases where there are no dependencies on inner alias, inner alias should be removed by the + * resolver that produces it. + */ + private def cleanupAliases(namedExpression: NamedExpression): NamedExpression = + namedExpression + .withNewChildren(namedExpression.children.map { + case cast @ Cast(alias: Alias, _, _, _) => + cast.copy(child = alias.child) + case createNamedStruct: CreateNamedStruct => + CreateNamedStructResolver.cleanupAliases(createNamedStruct) + case other => other + }) + .asInstanceOf[NamedExpression] + + /** + * If an [[Alias]] node appears on top of another [[Alias]], remove the bottom one. 
Here we don't + * handle a case where a node of different type appears between two [[Alias]] nodes: in this + * case, removal of inner alias (if it is unnecessary) should be handled by respective node's + * resolver, in order to preserve the bottom-up contract. + */ + private def collapseAlias(alias: Alias): Alias = + alias.child match { + case innerAlias: Alias => + val metadata = if (alias.metadata.isEmpty) { + None + } else { + Some(alias.metadata) + } + alias.copy(child = innerAlias.child)( + exprId = alias.exprId, + qualifier = alias.qualifier, + explicitMetadata = metadata, + nonInheritableMetadataKeys = alias.nonInheritableMetadataKeys + ) + case _ => alias + } +} diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/resolver/AnalyzerBridgeState.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/resolver/AnalyzerBridgeState.scala new file mode 100644 index 0000000000000..d3e93c82dfa21 --- /dev/null +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/resolver/AnalyzerBridgeState.scala @@ -0,0 +1,40 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
/**
 * The [[AnalyzerBridgeState]] is a state passed from the legacy fixed-point [[Analyzer]] to the
 * single-pass [[Resolver]].
 *
 * @param relationsWithResolvedMetadata A map from [[UnresolvedRelation]] to the relations with
 *   resolved metadata. It allows us to reuse the relation metadata and avoid duplicate
 *   catalog/table lookups in dual-run mode (when
 *   [[ANALYZER_SINGLE_PASS_RESOLVER_RELATION_BRIDGING_ENABLED]] is true).
 */
case class AnalyzerBridgeState(
    relationsWithResolvedMetadata: AnalyzerBridgeState.RelationsWithResolvedMetadata =
      new AnalyzerBridgeState.RelationsWithResolvedMetadata)

object AnalyzerBridgeState {
  // Mutable java.util.HashMap keyed by the original unresolved relation; values are the
  // corresponding relations whose metadata was resolved by the fixed-point run.
  type RelationsWithResolvedMetadata = HashMap[UnresolvedRelation, LogicalPlan]
}
/**
 * The [[AttributeScopeStack]] is used to validate that the attribute which was encountered by the
 * [[ExpressionResolutionValidator]] is in the current operator's visibility scope. We use
 * [[AttributeSet]] as the scope implementation here to check the equality of attributes based on
 * their expression IDs.
 *
 * E.g. for the following SQL query:
 * {{{
 * SELECT a, a, a + col2 FROM (SELECT col1 as a, col2 FROM VALUES (1, 2));
 * }}}
 *
 * Having the following logical plan:
 * {{{
 * Project [a#2, a#2, (a#2 + col2#1) AS (a + col2)#3]
 * +- SubqueryAlias __auto_generated_subquery_name
 *    +- Project [col1#0 AS a#2, col2#1]
 *       +- LocalRelation [col1#0, col2#1]
 * }}}
 *
 * The [[LocalRelation]] outputs attributes with IDs #0 and #1, which can be referenced by the
 * lower [[Project]]. This [[Project]] produces a new attribute ID #2 for an alias and retains the
 * old ID #1 for col2. The upper [[Project]] references `a` twice using the same ID #2 and
 * produces a new ID #3 for an alias of `a + col2`.
 */
class AttributeScopeStack {
  // Stack of attribute scopes; the top entry is the scope relevant to the operator currently
  // being validated. A fresh (empty) scope is pushed for the root on construction.
  private val stack = new ArrayDeque[AttributeSet]
  push()

  /**
   * Get the relevant attribute scope in the context of the current operator.
   */
  def top: AttributeSet = {
    stack.peek()
  }

  /**
   * Overwrite the current relevant scope with a sequence of attributes which is the output of
   * some operator. `attributes` can have duplicate IDs if the output of the operator contains
   * multiple occurrences of the same attribute.
   */
  def overwriteTop(attributes: Seq[Attribute]): Unit = {
    stack.pop()
    stack.push(AttributeSet(attributes))
  }

  /**
   * Execute `body` in the context of a fresh attribute scope and return its result. Used by
   * [[Project]] and [[Aggregate]] validation code since those operators introduce a new scope
   * with fresh expression IDs.
   *
   * Previously the type parameter `R` was declared but the result of `body` was discarded
   * (return type `Unit`); we now propagate the result. Callers that use this method in
   * statement position are unaffected. The scope is popped even if `body` throws.
   */
  def withNewScope[R](body: => R): R = {
    push()
    try {
      body
    } finally {
      pop()
    }
  }

  // Push an empty scope onto the stack.
  private def push(): Unit = {
    stack.push(AttributeSet(Seq.empty))
  }

  // Discard the current top scope.
  private def pop(): Unit = {
    stack.pop()
  }
}
/**
 * [[BinaryArithmeticResolver]] is invoked by [[ExpressionResolver]] in order to resolve
 * [[BinaryArithmetic]] nodes. During resolution, calling
 * [[BinaryArithmeticWithDatetimeResolver]] and applying type coercion can result in
 * [[BinaryArithmetic]] producing some other type of node or a subtree of nodes. In such cases a
 * downwards traversal is necessary, but not going deeper than the original expression's
 * children, since all nodes below that point are guaranteed to be already resolved.
 *
 * For example, given a query:
 *
 *   SELECT '4 11:11' - INTERVAL '4 22:12' DAY TO MINUTE
 *
 * [[BinaryArithmeticResolver]] is called for the following expression:
 *
 *   Subtract(
 *     Literal('4 11:11', StringType),
 *     Literal(Interval('4 22:12' DAY TO MINUTE), DayTimeIntervalType(0,2))
 *   )
 *
 * After calling [[BinaryArithmeticWithDatetimeResolver]] and applying type coercion,
 * the expression is transformed into:
 *
 *   Cast(
 *     DatetimeSub(
 *       TimeAdd(
 *         Literal('4 11:11', StringType),
 *         UnaryMinus(
 *           Literal(Interval('4 22:12' DAY TO MINUTE), DayTimeIntervalType(0,2))
 *         )
 *       )
 *     )
 *   )
 *
 * A single [[Subtract]] node is replaced with a subtree of nodes. In order to resolve this
 * subtree we need to invoke [[ExpressionResolver]] recursively on the top-most node's children.
 * The top-most node itself is not resolved recursively in order to avoid recursive calls to
 * [[BinaryArithmeticResolver]] and other sub-resolvers. To prevent a case where we resolve the
 * same node twice, we need to mark nodes that will act as a limit for the downwards traversal by
 * applying a [[ExpressionResolver.SINGLE_PASS_SUBTREE_BOUNDARY]] tag to them. These children
 * along with all the nodes below them are guaranteed to be resolved at this point. When
 * [[ExpressionResolver]] reaches one of the tagged nodes, it returns identity rather than
 * resolving it. Finally, after resolving the subtree, we need to resolve the top-most node
 * itself, which in this case means applying a timezone, if necessary.
 */
class BinaryArithmeticResolver(
    expressionResolver: ExpressionResolver,
    timezoneAwareExpressionResolver: TimezoneAwareExpressionResolver)
  extends TreeNodeResolver[BinaryArithmetic, Expression]
  with ProducesUnresolvedSubtree {

  // Captured once at construction; controls whether resolved nodes are reallocated so that the
  // resolved-nodes tracker sees unique objects.
  private val shouldTrackResolvedNodes =
    conf.getConf(SQLConf.ANALYZER_SINGLE_PASS_TRACK_RESOLVED_NODES_ENABLED)

  // The rule list (and its order) differs between ANSI and non-ANSI modes; see the companion
  // object. Order is significant and must stay in sync with [[TypeCoercion]]/[[AnsiTypeCoercion]].
  private val typeCoercionRules: Seq[Expression => Expression] =
    if (conf.ansiEnabled) {
      BinaryArithmeticResolver.ANSI_TYPE_COERCION_RULES
    } else {
      BinaryArithmeticResolver.TYPE_COERCION_RULES
    }
  private val typeCoercionResolver: TypeCoercionResolver =
    new TypeCoercionResolver(timezoneAwareExpressionResolver, typeCoercionRules)

  /**
   * Resolve a [[BinaryArithmetic]] node: resolve its children, transform the node (possibly into
   * a subtree — see class doc), resolve that subtree up to the tagged boundary, apply the session
   * timezone to the top-most node, and optionally reallocate for resolved-node tracking.
   */
  override def resolve(unresolvedBinaryArithmetic: BinaryArithmetic): Expression = {
    val binaryArithmeticWithResolvedChildren: BinaryArithmetic =
      withResolvedChildren(unresolvedBinaryArithmetic, expressionResolver.resolve)
    val binaryArithmeticWithResolvedSubtree: Expression =
      withResolvedSubtree(binaryArithmeticWithResolvedChildren, expressionResolver.resolve) {
        transformBinaryArithmeticNode(binaryArithmeticWithResolvedChildren)
      }
    val binaryArithmeticWithResolvedTimezone = timezoneAwareExpressionResolver.withResolvedTimezone(
      binaryArithmeticWithResolvedSubtree,
      conf.sessionLocalTimeZone
    )
    reallocateKnownNodesForTracking(binaryArithmeticWithResolvedTimezone)
  }

  /**
   * Transform a [[BinaryArithmetic]] node by calling
   * [[BinaryArithmeticWithDatetimeResolver]] and applying type coercion. The initial node can be
   * replaced with some other type of node or a subtree of nodes.
   */
  private def transformBinaryArithmeticNode(binaryArithmetic: BinaryArithmetic): Expression = {
    val binaryArithmeticWithDateTypeReplaced: Expression =
      replaceDateType(binaryArithmetic)
    val binaryArithmeticWithTypeCoercion: Expression =
      typeCoercionResolver.resolve(binaryArithmeticWithDateTypeReplaced)
    // In case that the original expression's children types are DateType and StringType,
    // fixed-point fails to resolve the expression with a single application of
    // [[BinaryArithmeticWithDatetimeResolver]]. Therefore, the single-pass resolver needs to
    // invoke [[BinaryArithmeticWithDatetimeResolver.resolve]], type coerce and only after that
    // fix the date/string case. Instead of invoking [[BinaryArithmeticWithDatetimeResolver]]
    // again, we handle the case directly.
    (
      binaryArithmetic.left.dataType,
      binaryArithmetic.right.dataType
    ) match {
      case (_: DateType, _: StringType) =>
        binaryArithmeticWithTypeCoercion match {
          case add: Add => DateAdd(add.left, add.right)
          case subtract: Subtract => SubtractDates(subtract.left, subtract.right)
          case other => other
        }
      case _ => binaryArithmeticWithTypeCoercion
    }
  }

  /**
   * When a DateType-like operand is given to [[BinaryArithmetic]], apply
   * [[BinaryArithmeticWithDatetimeResolver]] in order to replace the [[BinaryArithmetic]] with
   * the appropriate equivalent for DateTime types. Other expressions pass through unchanged.
   */
  private def replaceDateType(expression: Expression) = expression match {
    case arithmetic @ (_: Add | _: Subtract | _: Multiply | _: Divide) =>
      BinaryArithmeticWithDatetimeResolver.resolve(arithmetic)
    case other => other
  }

  /**
   * Since [[TracksResolvedNodes]] requires all the expressions in the tree to be unique objects,
   * we reallocate the known nodes in [[ANALYZER_SINGLE_PASS_TRACK_RESOLVED_NODES_ENABLED]] mode,
   * otherwise we preserve the old object to avoid unnecessary memory allocations.
   */
  private def reallocateKnownNodesForTracking(expression: Expression): Expression = {
    if (shouldTrackResolvedNodes) {
      expression match {
        case add: Add => add.copy()
        case subtract: Subtract => subtract.copy()
        case multiply: Multiply => multiply.copy()
        case divide: Divide => divide.copy()
        case _ => expression
      }
    } else {
      expression
    }
  }
}

object BinaryArithmeticResolver {
  // Ordering in the list of type coercions should be in sync with the list in [[TypeCoercion]].
  private val TYPE_COERCION_RULES: Seq[Expression => Expression] = Seq(
    StringPromotionTypeCoercion.apply,
    DecimalPrecisionTypeCoercion.apply,
    DivisionTypeCoercion.apply,
    IntegralDivisionTypeCoercion.apply,
    TypeCoercion.ImplicitTypeCoercion.apply,
    TypeCoercion.DateTimeOperationsTypeCoercion.apply
  )

  // Ordering in the list of type coercions should be in sync with the list in
  // [[AnsiTypeCoercion]].
  private val ANSI_TYPE_COERCION_RULES: Seq[Expression => Expression] = Seq(
    AnsiStringPromotionTypeCoercion.apply,
    DecimalPrecisionTypeCoercion.apply,
    DivisionTypeCoercion.apply,
    IntegralDivisionTypeCoercion.apply,
    AnsiTypeCoercion.ImplicitTypeCoercion.apply,
    AnsiTypeCoercion.AnsiDateTimeOperationsTypeCoercion.apply
  )
}
This is used in the single-pass [[Resolver]]
 * to avoid duplicate catalog/table lookups in dual-run mode: relation metadata produced by the
 * fixed-point [[Analyzer]] run is simply reused. We strictly rely on the [[AnalyzerBridgeState]]
 * to avoid any blocking calls here.
 */
class BridgedRelationMetadataProvider(
    override val catalogManager: CatalogManager,
    override val relationResolution: RelationResolution,
    analyzerBridgeState: AnalyzerBridgeState
) extends RelationMetadataProvider {
  override val relationsWithResolvedMetadata = reKeyBridgedRelations(analyzerBridgeState)

  /**
   * Re-key the bridge state's map: entries keyed by [[UnresolvedRelation]] become entries keyed
   * by the relation ID computed via `relationIdFromUnresolvedRelation`, while the resolved
   * relation values are carried over untouched.
   */
  private def reKeyBridgedRelations(
      bridgeState: AnalyzerBridgeState): RelationsWithResolvedMetadata = {
    val reKeyed = new RelationsWithResolvedMetadata
    bridgeState.relationsWithResolvedMetadata.forEach { (unresolvedRelation, resolvedRelation) =>
      reKeyed.put(relationIdFromUnresolvedRelation(unresolvedRelation), resolvedRelation)
    }
    reKeyed
  }
}
/**
 * Resolver for [[If]], [[CaseWhen]] and [[Coalesce]] expressions: resolves the children first,
 * then applies the mode-appropriate (ANSI or legacy) type coercion rules.
 */
class ConditionalExpressionResolver(
    expressionResolver: ExpressionResolver,
    timezoneAwareExpressionResolver: TimezoneAwareExpressionResolver)
  extends TreeNodeResolver[ConditionalExpression, Expression]
  with ResolvesExpressionChildren
  with SQLConfHelper {

  // The rule list is chosen once, at construction time, based on the session's ANSI mode.
  private val typeCoercionResolver: TypeCoercionResolver = {
    val rules: Seq[Expression => Expression] =
      if (conf.ansiEnabled) {
        ConditionalExpressionResolver.ANSI_TYPE_COERCION_RULES
      } else {
        ConditionalExpressionResolver.TYPE_COERCION_RULES
      }
    new TypeCoercionResolver(timezoneAwareExpressionResolver, rules)
  }

  /**
   * Resolve the conditional expression's children, then run type coercion over the result.
   */
  override def resolve(unresolvedConditionalExpression: ConditionalExpression): Expression = {
    val withChildrenResolved =
      withResolvedChildren(unresolvedConditionalExpression, expressionResolver.resolve)
    typeCoercionResolver.resolve(withChildrenResolved)
  }
}

object ConditionalExpressionResolver {
  // Ordering in the list of type coercions should be in sync with the list in [[TypeCoercion]].
  private val TYPE_COERCION_RULES: Seq[Expression => Expression] = Seq(
    TypeCoercion.CaseWhenTypeCoercion.apply,
    TypeCoercion.FunctionArgumentTypeCoercion.apply,
    TypeCoercion.IfTypeCoercion.apply
  )

  // Ordering in the list of type coercions should be in sync with the list in
  // [[AnsiTypeCoercion]].
  private val ANSI_TYPE_COERCION_RULES: Seq[Expression => Expression] = Seq(
    AnsiTypeCoercion.CaseWhenTypeCoercion.apply,
    AnsiTypeCoercion.FunctionArgumentTypeCoercion.apply,
    AnsiTypeCoercion.IfTypeCoercion.apply
  )
}
/**
 * Resolves [[CreateNamedStruct]] nodes by recursively resolving their children. If the
 * [[CreateNamedStruct]] is not directly under an [[Alias]], aliases are stripped from the struct
 * fields here; otherwise [[AliasResolver]] performs the removal after computing the outer name.
 */
class CreateNamedStructResolver(expressionResolver: ExpressionResolver)
  extends TreeNodeResolver[CreateNamedStruct, Expression]
  with ResolvesExpressionChildren {

  /**
   * Resolve all children of the struct, then strip the inner field aliases.
   */
  override def resolve(createNamedStruct: CreateNamedStruct): Expression = {
    val withChildrenResolved =
      withResolvedChildren(createNamedStruct, expressionResolver.resolve)
    CreateNamedStructResolver.cleanupAliases(withChildrenResolved)
  }
}

object CreateNamedStructResolver {

  /**
   * For a query like:
   *
   * {{{ SELECT STRUCT(1 AS a, 2 AS b) }}}
   *
   * [[CreateNamedStruct]] will be: CreateNamedStruct(Seq("a", Alias(1, "a"), "b", Alias(2, "b")))
   *
   * Because inner aliases are not expected in the analyzed logical plan, they are removed here —
   * but only aliases carrying no metadata, and only when the [[CreateNamedStruct]] is not
   * directly under an [[Alias]]. In that latter case the removal is deferred to
   * [[AliasResolver]]: in single-pass resolution the [[Alias]] is resolved after the
   * [[CreateNamedStruct]] and needs the complete child structure to compute its output name.
   */
  def cleanupAliases(createNamedStruct: CreateNamedStruct): CreateNamedStruct = {
    val cleanedChildren = createNamedStruct.children.map {
      case alias: Alias if alias.metadata.isEmpty => alias.child
      case other => other
    }
    createNamedStruct.withNewChildren(cleanedChildren).asInstanceOf[CreateNamedStruct]
  }
}
/**
 * [[DelegatesResolutionToExtensions]] provides a method to delegate the resolution of
 * unresolved operators to a list of [[ResolverExtension]]s.
 */
trait DelegatesResolutionToExtensions {

  protected val extensions: Seq[ResolverExtension]

  /**
   * Find the suitable extension for `unresolvedOperator` resolution and resolve the operator
   * with it. Usually extensions return resolved relation nodes, so we generically update the
   * name scope without matching for specific relations, for simplicity.
   *
   * Each extension is matched at most once to reduce the number of
   * [[ResolverExtension.resolveOperator.isDefinedAt]] calls, because those can be expensive.
   *
   * @return `Some(resolutionResult)` if an extension was found and `unresolvedOperator` was
   *   resolved, `None` otherwise.
   *
   * @throws `AMBIGUOUS_RESOLVER_EXTENSION` if several extensions matched this operator.
   */
  def tryDelegateResolutionToExtension(unresolvedOperator: LogicalPlan): Option[LogicalPlan] = {
    var resolved: Option[LogicalPlan] = None
    var firstMatch: Option[ResolverExtension] = None
    for (extension <- extensions) {
      firstMatch match {
        case None =>
          // No extension matched yet: attempt resolution directly (a single partial-function
          // application instead of isDefinedAt + apply).
          resolved = extension.resolveOperator.lift(unresolvedOperator)
          if (resolved.isDefined) {
            firstMatch = Some(extension)
          }
        case Some(winner) =>
          // Already resolved once: any further matching extension makes resolution ambiguous.
          if (extension.resolveOperator.isDefinedAt(unresolvedOperator)) {
            throw QueryCompilationErrors
              .ambiguousResolverExtension(
                unresolvedOperator,
                Seq(winner, extension).map(_.getClass.getSimpleName)
              )
              .withPosition(unresolvedOperator.origin)
          }
      }
    }

    resolved
  }
}
/**
 * An addon to the [[ResolverGuard]] functionality for features that cannot be determined by only
 * looking at the unresolved plan. The [[Resolver]] throws this control-flow exception when it
 * encounters an explicitly unsupported feature. What happens next depends on
 * [[HybridAnalyzer.checkSupportedSinglePassFeatures]]:
 *  - If it is true: the exception is caught by [[HybridAnalyzer]] to abort single-pass analysis
 *    without comparing single-pass and fixed-point results. The motivation is the same as for
 *    the [[ResolverGuard]] — an explicit allowlist of unimplemented features we are aware of,
 *    with `UNSUPPORTED_SINGLE_PASS_ANALYZER_FEATURE` signaling the remaining gaps.
 *  - If it is false: the exception is thrown by the [[HybridAnalyzer]] to get a better sense of
 *    coverage.
 *
 * For example, [[UnresolvedRelation]] can be intermediately resolved by [[ResolveRelations]] as
 * an [[UnresolvedCatalogRelation]] or a [[View]] (among others). Say views are not implemented
 * yet and we are aware of that: [[ExplicitlyUnsupportedResolverFeature]] is thrown mid-analysis
 * to abort it.
 */
class ExplicitlyUnsupportedResolverFeature(reason: String)
  extends Exception(
    "The single-pass analyzer cannot process this query or command because it does not yet " +
    s"support $reason."
  ) {
  // This exception is pure control flow: suppress stack-trace capture to keep throwing cheap.
  override def getStackTrace(): Array[StackTraceElement] = Array.empty[StackTraceElement]
  override def fillInStackTrace(): Throwable = this
}

/**
 * Metadata on explicitly unsupported resolver features.
 */
object ExplicitlyUnsupportedResolverFeature {
  // Fully-qualified class names of operators the single-pass resolver knowingly does not handle.
  val OPERATORS = Set(
    "org.apache.spark.sql.catalyst.plans.logical.View",
    "org.apache.spark.sql.catalyst.streaming.StreamingRelationV2",
    "org.apache.spark.sql.execution.datasources.v2.DataSourceV2Relation",
    "org.apache.spark.sql.execution.streaming.StreamingRelation"
  )
}
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.catalyst.analysis.resolver + +import org.apache.spark.sql.catalyst.expressions.{ + Alias, + ArrayDistinct, + ArrayInsert, + ArrayJoin, + ArrayMax, + ArrayMin, + ArraysZip, + AttributeReference, + BinaryExpression, + ConditionalExpression, + CreateArray, + CreateMap, + CreateNamedStruct, + Expression, + ExtractANSIIntervalDays, + GetArrayStructFields, + GetMapValue, + GetStructField, + Literal, + MapConcat, + MapContainsKey, + MapEntries, + MapFromEntries, + MapKeys, + MapValues, + NamedExpression, + Predicate, + RuntimeReplaceable, + StringRPad, + StringToMap, + TimeZoneAwareExpression, + UnaryMinus +} +import org.apache.spark.sql.types.BooleanType + +/** + * The [[ExpressionResolutionValidator]] performs the validation work on the expression tree for the + * [[ResolutionValidator]]. These two components work together recursively validating the + * logical plan. You can find more info in the [[ResolutionValidator]] scaladoc. + */ +class ExpressionResolutionValidator(resolutionValidator: ResolutionValidator) { + + /** + * Validate resolved expression tree. The principle is the same as + * [[ResolutionValidator.validate]]. 
+ */ + def validate(expression: Expression): Unit = { + expression match { + case attributeReference: AttributeReference => + validateAttributeReference(attributeReference) + case alias: Alias => + validateAlias(alias) + case getMapValue: GetMapValue => + validateGetMapValue(getMapValue) + case binaryExpression: BinaryExpression => + validateBinaryExpression(binaryExpression) + case extractANSIIntervalDay: ExtractANSIIntervalDays => + validateExtractANSIIntervalDays(extractANSIIntervalDay) + case literal: Literal => + validateLiteral(literal) + case predicate: Predicate => + validatePredicate(predicate) + case stringRPad: StringRPad => + validateStringRPad(stringRPad) + case unaryMinus: UnaryMinus => + validateUnaryMinus(unaryMinus) + case getStructField: GetStructField => + validateGetStructField(getStructField) + case createNamedStruct: CreateNamedStruct => + validateCreateNamedStruct(createNamedStruct) + case getArrayStructFields: GetArrayStructFields => + validateGetArrayStructFields(getArrayStructFields) + case createMap: CreateMap => + validateCreateMap(createMap) + case stringToMap: StringToMap => + validateStringToMap(stringToMap) + case mapContainsKey: MapContainsKey => + validateMapContainsKey(mapContainsKey) + case mapConcat: MapConcat => + validateMapConcat(mapConcat) + case mapKeys: MapKeys => + validateMapKeys(mapKeys) + case mapValues: MapValues => + validateMapValues(mapValues) + case mapEntries: MapEntries => + validateMapEntries(mapEntries) + case mapFromEntries: MapFromEntries => + validateMapFromEntries(mapFromEntries) + case createArray: CreateArray => + validateCreateArray(createArray) + case arrayDistinct: ArrayDistinct => + validateArrayDistinct(arrayDistinct) + case arrayInsert: ArrayInsert => + validateArrayInsert(arrayInsert) + case arrayJoin: ArrayJoin => + validateArrayJoin(arrayJoin) + case arrayMax: ArrayMax => + validateArrayMax(arrayMax) + case arrayMin: ArrayMin => + validateArrayMin(arrayMin) + case arraysZip: ArraysZip => + 
validateArraysZip(arraysZip) + case conditionalExpression: ConditionalExpression => + validateConditionalExpression(conditionalExpression) + case runtimeReplaceable: RuntimeReplaceable => + validateRuntimeReplaceable(runtimeReplaceable) + case timezoneExpression: TimeZoneAwareExpression => + validateTimezoneExpression(timezoneExpression) + } + } + + def validateProjectList(projectList: Seq[NamedExpression]): Unit = { + projectList.foreach { + case attributeReference: AttributeReference => + validateAttributeReference(attributeReference) + case alias: Alias => + validateAlias(alias) + } + } + + private def validatePredicate(predicate: Predicate) = { + predicate.children.foreach(validate) + assert( + predicate.dataType == BooleanType, + s"Output type of a predicate must be a boolean, but got: ${predicate.dataType.typeName}" + ) + assert( + predicate.checkInputDataTypes().isSuccess, + "Input types of a predicate must be valid, but got: " + + predicate.children.map(_.dataType.typeName).mkString(", ") + ) + } + + private def validateStringRPad(stringRPad: StringRPad) = { + validate(stringRPad.first) + validate(stringRPad.second) + validate(stringRPad.third) + assert( + stringRPad.checkInputDataTypes().isSuccess, + "Input types of rpad must be valid, but got: " + + stringRPad.children.map(_.dataType.typeName).mkString(", ") + ) + } + + private def validateAttributeReference(attributeReference: AttributeReference): Unit = { + assert( + resolutionValidator.attributeScopeStack.top.contains(attributeReference), + s"Attribute $attributeReference is missing from attribute scope: " + + s"${resolutionValidator.attributeScopeStack.top}" + ) + } + + private def validateAlias(alias: Alias): Unit = { + validate(alias.child) + } + + private def validateBinaryExpression(binaryExpression: BinaryExpression): Unit = { + validate(binaryExpression.left) + validate(binaryExpression.right) + assert( + binaryExpression.checkInputDataTypes().isSuccess, + "Input types of a binary expression 
must be valid, but got: " + + binaryExpression.children.map(_.dataType.typeName).mkString(", ") + ) + + binaryExpression match { + case timezoneExpression: TimeZoneAwareExpression => + assert(timezoneExpression.timeZoneId.nonEmpty, "Timezone expression must have a timezone") + case _ => + } + } + + private def validateConditionalExpression(conditionalExpression: ConditionalExpression): Unit = + conditionalExpression.children.foreach(validate) + + private def validateExtractANSIIntervalDays( + extractANSIIntervalDays: ExtractANSIIntervalDays): Unit = { + validate(extractANSIIntervalDays.child) + } + + private def validateLiteral(literal: Literal): Unit = {} + + private def validateUnaryMinus(unaryMinus: UnaryMinus): Unit = { + validate(unaryMinus.child) + assert( + unaryMinus.checkInputDataTypes().isSuccess, + "Input types of a unary minus must be valid, but got: " + + unaryMinus.child.dataType.typeName.mkString(", ") + ) + } + + private def validateGetStructField(getStructField: GetStructField): Unit = { + validate(getStructField.child) + } + + private def validateCreateNamedStruct(createNamedStruct: CreateNamedStruct): Unit = { + createNamedStruct.children.foreach(validate) + assert( + createNamedStruct.checkInputDataTypes().isSuccess, + "Input types of CreateNamedStruct must be valid, but got: " + + createNamedStruct.children.map(_.dataType.typeName).mkString(", ") + ) + } + + private def validateGetArrayStructFields(getArrayStructFields: GetArrayStructFields): Unit = { + validate(getArrayStructFields.child) + } + + private def validateGetMapValue(getMapValue: GetMapValue): Unit = { + validate(getMapValue.child) + validate(getMapValue.key) + assert( + getMapValue.checkInputDataTypes().isSuccess, + "Input types of GetMapValue must be valid, but got: " + + getMapValue.children.map(_.dataType.typeName).mkString(", ") + ) + } + + private def validateCreateMap(createMap: CreateMap): Unit = { + createMap.children.foreach(validate) + assert( + 
createMap.checkInputDataTypes().isSuccess, + "Input types of CreateMap must be valid, but got: " + + createMap.children.map(_.dataType.typeName).mkString(", ") + ) + } + + private def validateStringToMap(stringToMap: StringToMap): Unit = { + validate(stringToMap.text) + validate(stringToMap.pairDelim) + validate(stringToMap.keyValueDelim) + } + + private def validateMapContainsKey(mapContainsKey: MapContainsKey): Unit = { + validate(mapContainsKey.left) + validate(mapContainsKey.right) + assert( + mapContainsKey.checkInputDataTypes().isSuccess, + "Input types of MapContainsKey must be valid, but got: " + + mapContainsKey.children.map(_.dataType.typeName).mkString(", ") + ) + } + + private def validateMapConcat(mapConcat: MapConcat): Unit = { + mapConcat.children.foreach(validate) + assert( + mapConcat.checkInputDataTypes().isSuccess, + "Input types of MapConcat must be valid, but got: " + + mapConcat.children.map(_.dataType.typeName).mkString(", ") + ) + } + + private def validateMapKeys(mapKeys: MapKeys): Unit = { + validate(mapKeys.child) + } + + private def validateMapValues(mapValues: MapValues): Unit = { + validate(mapValues.child) + } + + private def validateMapEntries(mapEntries: MapEntries): Unit = { + validate(mapEntries.child) + } + + private def validateMapFromEntries(mapFromEntries: MapFromEntries): Unit = { + mapFromEntries.children.foreach(validate) + assert( + mapFromEntries.checkInputDataTypes().isSuccess, + "Input types of MapFromEntries must be valid, but got: " + + mapFromEntries.children.map(_.dataType.typeName).mkString(", ") + ) + } + + private def validateCreateArray(createArray: CreateArray): Unit = { + createArray.children.foreach(validate) + assert( + createArray.checkInputDataTypes().isSuccess, + "Input types of CreateArray must be valid, but got: " + + createArray.children.map(_.dataType.typeName).mkString(", ") + ) + } + + private def validateArrayDistinct(arrayDistinct: ArrayDistinct): Unit = { + validate(arrayDistinct.child) + assert( 
+ arrayDistinct.checkInputDataTypes().isSuccess, + "Input types of ArrayDistinct must be valid, but got: " + + arrayDistinct.children.map(_.dataType.typeName).mkString(", ") + ) + } + + private def validateArrayInsert(arrayInsert: ArrayInsert): Unit = { + validate(arrayInsert.srcArrayExpr) + validate(arrayInsert.posExpr) + validate(arrayInsert.itemExpr) + assert( + arrayInsert.checkInputDataTypes().isSuccess, + "Input types of ArrayInsert must be valid, but got: " + + arrayInsert.children.map(_.dataType.typeName).mkString(", ") + ) + } + + private def validateArrayJoin(arrayJoin: ArrayJoin): Unit = { + validate(arrayJoin.array) + validate(arrayJoin.delimiter) + if (arrayJoin.nullReplacement.isDefined) { + validate(arrayJoin.nullReplacement.get) + } + } + + private def validateArrayMax(arrayMax: ArrayMax): Unit = { + validate(arrayMax.child) + assert( + arrayMax.checkInputDataTypes().isSuccess, + "Input types of ArrayMax must be valid, but got: " + + arrayMax.children.map(_.dataType.typeName).mkString(", ") + ) + } + + private def validateArrayMin(arrayMin: ArrayMin): Unit = { + validate(arrayMin.child) + assert( + arrayMin.checkInputDataTypes().isSuccess, + "Input types of ArrayMin must be valid, but got: " + + arrayMin.children.map(_.dataType.typeName).mkString(", ") + ) + } + + private def validateArraysZip(arraysZip: ArraysZip): Unit = { + arraysZip.children.foreach(validate) + arraysZip.names.foreach(validate) + assert( + arraysZip.checkInputDataTypes().isSuccess, + "Input types of ArraysZip must be valid, but got: " + + arraysZip.children.map(_.dataType.typeName).mkString(", ") + ) + } + + private def validateRuntimeReplaceable(runtimeReplaceable: RuntimeReplaceable): Unit = { + runtimeReplaceable.children.foreach(validate) + } + + private def validateTimezoneExpression(timezoneExpression: TimeZoneAwareExpression): Unit = { + timezoneExpression.children.foreach(validate) + assert(timezoneExpression.timeZoneId.nonEmpty, "Timezone expression must have a 
timezone") + } +} diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/resolver/ExpressionResolver.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/resolver/ExpressionResolver.scala new file mode 100644 index 0000000000000..1d072509626b7 --- /dev/null +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/resolver/ExpressionResolver.scala @@ -0,0 +1,347 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.catalyst.analysis.resolver + +import org.apache.spark.sql.catalyst.analysis.{ + withPosition, + FunctionResolution, + UnresolvedAlias, + UnresolvedAttribute, + UnresolvedFunction, + UnresolvedStar +} +import org.apache.spark.sql.catalyst.expressions.{ + Alias, + AttributeReference, + BinaryArithmetic, + ConditionalExpression, + CreateNamedStruct, + Expression, + ExtractANSIIntervalDays, + InheritAnalysisRules, + Literal, + NamedExpression, + Predicate, + RuntimeReplaceable, + TimeAdd, + TimeZoneAwareExpression, + UnaryMinus +} +import org.apache.spark.sql.catalyst.trees.TreeNodeTag +import org.apache.spark.sql.errors.QueryCompilationErrors +import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.types.MetadataBuilder + +/** + * The [[ExpressionResolver]] is used by the [[Resolver]] during the analysis to resolve + * expressions. + * + * The functions here generally traverse unresolved [[Expression]] nodes recursively, + * constructing and returning the resolved [[Expression]] nodes bottom-up. + * This is the primary entry point for implementing expression analysis, + * wherein the [[resolve]] method accepts a fully unresolved [[Expression]] and returns + * a fully resolved [[Expression]] in response with all data types and attribute + * reference ID assigned for valid requests. This resolver also takes responsibility + * to detect any errors in the initial SQL query or DataFrame and return appropriate + * error messages including precise parse locations wherever possible. + * + * @param resolver [[Resolver]] is passed from the parent to resolve other + * operators which are nested in expressions. + * @param scopes [[NameScopeStack]] to resolve the expression tree in the correct scope. + * @param functionResolution [[FunctionResolution]] to resolve function expressions. + * @param planLogger [[PlanLogger]] to log expression tree resolution events. 
+ */ +class ExpressionResolver( + resolver: Resolver, + scopes: NameScopeStack, + functionResolution: FunctionResolution, + planLogger: PlanLogger) + extends TreeNodeResolver[Expression, Expression] + with ProducesUnresolvedSubtree + with ResolvesExpressionChildren + with TracksResolvedNodes[Expression] { + private val shouldTrackResolvedNodes = + conf.getConf(SQLConf.ANALYZER_SINGLE_PASS_TRACK_RESOLVED_NODES_ENABLED) + private val aliasResolver = new AliasResolver(this, scopes) + private val createNamedStructResolver = new CreateNamedStructResolver(this) + private val timezoneAwareExpressionResolver = new TimezoneAwareExpressionResolver(this) + private val conditionalExpressionResolver = + new ConditionalExpressionResolver(this, timezoneAwareExpressionResolver) + private val predicateResolver = + new PredicateResolver(this, timezoneAwareExpressionResolver) + private val binaryArithmeticResolver = { + new BinaryArithmeticResolver( + this, + timezoneAwareExpressionResolver + ) + } + private val functionResolver = new FunctionResolver( + this, + timezoneAwareExpressionResolver, + functionResolution + ) + private val timeAddResolver = new TimeAddResolver(this, timezoneAwareExpressionResolver) + private val unaryMinusResolver = new UnaryMinusResolver(this, timezoneAwareExpressionResolver) + + /** + * This method is an expression analysis entry point. The method first checks if the expression + * has already been resolved (necessary because of partially-unresolved subtrees, see + * [[ProducesUnresolvedSubtree]]). If not already resolved, method takes an unresolved + * [[Expression]] and chooses the right `resolve*` method using pattern matching on the + * `unresolvedExpression` type. This pattern matching enumerates all the expression node types + * that are supported by the single-pass analysis. 
+ * When developers introduce a new [[Expression]] type to the Catalyst, they should implement + * a corresponding `resolve*` method in the [[ExpressionResolver]] and add it to this pattern + * match list. + * + * [[resolve]] will be called recursively during the expression tree traversal eventually + * producing a fully resolved expression subtree or a descriptive error message. + * + * [[resolve]] can recursively call `resolver` to resolve nested operators (e.g. scalar + * subqueries): + * + * {{{ SELECT * FROM VALUES (1), (2) WHERE col1 IN (SELECT 1); }}} + * + * In this case `IN` is an expression and `SELECT 1` is a nested operator tree for which + * the [[ExpressionResolver]] would invoke the [[Resolver]]. + */ + override def resolve(unresolvedExpression: Expression): Expression = { + planLogger.logExpressionTreeResolutionEvent(unresolvedExpression, "Unresolved expression tree") + + if (unresolvedExpression + .getTagValue(ExpressionResolver.SINGLE_PASS_SUBTREE_BOUNDARY) + .nonEmpty) { + unresolvedExpression + } else { + throwIfNodeWasResolvedEarlier(unresolvedExpression) + + val resolvedExpression = unresolvedExpression match { + case unresolvedBinaryArithmetic: BinaryArithmetic => + binaryArithmeticResolver.resolve(unresolvedBinaryArithmetic) + case unresolvedExtractANSIIntervalDays: ExtractANSIIntervalDays => + resolveExtractANSIIntervalDays(unresolvedExtractANSIIntervalDays) + case unresolvedNamedExpression: NamedExpression => + resolveNamedExpression(unresolvedNamedExpression) + case unresolvedFunction: UnresolvedFunction => + functionResolver.resolve(unresolvedFunction) + case unresolvedLiteral: Literal => + resolveLiteral(unresolvedLiteral) + case unresolvedPredicate: Predicate => + predicateResolver.resolve(unresolvedPredicate) + case unresolvedTimeAdd: TimeAdd => + timeAddResolver.resolve(unresolvedTimeAdd) + case unresolvedUnaryMinus: UnaryMinus => + unaryMinusResolver.resolve(unresolvedUnaryMinus) + case createNamedStruct: CreateNamedStruct => + 
createNamedStructResolver.resolve(createNamedStruct) + case unresolvedConditionalExpression: ConditionalExpression => + conditionalExpressionResolver.resolve(unresolvedConditionalExpression) + case unresolvedRuntimeReplaceable: RuntimeReplaceable => + resolveRuntimeReplaceable(unresolvedRuntimeReplaceable) + case unresolvedTimezoneExpression: TimeZoneAwareExpression => + timezoneAwareExpressionResolver.resolve(unresolvedTimezoneExpression) + case _ => + withPosition(unresolvedExpression) { + throwUnsupportedSinglePassAnalyzerFeature(unresolvedExpression) + } + } + + markNodeAsResolved(resolvedExpression) + + planLogger.logExpressionTreeResolution(unresolvedExpression, resolvedExpression) + + resolvedExpression + } + } + + private def resolveNamedExpression( + unresolvedNamedExpression: Expression, + isTopOfProjectList: Boolean = false): Expression = + unresolvedNamedExpression match { + case alias: Alias => + aliasResolver.handleResolvedAlias(alias) + case unresolvedAlias: UnresolvedAlias => + aliasResolver.resolve(unresolvedAlias) + case unresolvedAttribute: UnresolvedAttribute => + resolveAttribute(unresolvedAttribute, isTopOfProjectList) + case unresolvedStar: UnresolvedStar => + withPosition(unresolvedStar) { + throwInvalidStarUsageError(unresolvedStar) + } + case attributeReference: AttributeReference => + handleResolvedAttributeReference(attributeReference) + case _ => + withPosition(unresolvedNamedExpression) { + throwUnsupportedSinglePassAnalyzerFeature(unresolvedNamedExpression) + } + } + + /** + * The [[Project]] list can contain different unresolved expressions before the resolution, which + * will be resolved using generic [[resolve]]. However, [[UnresolvedStar]] is a special case, + * because it is expanded into a sequence of [[NamedExpression]]s. Because of that this method + * returns a sequence and doesn't conform to generic [[resolve]] interface - it's called directly + * from the [[Resolver]] during [[Project]] resolution. 
+ * + * The output sequence can be larger than the input sequence due to [[UnresolvedStar]] expansion. + */ + def resolveProjectList(unresolvedProjectList: Seq[NamedExpression]): Seq[NamedExpression] = { + unresolvedProjectList.flatMap { + case unresolvedStar: UnresolvedStar => + resolveStar(unresolvedStar) + case other => + Seq(resolveNamedExpression(other, isTopOfProjectList = true).asInstanceOf[NamedExpression]) + } + } + + /** + * [[UnresolvedAttribute]] resolution relies on [[NameScope]] to lookup the attribute by its + * multipart name. The resolution can result in three different outcomes which are handled in the + * [[NameTarget.pickCandidate]]: + * + * - No results from the [[NameScope]] mean that the attribute lookup failed as in: + * {{{ SELECT col1 FROM (SELECT 1 as col2); }}} + * + * - Several results from the [[NameScope]] mean that the reference is ambiguous as in: + * {{{ SELECT col1 FROM (SELECT 1 as col1), (SELECT 2 as col1); }}} + * + * - Single result from the [[NameScope]] means that the attribute was found as in: + * {{{ SELECT col1 FROM VALUES (1); }}} + * + * If the attribute is at the top of the project list (which is indicated by + * [[isTopOfProjectList]]), we preserve the [[Alias]] or remove it otherwise. + */ + private def resolveAttribute( + unresolvedAttribute: UnresolvedAttribute, + isTopOfProjectList: Boolean): Expression = + withPosition(unresolvedAttribute) { + if (scopes.top.isExistingAlias(unresolvedAttribute.nameParts.head)) { + // Temporarily disable referencing aliases until we support LCA resolution. 
+ throw new ExplicitlyUnsupportedResolverFeature("unsupported expression: LateralColumnAlias") + } + + val nameTarget: NameTarget = scopes.top.matchMultipartName(unresolvedAttribute.nameParts) + + val candidate = nameTarget.pickCandidate(unresolvedAttribute) + if (isTopOfProjectList && nameTarget.aliasName.isDefined) { + Alias(candidate, nameTarget.aliasName.get)() + } else { + candidate + } + } + + /** + * [[AttributeReference]] is already resolved if it's passed to us from DataFrame `col(...)` + * function, for example. + */ + private def handleResolvedAttributeReference(attributeReference: AttributeReference) = + tryStripAmbiguousSelfJoinMetadata(attributeReference) + + /** + * [[ExtractANSIIntervalDays]] resolution doesn't require any specific resolution logic apart + * from resolving its children. + */ + private def resolveExtractANSIIntervalDays( + unresolvedExtractANSIIntervalDays: ExtractANSIIntervalDays) = + withResolvedChildren(unresolvedExtractANSIIntervalDays, resolve) + + /** + * [[UnresolvedStar]] resolution relies on the [[NameScope]]'s ability to get the attributes by a + * multipart name ([[UnresolvedStar]]'s `target` field): + * + * - Star target is defined: + * + * {{{ + * SELECT t.* FROM VALUES (1) AS t; + * -> + * Project [col1#19] + * }}} + * + * + * - Star target is not defined: + * + * {{{ + * SELECT * FROM (SELECT 1 as col1), (SELECT 2 as col2); + * -> + * Project [col1#19, col2#20] + * }}} + */ + def resolveStar(unresolvedStar: UnresolvedStar): Seq[NamedExpression] = + withPosition(unresolvedStar) { + scopes.top.expandStar(unresolvedStar) + } + + /** + * [[Literal]] resolution doesn't require any specific resolution logic at this point. + * + * Since [[TracksResolvedNodes]] requires all the expressions in the tree to be unique objects, + * we reallocate the literal in [[ANALYZER_SINGLE_PASS_TRACK_RESOLVED_NODES_ENABLED]] mode, + * otherwise we preserve the old object to avoid unnecessary memory allocations. 
+ */ + private def resolveLiteral(literal: Literal): Expression = { + if (shouldTrackResolvedNodes) { + literal.copy() + } else { + literal + } + } + + /** + * When [[RuntimeReplaceable]] is mixed in with [[InheritAnalysisRules]], child expression will + * be runtime replacement. In that case we need to resolve the children of the expression. + * otherwise, no resolution is necessary because replacement is already resolved. + */ + private def resolveRuntimeReplaceable(unresolvedRuntimeReplaceable: RuntimeReplaceable) = + unresolvedRuntimeReplaceable match { + case inheritAnalysisRules: InheritAnalysisRules => + withResolvedChildren(inheritAnalysisRules, resolve) + case other => other + } + + /** + * [[DetectAmbiguousSelfJoin]] rule in the fixed-point Analyzer detects ambiguous references in + * self-joins based on special metadata added by [[Dataset]] code (see SPARK-27547). Just strip + * this for now since we don't support joins yet. + */ + private def tryStripAmbiguousSelfJoinMetadata(attributeReference: AttributeReference) = { + val metadata = attributeReference.metadata + if (ExpressionResolver.AMBIGUOUS_SELF_JOIN_METADATA.exists(metadata.contains(_))) { + val metadataBuilder = new MetadataBuilder().withMetadata(metadata) + for (metadataKey <- ExpressionResolver.AMBIGUOUS_SELF_JOIN_METADATA) { + metadataBuilder.remove(metadataKey) + } + attributeReference.withMetadata(metadataBuilder.build()) + } else { + attributeReference + } + } + + private def throwUnsupportedSinglePassAnalyzerFeature(unresolvedExpression: Expression): Nothing = + throw QueryCompilationErrors.unsupportedSinglePassAnalyzerFeature( + s"${unresolvedExpression.getClass} expression resolution" + ) + + private def throwInvalidStarUsageError(unresolvedStar: UnresolvedStar): Nothing = + // TODO(vladimirg-db): Use parent operator name instead of "query" + throw QueryCompilationErrors.invalidStarUsageError("query", Seq(unresolvedStar)) +} + +object ExpressionResolver { + private val 
AMBIGUOUS_SELF_JOIN_METADATA = Seq("__dataset_id", "__col_position") + val SINGLE_PASS_SUBTREE_BOUNDARY = TreeNodeTag[Unit]("single_pass_subtree_boundary") +} diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/resolver/FunctionResolver.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/resolver/FunctionResolver.scala new file mode 100644 index 0000000000000..b7311b83e872e --- /dev/null +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/resolver/FunctionResolver.scala @@ -0,0 +1,91 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.catalyst.analysis.resolver + +import org.apache.spark.sql.catalyst.analysis.{ + AnsiTypeCoercion, + CollationTypeCoercion, + FunctionResolution, + TypeCoercion, + UnresolvedFunction, + UnresolvedStar +} +import org.apache.spark.sql.catalyst.expressions.Expression + +/** + * A resolver for [[UnresolvedFunction]]s that resolves functions to concrete [[Expression]]s. + * It resolves the children of the function first by calling [[ExpressionResolver.resolve]] on them + * if they are not [[UnresolvedStar]]s. 
If the children are [[UnresolvedStar]]s, it resolves them + * using [[ExpressionResolver.resolveStar]]. Examples are following: + * + * - Function doesn't contain any [[UnresolvedStar]]: + * {{{ SELECT ARRAY(col1) FROM VALUES (1); }}} + * it is resolved only using [[ExpressionResolver.resolve]]. + * - Function contains [[UnresolvedStar]]: + * {{{ SELECT ARRAY(*) FROM VALUES (1); }}} + * it is resolved using [[ExpressionResolver.resolveStar]]. + * + * It applies appropriate [[TypeCoercion]] (or [[AnsiTypeCoercion]]) rules after resolving the + * function using the [[FunctionResolution]] code. + */ +class FunctionResolver( + expressionResolver: ExpressionResolver, + timezoneAwareExpressionResolver: TimezoneAwareExpressionResolver, + functionResolution: FunctionResolution) + extends TreeNodeResolver[UnresolvedFunction, Expression] + with ProducesUnresolvedSubtree { + + private val typeCoercionRules: Seq[Expression => Expression] = + if (conf.ansiEnabled) { + FunctionResolver.ANSI_TYPE_COERCION_RULES + } else { + FunctionResolver.TYPE_COERCION_RULES + } + private val typeCoercionResolver: TypeCoercionResolver = + new TypeCoercionResolver(timezoneAwareExpressionResolver, typeCoercionRules) + + override def resolve(unresolvedFunction: UnresolvedFunction): Expression = { + val functionWithResolvedChildren = + unresolvedFunction.copy(arguments = unresolvedFunction.arguments.flatMap { + case s: UnresolvedStar => expressionResolver.resolveStar(s) + case other => Seq(expressionResolver.resolve(other)) + }) + val resolvedFunction = functionResolution.resolveFunction(functionWithResolvedChildren) + typeCoercionResolver.resolve(resolvedFunction) + } +} + +object FunctionResolver { + // Ordering in the list of type coercions should be in sync with the list in [[TypeCoercion]]. 
+ private val TYPE_COERCION_RULES: Seq[Expression => Expression] = Seq( + CollationTypeCoercion.apply, + TypeCoercion.InTypeCoercion.apply, + TypeCoercion.FunctionArgumentTypeCoercion.apply, + TypeCoercion.IfTypeCoercion.apply, + TypeCoercion.ImplicitTypeCoercion.apply + ) + + // Ordering in the list of type coercions should be in sync with the list in [[AnsiTypeCoercion]]. + private val ANSI_TYPE_COERCION_RULES: Seq[Expression => Expression] = Seq( + CollationTypeCoercion.apply, + AnsiTypeCoercion.InTypeCoercion.apply, + AnsiTypeCoercion.FunctionArgumentTypeCoercion.apply, + AnsiTypeCoercion.IfTypeCoercion.apply, + AnsiTypeCoercion.ImplicitTypeCoercion.apply + ) +} diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/resolver/HybridAnalyzer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/resolver/HybridAnalyzer.scala new file mode 100644 index 0000000000000..039c07f5edbc2 --- /dev/null +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/resolver/HybridAnalyzer.scala @@ -0,0 +1,226 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.catalyst.analysis.resolver + +import scala.util.control.NonFatal + +import org.apache.spark.sql.catalyst.{QueryPlanningTracker, SQLConfHelper} +import org.apache.spark.sql.catalyst.analysis.{AnalysisContext, Analyzer} +import org.apache.spark.sql.catalyst.plans.NormalizePlan +import org.apache.spark.sql.catalyst.plans.logical.{AnalysisHelper, LogicalPlan} +import org.apache.spark.sql.errors.QueryCompilationErrors +import org.apache.spark.sql.internal.SQLConf + +/** + * The HybridAnalyzer routes the unresolved logical plan between the legacy Analyzer and + * a single-pass Analyzer when the query that we are processing is being run from unit tests + * depending on the testing flags set and the structure of this unresolved logical plan: + * - If the "spark.sql.analyzer.singlePassResolver.soloRunEnabled" is "true", the + * [[HybridAnalyzer]] will unconditionally run the single-pass Analyzer, which would + * usually result in some unexpected behavior and failures. This flag is used only for + * development. + * - If the "spark.sql.analyzer.singlePassResolver.dualRunEnabled" is "true", the + * [[HybridAnalyzer]] will invoke the legacy analyzer and optionally _also_ the single-pass + * one depending on the structure of the unresolved plan. This decision is based on which + * features are supported by the single-pass Analyzer, and the checking is implemented in + * the [[ResolverGuard]]. After that we validate the results using the following + * logic: + * - If the fixed-point Analyzer fails and the single-pass one succeeds, we throw an + * appropriate exception (please check the + * [[QueryCompilationErrors.fixedPointFailedSinglePassSucceeded]] method) + * - If both the fixed-point and the single-pass Analyzers failed, we throw the exception + * from the fixed-point Analyzer. + * - If the single-pass Analyzer failed, we throw an exception from its failure. 
+ * - If both the fixed-point and the single-pass Analyzers succeeded, we compare the logical + * plans and output schemas, and return the resolved plan from the fixed-point Analyzer. + * - Otherwise we run the legacy analyzer. + * */ +class HybridAnalyzer( + legacyAnalyzer: Analyzer, + resolverGuard: ResolverGuard, + resolver: Resolver, + checkSupportedSinglePassFeatures: Boolean = true) + extends SQLConfHelper { + private var singlePassResolutionDuration: Option[Long] = None + private var fixedPointResolutionDuration: Option[Long] = None + + def apply(plan: LogicalPlan, tracker: QueryPlanningTracker): LogicalPlan = { + val dualRun = + conf.getConf(SQLConf.ANALYZER_DUAL_RUN_LEGACY_AND_SINGLE_PASS_RESOLVER) && + checkResolverGuard(plan) + + withTrackedAnalyzerBridgeState(dualRun) { + if (dualRun) { + resolveInDualRun(plan, tracker) + } else if (conf.getConf(SQLConf.ANALYZER_SINGLE_PASS_RESOLVER_ENABLED)) { + resolveInSinglePass(plan) + } else { + resolveInFixedPoint(plan, tracker) + } + } + } + + def getSinglePassResolutionDuration: Option[Long] = singlePassResolutionDuration + + def getFixedPointResolutionDuration: Option[Long] = fixedPointResolutionDuration + + /** + * Call `body` in the context of tracked [[AnalyzerBridgeState]]. Set the new bridge state + * depending on whether we are in dual-run mode or not: + * - If [[dualRun]] and [[ANALYZER_SINGLE_PASS_RESOLVER_RELATION_BRIDGING_ENABLED]] are true, + * create and set a new [[AnalyzerBridgeState]]. + * - Otherwise, reset [[AnalyzerBridgeState]]. + * + * Finally, set the bridge state back to the previous one after the `body` is executed to avoid + * disrupting the possible upper-level [[Analyzer]] invocation in case it's recursive + * [[Analyzer]] call. 
+ * */ + private def withTrackedAnalyzerBridgeState(dualRun: Boolean)( + body: => LogicalPlan): LogicalPlan = { + val bridgeRelations = dualRun && conf.getConf( + SQLConf.ANALYZER_SINGLE_PASS_RESOLVER_RELATION_BRIDGING_ENABLED + ) + + val prevSinglePassResolverBridgeState = AnalysisContext.get.getSinglePassResolverBridgeState + + AnalysisContext.get.setSinglePassResolverBridgeState(if (bridgeRelations) { + Some(new AnalyzerBridgeState) + } else { + None + }) + + try { + body + } finally { + AnalysisContext.get.setSinglePassResolverBridgeState(prevSinglePassResolverBridgeState) + } + } + + /** + * This method is used to run both the legacy Analyzer and single-pass Analyzer, + * and then compare the results or check the errors. For more context please check the + * [[HybridAnalyzer]] scaladoc. + * */ + private def resolveInDualRun(plan: LogicalPlan, tracker: QueryPlanningTracker): LogicalPlan = { + var fixedPointException: Option[Throwable] = None + val fixedPointResult = try { + val (resolutionDuration, result) = recordDuration { + Some(resolveInFixedPoint(plan, tracker)) + } + fixedPointResolutionDuration = Some(resolutionDuration) + result + } catch { + case NonFatal(e) => + fixedPointException = Some(e) + None + } + + var singlePassException: Option[Throwable] = None + val singlePassResult = try { + val (resolutionDuration, result) = recordDuration { + Some(resolveInSinglePass(plan)) + } + singlePassResolutionDuration = Some(resolutionDuration) + result + } catch { + case NonFatal(e) => + singlePassException = Some(e) + None + } + + fixedPointException match { + case Some(fixedPointEx) => + singlePassException match { + case Some(_) => + throw fixedPointEx + case None => + throw QueryCompilationErrors.fixedPointFailedSinglePassSucceeded( + singlePassResult.get, + fixedPointEx + ) + } + case None => + singlePassException match { + case Some(singlePassEx: ExplicitlyUnsupportedResolverFeature) + if checkSupportedSinglePassFeatures => + fixedPointResult.get + case 
Some(singlePassEx) => + throw singlePassEx + case None => + validateLogicalPlans(fixedPointResult.get, singlePassResult.get) + fixedPointResult.get + } + } + } + + /** + * This method is used to run the single-pass Analyzer which will return the resolved plan + * or throw an exception if the resolution fails. Both cases are handled in the caller method. + * */ + private def resolveInSinglePass(plan: LogicalPlan): LogicalPlan = { + val resolvedPlan = resolver.lookupMetadataAndResolve( + plan, + analyzerBridgeState = AnalysisContext.get.getSinglePassResolverBridgeState + ) + if (conf.getConf(SQLConf.ANALYZER_SINGLE_PASS_RESOLVER_VALIDATION_ENABLED)) { + val validator = new ResolutionValidator + validator.validatePlan(resolvedPlan) + } + resolvedPlan + } + + /** + * This method is used to run the legacy Analyzer which will return the resolved plan + * or throw an exception if the resolution fails. Both cases are handled in the caller method. + * */ + private def resolveInFixedPoint(plan: LogicalPlan, tracker: QueryPlanningTracker): LogicalPlan = { + val resolvedPlan = legacyAnalyzer.executeAndTrack(plan, tracker) + QueryPlanningTracker.withTracker(tracker) { + legacyAnalyzer.checkAnalysis(resolvedPlan) + } + resolvedPlan + } + + private def validateLogicalPlans(fixedPointResult: LogicalPlan, singlePassResult: LogicalPlan) = { + if (fixedPointResult.schema != singlePassResult.schema) { + throw QueryCompilationErrors.hybridAnalyzerOutputSchemaComparisonMismatch( + fixedPointResult.schema, + singlePassResult.schema + ) + } + if (normalizePlan(fixedPointResult) != normalizePlan(singlePassResult)) { + throw QueryCompilationErrors.hybridAnalyzerLogicalPlanComparisonMismatch( + fixedPointResult, + singlePassResult + ) + } + } + + private def normalizePlan(plan: LogicalPlan) = AnalysisHelper.allowInvokingTransformsInAnalyzer { + NormalizePlan(plan) + } + + private def checkResolverGuard(plan: LogicalPlan): Boolean = + !checkSupportedSinglePassFeatures || 
resolverGuard.apply(plan) + + private def recordDuration[T](thunk: => T): (Long, T) = { + val start = System.nanoTime() + val res = thunk + (System.nanoTime() - start, res) + } +} diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/resolver/IdentifierMap.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/resolver/IdentifierMap.scala new file mode 100644 index 0000000000000..899eb7d71e813 --- /dev/null +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/resolver/IdentifierMap.scala @@ -0,0 +1,37 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.catalyst.analysis.resolver + +import java.util.Locale + +/** + * The [[IdentifierMap]] is an implementation of a [[KeyTransformingMap]] that uses SQL/DataFrame + * identifiers as keys. The implementation is case-insensitive for keys. + */ +private class IdentifierMap[V] extends KeyTransformingMap[String, V] { + override def mapKey(key: String): String = key.toLowerCase(Locale.ROOT) +} + +/** + * The [[OptionalIdentifierMap]] is an implementation of a [[KeyTransformingMap]] that uses optional + * SQL/DataFrame identifiers as keys. 
The implementation is case-insensitive for non-empty keys. + */ +private class OptionalIdentifierMap[V] extends KeyTransformingMap[Option[String], V] { + override def mapKey(key: Option[String]): Option[String] = + key.map(_.toLowerCase(Locale.ROOT)) +} diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/resolver/KeyTransformingMap.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/resolver/KeyTransformingMap.scala new file mode 100644 index 0000000000000..ff6e118fcc3c9 --- /dev/null +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/resolver/KeyTransformingMap.scala @@ -0,0 +1,46 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.catalyst.analysis.resolver + +import scala.collection.mutable + +/** + * The [[KeyTransformingMap]] is a partial implementation of [[mutable.Map]] that transforms input + * keys with a custom [[mapKey]] method. 
+ */ +private abstract class KeyTransformingMap[K, V] { + private val impl = new mutable.HashMap[K, V] + + def get(key: K): Option[V] = impl.get(mapKey(key)) + + def contains(key: K): Boolean = impl.contains(mapKey(key)) + + def iterator: Iterator[(K, V)] = impl.iterator + + def +=(kv: (K, V)): this.type = { + impl += (mapKey(kv._1) -> kv._2) + this + } + + def -=(key: K): this.type = { + impl -= mapKey(key) + this + } + + def mapKey(key: K): K +} diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/resolver/LimitExpressionResolver.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/resolver/LimitExpressionResolver.scala new file mode 100644 index 0000000000000..a25616ba50b6a --- /dev/null +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/resolver/LimitExpressionResolver.scala @@ -0,0 +1,114 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.catalyst.analysis.resolver + +import org.apache.spark.sql.AnalysisException +import org.apache.spark.sql.catalyst.expressions.Expression +import org.apache.spark.sql.errors.QueryErrorsBase +import org.apache.spark.sql.types.IntegerType + +/** + * The [[LimitExpressionResolver]] is a resolver that resolves a [[LocalLimit]] or [[GlobalLimit]] + * expression and performs all the necessary validation. + */ +class LimitExpressionResolver(expressionResolver: TreeNodeResolver[Expression, Expression]) + extends TreeNodeResolver[Expression, Expression] + with QueryErrorsBase { + + /** + * Resolve a limit expression of [[GlobalLimit]] or [[LocalLimit]] and perform validation. + */ + override def resolve(unresolvedLimitExpression: Expression): Expression = { + val resolvedLimitExpression = expressionResolver.resolve(unresolvedLimitExpression) + validateLimitExpression(resolvedLimitExpression, expressionName = "limit") + resolvedLimitExpression + } + + /** + * Validate a resolved limit expression of [[GlobalLimit]] or [[LocalLimit]]: + * - The expression has to be foldable + * - The result data type has to be [[IntegerType]] + * - The evaluated expression has to be non-null + * - The evaluated expression has to be positive + * + * The `foldable` check is implemented in some expressions + * as a recursive expression tree traversal. 
+ * It is not an ideal approach for the single-pass [[ExpressionResolver]], + * but __is__ practical, since: + * - We have to call `eval` here anyway, and it's recursive + * - In practice `LIMIT` expression trees are very small + */ + private def validateLimitExpression(expression: Expression, expressionName: String): Unit = { + if (!expression.foldable) { + throwInvalidLimitLikeExpressionIsUnfoldable(expressionName, expression) + } + if (expression.dataType != IntegerType) { + throwInvalidLimitLikeExpressionDataType(expressionName, expression) + } + expression.eval() match { + case null => + throwInvalidLimitLikeExpressionIsNull(expressionName, expression) + case value: Int if value < 0 => + throwInvalidLimitLikeExpressionIsNegative(expressionName, expression, value) + case _ => + } + } + + private def throwInvalidLimitLikeExpressionIsUnfoldable( + name: String, + expression: Expression): Nothing = + throw new AnalysisException( + errorClass = "INVALID_LIMIT_LIKE_EXPRESSION.IS_UNFOLDABLE", + messageParameters = Map( + "name" -> name, + "expr" -> toSQLExpr(expression) + ), + origin = expression.origin + ) + + private def throwInvalidLimitLikeExpressionDataType( + name: String, + expression: Expression): Nothing = + throw new AnalysisException( + errorClass = "INVALID_LIMIT_LIKE_EXPRESSION.DATA_TYPE", + messageParameters = Map( + "name" -> name, + "expr" -> toSQLExpr(expression), + "dataType" -> toSQLType(expression.dataType) + ), + origin = expression.origin + ) + + private def throwInvalidLimitLikeExpressionIsNull(name: String, expression: Expression): Nothing = + throw new AnalysisException( + errorClass = "INVALID_LIMIT_LIKE_EXPRESSION.IS_NULL", + messageParameters = Map("name" -> name, "expr" -> toSQLExpr(expression)), + origin = expression.origin + ) + + private def throwInvalidLimitLikeExpressionIsNegative( + name: String, + expression: Expression, + value: Int): Nothing = + throw new AnalysisException( + errorClass = 
"INVALID_LIMIT_LIKE_EXPRESSION.IS_NEGATIVE", + messageParameters = + Map("name" -> name, "expr" -> toSQLExpr(expression), "v" -> toSQLValue(value, IntegerType)), + origin = expression.origin + ) +} diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/resolver/MetadataResolver.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/resolver/MetadataResolver.scala new file mode 100644 index 0000000000000..e1334fc56575e --- /dev/null +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/resolver/MetadataResolver.scala @@ -0,0 +1,133 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.catalyst.analysis.resolver + +import java.util.ArrayDeque + +import org.apache.spark.sql.catalyst.analysis.{withPosition, RelationResolution, UnresolvedRelation} +import org.apache.spark.sql.catalyst.expressions.{Expression, PlanExpression} +import org.apache.spark.sql.catalyst.plans.logical.{AnalysisHelper, LogicalPlan} +import org.apache.spark.sql.connector.catalog.CatalogManager + +/** + * The [[MetadataResolver]] performs relation metadata resolution based on the unresolved plan + * at the start of the analysis phase. 
Usually it does RPC calls to some table catalog and to table + * metadata itself. + * + * [[RelationsWithResolvedMetadata]] is a map from relation ID to the relations with resolved + * metadata. It's produced by [[resolve]] and is used later in [[Resolver]] to replace + * [[UnresolvedRelation]]s. + * + * This object is one-shot per SQL query or DataFrame program resolution. + */ +class MetadataResolver( + override val catalogManager: CatalogManager, + override val relationResolution: RelationResolution, + override val extensions: Seq[ResolverExtension] = Seq.empty) + extends RelationMetadataProvider + with DelegatesResolutionToExtensions { + override val relationsWithResolvedMetadata = new RelationsWithResolvedMetadata + + /** + * Resolves the relation metadata for `unresolvedPlan`. Usually this involves several blocking + * calls for the [[UnresolvedRelation]]s present in that tree. During the `unresolvedPlan` + * traversal we fill [[relationsWithResolvedMetadata]] with resolved metadata by relation id. + * This map will be used to resolve the plan in single-pass by the [[Resolver]] using + * [[getRelationWithResolvedMetadata]]. If the generic metadata resolution using + * [[RelationResolution]] wasn't successful, we resort to using [[extensions]]. + * Otherwise, we fail with an exception. + */ + def resolve(unresolvedPlan: LogicalPlan): Unit = { + traverseLogicalPlanTree(unresolvedPlan) { unresolvedOperator => + unresolvedOperator match { + case unresolvedRelation: UnresolvedRelation => + val relationId = relationIdFromUnresolvedRelation(unresolvedRelation) + + if (!relationsWithResolvedMetadata.containsKey(relationId)) { + val relationWithResolvedMetadata = resolveRelation(unresolvedRelation).orElse { + // In case the generic metadata resolution returned `None`, we try to check if any + // of the [[extensions]] matches this `unresolvedRelation`, and resolve it using + // that extension. 
+ tryDelegateResolutionToExtension(unresolvedRelation) + } + + relationWithResolvedMetadata match { + case Some(relationWithResolvedMetadata) => + relationsWithResolvedMetadata.put( + relationId, + relationWithResolvedMetadata + ) + case None => + withPosition(unresolvedRelation) { + unresolvedRelation.tableNotFound(unresolvedRelation.multipartIdentifier) + } + } + } + case _ => + } + } + } + + /** + * Resolves the metadata for the given unresolved relation and returns a relation with the + * resolved metadata. This method is blocking. + */ + private def resolveRelation(unresolvedRelation: UnresolvedRelation): Option[LogicalPlan] = + AnalysisHelper.allowInvokingTransformsInAnalyzer { + relationResolution.resolveRelation( + u = unresolvedRelation + ) + } + + /** + * Traverse the logical plan tree from `root` in a pre-order DFS manner and apply `visitor` to + * each node. + */ + private def traverseLogicalPlanTree(root: LogicalPlan)(visitor: LogicalPlan => Unit) = { + val stack = new ArrayDeque[Either[LogicalPlan, Expression]] + stack.push(Left(root)) + + while (!stack.isEmpty) { + stack.pop() match { + case Left(logicalPlan) => + visitor(logicalPlan) + + for (child <- logicalPlan.children) { + stack.push(Left(child)) + } + for (expression <- logicalPlan.expressions) { + stack.push(Right(expression)) + } + case Right(expression) => + for (child <- expression.children) { + stack.push(Right(child)) + } + + expression match { + case planExpression: PlanExpression[_] => + planExpression.plan match { + case plan: LogicalPlan => + stack.push(Left(plan)) + case _ => + } + case _ => + } + } + } + } +} diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/resolver/NameScope.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/resolver/NameScope.scala new file mode 100644 index 0000000000000..8abf4e04b8836 --- /dev/null +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/resolver/NameScope.scala @@ -0,0 
+1,393 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.catalyst.analysis.resolver + +import java.util.{ArrayDeque, ArrayList, HashSet} + +import scala.collection.mutable + +import org.apache.spark.sql.catalyst.SQLConfHelper +import org.apache.spark.sql.catalyst.analysis.{Resolver => NameComparator, UnresolvedStar} +import org.apache.spark.sql.catalyst.expressions.{ + Alias, + Attribute, + AttributeSeq, + Expression, + NamedExpression +} +import org.apache.spark.sql.errors.QueryCompilationErrors + +/** + * The [[NameScope]] is used during the analysis to control the visibility of names: plan names + * and output attributes. New [[NameScope]] can be created both in the [[Resolver]] and in + * the [[ExpressionResolver]] using the [[NameScopeStack]] api. The name resolution for identifiers + * is case-insensitive. 
+ * + * In this example: + * + * {{{ + * WITH table_1_cte AS ( + * SELECT + * col1, + * col2, + * col2 + * FROM + * table_1 + * ) + * SELECT + * table_1_cte.col1, + * table_2.col1 + * FROM + * table_1_cte + * INNER JOIN + * table_2 + * ON + * table_1_cte.col2 = table_2.col3 + * ; + * }}} + * + * there are two named subplans in the scope: table_1_cte -> [col1, col2, col2] and + * table_2 -> [col1, col3]. + * + * State breakout: + * - `planOutputs`: list of named plan outputs. Order matters here (e.g. to correctly expand `*`). + * Can contain duplicate names, since it's possible to select same column twice, or to select + * columns with the same name from different relations. [[OptionalIdentifierMap]] is used here, + * since some plans don't have an explicit name, so output attributes from those plans will reside + * under the `None` key. + * In our example it will be {{{ [(table_1_cte, [col1, col2, col2]), (table_2, [col1, col3])] }}} + * + * - `planNameToOffset`: mapping from plan output names to their offsets in the `planOutputs` array. + * It's used to lookup attributes by plan output names (multipart names are not supported yet). + * In our example it will be {{{ [table_1_cte -> 0, table_2 -> 1] }}} + */ +class NameScope extends SQLConfHelper { + private val planOutputs = new ArrayList[PlanOutput]() + private val planNameToOffset = new OptionalIdentifierMap[Int] + private val nameComparator: NameComparator = conf.resolver + private val existingAliases = new HashSet[String] + + /** + * Register the named plan output in this [[NameScope]]. The named plan is usually a + * [[NamedRelation]]. `attributes` sequence can contain duplicate names both for this named plan + * and for the scope in general, despite the fact that their further resolution _may_ throw an + * error in case of ambiguous reference. After calling this method, the code can lookup the + * attributes using `get*` methods of this [[NameScope]]. 
+ * + * Duplicate plan names are merged into the same [[PlanOutput]]. For example, this query: + * + * {{{ SELECT t.* FROM (SELECT * FROM VALUES (1)) as t, (SELECT * FROM VALUES (2)) as t; }}} + * + * will have the following output schema: + * + * {{{ [col1, col1] }}} + * + * Same logic applies for the unnamed plan outputs. This query: + * + * {{{ SELECT * FROM (SELECT * FROM VALUES (1)), (SELECT * FROM VALUES (2)); }}} + * + * will have the same output schema: + * + * {{{ [col1, col1] }}} + * + * @param name The name of this named plan. + * @param attributes The output of this named plan. Can contain duplicate names. + */ + def update(name: String, attributes: Seq[Attribute]): Unit = { + update(attributes, Some(name)) + } + + /** + * Register the unnamed plan output in this [[NameScope]]. Some examples of the unnamed plan are + * [[Project]] and [[Aggregate]]. + * + * See the [[update]] method for more details. + * + * @param attributes The output of the unnamed plan. Can contain duplicate names. + */ + def +=(attributes: Seq[Attribute]): Unit = { + update(attributes) + } + + /** + * Get all the attributes from all the plans registered in this [[NameScope]]. The output can + * contain duplicate names. This is used for star (`*`) resolution. + */ + def getAllAttributes: Seq[Attribute] = { + val attributes = new mutable.ArrayBuffer[Attribute] + + planOutputs.forEach(planOutput => { + attributes.appendAll(planOutput.attributes) + }) + + attributes.toSeq + } + + /** + * Expand the [[UnresolvedStar]] using `planOutputs`. The expected use case for this method is + * star expansion inside [[Project]]. Since [[Project]] has only one child, we assert that the + * size of `planOutputs` is 1, otherwise the query is malformed. 
+ * + * Some examples of queries with a star: + * + * - Star without a target: + * {{{ SELECT * FROM VALUES (1, 2, 3) AS t(a, b, c); }}} + * - Star with a multipart name target: + * {{{ SELECT catalog1.database1.table1.* FROM catalog1.database1.table1; }}} + * - Star with a struct target: + * {{{ SELECT d.* FROM VALUES (named_struct('a', 1, 'b', 2)) AS t(d); }}} + * - Star as an argument to a function: + * {{{ SELECT concat_ws('', *) AS result FROM VALUES (1, 2, 3) AS t(a, b, c); }}} + * + * It is resolved by correctly resolving the star qualifier. + * Please check [[UnresolvedStarBase.expandStar]] for more details. + * + * @param unresolvedStar [[UnresolvedStar]] to expand. + * @return The output of a plan expanded from the star. + */ + def expandStar(unresolvedStar: UnresolvedStar): Seq[NamedExpression] = { + if (planOutputs.size != 1) { + throw QueryCompilationErrors.invalidStarUsageError("query", Seq(unresolvedStar)) + } + + planOutputs.get(0).expandStar(unresolvedStar) + } + + /** + * Get all matched attributes by a multipart name. It returns [[Attribute]]s when we resolve a + * simple column or an alias name from a lower operator. However this function can also return + * [[Alias]]es in case we access a struct field or a map value using some key. + * + * Example that contains those major use-cases: + * + * {{{ + * SELECT col1, a, col2.field, col3.struct.field, col4.key + * FROM (SELECT *, col5 AS a FROM t); + * }}} + * + * has a Project list that looks like this: + * + * {{{ + * AttributeReference(col1), + * AttributeReference(a), + * Alias(col2.field, field), + * Alias(col3.struct.field, field), + * Alias(col4[CAST(key AS INT)], key) + * }}} + * + * Also, see [[AttributeSeq.resolve]] for more details. 
+ * + * Since there can be several identical attribute names for several named plans, this function + * can return multiple values: + * - 0 values: No matched attributes + * - 1 value: Unique attribute matched + * - 1+ values: Ambiguity, several attributes matched + * + * One example of a query with an attribute that has a multipart name: + * + * {{{ SELECT catalog1.database1.table1.col1 FROM catalog1.database1.table1; }}} + * + * @param multipartName Multipart attribute name. Can be of several forms: + * - `catalog.database.table.column` + * - `database.table.column` + * - `table.column` + * - `column` + * @return All the attributes matched by the `multipartName`, encapsulated in a [[NameTarget]]. + */ + def matchMultipartName(multipartName: Seq[String]): NameTarget = { + val candidates = new mutable.ArrayBuffer[Expression] + val allAttributes = new mutable.ArrayBuffer[Attribute] + var aliasName: Option[String] = None + + planOutputs.forEach(planOutput => { + allAttributes.appendAll(planOutput.attributes) + val nameTarget = planOutput.matchMultipartName(multipartName) + if (nameTarget.aliasName.isDefined) { + aliasName = nameTarget.aliasName + } + candidates.appendAll(nameTarget.candidates) + }) + + NameTarget(candidates.toSeq, aliasName, allAttributes.toSeq) + } + + /** + * Add an alias, by name, to the list of existing aliases. + */ + def addAlias(aliasName: String): Unit = existingAliases.add(aliasName.toLowerCase()) + + /** + * Returns whether an alias exists in the current scope. 
+ */ + def isExistingAlias(aliasName: String): Boolean = + existingAliases.contains(aliasName.toLowerCase()) + + private def update(attributes: Seq[Attribute], name: Option[String] = None): Unit = { + planNameToOffset.get(name) match { + case Some(index) => + val prevPlanOutput = planOutputs.get(index) + planOutputs.set( + index, + new PlanOutput(prevPlanOutput.attributes ++ attributes, name, nameComparator) + ) + case None => + val index = planOutputs.size + planOutputs.add(new PlanOutput(attributes, name, nameComparator)) + planNameToOffset += (name -> index) + } + } +} + +/** + * The [[NameScopeStack]] is a stack of [[NameScope]]s managed by the [[Resolver]] and the + * [[ExpressionResolver]]. Usually a top scope is used for name resolution, but in case of + * correlated subqueries we can lookup names in the parent scopes. Low-level scope creation is + * managed internally, and only high-level api like [[withNewScope]] is available to the resolvers. + * Freshly-created [[NameScopeStack]] contains an empty root [[NameScope]]. + */ +class NameScopeStack extends SQLConfHelper { + private val stack = new ArrayDeque[NameScope] + push() + + /** + * Get the top scope, which is a default choice for name resolution. + */ + def top: NameScope = { + stack.peek() + } + + /** + * Completely overwrite the top scope state with a named plan output. + * + * See [[NameScope.update]] for more details. + */ + def overwriteTop(name: String, attributes: Seq[Attribute]): Unit = { + val newScope = new NameScope + newScope.update(name, attributes) + + stack.pop() + stack.push(newScope) + } + + /** + * Completely overwrite the top scope state with an unnamed plan output. + * + * See [[NameScope.+=]] for more details. + */ + def overwriteTop(attributes: Seq[Attribute]): Unit = { + val newScope = new NameScope + newScope += attributes + + stack.pop() + stack.push(newScope) + } + + /** + * Execute `body` in a context of a fresh scope. 
It's used during the [[Project]] or the + * [[Aggregate]] resolution to avoid calling [[push]] and [[pop]] explicitly. + */ + def withNewScope[R](body: => R): R = { + push() + try { + body + } finally { + pop() + } + } + + /** + * Push a new scope to the stack. Introduced by the [[Project]] or the [[Aggregate]]. + */ + private def push(): Unit = { + stack.push(new NameScope) + } + + /** + * Pop a scope from the stack. Called when the resolution process for the pushed scope is done. + */ + private def pop(): Unit = { + stack.pop() + } +} + +/** + * [[PlanOutput]] represents a sequence of attributes from a plan ([[NamedRelation]], [[Project]], + * [[Aggregate]], etc). + * + * It is created from `attributes`, which is an output of a named plan, optional plan `name` and a + * resolver provided by the [[NameScopeStack]]. + * + * @param attributes Plan output. Can contain duplicate names. + * @param name Plan name. Non-empty for named plans like [[NamedRelation]] or [[SubqueryAlias]], + * `None` otherwise. + */ +class PlanOutput( + val attributes: Seq[Attribute], + val name: Option[String], + val nameComparator: NameComparator) { + + /** + * attributesForResolution is an [[AttributeSeq]] that is used for resolution of + * multipart attribute names. It's created from the `attributes` when [[NameScope]] is updated. + */ + private val attributesForResolution: AttributeSeq = + AttributeSeq.fromNormalOutput(attributes) + + /** + * Find attributes by the multipart name. + * + * See [[NameScope.matchMultipartName]] for more details. + * + * @param multipartName Multipart attribute name. + * @return Matched attributes or [[Seq.empty]] otherwise. 
+ */ + def matchMultipartName(multipartName: Seq[String]): NameTarget = { + val (candidates, nestedFields) = + attributesForResolution.getCandidatesForResolution(multipartName, nameComparator) + val resolvedCandidates = attributesForResolution.resolveCandidates( + multipartName, + nameComparator, + candidates, + nestedFields + ) + resolvedCandidates match { + case Seq(Alias(child, aliasName)) => + NameTarget(Seq(child), Some(aliasName)) + case other => + NameTarget(other, None) + } + } + + /** + * Method to expand an unresolved star. See [[NameScope.expandStar]] for more details. + * + * @param unresolvedStar Star to resolve. + * @return Attributes expanded from the star. + */ + def expandStar(unresolvedStar: UnresolvedStar): Seq[NamedExpression] = { + unresolvedStar.expandStar( + childOperatorOutput = attributes, + childOperatorMetadataOutput = Seq.empty, + resolve = + (nameParts, nameComparator) => attributesForResolution.resolve(nameParts, nameComparator), + suggestedAttributes = attributes, + resolver = nameComparator, + cleanupNestedAliasesDuringStructExpansion = true + ) + } +} diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/resolver/NameTarget.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/resolver/NameTarget.scala new file mode 100644 index 0000000000000..3b31c9b1a9110 --- /dev/null +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/resolver/NameTarget.scala @@ -0,0 +1,81 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.catalyst.analysis.resolver + +import org.apache.spark.sql.catalyst.analysis.UnresolvedAttribute +import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference, Expression} +import org.apache.spark.sql.catalyst.util.StringUtils.orderSuggestedIdentifiersBySimilarity +import org.apache.spark.sql.errors.QueryCompilationErrors + +/** + * Class that represents results of name resolution or star expansion. It encapsulates: + * - `candidates` - A list of candidates that are possible matches for a given name. + * - `aliasName` - If the candidates size is 1 and its type is `ExtractValue` (which means that + * it's a recursive type), then the `aliasName` should be the name with which the candidate is + * aliased. Otherwise, `aliasName` should be `None`. + * - `allAttributes` - A list of all attributes which is used to generate suggestions for + * unresolved column error. + * + * Example: + * + * - Attribute resolution: + * {{{ SELECT col1 FROM VALUES (1); }}} will have a [[NameTarget]] with a single candidate `col1`. + * `aliasName` would be `None` in this case because the column is not of recursive type. + * + * - Recursive attribute resolution: + * {{{ SELECT col1.col1 FROM VALUES(STRUCT(1,2), 3) }}} will have a [[NameTarget]] with a + * single candidate `col1` and an `aliasName` of `Some("col1")`. 
+ */ +case class NameTarget( + candidates: Seq[Expression], + aliasName: Option[String] = None, + allAttributes: Seq[Attribute] = Seq.empty) { + + /** + * Picks a candidate from the list of candidates based on the given unresolved attribute. + * Its behavior is as follows (based on the number of candidates): + * + * - If there is only one candidate, it will be returned. + * + * - If there are multiple candidates, an ambiguous reference error will be thrown. + * + * - If there are no candidates, an unresolved column error will be thrown. + */ + def pickCandidate(unresolvedAttribute: UnresolvedAttribute): Expression = { + candidates match { + case Seq() => + throwUnresolvedColumnError(unresolvedAttribute) + case Seq(candidate) => + candidate + case _ => + throw QueryCompilationErrors.ambiguousReferenceError( + unresolvedAttribute.name, + candidates.collect { case attribute: AttributeReference => attribute } + ) + } + } + + private def throwUnresolvedColumnError(unresolvedAttribute: UnresolvedAttribute): Nothing = + throw QueryCompilationErrors.unresolvedColumnError( + unresolvedAttribute.name, + proposal = orderSuggestedIdentifiersBySimilarity( + unresolvedAttribute.name, + candidates = allAttributes.map(attribute => attribute.qualifier :+ attribute.name) + ) + ) +} diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/resolver/PlanLogger.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/resolver/PlanLogger.scala new file mode 100644 index 0000000000000..f778915008dbb --- /dev/null +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/resolver/PlanLogger.scala @@ -0,0 +1,98 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.catalyst.analysis.resolver + +import org.apache.spark.internal.{Logging, MDC, MessageWithContext} +import org.apache.spark.internal.LogKeys.{MESSAGE, QUERY_PLAN} +import org.apache.spark.sql.catalyst.expressions.Expression +import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan +import org.apache.spark.sql.catalyst.util.sideBySide +import org.apache.spark.sql.internal.SQLConf + +/** + * [[PlanLogger]] is used by the [[Resolver]] to log intermediate resolution results. 
 */
class PlanLogger extends Logging {
  // Log levels are read once from the active SQL conf at construction time.
  private val planChangeLogLevel = SQLConf.get.planChangeLogLevel
  private val expressionTreeChangeLogLevel = SQLConf.get.expressionTreeChangeLogLevel

  /** Logs a single plan-resolution `event` together with the current state of `plan`. */
  def logPlanResolutionEvent(plan: LogicalPlan, event: String): Unit = {
    log(() => log"""
       |=== Plan resolution: ${MDC(MESSAGE, event)} ===
       |${MDC(QUERY_PLAN, plan.treeString)}
       """.stripMargin, planChangeLogLevel)
  }

  /** Logs a side-by-side diff of the `unresolvedPlan` and the `resolvedPlan` tree strings. */
  def logPlanResolution(unresolvedPlan: LogicalPlan, resolvedPlan: LogicalPlan): Unit = {
    log(
      () =>
        log"""
           |=== Unresolved plan -> Resolved plan ===
           |${MDC(
             QUERY_PLAN,
             sideBySide(
               unresolvedPlan.treeString,
               resolvedPlan.treeString
             ).mkString("\n")
           )}
           """.stripMargin,
      planChangeLogLevel
    )
  }

  /** Logs a single expression-resolution `event` together with `expressionTree`. */
  def logExpressionTreeResolutionEvent(expressionTree: Expression, event: String): Unit = {
    log(
      () => log"""
        |=== Expression tree resolution: ${MDC(MESSAGE, event)} ===
        |${MDC(QUERY_PLAN, expressionTree.treeString)}
        """.stripMargin,
      expressionTreeChangeLogLevel
    )
  }

  /**
   * Logs a side-by-side diff of the unresolved and resolved expression trees.
   *
   * NOTE(review): the unresolved root is re-parented onto the resolved children before diffing,
   * presumably so the diff highlights only the root's own change — confirm intent.
   */
  def logExpressionTreeResolution(
      unresolvedExpressionTree: Expression,
      resolvedExpressionTree: Expression): Unit = {
    log(
      () =>
        log"""
           |=== Unresolved expression tree -> Resolved expression tree ===
           |${MDC(
             QUERY_PLAN,
             sideBySide(
               unresolvedExpressionTree
                 .withNewChildren(resolvedExpressionTree.children)
                 .treeString,
               resolvedExpressionTree.treeString
             ).mkString("\n")
           )}
           """.stripMargin,
      expressionTreeChangeLogLevel
    )
  }

  // Dispatches the message to the logger method matching `logLevel`. The message is built lazily
  // through the thunk. WARN/ERROR/INFO forward the full [[MessageWithContext]]; TRACE/DEBUG log
  // only the rendered message text; any unrecognized level falls back to TRACE.
  private def log(createMessage: () => MessageWithContext, logLevel: String): Unit =
    logLevel match {
      case "TRACE" => logTrace(createMessage().message)
      case "DEBUG" => logDebug(createMessage().message)
      case "INFO" => logInfo(createMessage())
      case "WARN" => logWarning(createMessage())
      case "ERROR" => logError(createMessage())
      case _ => logTrace(createMessage().message)
    }
}
diff --git
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/resolver/PredicateResolver.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/resolver/PredicateResolver.scala new file mode 100644 index 0000000000000..d94559496d04e --- /dev/null +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/resolver/PredicateResolver.scala @@ -0,0 +1,112 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.catalyst.analysis.resolver + +import org.apache.spark.sql.catalyst.analysis.{ + AnsiStringPromotionTypeCoercion, + AnsiTypeCoercion, + ApplyCharTypePaddingHelper, + BooleanEqualityTypeCoercion, + CollationTypeCoercion, + DecimalPrecisionTypeCoercion, + DivisionTypeCoercion, + IntegralDivisionTypeCoercion, + StringPromotionTypeCoercion, + TypeCoercion +} +import org.apache.spark.sql.catalyst.expressions.{Expression, Predicate, StringRPad} +import org.apache.spark.sql.internal.SQLConf + +/** + * Resolver class for resolving all [[Predicate]] expressions. Recursively resolves all children + * and applies selected type coercions to the expression. 
 */
class PredicateResolver(
    expressionResolver: ExpressionResolver,
    timezoneAwareExpressionResolver: TimezoneAwareExpressionResolver)
  extends TreeNodeResolver[Predicate, Expression]
  with ResolvesExpressionChildren {

  // The coercion chain is chosen once per resolver instance based on the ANSI mode flag.
  private val typeCoercionRules = if (conf.ansiEnabled) {
    PredicateResolver.ANSI_TYPE_COERCION_RULES
  } else {
    PredicateResolver.TYPE_COERCION_RULES
  }
  private val typeCoercionResolver =
    new TypeCoercionResolver(timezoneAwareExpressionResolver, typeCoercionRules)

  /**
   * Resolves `unresolvedPredicate` bottom-up:
   *  1. Resolve all children via the generic expression resolver.
   *  2. Apply the configured (ANSI or legacy) type-coercion chain.
   *  3. Apply single-node CHAR-type padding for string comparisons.
   * Throws [[ExplicitlyUnsupportedResolverFeature]] if padding introduced a [[StringRPad]] child,
   * because aliasing of padded predicates differs from the fixed-point Analyzer (see the comment
   * below).
   */
  override def resolve(unresolvedPredicate: Predicate): Expression = {
    val predicateWithResolvedChildren =
      withResolvedChildren(unresolvedPredicate, expressionResolver.resolve)
    val predicateWithTypeCoercion = typeCoercionResolver.resolve(predicateWithResolvedChildren)
    val predicateWithCharTypePadding = {
      ApplyCharTypePaddingHelper.singleNodePaddingForStringComparison(
        predicateWithTypeCoercion,
        !conf.getConf(SQLConf.LEGACY_NO_CHAR_PADDING_IN_PREDICATE)
      )
    }
    predicateWithCharTypePadding.children.collectFirst {
      case rpad: StringRPad => rpad
    } match {
      // In the fixed-point Analyzer [[ApplyCharTypePadding]] is called after [[ResolveAliases]]
      // and therefore padding doesn't affect the alias. In single-pass resolver we need to call
      // this code before we resolve the alias, which will cause the alias to include the pad in
      // its name:
      //
      // fixed-point:
      //   expression: rpad('12', 3, ' ') = '12 '
      //   alias: '12' = '12 '
      //
      // single-pass:
      //   expression: rpad('12', 3, ' ') = '12 '
      //   alias: rpad('12', 3, ' ') = '12 '
      //
      // Disabling this case until the aliasing is fixed.
      case Some(_) => throw new ExplicitlyUnsupportedResolverFeature("CharTypePaddingAliasing")

      case _ => predicateWithCharTypePadding
    }
  }
}

object PredicateResolver {
  // Ordering in the list of type coercions should be in sync with the list in [[TypeCoercion]].
  private val TYPE_COERCION_RULES: Seq[Expression => Expression] = Seq(
    CollationTypeCoercion.apply,
    TypeCoercion.InTypeCoercion.apply,
    StringPromotionTypeCoercion.apply,
    DecimalPrecisionTypeCoercion.apply,
    BooleanEqualityTypeCoercion.apply,
    DivisionTypeCoercion.apply,
    IntegralDivisionTypeCoercion.apply,
    TypeCoercion.ImplicitTypeCoercion.apply,
    TypeCoercion.DateTimeOperationsTypeCoercion.apply
  )

  // Ordering in the list of type coercions should be in sync with the list in
  // [[AnsiTypeCoercion]].
  private val ANSI_TYPE_COERCION_RULES: Seq[Expression => Expression] = Seq(
    CollationTypeCoercion.apply,
    AnsiTypeCoercion.InTypeCoercion.apply,
    AnsiStringPromotionTypeCoercion.apply,
    DecimalPrecisionTypeCoercion.apply,
    DivisionTypeCoercion.apply,
    IntegralDivisionTypeCoercion.apply,
    AnsiTypeCoercion.ImplicitTypeCoercion.apply,
    AnsiTypeCoercion.AnsiDateTimeOperationsTypeCoercion.apply
  )
}

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.
You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.spark.sql.catalyst.analysis.resolver

import org.apache.spark.sql.catalyst.expressions.Expression

/**
 * A mixin trait for expression resolvers that replace a single node with a whole subtree of nodes
 * during resolution. The underlying legacy code being invoked produces partially-unresolved
 * subtrees, so a callback resolver is applied recursively to finish them. To keep the single-pass
 * invariant (no node resolved twice), the limits of that recursive traversal are marked with the
 * [[ExpressionResolver.SINGLE_PASS_SUBTREE_BOUNDARY]] tag on the original expression's children,
 * which are guaranteed to already be resolved. A callback resolver that encounters a tagged node
 * must return it unchanged instead of resolving it again.
 */
trait ProducesUnresolvedSubtree extends ResolvesExpressionChildren {

  /**
   * Resolves a subtree generated while resolving `expression`. First marks each of `expression`'s
   * (already resolved) children as traversal boundaries, then evaluates `body` to produce the new
   * subtree, and finally runs `expressionResolver` over that subtree's children. The root of the
   * produced subtree is deliberately NOT resolved here.
   */
  protected def withResolvedSubtree(
      expression: Expression,
      expressionResolver: Expression => Expression)(body: => Expression): Expression = {
    // Tag the boundary before generating the subtree so the recursive resolution stops at the
    // previously resolved children.
    for (child <- expression.children) {
      child.setTagValue(ExpressionResolver.SINGLE_PASS_SUBTREE_BOUNDARY, ())
    }

    val generatedRoot = body

    withResolvedChildren(generatedRoot, expressionResolver)
  }
}

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.spark.sql.catalyst.analysis.resolver

import org.apache.spark.sql.util.CaseInsensitiveStringMap

/**
 * The [[RelationId]] is a unique identifier for a relation. It is used to lookup the relations
 * which were processed by the [[MetadataResolver]] to substitute the unresolved relations in single
 * pass during the analysis phase.
 */
case class RelationId(
    // Fully expanded identifier: catalog name, namespace parts, then the relation name.
    multipartIdentifier: Seq[String],
    // Read options take part in identity (case-class equality), so the same table read with
    // different options maps to a distinct [[RelationId]].
    options: CaseInsensitiveStringMap = CaseInsensitiveStringMap.empty,
    // Distinguishes streaming reads from batch reads of the same identifier.
    isStreaming: Boolean = false
)

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.spark.sql.catalyst.analysis.resolver

import java.util.HashMap

import org.apache.spark.sql.catalyst.analysis.{withPosition, RelationResolution, UnresolvedRelation}
import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
import org.apache.spark.sql.connector.catalog.LookupCatalog
import org.apache.spark.util.ArrayImplicits._

/**
 * [[RelationMetadataProvider]] provides relations with resolved metadata based on the
It is used by [[Resolver]] to replace + * [[UnresolvedRelation]] with a specific [[LogicalPlan]] with resolved metadata, e.g. with + * [[UnresolvedCatalogRelation]] or [[View]]. + */ +trait RelationMetadataProvider extends LookupCatalog { + type RelationsWithResolvedMetadata = HashMap[RelationId, LogicalPlan] + + /** + * [[relationResolution]] is used by the [[RelationMetadataProvider]] to expand relation + * identifiers in [[relationIdFromUnresolvedRelation]]. + */ + protected val relationResolution: RelationResolution + + /** + * [[relationsWithResolvedMetadata]] is a map from relation ID to the specific [[LogicalPlan]] + * with resolved metadata, like [[UnresolvedCatalogRelation]] or [[View]]. It's filled by the + * specific [[RelationMetadataProvider]] implementation and is queried in + * [[getRelationWithResolvedMetadata]]. + */ + protected val relationsWithResolvedMetadata: RelationsWithResolvedMetadata + + /** + * Get the [[LogicalPlan]] with resolved metadata for the given [[UnresolvedRelation]]. + * + * [[java.util.HashMap]] returns `null` if the key is not found, so we wrap it in an [[Option]]. + */ + def getRelationWithResolvedMetadata( + unresolvedRelation: UnresolvedRelation): Option[LogicalPlan] = { + Option( + relationsWithResolvedMetadata.get( + relationIdFromUnresolvedRelation(unresolvedRelation) + ) + ) + } + + /** + * Returns the [[RelationId]] for the given [[UnresolvedRelation]]. Here we use + * [[relationResolution]] to expand the [[UnresolvedRelation]] identifier fully, so that our + * [[RelationId]] uniquely identifies the [[unresolvedRelation]]. + * + * This method is public, because it's used in [[MetadataResolverSuite]]. 
+ */ + def relationIdFromUnresolvedRelation(unresolvedRelation: UnresolvedRelation): RelationId = { + relationResolution.expandIdentifier(unresolvedRelation.multipartIdentifier) match { + case CatalogAndIdentifier(catalog, ident) => + RelationId( + multipartIdentifier = + Seq(catalog.name()) ++ ident.namespace().toImmutableArraySeq ++ Seq(ident.name()), + options = unresolvedRelation.options, + isStreaming = unresolvedRelation.isStreaming + ) + case _ => + withPosition(unresolvedRelation) { + unresolvedRelation.tableNotFound(unresolvedRelation.multipartIdentifier) + } + } + } +} diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/resolver/ResolutionValidator.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/resolver/ResolutionValidator.scala new file mode 100644 index 0000000000000..6c4de2e6e58d7 --- /dev/null +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/resolver/ResolutionValidator.scala @@ -0,0 +1,153 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
 */

package org.apache.spark.sql.catalyst.analysis.resolver

import org.apache.spark.sql.catalyst.analysis.{MultiInstanceRelation, ResolvedInlineTable}
import org.apache.spark.sql.catalyst.expressions.{AttributeReference}
import org.apache.spark.sql.catalyst.plans.logical.{
  Filter,
  GlobalLimit,
  LocalLimit,
  LocalRelation,
  LogicalPlan,
  OneRowRelation,
  Project,
  SubqueryAlias
}
import org.apache.spark.sql.errors.QueryCompilationErrors
import org.apache.spark.sql.types.BooleanType

/**
 * The [[ResolutionValidator]] performs the validation work after the logical plan tree is
 * resolved by the [[Resolver]]. Each `resolve*` method in the [[Resolver]] must
 * have its `validate*` counterpart in the [[ResolutionValidator]]. The validation code asserts the
 * conditions that must never be false no matter which SQL query or DataFrame program was provided.
 * The validation approach is single-pass, post-order, complementary to the resolution process.
 */
class ResolutionValidator {
  private val expressionResolutionValidator = new ExpressionResolutionValidator(this)

  // NOTE(review): declared `var`, but never reassigned in this class — presumably swapped by a
  // collaborating validator or tests; confirm, otherwise this could be a `val`.
  private[resolver] var attributeScopeStack = new AttributeScopeStack

  /**
   * Validate the resolved logical `plan` - assert invariants that should never be false no
   * matter which SQL query or DataFrame program was provided. New operators must be added here as
   * soon as [[Resolver]] supports them. We check this by throwing an exception for
   * unknown operators.
   */
  def validatePlan(plan: LogicalPlan): Unit = wrapErrors(plan) {
    validate(plan)
  }

  // Intentionally non-exhaustive: an operator type missing from this match throws a MatchError,
  // which surfaces (wrapped by `wrapErrors`) as "resolver does not support this operator yet".
  private def validate(operator: LogicalPlan): Unit = {
    operator match {
      case project: Project =>
        validateProject(project)
      case filter: Filter =>
        validateFilter(filter)
      case subqueryAlias: SubqueryAlias =>
        validateSubqueryAlias(subqueryAlias)
      case globalLimit: GlobalLimit =>
        validateGlobalLimit(globalLimit)
      case localLimit: LocalLimit =>
        validateLocalLimit(localLimit)
      case inlineTable: ResolvedInlineTable =>
        validateInlineTable(inlineTable)
      case localRelation: LocalRelation =>
        validateRelation(localRelation)
      case oneRowRelation: OneRowRelation =>
        validateRelation(oneRowRelation)
      // [[LogicalRelation]], [[HiveTableRelation]] and other specific relations can't be imported
      // because of a potential circular dependency, so we match a generic Catalyst
      // [[MultiInstanceRelation]] instead.
      case multiInstanceRelation: MultiInstanceRelation =>
        validateRelation(multiInstanceRelation)
    }
  }

  // The child and the project list are validated inside a fresh scope; afterwards the project's
  // own output overwrites the top scope for the parent operator.
  private def validateProject(project: Project): Unit = {
    attributeScopeStack.withNewScope {
      validate(project.child)
      expressionResolutionValidator.validateProjectList(project.projectList)
    }

    handleOperatorOutput(project)
  }

  // Child is validated first (post-order), then the condition must have a BooleanType output.
  private def validateFilter(filter: Filter): Unit = {
    validate(filter.child)

    assert(
      filter.condition.dataType == BooleanType,
      s"Output type of a filter must be a boolean, but got: ${filter.condition.dataType.typeName}"
    )
    expressionResolutionValidator.validate(filter.condition)
  }

  private def validateSubqueryAlias(subqueryAlias: SubqueryAlias): Unit = {
    validate(subqueryAlias.child)

    handleOperatorOutput(subqueryAlias)
  }

  private def validateGlobalLimit(globalLimit: GlobalLimit): Unit = {
    validate(globalLimit.child)
    expressionResolutionValidator.validate(globalLimit.limitExpr)
  }

  private def validateLocalLimit(localLimit: LocalLimit): Unit = {
    validate(localLimit.child)

    expressionResolutionValidator.validate(localLimit.limitExpr)
  }

  // Every expression in every row of the inline table must be valid.
  private def validateInlineTable(inlineTable: ResolvedInlineTable): Unit = {
    inlineTable.rows.foreach(row => {
      row.foreach(expression => {
        expressionResolutionValidator.validate(expression)
      })
    })

    handleOperatorOutput(inlineTable)
  }

  // Leaf relations carry no children to validate; only their output is checked.
  private def validateRelation(relation: LogicalPlan): Unit = {
    handleOperatorOutput(relation)
  }

  // Replaces the top attribute scope with this operator's output and asserts that every output
  // attribute is a plain [[AttributeReference]].
  private def handleOperatorOutput(operator: LogicalPlan): Unit = {
    attributeScopeStack.overwriteTop(operator.output)

    operator.output.foreach(attribute => {
      assert(
        attribute.isInstanceOf[AttributeReference],
        s"Output of an operator must be a reference to an attribute, but got: " +
        s"${attribute.getClass.getSimpleName}"
      )
      expressionResolutionValidator.validate(attribute)
    })
  }

  // NOTE(review): catches Throwable, so fatal JVM errors (e.g. OutOfMemoryError) are also wrapped
  // into an analysis error — confirm this is intended; scala.util.control.NonFatal is the usual
  // filter.
  private def wrapErrors[R](plan: LogicalPlan)(body: => R): Unit = {
    try {
      body
    } catch {
      case ex: Throwable =>
        throw QueryCompilationErrors.resolutionValidationError(ex, plan)
    }
  }
}

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.catalyst.analysis.resolver + +import org.apache.spark.sql.AnalysisException +import org.apache.spark.sql.catalyst.EvaluateUnresolvedInlineTable +import org.apache.spark.sql.catalyst.analysis.{ + withPosition, + FunctionResolution, + NamedRelation, + RelationResolution, + ResolvedInlineTable, + UnresolvedInlineTable, + UnresolvedRelation +} +import org.apache.spark.sql.catalyst.plans.logical.{ + Filter, + GlobalLimit, + LocalLimit, + LocalRelation, + LogicalPlan, + OneRowRelation, + Project, + SubqueryAlias +} +import org.apache.spark.sql.connector.catalog.CatalogManager +import org.apache.spark.sql.errors.{QueryCompilationErrors, QueryErrorsBase} +import org.apache.spark.sql.types.BooleanType + +/** + * The Resolver implements a single-pass bottom-up analysis algorithm in the Catalyst. + * + * The functions here generally traverse the [[LogicalPlan]] nodes recursively, + * constructing and returning the resolved [[LogicalPlan]] nodes bottom-up. + * This is the primary entry point for implementing SQL and DataFrame plan analysis, + * wherein the [[resolve]] method accepts a fully unresolved [[LogicalPlan]] and returns + * a fully resolved [[LogicalPlan]] in response with all data types and attribute + * reference ID assigned for valid requests. This resolver also takes responsibility + * to detect any errors in the initial SQL query or DataFrame and return appropriate + * error messages including precise parse locations wherever possible. 
+ * + * The Resolver is a one-shot object per each SQL/DataFrame logical plan, the calling code must + * re-create it for every new analysis run. + * + * @param catalogManager [[CatalogManager]] for relation and identifier resolution. + * @param extensions A list of [[ResolverExtension]] that can resolve external operators. + */ +class Resolver( + catalogManager: CatalogManager, + override val extensions: Seq[ResolverExtension] = Seq.empty, + metadataResolverExtensions: Seq[ResolverExtension] = Seq.empty) + extends TreeNodeResolver[LogicalPlan, LogicalPlan] + with QueryErrorsBase + with ResolvesOperatorChildren + with TracksResolvedNodes[LogicalPlan] + with DelegatesResolutionToExtensions { + private val scopes = new NameScopeStack + private val planLogger = new PlanLogger + private val relationResolution = Resolver.createRelationResolution(catalogManager) + private val functionResolution = new FunctionResolution(catalogManager, relationResolution) + private val expressionResolver = + new ExpressionResolver(this, scopes, functionResolution, planLogger) + private val limitExpressionResolver = new LimitExpressionResolver(expressionResolver) + + /** + * [[relationMetadataProvider]] is used to resolve metadata for relations. It's initialized with + * the default implementation [[MetadataResolver]] here and is called in + * [[lookupMetadataAndResolve]] on the unresolved logical plan to visit it (both operators and + * expressions) to resolve the metadata and populate its internal state. It's later queried by + * [[resolveRelation]] to get the plan with resolved metadata (for example, a [[View]] or an + * [[UnresolvedCatalogRelation]]) based on the [[UnresolvedRelation]]. + * + * If the [[AnalyzerBridgeState]] is provided, we reset this provider to the + * [[BridgedRelationMetadataProvider]] and later stick to it forever without resorting to the + * actual blocking metadata resolution. 
+ */ + private var relationMetadataProvider: RelationMetadataProvider = new MetadataResolver( + catalogManager, + relationResolution, + metadataResolverExtensions + ) + + /** + * This method is an analysis entry point. It resolves the metadata and invokes [[resolve]], + * which does most of the analysis work. + */ + def lookupMetadataAndResolve( + unresolvedPlan: LogicalPlan, + analyzerBridgeState: Option[AnalyzerBridgeState] = None): LogicalPlan = { + planLogger.logPlanResolutionEvent(unresolvedPlan, "Lookup metadata and resolve") + + relationMetadataProvider = analyzerBridgeState match { + case Some(analyzerBridgeState) => + new BridgedRelationMetadataProvider( + catalogManager, + relationResolution, + analyzerBridgeState + ) + case None => + relationMetadataProvider + } + + relationMetadataProvider match { + case metadataResolver: MetadataResolver => + metadataResolver.resolve(unresolvedPlan) + case _ => + } + + resolve(unresolvedPlan) + } + + /** + * This method takes an unresolved [[LogicalPlan]] and chooses the right `resolve*` method using + * pattern matching on the `unresolvedPlan` type. This pattern matching enumerates all the + * operator node types that are supported by the single-pass analysis. + * + * When developers introduce a new unresolved node type to the Catalyst, they should implement + * a corresponding `resolve*` method in the [[Resolver]] and add it to this pattern match + * list. + * + * [[resolve]] will be called recursively during the unresolved plan traversal eventually + * producing a fully resolved plan or a descriptive error message. 
+ */ + override def resolve(unresolvedPlan: LogicalPlan): LogicalPlan = { + planLogger.logPlanResolutionEvent(unresolvedPlan, "Unresolved plan") + + throwIfNodeWasResolvedEarlier(unresolvedPlan) + + val resolvedPlan = + unresolvedPlan match { + case unresolvedProject: Project => + resolveProject(unresolvedProject) + case unresolvedFilter: Filter => + resolveFilter(unresolvedFilter) + case unresolvedSubqueryAlias: SubqueryAlias => + resolveSubqueryAlias(unresolvedSubqueryAlias) + case unresolvedGlobalLimit: GlobalLimit => + resolveGlobalLimit(unresolvedGlobalLimit) + case unresolvedLocalLimit: LocalLimit => + resolveLocalLimit(unresolvedLocalLimit) + case unresolvedRelation: UnresolvedRelation => + resolveRelation(unresolvedRelation) + case unresolvedInlineTable: UnresolvedInlineTable => + resolveInlineTable(unresolvedInlineTable) + // See the reason why we have to match both [[LocalRelation]] and [[ResolvedInlineTable]] + // in the [[resolveInlineTable]] scaladoc + case resolvedInlineTable: ResolvedInlineTable => + updateNameScopeWithPlanOutput(resolvedInlineTable) + case localRelation: LocalRelation => + updateNameScopeWithPlanOutput(localRelation) + case unresolvedOneRowRelation: OneRowRelation => + updateNameScopeWithPlanOutput(unresolvedOneRowRelation) + case _ => + tryDelegateResolutionToExtension(unresolvedPlan).getOrElse { + handleUnmatchedOperator(unresolvedPlan) + } + } + + markNodeAsResolved(resolvedPlan) + + planLogger.logPlanResolution(unresolvedPlan, resolvedPlan) + + resolvedPlan + } + + /** + * [[Project]] introduces a new scope to resolve its subtree and project list expressions. After + * those are resolved in the child scope we overwrite current scope with resolved [[Project]]'s + * output to expose new names to the parent operators. 
+ */ + private def resolveProject(unresolvedProject: Project): LogicalPlan = { + val resolvedProject = scopes.withNewScope { + val resolvedChild = resolve(unresolvedProject.child) + val resolvedProjectList = + expressionResolver.resolveProjectList(unresolvedProject.projectList) + Project(resolvedProjectList, resolvedChild) + } + + withPosition(unresolvedProject) { + scopes.overwriteTop(resolvedProject.output) + } + + resolvedProject + } + + /** + * [[Filter]] has a single child and a single condition and we resolve them in this respective + * order. + */ + private def resolveFilter(unresolvedFilter: Filter): LogicalPlan = { + val resolvedChild = resolve(unresolvedFilter.child) + val resolvedCondition = expressionResolver.resolve(unresolvedFilter.condition) + + val resolvedFilter = Filter(resolvedCondition, resolvedChild) + if (resolvedFilter.condition.dataType != BooleanType) { + withPosition(unresolvedFilter) { + throwDatatypeMismatchFilterNotBoolean(resolvedFilter) + } + } + + resolvedFilter + } + + /** + * [[SubqueryAlias]] has a single child and an identifier. We need to resolve the child and update + * the scope with the output, since upper expressions can reference [[SubqueryAlias]]es output by + * its identifier. + */ + private def resolveSubqueryAlias(unresolvedSubqueryAlias: SubqueryAlias): LogicalPlan = { + val resolvedSubqueryAlias = + SubqueryAlias(unresolvedSubqueryAlias.identifier, resolve(unresolvedSubqueryAlias.child)) + withPosition(unresolvedSubqueryAlias) { + scopes.overwriteTop(unresolvedSubqueryAlias.alias, resolvedSubqueryAlias.output) + } + resolvedSubqueryAlias + } + + /** + * Resolve [[GlobalLimit]]. We have to resolve its child and resolve and validate its limit + * expression. 
+ */ + private def resolveGlobalLimit(unresolvedGlobalLimit: GlobalLimit): LogicalPlan = { + val resolvedChild = resolve(unresolvedGlobalLimit.child) + + val resolvedLimitExpr = withPosition(unresolvedGlobalLimit) { + limitExpressionResolver.resolve(unresolvedGlobalLimit.limitExpr) + } + + GlobalLimit(resolvedLimitExpr, resolvedChild) + } + + /** + * Resolve [[LocalLimit]]. We have to resolve its child and resolve and validate its limit + * expression. + */ + private def resolveLocalLimit(unresolvedLocalLimit: LocalLimit): LogicalPlan = { + val resolvedChild = resolve(unresolvedLocalLimit.child) + + val resolvedLimitExpr = withPosition(unresolvedLocalLimit) { + limitExpressionResolver.resolve(unresolvedLocalLimit.limitExpr) + } + + LocalLimit(resolvedLimitExpr, resolvedChild) + } + + /** + * [[UnresolvedRelation]] was previously looked up by the [[MetadataResolver]] and now we need to: + * - Get the specific relation with metadata from `relationsWithResolvedMetadata`, like + * [[UnresolvedCatalogRelation]], or throw an error if it wasn't found + * - Resolve it further, usually using extensions, like [[DataSourceResolver]] + */ + private def resolveRelation(unresolvedRelation: UnresolvedRelation): LogicalPlan = { + relationMetadataProvider.getRelationWithResolvedMetadata(unresolvedRelation) match { + case Some(relationWithResolvedMetadata) => + planLogger.logPlanResolutionEvent( + relationWithResolvedMetadata, + "Relation metadata retrieved" + ) + + withPosition(unresolvedRelation) { + resolve(relationWithResolvedMetadata) + } + case None => + withPosition(unresolvedRelation) { + unresolvedRelation.tableNotFound(unresolvedRelation.multipartIdentifier) + } + } + } + + /** + * [[UnresolvedInlineTable]] resolution requires all the rows to be resolved first. After that we + * use [[EvaluateUnresolvedInlineTable]] and try to evaluate the row expressions if possible to + * get [[LocalRelation]] right away. 
Sometimes it's not possible because of expressions like + * `current_date()` which are evaluated in the optimizer (SPARK-46380). + * + * Note: By default if all the inline table expressions can be evaluated eagerly, the parser + * would produce a [[LocalRelation]] and the analysis would just skip this step and go straight + * to `resolveLocalRelation` (SPARK-48967, SPARK-49269). + */ + private def resolveInlineTable(unresolvedInlineTable: UnresolvedInlineTable): LogicalPlan = { + val withResolvedExpressions = UnresolvedInlineTable( + unresolvedInlineTable.names, + unresolvedInlineTable.rows.map(row => { + row.map(expressionResolver.resolve(_)) + }) + ) + + val resolvedRelation = EvaluateUnresolvedInlineTable + .evaluateUnresolvedInlineTable(withResolvedExpressions) + + withPosition(unresolvedInlineTable) { + resolve(resolvedRelation) + } + } + + /** + * To finish the operator resolution we add its output to the current scope. This is usually + * done for relations. [[NamedRelation]]'s output should be added to the scope under its name. + */ + private def updateNameScopeWithPlanOutput(relation: LogicalPlan): LogicalPlan = { + withPosition(relation) { + relation match { + case namedRelation: NamedRelation => + scopes.top.update(namedRelation.name, namedRelation.output) + case _ => + scopes.top += relation.output + } + } + relation + } + + override def tryDelegateResolutionToExtension( + unresolvedOperator: LogicalPlan): Option[LogicalPlan] = { + val resolutionResult = super.tryDelegateResolutionToExtension(unresolvedOperator) + resolutionResult.map { resolvedOperator => + updateNameScopeWithPlanOutput(resolvedOperator) + } + } + + /** + * Check if the unresolved operator is explicitly unsupported and throw + * [[ExplicitlyUnsupportedResolverFeature]] in that case. Otherwise, throw + * [[QueryCompilationErrors.unsupportedSinglePassAnalyzerFeature]]. 
+ */ + private def handleUnmatchedOperator(unresolvedOperator: LogicalPlan): Nothing = { + if (ExplicitlyUnsupportedResolverFeature.OPERATORS.contains( + unresolvedOperator.getClass.getName + )) { + throw new ExplicitlyUnsupportedResolverFeature( + s"unsupported operator: ${unresolvedOperator.getClass.getName}" + ) + } + throw QueryCompilationErrors + .unsupportedSinglePassAnalyzerFeature( + s"${unresolvedOperator.getClass} operator resolution" + ) + .withPosition(unresolvedOperator.origin) + } + + private def throwDatatypeMismatchFilterNotBoolean(filter: Filter): Nothing = + throw new AnalysisException( + errorClass = "DATATYPE_MISMATCH.FILTER_NOT_BOOLEAN", + messageParameters = Map( + "sqlExpr" -> filter.expressions.map(toSQLExpr).mkString(","), + "filter" -> toSQLExpr(filter.condition), + "type" -> toSQLType(filter.condition.dataType) + ) + ) +} + +object Resolver { + + /** + * Create a new instance of the [[RelationResolution]]. + */ + def createRelationResolution(catalogManager: CatalogManager): RelationResolution = { + new RelationResolution(catalogManager) + } +} diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/resolver/ResolverExtension.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/resolver/ResolverExtension.scala new file mode 100644 index 0000000000000..8bed881ec97a1 --- /dev/null +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/resolver/ResolverExtension.scala @@ -0,0 +1,43 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.catalyst.analysis.resolver + +import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan + +/** + * The [[ResolverExtension]] is a main interface for single-pass analysis extensions in Catalyst. + * External code that needs specific node types to be resolved has to implement this trait and + * inject the implementation into the [[Analyzer.singlePassResolverExtensions]]. + * + * Note that resolver extensions are responsible for creating attribute references with IDs that + * are unique from any other subplans. This should be straightforward in most cases because + * creating new attribute references will assign [[NamedExpression.newExprId]] by default. + */ +trait ResolverExtension { + + /** + * Resolve the operator if it's supported by this extension. This method is called by the + * single-pass [[Resolver]] on all the configured extensions when it exhausted its match list + * for the known node types. 
+ * + * Guarantees: + * - The implementation can rely on children being resolved + * - We commit to performing the partial function check only at most once per unresolved operator + */ + def resolveOperator: PartialFunction[LogicalPlan, LogicalPlan] +} diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/resolver/ResolverGuard.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/resolver/ResolverGuard.scala new file mode 100644 index 0000000000000..b3b3d4def602d --- /dev/null +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/resolver/ResolverGuard.scala @@ -0,0 +1,283 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.catalyst.analysis.resolver + +import org.apache.spark.sql.catalyst.SQLConfHelper +import org.apache.spark.sql.catalyst.analysis.{ + ResolvedInlineTable, + UnresolvedAlias, + UnresolvedAttribute, + UnresolvedFunction, + UnresolvedInlineTable, + UnresolvedRelation, + UnresolvedStar +} +import org.apache.spark.sql.catalyst.expressions.{ + Alias, + AttributeReference, + BinaryArithmetic, + Cast, + ConditionalExpression, + CreateNamedStruct, + Expression, + Literal, + Predicate, + SubqueryExpression +} +import org.apache.spark.sql.catalyst.plans.logical.{ + Filter, + GlobalLimit, + LocalLimit, + LocalRelation, + LogicalPlan, + OneRowRelation, + Project, + SubqueryAlias +} +import org.apache.spark.sql.connector.catalog.CatalogManager +import org.apache.spark.sql.internal.SQLConf.HiveCaseSensitiveInferenceMode + +/** + * [[ResolverGuard]] is a class that checks if the operator that is yet to be analyzed + * only consists of operators and expressions that are currently supported by the + * single-pass analyzer. + * + * This is a one-shot object and should not be reused after [[apply]] call. + */ +class ResolverGuard(catalogManager: CatalogManager) extends SQLConfHelper { + + /** + * Check the top level operator of the parsed operator. + */ + def apply(operator: LogicalPlan): Boolean = + checkConfValues() && checkVariables() && checkOperator(operator) + + /** + * Check if all the operators are supported. For implemented ones, recursively check + * their children. For unimplemented ones, return false. 
+ */ + private def checkOperator(operator: LogicalPlan): Boolean = operator match { + case project: Project => + checkProject(project) + case filter: Filter => + checkFilter(filter) + case subqueryAlias: SubqueryAlias => + checkSubqueryAlias(subqueryAlias) + case globalLimit: GlobalLimit => + checkGlobalLimit(globalLimit) + case localLimit: LocalLimit => + checkLocalLimit(localLimit) + case unresolvedRelation: UnresolvedRelation => + checkUnresolvedRelation(unresolvedRelation) + case unresolvedInlineTable: UnresolvedInlineTable => + checkUnresolvedInlineTable(unresolvedInlineTable) + case resolvedInlineTable: ResolvedInlineTable => + checkResolvedInlineTable(resolvedInlineTable) + case localRelation: LocalRelation => + checkLocalRelation(localRelation) + case oneRowRelation: OneRowRelation => + checkOneRowRelation(oneRowRelation) + case _ => + false + } + + /** + * Method used to check if expressions are supported by the new analyzer. + * For LeafNode types, we return true or false. For other ones, check their children. 
+ */ + private def checkExpression(expression: Expression): Boolean = { + expression match { + case alias: Alias => + checkAlias(alias) + case unresolvedBinaryArithmetic: BinaryArithmetic => + checkUnresolvedBinaryArithmetic(unresolvedBinaryArithmetic) + case unresolvedConditionalExpression: ConditionalExpression => + checkUnresolvedConditionalExpression(unresolvedConditionalExpression) + case unresolvedCast: Cast => + checkUnresolvedCast(unresolvedCast) + case unresolvedStar: UnresolvedStar => + checkUnresolvedStar(unresolvedStar) + case unresolvedAlias: UnresolvedAlias => + checkUnresolvedAlias(unresolvedAlias) + case unresolvedAttribute: UnresolvedAttribute => + checkUnresolvedAttribute(unresolvedAttribute) + case unresolvedPredicate: Predicate => + checkUnresolvedPredicate(unresolvedPredicate) + case literal: Literal => + checkLiteral(literal) + case attributeReference: AttributeReference => + checkAttributeReference(attributeReference) + case createNamedStruct: CreateNamedStruct => + checkCreateNamedStruct(createNamedStruct) + case unresolvedFunction: UnresolvedFunction => + checkUnresolvedFunction(unresolvedFunction) + case _ => + false + } + } + + private def checkProject(project: Project) = { + checkOperator(project.child) && project.projectList.forall(checkExpression) + } + + private def checkFilter(unresolvedFilter: Filter) = + checkOperator(unresolvedFilter.child) && checkExpression(unresolvedFilter.condition) + + private def checkSubqueryAlias(subqueryAlias: SubqueryAlias) = + subqueryAlias.identifier.qualifier.isEmpty && checkOperator(subqueryAlias.child) + + private def checkGlobalLimit(globalLimit: GlobalLimit) = + checkOperator(globalLimit.child) && checkExpression(globalLimit.limitExpr) + + private def checkLocalLimit(localLimit: LocalLimit) = + checkOperator(localLimit.child) && checkExpression(localLimit.limitExpr) + + private def checkUnresolvedInlineTable(unresolvedInlineTable: UnresolvedInlineTable) = + 
unresolvedInlineTable.rows.forall(_.forall(checkExpression)) + + private def checkUnresolvedRelation(unresolvedRelation: UnresolvedRelation) = true + + private def checkResolvedInlineTable(resolvedInlineTable: ResolvedInlineTable) = + resolvedInlineTable.rows.forall(_.forall(checkExpression)) + + // Usually we don't check outputs of operators in unresolved plans, but in this case + // [[LocalRelation]] is resolved in the parser. + private def checkLocalRelation(localRelation: LocalRelation) = + localRelation.output.forall(checkExpression) + + private def checkOneRowRelation(oneRowRelation: OneRowRelation) = true + + private def checkAlias(alias: Alias) = checkExpression(alias.child) + + private def checkUnresolvedBinaryArithmetic(unresolvedBinaryArithmetic: BinaryArithmetic) = + checkExpression(unresolvedBinaryArithmetic.left) && + checkExpression(unresolvedBinaryArithmetic.right) + + private def checkUnresolvedConditionalExpression( + unresolvedConditionalExpression: ConditionalExpression) = + unresolvedConditionalExpression.children.forall(checkExpression) + + private def checkUnresolvedCast(cast: Cast) = checkExpression(cast.child) + + private def checkUnresolvedStar(unresolvedStar: UnresolvedStar) = true + + private def checkUnresolvedAlias(unresolvedAlias: UnresolvedAlias) = + checkExpression(unresolvedAlias.child) + + private def checkUnresolvedAttribute(unresolvedAttribute: UnresolvedAttribute) = + !ResolverGuard.UNSUPPORTED_ATTRIBUTE_NAMES.contains(unresolvedAttribute.nameParts.head) + + private def checkUnresolvedPredicate(unresolvedPredicate: Predicate) = { + unresolvedPredicate match { + case _: SubqueryExpression => false + case other => + other.children.forall(checkExpression) + } + } + + private def checkAttributeReference(attributeReference: AttributeReference) = true + + private def checkCreateNamedStruct(createNamedStruct: CreateNamedStruct) = { + createNamedStruct.children.forall(checkExpression) + } + + private def 
checkUnresolvedFunction(unresolvedFunction: UnresolvedFunction) = + ResolverGuard.SUPPORTED_FUNCTION_NAMES.contains( + unresolvedFunction.nameParts.head + ) && unresolvedFunction.children.forall(checkExpression) + + private def checkLiteral(literal: Literal) = true + + private def checkConfValues() = + // Case sensitive analysis is not supported. + !conf.caseSensitiveAnalysis && + // Case-sensitive inference is not supported for Hive table schema. + conf.caseSensitiveInferenceMode == HiveCaseSensitiveInferenceMode.NEVER_INFER + + private def checkVariables() = catalogManager.tempVariableManager.isEmpty +} + +object ResolverGuard { + + private val UNSUPPORTED_ATTRIBUTE_NAMES = { + val map = new IdentifierMap[Unit]() + + /** + * Some SQL functions can be called without the braces and thus they are found in the + * parsed operator as UnresolvedAttributes. This list contains the names of those functions + * so we can reject them. Find more information in [[ColumnResolutionHelper.literalFunctions]]. + */ + map += ("current_date", ()) + map += ("current_timestamp", ()) + map += ("current_user", ()) + map += ("user", ()) + map += ("session_user", ()) + map += ("grouping__id", ()) + + /** + * Metadata column resolution is not supported for now + */ + map += ("_metadata", ()) + + map + } + + /** + * Most of the functions are not supported, but we allow some explicitly supported ones. 
+ */ + private val SUPPORTED_FUNCTION_NAMES = { + val map = new IdentifierMap[Unit]() + map += ("array", ()) + // map += ("array_agg", ()) - until aggregate expressions are supported + map += ("array_append", ()) + map += ("array_compact", ()) + map += ("array_contains", ()) + map += ("array_distinct", ()) + map += ("array_except", ()) + map += ("array_insert", ()) + map += ("array_intersect", ()) + map += ("array_join", ()) + map += ("array_max", ()) + map += ("array_min", ()) + map += ("array_position", ()) + map += ("array_prepend", ()) + map += ("array_remove", ()) + map += ("array_repeat", ()) + map += ("array_size", ()) + // map += ("array_sort", ()) - until lambda functions are supported + map += ("array_union", ()) + map += ("arrays_overlap", ()) + map += ("arrays_zip", ()) + map += ("coalesce", ()) + map += ("if", ()) + map += ("map", ()) + map += ("map_concat", ()) + map += ("map_contains_key", ()) + map += ("map_entries", ()) + // map += ("map_filter", ()) - until lambda functions are supported + map += ("map_from_arrays", ()) + map += ("map_from_entries", ()) + map += ("map_keys", ()) + map += ("map_values", ()) + // map += ("map_zip_with", ()) - until lambda functions are supported + map += ("named_struct", ()) + map += ("sort_array", ()) + map += ("str_to_map", ()) + map + } +} diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/resolver/ResolvesExpressionChildren.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/resolver/ResolvesExpressionChildren.scala new file mode 100644 index 0000000000000..c170941ce5348 --- /dev/null +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/resolver/ResolvesExpressionChildren.scala @@ -0,0 +1,33 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.catalyst.analysis.resolver + +import org.apache.spark.sql.catalyst.expressions.Expression + +trait ResolvesExpressionChildren { + + /** + * Resolves generic [[Expression]] children and returns its copy with children resolved. + */ + protected def withResolvedChildren[ExpressionType <: Expression]( + unresolvedExpression: ExpressionType, + resolveChild: Expression => Expression): ExpressionType = { + val newChildren = unresolvedExpression.children.map(resolveChild(_)) + unresolvedExpression.withNewChildren(newChildren).asInstanceOf[ExpressionType] + } +} diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/resolver/ResolvesOperatorChildren.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/resolver/ResolvesOperatorChildren.scala new file mode 100644 index 0000000000000..0f548c3c55858 --- /dev/null +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/resolver/ResolvesOperatorChildren.scala @@ -0,0 +1,36 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.catalyst.analysis.resolver + +import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan + +/** + * A mixin trait for all operator resolvers that need to resolve their children. + */ +trait ResolvesOperatorChildren { + + /** + * Resolves generic [[LogicalPlan]] children and returns its copy with children resolved. + */ + protected def withResolvedChildren[OperatorType <: LogicalPlan]( + unresolvedOperator: OperatorType, + resolve: LogicalPlan => LogicalPlan): OperatorType = { + val newChildren = unresolvedOperator.children.map(resolve(_)) + unresolvedOperator.withNewChildren(newChildren).asInstanceOf[OperatorType] + } +} diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/resolver/TimeAddResolver.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/resolver/TimeAddResolver.scala new file mode 100644 index 0000000000000..bf27f64598723 --- /dev/null +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/resolver/TimeAddResolver.scala @@ -0,0 +1,72 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.catalyst.analysis.resolver + +import org.apache.spark.sql.catalyst.analysis.{ + AnsiStringPromotionTypeCoercion, + AnsiTypeCoercion, + StringPromotionTypeCoercion, + TypeCoercion +} +import org.apache.spark.sql.catalyst.expressions.{Expression, TimeAdd} + +/** + * Helper resolver for [[TimeAdd]] which is produced by resolving [[BinaryArithmetic]] nodes. + */ +class TimeAddResolver( + expressionResolver: ExpressionResolver, + timezoneAwareExpressionResolver: TimezoneAwareExpressionResolver) + extends TreeNodeResolver[TimeAdd, Expression] + with ResolvesExpressionChildren { + + private val typeCoercionRules: Seq[Expression => Expression] = + if (conf.ansiEnabled) { + TimeAddResolver.ANSI_TYPE_COERCION_RULES + } else { + TimeAddResolver.TYPE_COERCION_RULES + } + private val typeCoercionResolver: TypeCoercionResolver = + new TypeCoercionResolver(timezoneAwareExpressionResolver, typeCoercionRules) + + override def resolve(unresolvedTimeAdd: TimeAdd): Expression = { + val timeAddWithResolvedChildren: TimeAdd = + withResolvedChildren(unresolvedTimeAdd, expressionResolver.resolve) + val timeAddWithTypeCoercion: Expression = typeCoercionResolver + .resolve(timeAddWithResolvedChildren) + timezoneAwareExpressionResolver.withResolvedTimezone( + timeAddWithTypeCoercion, + conf.sessionLocalTimeZone + ) + } +} + +object TimeAddResolver { + // Ordering in the list of type coercions should be in sync with the list in [[TypeCoercion]]. 
+ private val TYPE_COERCION_RULES: Seq[Expression => Expression] = Seq( + StringPromotionTypeCoercion.apply, + TypeCoercion.ImplicitTypeCoercion.apply, + TypeCoercion.DateTimeOperationsTypeCoercion.apply + ) + + // Ordering in the list of type coercions should be in sync with the list in [[AnsiTypeCoercion]]. + private val ANSI_TYPE_COERCION_RULES: Seq[Expression => Expression] = Seq( + AnsiStringPromotionTypeCoercion.apply, + AnsiTypeCoercion.ImplicitTypeCoercion.apply, + AnsiTypeCoercion.AnsiDateTimeOperationsTypeCoercion.apply + ) +} diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/resolver/TimezoneAwareExpressionResolver.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/resolver/TimezoneAwareExpressionResolver.scala new file mode 100644 index 0000000000000..a45e9e41cbfb1 --- /dev/null +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/resolver/TimezoneAwareExpressionResolver.scala @@ -0,0 +1,73 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.catalyst.analysis.resolver + +import org.apache.spark.sql.catalyst.expressions.{Expression, TimeZoneAwareExpression} + +/** + * Resolves [[TimeZoneAwareExpressions]] by applying the session's local timezone. + * + * This class is responsible for resolving [[TimeZoneAwareExpression]]s by first resolving their + * children and then applying the session's local timezone. Additionally, ensures that any tags from + * the original expression are preserved during the resolution process. + * + * @constructor Creates a new TimezoneAwareExpressionResolver with the given expression resolver. + * @param expressionResolver The [[ExpressionResolver]] used to resolve child expressions. + */ +class TimezoneAwareExpressionResolver(expressionResolver: TreeNodeResolver[Expression, Expression]) + extends TreeNodeResolver[TimeZoneAwareExpression, Expression] + with ResolvesExpressionChildren { + + /** + * Resolves a [[TimeZoneAwareExpression]] by resolving its children and applying a timezone. + * + * @param unresolvedTimezoneExpression The [[TimeZoneAwareExpression]] to resolve. + * @return A resolved [[Expression]] with the session's local timezone applied. + */ + override def resolve(unresolvedTimezoneExpression: TimeZoneAwareExpression): Expression = { + val expressionWithResolvedChildren = + withResolvedChildren(unresolvedTimezoneExpression, expressionResolver.resolve) + withResolvedTimezoneCopyTags(expressionWithResolvedChildren, conf.sessionLocalTimeZone) + } + + /** + * Applies a timezone to a [[TimeZoneAwareExpression]] while preserving original tags. + * + * This method is particularly useful for cases like resolving [[Cast]] expressions where tags + * such as [[USER_SPECIFIED_CAST]] need to be preserved. + * + * @param expression The [[TimeZoneAwareExpression]] to apply the timezone to. + * @param timeZoneId The timezone ID to apply. + * @return A new [[TimeZoneAwareExpression]] with the specified timezone and original tags. 
+ */ + def withResolvedTimezoneCopyTags(expression: Expression, timeZoneId: String): Expression = { + val withTimeZone = withResolvedTimezone(expression, timeZoneId) + withTimeZone.copyTagsFrom(expression) + withTimeZone + } + + /** + * Apply timezone to [[TimeZoneAwareExpression]] expressions. + */ + def withResolvedTimezone(expression: Expression, timeZoneId: String): Expression = + expression match { + case timezoneExpression: TimeZoneAwareExpression if timezoneExpression.timeZoneId.isEmpty => + timezoneExpression.withTimeZone(timeZoneId) + case other => other + } +} diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/resolver/TracksResolvedNodes.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/resolver/TracksResolvedNodes.scala new file mode 100644 index 0000000000000..dd86bf843b4ec --- /dev/null +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/resolver/TracksResolvedNodes.scala @@ -0,0 +1,51 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.catalyst.analysis.resolver + +import java.util.IdentityHashMap + +import org.apache.spark.SparkException +import org.apache.spark.sql.catalyst.SQLConfHelper +import org.apache.spark.sql.catalyst.trees.TreeNode +import org.apache.spark.sql.internal.SQLConf + +/** + * Trait for top-level resolvers that is used to keep track of resolved nodes and throw an error if + * a node is resolved more than once. This is only used in tests because of the memory overhead of + * using a set to track resolved nodes. + */ +trait TracksResolvedNodes[TreeNodeType <: TreeNode[TreeNodeType]] extends SQLConfHelper { + // Using Map because IdentityHashSet is not available in Scala + private val seenResolvedNodes = new IdentityHashMap[TreeNodeType, Unit] + + private val shouldTrackResolvedNodes = + conf.getConf(SQLConf.ANALYZER_SINGLE_PASS_TRACK_RESOLVED_NODES_ENABLED) + + protected def throwIfNodeWasResolvedEarlier(node: TreeNodeType): Unit = + if (shouldTrackResolvedNodes && seenResolvedNodes.containsKey(node)) { + throw SparkException.internalError( + s"Single-pass resolver attempted to resolve the same node more than once: $node" + ) + } + + protected def markNodeAsResolved(node: TreeNodeType): Unit = { + if (shouldTrackResolvedNodes) { + seenResolvedNodes.put(node, ()) + } + } +} diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/resolver/TreeNodeResolver.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/resolver/TreeNodeResolver.scala new file mode 100644 index 0000000000000..5991585995cad --- /dev/null +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/resolver/TreeNodeResolver.scala @@ -0,0 +1,30 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.catalyst.analysis.resolver + +import org.apache.spark.sql.catalyst.SQLConfHelper +import org.apache.spark.sql.catalyst.trees.TreeNode + +/** + * Base class for [[TreeNode]] resolvers. All resolvers should extend this class with + * specific [[UnresolvedNode]] and [[ResolvedNode]] types. + */ +trait TreeNodeResolver[UnresolvedNode <: TreeNode[_], ResolvedNode <: TreeNode[_]] + extends SQLConfHelper { + def resolve(unresolvedNode: UnresolvedNode): ResolvedNode +} diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/resolver/TypeCoercionResolver.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/resolver/TypeCoercionResolver.scala new file mode 100644 index 0000000000000..cf4c2ef0d7504 --- /dev/null +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/resolver/TypeCoercionResolver.scala @@ -0,0 +1,50 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.catalyst.analysis.resolver + +import org.apache.spark.sql.catalyst.expressions.{Cast, Expression} + +/** + * [[TypeCoercionResolver]] is used by other resolvers to uniformly apply type coercions to all + * expressions. [[TypeCoercionResolver]] takes in a sequence of type coercion transformations that + * should be applied to an expression and applies them in order. Finally, [[TypeCoercionResolver]] + * applies timezone to expression's children, as a child could be replaced with Cast(child, type), + * therefore [[Cast]] resolution is needed. Timezone is applied only on children that have been + * re-instantiated by [[TypeCoercionResolver]], because otherwise children have already been + * resolved. 
+ */ +class TypeCoercionResolver( + timezoneAwareExpressionResolver: TimezoneAwareExpressionResolver, + typeCoercionRules: Seq[Expression => Expression]) + extends TreeNodeResolver[Expression, Expression] { + + override def resolve(expression: Expression): Expression = { + val oldChildren = expression.children + + val withTypeCoercion = typeCoercionRules.foldLeft(expression) { + case (expr, rule) => rule.apply(expr) + } + + val newChildren = withTypeCoercion.children.zip(oldChildren).map { + case (newChild: Cast, oldChild) if !newChild.eq(oldChild) => + timezoneAwareExpressionResolver.withResolvedTimezone(newChild, conf.sessionLocalTimeZone) + case (newChild, _) => newChild + } + withTypeCoercion.withNewChildren(newChildren) + } +} diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/resolver/UnaryMinusResolver.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/resolver/UnaryMinusResolver.scala new file mode 100644 index 0000000000000..739d7cf43c183 --- /dev/null +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/resolver/UnaryMinusResolver.scala @@ -0,0 +1,60 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.catalyst.analysis.resolver + +import org.apache.spark.sql.catalyst.analysis.{AnsiTypeCoercion, TypeCoercion} +import org.apache.spark.sql.catalyst.expressions.{Expression, UnaryMinus} + +/** + * Resolver for [[UnaryMinus]]. Resolves children and applies type coercion to target node. + */ +class UnaryMinusResolver( + expressionResolver: ExpressionResolver, + timezoneAwareExpressionResolver: TimezoneAwareExpressionResolver) + extends TreeNodeResolver[UnaryMinus, Expression] + with ResolvesExpressionChildren { + + private val typeCoercionRules: Seq[Expression => Expression] = + if (conf.ansiEnabled) { + UnaryMinusResolver.ANSI_TYPE_COERCION_RULES + } else { + UnaryMinusResolver.TYPE_COERCION_RULES + } + private val typeCoercionResolver: TypeCoercionResolver = + new TypeCoercionResolver(timezoneAwareExpressionResolver, typeCoercionRules) + + override def resolve(unresolvedUnaryMinus: UnaryMinus): Expression = { + val unaryMinusWithResolvedChildren: UnaryMinus = + withResolvedChildren(unresolvedUnaryMinus, expressionResolver.resolve) + typeCoercionResolver.resolve(unaryMinusWithResolvedChildren) + } +} + +object UnaryMinusResolver { + // Ordering in the list of type coercions should be in sync with the list in [[TypeCoercion]]. + private val TYPE_COERCION_RULES: Seq[Expression => Expression] = Seq( + TypeCoercion.ImplicitTypeCoercion.apply, + TypeCoercion.DateTimeOperationsTypeCoercion.apply + ) + + // Ordering in the list of type coercions should be in sync with the list in [[AnsiTypeCoercion]]. 
+ private val ANSI_TYPE_COERCION_RULES: Seq[Expression => Expression] = Seq( + AnsiTypeCoercion.ImplicitTypeCoercion.apply, + AnsiTypeCoercion.AnsiDateTimeOperationsTypeCoercion.apply + ) +} diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/unresolved.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/unresolved.scala index 40994f42e71d6..fabe551d054ca 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/unresolved.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/unresolved.scala @@ -23,14 +23,14 @@ import org.apache.spark.sql.catalyst.{FunctionIdentifier, InternalRow, TableIden import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.expressions.codegen.{CodegenContext, ExprCode} import org.apache.spark.sql.catalyst.parser.CatalystSqlParser -import org.apache.spark.sql.catalyst.plans.logical.{CTERelationDef, LeafNode, LogicalPlan, UnaryNode} +import org.apache.spark.sql.catalyst.plans.logical.{LeafNode, LogicalPlan, UnaryNode} import org.apache.spark.sql.catalyst.trees.TreePattern._ import org.apache.spark.sql.catalyst.util._ import org.apache.spark.sql.catalyst.util.TypeUtils.toSQLId import org.apache.spark.sql.connector.catalog.TableWritePrivilege import org.apache.spark.sql.errors.{QueryCompilationErrors, QueryExecutionErrors} import org.apache.spark.sql.types.{DataType, Metadata, StructType} -import org.apache.spark.sql.util.CaseInsensitiveStringMap +import org.apache.spark.sql.util.{CaseInsensitiveStringMap, SchemaUtils} import org.apache.spark.util.ArrayImplicits._ /** @@ -76,17 +76,6 @@ case class PlanWithUnresolvedIdentifier( copy(identifierExpr, newChildren, planBuilder) } -/** - * A logical plan placeholder which delays CTE resolution - * to moment when PlanWithUnresolvedIdentifier gets resolved - */ -case class UnresolvedWithCTERelations( - unresolvedPlan: LogicalPlan, - cteRelations: Seq[(String, 
CTERelationDef)]) - extends UnresolvedLeafNode { - final override val nodePatterns: Seq[TreePattern] = Seq(UNRESOLVED_IDENTIFIER_WITH_CTE) -} - /** * An expression placeholder that holds the identifier clause string expression. It will be * replaced by the actual expression with the evaluated identifier string. @@ -217,7 +206,8 @@ case class ResolvedInlineTable(rows: Seq[Seq[Expression]], output: Seq[Attribute */ case class UnresolvedTableValuedFunction( name: Seq[String], - functionArgs: Seq[Expression]) + functionArgs: Seq[Expression], + override val isStreaming: Boolean = false) extends UnresolvedLeafNode { final override val nodePatterns: Seq[TreePattern] = Seq(UNRESOLVED_TABLE_VALUED_FUNCTION) @@ -439,7 +429,7 @@ object UnresolvedFunction { * Represents all of the input attributes to a given relational operator, for example in * "SELECT * FROM ...". A [[Star]] gets automatically expanded during analysis. */ -abstract class Star extends LeafExpression with NamedExpression { +trait Star extends NamedExpression { override def name: String = throw new UnresolvedException("name") override def exprId: ExprId = throw new UnresolvedException("exprId") @@ -461,15 +451,20 @@ abstract class Star extends LeafExpression with NamedExpression { * This is also used to expand structs. For example: * "SELECT record.* from (SELECT struct(a,b,c) as record ...) * - * @param target an optional name that should be the target of the expansion. If omitted all - * targets' columns are produced. This can either be a table name or struct name. This - * is a list of identifiers that is the path of the expansion. - * - * This class provides the shared behavior between the classes for SELECT * ([[UnresolvedStar]]) - * and SELECT * EXCEPT ([[UnresolvedStarExcept]]). [[UnresolvedStar]] is just a case class of this, - * while [[UnresolvedStarExcept]] adds some additional logic to the expand method. 
- */ -abstract class UnresolvedStarBase(target: Option[Seq[String]]) extends Star with Unevaluable { + * This trait provides the shared behavior among the classes for SELECT * ([[UnresolvedStar]]) + * and SELECT * EXCEPT ([[UnresolvedStarExceptOrReplace]]), etc. [[UnresolvedStar]] is just a case + * class of this, while [[UnresolvedStarExceptOrReplace]] or other classes add some additional logic + * to the expand method. + */ +trait UnresolvedStarBase extends Star with Unevaluable { + + /** + * An optional name that should be the target of the expansion. If omitted all + * targets' columns are produced. This can either be a table name or struct name. This + * is a list of identifiers that is the path of the expansion. + */ + def target: Option[Seq[String]] + /** * Returns true if the nameParts is a subset of the last elements of qualifier of the attribute. * @@ -518,7 +513,9 @@ abstract class UnresolvedStarBase(target: Option[Seq[String]]) extends Star with childOperatorMetadataOutput: Seq[Attribute], resolve: (Seq[String], Resolver) => Option[NamedExpression], suggestedAttributes: Seq[Attribute], - resolver: Resolver): Seq[NamedExpression] = { + resolver: Resolver, + cleanupNestedAliasesDuringStructExpansion: Boolean = false + ): Seq[NamedExpression] = { // If there is no table specified, use all non-hidden input attributes. if (target.isEmpty) return childOperatorOutput @@ -539,11 +536,22 @@ abstract class UnresolvedStarBase(target: Option[Seq[String]]) extends Star with // (i.e. [name].* is both a table and a struct), the struct path can always be qualified. val attribute = resolve(target.get, resolver) if (attribute.isDefined) { + // If cleanupNestedAliasesDuringStructExpansion is true, we remove nested aliases during + // struct expansion. This is something which is done in the CleanupAliases rule but for the + // single-pass analyzer it has to be done here to avoid additional tree traversals. 
+ val normalizedAttribute = if (cleanupNestedAliasesDuringStructExpansion) { + attribute.get match { + case a: Alias => a.child + case other => other + } + } else { + attribute.get + } // This target resolved to an attribute in child. It must be a struct. Expand it. - attribute.get.dataType match { + normalizedAttribute.dataType match { case s: StructType => s.zipWithIndex.map { case (f, i) => - val extract = GetStructField(attribute.get, i) + val extract = GetStructField(normalizedAttribute, i) Alias(extract, f.name)() } @@ -571,9 +579,16 @@ abstract class UnresolvedStarBase(target: Option[Seq[String]]) extends Star with * * @param excepts a list of names that should be excluded from the expansion. * + * @param replacements an optional list of expressions that should be used to replace the + * expressions removed by EXCEPT. If present, the length of this list must + * be the same as the length of the EXCEPT list. This supports replacing + * expressions instead of excluding them from the original SELECT list. */ -case class UnresolvedStarExcept(target: Option[Seq[String]], excepts: Seq[Seq[String]]) - extends UnresolvedStarBase(target) { +case class UnresolvedStarExceptOrReplace( + target: Option[Seq[String]], + excepts: Seq[Seq[String]], + replacements: Option[Seq[NamedExpression]]) + extends LeafExpression with UnresolvedStarBase { /** * We expand the * EXCEPT by the following three steps: @@ -652,7 +667,14 @@ case class UnresolvedStarExcept(target: Option[Seq[String]], excepts: Seq[Seq[St // group the except pairs by the column they refer to. NOTE: no groupMap until scala 2.13 val groupedExcepts: AttributeMap[Seq[Seq[String]]] = AttributeMap(excepts.groupBy(_._1.toAttribute).transform((_, v) => v.map(_._2))) - + // If the 'replacements' list is populated to indicate we should replace excepted columns + // with new expressions, we must have the same number of replacements as excepts. Keep an + // index to track the current replacement. 
+ replacements.foreach { r => + assert(excepts.size == r.size, + "The number of replacements must be the same as the number of excepts") + } + var replacementIndex = 0 // map input columns while searching for the except entry corresponding to the current column columns.map(col => col -> groupedExcepts.get(col.toAttribute)).collect { // pass through columns that don't match anything in groupedExcepts @@ -679,11 +701,15 @@ case class UnresolvedStarExcept(target: Option[Seq[String]], excepts: Seq[Seq[St filterColumns(extractedFields.toImmutableArraySeq, newExcepts)), col.name)() // if there are multiple nestedExcepts but one is empty we must have overlapping except // columns. throw an error. - case (col, Some(nestedExcepts)) if nestedExcepts.size > 1 => + case (_, Some(nestedExcepts)) if nestedExcepts.size > 1 => throw new AnalysisException( errorClass = "EXCEPT_OVERLAPPING_COLUMNS", messageParameters = Map( "columns" -> this.excepts.map(_.mkString(".")).mkString(", "))) + // found a match and the 'replacements' list is populated - replace the column + case (_, Some(_)) if replacements.nonEmpty => + replacementIndex += 1 + replacements.get(replacementIndex - 1) } } @@ -691,6 +717,103 @@ case class UnresolvedStarExcept(target: Option[Seq[String]], excepts: Seq[Seq[St } } +/** + * Represents some of the input attributes to a given relational operator, for example in + * `df.withColumn`. + * + * @param colNames a list of column names that should be replaced or produced. + * + * @param exprs the corresponding expressions for `colNames`. + * + * @param explicitMetadata an optional list of explicit metadata to associate with the columns. 
+ */ +case class UnresolvedStarWithColumns( + colNames: Seq[String], + exprs: Seq[Expression], + explicitMetadata: Option[Seq[Metadata]] = None) + extends UnresolvedStarBase { + + override def target: Option[Seq[String]] = None + override def children: Seq[Expression] = exprs + + override protected def withNewChildrenInternal( + newChildren: IndexedSeq[Expression]): UnresolvedStarWithColumns = + copy(exprs = newChildren) + + override def expand(input: LogicalPlan, resolver: Resolver): Seq[NamedExpression] = { + assert(colNames.size == exprs.size, + s"The size of column names: ${colNames.size} isn't equal to " + + s"the size of expressions: ${exprs.size}") + explicitMetadata.foreach { m => + assert(colNames.size == m.size, + s"The size of column names: ${colNames.size} isn't equal to " + + s"the size of metadata elements: ${m.size}") + } + + SchemaUtils.checkColumnNameDuplication(colNames, resolver) + + val expandedCols = super.expand(input, resolver) + + val columnSeq = explicitMetadata match { + case Some(ms) => colNames.zip(exprs).zip(ms.map(Some(_))) + case _ => colNames.zip(exprs).map((_, None)) + } + + val replacedAndExistingColumns = expandedCols.map { field => + columnSeq.find { case ((colName, _), _) => + resolver(field.name, colName) + } match { + case Some(((colName, expr), m)) => Alias(expr, colName)(explicitMetadata = m) + case _ => field + } + } + + val newColumns = columnSeq.filter { case ((colName, _), _) => + !expandedCols.exists(f => resolver(f.name, colName)) + }.map { + case ((colName, expr), m) => Alias(expr, colName)(explicitMetadata = m) + } + + replacedAndExistingColumns ++ newColumns + } +} + +/** + * Represents some of the input attributes to a given relational operator, for example in + * `df.withColumnRenamed`. + * + * @param existingNames a list of column names that should be replaced. + * If the column does not exist, it is ignored. + * + * @param newNames a list of new column names that should be used to replace the existing columns. 
+ */ +case class UnresolvedStarWithColumnsRenames( + existingNames: Seq[String], + newNames: Seq[String]) + extends LeafExpression with UnresolvedStarBase { + + override def target: Option[Seq[String]] = None + + override def expand(input: LogicalPlan, resolver: Resolver): Seq[NamedExpression] = { + assert(existingNames.size == newNames.size, + s"The size of existing column names: ${existingNames.size} isn't equal to " + + s"the size of new column names: ${newNames.size}") + + val expandedCols = super.expand(input, resolver) + + existingNames.zip(newNames).foldLeft(expandedCols) { + case (attrs, (existingName, newName)) => + attrs.map(attr => + if (resolver(attr.name, existingName)) { + Alias(attr, newName)() + } else { + attr + } + ) + } + } +} + /** * Represents all of the input attributes to a given relational operator, for example in * "SELECT * FROM ...". @@ -702,7 +825,8 @@ case class UnresolvedStarExcept(target: Option[Seq[String]], excepts: Seq[Seq[St * targets' columns are produced. This can either be a table name or struct name. This * is a list of identifiers that is the path of the expansion. */ -case class UnresolvedStar(target: Option[Seq[String]]) extends UnresolvedStarBase(target) +case class UnresolvedStar(target: Option[Seq[String]]) + extends LeafExpression with UnresolvedStarBase /** * Represents all of the input attributes to a given relational operator, for example in @@ -712,7 +836,7 @@ case class UnresolvedStar(target: Option[Seq[String]]) extends UnresolvedStarBas * tables' columns are produced. 
*/ case class UnresolvedRegex(regexPattern: String, table: Option[String], caseSensitive: Boolean) - extends Star with Unevaluable { + extends LeafExpression with Star with Unevaluable { override def expand(input: LogicalPlan, resolver: Resolver): Seq[NamedExpression] = { val pattern = if (caseSensitive) regexPattern else s"(?i)$regexPattern" table match { @@ -770,7 +894,8 @@ case class MultiAlias(child: Expression, names: Seq[String]) * * @param expressions Expressions to expand. */ -case class ResolvedStar(expressions: Seq[NamedExpression]) extends Star with Unevaluable { +case class ResolvedStar(expressions: Seq[NamedExpression]) + extends LeafExpression with Star with Unevaluable { override def newInstance(): NamedExpression = throw new UnresolvedException("newInstance") override def expand(input: LogicalPlan, resolver: Resolver): Seq[NamedExpression] = expressions override def toString: String = expressions.mkString("ResolvedStar(", ", ", ")") @@ -936,6 +1061,28 @@ case class UnresolvedOrdinal(ordinal: Int) final override val nodePatterns: Seq[TreePattern] = Seq(UNRESOLVED_ORDINAL) } +/** + * Represents an unresolved ordinal used in the GROUP BY clause of a SQL pipe aggregate operator + * ("|> AGGREGATE"). + * + * In this context, the ordinal refers to the one-based position of the column in the input + * relation. Note that this behavior is different from GROUP BY ordinals in regular SQL, wherein the + * ordinal refers to the one-based position of the column in the SELECT clause. 
+ * + * For example: + * {{{ + * values ('abc', 'def') tab(x, y) + * |> aggregate sum(x) group by 2 + * }}} + * @param ordinal ordinal starts from 1, instead of 0 + */ +case class UnresolvedPipeAggregateOrdinal(ordinal: Int) + extends LeafExpression with Unevaluable with NonSQLExpression { + override def dataType: DataType = throw new UnresolvedException("dataType") + override def nullable: Boolean = throw new UnresolvedException("nullable") + override lazy val resolved = false +} + /** * Represents unresolved having clause, the child for it can be Aggregate, GroupingSets, Rollup * and Cube. It is turned by the analyzer into a Filter. @@ -1004,42 +1151,28 @@ case class UnresolvedTranspose( copy(child = newChild) } -case class UnresolvedOuterReference( - nameParts: Seq[String]) - extends LeafExpression with NamedExpression with Unevaluable { - - def name: String = - nameParts.map(n => if (n.contains(".")) s"`$n`" else n).mkString(".") - - override def exprId: ExprId = throw new UnresolvedException("exprId") - override def dataType: DataType = throw new UnresolvedException("dataType") - override def nullable: Boolean = throw new UnresolvedException("nullable") - override def qualifier: Seq[String] = throw new UnresolvedException("qualifier") - override lazy val resolved = false - - override def toAttribute: Attribute = throw new UnresolvedException("toAttribute") - override def newInstance(): UnresolvedOuterReference = this - - final override val nodePatterns: Seq[TreePattern] = Seq(UNRESOLVED_OUTER_REFERENCE) +// A marker node to indicate that the logical plan containing this expression should be lazily +// analyzed in the DataFrame. This node will be removed at the beginning of analysis. 
+case class LazyExpression(child: Expression) extends UnaryExpression with Unevaluable { + override lazy val resolved: Boolean = false + override def dataType: DataType = child.dataType + override protected def withNewChildInternal(newChild: Expression): Expression = { + copy(child = newChild) + } + final override val nodePatterns: Seq[TreePattern] = Seq(LAZY_EXPRESSION) } -case class LazyOuterReference( - nameParts: Seq[String]) - extends LeafExpression with NamedExpression with Unevaluable with LazyAnalysisExpression { - - def name: String = - nameParts.map(n => if (n.contains(".")) s"`$n`" else n).mkString(".") - - override def exprId: ExprId = throw new UnresolvedException("exprId") - override def dataType: DataType = throw new UnresolvedException("dataType") +trait UnresolvedPlanId extends LeafExpression with Unevaluable { override def nullable: Boolean = throw new UnresolvedException("nullable") - override def qualifier: Seq[String] = throw new UnresolvedException("qualifier") + override def dataType: DataType = throw new UnresolvedException("dataType") + override lazy val resolved = false - override def toAttribute: Attribute = throw new UnresolvedException("toAttribute") - override def newInstance(): NamedExpression = LazyOuterReference(nameParts) + def planId: Long + def withPlan(plan: LogicalPlan): Expression - override def nodePatternsInternal(): Seq[TreePattern] = Seq(LAZY_OUTER_REFERENCE) + final override val nodePatterns: Seq[TreePattern] = + Seq(UNRESOLVED_PLAN_ID) ++ nodePatternsInternal() - override def prettyName: String = "outer" - override def sql: String = s"$prettyName($name)" + // Subclasses can override this function to provide more TreePatterns. 
+ def nodePatternsInternal(): Seq[TreePattern] = Seq() } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SQLFunction.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SQLFunction.scala new file mode 100644 index 0000000000000..923373c1856a9 --- /dev/null +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SQLFunction.scala @@ -0,0 +1,305 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.catalyst.catalog + +import scala.collection.mutable + +import org.json4s.JsonAST.{JArray, JString} +import org.json4s.jackson.JsonMethods.{compact, render} + +import org.apache.spark.SparkException +import org.apache.spark.sql.AnalysisException +import org.apache.spark.sql.catalyst.FunctionIdentifier +import org.apache.spark.sql.catalyst.catalog.UserDefinedFunction._ +import org.apache.spark.sql.catalyst.expressions.{Expression, ExpressionInfo, ScalarSubquery} +import org.apache.spark.sql.catalyst.parser.ParserInterface +import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, OneRowRelation, Project} +import org.apache.spark.sql.types.{DataType, StructType} + +/** + * Represents a SQL function. 
+ * + * @param name qualified name of the SQL function + * @param inputParam function input parameters + * @param returnType function return type + * @param exprText function body as an expression + * @param queryText function body as a query + * @param comment function comment + * @param deterministic whether the function is deterministic + * @param containsSQL whether the function's SQL data access characteristic is CONTAINS SQL + * @param isTableFunc whether the function is a table function + * @param properties additional properties to be serialized for the SQL function + * @param owner owner of the function + * @param createTimeMs function creation time in milliseconds + */ +case class SQLFunction( + name: FunctionIdentifier, + inputParam: Option[StructType], + returnType: Either[DataType, StructType], + exprText: Option[String], + queryText: Option[String], + comment: Option[String], + deterministic: Option[Boolean], + containsSQL: Option[Boolean], + isTableFunc: Boolean, + properties: Map[String, String], + owner: Option[String] = None, + createTimeMs: Long = System.currentTimeMillis) extends UserDefinedFunction { + + assert(exprText.nonEmpty || queryText.nonEmpty) + assert((isTableFunc && returnType.isRight) || (!isTableFunc && returnType.isLeft)) + + import SQLFunction._ + + override val language: RoutineLanguage = LanguageSQL + + /** + * Optionally get the function body as an expression or query using the given parser. + */ + def getExpressionAndQuery( + parser: ParserInterface, + isTableFunc: Boolean): (Option[Expression], Option[LogicalPlan]) = { + // The RETURN clause of the CREATE FUNCTION statement looks like this in the parser: + // RETURN (query | expression) + // If the 'query' matches and parses as a SELECT clause of one item with no FROM clause, and + // this is a scalar function, we skip a level of subquery expression wrapping by using the + // referenced expression directly. 
+ val parsedExpression = exprText.map(parser.parseExpression) + val parsedQuery = queryText.map(parser.parsePlan) + (parsedExpression, parsedQuery) match { + case (None, Some(Project(expr :: Nil, _: OneRowRelation))) + if !isTableFunc => + (Some(expr), None) + case (Some(ScalarSubquery(Project(expr :: Nil, _: OneRowRelation), _, _, _, _, _, _)), None) + if !isTableFunc => + (Some(expr), None) + case (_, _) => + (parsedExpression, parsedQuery) + } + } + + /** Get scalar function return data type. */ + def getScalarFuncReturnType: DataType = returnType match { + case Left(dataType) => dataType + case Right(_) => + throw SparkException.internalError( + "This function is a table function, not a scalar function.") + } + + /** Get table function return columns. */ + def getTableFuncReturnCols: StructType = returnType match { + case Left(_) => + throw SparkException.internalError( + "This function is a scalar function, not a table function.") + case Right(columns) => columns + } + + /** + * Convert the SQL function to a [[CatalogFunction]]. + */ + def toCatalogFunction: CatalogFunction = { + val props = sqlFunctionToProps ++ properties + CatalogFunction( + identifier = name, + className = SQL_FUNCTION_PREFIX, + resources = propertiesToFunctionResources(props, name)) + } + + /** + * Convert the SQL function to an [[ExpressionInfo]]. + */ + def toExpressionInfo: ExpressionInfo = { + val props = sqlFunctionToProps ++ functionMetadataToProps ++ properties + val usage = mapper.writeValueAsString(props) + new ExpressionInfo( + SQL_FUNCTION_PREFIX, + name.database.orNull, + name.funcName, + usage, + "", + "", + "", + "", + "", + "", + "sql_udf") + } + + /** + * Convert the SQL function fields into properties. 
+ */ + private def sqlFunctionToProps: Map[String, String] = { + val props = new mutable.HashMap[String, String] + val inputParamText = inputParam.map(_.fields.map(_.toDDL).mkString(", ")) + inputParamText.foreach(props.put(INPUT_PARAM, _)) + val returnTypeText = returnType match { + case Left(dataType) => dataType.sql + case Right(columns) => columns.toDDL + } + props.put(RETURN_TYPE, returnTypeText) + exprText.foreach(props.put(EXPRESSION, _)) + queryText.foreach(props.put(QUERY, _)) + comment.foreach(props.put(COMMENT, _)) + deterministic.foreach(d => props.put(DETERMINISTIC, d.toString)) + containsSQL.foreach(x => props.put(CONTAINS_SQL, x.toString)) + props.put(IS_TABLE_FUNC, isTableFunc.toString) + props.toMap + } + + private def functionMetadataToProps: Map[String, String] = { + val props = new mutable.HashMap[String, String] + owner.foreach(props.put(OWNER, _)) + props.put(CREATE_TIME, createTimeMs.toString) + props.toMap + } +} + +object SQLFunction { + + private val SQL_FUNCTION_PREFIX = "sqlFunction." + + private val INPUT_PARAM: String = SQL_FUNCTION_PREFIX + "inputParam" + private val RETURN_TYPE: String = SQL_FUNCTION_PREFIX + "returnType" + private val EXPRESSION: String = SQL_FUNCTION_PREFIX + "expression" + private val QUERY: String = SQL_FUNCTION_PREFIX + "query" + private val COMMENT: String = SQL_FUNCTION_PREFIX + "comment" + private val DETERMINISTIC: String = SQL_FUNCTION_PREFIX + "deterministic" + private val CONTAINS_SQL: String = SQL_FUNCTION_PREFIX + "containsSQL" + private val IS_TABLE_FUNC: String = SQL_FUNCTION_PREFIX + "isTableFunc" + private val OWNER: String = SQL_FUNCTION_PREFIX + "owner" + private val CREATE_TIME: String = SQL_FUNCTION_PREFIX + "createTime" + + private val FUNCTION_CATALOG_AND_NAMESPACE = "catalogAndNamespace.numParts" + private val FUNCTION_CATALOG_AND_NAMESPACE_PART_PREFIX = "catalogAndNamespace.part." 
+ + private val FUNCTION_REFERRED_TEMP_VIEW_NAMES = "referredTempViewNames" + private val FUNCTION_REFERRED_TEMP_FUNCTION_NAMES = "referredTempFunctionsNames" + private val FUNCTION_REFERRED_TEMP_VARIABLE_NAMES = "referredTempVariableNames" + + /** + * Convert a [[CatalogFunction]] into a SQL function. + */ + def fromCatalogFunction(function: CatalogFunction, parser: ParserInterface): SQLFunction = { + try { + val parts = function.resources.collect { case FunctionResource(FileResource, uri) => + val index = uri.substring(0, INDEX_LENGTH).toInt + val body = uri.substring(INDEX_LENGTH) + index -> body + } + val blob = parts.sortBy(_._1).map(_._2).mkString + val props = mapper.readValue(blob, classOf[Map[String, String]]) + val isTableFunc = props(IS_TABLE_FUNC).toBoolean + val returnType = parseReturnTypeText(props(RETURN_TYPE), isTableFunc, parser) + SQLFunction( + name = function.identifier, + inputParam = props.get(INPUT_PARAM).map(parseTableSchema(_, parser)), + returnType = returnType.get, + exprText = props.get(EXPRESSION), + queryText = props.get(QUERY), + comment = props.get(COMMENT), + deterministic = props.get(DETERMINISTIC).map(_.toBoolean), + containsSQL = props.get(CONTAINS_SQL).map(_.toBoolean), + isTableFunc = isTableFunc, + props.filterNot(_._1.startsWith(SQL_FUNCTION_PREFIX))) + } catch { + case e: Exception => + throw new AnalysisException( + errorClass = "CORRUPTED_CATALOG_FUNCTION", + messageParameters = Map( + "identifier" -> s"${function.identifier}", + "className" -> s"${function.className}"), cause = Some(e) + ) + } + } + + def parseDefault(text: String, parser: ParserInterface): Expression = { + parser.parseExpression(text) + } + + /** + * This method returns an optional DataType indicating, when present, either the return type for + * scalar user-defined functions, or a StructType indicating the names and types of the columns in + * the output schema for table functions. 
If the optional value is empty, this indicates that the + * CREATE FUNCTION statement did not have any RETURNS clause at all (for scalar functions), or + * that it included a RETURNS TABLE clause but without any specified output schema (for table + * functions), prompting the analyzer to infer these metadata instead. + */ + def parseReturnTypeText( + text: String, + isTableFunc: Boolean, + parser: ParserInterface): Option[Either[DataType, StructType]] = { + if (!isTableFunc) { + // This is a scalar user-defined function. + if (text.isEmpty) { + // The CREATE FUNCTION statement did not have any RETURNS clause. + Option.empty[Either[DataType, StructType]] + } else { + // The CREATE FUNCTION statement included a RETURNS clause with an explicit return type. + Some(Left(parseDataType(text, parser))) + } + } else { + // This is a table function. + if (text.equalsIgnoreCase("table")) { + // The CREATE FUNCTION statement had a RETURNS TABLE clause but without any explicit schema. + Option.empty[Either[DataType, StructType]] + } else { + // The CREATE FUNCTION statement included a RETURNS TABLE clause with an explicit schema. + Some(Right(parseTableSchema(text, parser))) + } + } + } + + def isSQLFunction(className: String): Boolean = className == SQL_FUNCTION_PREFIX + + /** + * Convert the current catalog and namespace to properties. + */ + def catalogAndNamespaceToProps( + currentCatalog: String, + currentNamespace: Seq[String]): Map[String, String] = { + val props = new mutable.HashMap[String, String] + val parts = currentCatalog +: currentNamespace + if (parts.nonEmpty) { + props.put(FUNCTION_CATALOG_AND_NAMESPACE, parts.length.toString) + parts.zipWithIndex.foreach { case (name, index) => + props.put(s"$FUNCTION_CATALOG_AND_NAMESPACE_PART_PREFIX$index", name) + } + } + props.toMap + } + + /** + * Convert the temporary object names to properties. 
+ */ + def referredTempNamesToProps( + viewNames: Seq[Seq[String]], + functionsNames: Seq[String], + variableNames: Seq[Seq[String]]): Map[String, String] = { + val viewNamesJson = + JArray(viewNames.map(nameParts => JArray(nameParts.map(JString).toList)).toList) + val functionsNamesJson = JArray(functionsNames.map(JString).toList) + val variableNamesJson = + JArray(variableNames.map(nameParts => JArray(nameParts.map(JString).toList)).toList) + + val props = new mutable.HashMap[String, String] + props.put(FUNCTION_REFERRED_TEMP_VIEW_NAMES, compact(render(viewNamesJson))) + props.put(FUNCTION_REFERRED_TEMP_FUNCTION_NAMES, compact(render(functionsNamesJson))) + props.put(FUNCTION_REFERRED_TEMP_VARIABLE_NAMES, compact(render(variableNamesJson))) + props.toMap + } +} diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala index a0f7af10fefaf..b123952c5f086 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala @@ -19,25 +19,28 @@ package org.apache.spark.sql.catalyst.catalog import java.net.URI import java.util.Locale -import java.util.concurrent.Callable -import java.util.concurrent.TimeUnit +import java.util.concurrent.{Callable, ExecutionException, TimeUnit} import javax.annotation.concurrent.GuardedBy import scala.collection.mutable import scala.util.{Failure, Success, Try} import com.google.common.cache.{Cache, CacheBuilder} +import com.google.common.util.concurrent.UncheckedExecutionException import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path -import org.apache.spark.SparkException +import org.apache.spark.{SparkException, SparkThrowable} import org.apache.spark.internal.Logging +import org.apache.spark.sql.AnalysisException import org.apache.spark.sql.catalyst._ import 
org.apache.spark.sql.catalyst.analysis._ import org.apache.spark.sql.catalyst.analysis.FunctionRegistry.FunctionBuilder -import org.apache.spark.sql.catalyst.expressions.{Alias, Cast, Expression, ExpressionInfo, NamedExpression, UpCast} +import org.apache.spark.sql.catalyst.analysis.TableFunctionRegistry.TableFunctionBuilder +import org.apache.spark.sql.catalyst.catalog.SQLFunction.parseDefault +import org.apache.spark.sql.catalyst.expressions.{Alias, AttributeReference, Cast, Expression, ExpressionInfo, NamedArgumentExpression, NamedExpression, ScalarSubquery, UpCast} import org.apache.spark.sql.catalyst.parser.{CatalystSqlParser, ParserInterface} -import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, Project, SubqueryAlias, View} +import org.apache.spark.sql.catalyst.plans.logical.{FunctionSignature, InputParameter, LocalRelation, LogicalPlan, NamedParametersSupport, Project, SubqueryAlias, View} import org.apache.spark.sql.catalyst.trees.CurrentOrigin import org.apache.spark.sql.catalyst.util.{CharVarcharUtils, StringUtils} import org.apache.spark.sql.connector.catalog.CatalogManager @@ -45,7 +48,7 @@ import org.apache.spark.sql.connector.catalog.CatalogManager.SESSION_CATALOG_NAM import org.apache.spark.sql.errors.{QueryCompilationErrors, QueryExecutionErrors} import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.internal.StaticSQLConf.GLOBAL_TEMP_DATABASE -import org.apache.spark.sql.types.{StructField, StructType} +import org.apache.spark.sql.types.{MetadataBuilder, StructField, StructType} import org.apache.spark.sql.util.{CaseInsensitiveStringMap, PartitioningUtils} import org.apache.spark.util.ArrayImplicits._ import org.apache.spark.util.Utils @@ -210,7 +213,13 @@ class SessionCatalog( /** This method provides a way to get a cached plan. 
*/ def getCachedPlan(t: QualifiedTableName, c: Callable[LogicalPlan]): LogicalPlan = { - tableRelationCache.get(t, c) + try { + tableRelationCache.get(t, c) + } catch { + case e @ (_: ExecutionException | _: UncheckedExecutionException) + if e.getCause != null && e.getCause.isInstanceOf[SparkThrowable] => + throw e.getCause + } } /** This method provides a way to get a cached plan if the key exists. */ @@ -1526,10 +1535,146 @@ class SessionCatalog( } } + /** + * Create a user defined function. + */ + def createUserDefinedFunction(function: UserDefinedFunction, ignoreIfExists: Boolean): Unit = { + createFunction(function.toCatalogFunction, ignoreIfExists) + } + // ---------------------------------------------------------------- // | Methods that interact with temporary and metastore functions | // ---------------------------------------------------------------- + /** + * Constructs a [[FunctionBuilder]] based on the provided class that represents a function. + */ + private def makeSQLFunctionBuilder(function: SQLFunction): FunctionBuilder = { + if (function.isTableFunc) { + throw UserDefinedFunctionErrors.notAScalarFunction(function.name.nameParts) + } + (input: Seq[Expression]) => { + val args = rearrangeArguments(function.inputParam, input, function.name.toString) + val returnType = function.getScalarFuncReturnType + SQLFunctionExpression( + function.name.unquotedString, function, args, Some(returnType)) + } + } + + /** + * Constructs a scalar SQL function logical plan. The logical plan will be used to + * construct actual expression from the function inputs and body. + * + * The body of a scalar SQL function can either be an expression or a query returns + * one single column. 
+ * + * Example scalar SQL function with an expression: + * + * CREATE FUNCTION area(width DOUBLE, height DOUBLE) RETURNS DOUBLE + * RETURN width * height; + * + * Query: + * + * SELECT area(a, b) FROM t; + * + * SQL function plan: + * + * Project [CAST(width * height AS DOUBLE) AS area] + * +- Project [CAST(a AS DOUBLE) AS width, CAST(b AS DOUBLE) AS height] + * +- LocalRelation [a, b] + * + * Example scalar SQL function with a subquery: + * + * CREATE FUNCTION foo(x INT) RETURNS INT + * RETURN SELECT SUM(b) FROM t WHERE x = a; + * + * SELECT foo(a) FROM t; + * + * SQL function plan: + * + * Project [scalar-subquery AS foo] + * : +- Aggregate [] [sum(b)] + * : +- Filter [outer(x) = a] + * : +- Relation [a, b] + * +- Project [CAST(a AS INT) AS x] + * +- LocalRelation [a, b] + */ + def makeSQLFunctionPlan( + name: String, + function: SQLFunction, + input: Seq[Expression]): LogicalPlan = { + def metaForFuncInputAlias = { + new MetadataBuilder() + .putString("__funcInputAlias", "true") + .build() + } + assert(!function.isTableFunc) + val funcName = function.name.funcName + + // Use captured SQL configs when parsing a SQL function. + val conf = new SQLConf() + function.getSQLConfigs.foreach { case (k, v) => conf.settings.put(k, v) } + SQLConf.withExistingConf(conf) { + val inputParam = function.inputParam + val returnType = function.getScalarFuncReturnType + val (expression, query) = function.getExpressionAndQuery(parser, isTableFunc = false) + assert(expression.isDefined || query.isDefined) + + // Check function arguments + val paramSize = inputParam.map(_.size).getOrElse(0) + if (input.size > paramSize) { + throw QueryCompilationErrors.wrongNumArgsError( + name, paramSize.toString, input.size) + } + + val inputs = inputParam.map { param => + // Attributes referencing the input parameters inside the function can use the + // function name as a qualifier. 
E.G.: + // `create function foo(a int) returns int return foo.a` + val qualifier = Seq(funcName) + val paddedInput = input ++ + param.takeRight(paramSize - input.size).map { p => + val defaultExpr = p.getDefault() + if (defaultExpr.isDefined) { + Cast(parseDefault(defaultExpr.get, parser), p.dataType) + } else { + throw QueryCompilationErrors.wrongNumArgsError( + name, paramSize.toString, input.size) + } + } + + paddedInput.zip(param.fields).map { + case (expr, param) => + Alias(Cast(expr, param.dataType), param.name)( + qualifier = qualifier, + // mark the alias as function input + explicitMetadata = Some(metaForFuncInputAlias)) + } + }.getOrElse(Nil) + + val body = if (query.isDefined) ScalarSubquery(query.get) else expression.get + Project(Alias(Cast(body, returnType), funcName)() :: Nil, + Project(inputs, LocalRelation(inputs.flatMap(_.references)))) + } + } + + /** + * Constructs a [[TableFunctionBuilder]] based on the provided class that represents a function. + */ + private def makeSQLTableFunctionBuilder(function: SQLFunction): TableFunctionBuilder = { + if (!function.isTableFunc) { + throw UserDefinedFunctionErrors.notATableFunction(function.name.nameParts) + } + (input: Seq[Expression]) => { + val args = rearrangeArguments(function.inputParam, input, function.name.toString) + val returnParam = function.getTableFuncReturnCols + val output = returnParam.fields.map { param => + AttributeReference(param.name, param.dataType, param.nullable)() + } + SQLTableFunction(function.name.unquotedString, function, args, output.toSeq) + } + } + /** * Constructs a [[FunctionBuilder]] based on the provided function metadata. 
*/ @@ -1544,6 +1689,24 @@ class SessionCatalog( (input: Seq[Expression]) => functionExpressionBuilder.makeExpression(name, clazz, input) } + private def makeUserDefinedScalarFuncBuilder(func: UserDefinedFunction): FunctionBuilder = { + func match { + case f: SQLFunction => makeSQLFunctionBuilder(f) + case _ => + val clsName = func.getClass.getSimpleName + throw UserDefinedFunctionErrors.unsupportedUserDefinedFunction(clsName) + } + } + + private def makeUserDefinedTableFuncBuilder(func: UserDefinedFunction): TableFunctionBuilder = { + func match { + case f: SQLFunction => makeSQLTableFunctionBuilder(f) + case _ => + val clsName = func.getClass.getSimpleName + throw UserDefinedFunctionErrors.unsupportedUserDefinedFunction(clsName) + } + } + /** * Loads resources such as JARs and Files for a function. Every resource is represented * by a tuple (resource type, resource uri). @@ -1591,6 +1754,81 @@ class SessionCatalog( "hive") } + /** + * Registers a temporary or persistent SQL scalar function into a session-specific + * [[FunctionRegistry]]. + */ + def registerSQLScalarFunction( + function: SQLFunction, + overrideIfExists: Boolean): Unit = { + registerUserDefinedFunction[Expression]( + function, + overrideIfExists, + functionRegistry, + makeSQLFunctionBuilder(function)) + } + + /** + * Registers a temporary or persistent SQL table function into a session-specific + * [[TableFunctionRegistry]]. + */ + def registerSQLTableFunction( + function: SQLFunction, + overrideIfExists: Boolean): Unit = { + registerUserDefinedFunction[LogicalPlan]( + function, + overrideIfExists, + tableFunctionRegistry, + makeSQLTableFunctionBuilder(function)) + } + + /** + * Rearranges the arguments of a UDF into positional order. 
+ */ + private def rearrangeArguments( + inputParams: Option[StructType], + expressions: Seq[Expression], + functionName: String) : Seq[Expression] = { + val firstNamedArgumentExpressionIdx = + expressions.indexWhere(_.isInstanceOf[NamedArgumentExpression]) + if (firstNamedArgumentExpressionIdx == -1) { + return expressions + } + + val paramNames: Seq[InputParameter] = + if (inputParams.isDefined) { + inputParams.get.map { + p => p.getDefault() match { + case Some(defaultExpr) => + // This cast is needed to ensure the default value is of the target data type. + InputParameter(p.name, Some(Cast(parseDefault(defaultExpr, parser), p.dataType))) + case None => + InputParameter(p.name) + } + }.toSeq + } else { + Seq() + } + + NamedParametersSupport.defaultRearrange( + FunctionSignature(paramNames), expressions, functionName) + } + + /** + * Registers a temporary or permanent SQL function into a session-specific function registry. + */ + private def registerUserDefinedFunction[T]( + function: UserDefinedFunction, + overrideIfExists: Boolean, + registry: FunctionRegistryBase[T], + functionBuilder: Seq[Expression] => T): Unit = { + if (registry.functionExists(function.name) && !overrideIfExists) { + throw QueryCompilationErrors.functionAlreadyExistsError(function.name) + } + val info = function.toExpressionInfo + registry.registerFunction(function.name, info, functionBuilder) + } + /** * Unregister a temporary or permanent function from a session-specific [[FunctionRegistry]] * or [[TableFunctionRegistry]]. Return true if function exists. 
@@ -1747,7 +1985,11 @@ class SessionCatalog( requireDbExists(db) if (externalCatalog.functionExists(db, funcName)) { val metadata = externalCatalog.getFunction(db, funcName) - makeExprInfoForHiveFunction(metadata.copy(identifier = qualifiedIdent)) + if (metadata.isUserDefinedFunction) { + UserDefinedFunction.fromCatalogFunction(metadata, parser).toExpressionInfo + } else { + makeExprInfoForHiveFunction(metadata.copy(identifier = qualifiedIdent)) + } } else { failFunctionLookup(name) } @@ -1759,7 +2001,26 @@ class SessionCatalog( */ def resolvePersistentFunction( name: FunctionIdentifier, arguments: Seq[Expression]): Expression = { - resolvePersistentFunctionInternal(name, arguments, functionRegistry, makeFunctionBuilder) + resolvePersistentFunctionInternal[Expression]( + name, + arguments, + functionRegistry, + registerHiveFunc = func => + registerFunction( + func, + overrideIfExists = false, + registry = functionRegistry, + functionBuilder = makeFunctionBuilder(func) + ), + registerUserDefinedFunc = function => { + val builder = makeUserDefinedScalarFuncBuilder(function) + registerUserDefinedFunction[Expression]( + function = function, + overrideIfExists = false, + registry = functionRegistry, + functionBuilder = builder) + } + ) } /** @@ -1768,16 +2029,29 @@ class SessionCatalog( def resolvePersistentTableFunction( name: FunctionIdentifier, arguments: Seq[Expression]): LogicalPlan = { - // We don't support persistent table functions yet. - val builder = (func: CatalogFunction) => failFunctionLookup(name) - resolvePersistentFunctionInternal(name, arguments, tableFunctionRegistry, builder) + resolvePersistentFunctionInternal[LogicalPlan]( + name, + arguments, + tableFunctionRegistry, + // We don't support persistent Hive table functions yet. 
+ registerHiveFunc = (func: CatalogFunction) => failFunctionLookup(name), + registerUserDefinedFunc = function => { + val builder = makeUserDefinedTableFuncBuilder(function) + registerUserDefinedFunction[LogicalPlan]( + function = function, + overrideIfExists = false, + registry = tableFunctionRegistry, + functionBuilder = builder) + } + ) } private def resolvePersistentFunctionInternal[T]( name: FunctionIdentifier, arguments: Seq[Expression], registry: FunctionRegistryBase[T], - createFunctionBuilder: CatalogFunction => FunctionRegistryBase[T]#FunctionBuilder): T = { + registerHiveFunc: CatalogFunction => Unit, + registerUserDefinedFunc: UserDefinedFunction => Unit): T = { // `synchronized` is used to prevent multiple threads from concurrently resolving the // same function that has not yet been loaded into the function registry. This is needed // because calling `registerFunction` twice with `overrideIfExists = false` can lead to @@ -1793,19 +2067,24 @@ class SessionCatalog( // The function has not been loaded to the function registry, which means // that the function is a persistent function (if it actually has been registered // in the metastore). We need to first put the function in the function registry. - val catalogFunction = externalCatalog.getFunction(db, funcName) - loadFunctionResources(catalogFunction.resources) + val catalogFunction = try { + externalCatalog.getFunction(db, funcName) + } catch { + case _: AnalysisException => failFunctionLookup(qualifiedIdent) + } // Please note that qualifiedName is provided by the user. However, // catalogFunction.identifier.unquotedString is returned by the underlying // catalog. So, it is possible that qualifiedName is not exactly the same as // catalogFunction.identifier.unquotedString (difference is on case-sensitivity). // At here, we preserve the input from the user. 
val funcMetadata = catalogFunction.copy(identifier = qualifiedIdent) - registerFunction( - funcMetadata, - overrideIfExists = false, - registry = registry, - functionBuilder = createFunctionBuilder(funcMetadata)) + if (!catalogFunction.isUserDefinedFunction) { + loadFunctionResources(catalogFunction.resources) + registerHiveFunc(funcMetadata) + } else { + val function = UserDefinedFunction.fromCatalogFunction(funcMetadata, parser) + registerUserDefinedFunc(function) + } // Now, we need to create the Expression. registry.lookupFunction(qualifiedIdent, arguments) } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/TempVariableManager.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/TempVariableManager.scala index abe6cede0c550..2c262da1f4449 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/TempVariableManager.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/TempVariableManager.scala @@ -63,6 +63,10 @@ class TempVariableManager extends DataTypeErrorsBase { def clear(): Unit = synchronized { variables.clear() } + + def isEmpty: Boolean = synchronized { + variables.isEmpty + } } case class VariableDefinition(defaultValueSQL: String, currentValue: Literal) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/UserDefinedFunction.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/UserDefinedFunction.scala new file mode 100644 index 0000000000000..a76ca7b15c278 --- /dev/null +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/UserDefinedFunction.scala @@ -0,0 +1,166 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.catalyst.catalog + +import com.fasterxml.jackson.annotation.JsonInclude.Include +import com.fasterxml.jackson.databind.{DeserializationFeature, ObjectMapper} +import com.fasterxml.jackson.module.scala.{ClassTagExtensions, DefaultScalaModule} + +import org.apache.spark.SparkException +import org.apache.spark.sql.catalyst.FunctionIdentifier +import org.apache.spark.sql.catalyst.expressions.ExpressionInfo +import org.apache.spark.sql.catalyst.parser.ParserInterface +import org.apache.spark.sql.catalyst.util.CharVarcharUtils +import org.apache.spark.sql.types.{DataType, StructType} + +/** + * The base class for all user defined functions registered via SQL. + */ +trait UserDefinedFunction { + + /** + * Qualified name of the function + */ + def name: FunctionIdentifier + + /** + * Additional properties to be serialized for the function. + * Use this to preserve the runtime configuration that should be used during the function + * execution, such as SQL configs etc. See [[SQLConf]] for more info. + */ + def properties: Map[String, String] + + /** + * Get SQL configs from the function properties. + * Use this to restore the SQL configs that should be used for this function. 
+ */ + def getSQLConfigs: Map[String, String] = { + UserDefinedFunction.propertiesToSQLConfigs(properties) + } + + /** + * Owner of the function + */ + def owner: Option[String] + + /** + * Function creation time in milliseconds since the Unix epoch + */ + def createTimeMs: Long + + /** + * The language of the user defined function. + */ + def language: RoutineLanguage + + /** + * Convert the function to a [[CatalogFunction]]. + */ + def toCatalogFunction: CatalogFunction + + /** + * Convert the SQL function to an [[ExpressionInfo]]. + */ + def toExpressionInfo: ExpressionInfo +} + +object UserDefinedFunction { + val SQL_CONFIG_PREFIX = "sqlConfig." + val INDEX_LENGTH: Int = 3 + + // The default Hive Metastore SQL schema length for function resource uri. + private val HIVE_FUNCTION_RESOURCE_URI_LENGTH_THRESHOLD: Int = 4000 + + def parseTableSchema(text: String, parser: ParserInterface): StructType = { + val parsed = parser.parseTableSchema(text) + CharVarcharUtils.failIfHasCharVarchar(parsed).asInstanceOf[StructType] + } + + def parseDataType(text: String, parser: ParserInterface): DataType = { + val dataType = parser.parseDataType(text) + CharVarcharUtils.failIfHasCharVarchar(dataType) + } + + private val _mapper: ObjectMapper = getObjectMapper + + /** + * A shared [[ObjectMapper]] for serializations. + */ + def mapper: ObjectMapper = _mapper + + /** + * Convert the given properties to a list of function resources. + */ + def propertiesToFunctionResources( + props: Map[String, String], + name: FunctionIdentifier): Seq[FunctionResource] = { + val blob = mapper.writeValueAsString(props) + val threshold = HIVE_FUNCTION_RESOURCE_URI_LENGTH_THRESHOLD - INDEX_LENGTH + blob.grouped(threshold).zipWithIndex.map { case (part, i) => + // Add a sequence number to the part and pad it to a given length. + // E.g. 1 will become "001" if the given length is 3. 
+ val index = s"%0${INDEX_LENGTH}d".format(i) + if (index.length > INDEX_LENGTH) { + throw UserDefinedFunctionErrors.routinePropertyTooLarge(name.funcName) + } + FunctionResource(FileResource, index + part) + }.toSeq + } + + /** + * Get an object mapper to serialize and deserialize function properties. + */ + private def getObjectMapper: ObjectMapper = { + val mapper = new ObjectMapper with ClassTagExtensions + mapper.setSerializationInclusion(Include.NON_ABSENT) + mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false) + mapper.registerModule(DefaultScalaModule) + mapper + } + + /** + * Convert a [[CatalogFunction]] into a corresponding UDF. + */ + def fromCatalogFunction(function: CatalogFunction, parser: ParserInterface) + : UserDefinedFunction = { + val className = function.className + if (SQLFunction.isSQLFunction(className)) { + SQLFunction.fromCatalogFunction(function, parser) + } else { + throw SparkException.internalError(s"Unsupported function type $className") + } + } + + /** + * Verify if the function is a [[UserDefinedFunction]]. + */ + def isUserDefinedFunction(className: String): Boolean = SQLFunction.isSQLFunction(className) + + /** + * Convert properties to SQL configs. 
+ */ + def propertiesToSQLConfigs(properties: Map[String, String]): Map[String, String] = { + try { + for ((key, value) <- properties if key.startsWith(SQL_CONFIG_PREFIX)) + yield (key.substring(SQL_CONFIG_PREFIX.length), value) + } catch { + case e: Exception => throw SparkException.internalError( + "Corrupted user defined function SQL configs in catalog", cause = e) + } + } +} diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/UserDefinedFunctionErrors.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/UserDefinedFunctionErrors.scala index a5381669caea8..904a17bc8ce44 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/UserDefinedFunctionErrors.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/UserDefinedFunctionErrors.scala @@ -18,10 +18,12 @@ package org.apache.spark.sql.catalyst.catalog import org.apache.spark.SparkException +import org.apache.spark.sql.AnalysisException import org.apache.spark.sql.errors.QueryErrorsBase /** - * Errors during registering and executing [[UserDefinedFunction]]s. + * Errors during registering and executing + * [[org.apache.spark.sql.expressions.UserDefinedFunction]]s. 
*/ object UserDefinedFunctionErrors extends QueryErrorsBase { def unsupportedUserDefinedFunction(language: RoutineLanguage): Throwable = { @@ -31,4 +33,86 @@ object UserDefinedFunctionErrors extends QueryErrorsBase { def unsupportedUserDefinedFunction(language: String): Throwable = { SparkException.internalError(s"Unsupported user defined function type: $language") } + + def duplicateParameterNames(routineName: String, names: String): Throwable = { + new AnalysisException( + errorClass = "DUPLICATE_ROUTINE_PARAMETER_NAMES", + messageParameters = Map("routineName" -> routineName, "names" -> names)) + } + + def duplicateReturnsColumns(routineName: String, columns: String): Throwable = { + new AnalysisException( + errorClass = "DUPLICATE_ROUTINE_RETURNS_COLUMNS", + messageParameters = Map("routineName" -> routineName, "columns" -> columns)) + } + + def cannotSpecifyNotNullOnFunctionParameters(input: String): Throwable = { + new AnalysisException( + errorClass = "USER_DEFINED_FUNCTIONS.NOT_NULL_ON_FUNCTION_PARAMETERS", + messageParameters = Map("input" -> input)) + } + + def bodyIsNotAQueryForSqlTableUdf(functionName: String): Throwable = { + new AnalysisException( + errorClass = "USER_DEFINED_FUNCTIONS.SQL_TABLE_UDF_BODY_MUST_BE_A_QUERY", + messageParameters = Map("name" -> functionName)) + } + + def missingColumnNamesForSqlTableUdf(functionName: String): Throwable = { + new AnalysisException( + errorClass = "USER_DEFINED_FUNCTIONS.SQL_TABLE_UDF_MISSING_COLUMN_NAMES", + messageParameters = Map("functionName" -> toSQLId(functionName))) + } + + def invalidTempViewReference(routineName: Seq[String], tempViewName: Seq[String]): Throwable = { + new AnalysisException( + errorClass = "INVALID_TEMP_OBJ_REFERENCE", + messageParameters = Map( + "obj" -> "FUNCTION", + "objName" -> toSQLId(routineName), + "tempObj" -> "VIEW", + "tempObjName" -> toSQLId(tempViewName) + ) + ) + } + + def invalidTempFuncReference(routineName: Seq[String], tempFuncName: String): Throwable = { + new 
AnalysisException( + errorClass = "INVALID_TEMP_OBJ_REFERENCE", + messageParameters = Map( + "obj" -> "FUNCTION", + "objName" -> toSQLId(routineName), + "tempObj" -> "FUNCTION", + "tempObjName" -> toSQLId(tempFuncName) + ) + ) + } + + def invalidTempVarReference(routineName: Seq[String], varName: Seq[String]): Throwable = { + new AnalysisException( + errorClass = "INVALID_TEMP_OBJ_REFERENCE", + messageParameters = Map( + "obj" -> "FUNCTION", + "objName" -> toSQLId(routineName), + "tempObj" -> "VARIABLE", + "tempObjName" -> toSQLId(varName))) + } + + def routinePropertyTooLarge(routineName: String): Throwable = { + new AnalysisException( + errorClass = "USER_DEFINED_FUNCTIONS.ROUTINE_PROPERTY_TOO_LARGE", + messageParameters = Map("name" -> toSQLId(routineName))) + } + + def notAScalarFunction(functionName: Seq[String]): Throwable = { + new AnalysisException( + errorClass = "NOT_A_SCALAR_FUNCTION", + messageParameters = Map("functionName" -> toSQLId(functionName))) + } + + def notATableFunction(functionName: Seq[String]): Throwable = { + new AnalysisException( + errorClass = "NOT_A_TABLE_FUNCTION", + messageParameters = Map("functionName" -> toSQLId(functionName))) + } } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala index dcd1d3137da3f..7836e533c8b5c 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala @@ -19,7 +19,6 @@ package org.apache.spark.sql.catalyst.catalog import java.net.URI import java.time.{ZoneId, ZoneOffset} -import java.util.Date import scala.collection.mutable import scala.util.control.NonFatal @@ -28,7 +27,7 @@ import com.fasterxml.jackson.annotation.JsonInclude.Include import com.fasterxml.jackson.databind.{DeserializationFeature, ObjectMapper} import 
com.fasterxml.jackson.module.scala.{ClassTagExtensions, DefaultScalaModule} import org.apache.commons.lang3.StringUtils -import org.json4s.JsonAST.{JArray, JString} +import org.json4s.JsonAST.{JArray, JBool, JDouble, JInt, JNull, JObject, JString, JValue} import org.json4s.jackson.JsonMethods._ import org.apache.spark.SparkException @@ -51,6 +50,52 @@ import org.apache.spark.sql.types._ import org.apache.spark.sql.util.{CaseInsensitiveStringMap, SchemaUtils} import org.apache.spark.util.ArrayImplicits._ +/** + * Interface providing util to convert JValue to String representation of catalog entities. + */ +trait MetadataMapSupport { + def toJsonLinkedHashMap: mutable.LinkedHashMap[String, JValue] + + def toLinkedHashMap: mutable.LinkedHashMap[String, String] = { + jsonToString(toJsonLinkedHashMap) + } + + protected def jsonToString( + jsonMap: mutable.LinkedHashMap[String, JValue]): mutable.LinkedHashMap[String, String] = { + val map = new mutable.LinkedHashMap[String, String]() + jsonMap.foreach { case (key, jValue) => + val stringValue = jValue match { + case JString(value) => value + case JArray(values) => + values.map(_.values) + .map { + case str: String => quoteIdentifier(str) + case other => other.toString + } + .mkString("[", ", ", "]") + case JObject(fields) => + fields.map { case (k, v) => + s"$k=${v.values.toString}" + } + .mkString("[", ", ", "]") + case JInt(value) => value.toString + case JDouble(value) => value.toString + case _ => jValue.values.toString + } + map.put(key, stringValue) + } + map + } + + val timestampFormatter = new Iso8601TimestampFormatter( + pattern = "yyyy-MM-dd'T'HH:mm:ss'Z'", + zoneId = ZoneId.of("UTC"), + locale = DateFormatter.defaultLocale, + legacyFormat = LegacyDateFormats.LENIENT_SIMPLE_DATE_FORMAT, + isParsing = true + ) +} + /** * A function defined in the catalog. 
@@ -62,7 +107,9 @@ import org.apache.spark.util.ArrayImplicits._ case class CatalogFunction( identifier: FunctionIdentifier, className: String, - resources: Seq[FunctionResource]) + resources: Seq[FunctionResource]) { + val isUserDefinedFunction: Boolean = UserDefinedFunction.isUserDefinedFunction(className) +} /** @@ -74,25 +121,31 @@ case class CatalogStorageFormat( outputFormat: Option[String], serde: Option[String], compressed: Boolean, - properties: Map[String, String]) { + properties: Map[String, String]) extends MetadataMapSupport { override def toString: String = { - toLinkedHashMap.map { case ((key, value)) => + toLinkedHashMap.map { case (key, value) => if (value.isEmpty) key else s"$key: $value" }.mkString("Storage(", ", ", ")") } - def toLinkedHashMap: mutable.LinkedHashMap[String, String] = { - val map = new mutable.LinkedHashMap[String, String]() - locationUri.foreach(l => map.put("Location", l.toString)) - serde.foreach(map.put("Serde Library", _)) - inputFormat.foreach(map.put("InputFormat", _)) - outputFormat.foreach(map.put("OutputFormat", _)) - if (compressed) map.put("Compressed", "") + def toJsonLinkedHashMap: mutable.LinkedHashMap[String, JValue] = { + val map = mutable.LinkedHashMap[String, JValue]() + + locationUri.foreach(l => map += ("Location" -> JString(l.toString))) + serde.foreach(s => map += ("Serde Library" -> JString(s))) + inputFormat.foreach(format => map += ("InputFormat" -> JString(format))) + outputFormat.foreach(format => map += ("OutputFormat" -> JString(format))) + + if (compressed) map += ("Compressed" -> JBool(true)) + SQLConf.get.redactOptions(properties) match { case props if props.isEmpty => // No-op case props => - map.put("Storage Properties", props.map(p => p._1 + "=" + p._2).mkString("[", ", ", "]")) + val storagePropsJson = JObject( + props.map { case (k, v) => k -> JString(v) }.toList + ) + map += ("Storage Properties" -> storagePropsJson) } map } @@ -120,35 +173,46 @@ case class CatalogTablePartition( parameters: 
Map[String, String] = Map.empty, createTime: Long = System.currentTimeMillis, lastAccessTime: Long = -1, - stats: Option[CatalogStatistics] = None) { + stats: Option[CatalogStatistics] = None) extends MetadataMapSupport { + def toJsonLinkedHashMap: mutable.LinkedHashMap[String, JValue] = { + val map = mutable.LinkedHashMap[String, JValue]() - def toLinkedHashMap: mutable.LinkedHashMap[String, String] = { - val map = new mutable.LinkedHashMap[String, String]() - val specString = spec.map { case (k, v) => s"$k=$v" }.mkString(", ") - map.put("Partition Values", s"[$specString]") - map ++= storage.toLinkedHashMap - if (parameters.nonEmpty) { - map.put("Partition Parameters", s"{" + - s"${SQLConf.get.redactOptions(parameters).map(p => p._1 + "=" + p._2).mkString(", ")}}") + val specJson = JObject(spec.map { case (k, v) => k -> JString(v) }.toList) + map += ("Partition Values" -> specJson) + + storage.toJsonLinkedHashMap.foreach { case (k, v) => + map += (k -> v) } - map.put("Created Time", new Date(createTime).toString) - val lastAccess = { - if (lastAccessTime <= 0) "UNKNOWN" else new Date(lastAccessTime).toString + + if (parameters.nonEmpty) { + val paramsJson = JObject(SQLConf.get.redactOptions(parameters).map { + case (k, v) => k -> JString(v) + }.toList) + map += ("Partition Parameters" -> paramsJson) } - map.put("Last Access", lastAccess) - stats.foreach(s => map.put("Partition Statistics", s.simpleString)) + + map += ("Created Time" -> JString( + timestampFormatter.format(DateTimeUtils.millisToMicros(createTime)))) + + val lastAccess = if (lastAccessTime <= 0) JString("UNKNOWN") + else JString( + timestampFormatter.format(DateTimeUtils.millisToMicros(createTime))) + map += ("Last Access" -> lastAccess) + + stats.foreach(s => map += ("Partition Statistics" -> JString(s.simpleString))) + map } override def toString: String = { - toLinkedHashMap.map { case ((key, value)) => + toLinkedHashMap.map { case (key, value) => if (value.isEmpty) key else s"$key: $value" 
}.mkString("CatalogPartition(\n\t", "\n\t", ")") } /** Readable string representation for the CatalogTablePartition. */ def simpleString: String = { - toLinkedHashMap.map { case ((key, value)) => + toLinkedHashMap.map { case (key, value) => if (value.isEmpty) key else s"$key: $value" }.mkString("", "\n", "") } @@ -284,7 +348,7 @@ object ClusterBySpec { case class BucketSpec( numBuckets: Int, bucketColumnNames: Seq[String], - sortColumnNames: Seq[String]) extends SQLConfHelper { + sortColumnNames: Seq[String]) extends SQLConfHelper with MetadataMapSupport { if (numBuckets <= 0 || numBuckets > conf.bucketingMaxBuckets) { throw QueryCompilationErrors.invalidBucketNumberError( @@ -301,11 +365,11 @@ case class BucketSpec( s"$numBuckets buckets, $bucketString$sortString" } - def toLinkedHashMap: mutable.LinkedHashMap[String, String] = { - mutable.LinkedHashMap[String, String]( - "Num Buckets" -> numBuckets.toString, - "Bucket Columns" -> bucketColumnNames.map(quoteIdentifier).mkString("[", ", ", "]"), - "Sort Columns" -> sortColumnNames.map(quoteIdentifier).mkString("[", ", ", "]") + def toJsonLinkedHashMap: mutable.LinkedHashMap[String, JValue] = { + mutable.LinkedHashMap[String, JValue]( + "Num Buckets" -> JInt(numBuckets), + "Bucket Columns" -> JArray(bucketColumnNames.map(JString).toList), + "Sort Columns" -> JArray(sortColumnNames.map(JString).toList) ) } } @@ -350,11 +414,12 @@ case class CatalogTable( stats: Option[CatalogStatistics] = None, viewText: Option[String] = None, comment: Option[String] = None, + collation: Option[String] = None, unsupportedFeatures: Seq[String] = Seq.empty, tracksPartitionsInCatalog: Boolean = false, schemaPreservesCase: Boolean = true, ignoredProperties: Map[String, String] = Map.empty, - viewOriginalText: Option[String] = None) { + viewOriginalText: Option[String] = None) extends MetadataMapSupport { import CatalogTable._ @@ -523,65 +588,81 @@ case class CatalogTable( locationUri, inputFormat, outputFormat, serde, compressed, 
properties)) } + def toJsonLinkedHashMap: mutable.LinkedHashMap[String, JValue] = { + val filteredTableProperties = SQLConf.get + .redactOptions(properties.filter { case (k, v) => + !k.startsWith(VIEW_PREFIX) && v.nonEmpty + }) - def toLinkedHashMap: mutable.LinkedHashMap[String, String] = { - val map = new mutable.LinkedHashMap[String, String]() - val tableProperties = - SQLConf.get.redactOptions(properties.filter { case (k, _) => !k.startsWith(VIEW_PREFIX) }) - .toSeq.sortBy(_._1) - .map(p => p._1 + "=" + p._2) - val partitionColumns = partitionColumnNames.map(quoteIdentifier).mkString("[", ", ", "]") - val lastAccess = { - if (lastAccessTime <= 0) "UNKNOWN" else new Date(lastAccessTime).toString + val tableProperties: JValue = + if (filteredTableProperties.isEmpty) JNull + else JObject( + filteredTableProperties.toSeq.sortBy(_._1).map { case (k, v) => k -> JString(v) }: _*) + + val partitionColumns: JValue = + if (partitionColumnNames.nonEmpty) JArray(partitionColumnNames.map(JString).toList) + else JNull + + val lastAccess: JValue = + if (lastAccessTime <= 0) JString("UNKNOWN") + else JString(timestampFormatter.format(DateTimeUtils.millisToMicros(createTime))) + + val viewQueryOutputColumns: JValue = + if (viewQueryColumnNames.nonEmpty) JArray(viewQueryColumnNames.map(JString).toList) + else JNull + + val map = mutable.LinkedHashMap[String, JValue]() + + if (identifier.catalog.isDefined) map += "Catalog" -> JString(identifier.catalog.get) + if (identifier.database.isDefined) map += "Database" -> JString(identifier.database.get) + map += "Table" -> JString(identifier.table) + if (Option(owner).exists(_.nonEmpty)) map += "Owner" -> JString(owner) + map += "Created Time" -> + JString(timestampFormatter.format(DateTimeUtils.millisToMicros(createTime))) + if (lastAccess != JNull) map += "Last Access" -> lastAccess + map += "Created By" -> JString(s"Spark $createVersion") + map += "Type" -> JString(tableType.name) + if (provider.isDefined) map += "Provider" -> 
JString(provider.get) + bucketSpec.foreach { spec => + map ++= spec.toJsonLinkedHashMap.map { case (k, v) => k -> v } } - - identifier.catalog.foreach(map.put("Catalog", _)) - identifier.database.foreach(map.put("Database", _)) - map.put("Table", identifier.table) - if (owner != null && owner.nonEmpty) map.put("Owner", owner) - map.put("Created Time", new Date(createTime).toString) - map.put("Last Access", lastAccess) - map.put("Created By", "Spark " + createVersion) - map.put("Type", tableType.name) - provider.foreach(map.put("Provider", _)) - bucketSpec.foreach(map ++= _.toLinkedHashMap) - comment.foreach(map.put("Comment", _)) - if (tableType == CatalogTableType.VIEW) { - viewText.foreach(map.put("View Text", _)) - viewOriginalText.foreach(map.put("View Original Text", _)) - if (SQLConf.get.viewSchemaBindingEnabled) { - map.put("View Schema Mode", viewSchemaMode.toString) - } - if (viewCatalogAndNamespace.nonEmpty) { - import org.apache.spark.sql.connector.catalog.CatalogV2Implicits._ - map.put("View Catalog and Namespace", viewCatalogAndNamespace.quoted) - } - if (viewQueryColumnNames.nonEmpty) { - map.put("View Query Output Columns", viewQueryColumnNames.mkString("[", ", ", "]")) - } + if (comment.isDefined) map += "Comment" -> JString(comment.get) + if (collation.isDefined) map += "Collation" -> JString(collation.get) + if (tableType == CatalogTableType.VIEW && viewText.isDefined) { + map += "View Text" -> JString(viewText.get) } - - if (tableProperties.nonEmpty) { - map.put("Table Properties", tableProperties.mkString("[", ", ", "]")) + if (tableType == CatalogTableType.VIEW && viewOriginalText.isDefined) { + map += "View Original Text" -> JString(viewOriginalText.get) } - stats.foreach(s => map.put("Statistics", s.simpleString)) - map ++= storage.toLinkedHashMap - if (tracksPartitionsInCatalog) map.put("Partition Provider", "Catalog") - if (partitionColumnNames.nonEmpty) map.put("Partition Columns", partitionColumns) - if (schema.nonEmpty) map.put("Schema", 
schema.treeString) - - map + if (SQLConf.get.viewSchemaBindingEnabled && tableType == CatalogTableType.VIEW) { + map += "View Schema Mode" -> JString(viewSchemaMode.toString) + } + if (viewCatalogAndNamespace.nonEmpty && tableType == CatalogTableType.VIEW) { + import org.apache.spark.sql.connector.catalog.CatalogV2Implicits._ + map += "View Catalog and Namespace" -> JString(viewCatalogAndNamespace.quoted) + } + if (viewQueryOutputColumns != JNull) { + map += "View Query Output Columns" -> viewQueryOutputColumns + } + if (tableProperties != JNull) map += "Table Properties" -> tableProperties + if (stats.isDefined) map += "Statistics" -> JString(stats.get.simpleString) + map ++= storage.toJsonLinkedHashMap.map { case (k, v) => k -> v } + if (tracksPartitionsInCatalog) map += "Partition Provider" -> JString("Catalog") + if (partitionColumns != JNull) map += "Partition Columns" -> partitionColumns + if (schema.nonEmpty) map += "Schema" -> JString(schema.treeString) + + map.filterNot(_._2 == JNull) } override def toString: String = { - toLinkedHashMap.map { case ((key, value)) => + toLinkedHashMap.map { case (key, value) => if (value.isEmpty) key else s"$key: $value" }.mkString("CatalogTable(\n", "\n", ")") } /** Readable string representation for the CatalogTable. 
*/ def simpleString: String = { - toLinkedHashMap.map { case ((key, value)) => + toLinkedHashMap.map { case (key, value) => if (value.isEmpty) key else s"$key: $value" }.mkString("", "\n", "") } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/CSVOptions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/CSVOptions.scala index 5a23d6f7a3ccb..6c68bc1aa5890 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/CSVOptions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/CSVOptions.scala @@ -103,6 +103,16 @@ class CSVOptions( val delimiter = CSVExprUtils.toDelimiterStr( parameters.getOrElse(SEP, parameters.getOrElse(DELIMITER, ","))) + + val extension = { + val ext = parameters.getOrElse(EXTENSION, "csv") + if (ext.size != 3 && !ext.forall(_.isLetter)) { + throw QueryExecutionErrors.invalidFileExtensionError(EXTENSION, ext) + } + + ext + } + val parseMode: ParseMode = parameters.get(MODE).map(ParseMode.fromString).getOrElse(PermissiveMode) val charset = parameters.get(ENCODING).orElse(parameters.get(CHARSET)) @@ -385,6 +395,7 @@ object CSVOptions extends DataSourceOptions { val NEGATIVE_INF = newOption("negativeInf") val TIME_ZONE = newOption("timeZone") val UNESCAPED_QUOTE_HANDLING = newOption("unescapedQuoteHandling") + val EXTENSION = newOption("extension") // Options with alternative val ENCODING = "encoding" val CHARSET = "charset" diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/encoders/ExpressionEncoder.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/encoders/ExpressionEncoder.scala index f2f86a90d5172..5f0b42fec0fa8 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/encoders/ExpressionEncoder.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/encoders/ExpressionEncoder.scala @@ -87,7 +87,8 @@ object ExpressionEncoder { } constructProjection(row).get(0, anyObjectType).asInstanceOf[T] } catch { 
- case e: SparkRuntimeException if e.getCondition == "NOT_NULL_ASSERT_VIOLATION" => + case e: SparkRuntimeException if e.getCondition == "NOT_NULL_ASSERT_VIOLATION" || + e.getCondition == "EXCEED_LIMIT_LENGTH" => throw e case e: Exception => throw QueryExecutionErrors.expressionDecodingError(e, expressions) @@ -115,7 +116,8 @@ object ExpressionEncoder { inputRow(0) = t extractProjection(inputRow) } catch { - case e: SparkRuntimeException if e.getCondition == "NOT_NULL_ASSERT_VIOLATION" => + case e: SparkRuntimeException if e.getCondition == "NOT_NULL_ASSERT_VIOLATION" || + e.getCondition == "EXCEED_LIMIT_LENGTH" => throw e case e: Exception => throw QueryExecutionErrors.expressionEncodingError(e, expressions) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/CallMethodViaReflection.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/CallMethodViaReflection.scala index d38ee01485288..4eb14fb9e7b86 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/CallMethodViaReflection.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/CallMethodViaReflection.scala @@ -115,7 +115,7 @@ case class CallMethodViaReflection( "requiredType" -> toSQLType( TypeCollection(BooleanType, ByteType, ShortType, IntegerType, LongType, FloatType, DoubleType, - StringTypeWithCollation)), + StringTypeWithCollation(supportsTrimCollation = true))), "inputSql" -> toSQLExpr(e), "inputType" -> toSQLType(e.dataType)) ) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala index 154199d37c46d..8773d7a6a029e 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala @@ -281,7 +281,7 @@ object Cast extends QueryErrorsBase { def needsTimeZone(from: 
DataType, to: DataType): Boolean = (from, to) match { case (VariantType, _) => true case (_: StringType, TimestampType) => true - case (TimestampType, StringType) => true + case (TimestampType, _: StringType) => true case (DateType, TimestampType) => true case (TimestampType, DateType) => true case (TimestampType, TimestampNTZType) => true @@ -565,6 +565,11 @@ case class Cast( } } + private lazy val castArgs = variant.VariantCastArgs( + evalMode != EvalMode.TRY, + timeZoneId, + zoneId) + def needsTimeZone: Boolean = Cast.needsTimeZone(child.dataType, dataType) // [[func]] assumes the input is no longer null because eval already does the null check. @@ -1120,13 +1125,13 @@ case class Cast( _ => throw QueryExecutionErrors.cannotCastFromNullTypeError(to) } else if (from.isInstanceOf[VariantType]) { buildCast[VariantVal](_, v => { - variant.VariantGet.cast(v, to, evalMode != EvalMode.TRY, timeZoneId, zoneId) + variant.VariantGet.cast(v, to, castArgs) }) } else { to match { case dt if dt == from => identity[Any] case VariantType => input => variant.VariantExpressionEvalUtils.castToVariant(input, from) - case _: StringType => castToString(from) + case s: StringType => castToString(from, s.constraint) case BinaryType => castToBinary(from) case DateType => castToDate(from) case decimal: DecimalType => castToDecimal(from, decimal) @@ -1218,12 +1223,10 @@ case class Cast( case _ if from.isInstanceOf[VariantType] => (c, evPrim, evNull) => val tmp = ctx.freshVariable("tmp", classOf[Object]) val dataTypeArg = ctx.addReferenceObj("dataType", to) - val zoneStrArg = ctx.addReferenceObj("zoneStr", timeZoneId) - val zoneIdArg = ctx.addReferenceObj("zoneId", zoneId, classOf[ZoneId].getName) - val failOnError = evalMode != EvalMode.TRY + val castArgsArg = ctx.addReferenceObj("castArgs", castArgs) val cls = classOf[variant.VariantGet].getName code""" - Object $tmp = $cls.cast($c, $dataTypeArg, $failOnError, $zoneStrArg, $zoneIdArg); + Object $tmp = $cls.cast($c, $dataTypeArg, 
$castArgsArg); if ($tmp == null) { $evNull = true; } else { @@ -1234,7 +1237,8 @@ case class Cast( val cls = variant.VariantExpressionEvalUtils.getClass.getName.stripSuffix("$") val fromArg = ctx.addReferenceObj("from", from) (c, evPrim, evNull) => code"$evPrim = $cls.castToVariant($c, $fromArg);" - case _: StringType => (c, evPrim, _) => castToStringCode(from, ctx).apply(c, evPrim) + case s: StringType => + (c, evPrim, _) => castToStringCode(from, ctx, s.constraint).apply(c, evPrim) case BinaryType => castToBinaryCode(from) case DateType => castToDateCode(from, ctx) case decimal: DecimalType => castToDecimalCode(from, decimal, ctx) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ExprUtils.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ExprUtils.scala index e65a0200b064f..8b7d641828ba1 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ExprUtils.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ExprUtils.scala @@ -61,7 +61,9 @@ object ExprUtils extends EvalHelper with QueryErrorsBase { def convertToMapData(exp: Expression): Map[String, String] = exp match { case m: CreateMap - if AbstractMapType(StringTypeWithCollation, StringTypeWithCollation) + if AbstractMapType( + StringTypeWithCollation(supportsTrimCollation = true), + StringTypeWithCollation(supportsTrimCollation = true)) .acceptsType(m.dataType) => val arrayMap = m.eval().asInstanceOf[ArrayBasedMapData] ArrayBasedMapData.toScalaMap(arrayMap).map { case (key, value) => diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala index c454799852826..4c83f92509ecd 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala @@ 
-28,7 +28,7 @@ import org.apache.spark.sql.catalyst.expressions.aggregate.AggregateFunction import org.apache.spark.sql.catalyst.expressions.codegen._ import org.apache.spark.sql.catalyst.expressions.codegen.Block._ import org.apache.spark.sql.catalyst.trees.{BinaryLike, CurrentOrigin, LeafLike, QuaternaryLike, TernaryLike, TreeNode, UnaryLike} -import org.apache.spark.sql.catalyst.trees.TreePattern.{LAZY_ANALYSIS_EXPRESSION, RUNTIME_REPLACEABLE, TreePattern} +import org.apache.spark.sql.catalyst.trees.TreePattern.{RUNTIME_REPLACEABLE, TreePattern} import org.apache.spark.sql.catalyst.types.DataTypeUtils import org.apache.spark.sql.catalyst.util.truncatedString import org.apache.spark.sql.errors.{QueryErrorsBase, QueryExecutionErrors} @@ -410,20 +410,6 @@ trait Unevaluable extends Expression with FoldableUnevaluable { final override def foldable: Boolean = false } -/** - * An expression that cannot be analyzed. These expressions don't live analysis time or after - * and should not be evaluated during query planning and execution. - */ -trait LazyAnalysisExpression extends Expression { - final override lazy val resolved = false - - final override val nodePatterns: Seq[TreePattern] = - Seq(LAZY_ANALYSIS_EXPRESSION) ++ nodePatternsInternal() - - // Subclasses can override this function to provide more TreePatterns. - def nodePatternsInternal(): Seq[TreePattern] = Seq() -} - /** * An expression that gets replaced at runtime (currently by the optimizer) into a different * expression for evaluation. This is mainly used to provide compatibility with other databases. @@ -1368,19 +1354,24 @@ trait UserDefinedExpression { } trait CommutativeExpression extends Expression { - /** Collects adjacent commutative operations. */ - private def gatherCommutative( + /** + * Collects adjacent commutative operations. 
+ * + * Exposed for testing + */ + private[spark] def gatherCommutative( e: Expression, f: PartialFunction[CommutativeExpression, Seq[Expression]]): Seq[Expression] = { val resultBuffer = scala.collection.mutable.Buffer[Expression]() - val stack = scala.collection.mutable.Stack[Expression](e) + val queue = scala.collection.mutable.Queue[Expression](e) // [SPARK-49977]: Use iterative approach to avoid creating many temporary List objects // for deep expression trees through recursion. - while (stack.nonEmpty) { - stack.pop() match { + while (queue.nonEmpty) { + val current = queue.dequeue() + current match { case c: CommutativeExpression if f.isDefinedAt(c) => - stack.pushAll(f(c)) + queue ++= f(c) case other => resultBuffer += other.canonicalized } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ToStringBase.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ToStringBase.scala index 130b4ee4c8cac..de72b94df3ac5 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ToStringBase.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ToStringBase.scala @@ -22,7 +22,7 @@ import java.time.ZoneOffset import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.codegen._ import org.apache.spark.sql.catalyst.expressions.codegen.Block._ -import org.apache.spark.sql.catalyst.util.{ArrayData, DateFormatter, IntervalStringStyles, IntervalUtils, MapData, SparkStringUtils, TimestampFormatter} +import org.apache.spark.sql.catalyst.util.{ArrayData, CharVarcharCodegenUtils, DateFormatter, IntervalStringStyles, IntervalUtils, MapData, SparkStringUtils, TimestampFormatter} import org.apache.spark.sql.catalyst.util.IntervalStringStyles.ANSI_STYLE import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.internal.SQLConf.BinaryOutputStyle @@ -53,7 +53,17 @@ trait ToStringBase { self: UnaryExpression with 
TimeZoneAwareExpression => i => func(i.asInstanceOf[T]) // Returns a function to convert a value to pretty string. The function assumes input is not null. - protected final def castToString(from: DataType): Any => UTF8String = from match { + protected final def castToString( + from: DataType, to: StringConstraint = NoConstraint): Any => UTF8String = + to match { + case FixedLength(length) => + s => CharVarcharCodegenUtils.charTypeWriteSideCheck(castToString(from)(s), length) + case MaxLength(length) => + s => CharVarcharCodegenUtils.varcharTypeWriteSideCheck(castToString(from)(s), length) + case NoConstraint => castToString(from) + } + + private def castToString(from: DataType): Any => UTF8String = from match { case CalendarIntervalType => acceptAny[CalendarInterval](i => UTF8String.fromString(i.toString)) case BinaryType => acceptAny[Array[Byte]](binaryFormatter.apply) @@ -167,8 +177,31 @@ trait ToStringBase { self: UnaryExpression with TimeZoneAwareExpression => // Returns a function to generate code to convert a value to pretty string. It assumes the input // is not null. 
- @scala.annotation.tailrec protected final def castToStringCode( + from: DataType, + ctx: CodegenContext, + to: StringConstraint = NoConstraint): (ExprValue, ExprValue) => Block = + (c, evPrim) => { + val tmpVar = ctx.freshVariable("tmp", classOf[UTF8String]) + val castToString = castToStringCode(from, ctx)(c, tmpVar) + val maintainConstraint = to match { + case FixedLength(length) => + code"""$evPrim = org.apache.spark.sql.catalyst.util.CharVarcharCodegenUtils + .charTypeWriteSideCheck($tmpVar, $length);""".stripMargin + case MaxLength(length) => + code"""$evPrim = org.apache.spark.sql.catalyst.util.CharVarcharCodegenUtils + .varcharTypeWriteSideCheck($tmpVar, $length);""".stripMargin + case NoConstraint => code"$evPrim = $tmpVar;" + } + code""" + UTF8String $tmpVar; + $castToString + $maintainConstraint + """ + } + + @scala.annotation.tailrec + private def castToStringCode( from: DataType, ctx: CodegenContext): (ExprValue, ExprValue) => Block = { from match { case BinaryType => diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/HistogramNumeric.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/HistogramNumeric.scala index eda2c742ab4b5..142f4a4eae4c8 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/HistogramNumeric.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/HistogramNumeric.scala @@ -126,7 +126,10 @@ case class HistogramNumeric( // Ignore empty rows, for example: histogram_numeric(null) if (value != null) { // Convert the value to a double value - val doubleValue = value.asInstanceOf[Number].doubleValue + val doubleValue = value match { + case d: Decimal => d.toDouble + case o => o.asInstanceOf[Number].doubleValue() + } buffer.add(doubleValue) } buffer @@ -162,6 +165,11 @@ case class HistogramNumeric( case ShortType => coord.x.toShort case _: DayTimeIntervalType | LongType | TimestampType | 
TimestampNTZType => coord.x.toLong + case d: DecimalType => + val bigDecimal = BigDecimal + .decimal(coord.x, new java.math.MathContext(d.precision)) + .setScale(d.scale, BigDecimal.RoundingMode.HALF_UP) + Decimal(bigDecimal) case _ => coord.x } array(index) = InternalRow.apply(result, coord.y) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/Mode.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/Mode.scala index 97add0b8e45bc..f3eeaa96b3d46 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/Mode.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/Mode.scala @@ -183,6 +183,8 @@ case class Mode( } override def orderingFilled: Boolean = child != UnresolvedWithinGroup + override def isOrderingMandatory: Boolean = true + override def isDistinctSupported: Boolean = false assert(orderingFilled || (!orderingFilled && reverseOpt.isEmpty)) @@ -190,7 +192,7 @@ case class Mode( child match { case UnresolvedWithinGroup => if (orderingWithinGroup.length != 1) { - throw QueryCompilationErrors.wrongNumOrderingsForInverseDistributionFunctionError( + throw QueryCompilationErrors.wrongNumOrderingsForFunctionError( nodeName, 1, orderingWithinGroup.length) } orderingWithinGroup.head match { diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/SupportsOrderingWithinGroup.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/SupportsOrderingWithinGroup.scala index 9c0502a2c1fcf..453251ac61cde 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/SupportsOrderingWithinGroup.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/SupportsOrderingWithinGroup.scala @@ -20,9 +20,26 @@ package org.apache.spark.sql.catalyst.expressions.aggregate import 
org.apache.spark.sql.catalyst.expressions.SortOrder /** - * The trait used to set the [[SortOrder]] after inverse distribution functions parsed. + * The trait used to set the [[SortOrder]] for supporting functions. */ trait SupportsOrderingWithinGroup { self: AggregateFunction => - def orderingFilled: Boolean = false def withOrderingWithinGroup(orderingWithinGroup: Seq[SortOrder]): AggregateFunction + + /** Indicator that ordering was set. */ + def orderingFilled: Boolean + + /** + * Tells Analyzer that WITHIN GROUP (ORDER BY ...) is mandatory for function. + * + * @see [[QueryCompilationErrors.functionMissingWithinGroupError]] + */ + def isOrderingMandatory: Boolean + + /** + * Tells Analyzer that DISTINCT is supported. + * The DISTINCT can conflict with order so some functions can ban it. + * + * @see [[QueryCompilationErrors.functionMissingWithinGroupError]] + */ + def isDistinctSupported: Boolean } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/collect.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/collect.scala index 3aaf353043a9a..7789c23b50a48 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/collect.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/collect.scala @@ -18,16 +18,22 @@ package org.apache.spark.sql.catalyst.expressions.aggregate import scala.collection.mutable -import scala.collection.mutable.Growable +import scala.collection.mutable.{ArrayBuffer, Growable} +import scala.util.{Left, Right} import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.analysis.TypeCheckResult -import org.apache.spark.sql.catalyst.analysis.TypeCheckResult.DataTypeMismatch +import org.apache.spark.sql.catalyst.analysis.TypeCheckResult.{DataTypeMismatch, TypeCheckSuccess} import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.trees.UnaryLike 
+import org.apache.spark.sql.catalyst.types.PhysicalDataType import org.apache.spark.sql.catalyst.util.{ArrayData, GenericArrayData, TypeUtils, UnsafeRowUtils} +import org.apache.spark.sql.catalyst.util.TypeUtils.toSQLExpr import org.apache.spark.sql.errors.{QueryCompilationErrors, QueryErrorsBase} +import org.apache.spark.sql.errors.DataTypeErrors.{toSQLId, toSQLType} +import org.apache.spark.sql.internal.types.StringTypeWithCollation import org.apache.spark.sql.types._ +import org.apache.spark.unsafe.types.{ByteArray, UTF8String} import org.apache.spark.util.BoundedPriorityQueue /** @@ -36,8 +42,7 @@ import org.apache.spark.util.BoundedPriorityQueue * We have to store all the collected elements in memory, and so notice that too many elements * can cause GC paused and eventually OutOfMemory Errors. */ -abstract class Collect[T <: Growable[Any] with Iterable[Any]] extends TypedImperativeAggregate[T] - with UnaryLike[Expression] { +abstract class Collect[T <: Growable[Any] with Iterable[Any]] extends TypedImperativeAggregate[T] { val child: Expression @@ -102,7 +107,8 @@ abstract class Collect[T <: Growable[Any] with Iterable[Any]] extends TypedImper case class CollectList( child: Expression, mutableAggBufferOffset: Int = 0, - inputAggBufferOffset: Int = 0) extends Collect[mutable.ArrayBuffer[Any]] { + inputAggBufferOffset: Int = 0) extends Collect[mutable.ArrayBuffer[Any]] + with UnaryLike[Expression] { def this(child: Expression) = this(child, 0, 0) @@ -149,7 +155,7 @@ case class CollectSet( child: Expression, mutableAggBufferOffset: Int = 0, inputAggBufferOffset: Int = 0) - extends Collect[mutable.HashSet[Any]] with QueryErrorsBase { + extends Collect[mutable.HashSet[Any]] with QueryErrorsBase with UnaryLike[Expression] { def this(child: Expression) = this(child, 0, 0) @@ -215,7 +221,8 @@ case class CollectTopK( num: Int, reverse: Boolean = false, mutableAggBufferOffset: Int = 0, - inputAggBufferOffset: Int = 0) extends Collect[BoundedPriorityQueue[Any]] { + 
inputAggBufferOffset: Int = 0) extends Collect[BoundedPriorityQueue[Any]] + with UnaryLike[Expression] { assert(num > 0) def this(child: Expression, num: Int) = this(child, num, false, 0, 0) @@ -265,3 +272,280 @@ private[aggregate] object CollectTopK { case _ => throw QueryCompilationErrors.invalidNumParameter(e) } } + +// scalastyle:off line.size.limit +@ExpressionDescription( + usage = """ + _FUNC_(expr[, delimiter])[ WITHIN GROUP (ORDER BY key [ASC | DESC] [,...])] - Returns + the concatenation of non-null input values, separated by the delimiter ordered by key. + If all values are null, null is returned. + """, + arguments = """ + Arguments: + * expr - a string or binary expression to be concatenated. + * delimiter - an optional string or binary foldable expression used to separate the input values. + If null, the concatenation will be performed without a delimiter. Default is null. + * key - an optional expression for ordering the input values. Multiple keys can be specified. + If none are specified, the order of the rows in the result is non-deterministic. + """, + examples = """ + Examples: + > SELECT _FUNC_(col) FROM VALUES ('a'), ('b'), ('c') AS tab(col); + abc + > SELECT _FUNC_(col) WITHIN GROUP (ORDER BY col DESC) FROM VALUES ('a'), ('b'), ('c') AS tab(col); + cba + > SELECT _FUNC_(col) FROM VALUES ('a'), (NULL), ('b') AS tab(col); + ab + > SELECT _FUNC_(col) FROM VALUES ('a'), ('a') AS tab(col); + aa + > SELECT _FUNC_(DISTINCT col) FROM VALUES ('a'), ('a'), ('b') AS tab(col); + ab + > SELECT _FUNC_(col, ', ') FROM VALUES ('a'), ('b'), ('c') AS tab(col); + a, b, c + > SELECT _FUNC_(col) FROM VALUES (NULL), (NULL) AS tab(col); + NULL + """, + note = """ + * If the order is not specified, the function is non-deterministic because + the order of the rows may be non-deterministic after a shuffle. + * If DISTINCT is specified, then expr and key must be the same expression. 
+ """, + group = "agg_funcs", + since = "4.0.0" +) +// scalastyle:on line.size.limit +case class ListAgg( + child: Expression, + delimiter: Expression = Literal(null), + orderExpressions: Seq[SortOrder] = Nil, + mutableAggBufferOffset: Int = 0, + inputAggBufferOffset: Int = 0) + extends Collect[mutable.ArrayBuffer[Any]] + with SupportsOrderingWithinGroup + with ImplicitCastInputTypes { + + override def orderingFilled: Boolean = orderExpressions.nonEmpty + + override def isOrderingMandatory: Boolean = false + + override def isDistinctSupported: Boolean = true + + override def withOrderingWithinGroup(orderingWithinGroup: Seq[SortOrder]): AggregateFunction = + copy(orderExpressions = orderingWithinGroup) + + override protected lazy val bufferElementType: DataType = { + if (!needSaveOrderValue) { + child.dataType + } else { + StructType( + StructField("value", child.dataType) + +: orderValuesField + ) + } + } + /** Indicates that the result of [[child]] is not enough for evaluation */ + lazy val needSaveOrderValue: Boolean = !isOrderCompatible(orderExpressions) + + def this(child: Expression) = + this(child, Literal(null), Nil, 0, 0) + + def this(child: Expression, delimiter: Expression) = + this(child, delimiter, Nil, 0, 0) + + override def nullable: Boolean = true + + override def createAggregationBuffer(): mutable.ArrayBuffer[Any] = mutable.ArrayBuffer.empty + + override def withNewMutableAggBufferOffset(newMutableAggBufferOffset: Int): ImperativeAggregate = + copy(mutableAggBufferOffset = newMutableAggBufferOffset) + + override def withNewInputAggBufferOffset(newInputAggBufferOffset: Int): ImperativeAggregate = + copy(inputAggBufferOffset = newInputAggBufferOffset) + + override def defaultResult: Option[Literal] = Option(Literal.create(null, dataType)) + + override def sql(isDistinct: Boolean): String = { + val distinct = if (isDistinct) "DISTINCT " else "" + val withinGroup = if (orderingFilled) { + s" WITHIN GROUP (ORDER BY 
${orderExpressions.map(_.sql).mkString(", ")})" + } else { + "" + } + s"$prettyName($distinct${child.sql}, ${delimiter.sql})$withinGroup" + } + + override def inputTypes: Seq[AbstractDataType] = + TypeCollection( + StringTypeWithCollation(supportsTrimCollation = true), + BinaryType + ) +: + TypeCollection( + StringTypeWithCollation(supportsTrimCollation = true), + BinaryType, + NullType + ) +: + orderExpressions.map(_ => AnyDataType) + + override def checkInputDataTypes(): TypeCheckResult = { + val matchInputTypes = super.checkInputDataTypes() + if (matchInputTypes.isFailure) { + matchInputTypes + } else if (!delimiter.foldable) { + DataTypeMismatch( + errorSubClass = "NON_FOLDABLE_INPUT", + messageParameters = Map( + "inputName" -> toSQLId("delimiter"), + "inputType" -> toSQLType(delimiter.dataType), + "inputExpr" -> toSQLExpr(delimiter) + ) + ) + } else if (delimiter.dataType == NullType) { + // null is the default empty delimiter so type is not important + TypeCheckSuccess + } else { + TypeUtils.checkForSameTypeInputExpr(child.dataType :: delimiter.dataType :: Nil, prettyName) + } + } + + override def eval(buffer: mutable.ArrayBuffer[Any]): Any = { + if (buffer.nonEmpty) { + val sortedBufferWithoutNulls = sortBuffer(buffer) + concatSkippingNulls(sortedBufferWithoutNulls) + } else { + null + } + } + + /** + * Sort buffer according orderExpressions. + * If orderExpressions is empty then returns buffer as is. + * The format of buffer is determined by [[needSaveOrderValue]] + * @return sorted buffer containing only child's values + */ + private[this] def sortBuffer(buffer: mutable.ArrayBuffer[Any]): mutable.ArrayBuffer[Any] = { + if (!orderingFilled) { + // without order return as is. + return buffer + } + if (!needSaveOrderValue) { + // Here the buffer has structure [childValue0, childValue1, ...] 
+ // and we want to sort it by childValues + val sortOrderExpression = orderExpressions.head + val ascendingOrdering = PhysicalDataType.ordering(sortOrderExpression.dataType) + val ordering = + if (sortOrderExpression.direction == Ascending) ascendingOrdering + else ascendingOrdering.reverse + buffer.sorted(ordering) + } else { + // Here the buffer has structure + // [[childValue, orderValue0, orderValue1, ...], + // [childValue, orderValue0, orderValue1, ...], + // ...] + // and we want to sort it by tuples (orderValue0, orderValue1, ...) + buffer + .asInstanceOf[mutable.ArrayBuffer[InternalRow]] + .sorted(bufferOrdering) + // drop orderValues after sort + .map(_.get(0, child.dataType)) + } + } + + /** + * @return ordering by (orderValue0, orderValue1, ...) + * for InternalRow with format [childValue, orderValue0, orderValue1, ...] + */ + private[this] def bufferOrdering: Ordering[InternalRow] = { + val bufferSortOrder = orderExpressions.zipWithIndex.map { + case (originalOrder, i) => + originalOrder.copy( + // first value is the evaluated child so add +1 for order's values + child = BoundReference(i + 1, originalOrder.dataType, originalOrder.child.nullable) + ) + } + new InterpretedOrdering(bufferSortOrder) + } + + private[this] def concatSkippingNulls(buffer: mutable.ArrayBuffer[Any]): Any = { + getDelimiterValue match { + case Right(delimiterValue: Array[Byte]) => + val inputs = buffer.filter(_ != null).map(_.asInstanceOf[Array[Byte]]) + ByteArray.concatWS(delimiterValue, inputs.toSeq: _*) + case Left(delimiterValue: UTF8String) => + val inputs = buffer.filter(_ != null).map(_.asInstanceOf[UTF8String]) + UTF8String.concatWs(delimiterValue, inputs.toSeq: _*) + } + } + + /** + * @return delimiter value or default empty value if delimiter is null. 
Type respects [[dataType]] + */ + private[this] def getDelimiterValue: Either[UTF8String, Array[Byte]] = { + val delimiterValue = delimiter.eval() + dataType match { + case _: StringType => + Left( + if (delimiterValue == null) UTF8String.fromString("") + else delimiterValue.asInstanceOf[UTF8String] + ) + case _: BinaryType => + Right( + if (delimiterValue == null) ByteArray.EMPTY_BYTE + else delimiterValue.asInstanceOf[Array[Byte]] + ) + } + } + + override def dataType: DataType = child.dataType + + override def update(buffer: ArrayBuffer[Any], input: InternalRow): ArrayBuffer[Any] = { + val value = child.eval(input) + if (value != null) { + val v = if (!needSaveOrderValue) { + convertToBufferElement(value) + } else { + InternalRow.fromSeq(convertToBufferElement(value) +: evalOrderValues(input)) + } + buffer += v + } + buffer + } + + private[this] def evalOrderValues(internalRow: InternalRow): Seq[Any] = { + orderExpressions.map(order => convertToBufferElement(order.child.eval(internalRow))) + } + + override protected def convertToBufferElement(value: Any): Any = InternalRow.copyValue(value) + + override def children: Seq[Expression] = child +: delimiter +: orderExpressions + + /** + * Utility func to check if given order is defined and different from [[child]]. 
+ * + * @see [[QueryCompilationErrors.functionAndOrderExpressionMismatchError]] + * @see [[needSaveOrderValue]] + */ + private[this] def isOrderCompatible(someOrder: Seq[SortOrder]): Boolean = { + if (someOrder.isEmpty) { + return true + } + if (someOrder.size == 1 && someOrder.head.child.semanticEquals(child)) { + return true + } + false + } + + override protected def withNewChildrenInternal(newChildren: IndexedSeq[Expression]): Expression = + copy( + child = newChildren.head, + delimiter = newChildren(1), + orderExpressions = newChildren + .drop(2) + .map(_.asInstanceOf[SortOrder]) + ) + + private[this] def orderValuesField: Seq[StructField] = { + orderExpressions.zipWithIndex.map { + case (order, i) => StructField(s"sortOrderValue[$i]", order.dataType) + } + } +} diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/percentiles.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/percentiles.scala index 89a6984b80852..6dfa1b499df23 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/percentiles.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/percentiles.scala @@ -378,7 +378,7 @@ case class PercentileCont(left: Expression, right: Expression, reverse: Boolean override def withOrderingWithinGroup(orderingWithinGroup: Seq[SortOrder]): AggregateFunction = { if (orderingWithinGroup.length != 1) { - throw QueryCompilationErrors.wrongNumOrderingsForInverseDistributionFunctionError( + throw QueryCompilationErrors.wrongNumOrderingsForFunctionError( nodeName, 1, orderingWithinGroup.length) } orderingWithinGroup.head match { @@ -390,6 +390,10 @@ case class PercentileCont(left: Expression, right: Expression, reverse: Boolean override protected def withNewChildrenInternal( newLeft: Expression, newRight: Expression): PercentileCont = this.copy(left = newLeft, right = newRight) + + override def orderingFilled: Boolean = 
left != UnresolvedWithinGroup + override def isOrderingMandatory: Boolean = true + override def isDistinctSupported: Boolean = false } /** @@ -432,7 +436,7 @@ case class PercentileDisc( override def withOrderingWithinGroup(orderingWithinGroup: Seq[SortOrder]): AggregateFunction = { if (orderingWithinGroup.length != 1) { - throw QueryCompilationErrors.wrongNumOrderingsForInverseDistributionFunctionError( + throw QueryCompilationErrors.wrongNumOrderingsForFunctionError( nodeName, 1, orderingWithinGroup.length) } orderingWithinGroup.head match { @@ -467,6 +471,10 @@ case class PercentileDisc( toDoubleValue(higherKey) } } + + override def orderingFilled: Boolean = left != UnresolvedWithinGroup + override def isOrderingMandatory: Boolean = true + override def isDistinctSupported: Boolean = false } // scalastyle:off line.size.limit diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/toFromAvroSqlFunctions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/avroSqlFunctions.scala similarity index 69% rename from sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/toFromAvroSqlFunctions.scala rename to sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/avroSqlFunctions.scala index 457f469e0f687..6693ee83fd4af 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/toFromAvroSqlFunctions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/avroSqlFunctions.scala @@ -200,3 +200,96 @@ case class ToAvro(child: Expression, jsonFormatSchema: Expression) override def prettyName: String = getTagValue(FunctionRegistry.FUNC_ALIAS).getOrElse("to_avro") } + +/** + * Returns schema in the DDL format of the avro schema in JSON string format. + * This is a thin wrapper over the [[SchemaOfAvro]] class to create a SQL function. + * + * @param jsonFormatSchema the Avro schema in JSON string format. 
+ * @param options the options to use when performing the conversion. + * + * @since 4.0.0 + */ +// scalastyle:off line.size.limit +@ExpressionDescription( + usage = """ + _FUNC_(jsonFormatSchema, options) - Returns schema in the DDL format of the avro schema in JSON string format. + """, + examples = """ + Examples: + > SELECT _FUNC_('{"type": "record", "name": "struct", "fields": [{"name": "u", "type": ["int", "string"]}]}', map()); + STRUCT NOT NULL> + """, + group = "misc_funcs", + since = "4.0.0" +) +// scalastyle:on line.size.limit +case class SchemaOfAvro(jsonFormatSchema: Expression, options: Expression) + extends BinaryExpression with RuntimeReplaceable { + + override def left: Expression = jsonFormatSchema + override def right: Expression = options + + override protected def withNewChildrenInternal( + newLeft: Expression, newRight: Expression): Expression = + copy(jsonFormatSchema = newLeft, options = newRight) + + def this(jsonFormatSchema: Expression) = + this(jsonFormatSchema, Literal.create(null)) + + override def checkInputDataTypes(): TypeCheckResult = { + val schemaCheck = jsonFormatSchema.dataType match { + case _: StringType | + _: NullType + if jsonFormatSchema.foldable => + None + case _ => + Some(TypeCheckResult.TypeCheckFailure("The first argument of the SCHEMA_OF_AVRO SQL " + + "function must be a constant string containing the JSON representation of the schema " + + "to use for converting the value from AVRO format")) + } + val optionsCheck = options.dataType match { + case MapType(StringType, StringType, _) | + MapType(NullType, NullType, _) | + _: NullType + if options.foldable => + None + case _ => + Some(TypeCheckResult.TypeCheckFailure("The second argument of the SCHEMA_OF_AVRO SQL " + + "function must be a constant map of strings to strings containing the options to use " + + "for converting the value from AVRO format")) + } + schemaCheck.getOrElse( + optionsCheck.getOrElse( + TypeCheckResult.TypeCheckSuccess)) + } + + override lazy 
val replacement: Expression = { + val schemaValue: String = jsonFormatSchema.eval() match { + case s: UTF8String => + s.toString + case null => + "" + } + val optionsValue: Map[String, String] = options.eval() match { + case a: ArrayBasedMapData if a.keyArray.array.nonEmpty => + val keys: Array[String] = a.keyArray.array.map(_.toString) + val values: Array[String] = a.valueArray.array.map(_.toString) + keys.zip(values).toMap + case _ => + Map.empty + } + val constructor = try { + Utils.classForName("org.apache.spark.sql.avro.SchemaOfAvro").getConstructors.head + } catch { + case _: java.lang.ClassNotFoundException => + throw QueryCompilationErrors.avroNotLoadedSqlFunctionsUnusable( + functionName = "SCHEMA_OF_AVRO") + } + val expr = constructor.newInstance(schemaValue, optionsValue) + expr.asInstanceOf[Expression] + } + + override def prettyName: String = + getTagValue(FunctionRegistry.FUNC_ALIAS).getOrElse("schema_of_avro") +} diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala index 30c00f5bf96b8..de74bb2f8cd21 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala @@ -29,6 +29,7 @@ import com.google.common.util.concurrent.{ExecutionError, UncheckedExecutionExce import org.codehaus.commons.compiler.{CompileException, InternalCompilerException} import org.codehaus.janino.ClassBodyEvaluator import org.codehaus.janino.util.ClassFile +import org.codehaus.janino.util.ClassFile.CodeAttribute import org.apache.spark.{SparkException, SparkIllegalArgumentException, TaskContext, TaskKilledException} import org.apache.spark.executor.InputMetrics @@ -1578,9 +1579,6 @@ object CodeGenerator extends Logging { val classes = evaluator.getBytecodes.asScala // Then 
walk the classes to get at the method bytecode. - val codeAttr = Utils.classForName("org.codehaus.janino.util.ClassFile$CodeAttribute") - val codeAttrField = codeAttr.getDeclaredField("code") - codeAttrField.setAccessible(true) val codeStats = classes.map { case (_, classBytes) => val classCodeSize = classBytes.length CodegenMetrics.METRIC_GENERATED_CLASS_BYTECODE_SIZE.update(classCodeSize) @@ -1588,8 +1586,8 @@ object CodeGenerator extends Logging { val cf = new ClassFile(new ByteArrayInputStream(classBytes)) val constPoolSize = cf.getConstantPoolSize val methodCodeSizes = cf.methodInfos.asScala.flatMap { method => - method.getAttributes().filter(_.getClass eq codeAttr).map { a => - val byteCodeSize = codeAttrField.get(a).asInstanceOf[Array[Byte]].length + method.getAttributes.collect { case attr: CodeAttribute => + val byteCodeSize = attr.code.length CodegenMetrics.METRIC_GENERATED_METHOD_BYTECODE_SIZE.update(byteCodeSize) if (byteCodeSize > DEFAULT_JVM_HUGE_METHOD_LIMIT) { diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collationExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collationExpressions.scala index c75bf30ad21f7..024bef08b5273 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collationExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collationExpressions.scala @@ -17,10 +17,12 @@ package org.apache.spark.sql.catalyst.expressions +import org.apache.spark.SparkException import org.apache.spark.sql.catalyst.InternalRow -import org.apache.spark.sql.catalyst.analysis.ExpressionBuilder +import org.apache.spark.sql.catalyst.analysis.{ExpressionBuilder, UnresolvedException} import org.apache.spark.sql.catalyst.expressions.codegen._ -import org.apache.spark.sql.catalyst.util.CollationFactory +import org.apache.spark.sql.catalyst.trees.TreePattern.{TreePattern, UNRESOLVED_COLLATION} +import 
org.apache.spark.sql.catalyst.util.{AttributeNameParser, CollationFactory} import org.apache.spark.sql.errors.QueryCompilationErrors import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.internal.types.StringTypeWithCollation @@ -37,7 +39,7 @@ import org.apache.spark.sql.types._ examples = """ Examples: > SELECT COLLATION('Spark SQL' _FUNC_ UTF8_LCASE); - UTF8_LCASE + SYSTEM.BUILTIN.UTF8_LCASE """, since = "4.0.0", group = "string_funcs") @@ -56,7 +58,8 @@ object CollateExpressionBuilder extends ExpressionBuilder { evalCollation.toString.toUpperCase().contains("TRIM")) { throw QueryCompilationErrors.trimCollationNotEnabledError() } - Collate(e, evalCollation.toString) + Collate(e, UnresolvedCollation( + AttributeNameParser.parseAttributeName(evalCollation.toString))) } case (_: StringType, false) => throw QueryCompilationErrors.nonFoldableArgumentError( funcName, "collationName", StringType) @@ -73,24 +76,63 @@ object CollateExpressionBuilder extends ExpressionBuilder { * This function is pass-through, it will not modify the input data. * Only type metadata will be updated. 
*/ -case class Collate(child: Expression, collationName: String) - extends UnaryExpression with ExpectsInputTypes { - private val collationId = CollationFactory.collationNameToId(collationName) - override def dataType: DataType = StringType(collationId) +case class Collate(child: Expression, collation: Expression) + extends BinaryExpression with ExpectsInputTypes { + override def left: Expression = child + override def right: Expression = collation + override def dataType: DataType = collation.dataType override def inputTypes: Seq[AbstractDataType] = - Seq(StringTypeWithCollation(supportsTrimCollation = true)) - - override protected def withNewChildInternal( - newChild: Expression): Expression = copy(newChild) + Seq(StringTypeWithCollation(supportsTrimCollation = true), AnyDataType) override def eval(row: InternalRow): Any = child.eval(row) - override protected def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = - defineCodeGen(ctx, ev, (in) => in) + /** Just a simple passthrough for code generation. */ + override def genCode(ctx: CodegenContext): ExprCode = child.genCode(ctx) + override protected def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { + throw SparkException.internalError("Collate.doGenCode should not be called.") + } + + override def sql: String = s"$prettyName(${child.sql}, $collation)" + + override def toString: String = + s"$prettyName($child, $collation)" + + override protected def withNewChildrenInternal( + newLeft: Expression, newRight: Expression): Expression = + copy(child = newLeft, collation = newRight) + + override def foldable: Boolean = child.foldable +} + +/** + * An expression that marks an unresolved collation name. + * + * This class is used to represent a collation name that has not yet been resolved from a fully + * qualified collation name. It is used during the analysis phase, where the collation name is + * specified but not yet validated or resolved. 
+ */ +case class UnresolvedCollation(collationName: Seq[String]) + extends LeafExpression with Unevaluable { + override def dataType: DataType = throw new UnresolvedException("dataType") + + override def nullable: Boolean = false + + override lazy val resolved: Boolean = false + + final override val nodePatterns: Seq[TreePattern] = Seq(UNRESOLVED_COLLATION) +} + +/** + * An expression that represents a resolved collation name. + */ +case class ResolvedCollation(collationName: String) extends LeafExpression with Unevaluable { + override def nullable: Boolean = false + + override def dataType: DataType = StringType(CollationFactory.collationNameToId(collationName)) - override def sql: String = s"$prettyName(${child.sql}, $collationName)" + override def toString: String = collationName - override def toString: String = s"$prettyName($child, $collationName)" + override def sql: String = collationName } // scalastyle:off line.contains.tab @@ -103,7 +145,7 @@ case class Collate(child: Expression, collationName: String) examples = """ Examples: > SELECT _FUNC_('Spark SQL'); - UTF8_BINARY + SYSTEM.BUILTIN.UTF8_BINARY """, since = "4.0.0", group = "string_funcs") @@ -113,8 +155,8 @@ case class Collation(child: Expression) override protected def withNewChildInternal(newChild: Expression): Collation = copy(newChild) override lazy val replacement: Expression = { val collationId = child.dataType.asInstanceOf[StringType].collationId - val collationName = CollationFactory.fetchCollation(collationId).collationName - Literal.create(collationName, SQLConf.get.defaultStringType) + val fullyQualifiedCollationName = CollationFactory.fullyQualifiedName(collationId) + Literal.create(fullyQualifiedCollationName, SQLConf.get.defaultStringType) } override def inputTypes: Seq[AbstractDataType] = Seq(StringTypeWithCollation(supportsTrimCollation = true)) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala index fb130574d3474..84e52282b632f 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala @@ -1354,7 +1354,7 @@ case class Reverse(child: Expression) override def nullIntolerant: Boolean = true // Input types are utilized by type coercion in ImplicitTypeCasts. override def inputTypes: Seq[AbstractDataType] = - Seq(TypeCollection(StringTypeWithCollation, ArrayType)) + Seq(TypeCollection(StringTypeWithCollation(supportsTrimCollation = true), ArrayType)) override def dataType: DataType = child.dataType @@ -2127,12 +2127,12 @@ case class ArrayJoin( this(array, delimiter, Some(nullReplacement)) override def inputTypes: Seq[AbstractDataType] = if (nullReplacement.isDefined) { - Seq(AbstractArrayType(StringTypeWithCollation), - StringTypeWithCollation, - StringTypeWithCollation) + Seq(AbstractArrayType(StringTypeWithCollation(supportsTrimCollation = true)), + StringTypeWithCollation(supportsTrimCollation = true), + StringTypeWithCollation(supportsTrimCollation = true)) } else { - Seq(AbstractArrayType(StringTypeWithCollation), - StringTypeWithCollation) + Seq(AbstractArrayType(StringTypeWithCollation(supportsTrimCollation = true)), + StringTypeWithCollation(supportsTrimCollation = true)) } override def children: Seq[Expression] = if (nullReplacement.isDefined) { @@ -2609,9 +2609,6 @@ case class ElementAt( @transient private lazy val mapKeyType = left.dataType.asInstanceOf[MapType].keyType - @transient private lazy val mapValueContainsNull = - left.dataType.asInstanceOf[MapType].valueContainsNull - @transient private lazy val arrayElementNullable = left.dataType.asInstanceOf[ArrayType].containsNull @@ -2855,7 +2852,7 @@ case class Concat(children: Seq[Expression]) extends ComplexTypeMergingExpressio with QueryErrorsBase 
{ private def allowedTypes: Seq[AbstractDataType] = - Seq(StringTypeWithCollation, BinaryType, ArrayType) + Seq(StringTypeWithCollation(supportsTrimCollation = true), BinaryType, ArrayType) final override val nodePatterns: Seq[TreePattern] = Seq(CONCAT) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/csv/CsvExpressionEvalUtils.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/csv/CsvExpressionEvalUtils.scala index a91e4ab13001b..fd298b33450b3 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/csv/CsvExpressionEvalUtils.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/csv/CsvExpressionEvalUtils.scala @@ -28,7 +28,7 @@ import org.apache.spark.sql.types.{DataType, NullType, StructType} import org.apache.spark.unsafe.types.UTF8String /** - * The expression `CsvToStructs` will utilize the `Invoke` to call it, support codegen. + * The expression `CsvToStructs` will utilize it to support codegen. 
*/ case class CsvToStructsEvaluator( options: Map[String, String], @@ -86,6 +86,7 @@ case class CsvToStructsEvaluator( } final def evaluate(csv: UTF8String): InternalRow = { + if (csv == null) return null converter(parser.parse(csv.toString)) } } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/csvExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/csvExpressions.scala index 02e5488835c91..04fb9bc133c67 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/csvExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/csvExpressions.scala @@ -23,10 +23,10 @@ import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.analysis.TypeCheckResult import org.apache.spark.sql.catalyst.analysis.TypeCheckResult.{DataTypeMismatch, TypeCheckSuccess} import org.apache.spark.sql.catalyst.csv._ -import org.apache.spark.sql.catalyst.expressions.codegen.{CodegenContext, ExprCode} +import org.apache.spark.sql.catalyst.expressions.codegen.{CodegenContext, CodeGenerator, ExprCode} +import org.apache.spark.sql.catalyst.expressions.codegen.Block.BlockHelper import org.apache.spark.sql.catalyst.expressions.csv.{CsvToStructsEvaluator, SchemaOfCsvEvaluator} import org.apache.spark.sql.catalyst.expressions.objects.Invoke -import org.apache.spark.sql.catalyst.trees.TreePattern.{RUNTIME_REPLACEABLE, TreePattern} import org.apache.spark.sql.catalyst.util.TypeUtils._ import org.apache.spark.sql.errors.QueryErrorsBase import org.apache.spark.sql.internal.SQLConf @@ -57,17 +57,12 @@ case class CsvToStructs( timeZoneId: Option[String] = None, requiredSchema: Option[StructType] = None) extends UnaryExpression - with RuntimeReplaceable - with ExpectsInputTypes - with TimeZoneAwareExpression { + with TimeZoneAwareExpression + with ExpectsInputTypes { override def nullable: Boolean = child.nullable - override def nodePatternsInternal(): 
Seq[TreePattern] = Seq(RUNTIME_REPLACEABLE) - - // The CSV input data might be missing certain fields. We force the nullability - // of the user-provided schema to avoid data corruptions. - private val nullableSchema: StructType = schema.asNullable + override def nullIntolerant: Boolean = true // Used in `FunctionRegistry` def this(child: Expression, schema: Expression, options: Map[String, String]) = @@ -86,28 +81,48 @@ case class CsvToStructs( child = child, timeZoneId = None) - private val nameOfCorruptRecord = SQLConf.get.getConf(SQLConf.COLUMN_NAME_OF_CORRUPT_RECORD) - override def dataType: DataType = requiredSchema.getOrElse(schema).asNullable override def withTimeZone(timeZoneId: String): TimeZoneAwareExpression = { copy(timeZoneId = Option(timeZoneId)) } - override def inputTypes: Seq[AbstractDataType] = StringTypeWithCollation :: Nil + override def inputTypes: Seq[AbstractDataType] = + StringTypeWithCollation(supportsTrimCollation = true) :: Nil override def prettyName: String = "from_csv" + // The CSV input data might be missing certain fields. We force the nullability + // of the user-provided schema to avoid data corruptions. 
+ private val nullableSchema: StructType = schema.asNullable + + @transient + private val nameOfCorruptRecord = SQLConf.get.getConf(SQLConf.COLUMN_NAME_OF_CORRUPT_RECORD) + @transient private lazy val evaluator: CsvToStructsEvaluator = CsvToStructsEvaluator( options, nullableSchema, nameOfCorruptRecord, timeZoneId, requiredSchema) - override def replacement: Expression = Invoke( - Literal.create(evaluator, ObjectType(classOf[CsvToStructsEvaluator])), - "evaluate", - dataType, - Seq(child), - Seq(child.dataType)) + override def nullSafeEval(input: Any): Any = { + evaluator.evaluate(input.asInstanceOf[UTF8String]) + } + + override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { + val refEvaluator = ctx.addReferenceObj("evaluator", evaluator) + val eval = child.genCode(ctx) + val resultType = CodeGenerator.boxedType(dataType) + val resultTerm = ctx.freshName("result") + ev.copy(code = + code""" + |${eval.code} + |$resultType $resultTerm = ($resultType) $refEvaluator.evaluate(${eval.value}); + |boolean ${ev.isNull} = $resultTerm == null; + |${CodeGenerator.javaType(dataType)} ${ev.value} = ${CodeGenerator.defaultValue(dataType)}; + |if (!${ev.isNull}) { + | ${ev.value} = $resultTerm; + |} + |""".stripMargin) + } override protected def withNewChildInternal(newChild: Expression): CsvToStructs = copy(child = newChild) @@ -173,7 +188,8 @@ case class SchemaOfCsv( "evaluate", dataType, Seq(child), - Seq(child.dataType)) + Seq(child.dataType), + returnNullable = false) } /** diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala index fba3927a0bc9c..81be40b3b6474 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala @@ -971,7 +971,7 @@ case class 
DateFormatClass(left: Expression, right: Expression, timeZoneId: Opti override def dataType: DataType = SQLConf.get.defaultStringType override def inputTypes: Seq[AbstractDataType] = - Seq(TimestampType, StringTypeWithCollation) + Seq(TimestampType, StringTypeWithCollation(supportsTrimCollation = true)) override def withTimeZone(timeZoneId: String): TimeZoneAwareExpression = copy(timeZoneId = Option(timeZoneId)) @@ -1129,6 +1129,7 @@ case class GetTimestamp( left: Expression, right: Expression, override val dataType: DataType, + override val suggestedFuncOnFail: String = "try_to_timestamp", timeZoneId: Option[String] = None, failOnError: Boolean = SQLConf.get.ansiEnabled) extends ToTimestamp { @@ -1267,6 +1268,7 @@ object TryToTimestampExpressionBuilder extends ExpressionBuilder { abstract class ToTimestamp extends BinaryExpression with TimestampFormatterHelper with ExpectsInputTypes { + val suggestedFuncOnFail: String = "try_to_timestamp" def failOnError: Boolean // The result of the conversion to timestamp is microseconds divided by this factor. 
@@ -1279,10 +1281,13 @@ abstract class ToTimestamp override def forTimestampNTZ: Boolean = left.dataType == TimestampNTZType override def inputTypes: Seq[AbstractDataType] = - Seq(TypeCollection( - StringTypeWithCollation, DateType, TimestampType, TimestampNTZType - ), - StringTypeWithCollation) + Seq( + TypeCollection( + StringTypeWithCollation(supportsTrimCollation = true), + DateType, + TimestampType, + TimestampNTZType), + StringTypeWithCollation(supportsTrimCollation = true)) override def dataType: DataType = LongType override def nullable: Boolean = if (failOnError) children.exists(_.nullable) else true @@ -1318,9 +1323,9 @@ abstract class ToTimestamp } } catch { case e: DateTimeException if failOnError => - throw QueryExecutionErrors.ansiDateTimeParseError(e) + throw QueryExecutionErrors.ansiDateTimeParseError(e, suggestedFuncOnFail) case e: ParseException if failOnError => - throw QueryExecutionErrors.ansiDateTimeParseError(e) + throw QueryExecutionErrors.ansiDateTimeParseError(e, suggestedFuncOnFail) case e if isParseError(e) => null } } @@ -1331,7 +1336,7 @@ abstract class ToTimestamp override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { val javaType = CodeGenerator.javaType(dataType) val parseErrorBranch: String = if (failOnError) { - "throw QueryExecutionErrors.ansiDateTimeParseError(e);" + s"throw QueryExecutionErrors.ansiDateTimeParseError(e, \"${suggestedFuncOnFail}\");" } else { s"${ev.isNull} = true;" } @@ -1454,7 +1459,7 @@ case class FromUnixTime(sec: Expression, format: Expression, timeZoneId: Option[ override def nullable: Boolean = true override def inputTypes: Seq[AbstractDataType] = - Seq(LongType, StringTypeWithCollation) + Seq(LongType, StringTypeWithCollation(supportsTrimCollation = true)) override def withTimeZone(timeZoneId: String): TimeZoneAwareExpression = copy(timeZoneId = Option(timeZoneId)) @@ -1566,7 +1571,7 @@ case class NextDay( def this(left: Expression, right: Expression) = this(left, right, 
SQLConf.get.ansiEnabled) override def inputTypes: Seq[AbstractDataType] = - Seq(DateType, StringTypeWithCollation) + Seq(DateType, StringTypeWithCollation(supportsTrimCollation = true)) override def dataType: DataType = DateType override def nullable: Boolean = true @@ -1781,7 +1786,7 @@ sealed trait UTCTimestamp extends BinaryExpression with ImplicitCastInputTypes { val funcName: String override def inputTypes: Seq[AbstractDataType] = - Seq(TimestampType, StringTypeWithCollation) + Seq(TimestampType, StringTypeWithCollation(supportsTrimCollation = true)) override def dataType: DataType = TimestampType override def nullSafeEval(time: Any, timezone: Any): Any = { @@ -2097,8 +2102,8 @@ case class ParseToDate( extends RuntimeReplaceable with ImplicitCastInputTypes with TimeZoneAwareExpression { override lazy val replacement: Expression = format.map { f => - Cast(GetTimestamp(left, f, TimestampType, timeZoneId, ansiEnabled), DateType, timeZoneId, - EvalMode.fromBoolean(ansiEnabled)) + Cast(GetTimestamp(left, f, TimestampType, "try_to_date", timeZoneId, ansiEnabled), DateType, + timeZoneId, EvalMode.fromBoolean(ansiEnabled)) }.getOrElse(Cast(left, DateType, timeZoneId, EvalMode.fromBoolean(ansiEnabled))) // backwards compatibility @@ -2123,8 +2128,11 @@ case class ParseToDate( // Note: ideally this function should only take string input, but we allow more types here to // be backward compatible. 
TypeCollection( - StringTypeWithCollation, DateType, TimestampType, TimestampNTZType) +: - format.map(_ => StringTypeWithCollation).toSeq + StringTypeWithCollation(supportsTrimCollation = true), + DateType, + TimestampType, + TimestampNTZType) +: + format.map(_ => StringTypeWithCollation(supportsTrimCollation = true)).toSeq } override protected def withNewChildrenInternal( @@ -2173,7 +2181,7 @@ case class ParseToTimestamp( extends RuntimeReplaceable with ImplicitCastInputTypes with TimeZoneAwareExpression { override lazy val replacement: Expression = format.map { f => - GetTimestamp(left, f, dataType, timeZoneId, failOnError = failOnError) + GetTimestamp(left, f, dataType, "try_to_timestamp", timeZoneId, failOnError = failOnError) }.getOrElse(Cast(left, dataType, timeZoneId, ansiEnabled = failOnError)) def this(left: Expression, format: Expression) = { @@ -2195,10 +2203,15 @@ case class ParseToTimestamp( override def inputTypes: Seq[AbstractDataType] = { // Note: ideally this function should only take string input, but we allow more types here to // be backward compatible. 
- val types = Seq(StringTypeWithCollation, DateType, TimestampType, TimestampNTZType) + val types = Seq( + StringTypeWithCollation( + supportsTrimCollation = true), + DateType, + TimestampType, + TimestampNTZType) TypeCollection( (if (dataType.isInstanceOf[TimestampType]) types :+ NumericType else types): _* - ) +: format.map(_ => StringTypeWithCollation).toSeq + ) +: format.map(_ => StringTypeWithCollation(supportsTrimCollation = true)).toSeq } override protected def withNewChildrenInternal( @@ -2329,7 +2342,7 @@ case class TruncDate(date: Expression, format: Expression) override def right: Expression = format override def inputTypes: Seq[AbstractDataType] = - Seq(DateType, StringTypeWithCollation) + Seq(DateType, StringTypeWithCollation(supportsTrimCollation = true)) override def dataType: DataType = DateType override def prettyName: String = "trunc" override val instant = date @@ -2399,7 +2412,7 @@ case class TruncTimestamp( override def right: Expression = timestamp override def inputTypes: Seq[AbstractDataType] = - Seq(StringTypeWithCollation, TimestampType) + Seq(StringTypeWithCollation(supportsTrimCollation = true), TimestampType) override def dataType: TimestampType = TimestampType override def prettyName: String = "date_trunc" override val instant = timestamp @@ -2800,7 +2813,7 @@ case class MakeTimestamp( // casted into decimal safely, we use DecimalType(16, 6) which is wider than DecimalType(10, 0). 
override def inputTypes: Seq[AbstractDataType] = Seq(IntegerType, IntegerType, IntegerType, IntegerType, IntegerType, DecimalType(16, 6)) ++ - timezone.map(_ => StringTypeWithCollation) + timezone.map(_ => StringTypeWithCollation(supportsTrimCollation = true)) override def nullable: Boolean = if (failOnError) children.exists(_.nullable) else true override def withTimeZone(timeZoneId: String): TimeZoneAwareExpression = @@ -3333,7 +3346,10 @@ case class ConvertTimezone( override def third: Expression = sourceTs override def inputTypes: Seq[AbstractDataType] = - Seq(StringTypeWithCollation, StringTypeWithCollation, TimestampNTZType) + Seq( + StringTypeWithCollation(supportsTrimCollation = true), + StringTypeWithCollation(supportsTrimCollation = true), + TimestampNTZType) override def dataType: DataType = TimestampNTZType override def nullSafeEval(srcTz: Any, tgtTz: Any, micros: Any): Any = { @@ -3415,7 +3431,7 @@ case class TimestampAdd( override def left: Expression = quantity override def right: Expression = timestamp - override def inputTypes: Seq[AbstractDataType] = Seq(IntegerType, AnyTimestampType) + override def inputTypes: Seq[AbstractDataType] = Seq(LongType, AnyTimestampType) override def dataType: DataType = timestamp.dataType override def withTimeZone(timeZoneId: String): TimeZoneAwareExpression = @@ -3424,7 +3440,7 @@ case class TimestampAdd( @transient private lazy val zoneIdInEval: ZoneId = zoneIdForType(timestamp.dataType) override def nullSafeEval(q: Any, micros: Any): Any = { - DateTimeUtils.timestampAdd(unit, q.asInstanceOf[Int], micros.asInstanceOf[Long], zoneIdInEval) + DateTimeUtils.timestampAdd(unit, q.asInstanceOf[Long], micros.asInstanceOf[Long], zoneIdInEval) } override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/hash.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/hash.scala index 79879dc0edb4c..89d2259ea5c28 
100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/hash.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/hash.scala @@ -419,7 +419,7 @@ abstract class HashExpression[E] extends Expression { protected def genHashString( ctx: CodegenContext, stringType: StringType, input: String, result: String): String = { - if (stringType.supportsBinaryEquality && !stringType.usesTrimCollation) { + if (stringType.supportsBinaryEquality) { val baseObject = s"$input.getBaseObject()" val baseOffset = s"$input.getBaseOffset()" val numBytes = s"$input.numBytes()" @@ -570,7 +570,7 @@ abstract class InterpretedHashFunction { hashUnsafeBytes(a, Platform.BYTE_ARRAY_OFFSET, a.length, seed) case s: UTF8String => val st = dataType.asInstanceOf[StringType] - if (st.supportsBinaryEquality && !st.usesTrimCollation) { + if (st.supportsBinaryEquality) { hashUnsafeBytes(s.getBaseObject, s.getBaseOffset, s.numBytes(), seed) } else { val stringHash = CollationFactory @@ -821,7 +821,7 @@ case class HiveHash(children: Seq[Expression]) extends HashExpression[Int] { override protected def genHashString( ctx: CodegenContext, stringType: StringType, input: String, result: String): String = { - if (stringType.supportsBinaryEquality && !stringType.usesTrimCollation) { + if (stringType.supportsBinaryEquality) { val baseObject = s"$input.getBaseObject()" val baseOffset = s"$input.getBaseOffset()" val numBytes = s"$input.numBytes()" diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/json/JsonExpressionEvalUtils.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/json/JsonExpressionEvalUtils.scala index edc8012eb3da2..c9d15e1eb2e4d 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/json/JsonExpressionEvalUtils.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/json/JsonExpressionEvalUtils.scala @@ -16,12 +16,16 @@ */ package 
org.apache.spark.sql.catalyst.expressions.json -import java.io.CharArrayWriter +import java.io.{ByteArrayOutputStream, CharArrayWriter, StringWriter} -import com.fasterxml.jackson.core.JsonFactory +import scala.util.parsing.combinator.RegexParsers +import com.fasterxml.jackson.core._ +import com.fasterxml.jackson.core.json.JsonReadFeature + +import org.apache.spark.SparkException import org.apache.spark.sql.catalyst.InternalRow -import org.apache.spark.sql.catalyst.expressions.ExprUtils +import org.apache.spark.sql.catalyst.expressions.{ExprUtils, GenericInternalRow} import org.apache.spark.sql.catalyst.expressions.variant.VariantExpressionEvalUtils import org.apache.spark.sql.catalyst.json.{CreateJacksonParser, JacksonGenerator, JacksonParser, JsonInferSchema, JSONOptions} import org.apache.spark.sql.catalyst.util.{ArrayData, FailFastMode, FailureSafeParser, MapData, PermissiveMode} @@ -31,34 +35,79 @@ import org.apache.spark.sql.types.{ArrayType, DataType, MapType, StructField, St import org.apache.spark.unsafe.types.{UTF8String, VariantVal} import org.apache.spark.util.Utils -object JsonExpressionEvalUtils { +private[this] sealed trait PathInstruction +private[this] object PathInstruction { + private[expressions] case object Subscript extends PathInstruction + private[expressions] case object Wildcard extends PathInstruction + private[expressions] case object Key extends PathInstruction + private[expressions] case class Index(index: Long) extends PathInstruction + private[expressions] case class Named(name: String) extends PathInstruction +} - def schemaOfJson( - jsonFactory: JsonFactory, - jsonOptions: JSONOptions, - jsonInferSchema: JsonInferSchema, - json: UTF8String): UTF8String = { - val dt = Utils.tryWithResource(CreateJacksonParser.utf8String(jsonFactory, json)) { parser => - parser.nextToken() - // To match with schema inference from JSON datasource. 
- jsonInferSchema.inferField(parser) match { - case st: StructType => - jsonInferSchema.canonicalizeType(st, jsonOptions).getOrElse(StructType(Nil)) - case at: ArrayType if at.elementType.isInstanceOf[StructType] => - jsonInferSchema - .canonicalizeType(at.elementType, jsonOptions) - .map(ArrayType(_, containsNull = at.containsNull)) - .getOrElse(ArrayType(StructType(Nil), containsNull = at.containsNull)) - case other: DataType => - jsonInferSchema.canonicalizeType(other, jsonOptions).getOrElse( - SQLConf.get.defaultStringType) - } +private[this] sealed trait WriteStyle +private[this] object WriteStyle { + private[expressions] case object RawStyle extends WriteStyle + private[expressions] case object QuotedStyle extends WriteStyle + private[expressions] case object FlattenStyle extends WriteStyle +} + +private[this] object JsonPathParser extends RegexParsers { + import PathInstruction._ + + def root: Parser[Char] = '$' + + def long: Parser[Long] = "\\d+".r ^? { + case x => x.toLong + } + + // parse `[*]` and `[123]` subscripts + def subscript: Parser[List[PathInstruction]] = + for { + operand <- '[' ~> ('*' ^^^ Wildcard | long ^^ Index) <~ ']' + } yield { + Subscript :: operand :: Nil } - UTF8String.fromString(dt.sql) + // parse `.name` or `['name']` child expressions + def named: Parser[List[PathInstruction]] = + for { + name <- '.' 
~> "[^\\.\\[]+".r | "['" ~> "[^\\']+".r <~ "']" + } yield { + Key :: Named(name) :: Nil + } + + // child wildcards: `..`, `.*` or `['*']` + def wildcard: Parser[List[PathInstruction]] = + (".*" | "['*']") ^^^ List(Wildcard) + + def node: Parser[List[PathInstruction]] = + wildcard | + named | + subscript + + val expression: Parser[List[PathInstruction]] = { + phrase(root ~> rep(node) ^^ (x => x.flatten)) + } + + def parse(str: String): Option[List[PathInstruction]] = { + this.parseAll(expression, str) match { + case Success(result, _) => + Some(result) + + case _ => + None + } } } +private[this] object SharedFactory { + val jsonFactory: JsonFactory = new JsonFactoryBuilder() + // The two options below enabled for Hive compatibility + .enable(JsonReadFeature.ALLOW_UNESCAPED_CONTROL_CHARS) + .enable(JsonReadFeature.ALLOW_SINGLE_QUOTES) + .build() +} + case class JsonToStructsEvaluator( options: Map[String, String], nullableSchema: DataType, @@ -103,6 +152,7 @@ case class JsonToStructsEvaluator( } final def evaluate(json: UTF8String): Any = { + if (json == null) return null nullableSchema match { case _: VariantType => VariantExpressionEvalUtils.parseJson(json, @@ -159,3 +209,370 @@ case class StructsToJsonEvaluator( converter(value) } } + +case class SchemaOfJsonEvaluator(options: Map[String, String]) { + @transient + private lazy val jsonOptions = new JSONOptions(options, "UTC") + + @transient + private lazy val jsonFactory = jsonOptions.buildJsonFactory() + + @transient + private lazy val jsonInferSchema = new JsonInferSchema(jsonOptions) + + final def evaluate(json: UTF8String): Any = { + val dt = Utils.tryWithResource(CreateJacksonParser.utf8String(jsonFactory, json)) { parser => + parser.nextToken() + // To match with schema inference from JSON datasource. 
+ jsonInferSchema.inferField(parser) match { + case st: StructType => + jsonInferSchema.canonicalizeType(st, jsonOptions).getOrElse(StructType(Nil)) + case at: ArrayType if at.elementType.isInstanceOf[StructType] => + jsonInferSchema + .canonicalizeType(at.elementType, jsonOptions) + .map(ArrayType(_, containsNull = at.containsNull)) + .getOrElse(ArrayType(StructType(Nil), containsNull = at.containsNull)) + case other: DataType => + jsonInferSchema.canonicalizeType(other, jsonOptions).getOrElse( + SQLConf.get.defaultStringType) + } + } + + UTF8String.fromString(dt.sql) + } +} + +/** + * The expression `JsonTuple` will utilize it to support codegen. + */ +case class JsonTupleEvaluator(foldableFieldNames: Array[Option[String]]) { + + import SharedFactory._ + + // If processing fails this shared value will be returned. + @transient private lazy val nullRow: Seq[InternalRow] = + new GenericInternalRow(Array.ofDim[Any](foldableFieldNames.length)) :: Nil + + // And count the number of foldable fields, we'll use this later to optimize evaluation. + @transient private lazy val constantFields: Int = foldableFieldNames.count(_ != null) + + private def getFieldNameStrings(fields: Array[UTF8String]): Array[String] = { + // Evaluate the field names as String rather than UTF8String to + // optimize lookups from the json token, which is also a String. + if (constantFields == fields.length) { + // Typically the user will provide the field names as foldable expressions + // so we can use the cached copy. + foldableFieldNames.map(_.orNull) + } else if (constantFields == 0) { + // None are foldable so all field names need to be evaluated from the input row. + fields.map { f => if (f != null) f.toString else null } + } else { + // If there is a mix of constant and non-constant expressions + // prefer the cached copy when available. 
+ foldableFieldNames.zip(fields).map { + case (null, f) => if (f != null) f.toString else null + case (fieldName, _) => fieldName.orNull + } + } + } + + private def parseRow(parser: JsonParser, fieldNames: Array[String]): Seq[InternalRow] = { + // Only objects are supported. + if (parser.nextToken() != JsonToken.START_OBJECT) return nullRow + + val row = Array.ofDim[Any](fieldNames.length) + + // Start reading through the token stream, looking for any requested field names. + while (parser.nextToken() != JsonToken.END_OBJECT) { + if (parser.getCurrentToken == JsonToken.FIELD_NAME) { + // Check to see if this field is desired in the output. + val jsonField = parser.currentName + var idx = fieldNames.indexOf(jsonField) + if (idx >= 0) { + // It is, copy the child tree to the correct location in the output row. + val output = new ByteArrayOutputStream() + + // Write the output directly to UTF8 encoded byte array. + if (parser.nextToken() != JsonToken.VALUE_NULL) { + Utils.tryWithResource(jsonFactory.createGenerator(output, JsonEncoding.UTF8)) { + generator => copyCurrentStructure(generator, parser) + } + + val jsonValue = UTF8String.fromBytes(output.toByteArray) + + // SPARK-21804: json_tuple returns null values within repeated columns + // except the first one; so that we need to check the remaining fields. + do { + row(idx) = jsonValue + idx = fieldNames.indexOf(jsonField, idx + 1) + } while (idx >= 0) + } + } + } + + // Always skip children, it's cheap enough to do even if copyCurrentStructure was called. + parser.skipChildren() + } + new GenericInternalRow(row) :: Nil + } + + private def copyCurrentStructure(generator: JsonGenerator, parser: JsonParser): Unit = { + parser.getCurrentToken match { + // If the user requests a string field it needs to be returned without enclosing + // quotes which is accomplished via JsonGenerator.writeRaw instead of JsonGenerator.write. 
+ case JsonToken.VALUE_STRING if parser.hasTextCharacters => + // Slight optimization to avoid allocating a String instance, though the characters + // still have to be decoded... Jackson doesn't have a way to access the raw bytes. + generator.writeRaw(parser.getTextCharacters, parser.getTextOffset, parser.getTextLength) + + case JsonToken.VALUE_STRING => + // The normal String case, pass it through to the output without enclosing quotes. + generator.writeRaw(parser.getText) + + case JsonToken.VALUE_NULL => + // A special case that needs to be handled outside of this method. + // If a requested field is null, the result must be null. The easiest + // way to achieve this is just by ignoring null tokens entirely. + throw SparkException.internalError("Do not attempt to copy a null field.") + + case _ => + // Handle other types including objects, arrays, booleans and numbers. + generator.copyCurrentStructure(parser) + } + } + + final def evaluate(json: UTF8String, fieldNames: Array[UTF8String]): IterableOnce[InternalRow] = { + if (json == null) return nullRow + try { + /* We know the bytes are UTF-8 encoded. Pass a Reader to avoid having Jackson + detect character encoding which could fail for some malformed strings. */ + Utils.tryWithResource(CreateJacksonParser.utf8String(jsonFactory, json)) { parser => + parseRow(parser, getFieldNameStrings(fieldNames)) + } + } catch { + case _: JsonProcessingException => nullRow + } + } +} + +/** + * The expression `GetJsonObject` will utilize it to support codegen. 
+ */ +case class GetJsonObjectEvaluator(cachedPath: UTF8String) { + import com.fasterxml.jackson.core.JsonToken._ + import PathInstruction._ + import SharedFactory._ + import WriteStyle._ + + def this() = this(null) + + @transient + private lazy val parsedPath: Option[List[PathInstruction]] = parsePath(cachedPath) + + @transient + private var jsonStr: UTF8String = _ + + @transient + private var pathStr: UTF8String = _ + + def setJson(arg: UTF8String): Unit = { + jsonStr = arg + } + + def setPath(arg: UTF8String): Unit = { + pathStr = arg + } + + def evaluate(): Any = { + if (jsonStr == null) return null + + val parsed = if (cachedPath != null) { + parsedPath + } else { + parsePath(pathStr) + } + + if (parsed.isDefined) { + try { + /* We know the bytes are UTF-8 encoded. Pass a Reader to avoid having Jackson + detect character encoding which could fail for some malformed strings */ + Utils.tryWithResource(CreateJacksonParser.utf8String(jsonFactory, jsonStr)) { parser => + val output = new ByteArrayOutputStream() + val matched = Utils.tryWithResource( + jsonFactory.createGenerator(output, JsonEncoding.UTF8)) { generator => + parser.nextToken() + evaluatePath(parser, generator, RawStyle, parsed.get) + } + if (matched) { + UTF8String.fromBytes(output.toByteArray) + } else { + null + } + } + } catch { + case _: JsonProcessingException => null + } + } else { + null + } + } + + private def parsePath(path: UTF8String): Option[List[PathInstruction]] = { + if (path != null) { + JsonPathParser.parse(path.toString) + } else { + None + } + } + + // advance to the desired array index, assumes to start at the START_ARRAY token + private def arrayIndex(p: JsonParser, f: () => Boolean): Long => Boolean = { + case _ if p.getCurrentToken == END_ARRAY => + // terminate, nothing has been written + false + + case 0 => + // we've reached the desired index + val dirty = f() + + while (p.nextToken() != END_ARRAY) { + // advance the token stream to the end of the array + p.skipChildren() + 
} + + dirty + + case i if i > 0 => + // skip this token and evaluate the next + p.skipChildren() + p.nextToken() + arrayIndex(p, f)(i - 1) + } + + /** + * Evaluate a list of JsonPath instructions, returning a bool that indicates if any leaf nodes + * have been written to the generator + */ + private def evaluatePath( + p: JsonParser, + g: JsonGenerator, + style: WriteStyle, + path: List[PathInstruction]): Boolean = { + (p.getCurrentToken, path) match { + case (VALUE_STRING, Nil) if style == RawStyle => + // there is no array wildcard or slice parent, emit this string without quotes + if (p.hasTextCharacters) { + g.writeRaw(p.getTextCharacters, p.getTextOffset, p.getTextLength) + } else { + g.writeRaw(p.getText) + } + true + + case (START_ARRAY, Nil) if style == FlattenStyle => + // flatten this array into the parent + var dirty = false + while (p.nextToken() != END_ARRAY) { + dirty |= evaluatePath(p, g, style, Nil) + } + dirty + + case (_, Nil) => + // general case: just copy the child tree verbatim + g.copyCurrentStructure(p) + true + + case (START_OBJECT, Key :: xs) => + var dirty = false + while (p.nextToken() != END_OBJECT) { + if (dirty) { + // once a match has been found we can skip other fields + p.skipChildren() + } else { + dirty = evaluatePath(p, g, style, xs) + } + } + dirty + + case (START_ARRAY, Subscript :: Wildcard :: Subscript :: Wildcard :: xs) => + // special handling for the non-structure preserving double wildcard behavior in Hive + var dirty = false + g.writeStartArray() + while (p.nextToken() != END_ARRAY) { + dirty |= evaluatePath(p, g, FlattenStyle, xs) + } + g.writeEndArray() + dirty + + case (START_ARRAY, Subscript :: Wildcard :: xs) if style != QuotedStyle => + // retain Flatten, otherwise use Quoted... 
cannot use Raw within an array + val nextStyle = style match { + case RawStyle => QuotedStyle + case FlattenStyle => FlattenStyle + case QuotedStyle => throw SparkException.internalError("Unexpected the quoted style.") + } + + // temporarily buffer child matches, the emitted json will need to be + // modified slightly if there is only a single element written + val buffer = new StringWriter() + + var dirty = 0 + Utils.tryWithResource(jsonFactory.createGenerator(buffer)) { flattenGenerator => + flattenGenerator.writeStartArray() + + while (p.nextToken() != END_ARRAY) { + // track the number of array elements and only emit an outer array if + // we've written more than one element, this matches Hive's behavior + dirty += (if (evaluatePath(p, flattenGenerator, nextStyle, xs)) 1 else 0) + } + flattenGenerator.writeEndArray() + } + + val buf = buffer.getBuffer + if (dirty > 1) { + g.writeRawValue(buf.toString) + } else if (dirty == 1) { + // remove outer array tokens + g.writeRawValue(buf.substring(1, buf.length() - 1)) + } // else do not write anything + + dirty > 0 + + case (START_ARRAY, Subscript :: Wildcard :: xs) => + var dirty = false + g.writeStartArray() + while (p.nextToken() != END_ARRAY) { + // wildcards can have multiple matches, continually update the dirty count + dirty |= evaluatePath(p, g, QuotedStyle, xs) + } + g.writeEndArray() + + dirty + + case (START_ARRAY, Subscript :: Index(idx) :: (xs@Subscript :: Wildcard :: _)) => + p.nextToken() + // we're going to have 1 or more results, switch to QuotedStyle + arrayIndex(p, () => evaluatePath(p, g, QuotedStyle, xs))(idx) + + case (START_ARRAY, Subscript :: Index(idx) :: xs) => + p.nextToken() + arrayIndex(p, () => evaluatePath(p, g, style, xs))(idx) + + case (FIELD_NAME, Named(name) :: xs) if p.currentName == name => + // exact field match + if (p.nextToken() != JsonToken.VALUE_NULL) { + evaluatePath(p, g, style, xs) + } else { + false + } + + case (FIELD_NAME, Wildcard :: xs) => + // wildcard field match + 
p.nextToken() + evaluatePath(p, g, style, xs) + + case _ => + p.skipChildren() + false + } + } +} diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala index ac6c233f7d2ea..e80f543f14eda 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala @@ -17,20 +17,12 @@ package org.apache.spark.sql.catalyst.expressions -import java.io._ - -import scala.util.parsing.combinator.RegexParsers - -import com.fasterxml.jackson.core._ -import com.fasterxml.jackson.core.json.JsonReadFeature - -import org.apache.spark.SparkException import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.analysis.TypeCheckResult import org.apache.spark.sql.catalyst.analysis.TypeCheckResult.DataTypeMismatch -import org.apache.spark.sql.catalyst.expressions.codegen.{CodegenContext, CodeGenerator, CodegenFallback, ExprCode} +import org.apache.spark.sql.catalyst.expressions.codegen.{CodegenContext, CodeGenerator, ExprCode} import org.apache.spark.sql.catalyst.expressions.codegen.Block.BlockHelper -import org.apache.spark.sql.catalyst.expressions.json.{JsonExpressionEvalUtils, JsonExpressionUtils, JsonToStructsEvaluator, StructsToJsonEvaluator} +import org.apache.spark.sql.catalyst.expressions.json.{GetJsonObjectEvaluator, JsonExpressionUtils, JsonToStructsEvaluator, JsonTupleEvaluator, SchemaOfJsonEvaluator, StructsToJsonEvaluator} import org.apache.spark.sql.catalyst.expressions.objects.{Invoke, StaticInvoke} import org.apache.spark.sql.catalyst.json._ import org.apache.spark.sql.catalyst.trees.TreePattern.{JSON_TO_STRUCT, RUNTIME_REPLACEABLE, TreePattern} @@ -39,80 +31,6 @@ import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.internal.types.StringTypeWithCollation 
import org.apache.spark.sql.types._ import org.apache.spark.unsafe.types.UTF8String -import org.apache.spark.util.Utils - -private[this] sealed trait PathInstruction -private[this] object PathInstruction { - private[expressions] case object Subscript extends PathInstruction - private[expressions] case object Wildcard extends PathInstruction - private[expressions] case object Key extends PathInstruction - private[expressions] case class Index(index: Long) extends PathInstruction - private[expressions] case class Named(name: String) extends PathInstruction -} - -private[this] sealed trait WriteStyle -private[this] object WriteStyle { - private[expressions] case object RawStyle extends WriteStyle - private[expressions] case object QuotedStyle extends WriteStyle - private[expressions] case object FlattenStyle extends WriteStyle -} - -private[this] object JsonPathParser extends RegexParsers { - import PathInstruction._ - - def root: Parser[Char] = '$' - - def long: Parser[Long] = "\\d+".r ^? { - case x => x.toLong - } - - // parse `[*]` and `[123]` subscripts - def subscript: Parser[List[PathInstruction]] = - for { - operand <- '[' ~> ('*' ^^^ Wildcard | long ^^ Index) <~ ']' - } yield { - Subscript :: operand :: Nil - } - - // parse `.name` or `['name']` child expressions - def named: Parser[List[PathInstruction]] = - for { - name <- '.' 
~> "[^\\.\\[]+".r | "['" ~> "[^\\']+".r <~ "']" - } yield { - Key :: Named(name) :: Nil - } - - // child wildcards: `..`, `.*` or `['*']` - def wildcard: Parser[List[PathInstruction]] = - (".*" | "['*']") ^^^ List(Wildcard) - - def node: Parser[List[PathInstruction]] = - wildcard | - named | - subscript - - val expression: Parser[List[PathInstruction]] = { - phrase(root ~> rep(node) ^^ (x => x.flatten)) - } - - def parse(str: String): Option[List[PathInstruction]] = { - this.parseAll(expression, str) match { - case Success(result, _) => - Some(result) - - case _ => - None - } - } -} - -private[this] object SharedFactory { - val jsonFactory = new JsonFactoryBuilder() - // The two options below enabled for Hive compatibility - .enable(JsonReadFeature.ALLOW_UNESCAPED_CONTROL_CHARS) - .enable(JsonReadFeature.ALLOW_SINGLE_QUOTES) - .build() -} /** * Extracts json object from a json string based on json path specified, and returns json string @@ -133,7 +51,9 @@ case class GetJsonObject(json: Expression, path: Expression) override def left: Expression = json override def right: Expression = path override def inputTypes: Seq[AbstractDataType] = - Seq(StringTypeWithCollation, StringTypeWithCollation) + Seq( + StringTypeWithCollation(supportsTrimCollation = true), + StringTypeWithCollation(supportsTrimCollation = true)) override def dataType: DataType = SQLConf.get.defaultStringType override def nullable: Boolean = true override def prettyName: String = "get_json_object" @@ -211,228 +131,6 @@ case class GetJsonObject(json: Expression, path: Expression) copy(json = newLeft, path = newRight) } -class GetJsonObjectEvaluator(cachedPath: UTF8String) { - import com.fasterxml.jackson.core.JsonToken._ - import PathInstruction._ - import SharedFactory._ - import WriteStyle._ - - def this() = this(null) - - @transient - private lazy val parsedPath: Option[List[PathInstruction]] = - parsePath(cachedPath) - - @transient - private var jsonStr: UTF8String = null - - @transient - private 
var pathStr: UTF8String = null - - def setJson(arg: UTF8String): Unit = { - jsonStr = arg - } - - def setPath(arg: UTF8String): Unit = { - pathStr = arg - } - - def evaluate(): Any = { - if (jsonStr == null) { - return null - } - - val parsed = if (cachedPath != null) { - parsedPath - } else { - parsePath(pathStr) - } - - if (parsed.isDefined) { - try { - /* We know the bytes are UTF-8 encoded. Pass a Reader to avoid having Jackson - detect character encoding which could fail for some malformed strings */ - Utils.tryWithResource(CreateJacksonParser.utf8String(jsonFactory, jsonStr)) { parser => - val output = new ByteArrayOutputStream() - val matched = Utils.tryWithResource( - jsonFactory.createGenerator(output, JsonEncoding.UTF8)) { generator => - parser.nextToken() - evaluatePath(parser, generator, RawStyle, parsed.get) - } - if (matched) { - UTF8String.fromBytes(output.toByteArray) - } else { - null - } - } - } catch { - case _: JsonProcessingException => null - } - } else { - null - } - } - - private def parsePath(path: UTF8String): Option[List[PathInstruction]] = { - if (path != null) { - JsonPathParser.parse(path.toString) - } else { - None - } - } - - // advance to the desired array index, assumes to start at the START_ARRAY token - private def arrayIndex(p: JsonParser, f: () => Boolean): Long => Boolean = { - case _ if p.getCurrentToken == END_ARRAY => - // terminate, nothing has been written - false - - case 0 => - // we've reached the desired index - val dirty = f() - - while (p.nextToken() != END_ARRAY) { - // advance the token stream to the end of the array - p.skipChildren() - } - - dirty - - case i if i > 0 => - // skip this token and evaluate the next - p.skipChildren() - p.nextToken() - arrayIndex(p, f)(i - 1) - } - - /** - * Evaluate a list of JsonPath instructions, returning a bool that indicates if any leaf nodes - * have been written to the generator - */ - private def evaluatePath( - p: JsonParser, - g: JsonGenerator, - style: WriteStyle, - 
path: List[PathInstruction]): Boolean = { - (p.getCurrentToken, path) match { - case (VALUE_STRING, Nil) if style == RawStyle => - // there is no array wildcard or slice parent, emit this string without quotes - if (p.hasTextCharacters) { - g.writeRaw(p.getTextCharacters, p.getTextOffset, p.getTextLength) - } else { - g.writeRaw(p.getText) - } - true - - case (START_ARRAY, Nil) if style == FlattenStyle => - // flatten this array into the parent - var dirty = false - while (p.nextToken() != END_ARRAY) { - dirty |= evaluatePath(p, g, style, Nil) - } - dirty - - case (_, Nil) => - // general case: just copy the child tree verbatim - g.copyCurrentStructure(p) - true - - case (START_OBJECT, Key :: xs) => - var dirty = false - while (p.nextToken() != END_OBJECT) { - if (dirty) { - // once a match has been found we can skip other fields - p.skipChildren() - } else { - dirty = evaluatePath(p, g, style, xs) - } - } - dirty - - case (START_ARRAY, Subscript :: Wildcard :: Subscript :: Wildcard :: xs) => - // special handling for the non-structure preserving double wildcard behavior in Hive - var dirty = false - g.writeStartArray() - while (p.nextToken() != END_ARRAY) { - dirty |= evaluatePath(p, g, FlattenStyle, xs) - } - g.writeEndArray() - dirty - - case (START_ARRAY, Subscript :: Wildcard :: xs) if style != QuotedStyle => - // retain Flatten, otherwise use Quoted... 
cannot use Raw within an array - val nextStyle = style match { - case RawStyle => QuotedStyle - case FlattenStyle => FlattenStyle - case QuotedStyle => throw SparkException.internalError("Unexpected the quoted style.") - } - - // temporarily buffer child matches, the emitted json will need to be - // modified slightly if there is only a single element written - val buffer = new StringWriter() - - var dirty = 0 - Utils.tryWithResource(jsonFactory.createGenerator(buffer)) { flattenGenerator => - flattenGenerator.writeStartArray() - - while (p.nextToken() != END_ARRAY) { - // track the number of array elements and only emit an outer array if - // we've written more than one element, this matches Hive's behavior - dirty += (if (evaluatePath(p, flattenGenerator, nextStyle, xs)) 1 else 0) - } - flattenGenerator.writeEndArray() - } - - val buf = buffer.getBuffer - if (dirty > 1) { - g.writeRawValue(buf.toString) - } else if (dirty == 1) { - // remove outer array tokens - g.writeRawValue(buf.substring(1, buf.length() - 1)) - } // else do not write anything - - dirty > 0 - - case (START_ARRAY, Subscript :: Wildcard :: xs) => - var dirty = false - g.writeStartArray() - while (p.nextToken() != END_ARRAY) { - // wildcards can have multiple matches, continually update the dirty count - dirty |= evaluatePath(p, g, QuotedStyle, xs) - } - g.writeEndArray() - - dirty - - case (START_ARRAY, Subscript :: Index(idx) :: (xs@Subscript :: Wildcard :: _)) => - p.nextToken() - // we're going to have 1 or more results, switch to QuotedStyle - arrayIndex(p, () => evaluatePath(p, g, QuotedStyle, xs))(idx) - - case (START_ARRAY, Subscript :: Index(idx) :: xs) => - p.nextToken() - arrayIndex(p, () => evaluatePath(p, g, style, xs))(idx) - - case (FIELD_NAME, Named(name) :: xs) if p.currentName == name => - // exact field match - if (p.nextToken() != JsonToken.VALUE_NULL) { - evaluatePath(p, g, style, xs) - } else { - false - } - - case (FIELD_NAME, Wildcard :: xs) => - // wildcard field match - 
p.nextToken() - evaluatePath(p, g, style, xs) - - case _ => - p.skipChildren() - false - } - } -} - // scalastyle:off line.size.limit line.contains.tab @ExpressionDescription( usage = "_FUNC_(jsonStr, p1, p2, ..., pn) - Returns a tuple like the function get_json_object, but it takes multiple names. All the input parameters and output column types are string.", @@ -446,37 +144,27 @@ class GetJsonObjectEvaluator(cachedPath: UTF8String) { // scalastyle:on line.size.limit line.contains.tab case class JsonTuple(children: Seq[Expression]) extends Generator - with CodegenFallback with QueryErrorsBase { - import SharedFactory._ - override def nullable: Boolean = { - // a row is always returned + // A row is always returned. false } - // if processing fails this shared value will be returned - @transient private lazy val nullRow: Seq[InternalRow] = - new GenericInternalRow(Array.ofDim[Any](fieldExpressions.length)) :: Nil - - // the json body is the first child + // The json body is the first child. @transient private lazy val jsonExpr: Expression = children.head - // the fields to query are the remaining children + // The fields to query are the remaining children. @transient private lazy val fieldExpressions: Seq[Expression] = children.tail - // eagerly evaluate any foldable the field names - @transient private lazy val foldableFieldNames: IndexedSeq[Option[String]] = { + // Eagerly evaluate any foldable the field names. 
+ @transient private lazy val foldableFieldNames: Array[Option[String]] = { fieldExpressions.map { case expr if expr.foldable => Option(expr.eval()).map(_.asInstanceOf[UTF8String].toString) case _ => null - }.toIndexedSeq + }.toArray } - // and count the number of foldable fields, we'll use this later to optimize evaluation - @transient private lazy val constantFields: Int = foldableFieldNames.count(_ != null) - override def elementSchema: StructType = StructType(fieldExpressions.zipWithIndex.map { case (_, idx) => StructField(s"c$idx", children.head.dataType, nullable = true) }) @@ -490,7 +178,8 @@ case class JsonTuple(children: Seq[Expression]) ) } else if ( children.forall( - child => StringTypeWithCollation.acceptsType(child.dataType))) { + child => StringTypeWithCollation(supportsTrimCollation = true) + .acceptsType(child.dataType))) { TypeCheckResult.TypeCheckSuccess } else { DataTypeMismatch( @@ -499,111 +188,41 @@ case class JsonTuple(children: Seq[Expression]) } } + @transient + private lazy val evaluator: JsonTupleEvaluator = JsonTupleEvaluator(foldableFieldNames) + override def eval(input: InternalRow): IterableOnce[InternalRow] = { val json = jsonExpr.eval(input).asInstanceOf[UTF8String] - if (json == null) { - return nullRow - } - - try { - /* We know the bytes are UTF-8 encoded. 
Pass a Reader to avoid having Jackson - detect character encoding which could fail for some malformed strings */ - Utils.tryWithResource(CreateJacksonParser.utf8String(jsonFactory, json)) { parser => - parseRow(parser, input) - } - } catch { - case _: JsonProcessingException => - nullRow - } + val filedNames = fieldExpressions.map(_.eval(input).asInstanceOf[UTF8String]).toArray + evaluator.evaluate(json, filedNames) } - private def parseRow(parser: JsonParser, input: InternalRow): Seq[InternalRow] = { - // only objects are supported - if (parser.nextToken() != JsonToken.START_OBJECT) { - return nullRow - } - - // evaluate the field names as String rather than UTF8String to - // optimize lookups from the json token, which is also a String - val fieldNames = if (constantFields == fieldExpressions.length) { - // typically the user will provide the field names as foldable expressions - // so we can use the cached copy - foldableFieldNames.map(_.orNull) - } else if (constantFields == 0) { - // none are foldable so all field names need to be evaluated from the input row - fieldExpressions.map { expr => - Option(expr.eval(input)).map(_.asInstanceOf[UTF8String].toString).orNull - } - } else { - // if there is a mix of constant and non-constant expressions - // prefer the cached copy when available - foldableFieldNames.zip(fieldExpressions).map { - case (null, expr) => - Option(expr.eval(input)).map(_.asInstanceOf[UTF8String].toString).orNull - case (fieldName, _) => fieldName.orNull - } - } - - val row = Array.ofDim[Any](fieldNames.length) - - // start reading through the token stream, looking for any requested field names - while (parser.nextToken() != JsonToken.END_OBJECT) { - if (parser.getCurrentToken == JsonToken.FIELD_NAME) { - // check to see if this field is desired in the output - val jsonField = parser.currentName - var idx = fieldNames.indexOf(jsonField) - if (idx >= 0) { - // it is, copy the child tree to the correct location in the output row - val output = 
new ByteArrayOutputStream() - - // write the output directly to UTF8 encoded byte array - if (parser.nextToken() != JsonToken.VALUE_NULL) { - Utils.tryWithResource(jsonFactory.createGenerator(output, JsonEncoding.UTF8)) { - generator => copyCurrentStructure(generator, parser) - } - - val jsonValue = UTF8String.fromBytes(output.toByteArray) - - // SPARK-21804: json_tuple returns null values within repeated columns - // except the first one; so that we need to check the remaining fields. - do { - row(idx) = jsonValue - idx = fieldNames.indexOf(jsonField, idx + 1) - } while (idx >= 0) - } - } - } - - // always skip children, it's cheap enough to do even if copyCurrentStructure was called - parser.skipChildren() - } - - new GenericInternalRow(row) :: Nil - } - - private def copyCurrentStructure(generator: JsonGenerator, parser: JsonParser): Unit = { - parser.getCurrentToken match { - // if the user requests a string field it needs to be returned without enclosing - // quotes which is accomplished via JsonGenerator.writeRaw instead of JsonGenerator.write - case JsonToken.VALUE_STRING if parser.hasTextCharacters => - // slight optimization to avoid allocating a String instance, though the characters - // still have to be decoded... Jackson doesn't have a way to access the raw bytes - generator.writeRaw(parser.getTextCharacters, parser.getTextOffset, parser.getTextLength) - - case JsonToken.VALUE_STRING => - // the normal String case, pass it through to the output without enclosing quotes - generator.writeRaw(parser.getText) - - case JsonToken.VALUE_NULL => - // a special case that needs to be handled outside of this method. - // if a requested field is null, the result must be null. 
the easiest - // way to achieve this is just by ignoring null tokens entirely - throw SparkException.internalError("Do not attempt to copy a null field.") - - case _ => - // handle other types including objects, arrays, booleans and numbers - generator.copyCurrentStructure(parser) + override protected def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { + val refEvaluator = ctx.addReferenceObj("evaluator", evaluator) + val jsonEval = jsonExpr.genCode(ctx) + val filedNamesTerm = ctx.freshName("fieldNames") + val fieldNamesEval = fieldExpressions.map(_.genCode(ctx)) + val wrapperClass = classOf[IterableOnce[_]].getName + val setFieldNames = fieldNamesEval.zipWithIndex.map { + case (fieldNameEval, idx) => + s""" + |if (${fieldNameEval.isNull}) { + | $filedNamesTerm[$idx] = null; + |} else { + | $filedNamesTerm[$idx] = ${fieldNameEval.value}; + |} + |""".stripMargin } + ev.copy(code = + code""" + |UTF8String[] $filedNamesTerm = new UTF8String[${fieldExpressions.length}]; + |${jsonEval.code} + |${fieldNamesEval.map(_.code).mkString("\n")} + |${setFieldNames.mkString("\n")} + |boolean ${ev.isNull} = false; + |$wrapperClass ${ev.value} = + | $refEvaluator.evaluate(${jsonEval.value}, $filedNamesTerm); + |""".stripMargin) } override protected def withNewChildrenInternal(newChildren: IndexedSeq[Expression]): JsonTuple = @@ -636,9 +255,8 @@ case class JsonToStructs( timeZoneId: Option[String] = None, variantAllowDuplicateKeys: Boolean = SQLConf.get.getConf(SQLConf.VARIANT_ALLOW_DUPLICATE_KEYS)) extends UnaryExpression - with RuntimeReplaceable - with ExpectsInputTypes with TimeZoneAwareExpression + with ExpectsInputTypes with QueryErrorsBase { // The JSON input data might be missing certain fields. 
We force the nullability @@ -648,7 +266,9 @@ case class JsonToStructs( override def nullable: Boolean = true - override def nodePatternsInternal(): Seq[TreePattern] = Seq(JSON_TO_STRUCT, RUNTIME_REPLACEABLE) + final override def nodePatternsInternal(): Seq[TreePattern] = Seq(JSON_TO_STRUCT) + + override def nullIntolerant: Boolean = true // Used in `FunctionRegistry` def this(child: Expression, schema: Expression, options: Map[String, String]) = @@ -682,7 +302,34 @@ case class JsonToStructs( override def withTimeZone(timeZoneId: String): TimeZoneAwareExpression = copy(timeZoneId = Option(timeZoneId)) - override def inputTypes: Seq[AbstractDataType] = StringTypeWithCollation :: Nil + @transient + private val nameOfCorruptRecord = SQLConf.get.getConf(SQLConf.COLUMN_NAME_OF_CORRUPT_RECORD) + + @transient + private lazy val evaluator = new JsonToStructsEvaluator( + options, nullableSchema, nameOfCorruptRecord, timeZoneId, variantAllowDuplicateKeys) + + override def nullSafeEval(json: Any): Any = evaluator.evaluate(json.asInstanceOf[UTF8String]) + + override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { + val refEvaluator = ctx.addReferenceObj("evaluator", evaluator) + val eval = child.genCode(ctx) + val resultType = CodeGenerator.boxedType(dataType) + val resultTerm = ctx.freshName("result") + ev.copy(code = + code""" + |${eval.code} + |$resultType $resultTerm = ($resultType) $refEvaluator.evaluate(${eval.value}); + |boolean ${ev.isNull} = $resultTerm == null; + |${CodeGenerator.javaType(dataType)} ${ev.value} = ${CodeGenerator.defaultValue(dataType)}; + |if (!${ev.isNull}) { + | ${ev.value} = $resultTerm; + |} + |""".stripMargin) + } + + override def inputTypes: Seq[AbstractDataType] = + StringTypeWithCollation(supportsTrimCollation = true) :: Nil override def sql: String = schema match { case _: MapType => "entries" @@ -691,21 +338,6 @@ case class JsonToStructs( override def prettyName: String = "from_json" - @transient - private val 
nameOfCorruptRecord = SQLConf.get.getConf(SQLConf.COLUMN_NAME_OF_CORRUPT_RECORD) - - @transient - lazy val evaluator: JsonToStructsEvaluator = JsonToStructsEvaluator( - options, nullableSchema, nameOfCorruptRecord, timeZoneId, variantAllowDuplicateKeys) - - override def replacement: Expression = Invoke( - Literal.create(evaluator, ObjectType(classOf[JsonToStructsEvaluator])), - "evaluate", - dataType, - Seq(child), - Seq(child.dataType) - ) - override protected def withNewChildInternal(newChild: Expression): JsonToStructs = copy(child = newChild) } @@ -833,15 +465,6 @@ case class SchemaOfJson( override def nullable: Boolean = false - @transient - private lazy val jsonOptions = new JSONOptions(options, "UTC") - - @transient - private lazy val jsonFactory = jsonOptions.buildJsonFactory() - - @transient - private lazy val jsonInferSchema = new JsonInferSchema(jsonOptions) - @transient private lazy val json = child.eval().asInstanceOf[UTF8String] @@ -862,20 +485,16 @@ case class SchemaOfJson( } } - @transient private lazy val jsonFactoryObjectType = ObjectType(classOf[JsonFactory]) - @transient private lazy val jsonOptionsObjectType = ObjectType(classOf[JSONOptions]) - @transient private lazy val jsonInferSchemaObjectType = ObjectType(classOf[JsonInferSchema]) + @transient + private lazy val evaluator: SchemaOfJsonEvaluator = SchemaOfJsonEvaluator(options) - override def replacement: Expression = StaticInvoke( - JsonExpressionEvalUtils.getClass, + override def replacement: Expression = Invoke( + Literal.create(evaluator, ObjectType(classOf[SchemaOfJsonEvaluator])), + "evaluate", dataType, - "schemaOfJson", - Seq(Literal(jsonFactory, jsonFactoryObjectType), - Literal(jsonOptions, jsonOptionsObjectType), - Literal(jsonInferSchema, jsonInferSchemaObjectType), - child), - Seq(jsonFactoryObjectType, jsonOptionsObjectType, jsonInferSchemaObjectType, child.dataType) - ) + Seq(child), + Seq(child.dataType), + returnNullable = false) override def prettyName: String = 
"schema_of_json" @@ -910,7 +529,8 @@ case class LengthOfJsonArray(child: Expression) with ExpectsInputTypes with RuntimeReplaceable { - override def inputTypes: Seq[AbstractDataType] = Seq(StringTypeWithCollation) + override def inputTypes: Seq[AbstractDataType] = + Seq(StringTypeWithCollation(supportsTrimCollation = true)) override def dataType: DataType = IntegerType override def nullable: Boolean = true override def prettyName: String = "json_array_length" @@ -955,7 +575,8 @@ case class JsonObjectKeys(child: Expression) with ExpectsInputTypes with RuntimeReplaceable { - override def inputTypes: Seq[AbstractDataType] = Seq(StringTypeWithCollation) + override def inputTypes: Seq[AbstractDataType] = + Seq(StringTypeWithCollation(supportsTrimCollation = true)) override def dataType: DataType = ArrayType(SQLConf.get.defaultStringType) override def nullable: Boolean = true override def prettyName: String = "json_object_keys" diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/literals.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/literals.scala index 4cffc7f0b53a3..c1225f9e5b502 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/literals.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/literals.scala @@ -166,6 +166,8 @@ object Literal { case _: DayTimeIntervalType if v.isInstanceOf[Duration] => Literal(CatalystTypeConverters.createToCatalystConverter(dataType)(v), dataType) case _: ObjectType => Literal(v, dataType) + case CharType(_) | VarcharType(_) if SQLConf.get.preserveCharVarcharTypeInfo => + Literal(CatalystTypeConverters.createToCatalystConverter(dataType)(v), dataType) case _ => Literal(CatalystTypeConverters.convertToCatalyst(v), dataType) } } @@ -196,6 +198,12 @@ object Literal { case TimestampNTZType => create(0L, TimestampNTZType) case it: DayTimeIntervalType => create(0L, it) case it: YearMonthIntervalType => create(0, it) + 
case CharType(length) => + create(CharVarcharCodegenUtils.charTypeWriteSideCheck(UTF8String.fromString(""), length), + dataType) + case VarcharType(length) => + create(CharVarcharCodegenUtils.varcharTypeWriteSideCheck(UTF8String.fromString(""), length), + dataType) case st: StringType => Literal(UTF8String.fromString(""), st) case BinaryType => Literal("".getBytes(StandardCharsets.UTF_8)) case CalendarIntervalType => Literal(new CalendarInterval(0, 0, 0)) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/maskExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/maskExpressions.scala index 7be6df14194fc..5b17d2029ed1b 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/maskExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/maskExpressions.scala @@ -193,11 +193,11 @@ case class Mask( */ override def inputTypes: Seq[AbstractDataType] = Seq( - StringTypeWithCollation, - StringTypeWithCollation, - StringTypeWithCollation, - StringTypeWithCollation, - StringTypeWithCollation) + StringTypeWithCollation(supportsTrimCollation = true), + StringTypeWithCollation(supportsTrimCollation = true), + StringTypeWithCollation(supportsTrimCollation = true), + StringTypeWithCollation(supportsTrimCollation = true), + StringTypeWithCollation(supportsTrimCollation = true)) override def nullable: Boolean = true diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/mathExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/mathExpressions.scala index 30f07dcc1e67e..317a08b8c64c6 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/mathExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/mathExpressions.scala @@ -455,7 +455,7 @@ case class Conv( override def second: Expression = fromBaseExpr override def third: 
Expression = toBaseExpr override def inputTypes: Seq[AbstractDataType] = - Seq(StringTypeWithCollation, IntegerType, IntegerType) + Seq(StringTypeWithCollation(supportsTrimCollation = true), IntegerType, IntegerType) override def dataType: DataType = first.dataType override def nullable: Boolean = true @@ -1118,7 +1118,7 @@ case class Hex(child: Expression) override def nullIntolerant: Boolean = true override def inputTypes: Seq[AbstractDataType] = - Seq(TypeCollection(LongType, BinaryType, StringTypeWithCollation)) + Seq(TypeCollection(LongType, BinaryType, StringTypeWithCollation(supportsTrimCollation = true))) override def dataType: DataType = child.dataType match { case st: StringType => st @@ -1163,7 +1163,8 @@ case class Unhex(child: Expression, failOnError: Boolean = false) def this(expr: Expression) = this(expr, false) - override def inputTypes: Seq[AbstractDataType] = Seq(StringTypeWithCollation) + override def inputTypes: Seq[AbstractDataType] = + Seq(StringTypeWithCollation(supportsTrimCollation = true)) override def nullable: Boolean = true override def dataType: DataType = BinaryType diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/misc.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/misc.scala index 5f1b3dc0a01ac..fb30eab327d4c 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/misc.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/misc.scala @@ -27,7 +27,7 @@ import org.apache.spark.sql.catalyst.util.{MapData, RandomUUIDGenerator} import org.apache.spark.sql.errors.QueryCompilationErrors import org.apache.spark.sql.errors.QueryExecutionErrors.raiseError import org.apache.spark.sql.internal.SQLConf -import org.apache.spark.sql.internal.types.StringTypeWithCollation +import org.apache.spark.sql.internal.types.{AbstractMapType, StringTypeWithCollation} import org.apache.spark.sql.types._ import 
org.apache.spark.unsafe.types.UTF8String @@ -85,7 +85,12 @@ case class RaiseError(errorClass: Expression, errorParms: Expression, dataType: override def foldable: Boolean = false override def nullable: Boolean = true override def inputTypes: Seq[AbstractDataType] = - Seq(StringTypeWithCollation, MapType(StringType, StringType)) + Seq( + StringTypeWithCollation(supportsTrimCollation = true), + AbstractMapType( + StringTypeWithCollation(supportsTrimCollation = true), + StringTypeWithCollation(supportsTrimCollation = true) + )) override def left: Expression = errorClass override def right: Expression = errorParms @@ -416,8 +421,8 @@ case class AesEncrypt( override def inputTypes: Seq[AbstractDataType] = Seq(BinaryType, BinaryType, - StringTypeWithCollation, - StringTypeWithCollation, + StringTypeWithCollation(supportsTrimCollation = true), + StringTypeWithCollation(supportsTrimCollation = true), BinaryType, BinaryType) override def children: Seq[Expression] = Seq(input, key, mode, padding, iv, aad) @@ -493,8 +498,8 @@ case class AesDecrypt( override def inputTypes: Seq[AbstractDataType] = { Seq(BinaryType, BinaryType, - StringTypeWithCollation, - StringTypeWithCollation, BinaryType) + StringTypeWithCollation(supportsTrimCollation = true), + StringTypeWithCollation(supportsTrimCollation = true), BinaryType) } override def prettyName: String = "aes_decrypt" diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/numberFormatExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/numberFormatExpressions.scala index d4dcfdc5e72fb..fd6399d65271e 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/numberFormatExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/numberFormatExpressions.scala @@ -51,7 +51,9 @@ abstract class ToNumberBase(left: Expression, right: Expression, errorOnFail: Bo } override def inputTypes: Seq[AbstractDataType] = - 
Seq(StringTypeWithCollation, StringTypeWithCollation) + Seq( + StringTypeWithCollation(supportsTrimCollation = true), + StringTypeWithCollation(supportsTrimCollation = true)) override def checkInputDataTypes(): TypeCheckResult = { val inputTypeCheck = super.checkInputDataTypes() @@ -288,7 +290,7 @@ case class ToCharacter(left: Expression, right: Expression) override def dataType: DataType = SQLConf.get.defaultStringType override def inputTypes: Seq[AbstractDataType] = - Seq(DecimalType, StringTypeWithCollation) + Seq(DecimalType, StringTypeWithCollation(supportsTrimCollation = true)) override def checkInputDataTypes(): TypeCheckResult = { val inputTypeCheck = super.checkInputDataTypes() if (inputTypeCheck.isSuccess) { diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/package.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/package.scala index 86d3cee6a0600..114a43c34c040 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/package.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/package.scala @@ -346,7 +346,16 @@ package object expressions { */ def resolve(nameParts: Seq[String], resolver: Resolver): Option[NamedExpression] = { val (candidates, nestedFields) = getCandidatesForResolution(nameParts, resolver) - resolveCandidates(nameParts, resolver, candidates, nestedFields) + val resolvedCandidates = resolveCandidates(nameParts, resolver, candidates, nestedFields) + resolvedCandidates match { + case Seq() => None + case Seq(a) => Some(a) + case _ => + throw QueryCompilationErrors.ambiguousReferenceError( + UnresolvedAttribute(nameParts).name, + resolvedCandidates.map(_.toAttribute) + ) + } } def getCandidatesForResolution( @@ -371,7 +380,7 @@ package object expressions { nameParts: Seq[String], resolver: Resolver, candidates: Seq[Attribute], - nestedFields: Seq[String]): Option[NamedExpression] = { + nestedFields: Seq[String]): 
Seq[NamedExpression] = { def name = UnresolvedAttribute(nameParts).name // We may have resolved the attributes from metadata columns. The resolved attributes will be // put in a logical plan node and becomes normal attributes. They can still keep the special @@ -389,19 +398,19 @@ package object expressions { val fieldExprs = nestedFields.foldLeft(a: Expression) { (e, name) => ExtractValue(e, Literal(name), resolver) } - Some(Alias(fieldExprs, nestedFields.last)()) + Seq(Alias(fieldExprs, nestedFields.last)()) case Seq(a) => // One match, no nested fields, use it. - Some(a) + Seq(a) case Seq() => // No matches. - None + Seq() case ambiguousReferences => // More than one match. - throw QueryCompilationErrors.ambiguousReferenceError(name, ambiguousReferences) + ambiguousReferences } } } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/pipeOperators.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/pipeOperators.scala index 1b5ee54729136..2ee68663ad2fd 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/pipeOperators.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/pipeOperators.scala @@ -18,7 +18,11 @@ package org.apache.spark.sql.catalyst.expressions import org.apache.spark.sql.catalyst.expressions.aggregate.AggregateFunction +import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, UnaryNode} +import org.apache.spark.sql.catalyst.rules.Rule +import org.apache.spark.sql.catalyst.trees.TreePattern.{PIPE_EXPRESSION, PIPE_OPERATOR, TreePattern} import org.apache.spark.sql.errors.QueryCompilationErrors +import org.apache.spark.sql.types.DataType /** * Represents an expression when used with a SQL pipe operator. @@ -30,19 +34,56 @@ import org.apache.spark.sql.errors.QueryCompilationErrors * @param clause The clause of the pipe operator. This is used to generate error messages. 
*/ case class PipeExpression(child: Expression, isAggregate: Boolean, clause: String) - extends UnaryExpression with RuntimeReplaceable { + extends UnaryExpression with Unevaluable { + final override val nodePatterns = Seq(PIPE_EXPRESSION) + final override lazy val resolved = false override def withNewChildInternal(newChild: Expression): Expression = PipeExpression(newChild, isAggregate, clause) - override lazy val replacement: Expression = { - val firstAggregateFunction: Option[AggregateFunction] = findFirstAggregate(child) - if (isAggregate && firstAggregateFunction.isEmpty) { - throw QueryCompilationErrors.pipeOperatorAggregateExpressionContainsNoAggregateFunction(child) - } else if (!isAggregate) { - firstAggregateFunction.foreach { a => - throw QueryCompilationErrors.pipeOperatorContainsAggregateFunction(a, clause) + override def dataType: DataType = child.dataType +} + +/** + * Represents the location within a logical plan that a SQL pipe operator appeared. + * This acts as a logical boundary that works to prevent the analyzer from modifying the logical + * operators above and below the boundary. + */ +case class PipeOperator(child: LogicalPlan) extends UnaryNode { + final override val nodePatterns: Seq[TreePattern] = Seq(PIPE_OPERATOR) + override def output: Seq[Attribute] = child.output + override def withNewChildInternal(newChild: LogicalPlan): PipeOperator = copy(child = newChild) +} + +/** This rule removes all PipeOperator nodes from a logical plan at the end of analysis. */ +object EliminatePipeOperators extends Rule[LogicalPlan] { + def apply(plan: LogicalPlan): LogicalPlan = plan.transformWithPruning( + _.containsPattern(PIPE_OPERATOR), ruleId) { + case PipeOperator(child) => child + } +} + +/** + * Validates and strips PipeExpression nodes from a logical plan once the child expressions are + * resolved. 
+ */ +object ValidateAndStripPipeExpressions extends Rule[LogicalPlan] { + def apply(plan: LogicalPlan): LogicalPlan = plan.resolveOperatorsUpWithPruning( + _.containsPattern(PIPE_EXPRESSION), ruleId) { + case node: LogicalPlan => + node.resolveExpressions { + case p: PipeExpression if p.child.resolved => + // Once the child expression is resolved, we can perform the necessary invariant checks + // and then remove this expression, replacing it with the child expression instead. + val firstAggregateFunction: Option[AggregateFunction] = findFirstAggregate(p.child) + if (p.isAggregate && firstAggregateFunction.isEmpty) { + throw QueryCompilationErrors + .pipeOperatorAggregateExpressionContainsNoAggregateFunction(p.child) + } else if (!p.isAggregate) { + firstAggregateFunction.foreach { a => + throw QueryCompilationErrors.pipeOperatorContainsAggregateFunction(a, p.clause) + } + } + p.child } - } - child } /** Returns the first aggregate function in the given expression, or None if not found. */ @@ -67,6 +108,7 @@ object PipeOperators { val offsetClause = "OFFSET" val orderByClause = "ORDER BY" val selectClause = "SELECT" + val setClause = "SET" val sortByClause = "SORT BY" val sortByDistributeByClause = "SORT BY ... DISTRIBUTE BY ..." 
val windowClause = "WINDOW" diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala index 986bc63363d5d..d8d81a9cc12f8 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala @@ -32,6 +32,7 @@ import org.apache.spark.sql.catalyst.expressions.codegen.Block._ import org.apache.spark.sql.catalyst.plans.logical.{Aggregate, LeafNode, LogicalPlan, Project, Union} import org.apache.spark.sql.catalyst.trees.TreePattern._ import org.apache.spark.sql.catalyst.util.{CollationFactory, TypeUtils} +import org.apache.spark.sql.catalyst.util.SparkStringUtils.truncatedString import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types._ import org.apache.spark.unsafe.types.UTF8String @@ -487,7 +488,10 @@ case class In(value: Expression, list: Seq[Expression]) extends Predicate { } } - override def toString: String = s"$value IN ${list.mkString("(", ",", ")")}" + override def simpleString(maxFields: Int): String = + s"$value IN ${truncatedString(list, "(", ",", ")", maxFields)}" + + override def toString: String = simpleString(Int.MaxValue) override def eval(input: InternalRow): Any = { if (list.isEmpty && !legacyNullInEmptyBehavior) { @@ -608,15 +612,29 @@ case class InSet(child: Expression, hset: Set[Any]) extends UnaryExpression with require(hset != null, "hset could not be null") - override def toString: String = { - val listString = hset.toSeq - .map(elem => Literal(elem, child.dataType).toString) - // Sort elements for deterministic behaviours - .sorted - .mkString(", ") - s"$child INSET $listString" + override def simpleString(maxFields: Int): String = { + if (!child.resolved) { + return s"$child INSET (values with unresolved data types)" + } + if (hset.size <= maxFields) { + val listString = 
hset.toSeq + .map(elem => Literal(elem, child.dataType).toString) + // Sort elements for deterministic behaviours + .sorted + .mkString(", ") + s"$child INSET $listString" + } else { + // Skip sorting if there are many elements. Do not use truncatedString because we would have + // to convert elements we do not print to Literals. + val listString = hset.take(maxFields).toSeq + .map(elem => Literal(elem, child.dataType).toString) + .mkString(", ") + s"$child INSET $listString, ... ${hset.size - maxFields} more fields" + } } + override def toString: String = simpleString(Int.MaxValue) + @transient private[this] lazy val hasNull: Boolean = hset.contains(null) @transient private[this] lazy val isNaN: Any => Boolean = child.dataType match { case DoubleType => (value: Any) => java.lang.Double.isNaN(value.asInstanceOf[java.lang.Double]) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/randomExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/randomExpressions.scala index 7148d3738f7fa..50c699ef69bd6 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/randomExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/randomExpressions.scala @@ -21,12 +21,12 @@ import org.apache.spark.SparkException import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.analysis.{TypeCheckResult, UnresolvedSeed} import org.apache.spark.sql.catalyst.analysis.TypeCheckResult.DataTypeMismatch -import org.apache.spark.sql.catalyst.expressions.ExpectsInputTypes.{ordinalNumber, toSQLExpr, toSQLId, toSQLType} +import org.apache.spark.sql.catalyst.expressions.ExpectsInputTypes.{toSQLExpr, toSQLId} import org.apache.spark.sql.catalyst.expressions.codegen.{CodegenContext, CodeGenerator, ExprCode, FalseLiteral} import org.apache.spark.sql.catalyst.expressions.codegen.Block._ import org.apache.spark.sql.catalyst.trees.{BinaryLike, 
TernaryLike, UnaryLike} import org.apache.spark.sql.catalyst.trees.TreePattern.{EXPRESSION_WITH_RANDOM_SEED, RUNTIME_REPLACEABLE, TreePattern} -import org.apache.spark.sql.errors.QueryCompilationErrors +import org.apache.spark.sql.errors.{QueryCompilationErrors, QueryExecutionErrors} import org.apache.spark.sql.types._ import org.apache.spark.util.random.XORShiftRandom @@ -206,7 +206,7 @@ object Randn { since = "4.0.0", group = "math_funcs") case class Uniform(min: Expression, max: Expression, seedExpression: Expression, hideSeed: Boolean) - extends RuntimeReplaceable with TernaryLike[Expression] with RDG { + extends RuntimeReplaceable with TernaryLike[Expression] with RDG with ExpectsInputTypes { def this(min: Expression, max: Expression) = this(min, max, UnresolvedSeed, hideSeed = true) def this(min: Expression, max: Expression, seedExpression: Expression) = @@ -216,63 +216,46 @@ case class Uniform(min: Expression, max: Expression, seedExpression: Expression, override val nodePatterns: Seq[TreePattern] = Seq(RUNTIME_REPLACEABLE, EXPRESSION_WITH_RANDOM_SEED) + override def inputTypes: Seq[AbstractDataType] = { + val randomSeedTypes = TypeCollection(IntegerType, LongType) + Seq(NumericType, NumericType, randomSeedTypes) + } + override def dataType: DataType = { - val first = min.dataType - val second = max.dataType (min.dataType, max.dataType) match { case _ if !seedExpression.resolved || seedExpression.dataType == NullType => NullType - case (_, NullType) | (NullType, _) => NullType - case (_, LongType) | (LongType, _) - if Seq(first, second).forall(integer) => LongType - case (_, IntegerType) | (IntegerType, _) - if Seq(first, second).forall(integer) => IntegerType - case (_, ShortType) | (ShortType, _) - if Seq(first, second).forall(integer) => ShortType - case (_, DoubleType) | (DoubleType, _) => DoubleType - case (_, FloatType) | (FloatType, _) => FloatType + case (left: IntegralType, right: IntegralType) => + if (UpCastRule.legalNumericPrecedence(left, 
right)) right else left + case (_: NumericType, DoubleType) | (DoubleType, _: NumericType) => DoubleType + case (_: NumericType, FloatType) | (FloatType, _: NumericType) => FloatType + case (lhs: DecimalType, rhs: DecimalType) => if (lhs.isWiderThan(rhs)) lhs else rhs + case (_, d: DecimalType) => d + case (d: DecimalType, _) => d case _ => throw SparkException.internalError( s"Unexpected argument data types: ${min.dataType}, ${max.dataType}") } } - private def integer(t: DataType): Boolean = t match { - case _: ShortType | _: IntegerType | _: LongType => true - case _ => false - } - override def sql: String = { s"uniform(${min.sql}, ${max.sql}${if (hideSeed) "" else s", ${seedExpression.sql}"})" } override def checkInputDataTypes(): TypeCheckResult = { - var result: TypeCheckResult = TypeCheckResult.TypeCheckSuccess + var result: TypeCheckResult = super.checkInputDataTypes() def requiredType = "integer or floating-point" - Seq((min, "min", 0), - (max, "max", 1), - (seedExpression, "seed", 2)).foreach { - case (expr: Expression, name: String, index: Int) => - if (result == TypeCheckResult.TypeCheckSuccess) { - if (!expr.foldable) { - result = DataTypeMismatch( - errorSubClass = "NON_FOLDABLE_INPUT", - messageParameters = Map( - "inputName" -> toSQLId(name), - "inputType" -> requiredType, - "inputExpr" -> toSQLExpr(expr))) - } else expr.dataType match { - case _: ShortType | _: IntegerType | _: LongType | _: FloatType | _: DoubleType | - _: NullType => - case _ => - result = DataTypeMismatch( - errorSubClass = "UNEXPECTED_INPUT_TYPE", - messageParameters = Map( - "paramIndex" -> ordinalNumber(index), - "requiredType" -> requiredType, - "inputSql" -> toSQLExpr(expr), - "inputType" -> toSQLType(expr.dataType))) - } + Seq((min, "min"), + (max, "max"), + (seedExpression, "seed")).foreach { + case (expr: Expression, name: String) => + if (result == TypeCheckResult.TypeCheckSuccess && !expr.foldable) { + result = DataTypeMismatch( + errorSubClass = "NON_FOLDABLE_INPUT", + 
messageParameters = Map( + "inputName" -> toSQLId(name), + "inputType" -> requiredType, + "inputExpr" -> toSQLExpr(expr))) } } result @@ -330,7 +313,8 @@ object Uniform { group = "string_funcs") case class RandStr( length: Expression, override val seedExpression: Expression, hideSeed: Boolean) - extends ExpressionWithRandomSeed with BinaryLike[Expression] with Nondeterministic { + extends ExpressionWithRandomSeed with BinaryLike[Expression] with Nondeterministic + with ExpectsInputTypes { def this(length: Expression) = this(length, UnresolvedSeed, hideSeed = true) def this(length: Expression, seedExpression: Expression) = @@ -342,6 +326,10 @@ case class RandStr( override def left: Expression = length override def right: Expression = seedExpression + override def inputTypes: Seq[AbstractDataType] = Seq( + IntegerType, + TypeCollection(IntegerType, LongType)) + /** * Record ID within each partition. By being transient, the Random Number Generator is * reset every time we serialize and deserialize and initialize it. 
@@ -366,52 +354,48 @@ case class RandStr( } override def checkInputDataTypes(): TypeCheckResult = { - var result: TypeCheckResult = TypeCheckResult.TypeCheckSuccess - def requiredType = "INT or SMALLINT" - Seq((length, "length", 0), - (seedExpression, "seed", 1)).foreach { - case (expr: Expression, name: String, index: Int) => - if (result == TypeCheckResult.TypeCheckSuccess) { - if (!expr.foldable) { - result = DataTypeMismatch( - errorSubClass = "NON_FOLDABLE_INPUT", - messageParameters = Map( - "inputName" -> toSQLId(name), - "inputType" -> requiredType, - "inputExpr" -> toSQLExpr(expr))) - } else expr.dataType match { - case _: ShortType | _: IntegerType => - case _: LongType if index == 1 => - case _ => - result = DataTypeMismatch( - errorSubClass = "UNEXPECTED_INPUT_TYPE", - messageParameters = Map( - "paramIndex" -> ordinalNumber(index), - "requiredType" -> requiredType, - "inputSql" -> toSQLExpr(expr), - "inputType" -> toSQLType(expr.dataType))) - } + var result: TypeCheckResult = super.checkInputDataTypes() + Seq((length, "length"), + (seedExpression, "seed")).foreach { + case (expr: Expression, name: String) => + if (result == TypeCheckResult.TypeCheckSuccess && !expr.foldable) { + result = DataTypeMismatch( + errorSubClass = "NON_FOLDABLE_INPUT", + messageParameters = Map( + "inputName" -> toSQLId(name), + "inputType" -> "integer", + "inputExpr" -> toSQLExpr(expr))) } } result } override def evalInternal(input: InternalRow): Any = { - val numChars = length.eval(input).asInstanceOf[Number].intValue() + val numChars = lengthInteger() ExpressionImplUtils.randStr(rng, numChars) } + private def lengthInteger(): Int = { + // We should have already added a cast to IntegerType (if necessary) in + // FunctionArgumentTypeCoercion. 
+ assert(length.dataType == IntegerType, s"Expected IntegerType, got ${length.dataType}") + val result = length.eval().asInstanceOf[Int] + if (result < 0) { + throw QueryExecutionErrors.unexpectedValueForLengthInFunctionError(prettyName, result) + } + result + } + override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { val className = classOf[XORShiftRandom].getName val rngTerm = ctx.addMutableState(className, "rng") ctx.addPartitionInitializationStatement( s"$rngTerm = new $className(${seed}L + partitionIndex);") - val eval = length.genCode(ctx) + val numChars = lengthInteger() ev.copy(code = code""" - |${eval.code} |UTF8String ${ev.value} = - | ${classOf[ExpressionImplUtils].getName}.randStr($rngTerm, (int)(${eval.value})); + | ${classOf[ExpressionImplUtils].getName}.randStr($rngTerm, $numChars); |boolean ${ev.isNull} = false; |""".stripMargin, isNull = FalseLiteral) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala index c97920619ba4d..efd7e5c07de40 100755 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala @@ -1863,7 +1863,7 @@ trait PadExpressionBuilderBase extends ExpressionBuilder { BinaryPad(funcName, expressions(0), expressions(1), Literal(Array[Byte](0))) } else { createStringPad(expressions(0), - expressions(1), Literal.create(" ", SQLConf.get.defaultStringType)) + expressions(1), Literal(" ")) } } else if (numArgs == 3) { if (expressions(0).dataType == BinaryType && expressions(2).dataType == BinaryType @@ -3557,9 +3557,9 @@ case class Sentences( ArrayType(ArrayType(str.dataType, containsNull = false), containsNull = false) override def inputTypes: Seq[AbstractDataType] = Seq( - StringTypeWithCollation, - StringTypeWithCollation, - 
StringTypeWithCollation + StringTypeWithCollation(supportsTrimCollation = true), + StringTypeWithCollation(supportsTrimCollation = true), + StringTypeWithCollation(supportsTrimCollation = true) ) override def first: Expression = str override def second: Expression = language diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/subquery.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/subquery.scala index bd6f65b61468d..c0a2bf25fbe67 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/subquery.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/subquery.scala @@ -19,11 +19,10 @@ package org.apache.spark.sql.catalyst.expressions import scala.collection.mutable.ArrayBuffer -import org.apache.spark.sql.catalyst.analysis.{LazyOuterReference, UnresolvedOuterReference} +import org.apache.spark.sql.catalyst.analysis.UnresolvedPlanId import org.apache.spark.sql.catalyst.expressions.aggregate.AggregateExpression import org.apache.spark.sql.catalyst.plans._ import org.apache.spark.sql.catalyst.plans.logical._ -import org.apache.spark.sql.catalyst.trees.TreePattern import org.apache.spark.sql.catalyst.trees.TreePattern._ import org.apache.spark.sql.errors.QueryCompilationErrors import org.apache.spark.sql.internal.SQLConf @@ -374,13 +373,6 @@ object SubExprUtils extends PredicateHelper { val nonEquivalentGroupByExprs = groupByExprs -- correlatedEquivalentExprs nonEquivalentGroupByExprs } - - def removeLazyOuterReferences(logicalPlan: LogicalPlan): LogicalPlan = { - logicalPlan.transformAllExpressionsWithPruning( - _.containsPattern(TreePattern.LAZY_OUTER_REFERENCE)) { - case or: LazyOuterReference => UnresolvedOuterReference(or.nameParts) - } - } } /** @@ -407,8 +399,7 @@ case class ScalarSubquery( joinCond: Seq[Expression] = Seq.empty, hint: Option[HintInfo] = None, mayHaveCountBug: Option[Boolean] = None, - needSingleJoin: Option[Boolean] = None, - 
hasExplicitOuterRefs: Boolean = false) + needSingleJoin: Option[Boolean] = None) extends SubqueryExpression(plan, outerAttrs, exprId, joinCond, hint) with Unevaluable { override def dataType: DataType = { if (!plan.schema.fields.nonEmpty) { @@ -449,6 +440,14 @@ object ScalarSubquery { } } +case class UnresolvedScalarSubqueryPlanId(planId: Long) + extends UnresolvedPlanId { + + override def withPlan(plan: LogicalPlan): Expression = { + ScalarSubquery(plan) + } +} + /** * A subquery that can return multiple rows and columns. This should be rewritten as a join * with the outer query during the optimization phase. @@ -577,8 +576,7 @@ case class Exists( outerAttrs: Seq[Expression] = Seq.empty, exprId: ExprId = NamedExpression.newExprId, joinCond: Seq[Expression] = Seq.empty, - hint: Option[HintInfo] = None, - hasExplicitOuterRefs: Boolean = false) + hint: Option[HintInfo] = None) extends SubqueryExpression(plan, outerAttrs, exprId, joinCond, hint) with Predicate with Unevaluable { @@ -603,3 +601,11 @@ case class Exists( final override def nodePatternsInternal(): Seq[TreePattern] = Seq(EXISTS_SUBQUERY) } + +case class UnresolvedExistsPlanId(planId: Long) + extends UnresolvedPlanId { + + override def withPlan(plan: LogicalPlan): Expression = { + Exists(plan) + } +} diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/urlExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/urlExpressions.scala index 22dcd33937dfb..845ca0b608ef3 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/urlExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/urlExpressions.scala @@ -57,13 +57,14 @@ case class UrlEncode(child: Expression) SQLConf.get.defaultStringType, "encode", Seq(child), - Seq(StringTypeWithCollation)) + Seq(StringTypeWithCollation(supportsTrimCollation = true))) override protected def withNewChildInternal(newChild: Expression): Expression = 
{ copy(child = newChild) } - override def inputTypes: Seq[AbstractDataType] = Seq(StringTypeWithCollation) + override def inputTypes: Seq[AbstractDataType] = + Seq(StringTypeWithCollation(supportsTrimCollation = true)) override def prettyName: String = "url_encode" } @@ -96,13 +97,14 @@ case class UrlDecode(child: Expression, failOnError: Boolean = true) SQLConf.get.defaultStringType, "decode", Seq(child, Literal(failOnError)), - Seq(StringTypeWithCollation, BooleanType)) + Seq(StringTypeWithCollation(supportsTrimCollation = true), BooleanType)) override protected def withNewChildInternal(newChild: Expression): Expression = { copy(child = newChild) } - override def inputTypes: Seq[AbstractDataType] = Seq(StringTypeWithCollation) + override def inputTypes: Seq[AbstractDataType] = + Seq(StringTypeWithCollation(supportsTrimCollation = true)) override def prettyName: String = "url_decode" } @@ -211,7 +213,7 @@ case class ParseUrl( override def nullable: Boolean = true override def inputTypes: Seq[AbstractDataType] = - Seq.fill(children.size)(StringTypeWithCollation) + Seq.fill(children.size)(StringTypeWithCollation(supportsTrimCollation = true)) override def dataType: DataType = SQLConf.get.defaultStringType override def prettyName: String = "parse_url" diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/variant/variantExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/variant/variantExpressions.scala index 06aec93912984..ff8b168793b5d 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/variant/variantExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/variant/variantExpressions.scala @@ -66,7 +66,8 @@ case class ParseJson(child: Expression, failOnError: Boolean = true) inputTypes :+ BooleanType :+ BooleanType, returnNullable = !failOnError) - override def inputTypes: Seq[AbstractDataType] = StringTypeWithCollation :: Nil + 
override def inputTypes: Seq[AbstractDataType] = + StringTypeWithCollation(supportsTrimCollation = true) :: Nil override def dataType: DataType = VariantType @@ -183,33 +184,37 @@ case class ToVariantObject(child: Expression) } } -object VariantPathParser extends RegexParsers { - // A path segment in the `VariantGet` expression represents either an object key access or an - // array index access. - type PathSegment = Either[String, Int] +// A path segment in the `VariantGet` expression represents either an object key access or an array +// index access. +sealed abstract class VariantPathSegment extends Serializable + +case class ObjectExtraction(key: String) extends VariantPathSegment + +case class ArrayExtraction(index: Int) extends VariantPathSegment +object VariantPathParser extends RegexParsers { private def root: Parser[Char] = '$' // Parse index segment like `[123]`. - private def index: Parser[PathSegment] = + private def index: Parser[VariantPathSegment] = for { index <- '[' ~> "\\d+".r <~ ']' } yield { - scala.util.Right(index.toInt) + ArrayExtraction(index.toInt) } // Parse key segment like `.name`, `['name']`, or `["name"]`. - private def key: Parser[PathSegment] = + private def key: Parser[VariantPathSegment] = for { key <- '.' 
~> "[^\\.\\[]+".r | "['" ~> "[^\\'\\?]+".r <~ "']" | "[\"" ~> "[^\\\"\\?]+".r <~ "\"]" } yield { - scala.util.Left(key) + ObjectExtraction(key) } - private val parser: Parser[List[PathSegment]] = phrase(root ~> rep(key | index)) + private val parser: Parser[List[VariantPathSegment]] = phrase(root ~> rep(key | index)) - def parse(str: String): Option[Array[PathSegment]] = { + def parse(str: String): Option[Array[VariantPathSegment]] = { this.parseAll(parser, str) match { case Success(result, _) => Some(result.toArray) case _ => None @@ -270,21 +275,20 @@ case class VariantGet( final override def nodePatternsInternal(): Seq[TreePattern] = Seq(VARIANT_GET) override def inputTypes: Seq[AbstractDataType] = - Seq(VariantType, StringTypeWithCollation) + Seq(VariantType, StringTypeWithCollation(supportsTrimCollation = true)) override def prettyName: String = if (failOnError) "variant_get" else "try_variant_get" override def nullable: Boolean = true override def nullIntolerant: Boolean = true + private lazy val castArgs = VariantCastArgs( + failOnError, + timeZoneId, + zoneId) + protected override def nullSafeEval(input: Any, path: Any): Any = { - VariantGet.variantGet( - input.asInstanceOf[VariantVal], - parsedPath, - dataType, - failOnError, - timeZoneId, - zoneId) + VariantGet.variantGet(input.asInstanceOf[VariantVal], parsedPath, dataType, castArgs) } protected override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { @@ -292,15 +296,14 @@ case class VariantGet( val tmp = ctx.freshVariable("tmp", classOf[Object]) val parsedPathArg = ctx.addReferenceObj("parsedPath", parsedPath) val dataTypeArg = ctx.addReferenceObj("dataType", dataType) - val zoneStrArg = ctx.addReferenceObj("zoneStr", timeZoneId) - val zoneIdArg = ctx.addReferenceObj("zoneId", zoneId, classOf[ZoneId].getName) + val castArgsArg = ctx.addReferenceObj("castArgs", castArgs) val code = code""" ${childCode.code} boolean ${ev.isNull} = ${childCode.isNull}; ${CodeGenerator.javaType(dataType)} 
${ev.value} = ${CodeGenerator.defaultValue(dataType)}; if (!${ev.isNull}) { Object $tmp = org.apache.spark.sql.catalyst.expressions.variant.VariantGet.variantGet( - ${childCode.value}, $parsedPathArg, $dataTypeArg, $failOnError, $zoneStrArg, $zoneIdArg); + ${childCode.value}, $parsedPathArg, $dataTypeArg, $castArgsArg); if ($tmp == null) { ${ev.isNull} = true; } else { @@ -322,6 +325,12 @@ case class VariantGet( override def withTimeZone(timeZoneId: String): VariantGet = copy(timeZoneId = Option(timeZoneId)) } +// Several parameters used by `VariantGet.cast`. Packed together to simplify parameter passing. +case class VariantCastArgs( + failOnError: Boolean, + zoneStr: Option[String], + zoneId: ZoneId) + case object VariantGet { /** * Returns whether a data type can be cast into/from variant. For scalar types, we allow a subset @@ -329,6 +338,7 @@ case object VariantGet { */ def checkDataType(dataType: DataType, allowStructsAndMaps: Boolean = true): Boolean = dataType match { + case CharType(_) | VarcharType(_) => false case _: NumericType | BooleanType | _: StringType | BinaryType | _: DatetimeType | VariantType => true @@ -343,35 +353,28 @@ case object VariantGet { /** The actual implementation of the `VariantGet` expression. 
*/ def variantGet( input: VariantVal, - parsedPath: Array[VariantPathParser.PathSegment], + parsedPath: Array[VariantPathSegment], dataType: DataType, - failOnError: Boolean, - zoneStr: Option[String], - zoneId: ZoneId): Any = { + castArgs: VariantCastArgs): Any = { var v = new Variant(input.getValue, input.getMetadata) for (path <- parsedPath) { v = path match { - case scala.util.Left(key) if v.getType == Type.OBJECT => v.getFieldByKey(key) - case scala.util.Right(index) if v.getType == Type.ARRAY => v.getElementAtIndex(index) + case ObjectExtraction(key) if v.getType == Type.OBJECT => v.getFieldByKey(key) + case ArrayExtraction(index) if v.getType == Type.ARRAY => v.getElementAtIndex(index) case _ => null } if (v == null) return null } - VariantGet.cast(v, dataType, failOnError, zoneStr, zoneId) + VariantGet.cast(v, dataType, castArgs) } /** * A simple wrapper of the `cast` function that takes `Variant` rather than `VariantVal`. The * `Cast` expression uses it and makes the implementation simpler. */ - def cast( - input: VariantVal, - dataType: DataType, - failOnError: Boolean, - zoneStr: Option[String], - zoneId: ZoneId): Any = { + def cast(input: VariantVal, dataType: DataType, castArgs: VariantCastArgs): Any = { val v = new Variant(input.getValue, input.getMetadata) - VariantGet.cast(v, dataType, failOnError, zoneStr, zoneId) + VariantGet.cast(v, dataType, castArgs) } /** @@ -381,15 +384,10 @@ case object VariantGet { * "hello" to int). If the cast fails, throw an exception when `failOnError` is true, or return a * SQL NULL when it is false. 
*/ - def cast( - v: Variant, - dataType: DataType, - failOnError: Boolean, - zoneStr: Option[String], - zoneId: ZoneId): Any = { + def cast(v: Variant, dataType: DataType, castArgs: VariantCastArgs): Any = { def invalidCast(): Any = { - if (failOnError) { - throw QueryExecutionErrors.invalidVariantCast(v.toJson(zoneId), dataType) + if (castArgs.failOnError) { + throw QueryExecutionErrors.invalidVariantCast(v.toJson(castArgs.zoneId), dataType) } else { null } @@ -409,7 +407,7 @@ case object VariantGet { val input = variantType match { case Type.OBJECT | Type.ARRAY => return if (dataType.isInstanceOf[StringType]) { - UTF8String.fromString(v.toJson(zoneId)) + UTF8String.fromString(v.toJson(castArgs.zoneId)) } else { invalidCast() } @@ -433,29 +431,20 @@ case object VariantGet { messageParameters = Map("id" -> v.getTypeInfo.toString) ) } - // We mostly use the `Cast` expression to implement the cast. However, `Cast` silently - // ignores the overflow in the long/decimal -> timestamp cast, and we want to enforce - // strict overflow checks. 
input.dataType match { case LongType if dataType == TimestampType => - try Math.multiplyExact(input.value.asInstanceOf[Long], MICROS_PER_SECOND) + try castLongToTimestamp(input.value.asInstanceOf[Long]) catch { case _: ArithmeticException => invalidCast() } case _: DecimalType if dataType == TimestampType => - try { - input.value - .asInstanceOf[Decimal] - .toJavaBigDecimal - .multiply(new java.math.BigDecimal(MICROS_PER_SECOND)) - .toBigInteger - .longValueExact() - } catch { + try castDecimalToTimestamp(input.value.asInstanceOf[Decimal]) + catch { case _: ArithmeticException => invalidCast() } case _ => if (Cast.canAnsiCast(input.dataType, dataType)) { - val result = Cast(input, dataType, zoneStr, EvalMode.TRY).eval() + val result = Cast(input, dataType, castArgs.zoneStr, EvalMode.TRY).eval() if (result == null) invalidCast() else result } else { invalidCast() @@ -466,7 +455,7 @@ case object VariantGet { val size = v.arraySize() val array = new Array[Any](size) for (i <- 0 until size) { - array(i) = cast(v.getElementAtIndex(i), elementType, failOnError, zoneStr, zoneId) + array(i) = cast(v.getElementAtIndex(i), elementType, castArgs) } new GenericArrayData(array) } else { @@ -480,7 +469,7 @@ case object VariantGet { for (i <- 0 until size) { val field = v.getFieldAtIndex(i) keyArray(i) = UTF8String.fromString(field.key) - valueArray(i) = cast(field.value, valueType, failOnError, zoneStr, zoneId) + valueArray(i) = cast(field.value, valueType, castArgs) } ArrayBasedMapData(keyArray, valueArray) } else { @@ -493,8 +482,7 @@ case object VariantGet { val field = v.getFieldAtIndex(i) st.getFieldIndex(field.key) match { case Some(idx) => - row.update(idx, - cast(field.value, fields(idx).dataType, failOnError, zoneStr, zoneId)) + row.update(idx, cast(field.value, fields(idx).dataType, castArgs)) case _ => } } @@ -504,6 +492,27 @@ case object VariantGet { } } } + + // We mostly use the `Cast` expression to implement the cast, but we need some custom logic for + // certain 
type combinations. + // + // `castLongToTimestamp/castDecimalToTimestamp`: `Cast` silently ignores the overflow in the + // long/decimal -> timestamp cast, and we want to enforce strict overflow checks. They both throw + // an `ArithmeticException` when overflow happens. + def castLongToTimestamp(input: Long): Long = + Math.multiplyExact(input, MICROS_PER_SECOND) + + def castDecimalToTimestamp(input: Decimal): Long = { + val multiplier = new java.math.BigDecimal(MICROS_PER_SECOND) + input.toJavaBigDecimal.multiply(multiplier).toBigInteger.longValueExact() + } + + // Cast decimal to string, but strip any trailing zeros. We don't have to call it if the decimal + // is returned by `Variant.getDecimal`, which already strips any trailing zeros. But we need it + // if the decimal is produced by Spark internally, e.g., on a shredded decimal produced by the + // Spark Parquet reader. + def castDecimalToString(input: Decimal): UTF8String = + UTF8String.fromString(input.toJavaBigDecimal.stripTrailingZeros.toPlainString) } abstract class ParseJsonExpressionBuilderBase(failOnError: Boolean) extends ExpressionBuilder { diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/xml/XmlExpressionEvalUtils.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/xml/XmlExpressionEvalUtils.scala index dff88475327a2..44b98026d62d5 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/xml/XmlExpressionEvalUtils.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/xml/XmlExpressionEvalUtils.scala @@ -17,9 +17,10 @@ package org.apache.spark.sql.catalyst.expressions.xml +import org.apache.spark.sql.catalyst.util.GenericArrayData import org.apache.spark.sql.catalyst.xml.XmlInferSchema import org.apache.spark.sql.internal.SQLConf -import org.apache.spark.sql.types.{ArrayType, DataType, StructType} +import org.apache.spark.sql.types._ import org.apache.spark.unsafe.types.UTF8String object 
XmlExpressionEvalUtils { @@ -40,3 +41,82 @@ object XmlExpressionEvalUtils { UTF8String.fromString(dataType.sql) } } + +trait XPathEvaluator { + + protected val path: UTF8String + + @transient protected lazy val xpathUtil: UDFXPathUtil = new UDFXPathUtil + + final def evaluate(xml: UTF8String): Any = { + if (xml == null || xml.toString.isEmpty || path == null || path.toString.isEmpty) return null + doEvaluate(xml) + } + + def doEvaluate(xml: UTF8String): Any +} + +case class XPathBooleanEvaluator(path: UTF8String) extends XPathEvaluator { + override def doEvaluate(xml: UTF8String): Any = { + xpathUtil.evalBoolean(xml.toString, path.toString) + } +} + +case class XPathShortEvaluator(path: UTF8String) extends XPathEvaluator { + override def doEvaluate(xml: UTF8String): Any = { + val ret = xpathUtil.evalNumber(xml.toString, path.toString) + if (ret eq null) null.asInstanceOf[Short] else ret.shortValue() + } +} + +case class XPathIntEvaluator(path: UTF8String) extends XPathEvaluator { + override def doEvaluate(xml: UTF8String): Any = { + val ret = xpathUtil.evalNumber(xml.toString, path.toString) + if (ret eq null) null.asInstanceOf[Int] else ret.intValue() + } +} + +case class XPathLongEvaluator(path: UTF8String) extends XPathEvaluator { + override def doEvaluate(xml: UTF8String): Any = { + val ret = xpathUtil.evalNumber(xml.toString, path.toString) + if (ret eq null) null.asInstanceOf[Long] else ret.longValue() + } +} + +case class XPathFloatEvaluator(path: UTF8String) extends XPathEvaluator { + override def doEvaluate(xml: UTF8String): Any = { + val ret = xpathUtil.evalNumber(xml.toString, path.toString) + if (ret eq null) null.asInstanceOf[Float] else ret.floatValue() + } +} + +case class XPathDoubleEvaluator(path: UTF8String) extends XPathEvaluator { + override def doEvaluate(xml: UTF8String): Any = { + val ret = xpathUtil.evalNumber(xml.toString, path.toString) + if (ret eq null) null.asInstanceOf[Double] else ret.doubleValue() + } +} + +case class 
XPathStringEvaluator(path: UTF8String) extends XPathEvaluator { + override def doEvaluate(xml: UTF8String): Any = { + val ret = xpathUtil.evalString(xml.toString, path.toString) + UTF8String.fromString(ret) + } +} + +case class XPathListEvaluator(path: UTF8String) extends XPathEvaluator { + override def doEvaluate(xml: UTF8String): Any = { + val nodeList = xpathUtil.evalNodeList(xml.toString, path.toString) + if (nodeList ne null) { + val ret = new Array[AnyRef](nodeList.getLength) + var i = 0 + while (i < nodeList.getLength) { + ret(i) = UTF8String.fromString(nodeList.item(i).getNodeValue) + i += 1 + } + new GenericArrayData(ret) + } else { + null + } + } +} diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/xml/xpath.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/xml/xpath.scala index 9848e062a08fd..2e591288a21cf 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/xml/xpath.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/xml/xpath.scala @@ -21,8 +21,7 @@ import org.apache.spark.sql.catalyst.analysis.{FunctionRegistry, TypeCheckResult import org.apache.spark.sql.catalyst.analysis.TypeCheckResult.DataTypeMismatch import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.expressions.Cast._ -import org.apache.spark.sql.catalyst.expressions.codegen.CodegenFallback -import org.apache.spark.sql.catalyst.util.GenericArrayData +import org.apache.spark.sql.catalyst.expressions.objects.Invoke import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.internal.types.StringTypeWithCollation import org.apache.spark.sql.types._ @@ -34,16 +33,17 @@ import org.apache.spark.unsafe.types.UTF8String * This is not the world's most efficient implementation due to type conversion, but works. 
*/ abstract class XPathExtract - extends BinaryExpression with ExpectsInputTypes with CodegenFallback { + extends BinaryExpression with RuntimeReplaceable with ExpectsInputTypes { override def left: Expression = xml override def right: Expression = path - override def nullIntolerant: Boolean = true /** XPath expressions are always nullable, e.g. if the xml string is empty. */ override def nullable: Boolean = true override def inputTypes: Seq[AbstractDataType] = - Seq(StringTypeWithCollation, StringTypeWithCollation) + Seq( + StringTypeWithCollation(supportsTrimCollation = true), + StringTypeWithCollation(supportsTrimCollation = true)) override def checkInputDataTypes(): TypeCheckResult = { if (!path.foldable) { @@ -51,7 +51,7 @@ abstract class XPathExtract errorSubClass = "NON_FOLDABLE_INPUT", messageParameters = Map( "inputName" -> toSQLId("path"), - "inputType" -> toSQLType(StringTypeWithCollation), + "inputType" -> toSQLType(StringTypeWithCollation(supportsTrimCollation = true)), "inputExpr" -> toSQLExpr(path) ) ) @@ -60,12 +60,20 @@ abstract class XPathExtract } } - @transient protected lazy val xpathUtil = new UDFXPathUtil - @transient protected lazy val pathString: String = path.eval().asInstanceOf[UTF8String].toString - /** Concrete implementations need to override the following three methods. 
*/ def xml: Expression def path: Expression + + @transient protected lazy val pathUTF8String: UTF8String = path.eval().asInstanceOf[UTF8String] + + protected def evaluator: XPathEvaluator + + override def replacement: Expression = Invoke( + Literal.create(evaluator, ObjectType(classOf[XPathEvaluator])), + "evaluate", + dataType, + Seq(xml), + Seq(xml.dataType)) } // scalastyle:off line.size.limit @@ -81,11 +89,9 @@ abstract class XPathExtract // scalastyle:on line.size.limit case class XPathBoolean(xml: Expression, path: Expression) extends XPathExtract with Predicate { - override def prettyName: String = "xpath_boolean" + @transient override lazy val evaluator: XPathEvaluator = XPathBooleanEvaluator(pathUTF8String) - override def nullSafeEval(xml: Any, path: Any): Any = { - xpathUtil.evalBoolean(xml.asInstanceOf[UTF8String].toString, pathString) - } + override def prettyName: String = "xpath_boolean" override protected def withNewChildrenInternal( newLeft: Expression, newRight: Expression): XPathBoolean = copy(xml = newLeft, path = newRight) @@ -103,14 +109,12 @@ case class XPathBoolean(xml: Expression, path: Expression) extends XPathExtract group = "xml_funcs") // scalastyle:on line.size.limit case class XPathShort(xml: Expression, path: Expression) extends XPathExtract { + + @transient override lazy val evaluator: XPathEvaluator = XPathShortEvaluator(pathUTF8String) + override def prettyName: String = "xpath_short" override def dataType: DataType = ShortType - override def nullSafeEval(xml: Any, path: Any): Any = { - val ret = xpathUtil.evalNumber(xml.asInstanceOf[UTF8String].toString, pathString) - if (ret eq null) null else ret.shortValue() - } - override protected def withNewChildrenInternal( newLeft: Expression, newRight: Expression): XPathShort = copy(xml = newLeft, path = newRight) } @@ -127,14 +131,12 @@ case class XPathShort(xml: Expression, path: Expression) extends XPathExtract { group = "xml_funcs") // scalastyle:on line.size.limit case class 
XPathInt(xml: Expression, path: Expression) extends XPathExtract { + + @transient override lazy val evaluator: XPathEvaluator = XPathIntEvaluator(pathUTF8String) + override def prettyName: String = "xpath_int" override def dataType: DataType = IntegerType - override def nullSafeEval(xml: Any, path: Any): Any = { - val ret = xpathUtil.evalNumber(xml.asInstanceOf[UTF8String].toString, pathString) - if (ret eq null) null else ret.intValue() - } - override protected def withNewChildrenInternal( newLeft: Expression, newRight: Expression): Expression = copy(xml = newLeft, path = newRight) } @@ -151,14 +153,12 @@ case class XPathInt(xml: Expression, path: Expression) extends XPathExtract { group = "xml_funcs") // scalastyle:on line.size.limit case class XPathLong(xml: Expression, path: Expression) extends XPathExtract { + + @transient override lazy val evaluator: XPathEvaluator = XPathLongEvaluator(pathUTF8String) + override def prettyName: String = "xpath_long" override def dataType: DataType = LongType - override def nullSafeEval(xml: Any, path: Any): Any = { - val ret = xpathUtil.evalNumber(xml.asInstanceOf[UTF8String].toString, pathString) - if (ret eq null) null else ret.longValue() - } - override protected def withNewChildrenInternal( newLeft: Expression, newRight: Expression): XPathLong = copy(xml = newLeft, path = newRight) } @@ -175,14 +175,12 @@ case class XPathLong(xml: Expression, path: Expression) extends XPathExtract { group = "xml_funcs") // scalastyle:on line.size.limit case class XPathFloat(xml: Expression, path: Expression) extends XPathExtract { + + @transient override lazy val evaluator: XPathEvaluator = XPathFloatEvaluator(pathUTF8String) + override def prettyName: String = "xpath_float" override def dataType: DataType = FloatType - override def nullSafeEval(xml: Any, path: Any): Any = { - val ret = xpathUtil.evalNumber(xml.asInstanceOf[UTF8String].toString, pathString) - if (ret eq null) null else ret.floatValue() - } - override protected def 
withNewChildrenInternal( newLeft: Expression, newRight: Expression): XPathFloat = copy(xml = newLeft, path = newRight) } @@ -199,15 +197,13 @@ case class XPathFloat(xml: Expression, path: Expression) extends XPathExtract { group = "xml_funcs") // scalastyle:on line.size.limit case class XPathDouble(xml: Expression, path: Expression) extends XPathExtract { + + @transient override lazy val evaluator: XPathEvaluator = XPathDoubleEvaluator(pathUTF8String) + override def prettyName: String = getTagValue(FunctionRegistry.FUNC_ALIAS).getOrElse("xpath_double") override def dataType: DataType = DoubleType - override def nullSafeEval(xml: Any, path: Any): Any = { - val ret = xpathUtil.evalNumber(xml.asInstanceOf[UTF8String].toString, pathString) - if (ret eq null) null else ret.doubleValue() - } - override protected def withNewChildrenInternal( newLeft: Expression, newRight: Expression): XPathDouble = copy(xml = newLeft, path = newRight) } @@ -224,14 +220,12 @@ case class XPathDouble(xml: Expression, path: Expression) extends XPathExtract { group = "xml_funcs") // scalastyle:on line.size.limit case class XPathString(xml: Expression, path: Expression) extends XPathExtract { + + @transient override lazy val evaluator: XPathEvaluator = XPathStringEvaluator(pathUTF8String) + override def prettyName: String = "xpath_string" override def dataType: DataType = SQLConf.get.defaultStringType - override def nullSafeEval(xml: Any, path: Any): Any = { - val ret = xpathUtil.evalString(xml.asInstanceOf[UTF8String].toString, pathString) - UTF8String.fromString(ret) - } - override protected def withNewChildrenInternal( newLeft: Expression, newRight: Expression): Expression = copy(xml = newLeft, path = newRight) } @@ -250,24 +244,12 @@ case class XPathString(xml: Expression, path: Expression) extends XPathExtract { group = "xml_funcs") // scalastyle:on line.size.limit case class XPathList(xml: Expression, path: Expression) extends XPathExtract { + + @transient override lazy val evaluator: 
XPathEvaluator = XPathListEvaluator(pathUTF8String) + override def prettyName: String = "xpath" override def dataType: DataType = ArrayType(SQLConf.get.defaultStringType) - override def nullSafeEval(xml: Any, path: Any): Any = { - val nodeList = xpathUtil.evalNodeList(xml.asInstanceOf[UTF8String].toString, pathString) - if (nodeList ne null) { - val ret = new Array[AnyRef](nodeList.getLength) - var i = 0 - while (i < nodeList.getLength) { - ret(i) = UTF8String.fromString(nodeList.item(i).getNodeValue) - i += 1 - } - new GenericArrayData(ret) - } else { - null - } - } - override protected def withNewChildrenInternal( newLeft: Expression, newRight: Expression): XPathList = copy(xml = newLeft, path = newRight) } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/xmlExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/xmlExpressions.scala index 6f004cbce4262..d8254f04b4d94 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/xmlExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/xmlExpressions.scala @@ -126,7 +126,8 @@ case class XmlToStructs( defineCodeGen(ctx, ev, input => s"(InternalRow) $expr.nullSafeEval($input)") } - override def inputTypes: Seq[AbstractDataType] = StringTypeWithCollation :: Nil + override def inputTypes: Seq[AbstractDataType] = + StringTypeWithCollation(supportsTrimCollation = true) :: Nil override def prettyName: String = "from_xml" @@ -208,8 +209,8 @@ case class SchemaOfXml( dataType, "schemaOfXml", Seq(Literal(xmlInferSchema, xmlInferSchemaObjectType), child), - Seq(xmlInferSchemaObjectType, child.dataType) - ) + Seq(xmlInferSchemaObjectType, child.dataType), + returnNullable = false) } /** diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JacksonParser.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JacksonParser.scala index 13129d44fe0c2..1cd4b4cd29bcf 
100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JacksonParser.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JacksonParser.scala @@ -292,13 +292,9 @@ class JacksonParser( case _: StringType => (parser: JsonParser) => { // This must be enabled if we will retrieve the bytes directly from the raw content: - val includeSourceInLocation = JsonParser.Feature.INCLUDE_SOURCE_IN_LOCATION - val originalMask = if (includeSourceInLocation.enabledIn(parser.getFeatureMask)) { - 1 - } else { - 0 - } - parser.overrideStdFeatures(includeSourceInLocation.getMask, includeSourceInLocation.getMask) + val oldFeature = parser.getFeatureMask + val featureToAdd = JsonParser.Feature.INCLUDE_SOURCE_IN_LOCATION.getMask + parser.overrideStdFeatures(oldFeature | featureToAdd, featureToAdd) val result = parseJsonToken[UTF8String](parser, dataType) { case VALUE_STRING => UTF8String.fromString(parser.getText) @@ -343,8 +339,11 @@ class JacksonParser( UTF8String.fromBytes(writer.toByteArray) } } - // Reset back to the original configuration: - parser.overrideStdFeatures(includeSourceInLocation.getMask, originalMask) + // Reset back to the original configuration using `~0` as the mask, + // which is a bitmask with all bits set, effectively allowing all features + // to be reset. This ensures that every feature is restored to its previous + // state as defined by `oldFeature`. + parser.overrideStdFeatures(oldFeature, ~0) result } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/EliminateSQLFunctionNode.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/EliminateSQLFunctionNode.scala new file mode 100644 index 0000000000000..d9da38b4c2af4 --- /dev/null +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/EliminateSQLFunctionNode.scala @@ -0,0 +1,47 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.catalyst.optimizer + +import org.apache.spark.SparkException +import org.apache.spark.sql.catalyst.analysis.{SQLFunctionExpression, SQLFunctionNode, SQLScalarFunction, SQLTableFunction} +import org.apache.spark.sql.catalyst.plans.logical._ +import org.apache.spark.sql.catalyst.rules.Rule + +/** + * This rule removes [[SQLScalarFunction]] and [[SQLFunctionNode]] wrapper. They are respected + * till the end of analysis stage because we want to see which part of an analyzed logical + * plan is generated from a SQL function and also perform ACL checks. + */ +object EliminateSQLFunctionNode extends Rule[LogicalPlan] { + override def apply(plan: LogicalPlan): LogicalPlan = { + // Include subqueries when eliminating SQL function expressions otherwise we might miss + // expressions in subqueries which can be inlined by the rule `OptimizeOneRowRelationSubquery`. 
+ plan.transformWithSubqueries { + case SQLFunctionNode(_, child) => child + case f: SQLTableFunction => + throw SparkException.internalError( + s"SQL table function plan should be rewritten during analysis: $f") + case p: LogicalPlan => p.transformExpressions { + case f: SQLScalarFunction => f.child + case f: SQLFunctionExpression => + throw SparkException.internalError( + s"SQL function expression should be rewritten during analysis: $f") + } + } + } +} diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/InlineCTE.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/InlineCTE.scala index b3384c4e29566..ad1a1a99b8257 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/InlineCTE.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/InlineCTE.scala @@ -71,53 +71,64 @@ case class InlineCTE( * @param plan The plan to collect the CTEs from * @param cteMap A mutable map that accumulates the CTEs and their reference information by CTE * ids. - * @param collectCTERefs A function to collect CTE references so that the caller side can do some - * bookkeeping work. + * @param outerCTEId While collecting the map we use this optional CTE id to identify the + * current outer CTE. */ private def buildCTEMap( plan: LogicalPlan, cteMap: mutable.Map[Long, CTEReferenceInfo], - collectCTERefs: CTERelationRef => Unit = _ => ()): Unit = { + outerCTEId: Option[Long] = None): Unit = { plan match { case WithCTE(child, cteDefs) => - cteDefs.foreach { cteDef => - cteMap(cteDef.id) = CTEReferenceInfo( - cteDef = cteDef, - refCount = 0, - outgoingRefs = mutable.Map.empty.withDefaultValue(0), - shouldInline = true - ) - } - cteDefs.foreach { cteDef => - buildCTEMap(cteDef, cteMap, ref => { - // A CTE relation can references CTE relations defined before it in the same `WithCTE`. 
- // Here we update the out-going-ref-count for it, in case this CTE relation is not - // referenced at all and can be optimized out, and we need to decrease the ref counts - // for CTE relations that are referenced by it. - if (cteDefs.exists(_.id == ref.cteId)) { - cteMap(cteDef.id).increaseOutgoingRefCount(ref.cteId, 1) - } - // Similarly, a CTE relation can reference CTE relations defined in the outer `WithCTE`. - // Here we call the `collectCTERefs` function so that the outer CTE can also update the - // out-going-ref-count if needed. - collectCTERefs(ref) - }) + val isDuplicated = cteDefs.forall(cteDef => cteMap.contains(cteDef.id)) + if (isDuplicated) { + // If we have seen this `WithCTE` node then it must be self-contained so we can clear + // the references from containers to the definitions, and we don't need to process it + // again + + cteDefs.foreach { cteDef => + cteMap(cteDef.id).container.foreach(c => cteMap(c).outgoingRefs -= cteDef.id) + } + } else { + cteDefs.foreach { cteDef => + cteMap(cteDef.id) = CTEReferenceInfo( + cteDef = cteDef, + refCount = 0, + outgoingRefs = mutable.Map.empty.withDefaultValue(0), + shouldInline = true, + container = outerCTEId + ) + } + + cteDefs.foreach { cteDef => + buildCTEMap(cteDef, cteMap, Some(cteDef.id)) + } + buildCTEMap(child, cteMap, outerCTEId) } - buildCTEMap(child, cteMap, collectCTERefs) case ref: CTERelationRef => cteMap(ref.cteId) = cteMap(ref.cteId).withRefCountIncreased(1) - collectCTERefs(ref) + + // The `outerCTEId` CTE definition can either reference `cteId` definition if `cteId` is in + // the same or in an outer `WithCTE` node, or `outerCTEId` can contain `cteId` definition if + // `cteId` is an inner `WithCTE` node inside `outerCTEId`. + // In both cases we can track the relations in `outgoingRefs` when we see a definition the + // first time. But if we encounter a conflicting duplicated contains relation later, then we + // will remove the references of the first contains relation. 
+ outerCTEId.foreach { cteId => + cteMap(cteId).increaseOutgoingRefCount(ref.cteId, 1) + } + case _ => if (plan.containsPattern(CTE)) { plan.children.foreach { child => - buildCTEMap(child, cteMap, collectCTERefs) + buildCTEMap(child, cteMap, outerCTEId) } plan.expressions.foreach { expr => if (expr.containsAllPatterns(PLAN_EXPRESSION, CTE)) { expr.foreach { - case e: SubqueryExpression => buildCTEMap(e.plan, cteMap, collectCTERefs) + case e: SubqueryExpression => buildCTEMap(e.plan, cteMap, outerCTEId) case _ => } } @@ -225,12 +236,15 @@ case class InlineCTE( * from other CTE relations and regular places. * @param outgoingRefs A mutable map that tracks outgoing reference counts to other CTE relations. * @param shouldInline If true, this CTE relation should be inlined in the places that reference it. + * @param container The container of a CTE definition is another CTE definition in which the + * `WithCTE` node of the definition resides. */ case class CTEReferenceInfo( cteDef: CTERelationDef, refCount: Int, outgoingRefs: mutable.Map[Long, Int], - shouldInline: Boolean) { + shouldInline: Boolean, + container: Option[Long]) { def withRefCountIncreased(count: Int): CTEReferenceInfo = { copy(refCount = refCount + count) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/InsertMapSortInGroupingExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/InsertMapSortExpression.scala similarity index 69% rename from sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/InsertMapSortInGroupingExpressions.scala rename to sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/InsertMapSortExpression.scala index b6ced6c49a36f..9e613c54a49bd 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/InsertMapSortInGroupingExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/InsertMapSortExpression.scala @@ -20,32 +20,30 @@ package 
org.apache.spark.sql.catalyst.optimizer import scala.collection.mutable import org.apache.spark.sql.catalyst.expressions.{Alias, ArrayTransform, CreateNamedStruct, Expression, GetStructField, If, IsNull, LambdaFunction, Literal, MapFromArrays, MapKeys, MapSort, MapValues, NamedExpression, NamedLambdaVariable} -import org.apache.spark.sql.catalyst.plans.logical.{Aggregate, LogicalPlan, Project} +import org.apache.spark.sql.catalyst.plans.logical.{Aggregate, LogicalPlan, Project, RepartitionByExpression} import org.apache.spark.sql.catalyst.rules.Rule -import org.apache.spark.sql.catalyst.trees.TreePattern +import org.apache.spark.sql.catalyst.trees.TreePattern.{AGGREGATE, REPARTITION_OPERATION} import org.apache.spark.sql.types.{ArrayType, MapType, StructType} import org.apache.spark.util.ArrayImplicits.SparkArrayOps /** - * Adds [[MapSort]] to group expressions containing map columns, as the key/value pairs need to be - * in the correct order before grouping: + * Adds [[MapSort]] to [[Aggregate]] expressions containing map columns, + * as the key/value pairs need to be in the correct order before grouping: * - * SELECT map_column, COUNT(*) FROM TABLE GROUP BY map_column => + * SELECT map_column, COUNT(*) FROM TABLE GROUP BY map_column => * SELECT _groupingmapsort as map_column, COUNT(*) FROM ( * SELECT map_sort(map_column) as _groupingmapsort FROM TABLE * ) GROUP BY _groupingmapsort */ object InsertMapSortInGroupingExpressions extends Rule[LogicalPlan] { - private def shouldAddMapSort(expr: Expression): Boolean = { - expr.dataType.existsRecursively(_.isInstanceOf[MapType]) - } + import InsertMapSortExpression._ override def apply(plan: LogicalPlan): LogicalPlan = { - if (!plan.containsPattern(TreePattern.AGGREGATE)) { + if (!plan.containsPattern(AGGREGATE)) { return plan } val shouldRewrite = plan.exists { - case agg: Aggregate if agg.groupingExpressions.exists(shouldAddMapSort) => true + case agg: Aggregate if 
agg.groupingExpressions.exists(mapTypeExistsRecursively) => true case _ => false } if (!shouldRewrite) { @@ -53,8 +51,7 @@ object InsertMapSortInGroupingExpressions extends Rule[LogicalPlan] { } plan transformUpWithNewOutput { - case agg @ Aggregate(groupingExprs, aggregateExpressions, child, _) - if agg.groupingExpressions.exists(shouldAddMapSort) => + case agg @ Aggregate(groupingExprs, aggregateExpressions, child, hint) => val exprToMapSort = new mutable.HashMap[Expression, NamedExpression] val newGroupingKeys = groupingExprs.map { expr => val inserted = insertMapSortRecursively(expr) @@ -77,15 +74,53 @@ object InsertMapSortInGroupingExpressions extends Rule[LogicalPlan] { }.asInstanceOf[NamedExpression] } val newChild = Project(child.output ++ exprToMapSort.values, child) - val newAgg = Aggregate(newGroupingKeys, newAggregateExprs, newChild) + val newAgg = Aggregate(newGroupingKeys, newAggregateExprs, newChild, hint) newAgg -> agg.output.zip(newAgg.output) } } +} + +/** + * Adds [[MapSort]] to [[RepartitionByExpression]] expressions containing map columns, + * as the key/value pairs need to be in the correct order before repartitioning: + * + * SELECT * FROM TABLE DISTRIBUTE BY map_column => + * SELECT * FROM TABLE DISTRIBUTE BY map_sort(map_column) + */ +object InsertMapSortInRepartitionExpressions extends Rule[LogicalPlan] { + import InsertMapSortExpression._ + + override def apply(plan: LogicalPlan): LogicalPlan = { + plan.transformUpWithPruning(_.containsPattern(REPARTITION_OPERATION)) { + case rep: RepartitionByExpression + if rep.partitionExpressions.exists(mapTypeExistsRecursively) => + val exprToMapSort = new mutable.HashMap[Expression, Expression] + val newPartitionExprs = rep.partitionExpressions.map { expr => + val inserted = insertMapSortRecursively(expr) + if (expr.ne(inserted)) { + exprToMapSort.getOrElseUpdate(expr.canonicalized, inserted) + } else { + expr + } + } + rep.copy(partitionExpressions = newPartitionExprs) + } + } +} + 
+private[optimizer] object InsertMapSortExpression { /** - * Inserts MapSort recursively taking into account when it is nested inside a struct or array. + * Returns true if the expression contains a [[MapType]] in DataType tree. */ - private def insertMapSortRecursively(e: Expression): Expression = { + def mapTypeExistsRecursively(expr: Expression): Boolean = { + expr.dataType.existsRecursively(_.isInstanceOf[MapType]) + } + + /** + * Inserts [[MapSort]] recursively taking into account when it is nested inside a struct or array. + */ + def insertMapSortRecursively(e: Expression): Expression = { e.dataType match { case m: MapType => // Check if value type of MapType contains MapType (possibly nested) @@ -122,5 +157,4 @@ object InsertMapSortInGroupingExpressions extends Rule[LogicalPlan] { case _ => e } } - } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala index 29216523fefc5..9d269f37e58b9 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala @@ -73,6 +73,21 @@ abstract class Optimizer(catalogManager: CatalogManager) conf.optimizerMaxIterations, maxIterationsSetting = SQLConf.OPTIMIZER_MAX_ITERATIONS.key) + /** + * A helper method that takes as input a Seq of Batch or Seq[Batch], and flattens it out. + */ + def flattenBatches(nestedBatchSequence: Seq[Any]): Seq[Batch] = { + assert(nestedBatchSequence.forall { + case _: Batch => true + case s: Seq[_] => s.forall(_.isInstanceOf[Batch]) + case _ => false + }) + nestedBatchSequence.flatMap { + case batches: Seq[Batch @unchecked] => batches + case batch: Batch => Seq(batch) + } + } + /** * Defines the default rule batches in the Optimizer. 
* @@ -143,39 +158,38 @@ abstract class Optimizer(catalogManager: CatalogManager) PushdownPredicatesAndPruneColumnsForCTEDef) ++ extendedOperatorOptimizationRules - val operatorOptimizationBatch: Seq[Batch] = { + val operatorOptimizationBatch: Seq[Batch] = Seq( Batch("Operator Optimization before Inferring Filters", fixedPoint, - operatorOptimizationRuleSet: _*) :: + operatorOptimizationRuleSet: _*), Batch("Infer Filters", Once, InferFiltersFromGenerate, - InferFiltersFromConstraints) :: + InferFiltersFromConstraints), Batch("Operator Optimization after Inferring Filters", fixedPoint, - operatorOptimizationRuleSet: _*) :: + operatorOptimizationRuleSet: _*), Batch("Push extra predicate through join", fixedPoint, PushExtraPredicateThroughJoin, - PushDownPredicates) :: Nil - } + PushDownPredicates)) - val batches = ( - Batch("Finish Analysis", FixedPoint(1), FinishAnalysis) :: + val batches: Seq[Batch] = flattenBatches(Seq( + Batch("Finish Analysis", FixedPoint(1), FinishAnalysis), // We must run this batch after `ReplaceExpressions`, as `RuntimeReplaceable` expression // may produce `With` expressions that need to be rewritten. - Batch("Rewrite With expression", Once, RewriteWithExpression) :: + Batch("Rewrite With expression", fixedPoint, RewriteWithExpression), ////////////////////////////////////////////////////////////////////////////////////////// // Optimizer rules start here ////////////////////////////////////////////////////////////////////////////////////////// - Batch("Eliminate Distinct", Once, EliminateDistinct) :: + Batch("Eliminate Distinct", Once, EliminateDistinct), // - Do the first call of CombineUnions before starting the major Optimizer rules, // since it can reduce the number of iteration and the other rules could add/move // extra operators between two adjacent Union operators. // - Call CombineUnions again in Batch("Operator Optimizations"), // since the other rules might make two separate Unions operators adjacent. 
Batch("Inline CTE", Once, - InlineCTE()) :: + InlineCTE()), Batch("Union", fixedPoint, RemoveNoopOperators, CombineUnions, - RemoveNoopUnion) :: + RemoveNoopUnion), // Run this once earlier. This might simplify the plan and reduce cost of optimizer. // For example, a query such as Filter(LocalRelation) would go through all the heavy // optimizer rules that are triggered when there is a filter @@ -186,16 +200,16 @@ abstract class Optimizer(catalogManager: CatalogManager) PropagateEmptyRelation, // PropagateEmptyRelation can change the nullability of an attribute from nullable to // non-nullable when an empty relation child of a Union is removed - UpdateAttributeNullability) :: + UpdateAttributeNullability), Batch("Pullup Correlated Expressions", Once, OptimizeOneRowRelationSubquery, PullOutNestedDataOuterRefExpressions, - PullupCorrelatedPredicates) :: + PullupCorrelatedPredicates), // Subquery batch applies the optimizer rules recursively. Therefore, it makes no sense // to enforce idempotence on it and we change this batch from Once to FixedPoint(1). Batch("Subquery", FixedPoint(1), OptimizeSubqueries, - OptimizeOneRowRelationSubquery) :: + OptimizeOneRowRelationSubquery), Batch("Replace Operators", fixedPoint, RewriteExceptAll, RewriteIntersectAll, @@ -203,48 +217,48 @@ abstract class Optimizer(catalogManager: CatalogManager) ReplaceExceptWithFilter, ReplaceExceptWithAntiJoin, ReplaceDistinctWithAggregate, - ReplaceDeduplicateWithAggregate) :: + ReplaceDeduplicateWithAggregate), Batch("Aggregate", fixedPoint, RemoveLiteralFromGroupExpressions, - RemoveRepetitionFromGroupExpressions) :: Nil ++ - operatorOptimizationBatch) :+ - Batch("Clean Up Temporary CTE Info", Once, CleanUpTempCTEInfo) :+ + RemoveRepetitionFromGroupExpressions), + operatorOptimizationBatch, + Batch("Clean Up Temporary CTE Info", Once, CleanUpTempCTEInfo), // This batch rewrites plans after the operator optimization and // before any batches that depend on stats. 
- Batch("Pre CBO Rules", Once, preCBORules: _*) :+ + Batch("Pre CBO Rules", Once, preCBORules: _*), // This batch pushes filters and projections into scan nodes. Before this batch, the logical // plan may contain nodes that do not report stats. Anything that uses stats must run after // this batch. - Batch("Early Filter and Projection Push-Down", Once, earlyScanPushDownRules: _*) :+ - Batch("Update CTE Relation Stats", Once, UpdateCTERelationStats) :+ + Batch("Early Filter and Projection Push-Down", Once, earlyScanPushDownRules: _*), + Batch("Update CTE Relation Stats", Once, UpdateCTERelationStats), // Since join costs in AQP can change between multiple runs, there is no reason that we have an // idempotence enforcement on this batch. We thus make it FixedPoint(1) instead of Once. Batch("Join Reorder", FixedPoint(1), - CostBasedJoinReorder) :+ + CostBasedJoinReorder), Batch("Eliminate Sorts", Once, EliminateSorts, - RemoveRedundantSorts) :+ + RemoveRedundantSorts), Batch("Decimal Optimizations", fixedPoint, - DecimalAggregates) :+ + DecimalAggregates), // This batch must run after "Decimal Optimizations", as that one may change the // aggregate distinct column Batch("Distinct Aggregate Rewrite", Once, - RewriteDistinctAggregates) :+ + RewriteDistinctAggregates), Batch("Object Expressions Optimization", fixedPoint, EliminateMapObjects, CombineTypedFilters, ObjectSerializerPruning, - ReassignLambdaVariableID) :+ + ReassignLambdaVariableID), Batch("LocalRelation", fixedPoint, ConvertToLocalRelation, PropagateEmptyRelation, // PropagateEmptyRelation can change the nullability of an attribute from nullable to // non-nullable when an empty relation child of a Union is removed - UpdateAttributeNullability) :+ - Batch("Optimize One Row Plan", fixedPoint, OptimizeOneRowPlan) :+ + UpdateAttributeNullability), + Batch("Optimize One Row Plan", fixedPoint, OptimizeOneRowPlan), // The following batch should be executed after batch "Join Reorder" and "LocalRelation". 
Batch("Check Cartesian Products", Once, - CheckCartesianProducts) :+ + CheckCartesianProducts), Batch("RewriteSubquery", Once, RewritePredicateSubquery, PushPredicateThroughJoin, @@ -252,10 +266,10 @@ abstract class Optimizer(catalogManager: CatalogManager) ColumnPruning, CollapseProject, RemoveRedundantAliases, - RemoveNoopOperators) :+ + RemoveNoopOperators), // This batch must be executed after the `RewriteSubquery` batch, which creates joins. - Batch("NormalizeFloatingNumbers", Once, NormalizeFloatingNumbers) :+ - Batch("ReplaceUpdateFieldsExpression", Once, ReplaceUpdateFieldsExpression) + Batch("NormalizeFloatingNumbers", Once, NormalizeFloatingNumbers), + Batch("ReplaceUpdateFieldsExpression", Once, ReplaceUpdateFieldsExpression))) // remove any batches with no rules. this may happen when subclasses do not add optional rules. batches.filter(_.rules.nonEmpty) @@ -270,22 +284,23 @@ abstract class Optimizer(catalogManager: CatalogManager) * (defaultBatches - (excludedRules - nonExcludableRules)). 
*/ def nonExcludableRules: Seq[String] = - FinishAnalysis.ruleName :: - RewriteDistinctAggregates.ruleName :: - ReplaceDeduplicateWithAggregate.ruleName :: - ReplaceIntersectWithSemiJoin.ruleName :: - ReplaceExceptWithFilter.ruleName :: - ReplaceExceptWithAntiJoin.ruleName :: - RewriteExceptAll.ruleName :: - RewriteIntersectAll.ruleName :: - ReplaceDistinctWithAggregate.ruleName :: - PullupCorrelatedPredicates.ruleName :: - RewriteCorrelatedScalarSubquery.ruleName :: - RewritePredicateSubquery.ruleName :: - NormalizeFloatingNumbers.ruleName :: - ReplaceUpdateFieldsExpression.ruleName :: - RewriteLateralSubquery.ruleName :: - OptimizeSubqueries.ruleName :: Nil + Seq( + FinishAnalysis.ruleName, + RewriteDistinctAggregates.ruleName, + ReplaceDeduplicateWithAggregate.ruleName, + ReplaceIntersectWithSemiJoin.ruleName, + ReplaceExceptWithFilter.ruleName, + ReplaceExceptWithAntiJoin.ruleName, + RewriteExceptAll.ruleName, + RewriteIntersectAll.ruleName, + ReplaceDistinctWithAggregate.ruleName, + PullupCorrelatedPredicates.ruleName, + RewriteCorrelatedScalarSubquery.ruleName, + RewritePredicateSubquery.ruleName, + NormalizeFloatingNumbers.ruleName, + ReplaceUpdateFieldsExpression.ruleName, + RewriteLateralSubquery.ruleName, + OptimizeSubqueries.ruleName) /** * Apply finish-analysis rules for the entire plan including all subqueries. @@ -298,7 +313,9 @@ abstract class Optimizer(catalogManager: CatalogManager) private val rules = Seq( EliminateResolvedHint, EliminateSubqueryAliases, + EliminatePipeOperators, EliminateView, + EliminateSQLFunctionNode, ReplaceExpressions, RewriteNonCorrelatedExists, PullOutGroupingExpressions, @@ -306,6 +323,7 @@ abstract class Optimizer(catalogManager: CatalogManager) // so the grouping keys can only be attribute and literal which makes // `InsertMapSortInGroupingExpressions` easy to insert `MapSort`. 
InsertMapSortInGroupingExpressions, + InsertMapSortInRepartitionExpressions, ComputeCurrentTime, ReplaceCurrentLike(catalogManager), SpecialDatetimeValues, @@ -346,7 +364,7 @@ abstract class Optimizer(catalogManager: CatalogManager) case d: DynamicPruningSubquery => d case s @ ScalarSubquery( PhysicalOperation(projections, predicates, a @ Aggregate(group, _, child, _)), - _, _, _, _, mayHaveCountBug, _, _) + _, _, _, _, mayHaveCountBug, _) if conf.getConf(SQLConf.DECORRELATE_SUBQUERY_PREVENT_CONSTANT_FOLDING_FOR_COUNT_BUG) && mayHaveCountBug.nonEmpty && mayHaveCountBug.get => // This is a subquery with an aggregate that may suffer from a COUNT bug. @@ -1031,6 +1049,9 @@ object ColumnPruning extends Rule[LogicalPlan] { // Can't prune the columns on LeafNode case p @ Project(_, _: LeafNode) => p + // Can't prune the columns on UpdateEventTimeWatermarkColumn + case p @ Project(_, _: UpdateEventTimeWatermarkColumn) => p + case NestedColumnAliasing(rewrittenPlan) => rewrittenPlan // for all other logical plans that inherits the output from it's children diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/PushdownPredicatesAndPruneColumnsForCTEDef.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/PushdownPredicatesAndPruneColumnsForCTEDef.scala index aa13e6a67c510..59b3d83c55162 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/PushdownPredicatesAndPruneColumnsForCTEDef.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/PushdownPredicatesAndPruneColumnsForCTEDef.scala @@ -122,7 +122,7 @@ object PushdownPredicatesAndPruneColumnsForCTEDef extends Rule[LogicalPlan] { private def pushdownPredicatesAndAttributes( plan: LogicalPlan, cteMap: CTEMap): LogicalPlan = plan.transformWithSubqueries { - case cteDef @ CTERelationDef(child, id, originalPlanWithPredicates, _) => + case cteDef @ CTERelationDef(child, id, originalPlanWithPredicates, _, _) => val (_, _, 
newPreds, newAttrSet) = cteMap(id) val originalPlan = originalPlanWithPredicates.map(_._1).getOrElse(child) val preds = originalPlanWithPredicates.map(_._2).getOrElse(Seq.empty) @@ -141,7 +141,7 @@ object PushdownPredicatesAndPruneColumnsForCTEDef extends Rule[LogicalPlan] { cteDef } - case cteRef @ CTERelationRef(cteId, _, output, _, _) => + case cteRef @ CTERelationRef(cteId, _, output, _, _, _) => val (cteDef, _, _, newAttrSet) = cteMap(cteId) if (needsPruning(cteDef.child, newAttrSet)) { val indices = newAttrSet.toSeq.map(cteDef.output.indexOf) @@ -170,7 +170,7 @@ object PushdownPredicatesAndPruneColumnsForCTEDef extends Rule[LogicalPlan] { object CleanUpTempCTEInfo extends Rule[LogicalPlan] { override def apply(plan: LogicalPlan): LogicalPlan = plan.transformWithPruning(_.containsPattern(CTE)) { - case cteDef @ CTERelationDef(_, _, Some(_), _) => + case cteDef @ CTERelationDef(_, _, Some(_), _, _) => cteDef.copy(originalPlanWithPredicates = None) } } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/RewriteWithExpression.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/RewriteWithExpression.scala index 393a66f7c1e4f..5d85e89e1eabe 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/RewriteWithExpression.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/RewriteWithExpression.scala @@ -26,6 +26,7 @@ import org.apache.spark.sql.catalyst.plans.logical.{Aggregate, LogicalPlan, Plan import org.apache.spark.sql.catalyst.rules.Rule import org.apache.spark.sql.catalyst.trees.TreePattern.{COMMON_EXPR_REF, WITH_EXPRESSION} import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.util.Utils /** * Rewrites the `With` expressions by adding a `Project` to pre-evaluate the common expressions, or @@ -66,11 +67,25 @@ object RewriteWithExpression extends Rule[LogicalPlan] { } private def applyInternal(p: LogicalPlan): LogicalPlan = { - val inputPlans 
= p.children.toArray + val inputPlans = p.children + val commonExprIdSet = p.expressions + .flatMap(_.collect { case r: CommonExpressionRef => r.id }) + .groupBy(identity) + .transform((_, v) => v.size) + .filter(_._2 > 1) + .keySet + val commonExprsPerChild = Array.fill(inputPlans.length)(mutable.ListBuffer.empty[(Alias, Long)]) var newPlan: LogicalPlan = p.mapExpressions { expr => - rewriteWithExprAndInputPlans(expr, inputPlans) + rewriteWithExprAndInputPlans(expr, inputPlans, commonExprsPerChild, commonExprIdSet) } - newPlan = newPlan.withNewChildren(inputPlans.toIndexedSeq) + val newChildren = inputPlans.zip(commonExprsPerChild).map { case (inputPlan, commonExprs) => + if (commonExprs.isEmpty) { + inputPlan + } else { + Project(inputPlan.output ++ commonExprs.map(_._1), inputPlan) + } + } + newPlan = newPlan.withNewChildren(newChildren) // Since we add extra Projects with extra columns to pre-evaluate the common expressions, // the current operator may have extra columns if it inherits the output columns from its // child, and we need to project away the extra columns to keep the plan schema unchanged. @@ -85,33 +100,34 @@ object RewriteWithExpression extends Rule[LogicalPlan] { private def rewriteWithExprAndInputPlans( e: Expression, - inputPlans: Array[LogicalPlan]): Expression = { + inputPlans: Seq[LogicalPlan], + commonExprsPerChild: Array[mutable.ListBuffer[(Alias, Long)]], + commonExprIdSet: Set[CommonExpressionId], + isNestedWith: Boolean = false): Expression = { if (!e.containsPattern(WITH_EXPRESSION)) return e e match { - case w: With => + // Do not handle nested With in one pass. Leave it to the next rule executor batch. 
+ case w: With if !isNestedWith => // Rewrite nested With expressions first - val child = rewriteWithExprAndInputPlans(w.child, inputPlans) - val defs = w.defs.map(rewriteWithExprAndInputPlans(_, inputPlans)) + val child = rewriteWithExprAndInputPlans( + w.child, inputPlans, commonExprsPerChild, commonExprIdSet, isNestedWith = true) + val defs = w.defs.map(rewriteWithExprAndInputPlans( + _, inputPlans, commonExprsPerChild, commonExprIdSet, isNestedWith = true)) val refToExpr = mutable.HashMap.empty[CommonExpressionId, Expression] - val childProjections = Array.fill(inputPlans.length)(mutable.ArrayBuffer.empty[Alias]) defs.zipWithIndex.foreach { case (CommonExpressionDef(child, id), index) => - if (child.containsPattern(COMMON_EXPR_REF)) { - throw SparkException.internalError( - "Common expression definition cannot reference other Common expression definitions") - } if (id.canonicalized) { throw SparkException.internalError( "Cannot rewrite canonicalized Common expression definitions") } - if (CollapseProject.isCheap(child)) { + if (CollapseProject.isCheap(child) || !commonExprIdSet.contains(id)) { refToExpr(id) = child } else { - val childProjectionIndex = inputPlans.indexWhere( + val childPlanIndex = inputPlans.indexWhere( c => child.references.subsetOf(c.outputSet) ) - if (childProjectionIndex == -1) { + if (childPlanIndex == -1) { // When we cannot rewrite the common expressions, force to inline them so that the // query can still run. This can happen if the join condition contains `With` and // the common expression references columns from both join sides. @@ -122,36 +138,37 @@ object RewriteWithExpression extends Rule[LogicalPlan] { // if it's ref count is 1. 
refToExpr(id) = child } else { - val aliasName = if (SQLConf.get.getConf(SQLConf.USE_COMMON_EXPR_ID_FOR_ALIAS)) { - s"_common_expr_${id.id}" - } else { - s"_common_expr_$index" - } - val alias = Alias(child, aliasName)() - val fakeProj = Project(Seq(alias), inputPlans(childProjectionIndex)) - if (PlanHelper.specialExpressionsInUnsupportedOperator(fakeProj).nonEmpty) { - // We have to inline the common expression if it cannot be put in a Project. - refToExpr(id) = child + val commonExprs = commonExprsPerChild(childPlanIndex) + val existingCommonExpr = commonExprs.find(_._2 == id.id) + if (existingCommonExpr.isDefined) { + if (Utils.isTesting) { + assert(existingCommonExpr.get._1.child.semanticEquals(child)) + } + refToExpr(id) = existingCommonExpr.get._1.toAttribute } else { - childProjections(childProjectionIndex) += alias - refToExpr(id) = alias.toAttribute + val aliasName = if (SQLConf.get.getConf(SQLConf.USE_COMMON_EXPR_ID_FOR_ALIAS)) { + s"_common_expr_${id.id}" + } else { + s"_common_expr_$index" + } + val alias = Alias(child, aliasName)() + val fakeProj = Project(Seq(alias), inputPlans(childPlanIndex)) + if (PlanHelper.specialExpressionsInUnsupportedOperator(fakeProj).nonEmpty) { + // We have to inline the common expression if it cannot be put in a Project. + refToExpr(id) = child + } else { + commonExprs.append((alias, id.id)) + refToExpr(id) = alias.toAttribute + } } } } } - for (i <- inputPlans.indices) { - val projectList = childProjections(i) - if (projectList.nonEmpty) { - inputPlans(i) = Project(inputPlans(i).output ++ projectList, inputPlans(i)) - } - } - child.transformWithPruning(_.containsPattern(COMMON_EXPR_REF)) { - case ref: CommonExpressionRef => - if (!refToExpr.contains(ref.id)) { - throw SparkException.internalError("Undefined common expression id " + ref.id) - } + // `child` may contain nested With and we only replace `CommonExpressionRef` that + // references common expressions in the current `With`. 
+ case ref: CommonExpressionRef if refToExpr.contains(ref.id) => if (ref.id.canonicalized) { throw SparkException.internalError( "Cannot rewrite canonicalized Common expression references") @@ -161,7 +178,8 @@ object RewriteWithExpression extends Rule[LogicalPlan] { case c: ConditionalExpression => val newAlwaysEvaluatedInputs = c.alwaysEvaluatedInputs.map( - rewriteWithExprAndInputPlans(_, inputPlans)) + rewriteWithExprAndInputPlans( + _, inputPlans, commonExprsPerChild, commonExprIdSet, isNestedWith)) val newExpr = c.withNewAlwaysEvaluatedInputs(newAlwaysEvaluatedInputs) // Use transformUp to handle nested With. newExpr.transformUpWithPruning(_.containsPattern(WITH_EXPRESSION)) { @@ -174,7 +192,10 @@ object RewriteWithExpression extends Rule[LogicalPlan] { } } - case other => other.mapChildren(rewriteWithExprAndInputPlans(_, inputPlans)) + case other => other.mapChildren( + rewriteWithExprAndInputPlans( + _, inputPlans, commonExprsPerChild, commonExprIdSet, isNestedWith) + ) } } } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/expressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/expressions.scala index 754fea85ec6d7..e867953bcf282 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/expressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/expressions.scala @@ -90,7 +90,7 @@ object ConstantFolding extends Rule[LogicalPlan] { } // Don't replace ScalarSubquery if its plan is an aggregate that may suffer from a COUNT bug. 
- case s @ ScalarSubquery(_, _, _, _, _, mayHaveCountBug, _, _) + case s @ ScalarSubquery(_, _, _, _, _, mayHaveCountBug, _) if conf.getConf(SQLConf.DECORRELATE_SUBQUERY_PREVENT_CONSTANT_FOLDING_FOR_COUNT_BUG) && mayHaveCountBug.nonEmpty && mayHaveCountBug.get => s diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/subquery.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/subquery.scala index 8c82769dbf4a3..5a4e9f37c3951 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/subquery.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/subquery.scala @@ -131,12 +131,12 @@ object RewritePredicateSubquery extends Rule[LogicalPlan] with PredicateHelper { // Filter the plan by applying left semi and left anti joins. withSubquery.foldLeft(newFilter) { - case (p, Exists(sub, _, _, conditions, subHint, _)) => + case (p, Exists(sub, _, _, conditions, subHint)) => val (joinCond, outerPlan) = rewriteExistentialExpr(conditions, p) val join = buildJoin(outerPlan, rewriteDomainJoinsIfPresent(outerPlan, sub, joinCond), LeftSemi, joinCond, subHint) Project(p.output, join) - case (p, Not(Exists(sub, _, _, conditions, subHint, _))) => + case (p, Not(Exists(sub, _, _, conditions, subHint))) => val (joinCond, outerPlan) = rewriteExistentialExpr(conditions, p) val join = buildJoin(outerPlan, rewriteDomainJoinsIfPresent(outerPlan, sub, joinCond), LeftAnti, joinCond, subHint) @@ -319,7 +319,7 @@ object RewritePredicateSubquery extends Rule[LogicalPlan] with PredicateHelper { val introducedAttrs = ArrayBuffer.empty[Attribute] val newExprs = exprs.map { e => e.transformDownWithPruning(_.containsAnyPattern(EXISTS_SUBQUERY, IN_SUBQUERY)) { - case Exists(sub, _, _, conditions, subHint, _) => + case Exists(sub, _, _, conditions, subHint) => val exists = AttributeReference("exists", BooleanType, nullable = false)() val existenceJoin = ExistenceJoin(exists) val newCondition = 
conditions.reduceLeftOption(And) @@ -507,7 +507,7 @@ object PullupCorrelatedPredicates extends Rule[LogicalPlan] with PredicateHelper plan.transformExpressionsWithPruning(_.containsPattern(PLAN_EXPRESSION)) { case ScalarSubquery(sub, children, exprId, conditions, hint, - mayHaveCountBugOld, needSingleJoinOld, _) + mayHaveCountBugOld, needSingleJoinOld) if children.nonEmpty => def mayHaveCountBugAgg(a: Aggregate): Boolean = { @@ -560,7 +560,7 @@ object PullupCorrelatedPredicates extends Rule[LogicalPlan] with PredicateHelper } ScalarSubquery(newPlan, children, exprId, getJoinCondition(newCond, conditions), hint, Some(mayHaveCountBug), Some(needSingleJoin)) - case Exists(sub, children, exprId, conditions, hint, _) if children.nonEmpty => + case Exists(sub, children, exprId, conditions, hint) if children.nonEmpty => val (newPlan, newCond) = if (SQLConf.get.decorrelateInnerQueryEnabledForExistsIn) { decorrelate(sub, plan, handleCountBug = true) } else { @@ -818,7 +818,7 @@ object RewriteCorrelatedScalarSubquery extends Rule[LogicalPlan] with AliasHelpe val subqueryAttrMapping = ArrayBuffer[(Attribute, Attribute)]() val newChild = subqueries.foldLeft(child) { case (currentChild, ScalarSubquery(sub, _, _, conditions, subHint, mayHaveCountBug, - needSingleJoin, _)) => + needSingleJoin)) => val query = DecorrelateInnerQuery.rewriteDomainJoins(currentChild, sub, conditions) val origOutput = query.output.head // The subquery appears on the right side of the join, hence add its hint to the right @@ -1064,8 +1064,7 @@ object OptimizeOneRowRelationSubquery extends Rule[LogicalPlan] { case p: LogicalPlan => p.transformExpressionsUpWithPruning( _.containsPattern(SCALAR_SUBQUERY)) { - case s @ ScalarSubquery( - OneRowSubquery(p @ Project(_, _: OneRowRelation)), _, _, _, _, _, _, _) + case s @ ScalarSubquery(OneRowSubquery(p @ Project(_, _: OneRowRelation)), _, _, _, _, _, _) if !hasCorrelatedSubquery(s.plan) && s.joinCond.isEmpty => assert(p.projectList.size == 1) 
stripOuterReferences(p.projectList).head diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala index 08a8cf6bab87a..b408fcefcfb26 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala @@ -43,7 +43,7 @@ import org.apache.spark.sql.catalyst.plans.logical._ import org.apache.spark.sql.catalyst.trees.{CurrentOrigin, Origin} import org.apache.spark.sql.catalyst.trees.TreePattern.PARAMETER import org.apache.spark.sql.catalyst.types.DataTypeUtils -import org.apache.spark.sql.catalyst.util.{CharVarcharUtils, DateTimeUtils, IntervalUtils, SparkParserUtils} +import org.apache.spark.sql.catalyst.util.{CharVarcharUtils, CollationFactory, DateTimeUtils, IntervalUtils, SparkParserUtils} import org.apache.spark.sql.catalyst.util.DateTimeUtils.{convertSpecialDate, convertSpecialTimestamp, convertSpecialTimestampNTZ, getZoneId, stringToDate, stringToTimestamp, stringToTimestampWithoutTimeZone} import org.apache.spark.sql.connector.catalog.{CatalogV2Util, SupportsNamespaces, TableCatalog, TableWritePrivilege} import org.apache.spark.sql.connector.catalog.TableChange.ColumnPosition @@ -144,14 +144,27 @@ class AstBuilder extends DataTypeAstBuilder override def visitSingleCompoundStatement(ctx: SingleCompoundStatementContext): CompoundBody = { val labelCtx = new SqlScriptingLabelContext() - visitCompoundBodyImpl(ctx.compoundBody(), None, allowVarDeclare = true, labelCtx) + val labelText = labelCtx.enterLabeledScope(None, None) + + val script = Option(ctx.compoundBody()) + .map(visitCompoundBodyImpl( + _, + Some(labelText), + allowVarDeclare = true, + labelCtx, + isScope = true + )).getOrElse(CompoundBody(Seq.empty, Some(labelText), isScope = true)) + + labelCtx.exitLabeledScope(None) + script } private def visitCompoundBodyImpl( ctx: 
CompoundBodyContext, label: Option[String], allowVarDeclare: Boolean, - labelCtx: SqlScriptingLabelContext): CompoundBody = { + labelCtx: SqlScriptingLabelContext, + isScope: Boolean): CompoundBody = { val buff = ListBuffer[CompoundPlanStatement]() ctx.compoundStatements.forEach( compoundStatement => buff += visitCompoundStatementImpl(compoundStatement, labelCtx)) @@ -183,7 +196,7 @@ class AstBuilder extends DataTypeAstBuilder case _ => } - CompoundBody(buff.toSeq, label) + CompoundBody(buff.toSeq, label, isScope) } private def visitBeginEndCompoundBlockImpl( @@ -191,12 +204,14 @@ class AstBuilder extends DataTypeAstBuilder labelCtx: SqlScriptingLabelContext): CompoundBody = { val labelText = labelCtx.enterLabeledScope(Option(ctx.beginLabel()), Option(ctx.endLabel())) - val body = visitCompoundBodyImpl( - ctx.compoundBody(), - Some(labelText), - allowVarDeclare = true, - labelCtx - ) + val body = Option(ctx.compoundBody()) + .map(visitCompoundBodyImpl( + _, + Some(labelText), + allowVarDeclare = true, + labelCtx, + isScope = true + )).getOrElse(CompoundBody(Seq.empty, Some(labelText), isScope = true)) labelCtx.exitLabeledScope(Option(ctx.beginLabel())) body } @@ -226,6 +241,8 @@ class AstBuilder extends DataTypeAstBuilder visitSearchedCaseStatementImpl(searchedCaseContext, labelCtx) case simpleCaseContext: SimpleCaseStatementContext => visitSimpleCaseStatementImpl(simpleCaseContext, labelCtx) + case forStatementContext: ForStatementContext => + visitForStatementImpl(forStatementContext, labelCtx) case stmt => visit(stmt).asInstanceOf[CompoundPlanStatement] } } else { @@ -245,10 +262,12 @@ class AstBuilder extends DataTypeAstBuilder OneRowRelation())) }), conditionalBodies = ctx.conditionalBodies.asScala.toList.map( - body => visitCompoundBodyImpl(body, None, allowVarDeclare = false, labelCtx) + body => + visitCompoundBodyImpl(body, None, allowVarDeclare = false, labelCtx, isScope = false) ), elseBody = Option(ctx.elseBody).map( - body => visitCompoundBodyImpl(body, 
None, allowVarDeclare = false, labelCtx) + body => + visitCompoundBodyImpl(body, None, allowVarDeclare = false, labelCtx, isScope = false) ) ) } @@ -265,7 +284,13 @@ class AstBuilder extends DataTypeAstBuilder Project( Seq(Alias(expression(boolExpr), "condition")()), OneRowRelation()))} - val body = visitCompoundBodyImpl(ctx.compoundBody(), None, allowVarDeclare = false, labelCtx) + val body = visitCompoundBodyImpl( + ctx.compoundBody(), + None, + allowVarDeclare = false, + labelCtx, + isScope = false + ) labelCtx.exitLabeledScope(Option(ctx.beginLabel())) WhileStatement(condition, body, Some(labelText)) @@ -282,7 +307,8 @@ class AstBuilder extends DataTypeAstBuilder }) val conditionalBodies = ctx.conditionalBodies.asScala.toList.map( - body => visitCompoundBodyImpl(body, None, allowVarDeclare = false, labelCtx) + body => + visitCompoundBodyImpl(body, None, allowVarDeclare = false, labelCtx, isScope = false) ) if (conditions.length != conditionalBodies.length) { @@ -295,7 +321,8 @@ class AstBuilder extends DataTypeAstBuilder conditions = conditions, conditionalBodies = conditionalBodies, elseBody = Option(ctx.elseBody).map( - body => visitCompoundBodyImpl(body, None, allowVarDeclare = false, labelCtx) + body => + visitCompoundBodyImpl(body, None, allowVarDeclare = false, labelCtx, isScope = false) )) } @@ -312,7 +339,8 @@ class AstBuilder extends DataTypeAstBuilder }) val conditionalBodies = ctx.conditionalBodies.asScala.toList.map( - body => visitCompoundBodyImpl(body, None, allowVarDeclare = false, labelCtx) + body => + visitCompoundBodyImpl(body, None, allowVarDeclare = false, labelCtx, isScope = false) ) if (conditions.length != conditionalBodies.length) { @@ -325,7 +353,8 @@ class AstBuilder extends DataTypeAstBuilder conditions = conditions, conditionalBodies = conditionalBodies, elseBody = Option(ctx.elseBody).map( - body => visitCompoundBodyImpl(body, None, allowVarDeclare = false, labelCtx) + body => + visitCompoundBodyImpl(body, None, allowVarDeclare = 
false, labelCtx, isScope = false) )) } @@ -341,34 +370,66 @@ class AstBuilder extends DataTypeAstBuilder Project( Seq(Alias(expression(boolExpr), "condition")()), OneRowRelation()))} - val body = visitCompoundBodyImpl(ctx.compoundBody(), None, allowVarDeclare = false, labelCtx) + val body = visitCompoundBodyImpl( + ctx.compoundBody(), + None, + allowVarDeclare = false, + labelCtx, + isScope = false + ) labelCtx.exitLabeledScope(Option(ctx.beginLabel())) RepeatStatement(condition, body, Some(labelText)) } + private def visitForStatementImpl( + ctx: ForStatementContext, + labelCtx: SqlScriptingLabelContext): ForStatement = { + val labelText = labelCtx.enterLabeledScope(Option(ctx.beginLabel()), Option(ctx.endLabel())) + + val queryCtx = ctx.query() + val query = withOrigin(queryCtx) { + SingleStatement(visitQuery(queryCtx)) + } + val varName = Option(ctx.multipartIdentifier()).map(_.getText) + val body = visitCompoundBodyImpl( + ctx.compoundBody(), + None, + allowVarDeclare = false, + labelCtx, + isScope = false + ) + labelCtx.exitLabeledScope(Option(ctx.beginLabel())) + + ForStatement(query, varName, body, Some(labelText)) + } + private def leaveOrIterateContextHasLabel( ctx: RuleContext, label: String, isIterate: Boolean): Boolean = { ctx match { case c: BeginEndCompoundBlockContext - if Option(c.beginLabel()).isDefined && - c.beginLabel().multipartIdentifier().getText.toLowerCase(Locale.ROOT).equals(label) => - if (isIterate) { + if Option(c.beginLabel()).exists { b => + b.multipartIdentifier().getText.toLowerCase(Locale.ROOT).equals(label) + } => if (isIterate) { throw SqlScriptingErrors.invalidIterateLabelUsageForCompound(CurrentOrigin.get, label) } true case c: WhileStatementContext - if Option(c.beginLabel()).isDefined && - c.beginLabel().multipartIdentifier().getText.toLowerCase(Locale.ROOT).equals(label) - => true + if Option(c.beginLabel()).exists { b => + b.multipartIdentifier().getText.toLowerCase(Locale.ROOT).equals(label) + } => true case c: 
RepeatStatementContext - if Option(c.beginLabel()).isDefined && - c.beginLabel().multipartIdentifier().getText.toLowerCase(Locale.ROOT).equals(label) - => true + if Option(c.beginLabel()).exists { b => + b.multipartIdentifier().getText.toLowerCase(Locale.ROOT).equals(label) + } => true case c: LoopStatementContext - if Option(c.beginLabel()).isDefined && - c.beginLabel().multipartIdentifier().getText.toLowerCase(Locale.ROOT).equals(label) - => true + if Option(c.beginLabel()).exists { b => + b.multipartIdentifier().getText.toLowerCase(Locale.ROOT).equals(label) + } => true + case c: ForStatementContext + if Option(c.beginLabel()).exists { b => + b.multipartIdentifier().getText.toLowerCase(Locale.ROOT).equals(label) + } => true case _ => false } } @@ -410,7 +471,13 @@ class AstBuilder extends DataTypeAstBuilder labelCtx: SqlScriptingLabelContext): LoopStatement = { val labelText = labelCtx.enterLabeledScope(Option(ctx.beginLabel()), Option(ctx.endLabel())) - val body = visitCompoundBodyImpl(ctx.compoundBody(), None, allowVarDeclare = false, labelCtx) + val body = visitCompoundBodyImpl( + ctx.compoundBody(), + None, + allowVarDeclare = false, + labelCtx, + isScope = false + ) labelCtx.exitLabeledScope(Option(ctx.beginLabel())) LoopStatement(body, Some(labelText)) @@ -484,7 +551,7 @@ class AstBuilder extends DataTypeAstBuilder throw QueryParsingErrors.duplicateCteDefinitionNamesError( duplicates.map(toSQLId).mkString(", "), ctx) } - UnresolvedWith(plan, ctes.toSeq) + UnresolvedWith(plan, ctes.toSeq, ctx.RECURSIVE() != null) } /** @@ -527,7 +594,10 @@ class AstBuilder extends DataTypeAstBuilder optionalMap(body.queryOrganization)(withQueryResultClauses(_, _, forPipeOperators = false)) } // If there are multiple SELECT just UNION them together into one query. - if (selects.length == 1) { + if (selects.length == 0) { + // This is a "FROM " clause with no other syntax. 
+ from + } else if (selects.length == 1) { selects.head } else { Union(selects.toSeq) @@ -2128,7 +2198,7 @@ class AstBuilder extends DataTypeAstBuilder } val unresolvedTable = UnresolvedInlineTable(aliases, rows.toSeq) - val table = if (conf.getConf(SQLConf.EAGER_EVAL_OF_UNRESOLVED_INLINE_TABLE_ENABLED)) { + val table = if (canEagerlyEvaluateInlineTable(ctx, unresolvedTable)) { EvaluateUnresolvedInlineTable.evaluate(unresolvedTable) } else { unresolvedTable @@ -2136,6 +2206,42 @@ class AstBuilder extends DataTypeAstBuilder table.optionalMap(ctx.tableAlias.strictIdentifier)(aliasPlan) } + /** + * Determines if the inline table can be eagerly evaluated. + */ + private def canEagerlyEvaluateInlineTable( + ctx: InlineTableContext, + table: UnresolvedInlineTable): Boolean = { + if (!conf.getConf(SQLConf.EAGER_EVAL_OF_UNRESOLVED_INLINE_TABLE_ENABLED)) { + return false + } else if (!ResolveDefaultStringTypes.needsResolution(table.expressions)) { + // if there are no strings to be resolved we can always evaluate eagerly + return true + } + + val isSessionCollationSet = conf.defaultStringType != StringType + + // if either of these are true we need to resolve + // the string types first + !isSessionCollationSet && !contextInsideCreate(ctx) + } + + private def contextInsideCreate(ctx: ParserRuleContext): Boolean = { + var currentContext: RuleContext = ctx + + while (currentContext != null) { + if (currentContext.isInstanceOf[CreateTableContext] || + currentContext.isInstanceOf[ReplaceTableContext] || + currentContext.isInstanceOf[CreateViewContext]) { + return true + } + + currentContext = currentContext.parent + } + + false + } + /** * Create an alias (SubqueryAlias) for a join relation. 
This is practically the same as * visitAliasedQuery and visitNamedExpression, ANTLR4 however requires us to use 3 different @@ -2228,14 +2334,6 @@ class AstBuilder extends DataTypeAstBuilder FunctionIdentifier(ctx.function.getText, Option(ctx.db).map(_.getText)) } - /** - * Create a multi-part identifier. - */ - override def visitMultipartIdentifier(ctx: MultipartIdentifierContext): Seq[String] = - withOrigin(ctx) { - ctx.parts.asScala.map(_.getText).toSeq - } - /* ******************************************************************************************** * Expression parsing * ******************************************************************************************** */ @@ -2275,9 +2373,10 @@ class AstBuilder extends DataTypeAstBuilder def visitStarExcept(ctx: StarContext, target: Option[Seq[String]]): Expression = withOrigin(ctx) { val exceptCols = ctx.exceptClause .exceptCols.multipartIdentifier.asScala.map(typedVisit[Seq[String]]) - UnresolvedStarExcept( + UnresolvedStarExceptOrReplace( target, - exceptCols.toSeq) + exceptCols.toSeq, + replacements = None) } /** @@ -2647,15 +2746,16 @@ class AstBuilder extends DataTypeAstBuilder */ override def visitCollate(ctx: CollateContext): Expression = withOrigin(ctx) { val collationName = visitCollateClause(ctx.collateClause()) - Collate(expression(ctx.primaryExpression), collationName) + + Collate(expression(ctx.primaryExpression), UnresolvedCollation(collationName)) } - override def visitCollateClause(ctx: CollateClauseContext): String = withOrigin(ctx) { - val collationName = ctx.collationName.getText - if (!SQLConf.get.trimCollationEnabled && collationName.toUpperCase().contains("TRIM")) { + override def visitCollateClause(ctx: CollateClauseContext): Seq[String] = withOrigin(ctx) { + val collationName = visitMultipartIdentifier(ctx.collationName) + if (!SQLConf.get.trimCollationEnabled && collationName.last.toUpperCase().contains("TRIM")) { throw QueryCompilationErrors.trimCollationNotEnabledError() } - 
ctx.identifier.getText + collationName } /** @@ -2663,20 +2763,6 @@ class AstBuilder extends DataTypeAstBuilder */ override def visitCast(ctx: CastContext): Expression = withOrigin(ctx) { val rawDataType = typedVisit[DataType](ctx.dataType()) - ctx.dataType() match { - case context: PrimitiveDataTypeContext => - val typeCtx = context.`type`() - if (typeCtx.start.getType == STRING) { - typeCtx.children.asScala.toSeq match { - case Seq(_, cctx: CollateClauseContext) => - throw QueryParsingErrors.dataTypeUnsupportedError( - rawDataType.typeName, - ctx.dataType().asInstanceOf[PrimitiveDataTypeContext]) - case _ => - } - } - case _ => - } val dataType = CharVarcharUtils.replaceCharVarcharWithStringForCast(rawDataType) ctx.name.getType match { case SqlBaseParser.CAST => @@ -2696,20 +2782,6 @@ class AstBuilder extends DataTypeAstBuilder */ override def visitCastByColon(ctx: CastByColonContext): Expression = withOrigin(ctx) { val rawDataType = typedVisit[DataType](ctx.dataType()) - ctx.dataType() match { - case context: PrimitiveDataTypeContext => - val typeCtx = context.`type`() - if (typeCtx.start.getType == STRING) { - typeCtx.children.asScala.toSeq match { - case Seq(_, cctx: CollateClauseContext) => - throw QueryParsingErrors.dataTypeUnsupportedError( - rawDataType.typeName, - ctx.dataType().asInstanceOf[PrimitiveDataTypeContext]) - case _ => - } - } - case _ => - } val dataType = CharVarcharUtils.replaceCharVarcharWithStringForCast(rawDataType) val cast = Cast(expression(ctx.primaryExpression), dataType) cast.setTagValue(Cast.USER_SPECIFIED_CAST, ()) @@ -3346,7 +3418,7 @@ class AstBuilder extends DataTypeAstBuilder * Create a String literal expression. 
*/ override def visitStringLiteral(ctx: StringLiteralContext): Literal = withOrigin(ctx) { - Literal.create(createString(ctx), conf.defaultStringType) + Literal.create(createString(ctx), StringType) } /** @@ -3422,7 +3494,7 @@ class AstBuilder extends DataTypeAstBuilder /** * Create an [[UnresolvedTableOrView]] from a multi-part identifier. */ - private def createUnresolvedTableOrView( + protected def createUnresolvedTableOrView( ctx: IdentifierReferenceContext, commandName: String, allowTempView: Boolean = true): LogicalPlan = withOrigin(ctx) { @@ -3797,6 +3869,19 @@ class AstBuilder extends DataTypeAstBuilder ctx.asScala.headOption.map(visitCommentSpec) } + protected def visitCollationSpecList( + ctx: java.util.List[CollationSpecContext]): Option[String] = { + ctx.asScala.headOption.map(visitCollationSpec) + } + + override def visitCollationSpec(ctx: CollationSpecContext): String = withOrigin(ctx) { + if (!SQLConf.get.objectLevelCollationsEnabled) { + throw QueryCompilationErrors.objectLevelCollationsNotEnabledError() + } + val collationName = ctx.identifier.getText + CollationFactory.fetchCollation(collationName).collationName + } + /** * Create a [[BucketSpec]]. */ @@ -3928,6 +4013,7 @@ class AstBuilder extends DataTypeAstBuilder * - options * - location * - comment + * - collation * - serde * - clusterBySpec * @@ -3936,8 +4022,8 @@ class AstBuilder extends DataTypeAstBuilder * types like `i INT`, which should be appended to the existing table schema. */ type TableClauses = ( - Seq[Transform], Seq[ColumnDefinition], Option[BucketSpec], Map[String, String], - OptionList, Option[String], Option[String], Option[SerdeInfo], Option[ClusterBySpec]) + Seq[Transform], Seq[ColumnDefinition], Option[BucketSpec], Map[String, String], OptionList, + Option[String], Option[String], Option[String], Option[SerdeInfo], Option[ClusterBySpec]) /** * Validate a create table statement and return the [[TableIdentifier]]. 
@@ -4224,6 +4310,10 @@ class AstBuilder extends DataTypeAstBuilder throw QueryParsingErrors.cannotCleanReservedTablePropertyError( PROP_EXTERNAL, ctx, "please use CREATE EXTERNAL TABLE") case (PROP_EXTERNAL, _) => false + case (PROP_COLLATION, _) if !legacyOn => + throw QueryParsingErrors.cannotCleanReservedTablePropertyError( + PROP_COLLATION, ctx, "please use the DEFAULT COLLATION clause to specify it") + case (PROP_COLLATION, _) => false // It's safe to set whatever table comment, so we don't make it a reserved table property. case (PROP_COMMENT, _) => true case (k, _) => @@ -4403,6 +4493,7 @@ class AstBuilder extends DataTypeAstBuilder checkDuplicateClauses(ctx.createFileFormat, "STORED AS/BY", ctx) checkDuplicateClauses(ctx.rowFormat, "ROW FORMAT", ctx) checkDuplicateClauses(ctx.commentSpec(), "COMMENT", ctx) + checkDuplicateClauses(ctx.collationSpec(), "DEFAULT COLLATION", ctx) checkDuplicateClauses(ctx.bucketSpec(), "CLUSTERED BY", ctx) checkDuplicateClauses(ctx.clusterBySpec(), "CLUSTER BY", ctx) checkDuplicateClauses(ctx.locationSpec, "LOCATION", ctx) @@ -4421,6 +4512,7 @@ class AstBuilder extends DataTypeAstBuilder val location = visitLocationSpecList(ctx.locationSpec()) val (cleanedOptions, newLocation) = cleanTableOptions(ctx, options, location) val comment = visitCommentSpecList(ctx.commentSpec()) + val collation = visitCollationSpecList(ctx.collationSpec()) val serdeInfo = getSerdeInfo(ctx.rowFormat.asScala.toSeq, ctx.createFileFormat.asScala.toSeq, ctx) val clusterBySpec = ctx.clusterBySpec().asScala.headOption.map(visitClusterBySpec) @@ -4435,7 +4527,7 @@ class AstBuilder extends DataTypeAstBuilder } (partTransforms, partCols, bucketSpec, cleanedProperties, cleanedOptions, newLocation, comment, - serdeInfo, clusterBySpec) + collation, serdeInfo, clusterBySpec) } protected def getSerdeInfo( @@ -4495,6 +4587,7 @@ class AstBuilder extends DataTypeAstBuilder * ] * [LOCATION path] * [COMMENT table_comment] + * [DEFAULT COLLATION collation_name] * 
[TBLPROPERTIES (property_name=property_value, ...)] * * partition_fields: @@ -4508,8 +4601,8 @@ class AstBuilder extends DataTypeAstBuilder val columns = Option(ctx.colDefinitionList()).map(visitColDefinitionList).getOrElse(Nil) val provider = Option(ctx.tableProvider).map(_.multipartIdentifier.getText) - val (partTransforms, partCols, bucketSpec, properties, options, location, - comment, serdeInfo, clusterBySpec) = visitCreateTableClauses(ctx.createTableClauses()) + val (partTransforms, partCols, bucketSpec, properties, options, location, comment, + collation, serdeInfo, clusterBySpec) = visitCreateTableClauses(ctx.createTableClauses()) if (provider.isDefined && serdeInfo.isDefined) { invalidStatement(s"CREATE TABLE ... USING ... ${serdeInfo.get.describe}", ctx) @@ -4527,7 +4620,7 @@ class AstBuilder extends DataTypeAstBuilder clusterBySpec.map(_.asTransform) val tableSpec = UnresolvedTableSpec(properties, provider, options, location, comment, - serdeInfo, external) + collation, serdeInfo, external) Option(ctx.query).map(plan) match { case Some(_) if columns.nonEmpty => @@ -4576,6 +4669,7 @@ class AstBuilder extends DataTypeAstBuilder * ] * [LOCATION path] * [COMMENT table_comment] + * [DEFAULT COLLATION collation_name] * [TBLPROPERTIES (property_name=property_value, ...)] * * partition_fields: @@ -4585,8 +4679,8 @@ class AstBuilder extends DataTypeAstBuilder */ override def visitReplaceTable(ctx: ReplaceTableContext): LogicalPlan = withOrigin(ctx) { val orCreate = ctx.replaceTableHeader().CREATE() != null - val (partTransforms, partCols, bucketSpec, properties, options, location, comment, serdeInfo, - clusterBySpec) = visitCreateTableClauses(ctx.createTableClauses()) + val (partTransforms, partCols, bucketSpec, properties, options, location, comment, collation, + serdeInfo, clusterBySpec) = visitCreateTableClauses(ctx.createTableClauses()) val columns = Option(ctx.colDefinitionList()).map(visitColDefinitionList).getOrElse(Nil) val provider = 
Option(ctx.tableProvider).map(_.multipartIdentifier.getText) @@ -4600,7 +4694,7 @@ class AstBuilder extends DataTypeAstBuilder clusterBySpec.map(_.asTransform) val tableSpec = UnresolvedTableSpec(properties, provider, options, location, comment, - serdeInfo, external = false) + collation, serdeInfo, external = false) Option(ctx.query).map(plan) match { case Some(_) if columns.nonEmpty => @@ -5006,6 +5100,21 @@ class AstBuilder extends DataTypeAstBuilder } } + /** + * Parse a [[AlterTableCollation]] command. + * + * For example: + * {{{ + * ALTER TABLE table1 DEFAULT COLLATION name + * }}} + */ + override def visitAlterTableCollation(ctx: AlterTableCollationContext): LogicalPlan = + withOrigin(ctx) { + val table = createUnresolvedTable( + ctx.identifierReference, "ALTER TABLE ... DEFAULT COLLATION") + AlterTableCollation(table, visitCollationSpec(ctx.collationSpec())) + } + /** * Parse [[SetViewProperties]] or [[SetTableProperties]] commands. * @@ -5089,36 +5198,6 @@ class AstBuilder extends DataTypeAstBuilder visitLocationSpec(ctx.locationSpec)) } - /** - * Create a [[DescribeColumn]] or [[DescribeRelation]] commands. - */ - override def visitDescribeRelation(ctx: DescribeRelationContext): LogicalPlan = withOrigin(ctx) { - val isExtended = ctx.EXTENDED != null || ctx.FORMATTED != null - val relation = createUnresolvedTableOrView(ctx.identifierReference, "DESCRIBE TABLE") - if (ctx.describeColName != null) { - if (ctx.partitionSpec != null) { - throw QueryParsingErrors.descColumnForPartitionUnsupportedError(ctx) - } else { - DescribeColumn( - relation, - UnresolvedAttribute(ctx.describeColName.nameParts.asScala.map(_.getText).toSeq), - isExtended) - } - } else { - val partitionSpec = if (ctx.partitionSpec != null) { - // According to the syntax, visitPartitionSpec returns `Map[String, Option[String]]`. 
- visitPartitionSpec(ctx.partitionSpec).map { - case (key, Some(value)) => key -> value - case (key, _) => - throw QueryParsingErrors.emptyPartitionKeyError(key, ctx.partitionSpec) - } - } else { - Map.empty[String, String] - } - DescribeRelation(relation, partitionSpec, isExtended) - } - } - /** * Create an [[AnalyzeTable]], or an [[AnalyzeColumn]]. * Example SQL for analyzing a table or a set of partitions : @@ -5911,18 +5990,6 @@ class AstBuilder extends DataTypeAstBuilder if (!SQLConf.get.getConf(SQLConf.OPERATOR_PIPE_SYNTAX_ENABLED)) { operationNotAllowed("Operator pipe SQL syntax using |>", ctx) } - // This helper function adds a table subquery boundary between the new operator to be added - // (such as a filter or sort) and the input plan if one does not already exist. This helps the - // analyzer behave as if we had added the corresponding SQL clause after a table subquery - // containing the input plan. - def withSubqueryAlias(): LogicalPlan = left match { - case s: SubqueryAlias => - s - case u: UnresolvedRelation => - u - case _ => - SubqueryAlias(SubqueryAlias.generateSubqueryName(), left) - } Option(ctx.selectClause).map { c => withSelectQuerySpecification( ctx = ctx, @@ -5952,11 +6019,21 @@ class AstBuilder extends DataTypeAstBuilder }.get val projectList: Seq[NamedExpression] = Seq(UnresolvedStar(None)) ++ extendExpressions Project(projectList, left) + }.getOrElse(Option(ctx.SET).map { _ => + visitOperatorPipeSet(ctx, left) + }.getOrElse(Option(ctx.DROP).map { _ => + val ids: Seq[String] = visitIdentifierSeq(ctx.identifierSeq()) + val projectList: Seq[NamedExpression] = + Seq(UnresolvedStarExceptOrReplace( + target = None, excepts = ids.map(s => Seq(s)), replacements = None)) + Project(projectList, left) + }.getOrElse(Option(ctx.AS).map { _ => + SubqueryAlias(ctx.errorCapturingIdentifier().getText, left) }.getOrElse(Option(ctx.whereClause).map { c => if (ctx.windowClause() != null) { throw 
QueryParsingErrors.windowClauseInPipeOperatorWhereClauseNotAllowedError(ctx) } - withWhereClause(c, withSubqueryAlias()) + withWhereClause(c, PipeOperator(left)) }.getOrElse(Option(ctx.pivotClause()).map { c => if (ctx.unpivotClause() != null) { throw QueryParsingErrors.unpivotWithPivotInFromClauseNotAllowedError(ctx) @@ -5975,10 +6052,50 @@ class AstBuilder extends DataTypeAstBuilder val all = Option(ctx.setQuantifier()).exists(_.ALL != null) visitSetOperationImpl(left, plan(ctx.right), all, c.getType) }.getOrElse(Option(ctx.queryOrganization).map { c => - withQueryResultClauses(c, withSubqueryAlias(), forPipeOperators = true) + withQueryResultClauses(c, PipeOperator(left), forPipeOperators = true) }.getOrElse( visitOperatorPipeAggregate(ctx, left) - ))))))))) + )))))))))))) + } + + private def visitOperatorPipeSet( + ctx: OperatorPipeRightSideContext, left: LogicalPlan): LogicalPlan = { + val (setIdentifiers: Seq[String], setTargets: Seq[Expression]) = + visitOperatorPipeSetAssignmentSeq(ctx.operatorPipeSetAssignmentSeq()) + var plan = left + setIdentifiers.zip(setTargets).foreach { + case (_, _: Alias) => + operationNotAllowed( + "SQL pipe syntax |> SET operator with an alias assigned with [AS] aliasName", ctx) + case (ident, target) => + // Add an UnresolvedStarExceptOrReplace to exclude the SET expression name from the relation + // and add the new SET expression to the projection list. + // Use a PipeSelect expression to make sure it does not contain any aggregate functions. + val replacement = + Alias(PipeExpression(target, isAggregate = false, PipeOperators.setClause), ident)() + val projectList: Seq[NamedExpression] = + Seq(UnresolvedStarExceptOrReplace( + target = None, excepts = Seq(Seq(ident)), replacements = Some(Seq(replacement)))) + // Add a projection to implement the SET operator using the UnresolvedStarExceptOrReplace + // expression. 
We do this once per SET assignment to allow for multiple SET assignments with + // optional lateral references to previous ones. + plan = Project(projectList, plan) + } + plan + } + + override def visitOperatorPipeSetAssignmentSeq( + ctx: OperatorPipeSetAssignmentSeqContext): (Seq[String], Seq[Expression]) = { + withOrigin(ctx) { + if (!ctx.DOT.isEmpty) { + operationNotAllowed( + s"SQL pipe syntax |> SET operator with multi-part assignment key " + + s"(only single-part keys are allowed)", ctx) + } + val setIdentifiers: Seq[String] = ctx.errorCapturingIdentifier().asScala.map(_.getText).toSeq + val setTargets: Seq[Expression] = ctx.expression().asScala.map(typedVisit[Expression]).toSeq + (setIdentifiers, setTargets) + } } private def visitOperatorPipeAggregate( @@ -5989,7 +6106,7 @@ class AstBuilder extends DataTypeAstBuilder "The AGGREGATE clause requires a list of aggregate expressions " + "or a list of grouping expressions, or both", ctx) } - // Visit each aggregate expression, and add a PipeAggregate expression on top of it to generate + // Visit each aggregate expression, and add a [[PipeExpression]] on top of it to generate // clear error messages if the expression does not contain at least one aggregate function. 
val aggregateExpressions: Seq[NamedExpression] = Option(ctx.namedExpressionSeq()).map { n: NamedExpressionSeqContext => @@ -6025,7 +6142,8 @@ class AstBuilder extends DataTypeAstBuilder Seq("GROUPING", "GROUPING_ID").foreach { name => if (f.nameParts.head.equalsIgnoreCase(name)) error(name) } - case _: WindowSpec => error("window functions") + case _: WindowSpec => error("window functions; please update the query to move " + + "the window functions to a subsequent |> SELECT operator instead") case _ => } e.children.foreach(visit) @@ -6034,12 +6152,28 @@ class AstBuilder extends DataTypeAstBuilder a.aggregateExpressions.foreach(visit) // Prepend grouping keys to the list of aggregate functions, since operator pipe AGGREGATE // clause returns the GROUP BY expressions followed by the list of aggregate functions. - val namedGroupingExpressions: Seq[NamedExpression] = - a.groupingExpressions.map { - case n: NamedExpression => n - case e: Expression => UnresolvedAlias(e, None) - } - a.copy(aggregateExpressions = namedGroupingExpressions ++ a.aggregateExpressions) + val newGroupingExpressions = ArrayBuffer.empty[Expression] + val newAggregateExpressions = ArrayBuffer.empty[NamedExpression] + a.groupingExpressions.foreach { + case n: NamedExpression => + newGroupingExpressions += n + newAggregateExpressions += n + // If the grouping expression is an integer literal, create [[UnresolvedOrdinal]] and + // [[UnresolvedPipeAggregateOrdinal]] expressions to represent it in the final grouping + // and aggregate expressions, respectively. This will let the + // [[ResolveOrdinalInOrderByAndGroupBy]] rule detect the ordinal in the aggregate list + // and replace it with the corresponding attribute from the child operator. 
+ case Literal(v: Int, IntegerType) if conf.groupByOrdinal => + newGroupingExpressions += UnresolvedOrdinal(newAggregateExpressions.length + 1) + newAggregateExpressions += UnresolvedAlias(UnresolvedPipeAggregateOrdinal(v), None) + case e: Expression => + newGroupingExpressions += e + newAggregateExpressions += UnresolvedAlias(e, None) + } + newAggregateExpressions.appendAll(a.aggregateExpressions) + a.copy( + groupingExpressions = newGroupingExpressions.toSeq, + aggregateExpressions = newAggregateExpressions.toSeq) } }.getOrElse { // This is a table aggregation with no grouping expressions. diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/ParserUtils.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/ParserUtils.scala index 8471c9f9dff13..1bc4f95f95daf 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/ParserUtils.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/ParserUtils.scala @@ -163,6 +163,14 @@ class SqlScriptingLabelContext { bl.multipartIdentifier().getText, el.multipartIdentifier().getText) } + case (Some(bl: BeginLabelContext), _) + if bl.multipartIdentifier().parts.size() > 1 => + withOrigin(bl) { + throw SqlScriptingErrors.labelCannotBeQualified( + CurrentOrigin.get, + bl.multipartIdentifier().getText.toLowerCase(Locale.ROOT) + ) + } case (None, Some(el: EndLabelContext)) => withOrigin(el) { throw SqlScriptingErrors.endLabelWithoutBeginLabel( diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/DescribeCommandSchema.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/DescribeCommandSchema.scala index 99d2ea7751959..a6ec6f5736300 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/DescribeCommandSchema.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/DescribeCommandSchema.scala @@ -21,13 +21,19 @@ import 
org.apache.spark.sql.catalyst.expressions.AttributeReference import org.apache.spark.sql.types.{MetadataBuilder, StringType} private[sql] object DescribeCommandSchema { - def describeTableAttributes(): Seq[AttributeReference] = Seq( - AttributeReference("col_name", StringType, nullable = false, - new MetadataBuilder().putString("comment", "name of the column").build())(), - AttributeReference("data_type", StringType, nullable = false, - new MetadataBuilder().putString("comment", "data type of the column").build())(), - AttributeReference("comment", StringType, nullable = true, - new MetadataBuilder().putString("comment", "comment of the column").build())()) + def describeJsonTableAttributes(): Seq[AttributeReference] = + Seq( + AttributeReference("json_metadata", StringType, nullable = false, + new MetadataBuilder().putString("comment", "JSON metadata of the table").build())() + ) + def describeTableAttributes(): Seq[AttributeReference] = { + Seq(AttributeReference("col_name", StringType, nullable = false, + new MetadataBuilder().putString("comment", "name of the column").build())(), + AttributeReference("data_type", StringType, nullable = false, + new MetadataBuilder().putString("comment", "data type of the column").build())(), + AttributeReference("comment", StringType, nullable = true, + new MetadataBuilder().putString("comment", "comment of the column").build())()) + } def describeColumnAttributes(): Seq[AttributeReference] = Seq( AttributeReference("info_name", StringType, nullable = false, diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/NormalizePlan.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/NormalizePlan.scala index 3b691f4f87778..13df749c6d584 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/NormalizePlan.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/NormalizePlan.scala @@ -17,14 +17,42 @@ package org.apache.spark.sql.catalyst.plans +import 
java.util.HashMap + import org.apache.spark.sql.catalyst.analysis.GetViewColumnByNameAndOrdinal import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.expressions.aggregate.AggregateExpression import org.apache.spark.sql.catalyst.plans.logical._ object NormalizePlan extends PredicateHelper { - def apply(plan: LogicalPlan): LogicalPlan = - normalizePlan(normalizeExprIds(plan)) + def apply(plan: LogicalPlan): LogicalPlan = { + val withNormalizedInheritAnalysis = normalizeInheritAnalysisRules(plan) + val withNormalizedExprIds = normalizeExprIds(withNormalizedInheritAnalysis) + normalizePlan(withNormalizedExprIds) + } + + /** + * Normalize [[InheritAnalysisRules]] nodes by replacing them with their replacement expressions. + * This is necessary because fixed-point analyzer may produce non-deterministic results when + * resolving original expressions. For example, in a query like: + * + * {{{ SELECT assert_true(1) }}} + * + * Before resolution, we have [[UnresolvedFunction]] whose child is Literal(1). This child will + * first be converted to Cast(Literal(1), BooleanType) by type coercion. Because in this case + * [[Cast]] doesn't require timezone, the expression will be implicitly resolved. Because the + * child of initially unresolved function is resolved, the function can be converted to + * [[AssertTrue]], which is of type [[InheritAnalysisRules]]. However, because the only child of + * [[InheritAnalysisRules]] is the replacement expression, the original expression will be lost + * timezone will never be applied. This causes inconsistencies, because fixed-point semantic is + * to ALWAYS apply timezone, regardless of whether or not the Cast actually needs it. 
+ */ + def normalizeInheritAnalysisRules(plan: LogicalPlan): LogicalPlan = { + plan transformAllExpressions { + case inheritAnalysisRules: InheritAnalysisRules => + inheritAnalysisRules.child + } + } /** * Since attribute references are given globally unique ids during analysis, @@ -68,8 +96,13 @@ object NormalizePlan extends PredicateHelper { * etc., will all now be equivalent. * - Sample the seed will replaced by 0L. * - Join conditions will be resorted by hashCode. + * - CTERelationDef ids will be rewritten using a monitonically increasing counter from 0. + * - CTERelationRef ids will be remapped based on the new CTERelationDef IDs. This is possible, + * because WithCTE returns cteDefs as first children, and the defs will be traversed before the + * refs. */ def normalizePlan(plan: LogicalPlan): LogicalPlan = { + val cteIdNormalizer = new CteIdNormalizer plan transform { case Filter(condition: Expression, child: LogicalPlan) => Filter( @@ -105,6 +138,19 @@ object NormalizePlan extends PredicateHelper { .asInstanceOf[Seq[NamedExpression]] Project(projList, child) case c: KeepAnalyzedQuery => c.storeAnalyzedQuery() + case localRelation: LocalRelation if !localRelation.data.isEmpty => + /** + * A substitute for the [[LocalRelation.data]]. [[GenericInternalRow]] is incomparable for + * maps, because [[ArrayBasedMapData]] doesn't define [[equals]]. + */ + val unsafeProjection = UnsafeProjection.create(localRelation.schema) + localRelation.copy(data = localRelation.data.map { row => + unsafeProjection(row) + }) + case cteRelationDef: CTERelationDef => + cteIdNormalizer.normalizeDef(cteRelationDef) + case cteRelationRef: CTERelationRef => + cteIdNormalizer.normalizeRef(cteRelationRef) } } @@ -125,3 +171,25 @@ object NormalizePlan extends PredicateHelper { case _ => condition // Don't reorder. 
} } + +class CteIdNormalizer { + private var cteIdCounter: Long = 0 + private val oldToNewIdMapping = new HashMap[Long, Long] + + def normalizeDef(cteRelationDef: CTERelationDef): CTERelationDef = { + try { + oldToNewIdMapping.put(cteRelationDef.id, cteIdCounter) + cteRelationDef.copy(id = cteIdCounter) + } finally { + cteIdCounter += 1 + } + } + + def normalizeRef(cteRelationRef: CTERelationRef): CTERelationRef = { + if (oldToNewIdMapping.containsKey(cteRelationRef.cteId)) { + cteRelationRef.copy(cteId = oldToNewIdMapping.get(cteRelationRef.cteId)) + } else { + cteRelationRef + } + } +} diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/QueryPlan.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/QueryPlan.scala index 23813d94c5495..07341f8ca1765 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/QueryPlan.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/QueryPlan.scala @@ -17,6 +17,7 @@ package org.apache.spark.sql.catalyst.plans +import java.lang.{Boolean => JBoolean} import java.util.IdentityHashMap import scala.collection.mutable @@ -32,7 +33,7 @@ import org.apache.spark.sql.catalyst.trees.TreePatternBits import org.apache.spark.sql.catalyst.types.DataTypeUtils import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types.{DataType, StructType} -import org.apache.spark.util.TransientLazy +import org.apache.spark.util.{BestEffortLazyVal, TransientBestEffortLazyVal} import org.apache.spark.util.collection.BitSet /** @@ -54,8 +55,9 @@ abstract class QueryPlan[PlanType <: QueryPlan[PlanType]] /** * Returns the set of attributes that are output by this node. 
*/ - @transient - lazy val outputSet: AttributeSet = AttributeSet(output) + def outputSet: AttributeSet = _outputSet() + + private val _outputSet = new TransientBestEffortLazyVal(() => AttributeSet(output)) /** * Returns the output ordering that this plan generates, although the semantics differ in logical @@ -97,16 +99,17 @@ abstract class QueryPlan[PlanType <: QueryPlan[PlanType]] */ def references: AttributeSet = _references() - private val _references = new TransientLazy({ - AttributeSet(expressions) -- producedAttributes - }) + private val _references = new TransientBestEffortLazyVal(() => + AttributeSet(expressions) -- producedAttributes) /** * Returns true when the all the expressions in the current node as well as all of its children * are deterministic */ - lazy val deterministic: Boolean = expressions.forall(_.deterministic) && - children.forall(_.deterministic) + def deterministic: Boolean = _deterministic() + + private val _deterministic = new BestEffortLazyVal[JBoolean](() => + expressions.forall(_.deterministic) && children.forall(_.deterministic)) /** * Attributes that are referenced by expressions but not provided by this node's children. @@ -280,7 +283,9 @@ abstract class QueryPlan[PlanType <: QueryPlan[PlanType]] } /** Returns all of the expressions present in this query plan operator. */ - final def expressions: Seq[Expression] = { + final def expressions: Seq[Expression] = _expressions() + + private val _expressions = new BestEffortLazyVal[Seq[Expression]](() => { // Recursively find all expressions from a traversable. 
def seqToExpressions(seq: Iterable[Any]): Iterable[Expression] = seq.flatMap { case e: Expression => e :: Nil @@ -294,7 +299,7 @@ abstract class QueryPlan[PlanType <: QueryPlan[PlanType]] case seq: Iterable[_] => seqToExpressions(seq) case other => Nil }.toSeq - } + }) /** * A variant of `transformUp`, which takes care of the case that the rule replaces a plan node @@ -427,7 +432,10 @@ abstract class QueryPlan[PlanType <: QueryPlan[PlanType]] } } - lazy val schema: StructType = DataTypeUtils.fromAttributes(output) + def schema: StructType = _schema() + + private val _schema = new BestEffortLazyVal[StructType](() => + DataTypeUtils.fromAttributes(output)) /** Returns the output schema in the tree format. */ def schemaString: String = schema.treeString @@ -480,11 +488,13 @@ abstract class QueryPlan[PlanType <: QueryPlan[PlanType]] /** * All the top-level subqueries of the current plan node. Nested subqueries are not included. */ - @transient lazy val subqueries: Seq[PlanType] = { + def subqueries: Seq[PlanType] = _subqueries() + + private val _subqueries = new TransientBestEffortLazyVal(() => expressions.filter(_.containsPattern(PLAN_EXPRESSION)).flatMap(_.collect { case e: PlanExpression[_] => e.plan.asInstanceOf[PlanType] }) - } + ) /** * All the subqueries of the current plan node and all its children. Nested subqueries are also @@ -620,7 +630,9 @@ abstract class QueryPlan[PlanType <: QueryPlan[PlanType]] * Plan nodes that require special canonicalization should override [[doCanonicalize()]]. * They should remove expressions cosmetic variations themselves. */ - @transient final lazy val canonicalized: PlanType = { + def canonicalized: PlanType = _canonicalized() + + private val _canonicalized = new TransientBestEffortLazyVal(() => { var plan = doCanonicalize() // If the plan has not been changed due to canonicalization, make a copy of it so we don't // mutate the original plan's _isCanonicalizedPlan flag. 
@@ -629,7 +641,7 @@ abstract class QueryPlan[PlanType <: QueryPlan[PlanType]] } plan._isCanonicalizedPlan = true plan - } + }) /** * Defines how the canonicalization should work for the current plan. @@ -724,6 +736,12 @@ object QueryPlan extends PredicateHelper { } else { ar.withExprId(ExprId(ordinal)) } + + // Top-level Alias is already handled by `QueryPlan#doCanonicalize`. For inner Alias, the id + // doesn't matter and we normalize it to 0 here. + case a: Alias => + Alias(a.child, a.name)( + ExprId(0), a.qualifier, a.explicitMetadata, a.nonInheritableMetadataKeys) }.canonicalized.asInstanceOf[T] } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/joinTypes.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/joinTypes.scala index 41bba99673a2b..9f8c62fe58408 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/joinTypes.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/joinTypes.scala @@ -159,3 +159,25 @@ sealed abstract class AsOfJoinDirection case object Forward extends AsOfJoinDirection case object Backward extends AsOfJoinDirection case object Nearest extends AsOfJoinDirection + +object LateralJoinType { + + val supported = Seq( + "inner", + "leftouter", "left", "left_outer", + "cross" + ) + + def apply(typ: String): JoinType = typ.toLowerCase(Locale.ROOT).replace("_", "") match { + case "inner" => Inner + case "leftouter" | "left" => LeftOuter + case "cross" => Cross + case _ => + throw new AnalysisException( + errorClass = "UNSUPPORTED_JOIN_TYPE", + messageParameters = Map( + "typ" -> typ, + "supported" -> supported.mkString("'", "', '", "'")) + ) + } +} diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/SqlScriptingLogicalPlans.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/SqlScriptingLogicalPlans.scala index e6018e5e57b9c..ad00a5216b4c9 100644 --- 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/SqlScriptingLogicalPlans.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/SqlScriptingLogicalPlans.scala @@ -62,16 +62,19 @@ case class SingleStatement(parsedPlan: LogicalPlan) * @param label Label set to CompoundBody by user or UUID otherwise. * It can be None in case when CompoundBody is not part of BeginEndCompoundBlock * for example when CompoundBody is inside loop or conditional block. + * @param isScope Flag indicating if the CompoundBody is a labeled scope. + * Scopes are used for grouping local variables and exception handlers. */ case class CompoundBody( collection: Seq[CompoundPlanStatement], - label: Option[String]) extends Command with CompoundPlanStatement { + label: Option[String], + isScope: Boolean) extends Command with CompoundPlanStatement { override def children: Seq[LogicalPlan] = collection override protected def withNewChildrenInternal( newChildren: IndexedSeq[LogicalPlan]): LogicalPlan = { - CompoundBody(newChildren.map(_.asInstanceOf[CompoundPlanStatement]), label) + CompoundBody(newChildren.map(_.asInstanceOf[CompoundPlanStatement]), label, isScope) } } @@ -267,3 +270,31 @@ case class LoopStatement( LoopStatement(newChildren(0).asInstanceOf[CompoundBody], label) } } + +/** + * Logical operator for FOR statement. + * @param query Query which is executed once, then it's result set is iterated on, row by row. + * @param variableName Name of variable which is used to access the current row during iteration. + * @param body Compound body is a collection of statements that are executed for each row in + * the result set of the query. + * @param label An optional label for the loop which is unique amongst all labels for statements + * within which the FOR statement is contained. + * If an end label is specified it must match the beginning label. + * The label can be used to LEAVE or ITERATE the loop. 
+ */ +case class ForStatement( + query: SingleStatement, + variableName: Option[String], + body: CompoundBody, + label: Option[String]) extends CompoundPlanStatement { + + override def output: Seq[Attribute] = Seq.empty + + override def children: Seq[LogicalPlan] = Seq(query, body) + + override protected def withNewChildrenInternal( + newChildren: IndexedSeq[LogicalPlan]): LogicalPlan = newChildren match { + case IndexedSeq(query: SingleStatement, body: CompoundBody) => + ForStatement(query, variableName, body, label) + } +} diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala index 0cb04064a6178..c1261f2b5fac5 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala @@ -376,10 +376,13 @@ case class Intersect( final override val nodePatterns: Seq[TreePattern] = Seq(INTERSECT) - override def output: Seq[Attribute] = - left.output.zip(right.output).map { case (leftAttr, rightAttr) => - leftAttr.withNullability(leftAttr.nullable && rightAttr.nullable) + override def output: Seq[Attribute] = { + if (conf.getConf(SQLConf.LAZY_SET_OPERATOR_OUTPUT)) { + lazyOutput + } else { + computeOutput() } + } override def metadataOutput: Seq[Attribute] = Nil @@ -396,6 +399,14 @@ case class Intersect( override protected def withNewChildrenInternal( newLeft: LogicalPlan, newRight: LogicalPlan): Intersect = copy(left = newLeft, right = newRight) + + private lazy val lazyOutput: Seq[Attribute] = computeOutput() + + /** We don't use right.output because those rows get excluded from the set. 
*/ + private def computeOutput(): Seq[Attribute] = + left.output.zip(right.output).map { case (leftAttr, rightAttr) => + leftAttr.withNullability(leftAttr.nullable && rightAttr.nullable) + } } case class Except( @@ -403,8 +414,14 @@ case class Except( right: LogicalPlan, isAll: Boolean) extends SetOperation(left, right) { override def nodeName: String = getClass.getSimpleName + ( if ( isAll ) " All" else "" ) - /** We don't use right.output because those rows get excluded from the set. */ - override def output: Seq[Attribute] = left.output + + override def output: Seq[Attribute] = { + if (conf.getConf(SQLConf.LAZY_SET_OPERATOR_OUTPUT)) { + lazyOutput + } else { + computeOutput() + } + } override def metadataOutput: Seq[Attribute] = Nil @@ -416,6 +433,11 @@ case class Except( override protected def withNewChildrenInternal( newLeft: LogicalPlan, newRight: LogicalPlan): Except = copy(left = newLeft, right = newRight) + + private lazy val lazyOutput: Seq[Attribute] = computeOutput() + + /** We don't use right.output because those rows get excluded from the set. */ + private def computeOutput(): Seq[Attribute] = left.output } /** Factory for constructing new `Union` nodes. 
*/ @@ -423,6 +445,21 @@ object Union { def apply(left: LogicalPlan, right: LogicalPlan): Union = { Union (left :: right :: Nil) } + + // updating nullability to make all the children consistent + def mergeChildOutputs(childOutputs: Seq[Seq[Attribute]]): Seq[Attribute] = { + childOutputs.transpose.map { attrs => + val firstAttr = attrs.head + val nullable = attrs.exists(_.nullable) + val newDt = attrs.map(_.dataType).reduce(StructType.unionLikeMerge) + if (firstAttr.dataType == newDt) { + firstAttr.withNullability(nullable) + } else { + AttributeReference(firstAttr.name, newDt, nullable, firstAttr.metadata)( + firstAttr.exprId, firstAttr.qualifier) + } + } + } } /** @@ -479,18 +516,11 @@ case class Union( AttributeSet.fromAttributeSets(children.map(_.outputSet)).size } - // updating nullability to make all the children consistent override def output: Seq[Attribute] = { - children.map(_.output).transpose.map { attrs => - val firstAttr = attrs.head - val nullable = attrs.exists(_.nullable) - val newDt = attrs.map(_.dataType).reduce(StructType.unionLikeMerge) - if (firstAttr.dataType == newDt) { - firstAttr.withNullability(nullable) - } else { - AttributeReference(firstAttr.name, newDt, nullable, firstAttr.metadata)( - firstAttr.exprId, firstAttr.qualifier) - } + if (conf.getConf(SQLConf.LAZY_SET_OPERATOR_OUTPUT)) { + lazyOutput + } else { + computeOutput() } } @@ -509,6 +539,10 @@ case class Union( children.length > 1 && !(byName || allowMissingCol) && childrenResolved && allChildrenCompatible } + private lazy val lazyOutput: Seq[Attribute] = computeOutput() + + private def computeOutput(): Seq[Attribute] = Union.mergeChildOutputs(children.map(_.output)) + /** * Maps the constraints containing a given (original) sequence of attributes to those with a * given (reference) sequence of attributes. Given the nature of union, we expect that the @@ -801,10 +835,12 @@ object View { * @param child The final query of this CTE. 
* @param cteRelations A sequence of pair (alias, the CTE definition) that this CTE defined * Each CTE can see the base tables and the previously defined CTEs only. + * @param allowRecursion A boolean flag if recursion is allowed. */ case class UnresolvedWith( child: LogicalPlan, - cteRelations: Seq[(String, SubqueryAlias)]) extends UnaryNode { + cteRelations: Seq[(String, SubqueryAlias)], + allowRecursion: Boolean = false) extends UnaryNode { final override val nodePatterns: Seq[TreePattern] = Seq(UNRESOLVED_WITH) override def output: Seq[Attribute] = child.output @@ -830,12 +866,17 @@ case class UnresolvedWith( * pushdown to help ensure rule idempotency. * @param underSubquery If true, it means we don't need to add a shuffle for this CTE relation as * subquery reuse will be applied to reuse CTE relation output. + * @param recursionAnchor A helper plan node that temporary stores the anchor term of recursive + * definitions. In the beginning of recursive resolution the `ResolveWithCTE` + * rule updates this parameter and once it is resolved the same rule resolves + * the recursive [[CTERelationRef]] references and removes this parameter. */ case class CTERelationDef( child: LogicalPlan, id: Long = CTERelationDef.newId, originalPlanWithPredicates: Option[(LogicalPlan, Seq[Expression])] = None, - underSubquery: Boolean = false) extends UnaryNode { + underSubquery: Boolean = false, + recursionAnchor: Option[LogicalPlan] = None) extends UnaryNode { final override val nodePatterns: Seq[TreePattern] = Seq(CTE) @@ -843,6 +884,13 @@ case class CTERelationDef( copy(child = newChild) override def output: Seq[Attribute] = if (resolved) child.output else Nil + + lazy val recursive: Boolean = child.exists{ + // if the reference is found inside the child, referencing to this CTE definition, + // and already marked as recursive, then this CTE definition is recursive. 
+ case CTERelationRef(this.id, _, _, _, _, true) => true + case _ => false + } } object CTERelationDef { @@ -859,13 +907,15 @@ object CTERelationDef { * de-duplication. * @param statsOpt The optional statistics inferred from the corresponding CTE * definition. + * @param recursive If this is a recursive reference. */ case class CTERelationRef( cteId: Long, _resolved: Boolean, override val output: Seq[Attribute], override val isStreaming: Boolean, - statsOpt: Option[Statistics] = None) extends LeafNode with MultiInstanceRelation { + statsOpt: Option[Statistics] = None, + recursive: Boolean = false) extends LeafNode with MultiInstanceRelation { final override val nodePatterns: Seq[TreePattern] = Seq(CTE) @@ -2014,6 +2064,9 @@ case class Deduplicate( } case class DeduplicateWithinWatermark(keys: Seq[Attribute], child: LogicalPlan) extends UnaryNode { + // Ensure that references include event time columns so they are not pruned away. + override def references: AttributeSet = AttributeSet(keys) ++ + AttributeSet(child.output.filter(_.metadata.contains(EventTimeWatermark.delayKey))) override def maxRows: Option[Long] = child.maxRows override def output: Seq[Attribute] = child.output final override val nodePatterns: Seq[TreePattern] = Seq(DISTINCT_LIKE) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/v2AlterTableCommands.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/v2AlterTableCommands.scala index 2f5d4b9c86e25..dbd2c0ba8e420 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/v2AlterTableCommands.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/v2AlterTableCommands.scala @@ -261,3 +261,15 @@ case class AlterTableClusterBy( protected def withNewChildInternal(newChild: LogicalPlan): LogicalPlan = copy(table = newChild) } + +/** + * The logical plan of the ALTER TABLE ... DEFAULT COLLATION name command. 
+ */ +case class AlterTableCollation( + table: LogicalPlan, collation: String) extends AlterTableCommand { + override def changes: Seq[TableChange] = { + Seq(TableChange.setProperty(TableCatalog.PROP_COLLATION, collation)) + } + + protected def withNewChildInternal(newChild: LogicalPlan): LogicalPlan = copy(table = newChild) +} diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/v2Commands.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/v2Commands.scala index b465e0e11612f..58c62a90225aa 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/v2Commands.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/v2Commands.scala @@ -21,8 +21,8 @@ import org.apache.spark.{SparkIllegalArgumentException, SparkUnsupportedOperatio import org.apache.spark.sql.AnalysisException import org.apache.spark.sql.catalyst.analysis.{AnalysisContext, AssignmentUtils, EliminateSubqueryAliases, FieldName, NamedRelation, PartitionSpec, ResolvedIdentifier, ResolvedProcedure, TypeCheckResult, UnresolvedException, UnresolvedProcedure, ViewSchemaMode} import org.apache.spark.sql.catalyst.analysis.TypeCheckResult.{DataTypeMismatch, TypeCheckSuccess} +import org.apache.spark.sql.catalyst.catalog.{FunctionResource, RoutineLanguage} import org.apache.spark.sql.catalyst.catalog.CatalogTypes.TablePartitionSpec -import org.apache.spark.sql.catalyst.catalog.FunctionResource import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference, AttributeSet, Expression, MetadataAttribute, NamedExpression, UnaryExpression, Unevaluable, V2ExpressionUtils} import org.apache.spark.sql.catalyst.plans.DescribeCommandSchema import org.apache.spark.sql.catalyst.trees.BinaryLike @@ -459,6 +459,12 @@ trait V2CreateTableAsSelectPlan newQuery: LogicalPlan): V2CreateTableAsSelectPlan } +/** + * A trait used for logical plan nodes that create V1 table definitions, + * and so that rules 
from the catalyst module can identify them. + */ +trait V1CreateTablePlan extends LogicalPlan + /** A trait used for logical plan nodes that create or replace V2 table definitions. */ trait V2CreateTablePlan extends LogicalPlan { def name: LogicalPlan @@ -1066,6 +1072,26 @@ case class CreateFunction( copy(child = newChild) } +/** + * The logical plan of the CREATE FUNCTION command for SQL Functions. + */ +case class CreateUserDefinedFunction( + child: LogicalPlan, + inputParamText: Option[String], + returnTypeText: String, + exprText: Option[String], + queryText: Option[String], + comment: Option[String], + isDeterministic: Option[Boolean], + containsSQL: Option[Boolean], + language: RoutineLanguage, + isTableFunc: Boolean, + ignoreIfExists: Boolean, + replace: Boolean) extends UnaryCommand { + override protected def withNewChildInternal(newChild: LogicalPlan): CreateUserDefinedFunction = + copy(child = newChild) +} + /** * The logical plan of the DROP FUNCTION command. */ @@ -1332,6 +1358,7 @@ case class CreateView( child: LogicalPlan, userSpecifiedColumns: Seq[(String, Option[String])], comment: Option[String], + collation: Option[String], properties: Map[String, String], originalText: Option[String], query: LogicalPlan, @@ -1480,6 +1507,7 @@ trait TableSpecBase { def provider: Option[String] def location: Option[String] def comment: Option[String] + def collation: Option[String] def serde: Option[SerdeInfo] def external: Boolean } @@ -1490,6 +1518,7 @@ case class UnresolvedTableSpec( optionExpression: OptionList, location: Option[String], comment: Option[String], + collation: Option[String], serde: Option[SerdeInfo], external: Boolean) extends UnaryExpression with Unevaluable with TableSpecBase { @@ -1535,10 +1564,11 @@ case class TableSpec( options: Map[String, String], location: Option[String], comment: Option[String], + collation: Option[String], serde: Option[SerdeInfo], external: Boolean) extends TableSpecBase { def withNewLocation(newLocation: 
Option[String]): TableSpec = { - TableSpec(properties, provider, options, newLocation, comment, serde, external) + TableSpec(properties, provider, options, newLocation, comment, collation, serde, external) } } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/physical/partitioning.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/physical/partitioning.scala index 30e223c3c3c87..6e19a1d6bbc8c 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/physical/partitioning.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/physical/partitioning.scala @@ -176,6 +176,13 @@ case class OrderedDistribution(ordering: Seq[SortOrder]) extends Distribution { override def createPartitioning(numPartitions: Int): Partitioning = { RangePartitioning(ordering, numPartitions) } + + def areAllClusterKeysMatched(expressions: Seq[Expression]): Boolean = { + expressions.length == ordering.length && + expressions.zip(ordering).forall { + case (x, o) => x.semanticEquals(o.child) + } + } } /** @@ -394,6 +401,9 @@ case class KeyGroupedPartitioning( } } + case o @ OrderedDistribution(_) if SQLConf.get.v2BucketingAllowSorting => + o.areAllClusterKeysMatched(expressions) + case _ => false } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/rules/RuleExecutor.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/rules/RuleExecutor.scala index 76d36fab2096a..bdbf698db2e01 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/rules/RuleExecutor.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/rules/RuleExecutor.scala @@ -22,7 +22,8 @@ import org.apache.spark.internal.{Logging, MessageWithContext} import org.apache.spark.internal.LogKeys._ import org.apache.spark.internal.MDC import org.apache.spark.sql.catalyst.QueryPlanningTracker -import org.apache.spark.sql.catalyst.trees.TreeNode +import 
org.apache.spark.sql.catalyst.rules.RuleExecutor.getForceIterationValue +import org.apache.spark.sql.catalyst.trees.{TreeNode, TreeNodeTag} import org.apache.spark.sql.catalyst.util.DateTimeConstants.NANOS_PER_MILLIS import org.apache.spark.sql.catalyst.util.sideBySide import org.apache.spark.sql.errors.QueryExecutionErrors @@ -30,6 +31,27 @@ import org.apache.spark.sql.internal.SQLConf import org.apache.spark.util.Utils object RuleExecutor { + + /** + * A tag used to explicitly request an additional iteration of the current batch during + * rule execution, even if the query plan remains unchanged. Increment the tag's value + * to enforce another iteration. + */ + private val FORCE_ADDITIONAL_ITERATION = TreeNodeTag[Int]("forceAdditionalIteration") + + /** + * Increments the value of the FORCE_ADDITIONAL_ITERATION tag on the given plan to + * explicitly force another iteration of the current batch during rule execution. + */ + def forceAdditionalIteration(plan: TreeNode[_]): Unit = { + val oldValue = getForceIterationValue(plan) + plan.setTagValue(FORCE_ADDITIONAL_ITERATION, oldValue + 1) + } + + private def getForceIterationValue(plan: TreeNode[_]): Int = { + plan.getTagValue(FORCE_ADDITIONAL_ITERATION).getOrElse(0) + } + protected val queryExecutionMeter = QueryExecutionMetering() /** Dump statistics about time spent running specific rules. 
*/ @@ -303,7 +325,7 @@ abstract class RuleExecutor[TreeType <: TreeNode[_]] extends Logging { continue = false } - if (curPlan.fastEquals(lastPlan)) { + if (isFixedPointReached(lastPlan, curPlan)) { logTrace( s"Fixed point reached for batch ${batch.name} after ${iteration - 1} iterations.") continue = false @@ -317,4 +339,9 @@ abstract class RuleExecutor[TreeType <: TreeNode[_]] extends Logging { curPlan } + + private def isFixedPointReached(oldPlan: TreeType, newPlan: TreeType): Boolean = { + oldPlan.fastEquals(newPlan) && + getForceIterationValue(newPlan) <= getForceIterationValue(oldPlan) + } } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/rules/RuleIdCollection.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/rules/RuleIdCollection.scala index 5ae2ca0d532b7..ee5245054bcca 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/rules/RuleIdCollection.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/rules/RuleIdCollection.scala @@ -51,6 +51,7 @@ object RuleIdCollection { "org.apache.spark.sql.catalyst.analysis.Analyzer$ResolveAggregateFunctions" :: "org.apache.spark.sql.catalyst.analysis.Analyzer$ResolveAliases" :: "org.apache.spark.sql.catalyst.analysis.Analyzer$ResolveBinaryArithmetic" :: + "org.apache.spark.sql.catalyst.analysis.ResolveCollationName" :: "org.apache.spark.sql.catalyst.analysis.Analyzer$ResolveDeserializer" :: "org.apache.spark.sql.catalyst.analysis.Analyzer$ResolveEncodersInUDF" :: "org.apache.spark.sql.catalyst.analysis.Analyzer$ResolveFunctions" :: @@ -107,6 +108,8 @@ object RuleIdCollection { "org.apache.spark.sql.catalyst.analysis.UpdateOuterReferences" :: "org.apache.spark.sql.catalyst.analysis.UpdateAttributeNullability" :: "org.apache.spark.sql.catalyst.analysis.ResolveUpdateEventTimeWatermarkColumn" :: + "org.apache.spark.sql.catalyst.expressions.EliminatePipeOperators" :: + "org.apache.spark.sql.catalyst.expressions.ValidateAndStripPipeExpressions" :: // 
Catalyst Optimizer rules "org.apache.spark.sql.catalyst.optimizer.BooleanSimplification" :: "org.apache.spark.sql.catalyst.optimizer.CollapseProject" :: diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/trees/TreePatterns.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/trees/TreePatterns.scala index 24b787054fb13..9856a26346f6a 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/trees/TreePatterns.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/trees/TreePatterns.scala @@ -63,6 +63,7 @@ object TreePattern extends Enumeration { val LAMBDA_VARIABLE: Value = Value val LATERAL_COLUMN_ALIAS_REFERENCE: Value = Value val LATERAL_SUBQUERY: Value = Value + val LAZY_EXPRESSION: Value = Value val LIKE_FAMLIY: Value = Value val LIST_SUBQUERY: Value = Value val LITERAL: Value = Value @@ -78,6 +79,8 @@ object TreePattern extends Enumeration { val OUTER_REFERENCE: Value = Value val PARAMETER: Value = Value val PARAMETERIZED_QUERY: Value = Value + val PIPE_EXPRESSION: Value = Value + val PIPE_OPERATOR: Value = Value val PIVOT: Value = Value val PLAN_EXPRESSION: Value = Value val PYTHON_UDF: Value = Value @@ -89,6 +92,9 @@ object TreePattern extends Enumeration { val SCALA_UDF: Value = Value val SESSION_WINDOW: Value = Value val SORT: Value = Value + val SQL_FUNCTION_EXPRESSION: Value = Value + val SQL_SCALAR_FUNCTION: Value = Value + val SQL_TABLE_FUNCTION: Value = Value val SUBQUERY_ALIAS: Value = Value val SUM: Value = Value val TIME_WINDOW: Value = Value @@ -145,6 +151,7 @@ object TreePattern extends Enumeration { // Unresolved expression patterns (Alphabetically ordered) val UNRESOLVED_ALIAS: Value = Value val UNRESOLVED_ATTRIBUTE: Value = Value + val UNRESOLVED_COLLATION: Value = Value val UNRESOLVED_DESERIALIZER: Value = Value val UNRESOLVED_DF_STAR: Value = Value val UNRESOLVED_HAVING: Value = Value @@ -153,8 +160,7 @@ object TreePattern extends Enumeration { val UNRESOLVED_FUNCTION: Value = 
Value val UNRESOLVED_HINT: Value = Value val UNRESOLVED_WINDOW_EXPRESSION: Value = Value - val UNRESOLVED_IDENTIFIER_WITH_CTE: Value = Value - val UNRESOLVED_OUTER_REFERENCE: Value = Value + val UNRESOLVED_PLAN_ID: Value = Value // Unresolved Plan patterns (Alphabetically ordered) val UNRESOLVED_FUNC: Value = Value @@ -169,8 +175,4 @@ object TreePattern extends Enumeration { // Execution Plan patterns (alphabetically ordered) val EXCHANGE: Value = Value - - // Lazy analysis expression patterns (alphabetically ordered) - val LAZY_ANALYSIS_EXPRESSION: Value = Value - val LAZY_OUTER_REFERENCE: Value = Value } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/CharVarcharUtils.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/CharVarcharUtils.scala index 628fdcebd3084..6ba7e528ea230 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/CharVarcharUtils.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/CharVarcharUtils.scala @@ -74,7 +74,7 @@ object CharVarcharUtils extends Logging with SparkCharVarcharUtils { def replaceCharVarcharWithStringForCast(dt: DataType): DataType = { if (SQLConf.get.charVarcharAsString) { replaceCharVarcharWithString(dt) - } else if (hasCharVarchar(dt)) { + } else if (hasCharVarchar(dt) && !SQLConf.get.preserveCharVarcharTypeInfo) { logWarning(log"The Spark cast operator does not support char/varchar type and simply treats" + log" them as string type. Please use string type directly to avoid confusion. 
Otherwise," + log" you can set ${MDC(CONFIG, SQLConf.LEGACY_CHAR_VARCHAR_AS_STRING.key)} " + @@ -164,7 +164,11 @@ object CharVarcharUtils extends Logging with SparkCharVarcharUtils { case CharType(length) if charFuncName.isDefined => StaticInvoke( classOf[CharVarcharCodegenUtils], - StringType, + if (SQLConf.get.preserveCharVarcharTypeInfo) { + CharType(length) + } else { + StringType + }, charFuncName.get, expr :: Literal(length) :: Nil, returnNullable = false) @@ -172,7 +176,11 @@ object CharVarcharUtils extends Logging with SparkCharVarcharUtils { case VarcharType(length) if varcharFuncName.isDefined => StaticInvoke( classOf[CharVarcharCodegenUtils], - StringType, + if (SQLConf.get.preserveCharVarcharTypeInfo) { + VarcharType(length) + } else { + StringType + }, varcharFuncName.get, expr :: Literal(length) :: Nil, returnNullable = false) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala index c9ca3ed864c16..1f741169898e9 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala @@ -667,33 +667,44 @@ object DateTimeUtils extends SparkDateTimeUtils { * @param zoneId The time zone ID at which the operation is performed. * @return A timestamp value, expressed in microseconds since 1970-01-01 00:00:00Z. 
*/ - def timestampAdd(unit: String, quantity: Int, micros: Long, zoneId: ZoneId): Long = { + def timestampAdd(unit: String, quantity: Long, micros: Long, zoneId: ZoneId): Long = { try { unit.toUpperCase(Locale.ROOT) match { case "MICROSECOND" => timestampAddInterval(micros, 0, 0, quantity, zoneId) case "MILLISECOND" => timestampAddInterval(micros, 0, 0, - Math.multiplyExact(quantity.toLong, MICROS_PER_MILLIS), zoneId) + Math.multiplyExact(quantity, MICROS_PER_MILLIS), zoneId) case "SECOND" => timestampAddInterval(micros, 0, 0, - Math.multiplyExact(quantity.toLong, MICROS_PER_SECOND), zoneId) + Math.multiplyExact(quantity, MICROS_PER_SECOND), zoneId) case "MINUTE" => timestampAddInterval(micros, 0, 0, - Math.multiplyExact(quantity.toLong, MICROS_PER_MINUTE), zoneId) + Math.multiplyExact(quantity, MICROS_PER_MINUTE), zoneId) case "HOUR" => timestampAddInterval(micros, 0, 0, - Math.multiplyExact(quantity.toLong, MICROS_PER_HOUR), zoneId) + Math.multiplyExact(quantity, MICROS_PER_HOUR), zoneId) case "DAY" | "DAYOFYEAR" => - timestampAddInterval(micros, 0, quantity, 0, zoneId) + // Given that more than `Int32.MaxValue` days will cause an `ArithmeticException` due to + // overflow, we can safely cast the quantity to an `Int` here. Same follows for larger + // unites. 
+ timestampAddInterval(micros, 0, Math.toIntExact(quantity), 0, zoneId) case "WEEK" => - timestampAddInterval(micros, 0, Math.multiplyExact(quantity, DAYS_PER_WEEK), 0, zoneId) + timestampAddInterval( + micros, + 0, + Math.multiplyExact(Math.toIntExact(quantity), DAYS_PER_WEEK), + 0, + zoneId) case "MONTH" => - timestampAddMonths(micros, quantity, zoneId) + timestampAddMonths(micros, Math.toIntExact(quantity), zoneId) case "QUARTER" => - timestampAddMonths(micros, Math.multiplyExact(quantity, 3), zoneId) + timestampAddMonths(micros, Math.multiplyExact(Math.toIntExact(quantity), 3), zoneId) case "YEAR" => - timestampAddMonths(micros, Math.multiplyExact(quantity, MONTHS_PER_YEAR), zoneId) + timestampAddMonths( + micros, + Math.multiplyExact(Math.toIntExact(quantity), MONTHS_PER_YEAR), + zoneId) } } catch { case _: scala.MatchError => diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/UnsafeRowUtils.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/UnsafeRowUtils.scala index 118dd92c3ed54..f2925314e2e2b 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/UnsafeRowUtils.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/UnsafeRowUtils.scala @@ -206,8 +206,7 @@ object UnsafeRowUtils { */ def isBinaryStable(dataType: DataType): Boolean = !dataType.existsRecursively { case st: StringType => - val collation = CollationFactory.fetchCollation(st.collationId) - (!collation.supportsBinaryEquality) + !st.supportsBinaryEquality case _ => false } } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/xml/StaxXmlParser.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/xml/StaxXmlParser.scala index 9a0528468842c..4b892da9db255 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/xml/StaxXmlParser.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/xml/StaxXmlParser.scala @@ -32,6 +32,8 @@ import 
scala.util.control.NonFatal import scala.xml.SAXException import org.apache.commons.lang3.exception.ExceptionUtils +import org.apache.hadoop.hdfs.BlockMissingException +import org.apache.hadoop.security.AccessControlException import org.apache.spark.{SparkIllegalArgumentException, SparkUpgradeException} import org.apache.spark.internal.Logging @@ -655,6 +657,10 @@ class XmlTokenizer( e) case NonFatal(e) => ExceptionUtils.getRootCause(e) match { + case _: AccessControlException | _: BlockMissingException => + reader.close() + reader = null + throw e case _: RuntimeException | _: IOException if options.ignoreCorruptFiles => logWarning( "Skipping the rest of" + diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/xml/XmlInferSchema.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/xml/XmlInferSchema.scala index 848e6ff45c5a2..ecde7c1715bd5 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/xml/XmlInferSchema.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/xml/XmlInferSchema.scala @@ -30,6 +30,9 @@ import scala.util.control.Exception._ import scala.util.control.NonFatal import scala.xml.SAXException +import org.apache.hadoop.hdfs.BlockMissingException +import org.apache.hadoop.security.AccessControlException + import org.apache.spark.SparkIllegalArgumentException import org.apache.spark.internal.Logging import org.apache.spark.rdd.RDD @@ -157,6 +160,7 @@ class XmlInferSchema(options: XmlOptions, caseSensitive: Boolean) logWarning("Skipped missing file", e) Some(StructType(Nil)) case e: FileNotFoundException if !options.ignoreMissingFiles => throw e + case e @ (_ : AccessControlException | _ : BlockMissingException) => throw e case e @ (_: IOException | _: RuntimeException) if options.ignoreCorruptFiles => logWarning("Skipped the rest of the content in the corrupted file", e) Some(StructType(Nil)) diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/connector/catalog/CatalogManager.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/connector/catalog/CatalogManager.scala index db94659b1033b..9b8584604d32f 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/connector/catalog/CatalogManager.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/connector/catalog/CatalogManager.scala @@ -85,9 +85,10 @@ class CatalogManager( * in the fallback configuration, spark.sql.sources.useV1SourceList */ private[sql] def v2SessionCatalog: CatalogPlugin = { - conf.getConf(SQLConf.V2_SESSION_CATALOG_IMPLEMENTATION).map { _ => - catalogs.getOrElseUpdate(SESSION_CATALOG_NAME, loadV2SessionCatalog()) - }.getOrElse(defaultSessionCatalog) + conf.getConf(SQLConf.V2_SESSION_CATALOG_IMPLEMENTATION) match { + case "builtin" => defaultSessionCatalog + case _ => catalogs.getOrElseUpdate(SESSION_CATALOG_NAME, loadV2SessionCatalog()) + } } private var _currentNamespace: Option[Array[String]] = None diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/connector/catalog/CatalogV2Util.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/connector/catalog/CatalogV2Util.scala index e1f114a6170a4..97cc263c56c5f 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/connector/catalog/CatalogV2Util.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/connector/catalog/CatalogV2Util.scala @@ -53,6 +53,7 @@ private[sql] object CatalogV2Util { */ val TABLE_RESERVED_PROPERTIES = Seq(TableCatalog.PROP_COMMENT, + TableCatalog.PROP_COLLATION, TableCatalog.PROP_LOCATION, TableCatalog.PROP_PROVIDER, TableCatalog.PROP_OWNER, @@ -459,7 +460,7 @@ private[sql] object CatalogV2Util { def convertTableProperties(t: TableSpec): Map[String, String] = { val props = convertTableProperties( t.properties, t.options, t.serde, t.location, t.comment, - t.provider, t.external) + t.collation, t.provider, t.external) withDefaultOwnership(props) } @@ -469,6 +470,7 @@ 
private[sql] object CatalogV2Util { serdeInfo: Option[SerdeInfo], location: Option[String], comment: Option[String], + collation: Option[String], provider: Option[String], external: Boolean = false): Map[String, String] = { properties ++ @@ -478,6 +480,7 @@ private[sql] object CatalogV2Util { (if (external) Some(TableCatalog.PROP_EXTERNAL -> "true") else None) ++ provider.map(TableCatalog.PROP_PROVIDER -> _) ++ comment.map(TableCatalog.PROP_COMMENT -> _) ++ + collation.map(TableCatalog.PROP_COLLATION -> _) ++ location.map(TableCatalog.PROP_LOCATION -> _) } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/connector/catalog/V1Table.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/connector/catalog/V1Table.scala index 4a5a607e8a8ae..570ab1338dbf2 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/connector/catalog/V1Table.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/connector/catalog/V1Table.scala @@ -85,6 +85,7 @@ private[sql] object V1Table { TableCatalog.OPTION_PREFIX + key -> value } ++ v1Table.provider.map(TableCatalog.PROP_PROVIDER -> _) ++ v1Table.comment.map(TableCatalog.PROP_COMMENT -> _) ++ + v1Table.collation.map(TableCatalog.PROP_COLLATION -> _) ++ v1Table.storage.locationUri.map { loc => TableCatalog.PROP_LOCATION -> CatalogUtils.URIToString(loc) } ++ diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryCompilationErrors.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryCompilationErrors.scala index 03471ae8a3da5..afae0565133b2 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryCompilationErrors.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryCompilationErrors.scala @@ -27,7 +27,7 @@ import org.apache.spark.sql.catalyst.{ExtendedAnalysisException, FunctionIdentif import org.apache.spark.sql.catalyst.analysis.{CannotReplaceMissingTableException, FunctionAlreadyExistsException, NamespaceAlreadyExistsException, 
NoSuchFunctionException, NoSuchNamespaceException, NoSuchPartitionException, NoSuchTableException, Star, TableAlreadyExistsException, UnresolvedRegex} import org.apache.spark.sql.catalyst.catalog.{CatalogTable, InvalidUDFClassException} import org.apache.spark.sql.catalyst.catalog.CatalogTypes.TablePartitionSpec -import org.apache.spark.sql.catalyst.expressions.{Alias, Attribute, AttributeReference, AttributeSet, CreateMap, CreateStruct, Expression, GroupingID, NamedExpression, SpecifiedWindowFrame, WindowFrame, WindowFunction, WindowSpecDefinition} +import org.apache.spark.sql.catalyst.expressions.{Alias, Attribute, AttributeReference, AttributeSet, CreateMap, CreateStruct, Expression, GroupingID, NamedExpression, SortOrder, SpecifiedWindowFrame, WindowFrame, WindowFunction, WindowSpecDefinition} import org.apache.spark.sql.catalyst.expressions.aggregate.AnyValue import org.apache.spark.sql.catalyst.plans.JoinType import org.apache.spark.sql.catalyst.plans.logical.{Assignment, InputParameter, Join, LogicalPlan, SerdeInfo, Window} @@ -351,6 +351,19 @@ private[sql] object QueryCompilationErrors extends QueryErrorsBase with Compilat ) } + def collatedStringsInMapKeysNotSupportedError(): Throwable = { + new AnalysisException( + errorClass = "UNSUPPORTED_FEATURE.COLLATIONS_IN_MAP_KEYS", + messageParameters = Map.empty) + } + + def objectLevelCollationsNotEnabledError(): Throwable = { + new AnalysisException( + errorClass = "UNSUPPORTED_FEATURE.OBJECT_LEVEL_COLLATIONS", + messageParameters = Map.empty + ) + } + def trimCollationNotEnabledError(): Throwable = { new AnalysisException( errorClass = "UNSUPPORTED_FEATURE.TRIM_COLLATION", @@ -725,28 +738,32 @@ private[sql] object QueryCompilationErrors extends QueryErrorsBase with Compilat "windowExpr" -> toSQLExpr(windowExpr))) } - def distinctInverseDistributionFunctionUnsupportedError(funcName: String): Throwable = { + def distinctWithOrderingFunctionUnsupportedError(funcName: String): Throwable = { new AnalysisException( 
- errorClass = "INVALID_INVERSE_DISTRIBUTION_FUNCTION.DISTINCT_UNSUPPORTED", - messageParameters = Map("funcName" -> toSQLId(funcName))) + errorClass = "INVALID_WITHIN_GROUP_EXPRESSION.DISTINCT_UNSUPPORTED", + messageParameters = Map("funcName" -> toSQLId(funcName)) + ) } - def inverseDistributionFunctionMissingWithinGroupError(funcName: String): Throwable = { + def functionMissingWithinGroupError(funcName: String): Throwable = { new AnalysisException( - errorClass = "INVALID_INVERSE_DISTRIBUTION_FUNCTION.WITHIN_GROUP_MISSING", - messageParameters = Map("funcName" -> toSQLId(funcName))) + errorClass = "INVALID_WITHIN_GROUP_EXPRESSION.WITHIN_GROUP_MISSING", + messageParameters = Map("funcName" -> toSQLId(funcName)) + ) } - def wrongNumOrderingsForInverseDistributionFunctionError( + def wrongNumOrderingsForFunctionError( funcName: String, validOrderingsNumber: Int, actualOrderingsNumber: Int): Throwable = { new AnalysisException( - errorClass = "INVALID_INVERSE_DISTRIBUTION_FUNCTION.WRONG_NUM_ORDERINGS", + errorClass = "INVALID_WITHIN_GROUP_EXPRESSION.WRONG_NUM_ORDERINGS", messageParameters = Map( "funcName" -> toSQLId(funcName), "expectedNum" -> validOrderingsNumber.toString, - "actualNum" -> actualOrderingsNumber.toString)) + "actualNum" -> actualOrderingsNumber.toString + ) + ) } def aliasNumberNotMatchColumnNumberError( @@ -1049,6 +1066,18 @@ private[sql] object QueryCompilationErrors extends QueryErrorsBase with Compilat "operation" -> operation)) } + def functionAndOrderExpressionMismatchError( + functionName: String, + functionArg: Expression, + orderExpr: Seq[SortOrder]): Throwable = { + new AnalysisException( + errorClass = "INVALID_WITHIN_GROUP_EXPRESSION.MISMATCH_WITH_DISTINCT_INPUT", + messageParameters = Map( + "funcName" -> toSQLId(functionName), + "funcArg" -> toSQLExpr(functionArg), + "orderingExpr" -> orderExpr.map(order => toSQLExpr(order.child)).mkString(", "))) + } + def wrongCommandForObjectTypeError( operation: String, requiredType: String, @@ 
-1593,6 +1622,10 @@ private[sql] object QueryCompilationErrors extends QueryErrorsBase with Compilat notSupportedForV2TablesError("ALTER TABLE ... SET [SERDE|SERDEPROPERTIES]") } + def describeAsJsonNotSupportedForV2TablesError(): Throwable = { + notSupportedForV2TablesError("DESCRIBE TABLE AS JSON") + } + def loadDataNotSupportedForV2TablesError(): Throwable = { notSupportedForV2TablesError("LOAD DATA") } @@ -2150,6 +2183,15 @@ private[sql] object QueryCompilationErrors extends QueryErrorsBase with Compilat "ability" -> ability)) } + def tableValuedArgumentsNotYetImplementedForSqlFunctions( + action: String, functionName: String): Throwable = { + new AnalysisException( + errorClass = "TABLE_VALUED_ARGUMENTS_NOT_YET_IMPLEMENTED_FOR_SQL_FUNCTIONS", + messageParameters = Map( + "action" -> action, + "functionName" -> functionName)) + } + def tableValuedFunctionTooManyTableArgumentsError(num: Int): Throwable = { new AnalysisException( errorClass = "TABLE_VALUED_FUNCTION_TOO_MANY_TABLE_ARGUMENTS", @@ -2638,12 +2680,12 @@ private[sql] object QueryCompilationErrors extends QueryErrorsBase with Compilat "comment" -> comment)) } - def invalidPartitionColumnKeyInTableError(key: String, tblName: String): Throwable = { + def invalidPartitionColumnKeyInTableError(key: String, tableName: String): Throwable = { new AnalysisException( - errorClass = "_LEGACY_ERROR_TEMP_1231", + errorClass = "PARTITIONS_NOT_FOUND", messageParameters = Map( - "key" -> key, - "tblName" -> toSQLId(tblName))) + "partitionList" -> toSQLId(key), + "tableName" -> toSQLId(tableName))) } def invalidPartitionSpecError( @@ -4114,6 +4156,83 @@ private[sql] object QueryCompilationErrors extends QueryErrorsBase with Compilat ) } + def unsupportedSinglePassAnalyzerFeature(feature: String): AnalysisException = { + new AnalysisException( + errorClass = "UNSUPPORTED_SINGLE_PASS_ANALYZER_FEATURE", + messageParameters = Map("feature" -> feature) + ) + } + + def ambiguousResolverExtension( + operator: LogicalPlan, + 
extensionNames: Seq[String]): AnalysisException = { + new AnalysisException( + errorClass = "AMBIGUOUS_RESOLVER_EXTENSION", + messageParameters = Map( + "operator" -> operator.getClass.getName, + "extensions" -> extensionNames.mkString(", ") + ) + ) + } + + def fixedPointFailedSinglePassSucceeded( + singlePassResult: LogicalPlan, + fixedPointException: Throwable): Throwable = { + new ExtendedAnalysisException( + new AnalysisException( + errorClass = "HYBRID_ANALYZER_EXCEPTION.FIXED_POINT_FAILED_SINGLE_PASS_SUCCEEDED", + messageParameters = Map("singlePassOutput" -> singlePassResult.toString), + cause = Some(fixedPointException) + ), + plan = singlePassResult + ) + } + + def hybridAnalyzerOutputSchemaComparisonMismatch( + fixedPointOutputSchema: StructType, + singlePassOutputSchema: StructType): Throwable = { + + def structToString(struct: StructType) = + struct.fields.map(structFieldToStringWithMetadata(_)).mkString(",") + + def structFieldToStringWithMetadata(sf: StructField) = + s"(${sf.name},${sf.dataType},${sf.nullable},${sf.metadata})" + + new AnalysisException( + errorClass = "HYBRID_ANALYZER_EXCEPTION.OUTPUT_SCHEMA_COMPARISON_MISMATCH", + messageParameters = Map( + "fixedPointOutputSchema" -> structToString(fixedPointOutputSchema), + "singlePassOutputSchema" -> structToString(singlePassOutputSchema) + ) + ) + } + + def hybridAnalyzerLogicalPlanComparisonMismatch( + fixedPointOutput: LogicalPlan, + singlePassOutput: LogicalPlan): Throwable = { + new AnalysisException( + errorClass = "HYBRID_ANALYZER_EXCEPTION.LOGICAL_PLAN_COMPARISON_MISMATCH", + messageParameters = Map( + "fixedPointOutput" -> fixedPointOutput.toString, + "singlePassOutput" -> singlePassOutput.toString + ) + ) + } + + def resolutionValidationError(cause: Throwable, plan: LogicalPlan): Throwable = { + new ExtendedAnalysisException( + new AnalysisException( + errorClass = "INTERNAL_ERROR", + cause = Some(cause), + messageParameters = Map( + "message" -> ("The analysis phase failed with an 
internal error. Reason: " + + cause.getMessage) + ) + ), + plan = plan + ) + } + def avroNotLoadedSqlFunctionsUnusable(functionName: String): Throwable = { new AnalysisException( errorClass = "AVRO_NOT_LOADED_SQL_FUNCTIONS_UNUSABLE", @@ -4172,4 +4291,44 @@ private[sql] object QueryCompilationErrors extends QueryErrorsBase with Compilat ) ) } + + def numColumnsMismatch( + operator: String, + firstNumColumns: Int, + invalidOrdinalNum: Int, + invalidNumColumns: Int, + origin: Origin): Throwable = { + new AnalysisException( + errorClass = "NUM_COLUMNS_MISMATCH", + messageParameters = Map( + "operator" -> toSQLStmt(operator), + "firstNumColumns" -> firstNumColumns.toString, + "invalidOrdinalNum" -> ordinalNumber(invalidOrdinalNum), + "invalidNumColumns" -> invalidNumColumns.toString + ), + origin = origin + ) + } + + def incompatibleColumnTypeError( + operator: String, + columnOrdinalNumber: Int, + tableOrdinalNumber: Int, + dataType1: DataType, + dataType2: DataType, + hint: String, + origin: Origin): Throwable = { + new AnalysisException( + errorClass = "INCOMPATIBLE_COLUMN_TYPE", + messageParameters = Map( + "operator" -> toSQLStmt(operator), + "columnOrdinalNumber" -> ordinalNumber(columnOrdinalNumber), + "tableOrdinalNumber" -> ordinalNumber(tableOrdinalNumber), + "dataType1" -> toSQLType(dataType1), + "dataType2" -> toSQLType(dataType2), + "hint" -> hint + ), + origin = origin + ) + } } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryExecutionErrors.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryExecutionErrors.scala index 0852e773c87b4..1ae2e5445c0c5 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryExecutionErrors.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryExecutionErrors.scala @@ -225,8 +225,7 @@ private[sql] object QueryExecutionErrors extends QueryErrorsBase with ExecutionE errorClass = "INVALID_ARRAY_INDEX", messageParameters = Map( "indexValue" -> 
toSQLValue(index, IntegerType), - "arraySize" -> toSQLValue(numElements, IntegerType), - "ansiConfig" -> toSQLConf(SQLConf.ANSI_ENABLED.key)), + "arraySize" -> toSQLValue(numElements, IntegerType)), context = getQueryContext(context), summary = getSummary(context)) } @@ -239,8 +238,7 @@ private[sql] object QueryExecutionErrors extends QueryErrorsBase with ExecutionE errorClass = "INVALID_ARRAY_INDEX_IN_ELEMENT_AT", messageParameters = Map( "indexValue" -> toSQLValue(index, IntegerType), - "arraySize" -> toSQLValue(numElements, IntegerType), - "ansiConfig" -> toSQLConf(SQLConf.ANSI_ENABLED.key)), + "arraySize" -> toSQLValue(numElements, IntegerType)), context = getQueryContext(context), summary = getSummary(context)) } @@ -267,12 +265,13 @@ private[sql] object QueryExecutionErrors extends QueryErrorsBase with ExecutionE summary = "") } - def ansiDateTimeParseError(e: Exception): SparkDateTimeException = { + def ansiDateTimeParseError(e: Exception, suggestedFunc: String): SparkDateTimeException = { new SparkDateTimeException( errorClass = "CANNOT_PARSE_TIMESTAMP", messageParameters = Map( "message" -> e.getMessage, - "ansiConfig" -> toSQLConf(SQLConf.ANSI_ENABLED.key)), + "func" -> toSQLId(suggestedFunc) + ), context = Array.empty, summary = "") } @@ -2475,11 +2474,11 @@ private[sql] object QueryExecutionErrors extends QueryErrorsBase with ExecutionE ) } - def timestampAddOverflowError(micros: Long, amount: Int, unit: String): ArithmeticException = { + def timestampAddOverflowError(micros: Long, amount: Long, unit: String): ArithmeticException = { new SparkArithmeticException( errorClass = "DATETIME_OVERFLOW", messageParameters = Map( - "operation" -> (s"add ${toSQLValue(amount, IntegerType)} $unit to " + + "operation" -> (s"add ${toSQLValue(amount, LongType)} $unit to " + s"${toSQLValue(DateTimeUtils.microsToInstant(micros), TimestampType)}")), context = Array.empty, summary = "") @@ -2602,6 +2601,14 @@ private[sql] object QueryExecutionErrors extends 
QueryErrorsBase with ExecutionE cause = null) } + def cannotFindBaseSnapshotCheckpoint(lineage: String): Throwable = { + new SparkException ( + errorClass = + "CANNOT_LOAD_STATE_STORE.CANNOT_FIND_BASE_SNAPSHOT_CHECKPOINT", + messageParameters = Map("lineage" -> lineage), + cause = null) + } + def unexpectedFileSize( dfsFile: Path, localFile: File, @@ -2779,6 +2786,16 @@ private[sql] object QueryExecutionErrors extends QueryErrorsBase with ExecutionE Map.empty ) + def invalidFileExtensionError(functionName: String, extension: String): RuntimeException = { + new SparkIllegalArgumentException( + errorClass = "INVALID_PARAMETER_VALUE.EXTENSION", + messageParameters = Map( + "functionName" -> toSQLId(functionName), + "parameter" -> toSQLId("extension"), + "fileExtension" -> toSQLId(extension), + "acceptable" -> "Extension is limited to exactly 3 letters (e.g. csv, tsv, etc...)")) + } + def invalidCharsetError(functionName: String, charset: String): RuntimeException = { new SparkIllegalArgumentException( errorClass = "INVALID_PARAMETER_VALUE.CHARSET", diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/SqlScriptingErrors.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/SqlScriptingErrors.scala index f1c07200d503b..da492cce22f2c 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/SqlScriptingErrors.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/SqlScriptingErrors.scala @@ -103,6 +103,14 @@ private[sql] object SqlScriptingErrors { messageParameters = Map("invalidStatement" -> toSQLStmt(stmt))) } + def positionalParametersAreNotSupportedWithSqlScripting(): Throwable = { + new SqlScriptingException( + origin = null, + errorClass = "UNSUPPORTED_FEATURE.SQL_SCRIPTING_WITH_POSITIONAL_PARAMETERS", + cause = null, + messageParameters = Map.empty) + } + def labelDoesNotExist( origin: Origin, labelName: String, @@ -125,4 +133,14 @@ private[sql] object SqlScriptingErrors { cause = null, messageParameters = 
Map("labelName" -> toSQLStmt(labelName))) } + + def labelCannotBeQualified( + origin: Origin, + labelName: String): Throwable = { + new SqlScriptingException( + origin = origin, + errorClass = "INVALID_LABEL_USAGE.QUALIFIED_LABEL_NAME", + cause = null, + messageParameters = Map("labelName" -> toSQLStmt(labelName))) + } } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala index 123759c6c8b80..727d54b6bbd2e 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala @@ -43,7 +43,7 @@ import org.apache.spark.sql.catalyst.analysis.{HintErrorLogger, Resolver} import org.apache.spark.sql.catalyst.expressions.CodegenObjectFactoryMode import org.apache.spark.sql.catalyst.expressions.codegen.CodeGenerator import org.apache.spark.sql.catalyst.plans.logical.HintErrorHandler -import org.apache.spark.sql.catalyst.util.{CollationFactory, DateTimeUtils} +import org.apache.spark.sql.catalyst.util.{CollationFactory, CollationNames, DateTimeUtils} import org.apache.spark.sql.connector.catalog.CatalogManager.SESSION_CATALOG_NAME import org.apache.spark.sql.errors.{QueryCompilationErrors, QueryExecutionErrors} import org.apache.spark.sql.types.{AtomicType, StringType, TimestampNTZType, TimestampType} @@ -247,6 +247,78 @@ object SQLConf { .intConf .createWithDefault(100) + val ANALYZER_SINGLE_PASS_RESOLVER_ENABLED = + buildConf("spark.sql.analyzer.singlePassResolver.enabled") + .internal() + .doc( + "When true, use the single-pass Resolver instead of the fixed-point Analyzer. " + + "This is an alternative Analyzer framework, which resolves the parsed logical plan in a " + + "single post-order traversal. It uses ExpressionResolver to resolve expressions and " + + "NameScope to control the visibility of names. 
In contrast to the current fixed-point " + + "framework, subsequent in-tree traversals are disallowed. Most of the fixed-point " + + "Analyzer code is reused in the form of specific node transformation functions " + + "(AliasResolution.resolve, FunctionResolution.resolveFunction, etc). " + + "This feature is currently under development." + ) + .version("4.0.0") + .booleanConf + .createWithDefault(false) + + val ANALYZER_DUAL_RUN_LEGACY_AND_SINGLE_PASS_RESOLVER = + buildConf("spark.sql.analyzer.singlePassResolver.dualRunWithLegacy") + .internal() + .doc( + "When true, run both analyzers to check if single-pass Analyzer correctly produces " + + "the same analyzed plan as the fixed-point Analyzer for the existing set of features " + + "defined in the ResolverGuard" + ) + .version("4.0.0") + .booleanConf + .createWithDefault(false) + + val ANALYZER_SINGLE_PASS_RESOLVER_VALIDATION_ENABLED = + buildConf("spark.sql.analyzer.singlePassResolver.validationEnabled") + .internal() + .doc( + "When true, validate the Resolver output with ResolutionValidator. " + + "The ResolutionValidator validates the resolved logical plan tree in one pass " + + "and asserts the internal contracts. It uses the ExpressionResolutionValidator " + + "internally to validate resolved expression trees in the same manner." + ) + .version("4.0.0") + .booleanConf + .createWithDefault(true) + + val ANALYZER_SINGLE_PASS_TRACK_RESOLVED_NODES_ENABLED = + buildConf("spark.sql.analyzer.singlePassResolver.trackResolvedNodes.enabled") + .internal() + .doc( + "When true, keep track of resolved nodes in order to assert that the single-pass " + + "invariant is never broken. While true, if a resolver attempts to resolve the same node " + + "twice, INTERNAL_ERROR exception is thrown. Used only for testing due to memory impact " + + "of storing each node in a HashSet." 
+ ) + .version("4.0.0") + .booleanConf + .createWithDefault(false) + + val ANALYZER_SINGLE_PASS_RESOLVER_RELATION_BRIDGING_ENABLED = + buildConf("spark.sql.analyzer.singlePassResolver.relationBridging.enabled") + .internal() + .doc( + "When set to true, the single-pass Resolver will reuse the relation metadata that was " + + "previously resolved in fixed-point run. This makes sense only in " + + "ANALYZER_DUAL_RUN_LEGACY_AND_SINGLE_PASS_RESOLVER mode. In that case HybridAnalyzer " + + "enables the AnalyzerBridgeState and passes it to the single-pass Analyzer after the " + + "fixed-point run is complete. Single-pass Resolver uses this AnalyzerBridgeState to " + + "construct a special RelationMetadataProvider implementation - " + + "BridgedRelationMetadataProvider. This component simply reuses cached relation metadata " + + "and avoids any blocking calls (catalog RPCs or table metadata reads)." + ) + .version("4.0.0") + .booleanConf + .createWithDefault(Utils.isTesting) + val MULTI_COMMUTATIVE_OP_OPT_THRESHOLD = buildConf("spark.sql.analyzer.canonicalization.multiCommutativeOpMemoryOptThreshold") .internal() @@ -328,6 +400,19 @@ object SQLConf { .booleanConf .createWithDefault(Utils.isTesting) + val EXPRESSION_TREE_CHANGE_LOG_LEVEL = buildConf("spark.sql.expressionTreeChangeLog.level") + .internal() + .doc("Configures the log level for logging the change from the unresolved expression tree to " + + "the resolved expression tree in the single-pass bottom-up Resolver. The value can be " + + "'trace', 'debug', 'info', 'warn', or 'error'. The default log level is 'trace'.") + .version("4.0.0") + .stringConf + .transform(_.toUpperCase(Locale.ROOT)) + .checkValue(logLevel => Set("TRACE", "DEBUG", "INFO", "WARN", "ERROR").contains(logLevel), + "Invalid value for 'spark.sql.expressionTreeChangeLog.level'. 
Valid values are " + + "'trace', 'debug', 'info', 'warn' and 'error'.") + .createWithDefault("trace") + val LIGHTWEIGHT_PLAN_CHANGE_VALIDATION = buildConf("spark.sql.lightweightPlanChangeValidation") .internal() .doc(s"Similar to ${PLAN_CHANGE_VALIDATION.key}, this validates plan changes and runs after " + @@ -600,7 +685,7 @@ object SQLConf { val AUTO_BROADCASTJOIN_THRESHOLD = buildConf("spark.sql.autoBroadcastJoinThreshold") .doc("Configures the maximum size in bytes for a table that will be broadcast to all worker " + - "nodes when performing a join. By setting this value to -1 broadcasting can be disabled.") + "nodes when performing a join. By setting this value to -1 broadcasting can be disabled.") .version("1.1.0") .bytesConf(ByteUnit.BYTE) .createWithDefaultString("10MB") @@ -616,7 +701,7 @@ object SQLConf { val LIMIT_INITIAL_NUM_PARTITIONS = buildConf("spark.sql.limit.initialNumPartitions") .internal() .doc("Initial number of partitions to try when executing a take on a query. Higher values " + - "lead to more partitions read. Lower values might lead to longer execution times as more" + + "lead to more partitions read. Lower values might lead to longer execution times as more " + "jobs will be run") .version("3.4.0") .intConf @@ -767,25 +852,35 @@ object SQLConf { .checkValue(_ > 0, "The initial number of partitions must be positive.") .createOptional - lazy val TRIM_COLLATION_ENABLED = - buildConf("spark.sql.collation.trim.enabled") + lazy val ALLOW_COLLATIONS_IN_MAP_KEYS = + buildConf("spark.sql.collation.allowInMapKeys") + .doc("Allow for non-UTF8_BINARY collated strings inside of map's keys") + .version("4.0.0") + .booleanConf + .createWithDefault(false) + + lazy val OBJECT_LEVEL_COLLATIONS_ENABLED = + buildConf("spark.sql.collation.objectLevel.enabled") .internal() .doc( - "Trim collation feature is under development and its use should be done under this" + - "feature flag. Trim collation trims trailing whitespaces from strings." 
+ "Object level collations feature is under development and its use should be done " + + "under this feature flag. The feature allows setting default collation for all " + + "underlying columns within that object, except the ones that were previously created." ) .version("4.0.0") .booleanConf .createWithDefault(Utils.isTesting) - val ALLOW_READING_UNKNOWN_COLLATIONS = - buildConf(SqlApiConfHelper.ALLOW_READING_UNKNOWN_COLLATIONS) + lazy val TRIM_COLLATION_ENABLED = + buildConf("spark.sql.collation.trim.enabled") .internal() - .doc("Enables spark to read unknown collation name as UTF8_BINARY. If the config is " + - "not enabled, when spark encounters an unknown collation name, it will throw an error.") + .doc( + "Trim collation feature is under development and its use should be done under this" + + "feature flag. Trim collation trims trailing whitespaces from strings." + ) .version("4.0.0") .booleanConf - .createWithDefault(false) + .createWithDefault(Utils.isTesting) val DEFAULT_COLLATION = buildConf(SqlApiConfHelper.DEFAULT_COLLATION) @@ -1017,8 +1112,8 @@ object SQLConf { val FILE_COMPRESSION_FACTOR = buildConf("spark.sql.sources.fileCompressionFactor") .internal() .doc("When estimating the output data size of a table scan, multiply the file size with this " + - "factor as the estimated data size, in case the data is compressed in the file and lead to" + - " a heavily underestimated result.") + "factor as the estimated data size, in case the data is compressed in the file and lead to " + + "a heavily underestimated result.") .version("2.3.1") .doubleConf .checkValue(_ > 0, "the value of fileCompressionFactor must be greater than 0") @@ -1270,7 +1365,7 @@ object SQLConf { val ORC_COMPRESSION = buildConf("spark.sql.orc.compression.codec") .doc("Sets the compression codec used when writing ORC files. 
If either `compression` or " + "`orc.compress` is specified in the table-specific options/properties, the precedence " + - "would be `compression`, `orc.compress`, `spark.sql.orc.compression.codec`." + + "would be `compression`, `orc.compress`, `spark.sql.orc.compression.codec`. " + "Acceptable values include: none, uncompressed, snappy, zlib, lzo, zstd, lz4, brotli.") .version("2.3.0") .stringConf @@ -1441,7 +1536,7 @@ object SQLConf { "to produce the partition columns instead of table scans. It applies when all the columns " + "scanned are partition columns and the query has an aggregate operator that satisfies " + "distinct semantics. By default the optimization is disabled, and deprecated as of Spark " + - "3.0 since it may return incorrect results when the files are empty, see also SPARK-26709." + + "3.0 since it may return incorrect results when the files are empty, see also SPARK-26709. " + "It will be removed in the future releases. If you must use, use 'SparkSessionExtensions' " + "instead to inject it as a custom rule.") .version("2.1.1") @@ -1638,7 +1733,7 @@ object SQLConf { val V2_BUCKETING_SHUFFLE_ENABLED = buildConf("spark.sql.sources.v2.bucketing.shuffle.enabled") - .doc("During a storage-partitioned join, whether to allow to shuffle only one side." + + .doc("During a storage-partitioned join, whether to allow to shuffle only one side. " + "When only one side is KeyGroupedPartitioning, if the conditions are met, spark will " + "only shuffle the other side. This optimization will reduce the amount of data that " + s"needs to be shuffle. 
This config requires ${V2_BUCKETING_ENABLED.key} to be enabled") @@ -1648,9 +1743,9 @@ object SQLConf { val V2_BUCKETING_ALLOW_JOIN_KEYS_SUBSET_OF_PARTITION_KEYS = buildConf("spark.sql.sources.v2.bucketing.allowJoinKeysSubsetOfPartitionKeys.enabled") - .doc("Whether to allow storage-partition join in the case where join keys are" + + .doc("Whether to allow storage-partition join in the case where join keys are " + "a subset of the partition keys of the source tables. At planning time, " + - "Spark will group the partitions by only those keys that are in the join keys." + + "Spark will group the partitions by only those keys that are in the join keys. " + s"This is currently enabled only if ${REQUIRE_ALL_CLUSTER_KEYS_FOR_DISTRIBUTION.key} " + "is false." ) @@ -1681,6 +1776,16 @@ object SQLConf { .booleanConf .createWithDefault(false) + val V2_BUCKETING_SORTING_ENABLED = + buildConf("spark.sql.sources.v2.bucketing.sorting.enabled") + .doc(s"When turned on, Spark will recognize the specific distribution reported by " + + s"a V2 data source through SupportsReportPartitioning, and will try to avoid a shuffle " + + s"if possible when sorting by those columns. This config requires " + + s"${V2_BUCKETING_ENABLED.key} to be enabled.") + .version("4.0.0") + .booleanConf + .createWithDefault(false) + val BUCKETING_MAX_BUCKETS = buildConf("spark.sql.sources.bucketing.maxBuckets") .doc("The maximum number of buckets allowed.") .version("2.4.0") @@ -1978,7 +2083,7 @@ object SQLConf { val WHOLESTAGE_BROADCAST_CLEANED_SOURCE_THRESHOLD = buildConf("spark.sql.codegen.broadcastCleanedSourceThreshold") .internal() - .doc("A threshold (in string length) to determine if we should make the generated code a" + + .doc("A threshold (in string length) to determine if we should make the generated code a " + "broadcast variable in whole stage codegen. To disable this, set the threshold to < 0; " + "otherwise if the size is above the threshold, it'll use broadcast variable. 
Note that " + "maximum string length allowed in Java is Integer.MAX_VALUE, so anything above it would " + @@ -2230,6 +2335,17 @@ object SQLConf { .intConf .createWithDefault(1) + val STREAMING_STATE_STORE_ENCODING_FORMAT = + buildConf("spark.sql.streaming.stateStore.encodingFormat") + .doc("The encoding format used for stateful operators to store information " + + "in the state store") + .version("4.0.0") + .stringConf + .transform(_.toLowerCase(Locale.ROOT)) + .checkValue(v => Set("unsaferow", "avro").contains(v), + "Valid values are 'unsaferow' and 'avro'") + .createWithDefault("unsaferow") + val STATE_STORE_COMPRESSION_CODEC = buildConf("spark.sql.streaming.stateStore.compression.codec") .internal() @@ -3273,6 +3389,24 @@ object SQLConf { .booleanConf .createWithDefault(false) + val PYTHON_UDF_MAX_RECORDS_PER_BATCH = + buildConf("spark.sql.execution.python.udf.maxRecordsPerBatch") + .doc("When using Python UDFs, limit the maximum number of records that can be batched " + + "for serialization/deserialization.") + .version("4.0.0") + .intConf + .checkValue(_ > 0, "The value of spark.sql.execution.python.udf.maxRecordsPerBatch " + + "must be positive.") + .createWithDefault(100) + + val PYTHON_UDF_BUFFER_SIZE = + buildConf("spark.sql.execution.python.udf.buffer.size") + .doc( + s"Same as `${BUFFER_SIZE.key}` but only applies to Python UDF executions. If it is not " + + s"set, the fallback is `${BUFFER_SIZE.key}`.") + .version("4.0.0") + .fallbackConf(BUFFER_SIZE) + val PANDAS_UDF_BUFFER_SIZE = buildConf("spark.sql.execution.pandas.udf.buffer.size") .doc( @@ -3287,7 +3421,7 @@ object SQLConf { buildConf("spark.sql.execution.pandas.structHandlingMode") .doc( "The conversion mode of struct type when creating pandas DataFrame. " + - "When \"legacy\"," + + "When \"legacy\", " + "1. when Arrow optimization is disabled, convert to Row object, " + "2. when Arrow optimization is enabled, convert to dict or raise an Exception " + "if there are duplicated nested field names. 
" + @@ -3319,6 +3453,17 @@ object SQLConf { .booleanConf .createWithDefault(false) + val PYTHON_UDF_ARROW_CONCURRENCY_LEVEL = + buildConf("spark.sql.execution.pythonUDF.arrow.concurrency.level") + .doc("The level of concurrency to execute Arrow-optimized Python UDF. " + + "This can be useful if Python UDFs use I/O intensively.") + .version("4.0.0") + .intConf + .checkValue(_ > 1, + "The value of spark.sql.execution.pythonUDF.arrow.concurrency.level" + + " must be more than one.") + .createOptional + val PYTHON_TABLE_UDF_ARROW_ENABLED = buildConf("spark.sql.execution.pythonUDTF.arrow.enabled") .doc("Enable Arrow optimization for Python UDTFs.") @@ -3364,7 +3509,7 @@ object SQLConf { buildConf("spark.sql.execution.pyspark.python") .internal() .doc("Python binary executable to use for PySpark in executors when running Python " + - "UDF, pandas UDF and pandas function APIs." + + "UDF, pandas UDF and pandas function APIs. " + "If not set, it falls back to 'spark.pyspark.python' by default.") .version("3.5.0") .stringConf @@ -3593,7 +3738,7 @@ object SQLConf { val ANSI_ENABLED = buildConf(SqlApiConfHelper.ANSI_ENABLED_KEY) .doc("When true, Spark SQL uses an ANSI compliant dialect instead of being Hive compliant. " + "For example, Spark will throw an exception at runtime instead of returning null results " + - "when the inputs to a SQL operator/function are invalid." + + "when the inputs to a SQL operator/function are invalid. " + "For full details of this dialect, you can find them in the section \"ANSI Compliance\" of " + "Spark's documentation. Some ANSI dialect features may be not from the ANSI SQL " + "standard directly, but their behaviors align with ANSI SQL's style") @@ -3684,7 +3829,7 @@ object SQLConf { .internal() .doc("When true, use the common expression ID for the alias when rewriting With " + "expressions. Otherwise, use the index of the common expression definition. 
When true " + - "this avoids duplicate alias names, but is helpful to set to false for testing to ensure" + + "this avoids duplicate alias names, but is helpful to set to false for testing to ensure " + "that alias names are consistent.") .version("4.0.0") .booleanConf @@ -3979,7 +4124,7 @@ object SQLConf { .createWithDefault(true) val ARTIFACTS_SESSION_ISOLATION_ALWAYS_APPLY_CLASSLOADER = - buildConf("spark.sql.artifact.isolation.always.apply.classloader") + buildConf("spark.sql.artifact.isolation.alwaysApplyClassloader") .internal() .doc("When enabled, the classloader holding per-session artifacts will always be applied " + "during SQL executions (useful for Spark Connect). When disabled, the classloader will " + @@ -4146,7 +4291,7 @@ object SQLConf { val LEGACY_ALLOW_UNTYPED_SCALA_UDF = buildConf("spark.sql.legacy.allowUntypedScalaUDF") .internal() - .doc("When set to true, user is allowed to use org.apache.spark.sql.functions." + + .doc("When set to true, user is allowed to use org.apache.spark.sql.functions. " + "udf(f: AnyRef, dataType: DataType). Otherwise, an exception will be thrown at runtime.") .version("3.0.0") .booleanConf @@ -4183,7 +4328,7 @@ object SQLConf { val MAX_TO_STRING_FIELDS = buildConf("spark.sql.debug.maxToStringFields") .doc("Maximum number of fields of sequence-like entries can be converted to strings " + - "in debug output. Any elements beyond the limit will be dropped and replaced by a" + + "in debug output. Any elements beyond the limit will be dropped and replaced by a " + """ "... N more fields" placeholder.""") .version("3.0.0") .intConf @@ -4289,10 +4434,16 @@ object SQLConf { s"the $SESSION_CATALOG_NAME and must be consistent with it; for example, if a table can " + s"be loaded by the $SESSION_CATALOG_NAME, this catalog must also return the table " + s"metadata. To delegate operations to the $SESSION_CATALOG_NAME, implementations can " + - "extend 'CatalogExtension'.") + "extend 'CatalogExtension'. 
The value should be either 'builtin' which represents the " + "spark's built-in V2SessionCatalog, or a fully qualified class name of the catalog " + "implementation.") .version("3.0.0") .stringConf - .createOptional + .transform { + case builtin if builtin.equalsIgnoreCase("builtin") => "builtin" + case fullClassName => fullClassName + } + .createWithDefault("builtin") object MapKeyDedupPolicy extends Enumeration { val EXCEPTION, LAST_WIN = Value @@ -4319,7 +4470,7 @@ object SQLConf { val LEGACY_CTE_PRECEDENCE_POLICY = buildConf("spark.sql.legacy.ctePrecedencePolicy") .internal() .doc("When LEGACY, outer CTE definitions takes precedence over inner definitions. If set to " + - "EXCEPTION, AnalysisException is thrown while name conflict is detected in nested CTE." + + "EXCEPTION, AnalysisException is thrown while name conflict is detected in nested CTE. " + "The default is CORRECTED, inner CTE definitions take precedence. This config " + "will be removed in future versions and CORRECTED will be the only behavior.") .version("3.0.0") @@ -4526,6 +4677,40 @@ object SQLConf { .booleanConf .createWithDefault(false) + val VARIANT_ALLOW_READING_SHREDDED = + buildConf("spark.sql.variant.allowReadingShredded") + .internal() + .doc("When true, the Parquet reader is allowed to read shredded or unshredded variant. " + + "When false, it only reads unshredded variant.") + .version("4.0.0") + .booleanConf + .createWithDefault(true) + + val PUSH_VARIANT_INTO_SCAN = + buildConf("spark.sql.variant.pushVariantIntoScan") + .internal() + .doc("When true, replace variant type in the scan schema with a struct containing " + + "requested fields.") + .version("4.0.0") + .booleanConf + .createWithDefault(false) + + val VARIANT_WRITE_SHREDDING_ENABLED = + buildConf("spark.sql.variant.writeShredding.enabled") + .internal() + .doc("When true, the Parquet writer is allowed to write shredded variant. 
") + .version("4.0.0") + .booleanConf + .createWithDefault(false) + + val VARIANT_FORCE_SHREDDING_SCHEMA_FOR_TEST = + buildConf("spark.sql.variant.forceShreddingSchemaForTest") + .internal() + .doc("FOR INTERNAL TESTING ONLY. Sets shredding schema for Variant.") + .version("4.0.0") + .stringConf + .createWithDefault("") + val LEGACY_CSV_ENABLE_DATE_TIME_PARSING_FALLBACK = buildConf("spark.sql.legacy.csv.enableDateTimeParsingFallback") .internal() @@ -4729,7 +4914,7 @@ object SQLConf { .doc("When true, NULL-aware anti join execution will be planed into " + "BroadcastHashJoinExec with flag isNullAwareAntiJoin enabled, " + "optimized from O(M*N) calculation into O(M) calculation " + - "using Hash lookup instead of Looping lookup." + + "using Hash lookup instead of Looping lookup. " + "Only support for singleColumn NAAJ for now.") .version("3.1.0") .booleanConf @@ -4805,6 +4990,14 @@ object SQLConf { .booleanConf .createWithDefault(false) + val PRESERVE_CHAR_VARCHAR_TYPE_INFO = buildConf("spark.sql.preserveCharVarcharTypeInfo") + .doc("When true, Spark does not replace CHAR/VARCHAR types the STRING type, which is the " + + "default behavior of Spark 3.0 and earlier versions. This means the length checks for " + + "CHAR/VARCHAR types is enforced and CHAR type is also properly padded.") + .version("4.0.0") + .booleanConf + .createWithDefault(false) + val READ_SIDE_CHAR_PADDING = buildConf("spark.sql.readSideCharPadding") .doc("When true, Spark applies string padding when reading CHAR type columns/fields, " + "in addition to the write-side padding. 
This config is true by default to better enforce " + @@ -5093,7 +5286,7 @@ object SQLConf { "the sequence of steps that the query performs in a composable fashion.") .version("4.0.0") .booleanConf - .createWithDefault(Utils.isTesting) + .createWithDefault(true) val LEGACY_PERCENTILE_DISC_CALCULATION = buildConf("spark.sql.legacy.percentileDiscCalculation") .internal() @@ -5121,7 +5314,7 @@ object SQLConf { buildConf("spark.sql.legacy.raiseErrorWithoutErrorClass") .internal() .doc("When set to true, restores the legacy behavior of `raise_error` and `assert_true` to " + - "not return the `[USER_RAISED_EXCEPTION]` prefix." + + "not return the `[USER_RAISED_EXCEPTION]` prefix. " + "For example, `raise_error('error!')` returns `error!` instead of " + "`[USER_RAISED_EXCEPTION] Error!`.") .version("4.0.0") @@ -5179,7 +5372,7 @@ object SQLConf { .internal() .doc("When set to true, datetime formatter used for csv, json and xml " + "will support zone offsets that have seconds in it. e.g. LA timezone offset prior to 1883" + - "was -07:52:58. When this flag is not set we lose seconds information." ) + " was -07:52:58. When this flag is not set we lose seconds information." ) .version("4.0.0") .booleanConf .createWithDefault(true) @@ -5260,7 +5453,7 @@ object SQLConf { val LEGACY_BANG_EQUALS_NOT = buildConf("spark.sql.legacy.bangEqualsNot") .internal() .doc("When set to true, '!' is a lexical equivalent for 'NOT'. That is '!' can be used " + - "outside of the documented prefix usage in a logical expression." + + "outside of the documented prefix usage in a logical expression. " + "Examples are: `expr ! IN (1, 2)` and `expr ! BETWEEN 1 AND 2`, but also `IF ! EXISTS`." ) .version("4.0.0") @@ -5277,6 +5470,19 @@ object SQLConf { .booleanConf .createWithDefault(true) + val LAZY_SET_OPERATOR_OUTPUT = buildConf("spark.sql.lazySetOperatorOutput.enabled") + .internal() + .doc( + "When set to true, Except/Intersect/Union operator's output will be a lazy val. 
It " + + "is a performance optimization for querires with a large number of stacked set operators. " + + "This is because of rules like WidenSetOperationTypes that traverse the logical plan tree " + + "and call output on each Except/Intersect/Union node. Such traversal has quadratic " + + "complexity: O(number_of_nodes * (1 + 2 + 3 + ... + number_of_nodes))." + ) + .version("4.0.0") + .booleanConf + .createWithDefault(true) + /** * Holds information about keys that have been deprecated. * @@ -5369,7 +5575,7 @@ object SQLConf { RemovedConfig("spark.sql.legacy.compareDateTimestampInTimestamp", "3.0.0", "true", "It was removed to prevent errors like SPARK-23549 for non-default value."), RemovedConfig("spark.sql.parquet.int64AsTimestampMillis", "3.0.0", "false", - "The config was deprecated since Spark 2.3." + + "The config was deprecated since Spark 2.3. " + s"Use '${PARQUET_OUTPUT_TIMESTAMP_TYPE.key}' instead of it."), RemovedConfig("spark.sql.execution.pandas.respectSessionTimeZone", "3.0.0", "true", "The non-default behavior is considered as a bug, see SPARK-22395. 
" + @@ -5445,6 +5651,8 @@ class SQLConf extends Serializable with Logging with SqlApiConf { def planChangeBatches: Option[String] = getConf(PLAN_CHANGE_LOG_BATCHES) + def expressionTreeChangeLogLevel: String = getConf(EXPRESSION_TREE_CHANGE_LOG_LEVEL) + def dynamicPartitionPruningEnabled: Boolean = getConf(DYNAMIC_PARTITION_PRUNING_ENABLED) def dynamicPartitionPruningUseStats: Boolean = getConf(DYNAMIC_PARTITION_PRUNING_USE_STATS) @@ -5572,18 +5780,20 @@ class SQLConf extends Serializable with Logging with SqlApiConf { } } + def allowCollationsInMapKeys: Boolean = getConf(ALLOW_COLLATIONS_IN_MAP_KEYS) + + def objectLevelCollationsEnabled: Boolean = getConf(OBJECT_LEVEL_COLLATIONS_ENABLED) + def trimCollationEnabled: Boolean = getConf(TRIM_COLLATION_ENABLED) override def defaultStringType: StringType = { - if (getConf(DEFAULT_COLLATION).toUpperCase(Locale.ROOT) == "UTF8_BINARY") { + if (getConf(DEFAULT_COLLATION).toUpperCase(Locale.ROOT) == CollationNames.UTF8_BINARY) { StringType } else { - StringType(CollationFactory.collationNameToId(getConf(DEFAULT_COLLATION))) + StringType(getConf(DEFAULT_COLLATION)) } } - override def allowReadingUnknownCollations: Boolean = getConf(ALLOW_READING_UNKNOWN_COLLATIONS) - def adaptiveExecutionEnabled: Boolean = getConf(ADAPTIVE_EXECUTION_ENABLED) def adaptiveExecutionLogLevel: String = getConf(ADAPTIVE_EXECUTION_LOG_LEVEL) @@ -5607,6 +5817,8 @@ class SQLConf extends Serializable with Logging with SqlApiConf { def stateStoreCheckpointFormatVersion: Int = getConf(STATE_STORE_CHECKPOINT_FORMAT_VERSION) + def stateStoreEncodingFormat: String = getConf(STREAMING_STATE_STORE_ENCODING_FORMAT) + def checkpointRenamedFileCheck: Boolean = getConf(CHECKPOINT_RENAMEDFILE_CHECK_ENABLED) def parquetFilterPushDown: Boolean = getConf(PARQUET_FILTER_PUSHDOWN_ENABLED) @@ -5852,6 +6064,9 @@ class SQLConf extends Serializable with Logging with SqlApiConf { def v2BucketingAllowCompatibleTransforms: Boolean = 
getConf(SQLConf.V2_BUCKETING_ALLOW_COMPATIBLE_TRANSFORMS) + def v2BucketingAllowSorting: Boolean = + getConf(SQLConf.V2_BUCKETING_SORTING_ENABLED) + def dataFrameSelfJoinAutoResolveAmbiguity: Boolean = getConf(DATAFRAME_SELF_JOIN_AUTO_RESOLVE_AMBIGUITY) @@ -5995,6 +6210,8 @@ class SQLConf extends Serializable with Logging with SqlApiConf { def pythonUDFWorkerFaulthandlerEnabled: Boolean = getConf(PYTHON_UDF_WORKER_FAULTHANLDER_ENABLED) + def pythonUDFArrowConcurrencyLevel: Option[Int] = getConf(PYTHON_UDF_ARROW_CONCURRENCY_LEVEL) + def pysparkPlotMaxRows: Int = getConf(PYSPARK_PLOT_MAX_ROWS) def arrowSparkREnabled: Boolean = getConf(ARROW_SPARKR_EXECUTION_ENABLED) @@ -6188,6 +6405,8 @@ class SQLConf extends Serializable with Logging with SqlApiConf { def charVarcharAsString: Boolean = getConf(SQLConf.LEGACY_CHAR_VARCHAR_AS_STRING) + def preserveCharVarcharTypeInfo: Boolean = getConf(SQLConf.PRESERVE_CHAR_VARCHAR_TYPE_INFO) + def readSideCharPadding: Boolean = getConf(SQLConf.READ_SIDE_CHAR_PADDING) def cliPrintHeader: Boolean = getConf(SQLConf.CLI_PRINT_HEADER) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/StaticSQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/StaticSQLConf.scala index cd17a63e5d433..a14c584fdc6a6 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/StaticSQLConf.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/StaticSQLConf.scala @@ -280,7 +280,7 @@ object StaticSQLConf { buildStaticConf("spark.sql.streaming.ui.enabledCustomMetricList") .internal() .doc("Configures a list of custom metrics on Structured Streaming UI, which are enabled. " + - "The list contains the name of the custom metrics separated by comma. In aggregation" + + "The list contains the name of the custom metrics separated by comma. In aggregation " + "only sum used. 
The list of supported custom metrics is state store provider specific " + "and it can be found out for example from query progress log entry.") .version("3.1.0") @@ -295,4 +295,14 @@ object StaticSQLConf { .version("3.1.0") .stringConf .createWithDefault("") + + val DATA_FRAME_DEBUGGING_ENABLED = + buildStaticConf("spark.python.sql.dataFrameDebugging.enabled") + .internal() + .doc( + "Enable the DataFrame debugging. This feature is enabled by default, but has a " + + "non-trivial performance overhead because of the stack trace collection.") + .version("4.0.0") + .booleanConf + .createWithDefault(true) } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/util/SchemaUtils.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/util/SchemaUtils.scala index 1e0bac331dc75..53b82f9e86f77 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/util/SchemaUtils.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/util/SchemaUtils.scala @@ -24,7 +24,7 @@ import org.apache.spark.sql.catalyst.analysis._ import org.apache.spark.sql.catalyst.expressions.{Alias, Attribute, NamedExpression} import org.apache.spark.sql.connector.expressions.{BucketTransform, FieldReference, NamedTransform, Transform} import org.apache.spark.sql.errors.{QueryCompilationErrors, QueryExecutionErrors} -import org.apache.spark.sql.types.{ArrayType, DataType, MapType, StringType, StructField, StructType} +import org.apache.spark.sql.types.{ArrayType, DataType, MapType, StringHelper, StringType, StructField, StructType} import org.apache.spark.util.ArrayImplicits._ import org.apache.spark.util.SparkSchemaUtils @@ -304,6 +304,17 @@ private[spark] object SchemaUtils { } } + def checkNoCollationsInMapKeys(schema: DataType): Unit = schema match { + case m: MapType => + if (hasNonUTF8BinaryCollation(m.keyType)) { + throw QueryCompilationErrors.collatedStringsInMapKeysNotSupportedError() + } + checkNoCollationsInMapKeys(m.valueType) + case s: StructType => s.fields.foreach(field => 
checkNoCollationsInMapKeys(field.dataType)) + case a: ArrayType => checkNoCollationsInMapKeys(a.elementType) + case _ => + } + /** * Replaces any collated string type with non collated StringType * recursively in the given data type. @@ -317,7 +328,7 @@ private[spark] object SchemaUtils { StructType(fields.map { field => field.copy(dataType = replaceCollatedStringWithString(field.dataType)) }) - case _: StringType => StringType + case st: StringType => StringHelper.removeCollation(st) case _ => dt } } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/QueryPlanningTrackerSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/QueryPlanningTrackerSuite.scala index 972b98780bcca..500bbef3c89bf 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/QueryPlanningTrackerSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/QueryPlanningTrackerSuite.scala @@ -95,7 +95,13 @@ class QueryPlanningTrackerSuite extends SparkFunSuite { val mockCallback = mock[QueryPlanningTrackerCallback] val mockPlan1 = mock[LogicalPlan] val mockPlan2 = mock[LogicalPlan] + val mockPlan3 = mock[LogicalPlan] + val mockPlan4 = mock[LogicalPlan] val t = new QueryPlanningTracker(Some(mockCallback)) + t.setAnalysisFailed(mockPlan3) + verify(mockCallback, times(1)).analysisFailed(t, mockPlan3) + t.setAnalysisFailed(mockPlan4) + verify(mockCallback, times(1)).analysisFailed(t, mockPlan4) t.setAnalyzed(mockPlan1) verify(mockCallback, times(1)).analyzed(t, mockPlan1) t.setAnalyzed(mockPlan2) diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisExceptionPositionSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisExceptionPositionSuite.scala index 55f59f7a22574..325862127d366 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisExceptionPositionSuite.scala +++ 
b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisExceptionPositionSuite.scala @@ -40,7 +40,6 @@ class AnalysisExceptionPositionSuite extends AnalysisTest { } test("SPARK-34057: UnresolvedTableOrView should retain sql text position") { - verifyTableOrViewPosition("DESCRIBE TABLE unknown", "unknown") verifyTableOrPermanentViewPosition("ANALYZE TABLE unknown COMPUTE STATISTICS", "unknown") verifyTableOrViewPosition("ANALYZE TABLE unknown COMPUTE STATISTICS FOR COLUMNS col", "unknown") verifyTableOrViewPosition("ANALYZE TABLE unknown COMPUTE STATISTICS FOR ALL COLUMNS", "unknown") diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisSuite.scala index 939801e3f07af..2ffe6de974c74 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisSuite.scala @@ -81,6 +81,24 @@ class AnalysisSuite extends AnalysisTest with Matchers { } } + test(s"do not fail if a leaf node has char/varchar type output and " + + s"${SQLConf.PRESERVE_CHAR_VARCHAR_TYPE_INFO.key} is true") { + withSQLConf(SQLConf.PRESERVE_CHAR_VARCHAR_TYPE_INFO.key -> "true") { + val schema1 = new StructType().add("c", CharType(5)) + val schema2 = new StructType().add("c", VarcharType(5)) + val schema3 = new StructType().add("c", ArrayType(CharType(5))) + Seq(schema1, schema2, schema3).foreach { schema => + val table = new InMemoryTable("t", schema, Array.empty, Map.empty[String, String].asJava) + DataSourceV2Relation( + table, + DataTypeUtils.toAttributes(schema), + None, + None, + CaseInsensitiveStringMap.empty()).analyze + } + } + } + test("union project *") { val plan = (1 to 120) .map(_ => testRelation) @@ -777,6 +795,14 @@ class AnalysisSuite extends AnalysisTest with Matchers { PosExplode($"list"), Seq("first_pos", "first_val")), 
Seq("second_pos", "second_val")))) } + test("SPARK-50497 Non-generator function with multiple aliases") { + assertAnalysisErrorCondition(parsePlan("SELECT 'length' (a)"), + "MULTI_ALIAS_WITHOUT_GENERATOR", + Map("expr" -> "\"length\"", "names" -> "a"), + Array(ExpectedContext("SELECT 'length' (a)", 0, 18)) + ) + } + test("SPARK-24151: CURRENT_DATE, CURRENT_TIMESTAMP should be case insensitive") { withSQLConf(SQLConf.CASE_SENSITIVE.key -> "true") { val input = Project(Seq( diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisTest.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisTest.scala index 71744f4d15105..58e6cd7fe1695 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisTest.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisTest.scala @@ -198,6 +198,21 @@ trait AnalysisTest extends PlanTest { } } + protected def assertParseErrorClass( + parser: String => Any, + sqlCommand: String, + errorClass: String, + parameters: Map[String, String], + queryContext: Array[ExpectedContext] = Array.empty): Unit = { + val e = parseException(parser)(sqlCommand) + checkError( + exception = e, + condition = errorClass, + parameters = parameters, + queryContext = queryContext + ) + } + protected def interceptParseException(parser: String => Any)( sqlCommand: String, messages: String*)(condition: Option[String] = None): Unit = { val e = parseException(parser)(sqlCommand) diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnsiTypeCoercionSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnsiTypeCoercionSuite.scala index 8cf7d78b510be..139e89828f8e5 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnsiTypeCoercionSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnsiTypeCoercionSuite.scala @@ -1057,11 +1057,11 @@ class 
AnsiTypeCoercionSuite extends TypeCoercionSuiteBase { ArrayType(IntegerType)) shouldCast( ArrayType(StringType), - AbstractArrayType(StringTypeWithCollation), + AbstractArrayType(StringTypeWithCollation(supportsTrimCollation = true)), ArrayType(StringType)) shouldCast( ArrayType(IntegerType), - AbstractArrayType(StringTypeWithCollation), + AbstractArrayType(StringTypeWithCollation(supportsTrimCollation = true)), ArrayType(StringType)) shouldCast( ArrayType(StringType), @@ -1075,11 +1075,11 @@ class AnsiTypeCoercionSuite extends TypeCoercionSuiteBase { ArrayType(ArrayType(IntegerType))) shouldCast( ArrayType(ArrayType(StringType)), - AbstractArrayType(AbstractArrayType(StringTypeWithCollation)), + AbstractArrayType(AbstractArrayType(StringTypeWithCollation(supportsTrimCollation = true))), ArrayType(ArrayType(StringType))) shouldCast( ArrayType(ArrayType(IntegerType)), - AbstractArrayType(AbstractArrayType(StringTypeWithCollation)), + AbstractArrayType(AbstractArrayType(StringTypeWithCollation(supportsTrimCollation = true))), ArrayType(ArrayType(StringType))) shouldCast( ArrayType(ArrayType(StringType)), @@ -1088,16 +1088,16 @@ class AnsiTypeCoercionSuite extends TypeCoercionSuiteBase { // Invalid casts involving casting arrays into non-complex types. shouldNotCast(ArrayType(IntegerType), IntegerType) - shouldNotCast(ArrayType(StringType), StringTypeWithCollation) + shouldNotCast(ArrayType(StringType), StringTypeWithCollation(supportsTrimCollation = true)) shouldNotCast(ArrayType(StringType), IntegerType) - shouldNotCast(ArrayType(IntegerType), StringTypeWithCollation) + shouldNotCast(ArrayType(IntegerType), StringTypeWithCollation(supportsTrimCollation = true)) // Invalid casts involving casting arrays of arrays into arrays of non-complex types. 
shouldNotCast(ArrayType(ArrayType(IntegerType)), AbstractArrayType(IntegerType)) shouldNotCast(ArrayType(ArrayType(StringType)), - AbstractArrayType(StringTypeWithCollation)) + AbstractArrayType(StringTypeWithCollation(supportsTrimCollation = true))) shouldNotCast(ArrayType(ArrayType(StringType)), AbstractArrayType(IntegerType)) shouldNotCast(ArrayType(ArrayType(IntegerType)), - AbstractArrayType(StringTypeWithCollation)) + AbstractArrayType(StringTypeWithCollation(supportsTrimCollation = true))) } } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/CreateTablePartitioningValidationSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/CreateTablePartitioningValidationSuite.scala index 6b034d3dbee09..133670d5fcced 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/CreateTablePartitioningValidationSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/CreateTablePartitioningValidationSuite.scala @@ -30,7 +30,7 @@ import org.apache.spark.util.ArrayImplicits._ class CreateTablePartitioningValidationSuite extends AnalysisTest { val tableSpec = - UnresolvedTableSpec(Map.empty, None, OptionList(Seq.empty), None, None, None, false) + UnresolvedTableSpec(Map.empty, None, OptionList(Seq.empty), None, None, None, None, false) test("CreateTableAsSelect: fail missing top-level column") { val plan = CreateTableAsSelect( UnresolvedIdentifier(Array("table_name").toImmutableArraySeq), diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/resolver/LimitExpressionResolverSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/resolver/LimitExpressionResolverSuite.scala new file mode 100644 index 0000000000000..fdab4df379a71 --- /dev/null +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/resolver/LimitExpressionResolverSuite.scala @@ -0,0 +1,94 @@ +/* + * Licensed to the Apache Software Foundation 
(ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.catalyst.analysis.resolver + +import org.apache.spark.SparkFunSuite +import org.apache.spark.sql.AnalysisException +import org.apache.spark.sql.catalyst.expressions.{AttributeReference, Cast, Expression, Literal} +import org.apache.spark.sql.errors.QueryErrorsBase +import org.apache.spark.sql.types.IntegerType + +class LimitExpressionResolverSuite extends SparkFunSuite with QueryErrorsBase { + + private class IdentityExpressionResolver extends TreeNodeResolver[Expression, Expression] { + override def resolve(expression: Expression): Expression = expression + } + + private val expressionResolver = new IdentityExpressionResolver + private val limitExpressionResolver = new LimitExpressionResolver(expressionResolver) + + test("Basic LIMIT without errors") { + val expr = Literal(42, IntegerType) + assert(limitExpressionResolver.resolve(expr) == expr) + } + + test("Unfoldable LIMIT") { + val col = AttributeReference(name = "foo", dataType = IntegerType)() + checkError( + exception = intercept[AnalysisException] { + limitExpressionResolver.resolve(col) + }, + condition = "INVALID_LIMIT_LIKE_EXPRESSION.IS_UNFOLDABLE", + parameters = Map("name" -> "limit", "expr" -> toSQLExpr(col)) + ) + } + + 
test("LIMIT with non-integer") { + val anyNonInteger = Literal("42") + checkError( + exception = intercept[AnalysisException] { + limitExpressionResolver.resolve(anyNonInteger) + }, + condition = "INVALID_LIMIT_LIKE_EXPRESSION.DATA_TYPE", + parameters = Map( + "name" -> "limit", + "expr" -> toSQLExpr(anyNonInteger), + "dataType" -> toSQLType(anyNonInteger.dataType) + ) + ) + } + + test("LIMIT with null") { + val expr = Cast(Literal(null), IntegerType) + checkError( + exception = intercept[AnalysisException] { + limitExpressionResolver.resolve(expr) + }, + condition = "INVALID_LIMIT_LIKE_EXPRESSION.IS_NULL", + parameters = Map( + "name" -> "limit", + "expr" -> toSQLExpr(expr) + ) + ) + } + + test("LIMIT with negative integer") { + val expr = Literal(-1, IntegerType) + checkError( + exception = intercept[AnalysisException] { + limitExpressionResolver.resolve(expr) + }, + condition = "INVALID_LIMIT_LIKE_EXPRESSION.IS_NEGATIVE", + parameters = Map( + "name" -> "limit", + "expr" -> toSQLExpr(expr), + "v" -> toSQLValue(-1, IntegerType) + ) + ) + } +} diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/resolver/ResolutionValidatorSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/resolver/ResolutionValidatorSuite.scala new file mode 100644 index 0000000000000..922e94ea442b3 --- /dev/null +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/resolver/ResolutionValidatorSuite.scala @@ -0,0 +1,265 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.catalyst.analysis.resolver + +import scala.collection.immutable +import scala.reflect.runtime.universe.typeOf + +import org.apache.spark.SparkFunSuite +import org.apache.spark.sql.catalyst.SQLConfHelper +import org.apache.spark.sql.catalyst.expressions.{ + Add, + Alias, + AttributeReference, + Cast, + GreaterThan, + Literal, + NamedExpression, + TimeAdd +} +import org.apache.spark.sql.catalyst.plans.logical.{Filter, LocalRelation, LogicalPlan, Project} +import org.apache.spark.sql.types.{ + BooleanType, + DayTimeIntervalType, + DecimalType, + IntegerType, + StringType, + TimestampType +} + +class ResolutionValidatorSuite extends SparkFunSuite with SQLConfHelper { + private val resolveMethodNamesToIgnore = Seq( + // [[Resolver]] accepts [[UnresolvedInlineTable]], [[ResolvedInlineTable]] and + // [[LocalRelation]], but produces only [[ResolvedInlineTable]] and [[LocalRelation]], so + // we omit one of them here. + // See [[Resolver.resolveInlineTable]] scaladoc for more info. 
+ "resolveResolvedInlineTable" + ) + + private val colInteger = AttributeReference(name = "colInteger", dataType = IntegerType)() + private val colBoolean = AttributeReference(name = "colBoolean", dataType = BooleanType)() + private val colTimestamp = AttributeReference(name = "colTimestamp", dataType = TimestampType)() + + test("All resolve* methods must have validate* counterparts") { + val actualMethodNames = typeOf[ResolutionValidator].decls + .collect { + case decl if decl.isMethod => decl.name.toString + } + .filter(name => { + name.startsWith("validate") + }) + val actualMethodNamesSet = immutable.HashSet(actualMethodNames.toSeq: _*) + + val resolveMethodNamesToIgnoreSet = immutable.HashSet(resolveMethodNamesToIgnore: _*) + + typeOf[Resolver].decls + .collect { + case decl if decl.isMethod => decl.name.toString + } + .filter(name => { + name.startsWith("resolve") && !resolveMethodNamesToIgnoreSet.contains(name) + }) + .map(name => { + "validate" + name.stripPrefix("resolve") + }) + .foreach(name => { + assert(actualMethodNamesSet.contains(name), name) + }) + } + + test("Project") { + validate( + Project( + projectList = Seq(colInteger, colBoolean, colInteger), + child = LocalRelation(output = Seq(colInteger, colBoolean)) + ) + ) + validate( + Project( + projectList = Seq(colInteger), + child = LocalRelation(output = colBoolean) + ), + error = Some("Project list contains nonexisting attribute") + ) + } + + test("Filter") { + validate( + Project( + projectList = Seq(colBoolean), + child = Filter( + condition = colBoolean, + child = LocalRelation(output = colBoolean) + ) + ) + ) + validate( + Project( + projectList = Seq(colInteger), + child = Filter( + condition = colInteger, + child = LocalRelation(output = colInteger) + ) + ), + error = Some("Non-boolean condition") + ) + validate( + Project( + projectList = Seq(colBoolean), + child = Filter( + condition = AttributeReference(name = "colBooleanOther", dataType = BooleanType)(), + child = LocalRelation(output 
= colBoolean) + ) + ), + error = Some("Condition references nonexisting attribute") + ) + } + + test("Predicate") { + validate( + Project( + projectList = Seq(colInteger), + child = Filter( + condition = GreaterThan(colInteger, colInteger), + child = LocalRelation(output = colInteger) + ) + ) + ) + validate( + Project( + projectList = Seq(colInteger), + child = Filter( + condition = GreaterThan(colInteger, colBoolean), + child = LocalRelation(output = Seq(colInteger, colBoolean)) + ) + ), + error = Some("Input data types mismatch") + ) + } + + test("BinaryExpression") { + validate( + Project( + projectList = Seq( + Alias( + child = Add( + left = Literal(5), + right = Literal(1) + ), + "Add" + )(NamedExpression.newExprId) + ), + child = LocalRelation(output = colInteger) + ) + ) + validate( + Project( + projectList = Seq( + Alias( + child = Add( + left = Literal(5), + right = Literal("1") + ), + "AddWrongInputTypes" + )(NamedExpression.newExprId) + ), + child = LocalRelation(output = colInteger) + ), + error = Some("checkInputDataTypes mismatch") + ) + validate( + Project( + projectList = Seq( + Alias( + child = TimeAdd( + start = Cast( + child = Literal("2024-10-01"), + dataType = TimestampType, + timeZoneId = Option(conf.sessionLocalTimeZone) + ), + interval = Cast( + child = Literal(1), + dataType = DayTimeIntervalType(DayTimeIntervalType.DAY, DayTimeIntervalType.DAY), + timeZoneId = Option(conf.sessionLocalTimeZone) + ) + ), + "AddNoTimezone" + )(NamedExpression.newExprId) + ), + child = LocalRelation(output = colInteger) + ), + error = Some("TimezoneId is not set for TimeAdd") + ) + } + + test("TimeZoneAwareExpression") { + validate( + Project( + projectList = Seq( + Alias( + Cast( + child = colInteger, + dataType = DecimalType.USER_DEFAULT, + timeZoneId = Option(conf.sessionLocalTimeZone) + ), + "withTimezone" + )(NamedExpression.newExprId) + ), + child = LocalRelation(output = colInteger) + ) + ) + validate( + Project( + projectList = Seq( + Alias( + Cast( + 
child = colTimestamp, + dataType = StringType + ), + "withoutTimezone" + )(NamedExpression.newExprId) + ), + child = LocalRelation(output = colTimestamp) + ), + error = Some("TimezoneId is not set") + ) + } + + def validate(plan: LogicalPlan, error: Option[String] = None): Unit = { + def errorWrapper(error: String)(body: => Unit): Unit = { + withClue(error) { + intercept[Throwable] { + body + } + } + } + + def noopWrapper(body: => Unit) = { + body + } + + val wrapper = error + .map(error => { errorWrapper(error) _ }) + .getOrElse { noopWrapper _ } + + val validator = new ResolutionValidator + wrapper { + validator.validatePlan(plan) + } + } +} diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/resolver/TimezoneAwareExpressionResolverSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/resolver/TimezoneAwareExpressionResolverSuite.scala new file mode 100644 index 0000000000000..d5c5387d4b763 --- /dev/null +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/resolver/TimezoneAwareExpressionResolverSuite.scala @@ -0,0 +1,58 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.catalyst.analysis.resolver + +import org.apache.spark.SparkFunSuite +import org.apache.spark.sql.catalyst.expressions.{ + AttributeReference, + Cast, + Expression, + TimeZoneAwareExpression +} +import org.apache.spark.sql.types.{IntegerType, StringType} + +class TimezoneAwareExpressionResolverSuite extends SparkFunSuite { + + class HardCodedExpressionResolver(resolvedExpression: Expression) + extends TreeNodeResolver[Expression, Expression] { + override def resolve(expression: Expression): Expression = resolvedExpression + } + + private val unresolvedChild = + AttributeReference(name = "unresolvedChild", dataType = StringType)() + private val resolvedChild = AttributeReference(name = "resolvedChild", dataType = IntegerType)() + private val castExpression = Cast(child = unresolvedChild, dataType = IntegerType) + private val expressionResolver = new HardCodedExpressionResolver(resolvedChild) + private val timezoneAwareExpressionResolver = new TimezoneAwareExpressionResolver( + expressionResolver + ) + + test("TimeZoneAwareExpression resolution") { + assert(castExpression.children.head == unresolvedChild) + assert(castExpression.timeZoneId.isEmpty) + assert(castExpression.getTagValue(Cast.USER_SPECIFIED_CAST).isEmpty) + + castExpression.setTagValue(Cast.USER_SPECIFIED_CAST, ()) + val resolvedExpression = + timezoneAwareExpressionResolver.resolve(castExpression).asInstanceOf[TimeZoneAwareExpression] + + assert(resolvedExpression.children.head == resolvedChild) + assert(resolvedExpression.timeZoneId.nonEmpty) + assert(resolvedExpression.getTagValue(Cast.USER_SPECIFIED_CAST).nonEmpty) + } +} diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/resolver/TypeCoercionResolverSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/resolver/TypeCoercionResolverSuite.scala new file mode 100644 index 0000000000000..7e0107147c9ac --- /dev/null +++ 
b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/resolver/TypeCoercionResolverSuite.scala @@ -0,0 +1,68 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.catalyst.analysis.resolver + +import org.apache.spark.SparkFunSuite +import org.apache.spark.sql.catalyst.SQLConfHelper +import org.apache.spark.sql.catalyst.analysis.AnsiTypeCoercion +import org.apache.spark.sql.catalyst.expressions.{Add, Cast, Expression, Literal} +import org.apache.spark.sql.types.{DoubleType, IntegerType} + +class TypeCoercionResolverSuite extends SparkFunSuite with SQLConfHelper { + + class HardCodedExpressionResolver(resolvedExpression: Expression) + extends TreeNodeResolver[Expression, Expression] { + override def resolve(expression: Expression): Expression = resolvedExpression + } + + private val integerChild = Literal(1, IntegerType) + private val doubleChild = Literal(1.1, DoubleType) + private val castIntegerChild = Cast(child = integerChild, dataType = DoubleType) + private val expressionResolver = new HardCodedExpressionResolver(castIntegerChild) + private val timezoneAwareExpressionResolver = new TimezoneAwareExpressionResolver( + expressionResolver + ) + private val 
typeCoercionRules = Seq( + AnsiTypeCoercion.ImplicitTypeCasts.transform + ) + private val typeCoercionResolver = + new TypeCoercionResolver(timezoneAwareExpressionResolver, typeCoercionRules) + + test("TypeCoercion resolution - with children reinstantiation") { + val expression = Add(left = doubleChild, right = integerChild) + val resolvedExpression = typeCoercionResolver.resolve(expression).asInstanceOf[Add] + // left child remains the same + assert(resolvedExpression.left == doubleChild) + // right first gets resolved to castIntegerChild. However, after the Cast gets + // re-resolved with timezone, it won't be equal to castIntegerChild because of re-instantiation + assert(resolvedExpression.right.isInstanceOf[Cast]) + val newRightChild = resolvedExpression.right.asInstanceOf[Cast] + assert(newRightChild != castIntegerChild) + assert(newRightChild.timeZoneId.nonEmpty) + // not a user-specified cast + assert(newRightChild.getTagValue(Cast.USER_SPECIFIED_CAST).isEmpty) + } + + test("TypeCoercion resolution - no children reinstantiation") { + val expression = Add(left = doubleChild, right = castIntegerChild) + val resolvedExpression = typeCoercionResolver.resolve(expression).asInstanceOf[Add] + assert(resolvedExpression.left == doubleChild) + // Cast that isn't a product of type coercion resolution won't be re-instantiated with timezone + assert(resolvedExpression.right == castIntegerChild) + } +} diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/encoders/RowEncoderSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/encoders/RowEncoderSuite.scala index 79c6d07d6d218..645b80ffaacb8 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/encoders/RowEncoderSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/encoders/RowEncoderSuite.scala @@ -488,4 +488,41 @@ class RowEncoderSuite extends CodegenInterpretedPlanTest { val data = Row(mutable.ArraySeq.make(Array(Row("key", "value".getBytes)))) 
val row = encoder.createSerializer()(data) } + + test("do not allow serializing too long strings into char/varchar") { + Seq(CharType(5), VarcharType(5)).foreach { typ => + withSQLConf(SQLConf.PRESERVE_CHAR_VARCHAR_TYPE_INFO.key -> "true") { + val schema = new StructType().add("c", typ) + val encoder = ExpressionEncoder(schema).resolveAndBind() + val value = "abcdef" + checkError( + exception = intercept[SparkRuntimeException]({ + val row = toRow(encoder, Row(value)) + }), + condition = "EXCEED_LIMIT_LENGTH", + parameters = Map("limit" -> "5") + ) + } + } + } + + test("do not allow deserializing too long strings into char/varchar") { + Seq(CharType(5), VarcharType(5)).foreach { typ => + withSQLConf(SQLConf.PRESERVE_CHAR_VARCHAR_TYPE_INFO.key -> "true") { + val fromSchema = new StructType().add("c", StringType) + val fromEncoder = ExpressionEncoder(fromSchema).resolveAndBind() + val toSchema = new StructType().add("c", typ) + val toEncoder = ExpressionEncoder(toSchema).resolveAndBind() + val value = "abcdef" + val row = toRow(fromEncoder, Row(value)) + checkError( + exception = intercept[SparkRuntimeException]({ + val value = fromRow(toEncoder, row) + }), + condition = "EXCEED_LIMIT_LENGTH", + parameters = Map("limit" -> "5") + ) + } + } + } } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CanonicalizeSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CanonicalizeSuite.scala index 7e545d3321054..e0d3a176b1a43 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CanonicalizeSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CanonicalizeSuite.scala @@ -479,4 +479,17 @@ class CanonicalizeSuite extends SparkFunSuite { } } } + + test("unit test for gatherCommutative()") { + val addExpression = Add( + Literal(1), + Add( + Literal(2), + Literal(3) + ) + ) + val commutativeExpressions = addExpression.gatherCommutative(addExpression, + { 
case Add(l, r, _) => Seq(l, r)}) + assert(commutativeExpressions == Seq(Literal(1), Literal(2), Literal(3))) + } } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CastSuiteBase.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CastSuiteBase.scala index e87b54339821f..cec49a5ae1de0 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CastSuiteBase.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CastSuiteBase.scala @@ -729,6 +729,8 @@ abstract class CastSuiteBase extends SparkFunSuite with ExpressionEvalHelper { assert(Cast.canUpCast(DateType, TimestampNTZType)) assert(Cast.canUpCast(TimestampType, TimestampNTZType)) assert(Cast.canUpCast(TimestampNTZType, TimestampType)) + assert(Cast.canUpCast(IntegerType, StringType("UTF8_LCASE"))) + assert(Cast.canUpCast(CalendarIntervalType, StringType("UTF8_LCASE"))) assert(!Cast.canUpCast(TimestampType, DateType)) assert(!Cast.canUpCast(TimestampNTZType, DateType)) } @@ -1013,6 +1015,13 @@ abstract class CastSuiteBase extends SparkFunSuite with ExpressionEvalHelper { } } + test("allow type conversions between calendar interval type and char/varchar types") { + Seq(CharType(10), VarcharType(10)) + .foreach { typ => + assert(cast(Literal.default(CalendarIntervalType), typ).checkInputDataTypes().isSuccess) + } + } + test("SPARK-35720: cast string to timestamp without timezone") { specialTs.foreach { s => val expectedTs = LocalDateTime.parse(s) @@ -1409,4 +1418,43 @@ abstract class CastSuiteBase extends SparkFunSuite with ExpressionEvalHelper { assert(!Cast(timestampLiteral, TimestampNTZType).resolved) assert(!Cast(timestampNTZLiteral, TimestampType).resolved) } + + test("Casting between TimestampType and StringType requires timezone") { + val timestampLiteral = Literal.create(1L, TimestampType) + assert(!Cast(timestampLiteral, StringType).resolved) + assert(!Cast(timestampLiteral, 
StringType("UTF8_LCASE")).resolved) + } + + test(s"Casting from char/varchar") { + Seq(CharType(10), VarcharType(10)).foreach { typ => + Seq( + IntegerType -> ("123", 123), + LongType -> ("123 ", 123L), + BooleanType -> ("true ", true), + BooleanType -> ("false", false), + DoubleType -> ("1.2", 1.2) + ).foreach { case (toType, (from, to)) => + checkEvaluation(cast(Literal.create(from, typ), toType), to) + } + } + } + + test("Casting to char/varchar") { + Seq(CharType(10), VarcharType(10)).foreach { typ => + Seq( + IntegerType -> (123, "123"), + LongType -> (123L, "123"), + BooleanType -> (true, "true"), + BooleanType -> (false, "false"), + DoubleType -> (1.2, "1.2") + ).foreach { case (fromType, (from, to)) => + val paddedTo = if (typ.isInstanceOf[CharType]) { + to.padTo(10, ' ') + } else { + to + } + checkEvaluation(cast(Literal.create(from, fromType), typ), paddedTo) + } + } + } } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CollationExpressionSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CollationExpressionSuite.scala index 77a3d6df69221..9e9eeaf2f80d0 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CollationExpressionSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CollationExpressionSuite.scala @@ -17,67 +17,63 @@ package org.apache.spark.sql.catalyst.expressions -import org.apache.spark.{SparkException, SparkFunSuite} +import org.apache.spark.SparkFunSuite import org.apache.spark.sql.catalyst.util.{ArrayBasedMapData, CollationFactory, GenericArrayData} import org.apache.spark.sql.types._ import org.apache.spark.unsafe.types.UTF8String class CollationExpressionSuite extends SparkFunSuite with ExpressionEvalHelper { + private val fullyQualifiedPrefix = s"${CollationFactory.CATALOG}.${CollationFactory.SCHEMA}." 
+ private val UTF8_BINARY_COLLATION_NAME = ResolvedCollation("UTF8_BINARY") + private val UTF8_LCASE_COLLATION_NAME = ResolvedCollation("UTF8_LCASE") + test("validate default collation") { val collationId = CollationFactory.collationNameToId("UTF8_BINARY") assert(collationId == 0) - val collateExpr = Collate(Literal("abc"), "UTF8_BINARY") + val collateExpr = Collate(Literal("abc"), UTF8_BINARY_COLLATION_NAME) assert(collateExpr.dataType === StringType(collationId)) assert(collateExpr.dataType.asInstanceOf[StringType].collationId == 0) checkEvaluation(collateExpr, "abc") } test("collate against literal") { - val collateExpr = Collate(Literal("abc"), "UTF8_LCASE") + val collateExpr = Collate(Literal("abc"), UTF8_LCASE_COLLATION_NAME) val collationId = CollationFactory.collationNameToId("UTF8_LCASE") assert(collateExpr.dataType === StringType(collationId)) checkEvaluation(collateExpr, "abc") } test("check input types") { - val collateExpr = Collate(Literal("abc"), "UTF8_BINARY") + val collateExpr = Collate(Literal("abc"), UTF8_BINARY_COLLATION_NAME) assert(collateExpr.checkInputDataTypes().isSuccess) val collateExprExplicitDefault = - Collate(Literal.create("abc", StringType(0)), "UTF8_BINARY") + Collate(Literal.create("abc", StringType(0)), UTF8_BINARY_COLLATION_NAME) assert(collateExprExplicitDefault.checkInputDataTypes().isSuccess) val collateExprExplicitNonDefault = - Collate(Literal.create("abc", StringType(1)), "UTF8_BINARY") + Collate(Literal.create("abc", StringType(1)), UTF8_BINARY_COLLATION_NAME) assert(collateExprExplicitNonDefault.checkInputDataTypes().isSuccess) - val collateOnNull = Collate(Literal.create(null, StringType(1)), "UTF8_BINARY") + val collateOnNull = Collate(Literal.create(null, StringType(1)), UTF8_BINARY_COLLATION_NAME) assert(collateOnNull.checkInputDataTypes().isSuccess) - val collateOnInt = Collate(Literal(1), "UTF8_BINARY") + val collateOnInt = Collate(Literal(1), UTF8_BINARY_COLLATION_NAME) 
assert(collateOnInt.checkInputDataTypes().isFailure) } - test("collate on non existing collation") { - checkError( - exception = intercept[SparkException] { Collate(Literal("abc"), "UTF8_BS") }, - condition = "COLLATION_INVALID_NAME", - sqlState = "42704", - parameters = Map("collationName" -> "UTF8_BS", "proposals" -> "UTF8_LCASE")) - } - test("collation on non-explicit default collation") { - checkEvaluation(Collation(Literal("abc")), "UTF8_BINARY") + checkEvaluation(Collation(Literal("abc")), fullyQualifiedPrefix + "UTF8_BINARY") } test("collation on explicitly collated string") { checkEvaluation( Collation(Literal.create("abc", StringType(CollationFactory.UTF8_LCASE_COLLATION_ID))), - "UTF8_LCASE") + fullyQualifiedPrefix + "UTF8_LCASE") checkEvaluation( - Collation(Collate(Literal("abc"), "UTF8_LCASE")), - "UTF8_LCASE") + Collation(Collate(Literal("abc"), UTF8_LCASE_COLLATION_NAME)), + fullyQualifiedPrefix + "UTF8_LCASE") } test("Array operations on arrays of collated strings") { @@ -222,7 +218,7 @@ class CollationExpressionSuite extends SparkFunSuite with ExpressionEvalHelper { ).foreach { case (collation, normalized) => checkEvaluation(Collation(Literal.create("abc", StringType(collation))), - normalized) + fullyQualifiedPrefix + normalized) } } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/DateExpressionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/DateExpressionsSuite.scala index 5cd974838fa24..09650a0dcc022 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/DateExpressionsSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/DateExpressionsSuite.scala @@ -1893,26 +1893,26 @@ class DateExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper { test("SPARK-38195: add a quantity of interval units to a timestamp") { // Check case-insensitivity checkEvaluation( - TimestampAdd("Hour", Literal(1), 
Literal(LocalDateTime.of(2022, 2, 15, 12, 57, 0))), + TimestampAdd("Hour", Literal(1L), Literal(LocalDateTime.of(2022, 2, 15, 12, 57, 0))), LocalDateTime.of(2022, 2, 15, 13, 57, 0)) // Check nulls as input values checkEvaluation( TimestampAdd( "MINUTE", - Literal.create(null, IntegerType), + Literal.create(null, LongType), Literal(LocalDateTime.of(2022, 2, 15, 12, 57, 0))), null) checkEvaluation( TimestampAdd( "MINUTE", - Literal(1), + Literal(1L), Literal.create(null, TimestampType)), null) // Check crossing the daylight saving time checkEvaluation( TimestampAdd( "HOUR", - Literal(6), + Literal(6L), Literal(Instant.parse("2022-03-12T23:30:00Z")), Some("America/Los_Angeles")), Instant.parse("2022-03-13T05:30:00Z")) @@ -1920,7 +1920,7 @@ class DateExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper { checkEvaluation( TimestampAdd( "DAY", - Literal(2), + Literal(2L), Literal(LocalDateTime.of(2020, 2, 28, 10, 11, 12)), Some("America/Los_Angeles")), LocalDateTime.of(2020, 3, 1, 10, 11, 12)) @@ -1940,7 +1940,7 @@ class DateExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper { quantity, timestamp, Some(tz)), - IntegerType, tsType) + LongType, tsType) } } } @@ -1961,84 +1961,127 @@ class DateExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper { // timestampadd(DAY, 1, 2011-03-12 03:00:00) = 2011-03-13 03:00:00 checkEvaluation( - TimestampAdd("DAY", Literal(1), Literal(skippedTime - 23 * MICROS_PER_HOUR, TimestampType)), + TimestampAdd("DAY", Literal(1L), + Literal(skippedTime - 23 * MICROS_PER_HOUR, TimestampType)), skippedTime) // timestampadd(HOUR, 24, 2011-03-12 03:00:00) = 2011-03-13 04:00:00 checkEvaluation( - TimestampAdd("HOUR", Literal(24), + TimestampAdd("HOUR", Literal(24L), Literal(skippedTime - 23 * MICROS_PER_HOUR, TimestampType)), skippedTime + MICROS_PER_HOUR) // timestampadd(HOUR, 23, 2011-03-12 03:00:00) = 2011-03-13 03:00:00 checkEvaluation( - TimestampAdd("HOUR", Literal(23), + TimestampAdd("HOUR", Literal(23L), 
Literal(skippedTime - 23 * MICROS_PER_HOUR, TimestampType)), skippedTime) // timestampadd(SECOND, SECONDS_PER_DAY, 2011-03-12 03:00:00) = 2011-03-13 04:00:00 checkEvaluation( TimestampAdd( - "SECOND", Literal(SECONDS_PER_DAY.toInt), + "SECOND", Literal(SECONDS_PER_DAY), Literal(skippedTime - 23 * MICROS_PER_HOUR, TimestampType)), skippedTime + MICROS_PER_HOUR) // timestampadd(SECOND, SECONDS_PER_DAY, 2011-03-12 03:00:00) = 2011-03-13 03:59:59 checkEvaluation( TimestampAdd( - "SECOND", Literal(SECONDS_PER_DAY.toInt - 1), + "SECOND", Literal(SECONDS_PER_DAY - 1), Literal(skippedTime - 23 * MICROS_PER_HOUR, TimestampType)), skippedTime + MICROS_PER_HOUR - MICROS_PER_SECOND) // timestampadd(DAY, 1, 2011-11-05 02:00:00) = 2011-11-06 02:00:00 checkEvaluation( - TimestampAdd("DAY", Literal(1), + TimestampAdd("DAY", Literal(1L), Literal(repeatedTime - 24 * MICROS_PER_HOUR, TimestampType)), repeatedTime + MICROS_PER_HOUR) // timestampadd(DAY, 1, 2011-11-05 01:00:00) = 2011-11-06 01:00:00 (pre-transition) checkEvaluation( - TimestampAdd("DAY", Literal(1), + TimestampAdd("DAY", Literal(1L), Literal(repeatedTime - 25 * MICROS_PER_HOUR, TimestampType)), repeatedTime - MICROS_PER_HOUR) // timestampadd(DAY, -1, 2011-11-07 01:00:00) = 2011-11-06 01:00:00 (post-transition) checkEvaluation( - TimestampAdd("DAY", Literal(-1), + TimestampAdd("DAY", Literal(-1L), Literal(repeatedTime + 24 * MICROS_PER_HOUR, TimestampType)), repeatedTime) // timestampadd(MONTH, 1, 2011-10-06 01:00:00) = 2011-11-06 01:00:00 (pre-transition) checkEvaluation( TimestampAdd( - "MONTH", Literal(1), + "MONTH", Literal(1L), Literal(repeatedTime - MICROS_PER_HOUR - 31 * MICROS_PER_DAY, TimestampType)), repeatedTime - MICROS_PER_HOUR) // timestampadd(MONTH, -1, 2011-12-06 01:00:00) = 2011-11-06 01:00:00 (post-transition) checkEvaluation( TimestampAdd( - "MONTH", Literal(-1), + "MONTH", Literal(-1L), Literal(repeatedTime + 30 * MICROS_PER_DAY, TimestampType)), repeatedTime) // timestampadd(HOUR, 23, 2011-11-05 
02:00:00) = 2011-11-06 01:00:00 (pre-transition) checkEvaluation( - TimestampAdd("HOUR", Literal(23), + TimestampAdd("HOUR", Literal(23L), Literal(repeatedTime - 24 * MICROS_PER_HOUR, TimestampType)), repeatedTime - MICROS_PER_HOUR) // timestampadd(HOUR, 24, 2011-11-05 02:00:00) = 2011-11-06 01:00:00 (post-transition) checkEvaluation( - TimestampAdd("HOUR", Literal(24), + TimestampAdd("HOUR", Literal(24L), Literal(repeatedTime - 24 * MICROS_PER_HOUR, TimestampType)), repeatedTime) } } + test("SPARK-50669: timestampadd with long types") { + // A value that is larger than Int.MaxValue. + val longValue = 10_000_000_000L + withSQLConf(SQLConf.SESSION_LOCAL_TIMEZONE.key -> "UTC") { + checkEvaluation( + TimestampAdd("MICROSECOND", Literal(longValue), Literal(0L, TimestampType)), + longValue) + checkEvaluation( + TimestampAdd("MILLISECOND", Literal(longValue), Literal(0L, TimestampType)), + longValue * MICROS_PER_MILLIS) + checkEvaluation( + TimestampAdd("SECOND", Literal(longValue), Literal(0L, TimestampType)), + longValue * MICROS_PER_SECOND) + checkEvaluation( + TimestampAdd("MINUTE", Literal(longValue), Literal(0L, TimestampType)), + longValue * MICROS_PER_MINUTE) + + // Add a smaller value so overflow doesn't happen. + val valueToAdd = 1_000L + checkEvaluation( + TimestampAdd("HOUR", Literal(valueToAdd), Literal(0L, TimestampType)), + valueToAdd * MICROS_PER_HOUR) + checkEvaluation( + TimestampAdd("DAY", Literal(valueToAdd), Literal(0L, TimestampType)), + valueToAdd * MICROS_PER_DAY) + checkEvaluation( + TimestampAdd("WEEK", Literal(valueToAdd), Literal(0L, TimestampType)), + valueToAdd * MICROS_PER_DAY * DAYS_PER_WEEK) + + // Make sure overflow are thrown for larger values. 
+ val overflowVal = Long.MaxValue + Seq("MILLISECOND", "SECOND", "MINUTE", "HOUR", "DAY", "WEEK").foreach { interval => + checkErrorInExpression[SparkArithmeticException](TimestampAdd(interval, + Literal(overflowVal), + Literal(0L, TimestampType)), + condition = "DATETIME_OVERFLOW", + parameters = Map("operation" -> + s"add ${overflowVal}L $interval to TIMESTAMP '1970-01-01 00:00:00'")) + } + } + } + test("SPARK-42635: timestampadd unit conversion overflow") { withSQLConf(SQLConf.SESSION_LOCAL_TIMEZONE.key -> "UTC") { checkErrorInExpression[SparkArithmeticException](TimestampAdd("DAY", - Literal(106751992), + Literal(106751992L), Literal(0L, TimestampType)), condition = "DATETIME_OVERFLOW", - parameters = Map("operation" -> "add 106751992 DAY to TIMESTAMP '1970-01-01 00:00:00'")) + parameters = Map("operation" -> "add 106751992L DAY to TIMESTAMP '1970-01-01 00:00:00'")) checkErrorInExpression[SparkArithmeticException](TimestampAdd("QUARTER", - Literal(1431655764), + Literal(1431655764L), Literal(0L, TimestampType)), condition = "DATETIME_OVERFLOW", parameters = Map("operation" -> - "add 1431655764 QUARTER to TIMESTAMP '1970-01-01 00:00:00'")) + "add 1431655764L QUARTER to TIMESTAMP '1970-01-01 00:00:00'")) } } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/HashExpressionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/HashExpressionsSuite.scala index 92ef24bb8ec63..019c953a3b0ac 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/HashExpressionsSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/HashExpressionsSuite.scala @@ -625,8 +625,8 @@ class HashExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper { val s1 = "aaa" val s2 = "AAA" - val murmur3Hash1 = Murmur3Hash(Seq(Collate(Literal(s1), collation)), 42) - val murmur3Hash2 = Murmur3Hash(Seq(Collate(Literal(s2), collation)), 42) + val murmur3Hash1 = 
Murmur3Hash(Seq(Collate(Literal(s1), ResolvedCollation(collation))), 42) + val murmur3Hash2 = Murmur3Hash(Seq(Collate(Literal(s2), ResolvedCollation(collation))), 42) // Interpreted hash values for s1 and s2 val interpretedHash1 = murmur3Hash1.eval() diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/JsonExpressionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/JsonExpressionsSuite.scala index 3a58cb92cecf2..0ec1a93b5cd29 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/JsonExpressionsSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/JsonExpressionsSuite.scala @@ -273,8 +273,9 @@ class JsonExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper with } test("json_tuple escaping") { - GenerateUnsafeProjection.generate( - JsonTuple(Literal("\"quote") :: Literal("\"quote") :: Nil) :: Nil) + checkJsonTuple( + JsonTuple(Literal("\"quote") :: Literal("\"quote") :: Nil), + InternalRow.fromSeq(Seq(null).map(UTF8String.fromString))) } test("json_tuple - hive key 1") { @@ -420,7 +421,7 @@ class JsonExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper with test("from_json escaping") { val schema = StructType(StructField("\"quote", IntegerType) :: Nil) GenerateUnsafeProjection.generate( - JsonToStructs(schema, Map.empty, Literal("\"quote"), UTC_OPT).replacement :: Nil) + JsonToStructs(schema, Map.empty, Literal("\"quote"), UTC_OPT) :: Nil) } test("from_json") { diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/LiteralExpressionSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/LiteralExpressionSuite.scala index b351d69d3a0bb..5da5c6ac412cc 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/LiteralExpressionSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/LiteralExpressionSuite.scala 
@@ -25,13 +25,12 @@ import java.util.TimeZone import scala.collection.mutable import scala.reflect.runtime.universe.TypeTag -import org.apache.spark.{SparkException, SparkFunSuite} +import org.apache.spark.{SparkFunSuite, SparkRuntimeException} import org.apache.spark.sql.Row import org.apache.spark.sql.catalyst.{CatalystTypeConverters, ScalaReflection} import org.apache.spark.sql.catalyst.encoders.ExamplePointUDT import org.apache.spark.sql.catalyst.util.DateTimeConstants._ import org.apache.spark.sql.catalyst.util.DateTimeUtils -import org.apache.spark.sql.catalyst.util.TypeUtils.toSQLType import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types._ import org.apache.spark.sql.types.DayTimeIntervalType._ @@ -91,16 +90,8 @@ class LiteralExpressionSuite extends SparkFunSuite with ExpressionEvalHelper { // ExamplePointUDT.sqlType is ArrayType(DoubleType, false). checkEvaluation(Literal.default(new ExamplePointUDT), Array()) - // DateType without default value` - List(CharType(1), VarcharType(1)).foreach(errType => { - checkError( - exception = intercept[SparkException] { - Literal.default(errType) - }, - condition = "INTERNAL_ERROR", - parameters = Map("message" -> s"No default value for type: ${toSQLType(errType)}.") - ) - }) + checkEvaluation(Literal.default(CharType(5)), " ") + checkEvaluation(Literal.default(VarcharType(5)), "") } test("boolean literals") { @@ -160,6 +151,42 @@ class LiteralExpressionSuite extends SparkFunSuite with ExpressionEvalHelper { checkEvaluation(Literal.create("\u0000"), "\u0000") } + test("char literals") { + withSQLConf(SQLConf.PRESERVE_CHAR_VARCHAR_TYPE_INFO.key -> "true") { + val typ = CharType(5) + checkEvaluation(Literal.create("", typ), " ") + checkEvaluation(Literal.create("test", typ), "test ") + checkEvaluation(Literal.create("test ", typ), "test ") + checkEvaluation(Literal.create("\u0000", typ), "\u0000 ") + + checkError( + exception = intercept[SparkRuntimeException]({ + Literal.create("123456", typ) + 
}), + condition = "EXCEED_LIMIT_LENGTH", + parameters = Map("limit" -> "5") + ) + } + } + + test("varchar literals") { + withSQLConf(SQLConf.PRESERVE_CHAR_VARCHAR_TYPE_INFO.key -> "true") { + val typ = VarcharType(5) + checkEvaluation(Literal.create("", typ), "") + checkEvaluation(Literal.create("test", typ), "test") + checkEvaluation(Literal.create("test ", typ), "test ") + checkEvaluation(Literal.create("\u0000", typ), "\u0000") + + checkError( + exception = intercept[SparkRuntimeException]({ + Literal.create("123456", typ) + }), + condition = "EXCEED_LIMIT_LENGTH", + parameters = Map("limit" -> "5") + ) + } + } + test("sum two literals") { checkEvaluation(Add(Literal(1), Literal(1)), 2) checkEvaluation(Add(Literal.create(1), Literal.create(1)), 2) diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/PredicateSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/PredicateSuite.scala index a0c75b703ade4..4a7bf807d1de9 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/PredicateSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/PredicateSuite.scala @@ -674,4 +674,14 @@ class PredicateSuite extends SparkFunSuite with ExpressionEvalHelper { checkInAndInSet(In(Literal(Double.NaN), Seq(Literal(Double.NaN), Literal(2d), Literal.create(null, DoubleType))), true) } + + test("In and InSet logging limits") { + assert(In(Literal(1), Seq(Literal(1), Literal(2))).simpleString(1) + === "1 IN (1,... 1 more fields)") + assert(In(Literal(1), Seq(Literal(1), Literal(2))).simpleString(2) === "1 IN (1,2)") + assert(In(Literal(1), Seq(Literal(1))).simpleString(1) === "1 IN (1)") + assert(InSet(Literal(1), Set(1, 2)).simpleString(1) === "1 INSET 1, ... 
1 more fields") + assert(InSet(Literal(1), Set(1, 2)).simpleString(2) === "1 INSET 1, 2") + assert(InSet(Literal(1), Set(1)).simpleString(1) === "1 INSET 1") + } } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/RandomSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/RandomSuite.scala index 2d58d9d3136aa..9e6b59b51138d 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/RandomSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/RandomSuite.scala @@ -52,7 +52,6 @@ class RandomSuite extends SparkFunSuite with ExpressionEvalHelper { testRandStr(1, "c") testRandStr(5, "ceV0P") testRandStr(10, "ceV0PXaR2I") - testRandStr(10L, "ceV0PXaR2I") def testUniform(first: Any, second: Any, result: Any): Unit = { checkEvaluationWithoutCodegen( diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ToPrettyStringSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ToPrettyStringSuite.scala index 783fba3bfc0df..2a5f76cab3619 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ToPrettyStringSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ToPrettyStringSuite.scala @@ -19,6 +19,7 @@ package org.apache.spark.sql.catalyst.expressions import org.apache.spark.SparkFunSuite import org.apache.spark.sql.catalyst.util.DateTimeTestUtils.UTC_OPT +import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types._ import org.apache.spark.unsafe.types.{UTF8String, VariantVal} @@ -89,6 +90,9 @@ class ToPrettyStringSuite extends SparkFunSuite with ExpressionEvalHelper { test("Char as pretty strings") { checkEvaluation(ToPrettyString(Literal.create('a', CharType(5))), "a") + withSQLConf(SQLConf.PRESERVE_CHAR_VARCHAR_TYPE_INFO.key -> "true") { + checkEvaluation(ToPrettyString(Literal.create('a', CharType(5))), "a ") + } } 
test("Byte as pretty strings") { diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/codegen/BufferHolderSparkSubmitSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/codegen/BufferHolderSparkSubmitSuite.scala index 891e2d048b7a8..b0ed1ecabb8d6 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/codegen/BufferHolderSparkSubmitSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/codegen/BufferHolderSparkSubmitSuite.scala @@ -18,7 +18,9 @@ package org.apache.spark.sql.catalyst.expressions.codegen import org.scalatest.{Assertions, BeforeAndAfterEach} +import org.scalatest.concurrent.Eventually.{eventually, interval, timeout} import org.scalatest.matchers.must.Matchers +import org.scalatest.time.SpanSugar._ import org.apache.spark.{SparkIllegalArgumentException, TestUtils} import org.apache.spark.deploy.SparkSubmitTestUtils @@ -46,7 +48,10 @@ class BufferHolderSparkSubmitSuite "--conf", "spark.master.rest.enabled=false", "--conf", "spark.driver.extraJavaOptions=-ea", unusedJar.toString) - runSparkSubmit(argsForSparkSubmit) + // Given that the default timeout of runSparkSubmit is 60 seconds, try 3 times in total. 
+ eventually(timeout(210.seconds), interval(70.seconds)) { + runSparkSubmit(argsForSparkSubmit) + } } } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/json/JacksonParserSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/json/JacksonParserSuite.scala index 587e22e787b87..89cdd38a3e7b4 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/json/JacksonParserSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/json/JacksonParserSuite.scala @@ -24,6 +24,19 @@ import org.apache.spark.sql.types.StructType import org.apache.spark.unsafe.types.UTF8String class JacksonParserSuite extends SparkFunSuite { + test("feature mask should remain unchanged") { + val options = new JSONOptions(Map.empty[String, String], "GMT", "") + val parser = new JacksonParser(StructType.fromDDL("a string"), options, false, Nil) + val input = """{"a": {"b": 1}}""".getBytes + // The creating function is usually called inside `parser.parse`, but we need the JSON parser + // here for testing purpose. 
+ val jsonParser = options.buildJsonFactory().createParser(input) + val oldFeature = jsonParser.getFeatureMask + val result = parser.parse[Array[Byte]](input, (_, _) => jsonParser, UTF8String.fromBytes) + assert(result === Seq(InternalRow(UTF8String.fromString("""{"b": 1}""")))) + assert(jsonParser.getFeatureMask == oldFeature) + } + test("skipping rows using pushdown filters") { def check( input: String = """{"i":1, "s": "a"}""", diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/BooleanSimplificationSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/BooleanSimplificationSuite.scala index fc2697d55f6d0..4cc2ee99284a5 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/BooleanSimplificationSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/BooleanSimplificationSuite.scala @@ -46,7 +46,7 @@ class BooleanSimplificationSuite extends PlanTest with ExpressionEvalHelper { $"e".boolean, $"f".boolean, $"g".boolean, $"h".boolean) val testRelationWithData = LocalRelation.fromExternalRows( - testRelation.output, Seq(Row(1, 2, 3, "abc")) + testRelation.output, Seq(Row(1, 2, 3, "abc", true, true, true, true)) ) val testNotNullableRelation = LocalRelation($"a".int.notNull, $"b".int.notNull, $"c".int.notNull, @@ -54,7 +54,7 @@ class BooleanSimplificationSuite extends PlanTest with ExpressionEvalHelper { $"h".boolean.notNull) val testNotNullableRelationWithData = LocalRelation.fromExternalRows( - testNotNullableRelation.output, Seq(Row(1, 2, 3, "abc")) + testNotNullableRelation.output, Seq(Row(1, 2, 3, "abc", true, true, true, true)) ) private def checkCondition(input: Expression, expected: LogicalPlan): Unit = { diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/LimitPushdownSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/LimitPushdownSuite.scala index 02631c4cf61c9..2dcab5cfd29c4 100644 --- 
a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/LimitPushdownSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/LimitPushdownSuite.scala @@ -21,7 +21,7 @@ import org.apache.spark.sql.Row import org.apache.spark.sql.catalyst.analysis.EliminateSubqueryAliases import org.apache.spark.sql.catalyst.dsl.expressions._ import org.apache.spark.sql.catalyst.dsl.plans._ -import org.apache.spark.sql.catalyst.expressions.Add +import org.apache.spark.sql.catalyst.expressions.{Add, GenericInternalRow} import org.apache.spark.sql.catalyst.plans._ import org.apache.spark.sql.catalyst.plans.logical._ import org.apache.spark.sql.catalyst.rules._ @@ -189,7 +189,9 @@ class LimitPushdownSuite extends PlanTest { } test("full outer join where neither side is limited and left side has larger statistics") { - val xBig = testRelation.copy(data = Seq.fill(10)(null)).subquery("x") + val nulls = new GenericInternalRow( + Seq.fill(testRelation.output.length)(null).toArray.asInstanceOf[Array[Any]]) + val xBig = testRelation.copy(data = Seq.fill(10)(nulls)).subquery("x") assert(xBig.stats.sizeInBytes > y.stats.sizeInBytes) Seq(Some("x.a".attr === "y.b".attr), None).foreach { condition => val originalQuery = xBig.join(y, FullOuter, condition).limit(1).analyze @@ -204,7 +206,9 @@ class LimitPushdownSuite extends PlanTest { } test("full outer join where neither side is limited and right side has larger statistics") { - val yBig = testRelation.copy(data = Seq.fill(10)(null)).subquery("y") + val nulls = new GenericInternalRow( + Seq.fill(testRelation.output.length)(null).toArray.asInstanceOf[Array[Any]]) + val yBig = testRelation.copy(data = Seq.fill(10)(nulls)).subquery("y") assert(x.stats.sizeInBytes < yBig.stats.sizeInBytes) Seq(Some("x.a".attr === "y.b".attr), None).foreach { condition => val originalQuery = x.join(yBig, FullOuter, condition).limit(1).analyze diff --git 
a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/OptimizeJsonExprsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/OptimizeJsonExprsSuite.scala index 7af2be2db01d1..eed06da609f8e 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/OptimizeJsonExprsSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/OptimizeJsonExprsSuite.scala @@ -292,7 +292,7 @@ class OptimizeJsonExprsSuite extends PlanTest with ExpressionEvalHelper { Seq("""{"a":1, "b":2, "c": 123, "d": "test"}""", null).foreach(v => { val row = create_row(v) - checkEvaluation(e1, replace(e2).eval(row), row) + checkEvaluation(e1, e2.eval(row), row) }) } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/OptimizerLoggingSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/OptimizerLoggingSuite.scala index ac10fbfa3a3ee..95b55797b294c 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/OptimizerLoggingSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/OptimizerLoggingSuite.scala @@ -21,6 +21,7 @@ import org.apache.logging.log4j.Level import org.apache.spark.sql.catalyst.dsl.expressions._ import org.apache.spark.sql.catalyst.dsl.plans._ +import org.apache.spark.sql.catalyst.expressions.InSet import org.apache.spark.sql.catalyst.plans.PlanTest import org.apache.spark.sql.catalyst.plans.logical.{LocalRelation, LogicalPlan} import org.apache.spark.sql.catalyst.rules.RuleExecutor @@ -152,4 +153,14 @@ class OptimizerLoggingSuite extends PlanTest { verifyLog(Level.INFO, Seq("Batch Has No Effect")) } } + + test("SPARK-50329: toString for InSet should be valid for unresolved plan") { + val input = LocalRelation($"a".int, $"b".string, $"c".double) + val inSetPredicate = InSet($"a", Set(1, 2)) + val query = input.select($"a", $"b").where(inSetPredicate) + val analyzed = query.analyze + + 
assert(query.toString.contains("'a INSET (values with unresolved data types)")) + assert(analyzed.toString.contains("INSET 1, 2")) + } } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/ReorderAssociativeOperatorSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/ReorderAssociativeOperatorSuite.scala index 7733e58547fe0..69c303d4773b4 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/ReorderAssociativeOperatorSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/ReorderAssociativeOperatorSuite.scala @@ -24,6 +24,7 @@ import org.apache.spark.sql.catalyst.expressions.aggregate.Count import org.apache.spark.sql.catalyst.plans.{Inner, PlanTest} import org.apache.spark.sql.catalyst.plans.logical.{LocalRelation, LogicalPlan} import org.apache.spark.sql.catalyst.rules.RuleExecutor +import org.apache.spark.sql.internal.SQLConf class ReorderAssociativeOperatorSuite extends PlanTest { @@ -109,15 +110,17 @@ class ReorderAssociativeOperatorSuite extends PlanTest { } test("SPARK-50380: conditional branches with error expression") { - val originalQuery1 = testRelation.select(If($"a" === 1, 1L, Literal(1).div(0) + $"b")).analyze - val optimized1 = Optimize.execute(originalQuery1) - comparePlans(optimized1, originalQuery1) - - val originalQuery2 = testRelation.select( - If($"a" === 1, 1, ($"b" + Literal(Int.MaxValue)) + 1).as("col")).analyze - val optimized2 = Optimize.execute(originalQuery2) - val correctAnswer2 = testRelation.select( - If($"a" === 1, 1, $"b" + (Literal(Int.MaxValue) + 1)).as("col")).analyze - comparePlans(optimized2, correctAnswer2) + withSQLConf(SQLConf.ANSI_ENABLED.key -> true.toString) { + val originalQuery1 = testRelation.select(If($"a" === 1, 1L, Literal(1).div(0) + $"b")).analyze + val optimized1 = Optimize.execute(originalQuery1) + comparePlans(optimized1, originalQuery1) + + val originalQuery2 = testRelation.select( + If($"a" === 
1, 1, ($"b" + Literal(Int.MaxValue)) + 1).as("col")).analyze + val optimized2 = Optimize.execute(originalQuery2) + val correctAnswer2 = testRelation.select( + If($"a" === 1, 1, $"b" + (Literal(Int.MaxValue) + 1)).as("col")).analyze + comparePlans(optimized2, correctAnswer2) + } } } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/RewriteWithExpressionSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/RewriteWithExpressionSuite.scala index 0aeca961aa513..8918b58ca1b56 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/RewriteWithExpressionSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/RewriteWithExpressionSuite.scala @@ -17,7 +17,6 @@ package org.apache.spark.sql.catalyst.optimizer -import org.apache.spark.SparkException import org.apache.spark.sql.catalyst.analysis.TempResolvedColumn import org.apache.spark.sql.catalyst.dsl.expressions._ import org.apache.spark.sql.catalyst.dsl.plans._ @@ -29,7 +28,7 @@ import org.apache.spark.sql.catalyst.rules.RuleExecutor class RewriteWithExpressionSuite extends PlanTest { object Optimizer extends RuleExecutor[LogicalPlan] { - val batches = Batch("Rewrite With expression", Once, + val batches = Batch("Rewrite With expression", FixedPoint(5), PullOutGroupingExpressions, RewriteWithExpression) :: Nil } @@ -84,13 +83,11 @@ class RewriteWithExpressionSuite extends PlanTest { ref * ref } - val plan = testRelation.select(outerExpr.as("col")) comparePlans( - Optimizer.execute(plan), + Optimizer.execute(testRelation.select(outerExpr.as("col"))), testRelation - .select((testRelation.output :+ (a + a).as("_common_expr_0")): _*) - .select((testRelation.output ++ Seq($"_common_expr_0", - ($"_common_expr_0" + $"_common_expr_0" + b).as("_common_expr_1"))): _*) + .select(star(), (a + a).as("_common_expr_0")) + .select(a, b, ($"_common_expr_0" + $"_common_expr_0" + b).as("_common_expr_1")) 
.select(($"_common_expr_1" * $"_common_expr_1").as("col")) .analyze ) @@ -104,42 +101,61 @@ class RewriteWithExpressionSuite extends PlanTest { val outerExpr = With(b + b) { case Seq(ref) => ref * ref + innerExpr } - - val plan = testRelation.select(outerExpr.as("col")) - val rewrittenInnerExpr = (a + a).as("_common_expr_0") - val rewrittenOuterExpr = (b + b).as("_common_expr_1") - val finalExpr = rewrittenOuterExpr.toAttribute * rewrittenOuterExpr.toAttribute + - (rewrittenInnerExpr.toAttribute + rewrittenInnerExpr.toAttribute) + val finalExpr = $"_common_expr_1" * $"_common_expr_1" + ($"_common_expr_0" + $"_common_expr_0") comparePlans( - Optimizer.execute(plan), + Optimizer.execute(testRelation.select(outerExpr.as("col"))), testRelation - .select((testRelation.output :+ rewrittenInnerExpr): _*) - .select((testRelation.output :+ rewrittenInnerExpr.toAttribute :+ rewrittenOuterExpr): _*) + .select(star(), (b + b).as("_common_expr_1")) + .select(star(), (a + a).as("_common_expr_0")) .select(finalExpr.as("col")) .analyze ) } - test("correlated nested WITH expression is not supported") { + test("correlated nested WITH expression is supported") { val Seq(a, b) = testRelation.output val outerCommonExprDef = CommonExpressionDef(b + b, CommonExpressionId(0)) val outerRef = new CommonExpressionRef(outerCommonExprDef) + val rewrittenOuterExpr = (b + b).as("_common_expr_0") // The inner expression definition references the outer expression val commonExprDef1 = CommonExpressionDef(a + a + outerRef, CommonExpressionId(1)) val ref1 = new CommonExpressionRef(commonExprDef1) val innerExpr1 = With(ref1 + ref1, Seq(commonExprDef1)) - val outerExpr1 = With(outerRef + innerExpr1, Seq(outerCommonExprDef)) - intercept[SparkException](Optimizer.execute(testRelation.select(outerExpr1.as("col")))) + comparePlans( + Optimizer.execute(testRelation.select(outerExpr1.as("col"))), + testRelation + // The first Project contains the common expression of the outer With + .select(star(), 
rewrittenOuterExpr) + // The second Project contains the common expression of the inner With, which references + // the common expression of the outer With. + .select(star(), (a + a + $"_common_expr_0").as("_common_expr_1")) + // The final Project contains the final result expression, which references both common + // expressions. + .select(($"_common_expr_0" + ($"_common_expr_1" + $"_common_expr_1")).as("col")) + .analyze + ) - val commonExprDef2 = CommonExpressionDef(a + a) + val commonExprDef2 = CommonExpressionDef(a + a, CommonExpressionId(2)) val ref2 = new CommonExpressionRef(commonExprDef2) // The inner main expression references the outer expression - val innerExpr2 = With(ref2 + outerRef, Seq(commonExprDef1)) - + val innerExpr2 = With(ref2 + ref2 + outerRef, Seq(commonExprDef2)) val outerExpr2 = With(outerRef + innerExpr2, Seq(outerCommonExprDef)) - intercept[SparkException](Optimizer.execute(testRelation.select(outerExpr2.as("col")))) + comparePlans( + Optimizer.execute(testRelation.select(outerExpr2.as("col"))), + testRelation + // The first Project contains the common expression of the outer With + .select(star(), rewrittenOuterExpr) + // The second Project contains the common expression of the inner With, which does not + // reference the common expression of the outer With. + .select(star(), (a + a).as("_common_expr_2")) + // The final Project contains the final result expression, which references both common + // expressions. 
+ .select(($"_common_expr_0" + + ($"_common_expr_2" + $"_common_expr_2" + $"_common_expr_0")).as("col")) + .analyze + ) } test("WITH expression in filter") { @@ -389,17 +405,16 @@ class RewriteWithExpressionSuite extends PlanTest { Optimizer.execute(plan), testRelation .select(a, b, (b + 2).as("_common_expr_0")) - .select(a, b, $"_common_expr_0", (b + 2).as("_common_expr_1")) .window( Seq(windowExpr(count(a), windowSpec(Seq($"_common_expr_0" * $"_common_expr_0"), Nil, frame)).as("col2")), - Seq($"_common_expr_1" * $"_common_expr_1"), + Seq($"_common_expr_0" * $"_common_expr_0"), Nil ) .select(a, b, $"col2") - .select(a, b, $"col2", (a + 1).as("_common_expr_2")) + .select(a, b, $"col2", (a + 1).as("_common_expr_1")) .window( - Seq(windowExpr(sum($"_common_expr_2" * $"_common_expr_2"), + Seq(windowExpr(sum($"_common_expr_1" * $"_common_expr_1"), windowSpec(Seq(a), Nil, frame)).as("col3")), Seq(a), Nil @@ -452,4 +467,37 @@ class RewriteWithExpressionSuite extends PlanTest { testRelation.groupBy($"b")(avg("a").as("a")).where($"a" === 1).analyze ) } + + test("SPARK-50679: duplicated common expressions in different With") { + val a = testRelation.output.head + val exprDef = CommonExpressionDef(a + a) + val exprRef = new CommonExpressionRef(exprDef) + val expr1 = With(exprRef * exprRef, Seq(exprDef)) + val expr2 = With(exprRef - exprRef, Seq(exprDef)) + val plan = testRelation.select(expr1.as("c1"), expr2.as("c2")).analyze + comparePlans( + Optimizer.execute(plan), + testRelation + .select(star(), (a + a).as("_common_expr_0")) + .select( + ($"_common_expr_0" * $"_common_expr_0").as("c1"), + ($"_common_expr_0" - $"_common_expr_0").as("c2")) + .analyze + ) + + val wrongExprDef = CommonExpressionDef(a * a, exprDef.id) + val wrongExprRef = new CommonExpressionRef(wrongExprDef) + val expr3 = With(wrongExprRef + wrongExprRef, Seq(wrongExprDef)) + val wrongPlan = testRelation.select(expr1.as("c1"), expr3.as("c3")).analyze + intercept[AssertionError](Optimizer.execute(wrongPlan)) 
+ } + + test("SPARK-50683: inline the common expression in With if used once") { + val a = testRelation.output.head + val exprDef = CommonExpressionDef(a + a) + val exprRef = new CommonExpressionRef(exprDef) + val expr = With(exprRef + 1, Seq(exprDef)) + val plan = testRelation.select(expr.as("col")) + comparePlans(Optimizer.execute(plan), testRelation.select((a + a + 1).as("col"))) + } } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/DDLParserSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/DDLParserSuite.scala index 5e871208698af..0ec2c80282fc2 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/DDLParserSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/DDLParserSuite.scala @@ -2655,7 +2655,7 @@ class DDLParserSuite extends AnalysisTest { val createTableResult = CreateTable(UnresolvedIdentifier(Seq("my_tab")), columnsWithDefaultValue, Seq.empty[Transform], UnresolvedTableSpec(Map.empty[String, String], Some("parquet"), - OptionList(Seq.empty), None, None, None, false), false) + OptionList(Seq.empty), None, None, None, None, false), false) // Parse the CREATE TABLE statement twice, swapping the order of the NOT NULL and DEFAULT // options, to make sure that the parser accepts any ordering of these options. comparePlans(parsePlan( @@ -2668,7 +2668,7 @@ class DDLParserSuite extends AnalysisTest { "b STRING NOT NULL DEFAULT 'abc') USING parquet"), ReplaceTable(UnresolvedIdentifier(Seq("my_tab")), columnsWithDefaultValue, Seq.empty[Transform], UnresolvedTableSpec(Map.empty[String, String], Some("parquet"), - OptionList(Seq.empty), None, None, None, false), false)) + OptionList(Seq.empty), None, None, None, None, false), false)) // These ALTER TABLE statements should parse successfully. 
comparePlans( parsePlan("ALTER TABLE t1 ADD COLUMN x int NOT NULL DEFAULT 42"), @@ -2828,12 +2828,12 @@ class DDLParserSuite extends AnalysisTest { "CREATE TABLE my_tab(a INT, b INT NOT NULL GENERATED ALWAYS AS (a+1)) USING parquet"), CreateTable(UnresolvedIdentifier(Seq("my_tab")), columnsWithGenerationExpr, Seq.empty[Transform], UnresolvedTableSpec(Map.empty[String, String], Some("parquet"), - OptionList(Seq.empty), None, None, None, false), false)) + OptionList(Seq.empty), None, None, None, None, false), false)) comparePlans(parsePlan( "REPLACE TABLE my_tab(a INT, b INT NOT NULL GENERATED ALWAYS AS (a+1)) USING parquet"), ReplaceTable(UnresolvedIdentifier(Seq("my_tab")), columnsWithGenerationExpr, Seq.empty[Transform], UnresolvedTableSpec(Map.empty[String, String], Some("parquet"), - OptionList(Seq.empty), None, None, None, false), false)) + OptionList(Seq.empty), None, None, None, None, false), false)) // Two generation expressions checkError( exception = parseException("CREATE TABLE my_tab(a INT, " + @@ -2903,6 +2903,7 @@ class DDLParserSuite extends AnalysisTest { None, None, None, + None, false ), false @@ -2925,6 +2926,7 @@ class DDLParserSuite extends AnalysisTest { None, None, None, + None, false ), false @@ -3198,4 +3200,49 @@ class DDLParserSuite extends AnalysisTest { condition = "INTERNAL_ERROR", parameters = Map("message" -> "INSERT OVERWRITE DIRECTORY is not supported.")) } + + test("create table with bad collation name") { + checkError( + exception = internalException("CREATE TABLE t DEFAULT COLLATION XD"), + condition = "COLLATION_INVALID_NAME", + parameters = Map("proposals" -> "id, xh, af", "collationName" -> "XD") + ) + } + + private val testSuppCollations = + Seq("UTF8_BINARY", "UTF8_LCASE", "UNICODE", "UNICODE_CI", "UNICODE_CI_RTRIM", "sr", "sr_CI_AI") + + test("create table with default collation") { + testSuppCollations.foreach { collation => + comparePlans(parsePlan( + s"CREATE TABLE t (c STRING) USING parquet DEFAULT COLLATION 
${collation.toLowerCase()}"), + CreateTable(UnresolvedIdentifier(Seq("t")), + Seq(ColumnDefinition("c", StringType)), + Seq.empty[Transform], + UnresolvedTableSpec(Map.empty[String, String], Some("parquet"), OptionList(Seq.empty), + None, None, Some(collation), None, false), false)) + } + } + + test("replace table with default collation") { + testSuppCollations.foreach { collation => + comparePlans(parsePlan( + s"REPLACE TABLE t (c STRING) USING parquet DEFAULT COLLATION ${collation.toLowerCase()}"), + ReplaceTable(UnresolvedIdentifier(Seq("t")), + Seq(ColumnDefinition("c", StringType)), + Seq.empty[Transform], + UnresolvedTableSpec(Map.empty[String, String], Some("parquet"), OptionList(Seq.empty), + None, None, Some(collation), None, false), false)) + } + } + + test("alter table collation") { + testSuppCollations.foreach { collation => + comparePlans(parsePlan( + s"ALTER TABLE t DEFAULT COLLATION ${collation.toLowerCase()}"), + AlterTableCollation(UnresolvedTable(Seq("t"), + "ALTER TABLE ... 
DEFAULT COLLATION"), collation) + ) + } + } } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/PlanParserSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/PlanParserSuite.scala index c556a92373954..9e5555c4c6c0c 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/PlanParserSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/PlanParserSuite.scala @@ -323,19 +323,9 @@ class PlanParserSuite extends AnalysisTest { assertEqual( "from db.a select b, c where d < 1", table("db", "a").where($"d" < 1).select($"b", $"c")) assertEqual("from a select distinct b, c", Distinct(table("a").select($"b", $"c"))) - - // Weird "FROM table" queries, should be invalid anyway - val sql1 = "from a" - checkError( - exception = parseException(sql1), - condition = "PARSE_SYNTAX_ERROR", - parameters = Map("error" -> "end of input", "hint" -> "")) - - val sql2 = "from (from a union all from b) c select *" - checkError( - exception = parseException(sql2), - condition = "PARSE_SYNTAX_ERROR", - parameters = Map("error" -> "'union'", "hint" -> "")) + assertEqual("from a", table("a")) + assertEqual("from (from a union all from b) c select *", + table("a").union(table("b")).subquery("c").select(star())) } test("multi select query") { diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/SqlScriptingParserSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/SqlScriptingParserSuite.scala index 3bb84f603dc67..e129c6dbba052 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/SqlScriptingParserSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/SqlScriptingParserSuite.scala @@ -20,7 +20,7 @@ package org.apache.spark.sql.catalyst.parser import org.apache.spark.SparkFunSuite import org.apache.spark.sql.catalyst.expressions.{Alias, EqualTo, Expression, In, Literal, ScalarSubquery} import 
org.apache.spark.sql.catalyst.plans.SQLHelper -import org.apache.spark.sql.catalyst.plans.logical.{CaseStatement, CompoundBody, CreateVariable, IfElseStatement, IterateStatement, LeaveStatement, LoopStatement, Project, RepeatStatement, SingleStatement, WhileStatement} +import org.apache.spark.sql.catalyst.plans.logical.{CaseStatement, CompoundBody, CreateVariable, ForStatement, IfElseStatement, IterateStatement, LeaveStatement, LoopStatement, Project, RepeatStatement, SingleStatement, WhileStatement} import org.apache.spark.sql.errors.DataTypeErrors.toSQLId import org.apache.spark.sql.exceptions.SqlScriptingException import org.apache.spark.sql.internal.SQLConf @@ -82,7 +82,7 @@ class SqlScriptingParserSuite extends SparkFunSuite with SQLHelper { } } - test("empty BEGIN END block") { + test("empty singleCompoundStatement") { val sqlScriptText = """ |BEGIN @@ -91,6 +91,20 @@ class SqlScriptingParserSuite extends SparkFunSuite with SQLHelper { assert(tree.collection.isEmpty) } + test("empty beginEndCompoundBlock") { + val sqlScriptText = + """ + |BEGIN + | BEGIN + | END; + |END""".stripMargin + val tree = parsePlan(sqlScriptText).asInstanceOf[CompoundBody] + assert(tree.collection.length == 1) + assert(tree.collection.head.isInstanceOf[CompoundBody]) + val innerBody = tree.collection.head.asInstanceOf[CompoundBody] + assert(innerBody.collection.isEmpty) + } + test("multiple ; in row - should fail") { val sqlScriptText = """ @@ -439,6 +453,21 @@ class SqlScriptingParserSuite extends SparkFunSuite with SQLHelper { assert(ifStmt.conditions.head.getText == "1=1") } + test("if with empty body") { + val sqlScriptText = + """BEGIN + | IF 1 = 1 THEN + | END IF; + |END + """.stripMargin + checkError( + exception = intercept[ParseException] { + parsePlan(sqlScriptText) + }, + condition = "PARSE_SYNTAX_ERROR", + parameters = Map("error" -> "'IF'", "hint" -> "")) + } + test("if else") { val sqlScriptText = """BEGIN @@ -623,6 +652,21 @@ class SqlScriptingParserSuite extends 
SparkFunSuite with SQLHelper { assert(whileStmt.label.contains("lbl")) } + test("while with empty body") { + val sqlScriptText = + """BEGIN + | WHILE 1 = 1 DO + | END WHILE; + |END + """.stripMargin + checkError( + exception = intercept[ParseException] { + parsePlan(sqlScriptText) + }, + condition = "PARSE_SYNTAX_ERROR", + parameters = Map("error" -> "'WHILE'", "hint" -> "")) + } + test("while with complex condition") { val sqlScriptText = """ @@ -1067,6 +1111,21 @@ class SqlScriptingParserSuite extends SparkFunSuite with SQLHelper { assert(repeatStmt.label.contains("lbl")) } + test("repeat with empty body") { + val sqlScriptText = + """BEGIN + | REPEAT UNTIL 1 = 1 + | END REPEAT; + |END + """.stripMargin + checkError( + exception = intercept[ParseException] { + parsePlan(sqlScriptText) + }, + condition = "PARSE_SYNTAX_ERROR", + parameters = Map("error" -> "'1'", "hint" -> "")) + } + test("repeat with complex condition") { val sqlScriptText = """ @@ -1176,7 +1235,6 @@ class SqlScriptingParserSuite extends SparkFunSuite with SQLHelper { head.asInstanceOf[SingleStatement].getText == "SELECT 42") assert(whileStmt.label.contains("lbl")) - } test("searched case statement") { @@ -1198,6 +1256,22 @@ class SqlScriptingParserSuite extends SparkFunSuite with SQLHelper { assert(caseStmt.conditions.head.getText == "1 = 1") } + test("searched case statement with empty body") { + val sqlScriptText = + """BEGIN + | CASE + | WHEN 1 = 1 THEN + | END CASE; + |END + """.stripMargin + checkError( + exception = intercept[ParseException] { + parsePlan(sqlScriptText) + }, + condition = "PARSE_SYNTAX_ERROR", + parameters = Map("error" -> "'CASE'", "hint" -> "")) + } + test("searched case statement - multi when") { val sqlScriptText = """ @@ -1336,6 +1410,21 @@ class SqlScriptingParserSuite extends SparkFunSuite with SQLHelper { checkSimpleCaseStatementCondition(caseStmt.conditions.head, _ == Literal(1), _ == Literal(1)) } + test("simple case statement with empty body") { + val 
sqlScriptText = + """BEGIN + | CASE 1 + | WHEN 1 THEN + | END CASE; + |END + """.stripMargin + checkError( + exception = intercept[ParseException] { + parsePlan(sqlScriptText) + }, + condition = "PARSE_SYNTAX_ERROR", + parameters = Map("error" -> "'CASE'", "hint" -> "")) + } test("simple case statement - multi when") { val sqlScriptText = @@ -1483,6 +1572,21 @@ class SqlScriptingParserSuite extends SparkFunSuite with SQLHelper { assert(whileStmt.label.contains("lbl")) } + test("loop with empty body") { + val sqlScriptText = + """BEGIN + | LOOP + | END LOOP; + |END + """.stripMargin + checkError( + exception = intercept[ParseException] { + parsePlan(sqlScriptText) + }, + condition = "PARSE_SYNTAX_ERROR", + parameters = Map("error" -> "'LOOP'", "hint" -> "")) + } + test("loop with if else block") { val sqlScriptText = """BEGIN @@ -1823,6 +1927,25 @@ class SqlScriptingParserSuite extends SparkFunSuite with SQLHelper { parameters = Map("label" -> toSQLId("l_loop"))) } + test("unique label names: nested for loops") { + val sqlScriptText = + """BEGIN + |f_loop: FOR x AS SELECT 1 DO + | f_loop: FOR y AS SELECT 2 DO + | SELECT 1; + | END FOR; + |END FOR; + |END + """.stripMargin + val exception = intercept[SqlScriptingException] { + parsePlan(sqlScriptText).asInstanceOf[CompoundBody] + } + checkError( + exception = exception, + condition = "LABEL_ALREADY_EXISTS", + parameters = Map("label" -> toSQLId("f_loop"))) + } + test("unique label names: begin-end block on the same level") { val sqlScriptText = """BEGIN @@ -1858,10 +1981,13 @@ class SqlScriptingParserSuite extends SparkFunSuite with SQLHelper { | SELECT 4; |UNTIL 1=1 |END REPEAT; + |lbl: FOR x AS SELECT 1 DO + | SELECT 5; + |END FOR; |END """.stripMargin val tree = parsePlan(sqlScriptText).asInstanceOf[CompoundBody] - assert(tree.collection.length == 4) + assert(tree.collection.length == 5) assert(tree.collection.head.isInstanceOf[CompoundBody]) assert(tree.collection.head.asInstanceOf[CompoundBody].label.get == 
"lbl") assert(tree.collection(1).isInstanceOf[WhileStatement]) @@ -1870,6 +1996,23 @@ class SqlScriptingParserSuite extends SparkFunSuite with SQLHelper { assert(tree.collection(2).asInstanceOf[LoopStatement].label.get == "lbl") assert(tree.collection(3).isInstanceOf[RepeatStatement]) assert(tree.collection(3).asInstanceOf[RepeatStatement].label.get == "lbl") + assert(tree.collection(4).isInstanceOf[ForStatement]) + assert(tree.collection(4).asInstanceOf[ForStatement].label.get == "lbl") + } + + test("qualified label name: label cannot be qualified") { + val sqlScriptText = + """ + |BEGIN + | part1.part2: BEGIN + | END; + |END""".stripMargin + checkError( + exception = intercept[SqlScriptingException] { + parsePlan(sqlScriptText) + }, + condition = "INVALID_LABEL_USAGE.QUALIFIED_LABEL_NAME", + parameters = Map("labelName" -> "PART1.PART2")) } test("unique label names: nested labeled scope statements") { @@ -1879,7 +2022,9 @@ class SqlScriptingParserSuite extends SparkFunSuite with SQLHelper { | lbl_1: WHILE 1=1 DO | lbl_2: LOOP | lbl_3: REPEAT - | SELECT 4; + | lbl_4: FOR x AS SELECT 1 DO + | SELECT 4; + | END FOR; | UNTIL 1=1 | END REPEAT; | END LOOP; @@ -1905,6 +2050,271 @@ class SqlScriptingParserSuite extends SparkFunSuite with SQLHelper { // Repeat statement val repeatStatement = loopStatement.body.collection.head.asInstanceOf[RepeatStatement] assert(repeatStatement.label.get == "lbl_3") + // For statement + val forStatement = repeatStatement.body.collection.head.asInstanceOf[ForStatement] + assert(forStatement.label.get == "lbl_4") + } + + test("for statement") { + val sqlScriptText = + """ + |BEGIN + | lbl: FOR x AS SELECT 5 DO + | SELECT 1; + | END FOR; + |END""".stripMargin + val tree = parsePlan(sqlScriptText).asInstanceOf[CompoundBody] + assert(tree.collection.length == 1) + assert(tree.collection.head.isInstanceOf[ForStatement]) + + val forStmt = tree.collection.head.asInstanceOf[ForStatement] + assert(forStmt.query.isInstanceOf[SingleStatement]) + 
assert(forStmt.query.getText == "SELECT 5") + assert(forStmt.variableName.contains("x")) + + assert(forStmt.body.isInstanceOf[CompoundBody]) + assert(forStmt.body.collection.length == 1) + assert(forStmt.body.collection.head.isInstanceOf[SingleStatement]) + assert(forStmt.body.collection.head.asInstanceOf[SingleStatement].getText == "SELECT 1") + + assert(forStmt.label.contains("lbl")) + } + + test("for statement - empty body") { + val sqlScriptText = + """ + |BEGIN + | lbl: FOR x AS SELECT 5 DO + | END FOR; + |END""".stripMargin + checkError( + exception = intercept[ParseException] { + parsePlan(sqlScriptText) + }, + condition = "PARSE_SYNTAX_ERROR", + parameters = Map("error" -> "'FOR'", "hint" -> "")) + } + + test("for statement - no label") { + val sqlScriptText = + """ + |BEGIN + | FOR x AS SELECT 5 DO + | SELECT 1; + | END FOR; + |END""".stripMargin + val tree = parsePlan(sqlScriptText).asInstanceOf[CompoundBody] + assert(tree.collection.length == 1) + assert(tree.collection.head.isInstanceOf[ForStatement]) + + val forStmt = tree.collection.head.asInstanceOf[ForStatement] + assert(forStmt.query.isInstanceOf[SingleStatement]) + assert(forStmt.query.getText == "SELECT 5") + assert(forStmt.variableName.contains("x")) + + assert(forStmt.body.isInstanceOf[CompoundBody]) + assert(forStmt.body.collection.length == 1) + assert(forStmt.body.collection.head.isInstanceOf[SingleStatement]) + assert(forStmt.body.collection.head.asInstanceOf[SingleStatement].getText == "SELECT 1") + + // when not explicitly set, label is random UUID + assert(forStmt.label.isDefined) + } + + test("for statement - with complex subquery") { + val sqlScriptText = + """ + |BEGIN + | lbl: FOR x AS SELECT c1, c2 FROM t WHERE c2 = 5 GROUP BY c1 ORDER BY c1 DO + | SELECT x.c1; + | SELECT x.c2; + | END FOR; + |END""".stripMargin + val tree = parsePlan(sqlScriptText).asInstanceOf[CompoundBody] + assert(tree.collection.length == 1) + assert(tree.collection.head.isInstanceOf[ForStatement]) + + val 
forStmt = tree.collection.head.asInstanceOf[ForStatement] + assert(forStmt.query.isInstanceOf[SingleStatement]) + assert(forStmt.query.getText == "SELECT c1, c2 FROM t WHERE c2 = 5 GROUP BY c1 ORDER BY c1") + assert(forStmt.variableName.contains("x")) + + assert(forStmt.body.isInstanceOf[CompoundBody]) + assert(forStmt.body.collection.length == 2) + assert(forStmt.body.collection.head.isInstanceOf[SingleStatement]) + assert(forStmt.body.collection.head.asInstanceOf[SingleStatement].getText == "SELECT x.c1") + assert(forStmt.body.collection(1).isInstanceOf[SingleStatement]) + assert(forStmt.body.collection(1).asInstanceOf[SingleStatement].getText == "SELECT x.c2") + + assert(forStmt.label.contains("lbl")) + } + + test("for statement - nested") { + val sqlScriptText = + """ + |BEGIN + | lbl1: FOR i AS SELECT 1 DO + | lbl2: FOR j AS SELECT 2 DO + | SELECT i + j; + | END FOR lbl2; + | END FOR lbl1; + |END""".stripMargin + val tree = parsePlan(sqlScriptText).asInstanceOf[CompoundBody] + assert(tree.collection.length == 1) + assert(tree.collection.head.isInstanceOf[ForStatement]) + + val forStmt = tree.collection.head.asInstanceOf[ForStatement] + assert(forStmt.query.isInstanceOf[SingleStatement]) + assert(forStmt.query.getText == "SELECT 1") + assert(forStmt.variableName.contains("i")) + assert(forStmt.label.contains("lbl1")) + + assert(forStmt.body.isInstanceOf[CompoundBody]) + assert(forStmt.body.collection.length == 1) + assert(forStmt.body.collection.head.isInstanceOf[ForStatement]) + val nestedForStmt = forStmt.body.collection.head.asInstanceOf[ForStatement] + + assert(nestedForStmt.query.isInstanceOf[SingleStatement]) + assert(nestedForStmt.query.getText == "SELECT 2") + assert(nestedForStmt.variableName.contains("j")) + assert(nestedForStmt.label.contains("lbl2")) + + assert(nestedForStmt.body.isInstanceOf[CompoundBody]) + assert(nestedForStmt.body.collection.length == 1) + assert(nestedForStmt.body.collection.head.isInstanceOf[SingleStatement]) + 
assert(nestedForStmt.body.collection. + head.asInstanceOf[SingleStatement].getText == "SELECT i + j") + } + + test("for statement - no variable") { + val sqlScriptText = + """ + |BEGIN + | lbl: FOR SELECT 5 DO + | SELECT 1; + | END FOR; + |END""".stripMargin + val tree = parsePlan(sqlScriptText).asInstanceOf[CompoundBody] + assert(tree.collection.length == 1) + assert(tree.collection.head.isInstanceOf[ForStatement]) + + val forStmt = tree.collection.head.asInstanceOf[ForStatement] + assert(forStmt.query.isInstanceOf[SingleStatement]) + assert(forStmt.query.getText == "SELECT 5") + assert(forStmt.variableName.isEmpty) + + assert(forStmt.body.isInstanceOf[CompoundBody]) + assert(forStmt.body.collection.length == 1) + assert(forStmt.body.collection.head.isInstanceOf[SingleStatement]) + assert(forStmt.body.collection.head.asInstanceOf[SingleStatement].getText == "SELECT 1") + + assert(forStmt.label.contains("lbl")) + } + + test("for statement - no variable - empty body") { + val sqlScriptText = + """ + |BEGIN + | lbl: FOR SELECT 5 DO + | END FOR; + |END""".stripMargin + checkError( + exception = intercept[ParseException] { + parsePlan(sqlScriptText) + }, + condition = "PARSE_SYNTAX_ERROR", + parameters = Map("error" -> "'FOR'", "hint" -> "")) + } + + test("for statement - no variable - no label") { + val sqlScriptText = + """ + |BEGIN + | FOR SELECT 5 DO + | SELECT 1; + | END FOR; + |END""".stripMargin + val tree = parsePlan(sqlScriptText).asInstanceOf[CompoundBody] + assert(tree.collection.length == 1) + assert(tree.collection.head.isInstanceOf[ForStatement]) + + val forStmt = tree.collection.head.asInstanceOf[ForStatement] + assert(forStmt.query.isInstanceOf[SingleStatement]) + assert(forStmt.query.getText == "SELECT 5") + assert(forStmt.variableName.isEmpty) + + assert(forStmt.body.isInstanceOf[CompoundBody]) + assert(forStmt.body.collection.length == 1) + assert(forStmt.body.collection.head.isInstanceOf[SingleStatement]) + 
assert(forStmt.body.collection.head.asInstanceOf[SingleStatement].getText == "SELECT 1") + + // when not explicitly set, label is random UUID + assert(forStmt.label.isDefined) + } + + test("for statement - no variable - with complex subquery") { + val sqlScriptText = + """ + |BEGIN + | lbl: FOR SELECT c1, c2 FROM t WHERE c2 = 5 GROUP BY c1 ORDER BY c1 DO + | SELECT 1; + | SELECT 2; + | END FOR; + |END""".stripMargin + val tree = parsePlan(sqlScriptText).asInstanceOf[CompoundBody] + assert(tree.collection.length == 1) + assert(tree.collection.head.isInstanceOf[ForStatement]) + + val forStmt = tree.collection.head.asInstanceOf[ForStatement] + assert(forStmt.query.isInstanceOf[SingleStatement]) + assert(forStmt.query.getText == "SELECT c1, c2 FROM t WHERE c2 = 5 GROUP BY c1 ORDER BY c1") + assert(forStmt.variableName.isEmpty) + + assert(forStmt.body.isInstanceOf[CompoundBody]) + assert(forStmt.body.collection.length == 2) + assert(forStmt.body.collection.head.isInstanceOf[SingleStatement]) + assert(forStmt.body.collection.head.asInstanceOf[SingleStatement].getText == "SELECT 1") + assert(forStmt.body.collection(1).isInstanceOf[SingleStatement]) + assert(forStmt.body.collection(1).asInstanceOf[SingleStatement].getText == "SELECT 2") + + assert(forStmt.label.contains("lbl")) + } + + test("for statement - no variable - nested") { + val sqlScriptText = + """ + |BEGIN + | lbl1: FOR SELECT 1 DO + | lbl2: FOR SELECT 2 DO + | SELECT 3; + | END FOR lbl2; + | END FOR lbl1; + |END""".stripMargin + val tree = parsePlan(sqlScriptText).asInstanceOf[CompoundBody] + assert(tree.collection.length == 1) + assert(tree.collection.head.isInstanceOf[ForStatement]) + + val forStmt = tree.collection.head.asInstanceOf[ForStatement] + assert(forStmt.query.isInstanceOf[SingleStatement]) + assert(forStmt.query.getText == "SELECT 1") + assert(forStmt.variableName.isEmpty) + assert(forStmt.label.contains("lbl1")) + + assert(forStmt.body.isInstanceOf[CompoundBody]) + 
assert(forStmt.body.collection.length == 1) + assert(forStmt.body.collection.head.isInstanceOf[ForStatement]) + val nestedForStmt = forStmt.body.collection.head.asInstanceOf[ForStatement] + + assert(nestedForStmt.query.isInstanceOf[SingleStatement]) + assert(nestedForStmt.query.getText == "SELECT 2") + assert(nestedForStmt.variableName.isEmpty) + assert(nestedForStmt.label.contains("lbl2")) + + assert(nestedForStmt.body.isInstanceOf[CompoundBody]) + assert(nestedForStmt.body.collection.length == 1) + assert(nestedForStmt.body.collection.head.isInstanceOf[SingleStatement]) + assert(nestedForStmt.body.collection. + head.asInstanceOf[SingleStatement].getText == "SELECT 3") } // Helper methods diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/plans/NormalizePlanSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/plans/NormalizePlanSuite.scala new file mode 100644 index 0000000000000..5ff66098107c2 --- /dev/null +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/plans/NormalizePlanSuite.scala @@ -0,0 +1,79 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.catalyst.plans + +import org.apache.spark.SparkFunSuite +import org.apache.spark.sql.catalyst.SQLConfHelper +import org.apache.spark.sql.catalyst.dsl.plans._ +import org.apache.spark.sql.catalyst.expressions.{AssertTrue, Cast, If, Literal, TimeZoneAwareExpression} +import org.apache.spark.sql.catalyst.plans.logical.{LocalRelation, LogicalPlan} +import org.apache.spark.sql.types.BooleanType + +class NormalizePlanSuite extends SparkFunSuite with SQLConfHelper { + + test("Normalize InheritAnalysisRules expressions") { + val castWithoutTimezone = + Cast(child = Literal(1), dataType = BooleanType, ansiEnabled = conf.ansiEnabled) + val castWithTimezone = castWithoutTimezone.withTimeZone(conf.sessionLocalTimeZone) + + val baselineExpression = AssertTrue(castWithTimezone) + val baselinePlan = LocalRelation().select(baselineExpression) + + val testExpression = AssertTrue(castWithoutTimezone) + val testPlan = LocalRelation().select(testExpression) + + // Before calling [[setTimezoneForAllExpression]], [[AssertTrue]] node will look like: + // + // AssertTrue(Cast(Literal(1)), message, If(Cast(Literal(1)), Literal(null), error)) + // + // Calling [[setTimezoneForAllExpression]] will only apply timezone to the second Cast node + // because [[InheritAnalysisRules]] only sees replacement expression as its child. This will + // cause the difference when comparing [[resolvedBaselinePlan]] and [[resolvedTestPlan]], + // therefore we need normalization. + + // Before applying timezone, no timezone is set. + testPlan.expressions.foreach { + case _ @ AssertTrue(firstCast: Cast, _, _ @ If(secondCast: Cast, _, _)) => + assert(firstCast.timeZoneId.isEmpty) + assert(secondCast.timeZoneId.isEmpty) + case _ => + } + + val resolvedBaselinePlan = setTimezoneForAllExpression(baselinePlan) + val resolvedTestPlan = setTimezoneForAllExpression(testPlan) + + // After applying timezone, only the second cast gets timezone. 
+ resolvedTestPlan.expressions.foreach { + case _ @ AssertTrue(firstCast: Cast, _, _ @ If(secondCast: Cast, _, _)) => + assert(firstCast.timeZoneId.isEmpty) + assert(secondCast.timeZoneId.isDefined) + case _ => + } + + // However, plans are still different. + assert(resolvedBaselinePlan != resolvedTestPlan) + assert(NormalizePlan(resolvedBaselinePlan) == NormalizePlan(resolvedTestPlan)) + } + + private def setTimezoneForAllExpression(plan: LogicalPlan): LogicalPlan = { + plan.transformAllExpressions { + case e: TimeZoneAwareExpression if e.timeZoneId.isEmpty => + e.withTimeZone(conf.sessionLocalTimeZone) + } + } +} diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/StringUtilsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/StringUtilsSuite.scala index 385850376d147..fb4053964a841 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/StringUtilsSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/StringUtilsSuite.scala @@ -136,4 +136,19 @@ class StringUtilsSuite extends SparkFunSuite with SQLHelper { val expectedOutput = Seq("`c1`", "`v2.c2`", "`v1`.`c2`") assert(orderSuggestedIdentifiersBySimilarity(baseString, testStrings) === expectedOutput) } + + test("SPARK-50579: truncated string") { + assert(truncatedString(Seq.empty, ", ", -1) === "") + assert(truncatedString(Seq("a"), ", ", -1) === "... 1 more fields") + assert(truncatedString(Seq("B"), "(", ", ", ")", -1) === "(... 1 more fields)") + assert(truncatedString(Seq.empty, ", ", 0) === "") + assert(truncatedString(Seq.empty, "[", ", ", "]", 0) === "[]") + assert(truncatedString(Seq("a", "b"), ", ", 0) === "... 2 more fields") + assert(truncatedString(Seq.empty, ",", 1) === "") + assert(truncatedString(Seq("a"), ",", 1) === "a") + assert(truncatedString(Seq("a", "b"), ", ", 1) === "a, ... 
1 more fields") + assert(truncatedString(Seq("a", "b"), ", ", 2) === "a, b") + assert(truncatedString(Seq("a", "b", "c"), ", ", Int.MaxValue) === "a, b, c") + assert(truncatedString(Seq("a", "b", "c"), ", ", Int.MinValue) === "... 3 more fields") + } } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/connector/catalog/InMemoryBaseTable.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/connector/catalog/InMemoryBaseTable.scala index 497ef848ac78f..ab17b93ad6146 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/connector/catalog/InMemoryBaseTable.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/connector/catalog/InMemoryBaseTable.scala @@ -295,7 +295,7 @@ abstract class InMemoryBaseTable( TableCapability.TRUNCATE) override def newScanBuilder(options: CaseInsensitiveStringMap): ScanBuilder = { - new InMemoryScanBuilder(schema) + new InMemoryScanBuilder(schema, options) } private def canEvaluate(filter: Filter): Boolean = { @@ -309,8 +309,10 @@ abstract class InMemoryBaseTable( } } - class InMemoryScanBuilder(tableSchema: StructType) extends ScanBuilder - with SupportsPushDownRequiredColumns with SupportsPushDownFilters { + class InMemoryScanBuilder( + tableSchema: StructType, + options: CaseInsensitiveStringMap) extends ScanBuilder + with SupportsPushDownRequiredColumns with SupportsPushDownFilters { private var schema: StructType = tableSchema private var postScanFilters: Array[Filter] = Array.empty private var evaluableFilters: Array[Filter] = Array.empty @@ -318,7 +320,7 @@ abstract class InMemoryBaseTable( override def build: Scan = { val scan = InMemoryBatchScan( - data.map(_.asInstanceOf[InputPartition]).toImmutableArraySeq, schema, tableSchema) + data.map(_.asInstanceOf[InputPartition]).toImmutableArraySeq, schema, tableSchema, options) if (evaluableFilters.nonEmpty) { scan.filter(evaluableFilters) } @@ -442,7 +444,8 @@ abstract class InMemoryBaseTable( case class InMemoryBatchScan( var _data: Seq[InputPartition], 
readSchema: StructType, - tableSchema: StructType) + tableSchema: StructType, + options: CaseInsensitiveStringMap) extends BatchScanBaseClass(_data, readSchema, tableSchema) with SupportsRuntimeFiltering { override def filterAttributes(): Array[NamedReference] = { @@ -474,17 +477,17 @@ abstract class InMemoryBaseTable( } } - abstract class InMemoryWriterBuilder() extends SupportsTruncate with SupportsDynamicOverwrite - with SupportsStreamingUpdateAsAppend { + abstract class InMemoryWriterBuilder(val info: LogicalWriteInfo) + extends SupportsTruncate with SupportsDynamicOverwrite with SupportsStreamingUpdateAsAppend { - protected var writer: BatchWrite = Append - protected var streamingWriter: StreamingWrite = StreamingAppend + protected var writer: BatchWrite = new Append(info) + protected var streamingWriter: StreamingWrite = new StreamingAppend(info) override def overwriteDynamicPartitions(): WriteBuilder = { - if (writer != Append) { + if (!writer.isInstanceOf[Append]) { throw new IllegalArgumentException(s"Unsupported writer type: $writer") } - writer = DynamicOverwrite + writer = new DynamicOverwrite(info) streamingWriter = new StreamingNotSupportedOperation("overwriteDynamicPartitions") this } @@ -529,13 +532,13 @@ abstract class InMemoryBaseTable( override def abort(messages: Array[WriterCommitMessage]): Unit = {} } - protected object Append extends TestBatchWrite { + class Append(val info: LogicalWriteInfo) extends TestBatchWrite { override def commit(messages: Array[WriterCommitMessage]): Unit = dataMap.synchronized { withData(messages.map(_.asInstanceOf[BufferedRows])) } } - private object DynamicOverwrite extends TestBatchWrite { + class DynamicOverwrite(val info: LogicalWriteInfo) extends TestBatchWrite { override def commit(messages: Array[WriterCommitMessage]): Unit = dataMap.synchronized { val newData = messages.map(_.asInstanceOf[BufferedRows]) dataMap --= newData.flatMap(_.rows.map(getKey)) @@ -543,7 +546,7 @@ abstract class InMemoryBaseTable( } } 
- protected object TruncateAndAppend extends TestBatchWrite { + class TruncateAndAppend(val info: LogicalWriteInfo) extends TestBatchWrite { override def commit(messages: Array[WriterCommitMessage]): Unit = dataMap.synchronized { dataMap.clear() withData(messages.map(_.asInstanceOf[BufferedRows])) @@ -572,7 +575,7 @@ abstract class InMemoryBaseTable( s"${operation} isn't supported for streaming query.") } - private object StreamingAppend extends TestStreamingWrite { + class StreamingAppend(val info: LogicalWriteInfo) extends TestStreamingWrite { override def commit(epochId: Long, messages: Array[WriterCommitMessage]): Unit = { dataMap.synchronized { withData(messages.map(_.asInstanceOf[BufferedRows])) @@ -580,7 +583,7 @@ abstract class InMemoryBaseTable( } } - protected object StreamingTruncateAndAppend extends TestStreamingWrite { + class StreamingTruncateAndAppend(val info: LogicalWriteInfo) extends TestStreamingWrite { override def commit(epochId: Long, messages: Array[WriterCommitMessage]): Unit = { dataMap.synchronized { dataMap.clear() diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/connector/catalog/InMemoryRowLevelOperationTable.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/connector/catalog/InMemoryRowLevelOperationTable.scala index 4abe4c8b3e3fb..3a684dc57c02f 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/connector/catalog/InMemoryRowLevelOperationTable.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/connector/catalog/InMemoryRowLevelOperationTable.scala @@ -59,7 +59,7 @@ class InMemoryRowLevelOperationTable( } override def newScanBuilder(options: CaseInsensitiveStringMap): ScanBuilder = { - new InMemoryScanBuilder(schema) { + new InMemoryScanBuilder(schema, options) { override def build: Scan = { val scan = super.build() configuredScan = scan.asInstanceOf[InMemoryBatchScan] @@ -115,7 +115,7 @@ class InMemoryRowLevelOperationTable( override def rowId(): Array[NamedReference] = Array(PK_COLUMN_REF) 
override def newScanBuilder(options: CaseInsensitiveStringMap): ScanBuilder = { - new InMemoryScanBuilder(schema) + new InMemoryScanBuilder(schema, options) } override def newWriteBuilder(info: LogicalWriteInfo): DeltaWriteBuilder = diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/connector/catalog/InMemoryTable.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/connector/catalog/InMemoryTable.scala index af04816e6b6f0..c27b8fea059f7 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/connector/catalog/InMemoryTable.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/connector/catalog/InMemoryTable.scala @@ -84,23 +84,23 @@ class InMemoryTable( InMemoryBaseTable.maybeSimulateFailedTableWrite(new CaseInsensitiveStringMap(properties)) InMemoryBaseTable.maybeSimulateFailedTableWrite(info.options) - new InMemoryWriterBuilderWithOverWrite() + new InMemoryWriterBuilderWithOverWrite(info) } - private class InMemoryWriterBuilderWithOverWrite() extends InMemoryWriterBuilder - with SupportsOverwrite { + class InMemoryWriterBuilderWithOverWrite(override val info: LogicalWriteInfo) + extends InMemoryWriterBuilder(info) with SupportsOverwrite { override def truncate(): WriteBuilder = { - if (writer != Append) { + if (!writer.isInstanceOf[Append]) { throw new IllegalArgumentException(s"Unsupported writer type: $writer") } - writer = TruncateAndAppend - streamingWriter = StreamingTruncateAndAppend + writer = new TruncateAndAppend(info) + streamingWriter = new StreamingTruncateAndAppend(info) this } override def overwrite(filters: Array[Filter]): WriteBuilder = { - if (writer != Append) { + if (!writer.isInstanceOf[Append]) { throw new IllegalArgumentException(s"Unsupported writer type: $writer") } writer = new Overwrite(filters) diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/connector/catalog/InMemoryTableWithV2Filter.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/connector/catalog/InMemoryTableWithV2Filter.scala 
index 20ada0d622bca..9b7a90774f91c 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/connector/catalog/InMemoryTableWithV2Filter.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/connector/catalog/InMemoryTableWithV2Filter.scala @@ -47,19 +47,22 @@ class InMemoryTableWithV2Filter( } override def newScanBuilder(options: CaseInsensitiveStringMap): ScanBuilder = { - new InMemoryV2FilterScanBuilder(schema) + new InMemoryV2FilterScanBuilder(schema, options) } - class InMemoryV2FilterScanBuilder(tableSchema: StructType) - extends InMemoryScanBuilder(tableSchema) { + class InMemoryV2FilterScanBuilder( + tableSchema: StructType, + options: CaseInsensitiveStringMap) + extends InMemoryScanBuilder(tableSchema, options) { override def build: Scan = InMemoryV2FilterBatchScan( - data.map(_.asInstanceOf[InputPartition]).toImmutableArraySeq, schema, tableSchema) + data.map(_.asInstanceOf[InputPartition]).toImmutableArraySeq, schema, tableSchema, options) } case class InMemoryV2FilterBatchScan( var _data: Seq[InputPartition], readSchema: StructType, - tableSchema: StructType) + tableSchema: StructType, + options: CaseInsensitiveStringMap) extends BatchScanBaseClass(_data, readSchema, tableSchema) with SupportsRuntimeV2Filtering { override def filterAttributes(): Array[NamedReference] = { @@ -93,21 +96,21 @@ class InMemoryTableWithV2Filter( InMemoryBaseTable.maybeSimulateFailedTableWrite(new CaseInsensitiveStringMap(properties)) InMemoryBaseTable.maybeSimulateFailedTableWrite(info.options) - new InMemoryWriterBuilderWithOverWrite() + new InMemoryWriterBuilderWithOverWrite(info) } - private class InMemoryWriterBuilderWithOverWrite() extends InMemoryWriterBuilder - with SupportsOverwriteV2 { + class InMemoryWriterBuilderWithOverWrite(override val info: LogicalWriteInfo) + extends InMemoryWriterBuilder(info) with SupportsOverwriteV2 { override def truncate(): WriteBuilder = { - assert(writer == Append) - writer = TruncateAndAppend - streamingWriter = 
StreamingTruncateAndAppend + assert(writer.isInstanceOf[Append]) + writer = new TruncateAndAppend(info) + streamingWriter = new StreamingTruncateAndAppend(info) this } override def overwrite(predicates: Array[Predicate]): WriteBuilder = { - assert(writer == Append) + assert(writer.isInstanceOf[Append]) writer = new Overwrite(predicates) streamingWriter = new StreamingNotSupportedOperation( s"overwrite (${predicates.mkString("filters(", ", ", ")")})") diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/connector/catalog/StagingInMemoryTableCatalog.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/connector/catalog/StagingInMemoryTableCatalog.scala index f3c7bc98cec09..2a207901b83f5 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/connector/catalog/StagingInMemoryTableCatalog.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/connector/catalog/StagingInMemoryTableCatalog.scala @@ -78,7 +78,7 @@ class StagingInMemoryTableCatalog extends InMemoryTableCatalog with StagingTable maybeSimulateFailedTableCreation(properties) } - private abstract class TestStagedTable( + protected abstract class TestStagedTable( ident: Identifier, delegateTable: InMemoryTable) extends StagedTable with SupportsWrite with SupportsRead { diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/types/DataTypeSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/types/DataTypeSuite.scala index d5fc4d87bb6ad..397241be76eb1 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/types/DataTypeSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/types/DataTypeSuite.scala @@ -23,13 +23,11 @@ import org.json4s.jackson.JsonMethods import org.apache.spark.{SparkException, SparkFunSuite, SparkIllegalArgumentException} import org.apache.spark.sql.catalyst.analysis.{caseInsensitiveResolution, caseSensitiveResolution} import org.apache.spark.sql.catalyst.parser.CatalystSqlParser -import org.apache.spark.sql.catalyst.plans.SQLHelper 
import org.apache.spark.sql.catalyst.types.DataTypeUtils import org.apache.spark.sql.catalyst.util.{CollationFactory, StringConcat} -import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types.DataTypeTestUtils.{dayTimeIntervalTypes, yearMonthIntervalTypes} -class DataTypeSuite extends SparkFunSuite with SQLHelper { +class DataTypeSuite extends SparkFunSuite { private val UNICODE_COLLATION_ID = CollationFactory.collationNameToId("UNICODE") @@ -368,6 +366,8 @@ class DataTypeSuite extends SparkFunSuite with SQLHelper { checkDefaultSize(TimestampType, 8) checkDefaultSize(TimestampNTZType, 8) checkDefaultSize(StringType, 20) + checkDefaultSize(CharType(20), 20) + checkDefaultSize(VarcharType(20), 20) checkDefaultSize(BinaryType, 100) checkDefaultSize(ArrayType(DoubleType, true), 8) checkDefaultSize(ArrayType(StringType, false), 20) @@ -412,6 +412,14 @@ class DataTypeSuite extends SparkFunSuite with SQLHelper { from = ArrayType(DoubleType, containsNull = false), to = ArrayType(StringType, containsNull = false), expected = false) + checkEqualsIgnoreCompatibleNullability( + from = ArrayType(CharType(5), containsNull = false), + to = ArrayType(StringType, containsNull = false), + expected = false) + checkEqualsIgnoreCompatibleNullability( + from = ArrayType(VarcharType(5), containsNull = false), + to = ArrayType(StringType, containsNull = false), + expected = false) checkEqualsIgnoreCompatibleNullability( from = MapType(StringType, DoubleType, valueContainsNull = true), @@ -425,6 +433,14 @@ class DataTypeSuite extends SparkFunSuite with SQLHelper { from = MapType(StringType, DoubleType, valueContainsNull = false), to = MapType(StringType, DoubleType, valueContainsNull = true), expected = true) + checkEqualsIgnoreCompatibleNullability( + from = MapType(CharType(5), DoubleType, valueContainsNull = false), + to = MapType(StringType, DoubleType, valueContainsNull = true), + expected = false) + checkEqualsIgnoreCompatibleNullability( + from = 
MapType(VarcharType(5), DoubleType, valueContainsNull = false), + to = MapType(StringType, DoubleType, valueContainsNull = true), + expected = false) checkEqualsIgnoreCompatibleNullability( from = MapType(StringType, DoubleType, valueContainsNull = true), to = MapType(StringType, DoubleType, valueContainsNull = false), @@ -443,10 +459,26 @@ class DataTypeSuite extends SparkFunSuite with SQLHelper { from = StructType(StructField("a", StringType, nullable = true) :: Nil), to = StructType(StructField("a", StringType, nullable = true) :: Nil), expected = true) + checkEqualsIgnoreCompatibleNullability( + from = StructType(StructField("a", CharType(5), nullable = true) :: Nil), + to = StructType(StructField("a", StringType, nullable = true) :: Nil), + expected = false) + checkEqualsIgnoreCompatibleNullability( + from = StructType(StructField("a", VarcharType(5), nullable = true) :: Nil), + to = StructType(StructField("a", StringType, nullable = true) :: Nil), + expected = false) checkEqualsIgnoreCompatibleNullability( from = StructType(StructField("a", StringType, nullable = false) :: Nil), to = StructType(StructField("a", StringType, nullable = false) :: Nil), expected = true) + checkEqualsIgnoreCompatibleNullability( + from = StructType(StructField("a", CharType(5), nullable = false) :: Nil), + to = StructType(StructField("a", StringType, nullable = false) :: Nil), + expected = false) + checkEqualsIgnoreCompatibleNullability( + from = StructType(StructField("a", VarcharType(5), nullable = false) :: Nil), + to = StructType(StructField("a", StringType, nullable = false) :: Nil), + expected = false) checkEqualsIgnoreCompatibleNullability( from = StructType(StructField("a", StringType, nullable = false) :: Nil), to = StructType(StructField("a", StringType, nullable = true) :: Nil), @@ -485,6 +517,8 @@ class DataTypeSuite extends SparkFunSuite with SQLHelper { checkCatalogString(DecimalType(10, 5)) checkCatalogString(BinaryType) checkCatalogString(StringType) + 
checkCatalogString(CharType(5)) + checkCatalogString(VarcharType(10)) checkCatalogString(DateType) checkCatalogString(TimestampType) checkCatalogString(createStruct(4)) @@ -509,8 +543,18 @@ class DataTypeSuite extends SparkFunSuite with SQLHelper { checkEqualsStructurally(BooleanType, BooleanType, true) checkEqualsStructurally(IntegerType, IntegerType, true) checkEqualsStructurally(IntegerType, LongType, false) + checkEqualsStructurally(CharType(5), CharType(5), true) + checkEqualsStructurally(CharType(5), CharType(10), false) + checkEqualsStructurally(CharType(5), VarcharType(5), false) + checkEqualsStructurally(VarcharType(5), VarcharType(5), true) + checkEqualsStructurally(VarcharType(5), VarcharType(10), false) + checkEqualsStructurally(VarcharType(5), CharType(5), false) checkEqualsStructurally(ArrayType(IntegerType, true), ArrayType(IntegerType, true), true) checkEqualsStructurally(ArrayType(IntegerType, true), ArrayType(IntegerType, false), false) + checkEqualsStructurally(ArrayType(CharType(5), true), ArrayType(CharType(5), true), true) + checkEqualsStructurally(ArrayType(CharType(5), true), ArrayType(CharType(5), false), false) + checkEqualsStructurally(ArrayType(VarcharType(5), true), ArrayType(VarcharType(5), true), true) + checkEqualsStructurally(ArrayType(VarcharType(5), true), ArrayType(VarcharType(5), false), false) checkEqualsStructurally( new StructType().add("f1", IntegerType), @@ -521,6 +565,15 @@ class DataTypeSuite extends SparkFunSuite with SQLHelper { new StructType().add("f2", IntegerType, false), false) + checkEqualsStructurally( + new StructType().add("f1", CharType(5)), + new StructType().add("f2", StringType), + false) + checkEqualsStructurally( + new StructType().add("f1", VarcharType(5)), + new StructType().add("f2", StringType), + false) + checkEqualsStructurally( new StructType().add("f1", IntegerType).add("f", new StructType().add("f2", StringType)), new StructType().add("f2", IntegerType).add("g", new StructType().add("f1", 
StringType)), @@ -540,6 +593,14 @@ class DataTypeSuite extends SparkFunSuite with SQLHelper { .add("g", new StructType().add("f1", StringType)), true, ignoreNullability = true) + checkEqualsStructurally( + new StructType().add("f1", IntegerType).add("f", new StructType().add("f2", CharType(5))), + new StructType().add("f2", IntegerType).add("g", new StructType().add("f1", StringType)), + false) + checkEqualsStructurally( + new StructType().add("f1", IntegerType).add("f", new StructType().add("f2", VarcharType(5))), + new StructType().add("f2", IntegerType).add("g", new StructType().add("f1", StringType)), + false) checkEqualsStructurally( ArrayType( @@ -581,6 +642,22 @@ class DataTypeSuite extends SparkFunSuite with SQLHelper { true, ignoreNullability = true) + checkEqualsStructurally( + ArrayType( + ArrayType(CharType(5), true), true), + ArrayType( + ArrayType(StringType, true), true), + false, + ignoreNullability = false) + + checkEqualsStructurally( + ArrayType( + ArrayType(VarcharType(5), true), true), + ArrayType( + ArrayType(StringType, true), true), + false, + ignoreNullability = false) + checkEqualsStructurally( MapType( ArrayType(IntegerType, true), ArrayType(IntegerType, true), true), @@ -629,6 +706,22 @@ class DataTypeSuite extends SparkFunSuite with SQLHelper { true, ignoreNullability = true) + checkEqualsStructurally( + MapType( + ArrayType(IntegerType, true), ArrayType(CharType(5), true), true), + MapType( + ArrayType(IntegerType, true), ArrayType(StringType, true), true), + false, + ignoreNullability = false) + + checkEqualsStructurally( + MapType( + ArrayType(IntegerType, true), ArrayType(VarcharType(5), true), true), + MapType( + ArrayType(IntegerType, true), ArrayType(StringType, true), true), + false, + ignoreNullability = false) + def checkEqualsStructurallyByName( from: DataType, to: DataType, @@ -659,6 +752,10 @@ class DataTypeSuite extends SparkFunSuite with SQLHelper { checkEqualsStructurallyByName(BooleanType, BooleanType, true) 
checkEqualsStructurallyByName(BooleanType, IntegerType, true) checkEqualsStructurallyByName(IntegerType, LongType, true) + checkEqualsStructurallyByName(StringType, CharType(5), true) + checkEqualsStructurallyByName(StringType, VarcharType(5), true) + checkEqualsStructurallyByName(CharType(5), StringType, true) + checkEqualsStructurallyByName(VarcharType(5), StringType, true) checkEqualsStructurallyByName( new StructType().add("f1", IntegerType).add("f2", IntegerType), @@ -667,6 +764,16 @@ class DataTypeSuite extends SparkFunSuite with SQLHelper { checkEqualsStructurallyByName( new StructType().add("f1", IntegerType).add("f2", IntegerType), + new StructType().add("f1", CharType(5)).add("f2", StringType), + true) + + checkEqualsStructurallyByName( + new StructType().add("f1", IntegerType).add("f2", IntegerType), + new StructType().add("f2", LongType).add("f1", StringType), + false) + + checkEqualsStructurallyByName( + new StructType().add("f1", IntegerType).add("f2", VarcharType(5)), new StructType().add("f2", LongType).add("f1", StringType), false) @@ -675,23 +782,45 @@ class DataTypeSuite extends SparkFunSuite with SQLHelper { new StructType().add("f1", LongType).add("f", new StructType().add("f2", BooleanType)), true) + checkEqualsStructurallyByName( + new StructType().add("f1", IntegerType).add("f", new StructType().add("f2", StringType)), + new StructType().add("f1", LongType).add("f", new StructType().add("f2", VarcharType(5))), + true) + checkEqualsStructurallyByName( new StructType().add("f1", IntegerType).add("f", new StructType().add("f2", StringType)), new StructType().add("f", new StructType().add("f2", BooleanType)).add("f1", LongType), false) + checkEqualsStructurallyByName( + new StructType().add("f1", IntegerType).add("f", new StructType().add("f2", StringType)), + new StructType().add("f", new StructType().add("f2", CharType(5))).add("f1", LongType), + false) + checkEqualsStructurallyByName( new StructType().add("f1", IntegerType).add("f2", 
IntegerType), new StructType().add("F1", LongType).add("F2", StringType), true, caseSensitive = false) + checkEqualsStructurallyByName( + new StructType().add("f1", IntegerType).add("f2", IntegerType), + new StructType().add("F1", LongType).add("F2", CharType(5)), + true, + caseSensitive = false) + checkEqualsStructurallyByName( new StructType().add("f1", IntegerType).add("f2", IntegerType), new StructType().add("F1", LongType).add("F2", StringType), false, caseSensitive = true) + checkEqualsStructurallyByName( + new StructType().add("f1", IntegerType).add("f2", IntegerType), + new StructType().add("F1", LongType).add("F2", VarcharType(5)), + false, + caseSensitive = true) + def checkEqualsIgnoreCompatibleCollation( from: DataType, to: DataType, @@ -705,19 +834,45 @@ class DataTypeSuite extends SparkFunSuite with SQLHelper { // Simple types. checkEqualsIgnoreCompatibleCollation(IntegerType, IntegerType, expected = true) - checkEqualsIgnoreCompatibleCollation(BooleanType, BooleanType, expected = true) - checkEqualsIgnoreCompatibleCollation(StringType, StringType, expected = true) checkEqualsIgnoreCompatibleCollation(IntegerType, BooleanType, expected = false) + checkEqualsIgnoreCompatibleCollation(IntegerType, StringType, expected = false) + checkEqualsIgnoreCompatibleCollation(IntegerType, CharType(5), expected = false) + checkEqualsIgnoreCompatibleCollation(IntegerType, VarcharType(5), expected = false) checkEqualsIgnoreCompatibleCollation(BooleanType, IntegerType, expected = false) - checkEqualsIgnoreCompatibleCollation(StringType, BooleanType, expected = false) + checkEqualsIgnoreCompatibleCollation(BooleanType, BooleanType, expected = true) checkEqualsIgnoreCompatibleCollation(BooleanType, StringType, expected = false) + checkEqualsIgnoreCompatibleCollation(BooleanType, CharType(5), expected = false) + checkEqualsIgnoreCompatibleCollation(BooleanType, VarcharType(5), expected = false) checkEqualsIgnoreCompatibleCollation(StringType, IntegerType, expected = 
false) - checkEqualsIgnoreCompatibleCollation(IntegerType, StringType, expected = false) + checkEqualsIgnoreCompatibleCollation(StringType, BooleanType, expected = false) + checkEqualsIgnoreCompatibleCollation(StringType, StringType, expected = true) + checkEqualsIgnoreCompatibleCollation(StringType, CharType(5), expected = false) + checkEqualsIgnoreCompatibleCollation(StringType, VarcharType(5), expected = false) + checkEqualsIgnoreCompatibleCollation(CharType(5), IntegerType, expected = false) + checkEqualsIgnoreCompatibleCollation(CharType(5), BooleanType, expected = false) + checkEqualsIgnoreCompatibleCollation(CharType(5), StringType, expected = false) + checkEqualsIgnoreCompatibleCollation(CharType(5), CharType(5), expected = true) + checkEqualsIgnoreCompatibleCollation(CharType(5), CharType(10), expected = false) + checkEqualsIgnoreCompatibleCollation(CharType(5), VarcharType(5), expected = false) + checkEqualsIgnoreCompatibleCollation(VarcharType(5), IntegerType, expected = false) + checkEqualsIgnoreCompatibleCollation(VarcharType(5), BooleanType, expected = false) + checkEqualsIgnoreCompatibleCollation(VarcharType(5), StringType, expected = false) + checkEqualsIgnoreCompatibleCollation(VarcharType(5), CharType(5), expected = false) + checkEqualsIgnoreCompatibleCollation(VarcharType(5), VarcharType(5), expected = true) + checkEqualsIgnoreCompatibleCollation(VarcharType(5), VarcharType(10), expected = false) // Collated `StringType`. 
checkEqualsIgnoreCompatibleCollation(StringType, StringType("UTF8_LCASE"), expected = true) checkEqualsIgnoreCompatibleCollation( - StringType("UTF8_BINARY"), StringType("UTF8_LCASE"), expected = true) + StringType("UTF8_LCASE"), StringType("UTF8_BINARY"), expected = true) + checkEqualsIgnoreCompatibleCollation( + StringType("UTF8_LCASE"), CharType(5), expected = false) + checkEqualsIgnoreCompatibleCollation( + CharType(5), StringType("UTF8_LCASE"), expected = false) + checkEqualsIgnoreCompatibleCollation( + StringType("UTF8_LCASE"), VarcharType(5), expected = false) + checkEqualsIgnoreCompatibleCollation( + VarcharType(5), StringType("UTF8_LCASE"), expected = false) // Complex types. checkEqualsIgnoreCompatibleCollation( ArrayType(StringType), @@ -734,6 +889,26 @@ class DataTypeSuite extends SparkFunSuite with SQLHelper { ArrayType(ArrayType(StringType("UTF8_LCASE"))), expected = false ) + checkEqualsIgnoreCompatibleCollation( + ArrayType(ArrayType(StringType)), + ArrayType(ArrayType(CharType(5))), + expected = false + ) + checkEqualsIgnoreCompatibleCollation( + ArrayType(ArrayType(StringType("UTF8_LCASE"))), + ArrayType(ArrayType(CharType(5))), + expected = false + ) + checkEqualsIgnoreCompatibleCollation( + ArrayType(ArrayType(StringType)), + ArrayType(ArrayType(VarcharType(5))), + expected = false + ) + checkEqualsIgnoreCompatibleCollation( + ArrayType(ArrayType(StringType("UTF8_LCASE"))), + ArrayType(ArrayType(VarcharType(5))), + expected = false + ) checkEqualsIgnoreCompatibleCollation( MapType(StringType, StringType), MapType(StringType, StringType("UTF8_LCASE")), @@ -744,11 +919,51 @@ class DataTypeSuite extends SparkFunSuite with SQLHelper { MapType(StringType, StringType), expected = false ) + checkEqualsIgnoreCompatibleCollation( + MapType(StringType("UTF8_LCASE"), StringType), + MapType(CharType(5), StringType), + expected = false + ) + checkEqualsIgnoreCompatibleCollation( + MapType(StringType, StringType), + MapType(CharType(5), StringType), + 
expected = false + ) + checkEqualsIgnoreCompatibleCollation( + MapType(StringType("UTF8_LCASE"), StringType), + MapType(VarcharType(5), StringType), + expected = false + ) + checkEqualsIgnoreCompatibleCollation( + MapType(StringType, StringType), + MapType(VarcharType(5), StringType), + expected = false + ) checkEqualsIgnoreCompatibleCollation( MapType(StringType("UTF8_LCASE"), ArrayType(StringType)), MapType(StringType("UTF8_LCASE"), ArrayType(StringType("UTF8_LCASE"))), expected = false ) + checkEqualsIgnoreCompatibleCollation( + MapType(StringType("UTF8_LCASE"), ArrayType(StringType)), + MapType(StringType("UTF8_LCASE"), ArrayType(CharType(5))), + expected = false + ) + checkEqualsIgnoreCompatibleCollation( + MapType(StringType("UTF8_LCASE"), ArrayType(StringType("UTF8_LCASE"))), + MapType(StringType("UTF8_LCASE"), ArrayType(CharType(5))), + expected = false + ) + checkEqualsIgnoreCompatibleCollation( + MapType(StringType("UTF8_LCASE"), ArrayType(StringType)), + MapType(StringType("UTF8_LCASE"), ArrayType(VarcharType(5))), + expected = false + ) + checkEqualsIgnoreCompatibleCollation( + MapType(StringType("UTF8_LCASE"), ArrayType(StringType("UTF8_LCASE"))), + MapType(StringType("UTF8_LCASE"), ArrayType(VarcharType(5))), + expected = false + ) checkEqualsIgnoreCompatibleCollation( MapType(ArrayType(StringType), IntegerType), MapType(ArrayType(StringType("UTF8_LCASE")), IntegerType), @@ -759,14 +974,74 @@ class DataTypeSuite extends SparkFunSuite with SQLHelper { MapType(ArrayType(StringType("UTF8_LCASE")), IntegerType), expected = true ) + checkEqualsIgnoreCompatibleCollation( + MapType(ArrayType(StringType), IntegerType), + MapType(ArrayType(CharType(5)), IntegerType), + expected = false + ) + checkEqualsIgnoreCompatibleCollation( + MapType(ArrayType(StringType("UTF8_LCASE")), IntegerType), + MapType(ArrayType(CharType(5)), IntegerType), + expected = false + ) + checkEqualsIgnoreCompatibleCollation( + MapType(ArrayType(StringType), IntegerType), + 
MapType(ArrayType(VarcharType(5)), IntegerType), + expected = false + ) + checkEqualsIgnoreCompatibleCollation( + MapType(ArrayType(StringType("UTF8_LCASE")), IntegerType), + MapType(ArrayType(VarcharType(5)), IntegerType), + expected = false + ) checkEqualsIgnoreCompatibleCollation( StructType(StructField("a", StringType) :: Nil), StructType(StructField("a", StringType("UTF8_LCASE")) :: Nil), expected = false ) + checkEqualsIgnoreCompatibleCollation( + StructType(StructField("a", StringType) :: Nil), + StructType(StructField("a", CharType(5)) :: Nil), + expected = false + ) + checkEqualsIgnoreCompatibleCollation( + StructType(StructField("a", StringType("UTF8_LCASE")) :: Nil), + StructType(StructField("a", CharType(5)) :: Nil), + expected = false + ) + checkEqualsIgnoreCompatibleCollation( + StructType(StructField("a", StringType) :: Nil), + StructType(StructField("a", VarcharType(5)) :: Nil), + expected = false + ) + checkEqualsIgnoreCompatibleCollation( + StructType(StructField("a", StringType("UTF8_LCASE")) :: Nil), + StructType(StructField("a", VarcharType(5)) :: Nil), + expected = false + ) + checkEqualsIgnoreCompatibleCollation( + StructType(StructField("a", ArrayType(StringType)) :: Nil), + StructType(StructField("a", ArrayType(StringType("UTF8_LCASE"))) :: Nil), + expected = false + ) + checkEqualsIgnoreCompatibleCollation( + StructType(StructField("a", ArrayType(StringType)) :: Nil), + StructType(StructField("a", ArrayType(CharType(5))) :: Nil), + expected = false + ) + checkEqualsIgnoreCompatibleCollation( + StructType(StructField("a", ArrayType(StringType("UTF8_LCASE"))) :: Nil), + StructType(StructField("a", ArrayType(CharType(5))) :: Nil), + expected = false + ) checkEqualsIgnoreCompatibleCollation( StructType(StructField("a", ArrayType(StringType)) :: Nil), + StructType(StructField("a", ArrayType(VarcharType(5))) :: Nil), + expected = false + ) + checkEqualsIgnoreCompatibleCollation( StructType(StructField("a", ArrayType(StringType("UTF8_LCASE"))) :: 
Nil), + StructType(StructField("a", ArrayType(VarcharType(5))) :: Nil), expected = false ) checkEqualsIgnoreCompatibleCollation( @@ -774,11 +1049,51 @@ class DataTypeSuite extends SparkFunSuite with SQLHelper { StructType(StructField("a", MapType(StringType("UTF8_LCASE"), IntegerType)) :: Nil), expected = false ) + checkEqualsIgnoreCompatibleCollation( + StructType(StructField("a", MapType(StringType, IntegerType)) :: Nil), + StructType(StructField("a", MapType(CharType(5), IntegerType)) :: Nil), + expected = false + ) + checkEqualsIgnoreCompatibleCollation( + StructType(StructField("a", MapType(StringType("UTF8_LCASE"), IntegerType)) :: Nil), + StructType(StructField("a", MapType(CharType(5), IntegerType)) :: Nil), + expected = false + ) + checkEqualsIgnoreCompatibleCollation( + StructType(StructField("a", MapType(StringType, IntegerType)) :: Nil), + StructType(StructField("a", MapType(VarcharType(5), IntegerType)) :: Nil), + expected = false + ) + checkEqualsIgnoreCompatibleCollation( + StructType(StructField("a", MapType(StringType("UTF8_LCASE"), IntegerType)) :: Nil), + StructType(StructField("a", MapType(VarcharType(5), IntegerType)) :: Nil), + expected = false + ) checkEqualsIgnoreCompatibleCollation( StructType(StructField("a", StringType) :: Nil), StructType(StructField("b", StringType("UTF8_LCASE")) :: Nil), expected = false ) + checkEqualsIgnoreCompatibleCollation( + StructType(StructField("a", StringType) :: Nil), + StructType(StructField("b", CharType(5)) :: Nil), + expected = false + ) + checkEqualsIgnoreCompatibleCollation( + StructType(StructField("a", StringType("UTF8_LCASE")) :: Nil), + StructType(StructField("b", CharType(5)) :: Nil), + expected = false + ) + checkEqualsIgnoreCompatibleCollation( + StructType(StructField("a", StringType) :: Nil), + StructType(StructField("b", VarcharType(5)) :: Nil), + expected = false + ) + checkEqualsIgnoreCompatibleCollation( + StructType(StructField("a", StringType("UTF8_LCASE")) :: Nil), + 
StructType(StructField("b", VarcharType(5)) :: Nil), + expected = false + ) // Null compatibility checks. checkEqualsIgnoreCompatibleCollation( ArrayType(StringType, containsNull = true), @@ -878,90 +1193,6 @@ class DataTypeSuite extends SparkFunSuite with SQLHelper { } } - test("string field with invalid collation name") { - val collationProviders = Seq("spark", "icu") - collationProviders.foreach { provider => - val json = - s""" - |{ - | "type": "struct", - | "fields": [ - | { - | "name": "c1", - | "type": "string", - | "nullable": false, - | "metadata": { - | "${DataType.COLLATIONS_METADATA_KEY}": { - | "c1": "$provider.INVALID" - | } - | } - | } - | ] - |} - |""".stripMargin - - // Check that the exception will be thrown in case of invalid collation name and - // UNKNOWN_COLLATION_NAME config not enabled. - checkError( - exception = intercept[SparkException] { - DataType.fromJson(json) - }, - condition = "COLLATION_INVALID_NAME", - parameters = Map( - "proposals" -> "id", - "collationName" -> "INVALID")) - - // Check that the exception will not be thrown in case of invalid collation name and - // UNKNOWN_COLLATION_NAME enabled, but UTF8_BINARY collation will be returned. - withSQLConf(SQLConf.ALLOW_READING_UNKNOWN_COLLATIONS.key -> "true") { - val dataType = DataType.fromJson(json) - assert(dataType === StructType( - StructField("c1", StringType(CollationFactory.UTF8_BINARY_COLLATION_ID), false) :: Nil)) - } - } - } - - test("string field with invalid collation provider") { - val json = - s""" - |{ - | "type": "struct", - | "fields": [ - | { - | "name": "c1", - | "type": "string", - | "nullable": false, - | "metadata": { - | "${DataType.COLLATIONS_METADATA_KEY}": { - | "c1": "INVALID.INVALID" - | } - | } - | } - | ] - |} - |""".stripMargin - - - // Check that the exception will be thrown in case of invalid collation name and - // UNKNOWN_COLLATION_NAME config not enabled. 
- checkError( - exception = intercept[SparkException] { - DataType.fromJson(json) - }, - condition = "COLLATION_INVALID_PROVIDER", - parameters = Map( - "supportedProviders" -> "spark, icu", - "provider" -> "INVALID")) - - // Check that the exception will not be thrown in case of invalid collation name and - // UNKNOWN_COLLATION_NAME enabled, but UTF8_BINARY collation will be returned. - withSQLConf(SQLConf.ALLOW_READING_UNKNOWN_COLLATIONS.key -> "true") { - val dataType = DataType.fromJson(json) - assert(dataType === StructType( - StructField("c1", StringType(CollationFactory.UTF8_BINARY_COLLATION_ID), false) :: Nil)) - } - } - test("non string field has collation metadata") { val json = s""" @@ -1109,42 +1340,6 @@ class DataTypeSuite extends SparkFunSuite with SQLHelper { assert(parsedWithCollations === ArrayType(StringType(unicodeCollationId))) } - test("parse array type with invalid collation metadata") { - val utf8BinaryCollationId = CollationFactory.UTF8_BINARY_COLLATION_ID - val arrayJson = - s""" - |{ - | "type": "array", - | "elementType": "string", - | "containsNull": true - |} - |""".stripMargin - - val collationsMap = Map("element" -> "INVALID") - - // Parse without collations map - assert(DataType.parseDataType(JsonMethods.parse(arrayJson)) === ArrayType(StringType)) - - // Check that the exception will be thrown in case of invalid collation name and - // UNKNOWN_COLLATION_NAME config not enabled. - checkError( - exception = intercept[SparkException] { - DataType.parseDataType(JsonMethods.parse(arrayJson), collationsMap = collationsMap) - }, - condition = "COLLATION_INVALID_NAME", - parameters = Map( - "proposals" -> "id", - "collationName" -> "INVALID")) - - // Check that the exception will not be thrown in case of invalid collation name and - // UNKNOWN_COLLATION_NAME enabled, but UTF8_BINARY collation will be returned. 
- withSQLConf(SQLConf.ALLOW_READING_UNKNOWN_COLLATIONS.key -> "true") { - val dataType = DataType.parseDataType( - JsonMethods.parse(arrayJson), collationsMap = collationsMap) - assert(dataType === ArrayType(StringType(utf8BinaryCollationId))) - } - } - test("parse map type with collation metadata") { val unicodeCollationId = CollationFactory.collationNameToId("UNICODE") val mapJson = @@ -1168,44 +1363,6 @@ class DataTypeSuite extends SparkFunSuite with SQLHelper { MapType(StringType(unicodeCollationId), StringType(unicodeCollationId))) } - test("parse map type with invalid collation metadata") { - val utf8BinaryCollationId = CollationFactory.UTF8_BINARY_COLLATION_ID - val mapJson = - s""" - |{ - | "type": "map", - | "keyType": "string", - | "valueType": "string", - | "valueContainsNull": true - |} - |""".stripMargin - - val collationsMap = Map("key" -> "INVALID", "value" -> "INVALID") - - // Parse without collations map - assert(DataType.parseDataType(JsonMethods.parse(mapJson)) === MapType(StringType, StringType)) - - // Check that the exception will be thrown in case of invalid collation name and - // UNKNOWN_COLLATION_NAME config not enabled. - checkError( - exception = intercept[SparkException] { - DataType.parseDataType(JsonMethods.parse(mapJson), collationsMap = collationsMap) - }, - condition = "COLLATION_INVALID_NAME", - parameters = Map( - "proposals" -> "id", - "collationName" -> "INVALID")) - - // Check that the exception will not be thrown in case of invalid collation name and - // UNKNOWN_COLLATION_NAME enabled, but UTF8_BINARY collation will be returned. 
- withSQLConf(SQLConf.ALLOW_READING_UNKNOWN_COLLATIONS.key -> "true") { - val dataType = DataType.parseDataType( - JsonMethods.parse(mapJson), collationsMap = collationsMap) - assert(dataType === MapType( - StringType(utf8BinaryCollationId), StringType(utf8BinaryCollationId))) - } - } - test("SPARK-48680: Add CharType and VarcharType to DataTypes JAVA API") { assert(DataTypes.createCharType(1) === CharType(1)) assert(DataTypes.createVarcharType(100) === VarcharType(100)) diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/types/DataTypeWriteCompatibilitySuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/types/DataTypeWriteCompatibilitySuite.scala index f07ee8b35bbb2..ba3eaf46a5597 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/types/DataTypeWriteCompatibilitySuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/types/DataTypeWriteCompatibilitySuite.scala @@ -685,6 +685,11 @@ abstract class DataTypeWriteCompatibilityBaseSuite extends SparkFunSuite { ) } + test("Check string types: cast allowed regardless of collation") { + assertAllowed(StringType, StringType("UTF8_LCASE"), + "date time types", "Should allow writing string to collated string") + } + // Helper functions def assertAllowed( diff --git a/sql/connect/common/src/main/protobuf/spark/connect/base.proto b/sql/connect/common/src/main/protobuf/spark/connect/base.proto index e3c84ddd5e887..1d34011b20e78 100644 --- a/sql/connect/common/src/main/protobuf/spark/connect/base.proto +++ b/sql/connect/common/src/main/protobuf/spark/connect/base.proto @@ -25,6 +25,7 @@ import "spark/connect/common.proto"; import "spark/connect/expressions.proto"; import "spark/connect/relations.proto"; import "spark/connect/types.proto"; +import "spark/connect/ml.proto"; option java_multiple_files = true; option java_package = "org.apache.spark.connect.proto"; @@ -94,6 +95,7 @@ message AnalyzePlanRequest { Persist persist = 14; Unpersist unpersist = 15; GetStorageLevel get_storage_level 
= 16; + JsonToDDL json_to_ddl = 18; } message Schema { @@ -199,6 +201,11 @@ message AnalyzePlanRequest { // (Required) The logical plan to get the storage level. Relation relation = 1; } + + message JsonToDDL { + // (Required) The JSON formatted string to be converted to DDL. + string json_string = 1; + } } // Response to performing analysis of the query. Contains relevant metadata to be able to @@ -224,6 +231,7 @@ message AnalyzePlanResponse { Persist persist = 12; Unpersist unpersist = 13; GetStorageLevel get_storage_level = 14; + JsonToDDL json_to_ddl = 16; } message Schema { @@ -275,6 +283,10 @@ message AnalyzePlanResponse { // (Required) The StorageLevel as a result of get_storage_level request. StorageLevel storage_level = 1; } + + message JsonToDDL { + string ddl_string = 1; + } } // A request to be executed by the service. @@ -384,6 +396,9 @@ message ExecutePlanResponse { // Response for command that checkpoints a DataFrame. CheckpointCommandResult checkpoint_command_result = 19; + // ML command response + MlCommandResult ml_command_result = 20; + // Support arbitrary result objects. google.protobuf.Any extension = 999; } @@ -514,6 +529,9 @@ message ConfigRequest { message Set { // (Required) The config key-value pairs to set. repeated KeyValue pairs = 1; + + // (Optional) Whether to ignore failures. + optional bool silent = 2; } message Get { @@ -913,6 +931,20 @@ message ReleaseSessionRequest { // can be used for language or version specific information and is only intended for // logging purposes and will not be interpreted by the server. optional string client_type = 3; + + // Signals the server to allow the client to reconnect to the session after it is released. + // + // By default, the server tombstones the session upon release, preventing reconnections and + // fully cleaning the session state. 
+ // + // If this flag is set to true, the server may permit the client to reconnect to the session + // post-release, even if the session state has been cleaned. This can result in missing state, + // such as Temporary Views, Temporary UDFs, or the Current Catalog, in the reconnected session. + // + // Use this option sparingly and only when the client fully understands the implications of + // reconnecting to a released session. The client must ensure that any queries executed do not + // rely on the session state prior to its release. + bool allow_reconnect = 4; } // Next ID: 3 diff --git a/sql/connect/common/src/main/protobuf/spark/connect/commands.proto b/sql/connect/common/src/main/protobuf/spark/connect/commands.proto index a01d4369a7aed..10033b6400b53 100644 --- a/sql/connect/common/src/main/protobuf/spark/connect/commands.proto +++ b/sql/connect/common/src/main/protobuf/spark/connect/commands.proto @@ -21,6 +21,7 @@ import "google/protobuf/any.proto"; import "spark/connect/common.proto"; import "spark/connect/expressions.proto"; import "spark/connect/relations.proto"; +import "spark/connect/ml.proto"; package spark.connect; @@ -48,7 +49,7 @@ message Command { CheckpointCommand checkpoint_command = 14; RemoveCachedRemoteRelationCommand remove_cached_remote_relation_command = 15; MergeIntoTableCommand merge_into_table_command = 16; - + MlCommand ml_command = 17; // This field is used to mark extensions to the protocol. When plugins generate arbitrary // Commands they can add them here. During the planning the correct resolution is done. 
google.protobuf.Any extension = 999; diff --git a/sql/connect/common/src/main/protobuf/spark/connect/expressions.proto b/sql/connect/common/src/main/protobuf/spark/connect/expressions.proto index 3a91371fd3b25..bbe605a47f4ff 100644 --- a/sql/connect/common/src/main/protobuf/spark/connect/expressions.proto +++ b/sql/connect/common/src/main/protobuf/spark/connect/expressions.proto @@ -52,6 +52,8 @@ message Expression { NamedArgumentExpression named_argument_expression = 17; MergeAction merge_action = 19; TypedAggregateExpression typed_aggregate_expression = 20; + LazyExpression lazy_expression = 21; + SubqueryExpression subquery_expression = 22; // This field is used to mark extensions to the protocol. When plugins generate arbitrary // relations they can add them here. During the planning the correct resolution is done. @@ -259,6 +261,11 @@ message Expression { // When it is not a user defined function, Connect will use the function name directly. // When it is a user defined function, Connect will parse the function name first. bool is_user_defined_function = 4; + + // (Optional) Indicate if this function is defined in the internal function registry. + // If not set, the server will try to look up the function in the internal function registry + // and decide appropriately. + optional bool is_internal = 5; } // Expression as string. @@ -451,3 +458,22 @@ message MergeAction { Expression value = 2; } } + +message LazyExpression { + // (Required) The expression to be marked as lazy. + Expression child = 1; +} + +message SubqueryExpression { + // (Required) The id of corresponding connect plan. + int64 plan_id = 1; + + // (Required) The type of the subquery. 
+ SubqueryType subquery_type = 2; + + enum SubqueryType { + SUBQUERY_TYPE_UNKNOWN = 0; + SUBQUERY_TYPE_SCALAR = 1; + SUBQUERY_TYPE_EXISTS = 2; + } +} diff --git a/sql/connect/common/src/main/protobuf/spark/connect/ml.proto b/sql/connect/common/src/main/protobuf/spark/connect/ml.proto new file mode 100644 index 0000000000000..48b04a6e14cd0 --- /dev/null +++ b/sql/connect/common/src/main/protobuf/spark/connect/ml.proto @@ -0,0 +1,106 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +syntax = 'proto3'; + +package spark.connect; + +import "spark/connect/relations.proto"; +import "spark/connect/ml_common.proto"; + +option java_multiple_files = true; +option java_package = "org.apache.spark.connect.proto"; +option go_package = "internal/generated"; + +// Command for ML +message MlCommand { + oneof command { + Fit fit = 1; + Fetch fetch = 2; + Delete delete = 3; + Write write = 4; + Read read = 5; + } + + // Command for estimator.fit(dataset) + message Fit { + // Estimator information + MlOperator estimator = 1; + // parameters of the Estimator + MlParams params = 2; + // the training dataset + Relation dataset = 3; + } + + // Command to delete the cached object which could be a model + // or summary evaluated by a model + message Delete { + ObjectRef obj_ref = 1; + } + + // Command to write ML operator + message Write { + // It could be an estimator/evaluator or the cached model + oneof type { + // Estimator or evaluator + MlOperator operator = 1; + // The cached model + ObjectRef obj_ref = 2; + } + // The parameters of operator which could be estimator/evaluator or a cached model + MlParams params = 3; + // Save the ML instance to the path + string path = 4; + // Overwrites if the output path already exists. + bool should_overwrite = 5; + // The options of the writer + map options = 6; + } + + // Command to load ML operator. 
+ message Read { + // ML operator information + MlOperator operator = 1; + // Load the ML instance from the input path + string path = 2; + } +} + +// The result of MlCommand +message MlCommandResult { + oneof result_type { + // The result of the attribute + Param param = 1; + // Evaluate a Dataset in a model and return the cached ID of summary + string summary = 2; + // Operator information + MlOperatorInfo operator_info = 3; + } + + // Represents an operator info + message MlOperatorInfo { + oneof type { + // The cached object which could be a model or summary evaluated by a model + ObjectRef obj_ref = 1; + // Operator name + string name = 2; + } + string uid = 3; + MlParams params = 4; + } + +} diff --git a/sql/connect/common/src/main/protobuf/spark/connect/ml_common.proto b/sql/connect/common/src/main/protobuf/spark/connect/ml_common.proto new file mode 100644 index 0000000000000..f91c2489ed947 --- /dev/null +++ b/sql/connect/common/src/main/protobuf/spark/connect/ml_common.proto @@ -0,0 +1,108 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +syntax = 'proto3'; + +package spark.connect; + +import "spark/connect/expressions.proto"; + +option java_multiple_files = true; +option java_package = "org.apache.spark.connect.proto"; +option go_package = "internal/generated"; + +// MlParams stores param settings for ML Estimator / Transformer / Evaluator +message MlParams { + // User-supplied params + map params = 1; +} + +// Represents the parameter type of the ML instance, or the returned value +// of the attribute +message Param { + oneof param_type { + Expression.Literal literal = 1; + Vector vector = 2; + Matrix matrix = 3; + } +} + +// MLOperator represents the ML operators like (Estimator, Transformer or Evaluator) +message MlOperator { + // The qualified name of the ML operator. + string name = 1; + // Unique id of the ML operator + string uid = 2; + // Represents what the ML operator is + OperatorType type = 3; + enum OperatorType { + UNSPECIFIED = 0; + ESTIMATOR = 1; + TRANSFORMER = 2; + EVALUATOR = 3; + MODEL = 4; + } +} + +// Represents a reference to the cached object which could be a model +// or summary evaluated by a model +message ObjectRef { + // The ID is used to lookup the object on the server side. 
+ string id = 1; +} + +// See pyspark.ml.linalg.Vector +message Vector { + oneof vector_type { + Dense dense = 1; + Sparse sparse = 2; + } + // See pyspark.ml.linalg.DenseVector + message Dense { + repeated double value = 1; + } + // See pyspark.ml.linalg.SparseVector + message Sparse { + int32 size = 1; + repeated int32 index = 2; + repeated double value = 3; + } +} + +// See pyspark.ml.linalg.Matrix +message Matrix { + oneof matrix_type { + Dense dense = 1; + Sparse sparse = 2; + } + // See pyspark.ml.linalg.DenseMatrix + message Dense { + int32 num_rows = 1; + int32 num_cols = 2; + repeated double value = 3; + bool is_transposed = 4; + } + // See pyspark.ml.linalg.SparseMatrix + message Sparse { + int32 num_rows = 1; + int32 num_cols = 2; + repeated int32 colptr = 3; + repeated int32 row_index = 4; + repeated double value = 5; + bool is_transposed = 6; + } +} diff --git a/sql/connect/common/src/main/protobuf/spark/connect/relations.proto b/sql/connect/common/src/main/protobuf/spark/connect/relations.proto index a7b9137c3400a..c2cbed0dd22ba 100644 --- a/sql/connect/common/src/main/protobuf/spark/connect/relations.proto +++ b/sql/connect/common/src/main/protobuf/spark/connect/relations.proto @@ -24,6 +24,7 @@ import "spark/connect/expressions.proto"; import "spark/connect/types.proto"; import "spark/connect/catalog.proto"; import "spark/connect/common.proto"; +import "spark/connect/ml_common.proto"; option java_multiple_files = true; option java_package = "org.apache.spark.connect.proto"; @@ -78,6 +79,7 @@ message Relation { WithRelations with_relations = 41; Transpose transpose = 42; UnresolvedTableValuedFunction unresolved_table_valued_function = 43; + LateralJoin lateral_join = 44; // NA functions NAFill fill_na = 90; @@ -97,6 +99,9 @@ message Relation { // Catalog API (experimental / unstable) Catalog catalog = 200; + // ML relation + MlRelation ml_relation = 300; + // This field is used to mark extensions to the protocol. 
When plugins generate arbitrary // relations they can add them here. During the planning the correct resolution is done. google.protobuf.Any extension = 998; @@ -104,6 +109,55 @@ message Relation { } } +// Relation to represent ML world +message MlRelation { + oneof ml_type { + Transform transform = 1; + Fetch fetch = 2; + } + // Relation to represent transform(input) of the operator + // which could be a cached model or a new transformer + message Transform { + oneof operator { + // Object reference + ObjectRef obj_ref = 1; + // Could be an ML transformer like VectorAssembler + MlOperator transformer = 2; + } + // the input dataframe + Relation input = 3; + // the operator specific parameters + MlParams params = 4; + } +} + +// Message for fetching attribute from object on the server side. +// Fetch can be represented as a Relation or a ML command +// Command: model.coefficients, model.summary.weightedPrecision which +// returns the final literal result +// Relation: model.summary.roc which returns a DataFrame (Relation) +message Fetch { + // (Required) reference to the object on the server side + ObjectRef obj_ref = 1; + // (Required) the calling method chains + repeated Method methods = 2; + + // Represents a method with inclusion of method name and its arguments + message Method { + // (Required) the method name + string method = 1; + // (Optional) the arguments of the method + repeated Args args = 2; + + message Args { + oneof args_type { + Param param = 1; + Relation input = 2; + } + } + } +} + // Used for testing purposes only. message Unknown {} @@ -974,6 +1028,9 @@ message GroupMap { // (Optional) Timeout configuration for groups that do not receive data for a while. optional string timeout_conf = 9; + + // (Optional) The schema for the grouped state. + optional DataType state_schema = 10; } message CoGroupMap { @@ -1140,3 +1197,20 @@ message AsOfJoin { // (Required) Whether to search for prior, subsequent, or closest matches. 
string direction = 10; } + +// Relation of type [[LateralJoin]]. +// +// `left` and `right` must be present. +message LateralJoin { + // (Required) Left input relation for a Join. + Relation left = 1; + + // (Required) Right input relation for a Join. + Relation right = 2; + + // (Optional) The join condition. + Expression join_condition = 3; + + // (Required) The join type. + Join.JoinType join_type = 4; +} diff --git a/sql/connect/common/src/main/scala/org/apache/spark/sql/connect/client/CloseableIterator.scala b/sql/connect/common/src/main/scala/org/apache/spark/sql/connect/client/CloseableIterator.scala index 4ec6828d885ab..9de585503a500 100644 --- a/sql/connect/common/src/main/scala/org/apache/spark/sql/connect/client/CloseableIterator.scala +++ b/sql/connect/common/src/main/scala/org/apache/spark/sql/connect/client/CloseableIterator.scala @@ -25,6 +25,16 @@ private[sql] trait CloseableIterator[E] extends Iterator[E] with AutoCloseable { override def close() = self.close() } + + override def map[B](f: E => B): CloseableIterator[B] = { + new CloseableIterator[B] { + override def next(): B = f(self.next()) + + override def hasNext: Boolean = self.hasNext + + override def close(): Unit = self.close() + } + } } private[sql] abstract class WrappedCloseableIterator[E] extends CloseableIterator[E] { diff --git a/sql/connect/common/src/main/scala/org/apache/spark/sql/connect/client/SparkResult.scala b/sql/connect/common/src/main/scala/org/apache/spark/sql/connect/client/SparkResult.scala index 3aad90e96f8cd..959779b357c2d 100644 --- a/sql/connect/common/src/main/scala/org/apache/spark/sql/connect/client/SparkResult.scala +++ b/sql/connect/common/src/main/scala/org/apache/spark/sql/connect/client/SparkResult.scala @@ -41,8 +41,7 @@ private[sql] class SparkResult[T]( responses: CloseableIterator[proto.ExecutePlanResponse], allocator: BufferAllocator, encoder: AgnosticEncoder[T], - timeZoneId: String, - setObservationMetricsOpt: Option[(Long, Row) => Unit] = None) + 
timeZoneId: String) extends AutoCloseable { self => case class StageInfo( @@ -122,7 +121,8 @@ private[sql] class SparkResult[T]( while (!stop && responses.hasNext) { val response = responses.next() - // Collect metrics for this response + // Collect **all** metrics for this response, whether or not registered to an Observation + // object. observedMetrics ++= processObservedMetrics(response.getObservedMetricsList) // Save and validate operationId @@ -209,23 +209,7 @@ private[sql] class SparkResult[T]( private def processObservedMetrics( metrics: java.util.List[ObservedMetrics]): Iterable[(String, Row)] = { metrics.asScala.map { metric => - assert(metric.getKeysCount == metric.getValuesCount) - var schema = new StructType() - val values = mutable.ArrayBuilder.make[Any] - values.sizeHint(metric.getKeysCount) - (0 until metric.getKeysCount).foreach { i => - val key = metric.getKeys(i) - val value = LiteralValueProtoConverter.toCatalystValue(metric.getValues(i)) - schema = schema.add(key, LiteralValueProtoConverter.toDataType(value.getClass)) - values += value - } - val row = new GenericRowWithSchema(values.result(), schema) - // If the metrics is registered by an Observation object, attach them and unblock any - // blocked thread. - setObservationMetricsOpt.foreach { setObservationMetrics => - setObservationMetrics(metric.getPlanId, row) - } - metric.getName -> row + metric.getName -> SparkResult.transformObservedMetrics(metric) } } @@ -387,8 +371,23 @@ private[sql] class SparkResult[T]( } } -private object SparkResult { +private[sql] object SparkResult { private val cleaner: Cleaner = Cleaner.create() + + /** Return value is a Seq of pairs, to preserve the order of values. 
*/ + private[sql] def transformObservedMetrics(metric: ObservedMetrics): Row = { + assert(metric.getKeysCount == metric.getValuesCount) + var schema = new StructType() + val values = mutable.ArrayBuilder.make[Any] + values.sizeHint(metric.getKeysCount) + (0 until metric.getKeysCount).foreach { i => + val key = metric.getKeys(i) + val value = LiteralValueProtoConverter.toCatalystValue(metric.getValues(i)) + schema = schema.add(key, LiteralValueProtoConverter.toDataType(value.getClass)) + values += value + } + new GenericRowWithSchema(values.result(), schema) + } } private[client] class SparkResultCloseable( diff --git a/sql/connect/common/src/main/scala/org/apache/spark/sql/connect/client/arrow/ArrowDeserializer.scala b/sql/connect/common/src/main/scala/org/apache/spark/sql/connect/client/arrow/ArrowDeserializer.scala index f3abaddb0110b..4618c7e24d4ac 100644 --- a/sql/connect/common/src/main/scala/org/apache/spark/sql/connect/client/arrow/ArrowDeserializer.scala +++ b/sql/connect/common/src/main/scala/org/apache/spark/sql/connect/client/arrow/ArrowDeserializer.scala @@ -40,6 +40,7 @@ import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema import org.apache.spark.sql.connect.client.CloseableIterator import org.apache.spark.sql.errors.{CompilationErrors, ExecutionErrors} import org.apache.spark.sql.types.Decimal +import org.apache.spark.unsafe.types.VariantVal /** * Helper class for converting arrow batches into user objects. 
@@ -336,6 +337,34 @@ object ArrowDeserializers { } } + case (VariantEncoder, StructVectors(struct, vectors)) => + assert(vectors.exists(_.getName == "value")) + assert( + vectors.exists(field => + field.getName == "metadata" && field.getField.getMetadata + .containsKey("variant") && field.getField.getMetadata.get("variant") == "true")) + val valueDecoder = + deserializerFor( + BinaryEncoder, + vectors + .find(_.getName == "value") + .getOrElse(throw CompilationErrors.columnNotFoundError("value")), + timeZoneId) + val metadataDecoder = + deserializerFor( + BinaryEncoder, + vectors + .find(_.getName == "metadata") + .getOrElse(throw CompilationErrors.columnNotFoundError("metadata")), + timeZoneId) + new StructFieldSerializer[VariantVal](struct) { + def value(i: Int): VariantVal = { + new VariantVal( + valueDecoder.get(i).asInstanceOf[Array[Byte]], + metadataDecoder.get(i).asInstanceOf[Array[Byte]]) + } + } + case (JavaBeanEncoder(tag, fields), StructVectors(struct, vectors)) => val constructor = methodLookup.findConstructor(tag.runtimeClass, MethodType.methodType(classOf[Unit])) @@ -366,7 +395,7 @@ object ArrowDeserializers { override def get(i: Int): Any = codec.decode(deserializer.get(i)) } - case (CalendarIntervalEncoder | VariantEncoder | _: UDTEncoder[_], _) => + case (CalendarIntervalEncoder | _: UDTEncoder[_], _) => throw ExecutionErrors.unsupportedDataTypeError(encoder.dataType) case _ => diff --git a/sql/connect/common/src/main/scala/org/apache/spark/sql/connect/client/arrow/ArrowSerializer.scala b/sql/connect/common/src/main/scala/org/apache/spark/sql/connect/client/arrow/ArrowSerializer.scala index f8a5c63ac3abe..c01390bf07857 100644 --- a/sql/connect/common/src/main/scala/org/apache/spark/sql/connect/client/arrow/ArrowSerializer.scala +++ b/sql/connect/common/src/main/scala/org/apache/spark/sql/connect/client/arrow/ArrowSerializer.scala @@ -42,6 +42,7 @@ import org.apache.spark.sql.connect.client.CloseableIterator import 
org.apache.spark.sql.errors.ExecutionErrors import org.apache.spark.sql.types.Decimal import org.apache.spark.sql.util.ArrowUtils +import org.apache.spark.unsafe.types.VariantVal /** * Helper class for converting user objects into arrow batches. @@ -433,6 +434,22 @@ object ArrowSerializer { case (RowEncoder(fields), StructVectors(struct, vectors)) => structSerializerFor(fields, struct, vectors) { (_, i) => r => r.asInstanceOf[Row].get(i) } + case (VariantEncoder, StructVectors(struct, vectors)) => + assert(vectors.exists(_.getName == "value")) + assert( + vectors.exists(field => + field.getName == "metadata" && field.getField.getMetadata + .containsKey("variant") && field.getField.getMetadata.get("variant") == "true")) + new StructSerializer( + struct, + Seq( + new StructFieldSerializer( + extractor = (v: Any) => v.asInstanceOf[VariantVal].getValue, + serializerFor(BinaryEncoder, struct.getChild("value"))), + new StructFieldSerializer( + extractor = (v: Any) => v.asInstanceOf[VariantVal].getMetadata, + serializerFor(BinaryEncoder, struct.getChild("metadata"))))) + case (JavaBeanEncoder(tag, fields), StructVectors(struct, vectors)) => structSerializerFor(fields, struct, vectors) { (field, _) => val getter = methodLookup.findVirtual( @@ -450,7 +467,7 @@ object ArrowSerializer { delegate.write(index, codec.encode(value)) } - case (CalendarIntervalEncoder | VariantEncoder | _: UDTEncoder[_], _) => + case (CalendarIntervalEncoder | _: UDTEncoder[_], _) => throw ExecutionErrors.unsupportedDataTypeError(encoder.dataType) case _ => diff --git a/sql/connect/common/src/main/scala/org/apache/spark/sql/connect/common/DataTypeProtoConverter.scala b/sql/connect/common/src/main/scala/org/apache/spark/sql/connect/common/DataTypeProtoConverter.scala index f63692717947a..3577ca228b03e 100644 --- a/sql/connect/common/src/main/scala/org/apache/spark/sql/connect/common/DataTypeProtoConverter.scala +++ 
b/sql/connect/common/src/main/scala/org/apache/spark/sql/connect/common/DataTypeProtoConverter.scala @@ -175,16 +175,6 @@ object DataTypeProtoConverter { proto.DataType.Decimal.newBuilder().setPrecision(precision).setScale(scale).build()) .build() - case s: StringType => - proto.DataType - .newBuilder() - .setString( - proto.DataType.String - .newBuilder() - .setCollation(CollationFactory.fetchCollation(s.collationId).collationName) - .build()) - .build() - case CharType(length) => proto.DataType .newBuilder() @@ -197,6 +187,17 @@ object DataTypeProtoConverter { .setVarChar(proto.DataType.VarChar.newBuilder().setLength(length).build()) .build() + // StringType must be matched after CharType and VarcharType + case s: StringType => + proto.DataType + .newBuilder() + .setString( + proto.DataType.String + .newBuilder() + .setCollation(CollationFactory.fetchCollation(s.collationId).collationName) + .build()) + .build() + case DateType => ProtoDataTypes.DateType case TimestampType => ProtoDataTypes.TimestampType diff --git a/sql/connect/common/src/main/scala/org/apache/spark/sql/connect/common/InvalidCommandInput.scala b/sql/connect/common/src/main/scala/org/apache/spark/sql/connect/common/InvalidCommandInput.scala new file mode 100644 index 0000000000000..313fe7262a10b --- /dev/null +++ b/sql/connect/common/src/main/scala/org/apache/spark/sql/connect/common/InvalidCommandInput.scala @@ -0,0 +1,25 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.sql.connect.common + +/** + * Error thrown when a connect command is not valid. + */ +final case class InvalidCommandInput( + private val message: String = "", + private val cause: Throwable = null) + extends Exception(message, cause) diff --git a/sql/connect/common/src/test/resources/query-tests/explain-results/function_collation.explain b/sql/connect/common/src/test/resources/query-tests/explain-results/function_collation.explain index a6a251505652a..a6bf9ae2e71c9 100644 --- a/sql/connect/common/src/test/resources/query-tests/explain-results/function_collation.explain +++ b/sql/connect/common/src/test/resources/query-tests/explain-results/function_collation.explain @@ -1,2 +1,2 @@ -Project [UTF8_BINARY AS collation(g)#0] +Project [SYSTEM.BUILTIN.UTF8_BINARY AS collation(g)#0] +- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/sql/connect/common/src/test/resources/query-tests/explain-results/function_from_csv.explain b/sql/connect/common/src/test/resources/query-tests/explain-results/function_from_csv.explain index ef87c18948b23..89e03c8188232 100644 --- a/sql/connect/common/src/test/resources/query-tests/explain-results/function_from_csv.explain +++ b/sql/connect/common/src/test/resources/query-tests/explain-results/function_from_csv.explain @@ -1,2 +1,2 @@ -Project [invoke(CsvToStructsEvaluator(Map(mode -> FAILFAST),StructType(StructField(id,LongType,true),StructField(a,IntegerType,true),StructField(b,DoubleType,true)),_corrupt_record,Some(America/Los_Angeles),None).evaluate(g#0)) AS from_csv(g)#0] 
+Project [from_csv(StructField(id,LongType,true), StructField(a,IntegerType,true), StructField(b,DoubleType,true), (mode,FAILFAST), g#0, Some(America/Los_Angeles), None) AS from_csv(g)#0] +- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/sql/connect/common/src/test/resources/query-tests/explain-results/function_from_json.explain b/sql/connect/common/src/test/resources/query-tests/explain-results/function_from_json.explain index 9bc33b3b97d2c..8d1d122d156ff 100644 --- a/sql/connect/common/src/test/resources/query-tests/explain-results/function_from_json.explain +++ b/sql/connect/common/src/test/resources/query-tests/explain-results/function_from_json.explain @@ -1,2 +1,2 @@ -Project [invoke(JsonToStructsEvaluator(Map(),StructType(StructField(id,LongType,true),StructField(a,IntegerType,true),StructField(b,DoubleType,true)),_corrupt_record,Some(America/Los_Angeles),false).evaluate(g#0)) AS from_json(g)#0] +Project [from_json(StructField(id,LongType,true), StructField(a,IntegerType,true), StructField(b,DoubleType,true), g#0, Some(America/Los_Angeles), false) AS from_json(g)#0] +- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/sql/connect/common/src/test/resources/query-tests/explain-results/function_from_json_orphaned.explain b/sql/connect/common/src/test/resources/query-tests/explain-results/function_from_json_orphaned.explain index 9bc33b3b97d2c..8d1d122d156ff 100644 --- a/sql/connect/common/src/test/resources/query-tests/explain-results/function_from_json_orphaned.explain +++ b/sql/connect/common/src/test/resources/query-tests/explain-results/function_from_json_orphaned.explain @@ -1,2 +1,2 @@ -Project [invoke(JsonToStructsEvaluator(Map(),StructType(StructField(id,LongType,true),StructField(a,IntegerType,true),StructField(b,DoubleType,true)),_corrupt_record,Some(America/Los_Angeles),false).evaluate(g#0)) AS from_json(g)#0] +Project [from_json(StructField(id,LongType,true), StructField(a,IntegerType,true), 
StructField(b,DoubleType,true), g#0, Some(America/Los_Angeles), false) AS from_json(g)#0] +- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/sql/connect/common/src/test/resources/query-tests/explain-results/function_from_json_with_json_schema.explain b/sql/connect/common/src/test/resources/query-tests/explain-results/function_from_json_with_json_schema.explain index 9bc33b3b97d2c..8d1d122d156ff 100644 --- a/sql/connect/common/src/test/resources/query-tests/explain-results/function_from_json_with_json_schema.explain +++ b/sql/connect/common/src/test/resources/query-tests/explain-results/function_from_json_with_json_schema.explain @@ -1,2 +1,2 @@ -Project [invoke(JsonToStructsEvaluator(Map(),StructType(StructField(id,LongType,true),StructField(a,IntegerType,true),StructField(b,DoubleType,true)),_corrupt_record,Some(America/Los_Angeles),false).evaluate(g#0)) AS from_json(g)#0] +Project [from_json(StructField(id,LongType,true), StructField(a,IntegerType,true), StructField(b,DoubleType,true), g#0, Some(America/Los_Angeles), false) AS from_json(g)#0] +- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/sql/connect/common/src/test/resources/query-tests/explain-results/function_lit.explain b/sql/connect/common/src/test/resources/query-tests/explain-results/function_lit.explain index 928dd0bf85cc7..4491b6166afae 100644 --- a/sql/connect/common/src/test/resources/query-tests/explain-results/function_lit.explain +++ b/sql/connect/common/src/test/resources/query-tests/explain-results/function_lit.explain @@ -1,2 +1,2 @@ -Project [id#0L, id#0L, true AS true#0, 68 AS 68#0, 9872 AS 9872#0, -8726532 AS -8726532#0, 7834609328726532 AS 7834609328726532#0L, 2.718281828459045 AS 2.718281828459045#0, -0.8 AS -0.8#0, 89.97620 AS 89.97620#0, 89889.7667231 AS 89889.7667231#0, connect! 
AS connect!#0, T AS T#0, ABCDEFGHIJ AS ABCDEFGHIJ#0, 0x78797A7B7C7D7E7F808182838485868788898A8B8C8D8E AS X'78797A7B7C7D7E7F808182838485868788898A8B8C8D8E'#0, 0x0806 AS X'0806'#0, [8,6] AS ARRAY(8, 6)#0, null AS NULL#0, 2020-10-10 AS DATE '2020-10-10'#0, 8.997620 AS 8.997620#0, 2023-02-23 04:31:59.808 AS TIMESTAMP '2023-02-23 04:31:59.808'#0, 1969-12-31 16:00:12.345 AS TIMESTAMP '1969-12-31 16:00:12.345'#0, 2023-02-23 20:36:00 AS TIMESTAMP_NTZ '2023-02-23 20:36:00'#0, 2023-02-23 AS DATE '2023-02-23'#0, ... 3 more fields] +Project [id#0L, id#0L, true AS true#0, 68 AS 68#0, 9872 AS 9872#0, -8726532 AS -8726532#0, 7834609328726532 AS 7834609328726532#0L, 2.718281828459045 AS 2.718281828459045#0, -0.8 AS -0.8#0, 89.97620 AS 89.97620#0, 89889.7667231 AS 89889.7667231#0, connect! AS connect!#0, T AS T#0, ABCDEFGHIJ AS ABCDEFGHIJ#0, 0x78797A7B7C7D7E7F808182838485868788898A8B8C8D8E AS X'78797A7B7C7D7E7F808182838485868788898A8B8C8D8E'#0, 0x0806 AS X'0806'#0, [8,6] AS ARRAY(8, 6)#0, null AS NULL#0, 2020-10-10 AS DATE '2020-10-10'#0, 8.997620 AS 8.997620#0, 2023-02-23 04:31:59.808 AS TIMESTAMP '2023-02-23 04:31:59.808'#0, 1969-12-31 16:00:12.345 AS TIMESTAMP '1969-12-31 16:00:12.345'#0, 2023-02-23 20:36:00 AS TIMESTAMP_NTZ '2023-02-23 20:36:00'#0, 2023-02-23 AS DATE '2023-02-23'#0, INTERVAL '0 00:03:20' DAY TO SECOND AS INTERVAL '0 00:03:20' DAY TO SECOND#0, ... 
2 more fields] +- LocalRelation , [id#0L, a#0, b#0] diff --git a/sql/connect/common/src/test/resources/query-tests/explain-results/function_schema_of_json.explain b/sql/connect/common/src/test/resources/query-tests/explain-results/function_schema_of_json.explain index b400aeeca5af2..d75545d8766d0 100644 --- a/sql/connect/common/src/test/resources/query-tests/explain-results/function_schema_of_json.explain +++ b/sql/connect/common/src/test/resources/query-tests/explain-results/function_schema_of_json.explain @@ -1,2 +1,2 @@ -Project [static_invoke(JsonExpressionEvalUtils.schemaOfJson(com.fasterxml.jackson.core.JsonFactory, org.apache.spark.sql.catalyst.json.JSONOptions, org.apache.spark.sql.catalyst.json.JsonInferSchema, [{"col":01}])) AS schema_of_json([{"col":01}])#0] +Project [invoke(SchemaOfJsonEvaluator(Map()).evaluate([{"col":01}])) AS schema_of_json([{"col":01}])#0] +- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/sql/connect/common/src/test/resources/query-tests/explain-results/function_schema_of_json_with_options.explain b/sql/connect/common/src/test/resources/query-tests/explain-results/function_schema_of_json_with_options.explain index b400aeeca5af2..37321af1deed4 100644 --- a/sql/connect/common/src/test/resources/query-tests/explain-results/function_schema_of_json_with_options.explain +++ b/sql/connect/common/src/test/resources/query-tests/explain-results/function_schema_of_json_with_options.explain @@ -1,2 +1,2 @@ -Project [static_invoke(JsonExpressionEvalUtils.schemaOfJson(com.fasterxml.jackson.core.JsonFactory, org.apache.spark.sql.catalyst.json.JSONOptions, org.apache.spark.sql.catalyst.json.JsonInferSchema, [{"col":01}])) AS schema_of_json([{"col":01}])#0] +Project [invoke(SchemaOfJsonEvaluator(Map(allowNumericLeadingZeros -> true)).evaluate([{"col":01}])) AS schema_of_json([{"col":01}])#0] +- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git 
a/sql/connect/common/src/test/resources/query-tests/explain-results/function_timestamp_add.explain b/sql/connect/common/src/test/resources/query-tests/explain-results/function_timestamp_add.explain index 36dde1393cdb2..4b46e8453a1c0 100644 --- a/sql/connect/common/src/test/resources/query-tests/explain-results/function_timestamp_add.explain +++ b/sql/connect/common/src/test/resources/query-tests/explain-results/function_timestamp_add.explain @@ -1,2 +1,2 @@ -Project [timestampadd(week, cast(x#0L as int), t#0, Some(America/Los_Angeles)) AS timestampadd(week, x, t)#0] +Project [timestampadd(week, x#0L, t#0, Some(America/Los_Angeles)) AS timestampadd(week, x, t)#0] +- LocalRelation , [d#0, t#0, s#0, x#0L, wt#0] diff --git a/sql/connect/common/src/test/resources/query-tests/explain-results/function_to_date_with_format.explain b/sql/connect/common/src/test/resources/query-tests/explain-results/function_to_date_with_format.explain index 3557274e9de8d..51270c147549e 100644 --- a/sql/connect/common/src/test/resources/query-tests/explain-results/function_to_date_with_format.explain +++ b/sql/connect/common/src/test/resources/query-tests/explain-results/function_to_date_with_format.explain @@ -1,2 +1,2 @@ -Project [cast(gettimestamp(s#0, yyyy-MM-dd, TimestampType, Some(America/Los_Angeles), false) as date) AS to_date(s, yyyy-MM-dd)#0] +Project [cast(gettimestamp(s#0, yyyy-MM-dd, TimestampType, try_to_date, Some(America/Los_Angeles), false) as date) AS to_date(s, yyyy-MM-dd)#0] +- LocalRelation , [d#0, t#0, s#0, x#0L, wt#0] diff --git a/sql/connect/common/src/test/resources/query-tests/explain-results/function_to_timestamp_ltz_with_format.explain b/sql/connect/common/src/test/resources/query-tests/explain-results/function_to_timestamp_ltz_with_format.explain index e212c8d51a62f..e66fdba89e0ff 100644 --- a/sql/connect/common/src/test/resources/query-tests/explain-results/function_to_timestamp_ltz_with_format.explain +++ 
b/sql/connect/common/src/test/resources/query-tests/explain-results/function_to_timestamp_ltz_with_format.explain @@ -1,2 +1,2 @@ -Project [gettimestamp(g#0, g#0, TimestampType, Some(America/Los_Angeles), false) AS to_timestamp_ltz(g, g)#0] +Project [gettimestamp(g#0, g#0, TimestampType, try_to_timestamp, Some(America/Los_Angeles), false) AS to_timestamp_ltz(g, g)#0] +- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/sql/connect/common/src/test/resources/query-tests/explain-results/function_to_timestamp_ntz_with_format.explain b/sql/connect/common/src/test/resources/query-tests/explain-results/function_to_timestamp_ntz_with_format.explain index 10ca240877fe1..f133becf78237 100644 --- a/sql/connect/common/src/test/resources/query-tests/explain-results/function_to_timestamp_ntz_with_format.explain +++ b/sql/connect/common/src/test/resources/query-tests/explain-results/function_to_timestamp_ntz_with_format.explain @@ -1,2 +1,2 @@ -Project [gettimestamp(g#0, g#0, TimestampNTZType, Some(America/Los_Angeles), false) AS to_timestamp_ntz(g, g)#0] +Project [gettimestamp(g#0, g#0, TimestampNTZType, try_to_timestamp, Some(America/Los_Angeles), false) AS to_timestamp_ntz(g, g)#0] +- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/sql/connect/common/src/test/resources/query-tests/explain-results/function_to_timestamp_with_format.explain b/sql/connect/common/src/test/resources/query-tests/explain-results/function_to_timestamp_with_format.explain index 54e1c0348a3a9..514b6705fa8e2 100644 --- a/sql/connect/common/src/test/resources/query-tests/explain-results/function_to_timestamp_with_format.explain +++ b/sql/connect/common/src/test/resources/query-tests/explain-results/function_to_timestamp_with_format.explain @@ -1,2 +1,2 @@ -Project [gettimestamp(s#0, yyyy-MM-dd HH:mm:ss.SSSS, TimestampType, Some(America/Los_Angeles), false) AS to_timestamp(s, yyyy-MM-dd HH:mm:ss.SSSS)#0] +Project [gettimestamp(s#0, yyyy-MM-dd HH:mm:ss.SSSS, TimestampType, 
try_to_timestamp, Some(America/Los_Angeles), false) AS to_timestamp(s, yyyy-MM-dd HH:mm:ss.SSSS)#0] +- LocalRelation , [d#0, t#0, s#0, x#0L, wt#0] diff --git a/sql/connect/common/src/test/resources/query-tests/explain-results/function_try_to_timestamp.explain b/sql/connect/common/src/test/resources/query-tests/explain-results/function_try_to_timestamp.explain index 8074beab7db81..c4dd956e83427 100644 --- a/sql/connect/common/src/test/resources/query-tests/explain-results/function_try_to_timestamp.explain +++ b/sql/connect/common/src/test/resources/query-tests/explain-results/function_try_to_timestamp.explain @@ -1,2 +1,2 @@ -Project [gettimestamp(g#0, g#0, TimestampType, Some(America/Los_Angeles), false) AS try_to_timestamp(g, g)#0] +Project [gettimestamp(g#0, g#0, TimestampType, try_to_timestamp, Some(America/Los_Angeles), false) AS try_to_timestamp(g, g)#0] +- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/sql/connect/common/src/test/resources/query-tests/explain-results/function_typedLit.explain b/sql/connect/common/src/test/resources/query-tests/explain-results/function_typedLit.explain index 606bb694bad47..6d854da250fcc 100644 --- a/sql/connect/common/src/test/resources/query-tests/explain-results/function_typedLit.explain +++ b/sql/connect/common/src/test/resources/query-tests/explain-results/function_typedLit.explain @@ -1,2 +1,2 @@ -Project [id#0L, id#0L, 1 AS 1#0, null AS NULL#0, true AS true#0, 68 AS 68#0, 9872 AS 9872#0, -8726532 AS -8726532#0, 7834609328726532 AS 7834609328726532#0L, 2.718281828459045 AS 2.718281828459045#0, -0.8 AS -0.8#0, 89.97620 AS 89.97620#0, 89889.7667231 AS 89889.7667231#0, connect! 
AS connect!#0, T AS T#0, ABCDEFGHIJ AS ABCDEFGHIJ#0, 0x78797A7B7C7D7E7F808182838485868788898A8B8C8D8E AS X'78797A7B7C7D7E7F808182838485868788898A8B8C8D8E'#0, 0x0806 AS X'0806'#0, [8,6] AS ARRAY(8, 6)#0, null AS NULL#0, 2020-10-10 AS DATE '2020-10-10'#0, 8.997620 AS 8.997620#0, 2023-02-23 04:31:59.808 AS TIMESTAMP '2023-02-23 04:31:59.808'#0, 1969-12-31 16:00:12.345 AS TIMESTAMP '1969-12-31 16:00:12.345'#0, ... 19 more fields] +Project [id#0L, id#0L, 1 AS 1#0, null AS NULL#0, true AS true#0, 68 AS 68#0, 9872 AS 9872#0, -8726532 AS -8726532#0, 7834609328726532 AS 7834609328726532#0L, 2.718281828459045 AS 2.718281828459045#0, -0.8 AS -0.8#0, 89.97620 AS 89.97620#0, 89889.7667231 AS 89889.7667231#0, connect! AS connect!#0, T AS T#0, ABCDEFGHIJ AS ABCDEFGHIJ#0, 0x78797A7B7C7D7E7F808182838485868788898A8B8C8D8E AS X'78797A7B7C7D7E7F808182838485868788898A8B8C8D8E'#0, 0x0806 AS X'0806'#0, [8,6] AS ARRAY(8, 6)#0, null AS NULL#0, 2020-10-10 AS DATE '2020-10-10'#0, 8.997620 AS 8.997620#0, 2023-02-23 04:31:59.808 AS TIMESTAMP '2023-02-23 04:31:59.808'#0, 1969-12-31 16:00:12.345 AS TIMESTAMP '1969-12-31 16:00:12.345'#0, 2023-02-23 20:36:00 AS TIMESTAMP_NTZ '2023-02-23 20:36:00'#0, ... 
18 more fields] +- LocalRelation , [id#0L, a#0, b#0] diff --git a/sql/connect/common/src/test/resources/query-tests/explain-results/function_unix_date.explain b/sql/connect/common/src/test/resources/query-tests/explain-results/function_unix_date.explain index a1934253d93bd..7ac1d31802baf 100644 --- a/sql/connect/common/src/test/resources/query-tests/explain-results/function_unix_date.explain +++ b/sql/connect/common/src/test/resources/query-tests/explain-results/function_unix_date.explain @@ -1,2 +1,2 @@ -Project [unix_date(cast(gettimestamp(s#0, yyyy-MM-dd, TimestampType, Some(America/Los_Angeles), false) as date)) AS unix_date(to_date(s, yyyy-MM-dd))#0] +Project [unix_date(cast(gettimestamp(s#0, yyyy-MM-dd, TimestampType, try_to_date, Some(America/Los_Angeles), false) as date)) AS unix_date(to_date(s, yyyy-MM-dd))#0] +- LocalRelation , [d#0, t#0, s#0, x#0L, wt#0] diff --git a/sql/connect/common/src/test/resources/query-tests/explain-results/function_unix_micros.explain b/sql/connect/common/src/test/resources/query-tests/explain-results/function_unix_micros.explain index fb5cdd36f9b70..e5337b0f6c499 100644 --- a/sql/connect/common/src/test/resources/query-tests/explain-results/function_unix_micros.explain +++ b/sql/connect/common/src/test/resources/query-tests/explain-results/function_unix_micros.explain @@ -1,2 +1,2 @@ -Project [unix_micros(gettimestamp(s#0, yyyy-MM-dd HH:mm:ss.SSSS, TimestampType, Some(America/Los_Angeles), false)) AS unix_micros(to_timestamp(s, yyyy-MM-dd HH:mm:ss.SSSS))#0L] +Project [unix_micros(gettimestamp(s#0, yyyy-MM-dd HH:mm:ss.SSSS, TimestampType, try_to_timestamp, Some(America/Los_Angeles), false)) AS unix_micros(to_timestamp(s, yyyy-MM-dd HH:mm:ss.SSSS))#0L] +- LocalRelation , [d#0, t#0, s#0, x#0L, wt#0] diff --git a/sql/connect/common/src/test/resources/query-tests/explain-results/function_unix_millis.explain b/sql/connect/common/src/test/resources/query-tests/explain-results/function_unix_millis.explain index 
3382c9ed679c5..5c852467a3507 100644 --- a/sql/connect/common/src/test/resources/query-tests/explain-results/function_unix_millis.explain +++ b/sql/connect/common/src/test/resources/query-tests/explain-results/function_unix_millis.explain @@ -1,2 +1,2 @@ -Project [unix_millis(gettimestamp(s#0, yyyy-MM-dd HH:mm:ss.SSSS, TimestampType, Some(America/Los_Angeles), false)) AS unix_millis(to_timestamp(s, yyyy-MM-dd HH:mm:ss.SSSS))#0L] +Project [unix_millis(gettimestamp(s#0, yyyy-MM-dd HH:mm:ss.SSSS, TimestampType, try_to_timestamp, Some(America/Los_Angeles), false)) AS unix_millis(to_timestamp(s, yyyy-MM-dd HH:mm:ss.SSSS))#0L] +- LocalRelation , [d#0, t#0, s#0, x#0L, wt#0] diff --git a/sql/connect/common/src/test/resources/query-tests/explain-results/function_unix_seconds.explain b/sql/connect/common/src/test/resources/query-tests/explain-results/function_unix_seconds.explain index d21c368869732..03d4386edda71 100644 --- a/sql/connect/common/src/test/resources/query-tests/explain-results/function_unix_seconds.explain +++ b/sql/connect/common/src/test/resources/query-tests/explain-results/function_unix_seconds.explain @@ -1,2 +1,2 @@ -Project [unix_seconds(gettimestamp(s#0, yyyy-MM-dd HH:mm:ss.SSSS, TimestampType, Some(America/Los_Angeles), false)) AS unix_seconds(to_timestamp(s, yyyy-MM-dd HH:mm:ss.SSSS))#0L] +Project [unix_seconds(gettimestamp(s#0, yyyy-MM-dd HH:mm:ss.SSSS, TimestampType, try_to_timestamp, Some(America/Los_Angeles), false)) AS unix_seconds(to_timestamp(s, yyyy-MM-dd HH:mm:ss.SSSS))#0L] +- LocalRelation , [d#0, t#0, s#0, x#0L, wt#0] diff --git a/sql/connect/common/src/test/resources/query-tests/explain-results/function_xpath.explain b/sql/connect/common/src/test/resources/query-tests/explain-results/function_xpath.explain index d9e2e55d9b12e..4752e5218bb12 100644 --- a/sql/connect/common/src/test/resources/query-tests/explain-results/function_xpath.explain +++ b/sql/connect/common/src/test/resources/query-tests/explain-results/function_xpath.explain @@ 
-1,2 +1,2 @@ -Project [xpath(s#0, a/b/text()) AS xpath(s, a/b/text())#0] +Project [invoke(XPathListEvaluator(a/b/text()).evaluate(s#0)) AS xpath(s, a/b/text())#0] +- LocalRelation , [d#0, t#0, s#0, x#0L, wt#0] diff --git a/sql/connect/common/src/test/resources/query-tests/explain-results/function_xpath_boolean.explain b/sql/connect/common/src/test/resources/query-tests/explain-results/function_xpath_boolean.explain index 9b75f81802467..b537366736d25 100644 --- a/sql/connect/common/src/test/resources/query-tests/explain-results/function_xpath_boolean.explain +++ b/sql/connect/common/src/test/resources/query-tests/explain-results/function_xpath_boolean.explain @@ -1,2 +1,2 @@ -Project [xpath_boolean(s#0, a/b) AS xpath_boolean(s, a/b)#0] +Project [invoke(XPathBooleanEvaluator(a/b).evaluate(s#0)) AS xpath_boolean(s, a/b)#0] +- LocalRelation , [d#0, t#0, s#0, x#0L, wt#0] diff --git a/sql/connect/common/src/test/resources/query-tests/explain-results/function_xpath_double.explain b/sql/connect/common/src/test/resources/query-tests/explain-results/function_xpath_double.explain index 9ce47136df242..76e0b01721841 100644 --- a/sql/connect/common/src/test/resources/query-tests/explain-results/function_xpath_double.explain +++ b/sql/connect/common/src/test/resources/query-tests/explain-results/function_xpath_double.explain @@ -1,2 +1,2 @@ -Project [xpath_double(s#0, a/b) AS xpath_double(s, a/b)#0] +Project [invoke(XPathDoubleEvaluator(a/b).evaluate(s#0)) AS xpath_double(s, a/b)#0] +- LocalRelation , [d#0, t#0, s#0, x#0L, wt#0] diff --git a/sql/connect/common/src/test/resources/query-tests/explain-results/function_xpath_float.explain b/sql/connect/common/src/test/resources/query-tests/explain-results/function_xpath_float.explain index 02b29ec4afa9c..21aebb357928f 100644 --- a/sql/connect/common/src/test/resources/query-tests/explain-results/function_xpath_float.explain +++ b/sql/connect/common/src/test/resources/query-tests/explain-results/function_xpath_float.explain @@ -1,2 
+1,2 @@ -Project [xpath_float(s#0, a/b) AS xpath_float(s, a/b)#0] +Project [invoke(XPathFloatEvaluator(a/b).evaluate(s#0)) AS xpath_float(s, a/b)#0] +- LocalRelation , [d#0, t#0, s#0, x#0L, wt#0] diff --git a/sql/connect/common/src/test/resources/query-tests/explain-results/function_xpath_int.explain b/sql/connect/common/src/test/resources/query-tests/explain-results/function_xpath_int.explain index cdd56eaa73199..eee74472b1cff 100644 --- a/sql/connect/common/src/test/resources/query-tests/explain-results/function_xpath_int.explain +++ b/sql/connect/common/src/test/resources/query-tests/explain-results/function_xpath_int.explain @@ -1,2 +1,2 @@ -Project [xpath_int(s#0, a/b) AS xpath_int(s, a/b)#0] +Project [invoke(XPathIntEvaluator(a/b).evaluate(s#0)) AS xpath_int(s, a/b)#0] +- LocalRelation , [d#0, t#0, s#0, x#0L, wt#0] diff --git a/sql/connect/common/src/test/resources/query-tests/explain-results/function_xpath_long.explain b/sql/connect/common/src/test/resources/query-tests/explain-results/function_xpath_long.explain index 3acefb13d0f8c..8356c2c8e18c1 100644 --- a/sql/connect/common/src/test/resources/query-tests/explain-results/function_xpath_long.explain +++ b/sql/connect/common/src/test/resources/query-tests/explain-results/function_xpath_long.explain @@ -1,2 +1,2 @@ -Project [xpath_long(s#0, a/b) AS xpath_long(s, a/b)#0L] +Project [invoke(XPathLongEvaluator(a/b).evaluate(s#0)) AS xpath_long(s, a/b)#0L] +- LocalRelation , [d#0, t#0, s#0, x#0L, wt#0] diff --git a/sql/connect/common/src/test/resources/query-tests/explain-results/function_xpath_number.explain b/sql/connect/common/src/test/resources/query-tests/explain-results/function_xpath_number.explain index 0a30685f0c6d2..bc32d4fefffb8 100644 --- a/sql/connect/common/src/test/resources/query-tests/explain-results/function_xpath_number.explain +++ b/sql/connect/common/src/test/resources/query-tests/explain-results/function_xpath_number.explain @@ -1,2 +1,2 @@ -Project [xpath_number(s#0, a/b) AS 
xpath_number(s, a/b)#0] +Project [invoke(XPathDoubleEvaluator(a/b).evaluate(s#0)) AS xpath_number(s, a/b)#0] +- LocalRelation , [d#0, t#0, s#0, x#0L, wt#0] diff --git a/sql/connect/common/src/test/resources/query-tests/explain-results/function_xpath_short.explain b/sql/connect/common/src/test/resources/query-tests/explain-results/function_xpath_short.explain index ed440972bf490..e0ba76b3acd0e 100644 --- a/sql/connect/common/src/test/resources/query-tests/explain-results/function_xpath_short.explain +++ b/sql/connect/common/src/test/resources/query-tests/explain-results/function_xpath_short.explain @@ -1,2 +1,2 @@ -Project [xpath_short(s#0, a/b) AS xpath_short(s, a/b)#0] +Project [invoke(XPathShortEvaluator(a/b).evaluate(s#0)) AS xpath_short(s, a/b)#0] +- LocalRelation , [d#0, t#0, s#0, x#0L, wt#0] diff --git a/sql/connect/common/src/test/resources/query-tests/explain-results/function_xpath_string.explain b/sql/connect/common/src/test/resources/query-tests/explain-results/function_xpath_string.explain index f4103f68c3bc3..80f2600e6cdd4 100644 --- a/sql/connect/common/src/test/resources/query-tests/explain-results/function_xpath_string.explain +++ b/sql/connect/common/src/test/resources/query-tests/explain-results/function_xpath_string.explain @@ -1,2 +1,2 @@ -Project [xpath_string(s#0, a/b) AS xpath_string(s, a/b)#0] +Project [invoke(XPathStringEvaluator(a/b).evaluate(s#0)) AS xpath_string(s, a/b)#0] +- LocalRelation , [d#0, t#0, s#0, x#0L, wt#0] diff --git a/sql/connect/common/src/test/resources/query-tests/queries/column_add.json b/sql/connect/common/src/test/resources/query-tests/queries/column_add.json index cfa40fac8c6f9..3b8219884aa0b 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/column_add.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/column_add.json @@ -22,7 +22,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "b" } - }] + }], + "isInternal": false } }] } diff --git 
a/sql/connect/common/src/test/resources/query-tests/queries/column_add.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/column_add.proto.bin index 10b410b5b08b5..9b9889cf775fe 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/column_add.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/column_add.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/column_and.json b/sql/connect/common/src/test/resources/query-tests/queries/column_and.json index d3f8cd0e73cbc..2fa4c654cce1d 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/column_and.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/column_and.json @@ -25,7 +25,8 @@ "literal": { "integer": 10 } - }] + }], + "isInternal": false } }, { "unresolvedFunction": { @@ -38,9 +39,11 @@ "literal": { "double": 0.5 } - }] + }], + "isInternal": false } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/column_and.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/column_and.proto.bin index 241f1a9303b2c..99111eba7191c 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/column_and.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/column_and.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/column_between.json b/sql/connect/common/src/test/resources/query-tests/queries/column_between.json index 20927b93d8438..de970b1cdf343 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/column_between.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/column_between.json @@ -25,7 +25,8 @@ "literal": { "integer": 10 } - }] + }], + "isInternal": false } }, { "unresolvedFunction": { @@ -38,9 +39,11 @@ "literal": { "integer": 20 } - }] + }], + "isInternal": false } - }] + }], + "isInternal": false } 
}] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/column_between.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/column_between.proto.bin index d03dd02a2f36a..324e26b35750a 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/column_between.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/column_between.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/column_bitwiseAND.json b/sql/connect/common/src/test/resources/query-tests/queries/column_bitwiseAND.json index bd3ac671fca33..71f6d6b3ec3ab 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/column_bitwiseAND.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/column_bitwiseAND.json @@ -22,7 +22,8 @@ "literal": { "integer": 255 } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/column_bitwiseAND.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/column_bitwiseAND.proto.bin index 4815bc7dd1a20..13e798120873a 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/column_bitwiseAND.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/column_bitwiseAND.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/column_bitwiseOR.json b/sql/connect/common/src/test/resources/query-tests/queries/column_bitwiseOR.json index eaa27ffa46164..e2771dc543b12 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/column_bitwiseOR.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/column_bitwiseOR.json @@ -22,7 +22,8 @@ "literal": { "integer": 7 } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/column_bitwiseOR.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/column_bitwiseOR.proto.bin index 
9cf110da4ad61..f823efd7551a1 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/column_bitwiseOR.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/column_bitwiseOR.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/column_bitwiseXOR.json b/sql/connect/common/src/test/resources/query-tests/queries/column_bitwiseXOR.json index c51eb3140c339..108d66745d34d 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/column_bitwiseXOR.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/column_bitwiseXOR.json @@ -22,7 +22,8 @@ "literal": { "integer": 78 } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/column_bitwiseXOR.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/column_bitwiseXOR.proto.bin index 70c61f9620576..994283d7a9431 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/column_bitwiseXOR.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/column_bitwiseXOR.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/column_contains.json b/sql/connect/common/src/test/resources/query-tests/queries/column_contains.json index 05d6ccf38b367..d8aef66bdf546 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/column_contains.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/column_contains.json @@ -22,7 +22,8 @@ "literal": { "string": "baz" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/column_contains.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/column_contains.proto.bin index 9c796f9470c31..1126c759b0332 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/column_contains.proto.bin and 
b/sql/connect/common/src/test/resources/query-tests/queries/column_contains.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/column_divide.json b/sql/connect/common/src/test/resources/query-tests/queries/column_divide.json index 8d71061b151ca..be7a5f3c851d1 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/column_divide.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/column_divide.json @@ -22,7 +22,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "b" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/column_divide.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/column_divide.proto.bin index 49b5d8d2590dd..22b010a97dd58 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/column_divide.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/column_divide.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/column_endsWith.json b/sql/connect/common/src/test/resources/query-tests/queries/column_endsWith.json index f4171c2792fbd..5ee6cfe40b1ec 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/column_endsWith.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/column_endsWith.json @@ -22,7 +22,8 @@ "literal": { "string": "suffix_" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/column_endsWith.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/column_endsWith.proto.bin index 03f41a339f00c..3f3db0c90bc19 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/column_endsWith.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/column_endsWith.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/column_eqNullSafe.json 
b/sql/connect/common/src/test/resources/query-tests/queries/column_eqNullSafe.json index eea1da49bc59e..44e11ad2b8942 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/column_eqNullSafe.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/column_eqNullSafe.json @@ -22,7 +22,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "b" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/column_eqNullSafe.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/column_eqNullSafe.proto.bin index 22de941ad44b0..0614560048a9b 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/column_eqNullSafe.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/column_eqNullSafe.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/column_equals.json b/sql/connect/common/src/test/resources/query-tests/queries/column_equals.json index 7397f4fb46acd..1f05b249eb00b 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/column_equals.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/column_equals.json @@ -22,7 +22,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "b" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/column_equals.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/column_equals.proto.bin index e226de59ddcd4..cad0e9b14a814 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/column_equals.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/column_equals.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/column_geq.json b/sql/connect/common/src/test/resources/query-tests/queries/column_geq.json index 9f24bc251739f..4c7f5339409f0 100644 --- 
a/sql/connect/common/src/test/resources/query-tests/queries/column_geq.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/column_geq.json @@ -22,7 +22,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "b" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/column_geq.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/column_geq.proto.bin index 1c4af866109ab..a68ee6cc8b6a7 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/column_geq.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/column_geq.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/column_gt.json b/sql/connect/common/src/test/resources/query-tests/queries/column_gt.json index 4bb8fb41f249d..74be85e709ed5 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/column_gt.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/column_gt.json @@ -22,7 +22,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "b" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/column_gt.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/column_gt.proto.bin index 44ca37fbb4048..e85f5a3e23552 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/column_gt.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/column_gt.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/column_ilike.json b/sql/connect/common/src/test/resources/query-tests/queries/column_ilike.json index 47c1b63abe319..60a62c4595ac8 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/column_ilike.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/column_ilike.json @@ -22,7 +22,8 @@ "literal": { "string": "%fOb%" } - }] + }], + "isInternal": false } }] 
} diff --git a/sql/connect/common/src/test/resources/query-tests/queries/column_ilike.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/column_ilike.proto.bin index 285400db7daf5..368bebd9ea48d 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/column_ilike.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/column_ilike.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/column_isNaN.json b/sql/connect/common/src/test/resources/query-tests/queries/column_isNaN.json index f594918ed930a..12d3d19d7797a 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/column_isNaN.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/column_isNaN.json @@ -18,7 +18,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "b" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/column_isNaN.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/column_isNaN.proto.bin index 1030abda5b8c2..8c2fad75be346 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/column_isNaN.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/column_isNaN.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/column_isNotNull.json b/sql/connect/common/src/test/resources/query-tests/queries/column_isNotNull.json index f34d3f4eac552..6af0e5bfdb4e3 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/column_isNotNull.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/column_isNotNull.json @@ -18,7 +18,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "g" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/column_isNotNull.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/column_isNotNull.proto.bin index 
e8cccdf024934..cdc382e44ee22 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/column_isNotNull.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/column_isNotNull.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/column_isNull.json b/sql/connect/common/src/test/resources/query-tests/queries/column_isNull.json index 74e990622a3a7..a6ac6534ecd55 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/column_isNull.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/column_isNull.json @@ -18,7 +18,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "g" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/column_isNull.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/column_isNull.proto.bin index 8fc24a9e21b38..0ea4d6f2ffe89 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/column_isNull.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/column_isNull.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/column_isin.json b/sql/connect/common/src/test/resources/query-tests/queries/column_isin.json index d8811a4e780b5..b34abc941cce8 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/column_isin.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/column_isin.json @@ -30,7 +30,8 @@ "literal": { "string": "foo" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/column_isin.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/column_isin.proto.bin index 365e07f35bb48..cde6686dd4064 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/column_isin.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/column_isin.proto.bin differ 
diff --git a/sql/connect/common/src/test/resources/query-tests/queries/column_leq.json b/sql/connect/common/src/test/resources/query-tests/queries/column_leq.json index cda8694c0439e..55388f667c448 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/column_leq.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/column_leq.json @@ -22,7 +22,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "b" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/column_leq.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/column_leq.proto.bin index e8463292e4040..692ccad0aa9c3 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/column_leq.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/column_leq.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/column_like.json b/sql/connect/common/src/test/resources/query-tests/queries/column_like.json index 1390451af55ab..be7b4e8dfeb34 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/column_like.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/column_like.json @@ -22,7 +22,8 @@ "literal": { "string": "%bob%" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/column_like.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/column_like.proto.bin index 07382ec1643cb..a1856511eb1b2 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/column_like.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/column_like.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/column_lt.json b/sql/connect/common/src/test/resources/query-tests/queries/column_lt.json index c927e75de181b..1264a0e43a54a 100644 --- 
a/sql/connect/common/src/test/resources/query-tests/queries/column_lt.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/column_lt.json @@ -22,7 +22,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "b" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/column_lt.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/column_lt.proto.bin index f4c3a110b126b..083c8d46611d1 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/column_lt.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/column_lt.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/column_modulo.json b/sql/connect/common/src/test/resources/query-tests/queries/column_modulo.json index 0c5a78eea2dff..d11494f159a5a 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/column_modulo.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/column_modulo.json @@ -22,7 +22,8 @@ "literal": { "integer": 10 } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/column_modulo.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/column_modulo.proto.bin index 55bfeba04ed66..a86b5e5de63e4 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/column_modulo.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/column_modulo.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/column_multiply.json b/sql/connect/common/src/test/resources/query-tests/queries/column_multiply.json index 8c17581c67d1c..7b12a8850d5f2 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/column_multiply.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/column_multiply.json @@ -22,7 +22,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "b" } - }] 
+ }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/column_multiply.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/column_multiply.proto.bin index 8fd1b3941d1f7..b4958d84c5a05 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/column_multiply.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/column_multiply.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/column_not.json b/sql/connect/common/src/test/resources/query-tests/queries/column_not.json index 2f873196ba1d0..3fa58e874d75d 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/column_not.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/column_not.json @@ -18,7 +18,8 @@ "literal": { "boolean": true } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/column_not.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/column_not.proto.bin index 19609b6ee85a5..51ea1c4d20bc7 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/column_not.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/column_not.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/column_not_equals.json b/sql/connect/common/src/test/resources/query-tests/queries/column_not_equals.json index 589d57a18b94b..093770f4563be 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/column_not_equals.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/column_not_equals.json @@ -25,9 +25,11 @@ "unresolvedAttribute": { "unparsedIdentifier": "b" } - }] + }], + "isInternal": false } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/column_not_equals.proto.bin 
b/sql/connect/common/src/test/resources/query-tests/queries/column_not_equals.proto.bin index cdf0b4290e61e..cc13a11b48013 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/column_not_equals.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/column_not_equals.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/column_or.json b/sql/connect/common/src/test/resources/query-tests/queries/column_or.json index ae1424f763feb..324bfc850d2dc 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/column_or.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/column_or.json @@ -25,7 +25,8 @@ "literal": { "integer": 10 } - }] + }], + "isInternal": false } }, { "unresolvedFunction": { @@ -38,9 +39,11 @@ "literal": { "double": 0.5 } - }] + }], + "isInternal": false } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/column_or.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/column_or.proto.bin index 69f219e938a4e..a52ba0707a755 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/column_or.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/column_or.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/column_rlike.json b/sql/connect/common/src/test/resources/query-tests/queries/column_rlike.json index e53403db41cd0..6ff88b1ea6560 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/column_rlike.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/column_rlike.json @@ -22,7 +22,8 @@ "literal": { "string": "^[0-9]*$" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/column_rlike.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/column_rlike.proto.bin index 
7dd56baf04213..b4cd080c61aad 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/column_rlike.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/column_rlike.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/column_startsWith.json b/sql/connect/common/src/test/resources/query-tests/queries/column_startsWith.json index 431e13d818639..9a9036b3cf963 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/column_startsWith.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/column_startsWith.json @@ -22,7 +22,8 @@ "literal": { "string": "prefix_" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/column_startsWith.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/column_startsWith.proto.bin index fa1132c73de7b..366011b3c3968 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/column_startsWith.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/column_startsWith.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/column_substr.json b/sql/connect/common/src/test/resources/query-tests/queries/column_substr.json index 3b02117cc6e5b..5beaf7c4c3711 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/column_substr.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/column_substr.json @@ -26,7 +26,8 @@ "literal": { "integer": 3 } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/column_substr.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/column_substr.proto.bin index 636a46a480626..5eedae63ea95c 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/column_substr.proto.bin and 
b/sql/connect/common/src/test/resources/query-tests/queries/column_substr.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/column_subtract.json b/sql/connect/common/src/test/resources/query-tests/queries/column_subtract.json index d15c2941ee1bd..68faab9cbb05a 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/column_subtract.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/column_subtract.json @@ -22,7 +22,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "b" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/column_subtract.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/column_subtract.proto.bin index f5716427588ed..2a341fb5201ed 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/column_subtract.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/column_subtract.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/column_unary_minus.json b/sql/connect/common/src/test/resources/query-tests/queries/column_unary_minus.json index 0db558e49e38c..b4c76e2c6719b 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/column_unary_minus.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/column_unary_minus.json @@ -18,7 +18,8 @@ "literal": { "integer": 1 } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/column_unary_minus.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/column_unary_minus.proto.bin index 66343bea4e29b..53277e9dd2452 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/column_unary_minus.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/column_unary_minus.proto.bin differ diff --git 
a/sql/connect/common/src/test/resources/query-tests/queries/column_when_otherwise.json b/sql/connect/common/src/test/resources/query-tests/queries/column_when_otherwise.json index db2ceccfd22ab..f2223c20e569c 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/column_when_otherwise.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/column_when_otherwise.json @@ -25,7 +25,8 @@ "literal": { "integer": 10 } - }] + }], + "isInternal": false } }, { "literal": { @@ -42,7 +43,8 @@ "literal": { "integer": 20 } - }] + }], + "isInternal": false } }, { "literal": { @@ -52,7 +54,8 @@ "literal": { "string": "high" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/column_when_otherwise.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/column_when_otherwise.proto.bin index 031c3683c5e6d..e22f469ea2490 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/column_when_otherwise.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/column_when_otherwise.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/cube_column.json b/sql/connect/common/src/test/resources/query-tests/queries/cube_column.json index 5b9709ff06576..b4d884568354b 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/cube_column.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/cube_column.json @@ -30,7 +30,8 @@ "literal": { "integer": 1 } - }] + }], + "isInternal": false } }, "name": ["count"] diff --git a/sql/connect/common/src/test/resources/query-tests/queries/cube_column.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/cube_column.proto.bin index d46e40b39dcfe..99a704c0c7f07 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/cube_column.proto.bin and 
b/sql/connect/common/src/test/resources/query-tests/queries/cube_column.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/cube_string.json b/sql/connect/common/src/test/resources/query-tests/queries/cube_string.json index 03625861d88f2..1c9fb7b264664 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/cube_string.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/cube_string.json @@ -32,7 +32,8 @@ "literal": { "integer": 1 } - }] + }], + "isInternal": false } }, "name": ["count"] diff --git a/sql/connect/common/src/test/resources/query-tests/queries/cube_string.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/cube_string.proto.bin index 59c7a55571201..2e092aa640278 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/cube_string.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/cube_string.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/filter.json b/sql/connect/common/src/test/resources/query-tests/queries/filter.json index 1046e1262150e..d40f8031884a5 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/filter.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/filter.json @@ -22,7 +22,8 @@ "literal": { "long": "10" } - }] + }], + "isInternal": false } } } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/filter.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/filter.proto.bin index 069171ead3233..36bb753fff234 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/filter.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/filter.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/from_avro_with_options.json b/sql/connect/common/src/test/resources/query-tests/queries/from_avro_with_options.json index 
662aa746af243..adbc647c186de 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/from_avro_with_options.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/from_avro_with_options.json @@ -41,9 +41,11 @@ "literal": { "string": "zstandard" } - }] + }], + "isInternal": false } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/from_avro_with_options.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/from_avro_with_options.proto.bin index 5da5c48b41153..eba3a4648ca60 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/from_avro_with_options.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/from_avro_with_options.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/from_avro_without_options.json b/sql/connect/common/src/test/resources/query-tests/queries/from_avro_without_options.json index da2840f2d3a0b..0ef3262f1eb4a 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/from_avro_without_options.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/from_avro_without_options.json @@ -22,7 +22,8 @@ "literal": { "string": "{\"type\": \"string\", \"name\": \"name\"}" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/from_avro_without_options.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/from_avro_without_options.proto.bin index 4dd12e2dbe1dd..629804e8608aa 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/from_avro_without_options.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/from_avro_without_options.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/from_protobuf_messageClassName_descFilePath.json 
b/sql/connect/common/src/test/resources/query-tests/queries/from_protobuf_messageClassName_descFilePath.json index 375c0f9324c3f..fe2efd928ccf4 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/from_protobuf_messageClassName_descFilePath.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/from_protobuf_messageClassName_descFilePath.json @@ -26,7 +26,8 @@ "literal": { "binary": "CvwBCgxjb21tb24ucHJvdG8SDXNwYXJrLmNvbm5lY3QisAEKDFN0b3JhZ2VMZXZlbBIZCgh1c2VfZGlzaxgBIAEoCFIHdXNlRGlzaxIdCgp1c2VfbWVtb3J5GAIgASgIUgl1c2VNZW1vcnkSIAoMdXNlX29mZl9oZWFwGAMgASgIUgp1c2VPZmZIZWFwEiIKDGRlc2VyaWFsaXplZBgEIAEoCFIMZGVzZXJpYWxpemVkEiAKC3JlcGxpY2F0aW9uGAUgASgFUgtyZXBsaWNhdGlvbkIiCh5vcmcuYXBhY2hlLnNwYXJrLmNvbm5lY3QucHJvdG9QAWIGcHJvdG8z" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/from_protobuf_messageClassName_descFilePath.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/from_protobuf_messageClassName_descFilePath.proto.bin index 07d4c6c5b286f..db667ef8ee6b2 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/from_protobuf_messageClassName_descFilePath.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/from_protobuf_messageClassName_descFilePath.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/from_protobuf_messageClassName_descFilePath_options.json b/sql/connect/common/src/test/resources/query-tests/queries/from_protobuf_messageClassName_descFilePath_options.json index db9371b64ef72..93974afec3566 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/from_protobuf_messageClassName_descFilePath_options.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/from_protobuf_messageClassName_descFilePath_options.json @@ -37,9 +37,11 @@ "literal": { "string": "2" } - }] + }], + "isInternal": false } - }] + }], + "isInternal": false } }] } diff --git 
a/sql/connect/common/src/test/resources/query-tests/queries/from_protobuf_messageClassName_descFilePath_options.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/from_protobuf_messageClassName_descFilePath_options.proto.bin index 00fd58da6be84..a7262d64522c6 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/from_protobuf_messageClassName_descFilePath_options.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/from_protobuf_messageClassName_descFilePath_options.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_abs.json b/sql/connect/common/src/test/resources/query-tests/queries/function_abs.json index 13df3437ddabe..aa589275670b8 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_abs.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_abs.json @@ -18,7 +18,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "a" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_abs.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_abs.proto.bin index 86cfbc09a8f91..6bda0e50aa837 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_abs.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_abs.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_acos.json b/sql/connect/common/src/test/resources/query-tests/queries/function_acos.json index 7506c0f6cb630..82543692456c1 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_acos.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_acos.json @@ -18,7 +18,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "b" } - }] + }], + "isInternal": false } }] } diff --git 
a/sql/connect/common/src/test/resources/query-tests/queries/function_acos.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_acos.proto.bin index cc6a279cb188e..98bc0d821d7d0 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_acos.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_acos.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_acosh.json b/sql/connect/common/src/test/resources/query-tests/queries/function_acosh.json index 6a83b4ab008bc..82a69e9f74166 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_acosh.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_acosh.json @@ -18,7 +18,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "b" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_acosh.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_acosh.proto.bin index e16ed2ba92e3f..48c57cb1c9f9b 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_acosh.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_acosh.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_add_months.json b/sql/connect/common/src/test/resources/query-tests/queries/function_add_months.json index b1b2e78a08435..97b9a00d5ea3e 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_add_months.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_add_months.json @@ -22,7 +22,8 @@ "literal": { "integer": 2 } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_add_months.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_add_months.proto.bin index 
6abacc9cc2b40..c396f24928cf5 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_add_months.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_add_months.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_aes_decrypt.json b/sql/connect/common/src/test/resources/query-tests/queries/function_aes_decrypt.json index 4204a44b44ce0..28beb401cd650 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_aes_decrypt.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_aes_decrypt.json @@ -22,7 +22,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "g" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_aes_decrypt.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_aes_decrypt.proto.bin index f635e1fc689b1..40687059a8c45 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_aes_decrypt.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_aes_decrypt.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_aes_decrypt_with_mode.json b/sql/connect/common/src/test/resources/query-tests/queries/function_aes_decrypt_with_mode.json index 9c630e1253494..0436dd1a60c85 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_aes_decrypt_with_mode.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_aes_decrypt_with_mode.json @@ -26,7 +26,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "g" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_aes_decrypt_with_mode.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_aes_decrypt_with_mode.proto.bin index 
41d024cdb7eed..8e9a324c2fde5 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_aes_decrypt_with_mode.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_aes_decrypt_with_mode.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_aes_decrypt_with_mode_padding.json b/sql/connect/common/src/test/resources/query-tests/queries/function_aes_decrypt_with_mode_padding.json index 8f5be474ab4b3..56ad10f6f74bf 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_aes_decrypt_with_mode_padding.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_aes_decrypt_with_mode_padding.json @@ -30,7 +30,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "g" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_aes_decrypt_with_mode_padding.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_aes_decrypt_with_mode_padding.proto.bin index cd6764581f2ca..97091b52e6c59 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_aes_decrypt_with_mode_padding.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_aes_decrypt_with_mode_padding.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_aes_decrypt_with_mode_padding_aad.json b/sql/connect/common/src/test/resources/query-tests/queries/function_aes_decrypt_with_mode_padding_aad.json index 9381042b71886..305cf0b317a23 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_aes_decrypt_with_mode_padding_aad.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_aes_decrypt_with_mode_padding_aad.json @@ -34,7 +34,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "g" } - }] + }], + "isInternal": false } }] } diff --git 
a/sql/connect/common/src/test/resources/query-tests/queries/function_aes_decrypt_with_mode_padding_aad.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_aes_decrypt_with_mode_padding_aad.proto.bin index ca789f04ce1d4..0d4c98e59ce0a 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_aes_decrypt_with_mode_padding_aad.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_aes_decrypt_with_mode_padding_aad.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_aes_encrypt.json b/sql/connect/common/src/test/resources/query-tests/queries/function_aes_encrypt.json index 06469d4840547..89d07a44e8440 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_aes_encrypt.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_aes_encrypt.json @@ -22,7 +22,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "g" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_aes_encrypt.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_aes_encrypt.proto.bin index c7a70b51707f3..0089323b6bbe1 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_aes_encrypt.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_aes_encrypt.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_aes_encrypt_with_mode.json b/sql/connect/common/src/test/resources/query-tests/queries/function_aes_encrypt_with_mode.json index 7eb9b4ed8b4ed..afef2dba9aad9 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_aes_encrypt_with_mode.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_aes_encrypt_with_mode.json @@ -26,7 +26,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "g" } - }] + 
}], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_aes_encrypt_with_mode.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_aes_encrypt_with_mode.proto.bin index ecd81ae44fcbd..3d89f200e609c 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_aes_encrypt_with_mode.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_aes_encrypt_with_mode.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_aes_encrypt_with_mode_padding.json b/sql/connect/common/src/test/resources/query-tests/queries/function_aes_encrypt_with_mode_padding.json index 59a6a5e35fd42..8617d2d9d928a 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_aes_encrypt_with_mode_padding.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_aes_encrypt_with_mode_padding.json @@ -30,7 +30,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "g" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_aes_encrypt_with_mode_padding.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_aes_encrypt_with_mode_padding.proto.bin index 9de01ddc5ea69..3888e9a1d075d 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_aes_encrypt_with_mode_padding.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_aes_encrypt_with_mode_padding.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_aes_encrypt_with_mode_padding_iv.json b/sql/connect/common/src/test/resources/query-tests/queries/function_aes_encrypt_with_mode_padding_iv.json index 285c67289d30a..b8b8e66787848 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_aes_encrypt_with_mode_padding_iv.json +++ 
b/sql/connect/common/src/test/resources/query-tests/queries/function_aes_encrypt_with_mode_padding_iv.json @@ -34,7 +34,8 @@ "literal": { "binary": "Q0RF" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_aes_encrypt_with_mode_padding_iv.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_aes_encrypt_with_mode_padding_iv.proto.bin index 812426f3c00d3..0ff56c7c74372 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_aes_encrypt_with_mode_padding_iv.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_aes_encrypt_with_mode_padding_iv.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_aes_encrypt_with_mode_padding_iv_aad.json b/sql/connect/common/src/test/resources/query-tests/queries/function_aes_encrypt_with_mode_padding_iv_aad.json index eb0e178fd3534..cb790e822a52f 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_aes_encrypt_with_mode_padding_iv_aad.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_aes_encrypt_with_mode_padding_iv_aad.json @@ -38,7 +38,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "g" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_aes_encrypt_with_mode_padding_iv_aad.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_aes_encrypt_with_mode_padding_iv_aad.proto.bin index ee39beb07cee4..7ff11b9868b99 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_aes_encrypt_with_mode_padding_iv_aad.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_aes_encrypt_with_mode_padding_iv_aad.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_aggregate.json 
b/sql/connect/common/src/test/resources/query-tests/queries/function_aggregate.json index df1813aed64c5..8e113b8874a5f 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_aggregate.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_aggregate.json @@ -35,7 +35,8 @@ "unresolvedNamedLambdaVariable": { "nameParts": ["y_2"] } - }] + }], + "isInternal": false } }, "arguments": [{ @@ -55,7 +56,8 @@ "nameParts": ["x_3"] }] } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_aggregate.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_aggregate.proto.bin index c43f4e6dbbc1b..12456f54ab438 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_aggregate.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_aggregate.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_aggregate_with_finish_lambda.json b/sql/connect/common/src/test/resources/query-tests/queries/function_aggregate_with_finish_lambda.json index 956b42db65639..b46810e63a304 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_aggregate_with_finish_lambda.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_aggregate_with_finish_lambda.json @@ -35,7 +35,8 @@ "unresolvedNamedLambdaVariable": { "nameParts": ["y_2"] } - }] + }], + "isInternal": false } }, "arguments": [{ @@ -57,14 +58,16 @@ "literal": { "integer": 2 } - }] + }], + "isInternal": false } }, "arguments": [{ "nameParts": ["x_3"] }] } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_aggregate_with_finish_lambda.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_aggregate_with_finish_lambda.proto.bin index cf32ea4ddd3e7..70fbe778cb715 100644 Binary 
files a/sql/connect/common/src/test/resources/query-tests/queries/function_aggregate_with_finish_lambda.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_aggregate_with_finish_lambda.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_any.json b/sql/connect/common/src/test/resources/query-tests/queries/function_any.json index 4512c060d703b..73332625c2995 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_any.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_any.json @@ -18,7 +18,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "flag" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_any.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_any.proto.bin index 9b014b58da57c..ff038010b22c2 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_any.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_any.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_any_value_with_ignore_nulls.json b/sql/connect/common/src/test/resources/query-tests/queries/function_any_value_with_ignore_nulls.json index 011d43a91d080..35cc2daade7a8 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_any_value_with_ignore_nulls.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_any_value_with_ignore_nulls.json @@ -22,7 +22,8 @@ "literal": { "boolean": true } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_any_value_with_ignore_nulls.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_any_value_with_ignore_nulls.proto.bin index 546c696ecfdf3..29c9a0d93723d 100644 Binary files 
a/sql/connect/common/src/test/resources/query-tests/queries/function_any_value_with_ignore_nulls.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_any_value_with_ignore_nulls.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_any_value_with_respect_nulls.json b/sql/connect/common/src/test/resources/query-tests/queries/function_any_value_with_respect_nulls.json index 7d4f5a2de38e8..9307c56feb4af 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_any_value_with_respect_nulls.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_any_value_with_respect_nulls.json @@ -18,7 +18,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "a" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_any_value_with_respect_nulls.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_any_value_with_respect_nulls.proto.bin index 124a7ad7efe09..fc2e75c2015b9 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_any_value_with_respect_nulls.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_any_value_with_respect_nulls.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_approx_count_distinct.json b/sql/connect/common/src/test/resources/query-tests/queries/function_approx_count_distinct.json index 5579faf119647..773fd7df0ab30 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_approx_count_distinct.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_approx_count_distinct.json @@ -18,7 +18,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "a" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_approx_count_distinct.proto.bin 
b/sql/connect/common/src/test/resources/query-tests/queries/function_approx_count_distinct.proto.bin index bac82f670b298..9c7bd463ab943 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_approx_count_distinct.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_approx_count_distinct.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_approx_count_distinct_rsd.json b/sql/connect/common/src/test/resources/query-tests/queries/function_approx_count_distinct_rsd.json index 851862082ca04..0224877077bc5 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_approx_count_distinct_rsd.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_approx_count_distinct_rsd.json @@ -22,7 +22,8 @@ "literal": { "double": 0.1 } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_approx_count_distinct_rsd.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_approx_count_distinct_rsd.proto.bin index fd61420fd1e45..79b285eb0a28f 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_approx_count_distinct_rsd.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_approx_count_distinct_rsd.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_approx_percentile.json b/sql/connect/common/src/test/resources/query-tests/queries/function_approx_percentile.json index 490a2dcd86967..925f95e618930 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_approx_percentile.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_approx_percentile.json @@ -26,7 +26,8 @@ "literal": { "integer": 20 } - }] + }], + "isInternal": false } }] } diff --git 
a/sql/connect/common/src/test/resources/query-tests/queries/function_approx_percentile.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_approx_percentile.proto.bin index ae73716fa4319..478af7d5ff53a 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_approx_percentile.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_approx_percentile.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_array.json b/sql/connect/common/src/test/resources/query-tests/queries/function_array.json index 20fe495bb9bf4..99152d4e998d1 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_array.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_array.json @@ -22,7 +22,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "a" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_array.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_array.proto.bin index 2b679eb4c6db1..25fd26ed8ec81 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_array.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_array.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_array_agg.json b/sql/connect/common/src/test/resources/query-tests/queries/function_array_agg.json index a3197ce95068a..0e4e0fe4dd504 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_array_agg.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_array_agg.json @@ -18,7 +18,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "a" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_array_agg.proto.bin 
b/sql/connect/common/src/test/resources/query-tests/queries/function_array_agg.proto.bin index c7306df86214e..4b30c105a1078 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_array_agg.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_array_agg.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_array_append.json b/sql/connect/common/src/test/resources/query-tests/queries/function_array_append.json index cabd44c063dec..b6af59d5a1cc1 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_array_append.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_array_append.json @@ -22,7 +22,8 @@ "literal": { "integer": 1 } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_array_append.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_array_append.proto.bin index 76f2f0255bf25..3e2ac2115ff67 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_array_append.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_array_append.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_array_compact.json b/sql/connect/common/src/test/resources/query-tests/queries/function_array_compact.json index c3ebf313190c2..93b449217eb51 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_array_compact.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_array_compact.json @@ -18,7 +18,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "e" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_array_compact.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_array_compact.proto.bin index 
949d66cb951f0..5c244efd0258f 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_array_compact.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_array_compact.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_array_contains.json b/sql/connect/common/src/test/resources/query-tests/queries/function_array_contains.json index a362d66d9d64d..349927b7cfd9e 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_array_contains.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_array_contains.json @@ -22,7 +22,8 @@ "literal": { "integer": 1 } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_array_contains.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_array_contains.proto.bin index d8764f60364c2..af333721d6944 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_array_contains.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_array_contains.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_array_distinct.json b/sql/connect/common/src/test/resources/query-tests/queries/function_array_distinct.json index d38f4194bcd2b..00b65fc1665d5 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_array_distinct.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_array_distinct.json @@ -18,7 +18,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "e" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_array_distinct.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_array_distinct.proto.bin index e6359c074bf23..98dfa75ac01b3 100644 Binary files 
a/sql/connect/common/src/test/resources/query-tests/queries/function_array_distinct.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_array_distinct.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_array_except.json b/sql/connect/common/src/test/resources/query-tests/queries/function_array_except.json index 17d50c87161d6..81ed93a29524d 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_array_except.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_array_except.json @@ -33,9 +33,11 @@ "literal": { "integer": 4 } - }] + }], + "isInternal": false } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_array_except.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_array_except.proto.bin index 692511b2f74a6..1037a1ffe38db 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_array_except.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_array_except.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_array_insert.json b/sql/connect/common/src/test/resources/query-tests/queries/function_array_insert.json index f4540edbf4108..5bd114b61ad40 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_array_insert.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_array_insert.json @@ -26,7 +26,8 @@ "literal": { "integer": 1 } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_array_insert.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_array_insert.proto.bin index 6e2178ad124e9..a44ca96ace24a 100644 Binary files 
a/sql/connect/common/src/test/resources/query-tests/queries/function_array_insert.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_array_insert.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_array_intersect.json b/sql/connect/common/src/test/resources/query-tests/queries/function_array_intersect.json index 1b95a6724f86d..daa94e5aed678 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_array_intersect.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_array_intersect.json @@ -29,9 +29,11 @@ "literal": { "integer": 4 } - }] + }], + "isInternal": false } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_array_intersect.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_array_intersect.proto.bin index 67fb497cf270c..dca31097549b8 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_array_intersect.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_array_intersect.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_array_join.json b/sql/connect/common/src/test/resources/query-tests/queries/function_array_join.json index 94e8c176cefbf..cbecc842d0b2c 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_array_join.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_array_join.json @@ -22,7 +22,8 @@ "literal": { "string": ";" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_array_join.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_array_join.proto.bin index fbab1b208605d..0b2959acac448 100644 Binary files 
a/sql/connect/common/src/test/resources/query-tests/queries/function_array_join.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_array_join.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_array_join_with_null_replacement.json b/sql/connect/common/src/test/resources/query-tests/queries/function_array_join_with_null_replacement.json index ad580c33e476c..c5c5dce8cda9e 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_array_join_with_null_replacement.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_array_join_with_null_replacement.json @@ -26,7 +26,8 @@ "literal": { "string": "null" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_array_join_with_null_replacement.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_array_join_with_null_replacement.proto.bin index e3fb6b3bf67c3..f0045cbeb529e 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_array_join_with_null_replacement.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_array_join_with_null_replacement.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_array_max.json b/sql/connect/common/src/test/resources/query-tests/queries/function_array_max.json index ba67984758a5a..2bf3706f92b88 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_array_max.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_array_max.json @@ -18,7 +18,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "e" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_array_max.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_array_max.proto.bin index 
f7a98c08cd175..b8009a1fb6193 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_array_max.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_array_max.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_array_min.json b/sql/connect/common/src/test/resources/query-tests/queries/function_array_min.json index a342ae18f9ef7..6485fb4cf3691 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_array_min.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_array_min.json @@ -18,7 +18,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "e" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_array_min.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_array_min.proto.bin index 02cfdfeb215d6..45ac77d97e837 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_array_min.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_array_min.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_array_position.json b/sql/connect/common/src/test/resources/query-tests/queries/function_array_position.json index 4c212cb028273..20c7794a7de6b 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_array_position.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_array_position.json @@ -22,7 +22,8 @@ "literal": { "integer": 10 } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_array_position.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_array_position.proto.bin index 4ef2b11273f25..ee2811ae0de0f 100644 Binary files 
a/sql/connect/common/src/test/resources/query-tests/queries/function_array_position.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_array_position.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_array_prepend.json b/sql/connect/common/src/test/resources/query-tests/queries/function_array_prepend.json index ededeb015a227..ff6bd2b0e33fc 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_array_prepend.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_array_prepend.json @@ -22,7 +22,8 @@ "literal": { "integer": 1 } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_array_prepend.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_array_prepend.proto.bin index 837710597e7b6..8ad00dfca7a04 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_array_prepend.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_array_prepend.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_array_remove.json b/sql/connect/common/src/test/resources/query-tests/queries/function_array_remove.json index 8c562247714a4..f769471cd9791 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_array_remove.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_array_remove.json @@ -22,7 +22,8 @@ "literal": { "integer": 314 } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_array_remove.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_array_remove.proto.bin index 95e2872ad77bd..fd44cfb3372ad 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_array_remove.proto.bin and 
b/sql/connect/common/src/test/resources/query-tests/queries/function_array_remove.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_array_repeat.json b/sql/connect/common/src/test/resources/query-tests/queries/function_array_repeat.json index c9d9f1f9ca79d..0d218470c1ec1 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_array_repeat.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_array_repeat.json @@ -22,7 +22,8 @@ "literal": { "integer": 10 } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_array_repeat.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_array_repeat.proto.bin index e370db16e977c..6302ae6ee3f0c 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_array_repeat.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_array_repeat.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_array_size.json b/sql/connect/common/src/test/resources/query-tests/queries/function_array_size.json index c1c618bc7f11f..ac279580a09e1 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_array_size.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_array_size.json @@ -18,7 +18,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "e" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_array_size.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_array_size.proto.bin index 47949dfbbda29..97554f7ecc930 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_array_size.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_array_size.proto.bin differ diff --git 
a/sql/connect/common/src/test/resources/query-tests/queries/function_array_sort.json b/sql/connect/common/src/test/resources/query-tests/queries/function_array_sort.json index 406dc54c8cd2f..74a038895b36a 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_array_sort.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_array_sort.json @@ -18,7 +18,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "e" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_array_sort.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_array_sort.proto.bin index 2074caae16384..6ac4fd09dc0ca 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_array_sort.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_array_sort.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_array_sort_with_comparator.json b/sql/connect/common/src/test/resources/query-tests/queries/function_array_sort_with_comparator.json index f8178ddd64aaf..7a36c03476279 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_array_sort_with_comparator.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_array_sort_with_comparator.json @@ -31,7 +31,8 @@ "unresolvedNamedLambdaVariable": { "nameParts": ["y_2"] } - }] + }], + "isInternal": false } }, "arguments": [{ @@ -40,7 +41,8 @@ "nameParts": ["y_2"] }] } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_array_sort_with_comparator.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_array_sort_with_comparator.proto.bin index c506889388c97..77719f7334985 100644 Binary files 
a/sql/connect/common/src/test/resources/query-tests/queries/function_array_sort_with_comparator.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_array_sort_with_comparator.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_array_union.json b/sql/connect/common/src/test/resources/query-tests/queries/function_array_union.json index 7d54079cdb47e..841888bcb1497 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_array_union.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_array_union.json @@ -33,9 +33,11 @@ "literal": { "integer": 3 } - }] + }], + "isInternal": false } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_array_union.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_array_union.proto.bin index fc3d9d7cd0fd1..ee743e69702ae 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_array_union.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_array_union.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_arrays_overlap.json b/sql/connect/common/src/test/resources/query-tests/queries/function_arrays_overlap.json index ce1d288e00d78..73b49b729edfa 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_arrays_overlap.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_arrays_overlap.json @@ -29,9 +29,11 @@ "literal": { "integer": 2 } - }] + }], + "isInternal": false } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_arrays_overlap.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_arrays_overlap.proto.bin index 216f306507d40..b555679123842 100644 Binary files 
a/sql/connect/common/src/test/resources/query-tests/queries/function_arrays_overlap.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_arrays_overlap.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_arrays_zip.json b/sql/connect/common/src/test/resources/query-tests/queries/function_arrays_zip.json index f24ee44835eb4..ce0c0ce75ab9c 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_arrays_zip.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_arrays_zip.json @@ -29,9 +29,11 @@ "literal": { "integer": 20 } - }] + }], + "isInternal": false } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_arrays_zip.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_arrays_zip.proto.bin index 67c867e6d450c..c1baf7e190a15 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_arrays_zip.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_arrays_zip.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_ascii.json b/sql/connect/common/src/test/resources/query-tests/queries/function_ascii.json index 3c4dcb70fead3..4c98e62f9dbb9 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_ascii.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_ascii.json @@ -18,7 +18,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "g" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_ascii.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_ascii.proto.bin index 5989bd3b5c606..cd46fe994aedd 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_ascii.proto.bin and 
b/sql/connect/common/src/test/resources/query-tests/queries/function_ascii.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_asin.json b/sql/connect/common/src/test/resources/query-tests/queries/function_asin.json index 4bf89be753458..6d06a28966590 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_asin.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_asin.json @@ -18,7 +18,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "b" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_asin.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_asin.proto.bin index 737ad789da268..6365765a3a45a 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_asin.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_asin.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_asinh.json b/sql/connect/common/src/test/resources/query-tests/queries/function_asinh.json index 238571b0231c6..95900eaf761fd 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_asinh.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_asinh.json @@ -18,7 +18,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "b" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_asinh.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_asinh.proto.bin index 01ea4675b22eb..f23beb2848fae 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_asinh.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_asinh.proto.bin differ diff --git 
a/sql/connect/common/src/test/resources/query-tests/queries/function_assert_true_with_message.json b/sql/connect/common/src/test/resources/query-tests/queries/function_assert_true_with_message.json index 5520b70a0250b..83cd89e5b9afa 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_assert_true_with_message.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_assert_true_with_message.json @@ -25,13 +25,15 @@ "literal": { "integer": 0 } - }] + }], + "isInternal": false } }, { "literal": { "string": "id negative!" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_assert_true_with_message.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_assert_true_with_message.proto.bin index 6992604efe1b3..923478e910580 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_assert_true_with_message.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_assert_true_with_message.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_atan.json b/sql/connect/common/src/test/resources/query-tests/queries/function_atan.json index 3ae4e7ef188ec..2a873025e6254 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_atan.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_atan.json @@ -18,7 +18,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "b" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_atan.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_atan.proto.bin index b932086941f45..c458d693ca127 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_atan.proto.bin and 
b/sql/connect/common/src/test/resources/query-tests/queries/function_atan.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_atan2.json b/sql/connect/common/src/test/resources/query-tests/queries/function_atan2.json index 7d08116c40ae6..53a03d1324f25 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_atan2.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_atan2.json @@ -30,7 +30,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "b" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_atan2.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_atan2.proto.bin index 372ae8358494e..25a25871185e7 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_atan2.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_atan2.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_atanh.json b/sql/connect/common/src/test/resources/query-tests/queries/function_atanh.json index 8daec8813917e..f78b9f6421e89 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_atanh.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_atanh.json @@ -18,7 +18,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "b" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_atanh.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_atanh.proto.bin index 0aa2f3527ae9c..eebe635bc33e0 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_atanh.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_atanh.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_avg.json 
b/sql/connect/common/src/test/resources/query-tests/queries/function_avg.json index b433f1ea89c29..a3d8868671520 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_avg.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_avg.json @@ -18,7 +18,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "a" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_avg.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_avg.proto.bin index 9d9bd296dbdda..eb1078cd83ba7 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_avg.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_avg.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_base64.json b/sql/connect/common/src/test/resources/query-tests/queries/function_base64.json index 97739dca283ef..8ba9e38dd538e 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_base64.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_base64.json @@ -26,7 +26,8 @@ } } } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_base64.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_base64.proto.bin index fc854d974752b..0ab0e038829c3 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_base64.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_base64.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_bin.json b/sql/connect/common/src/test/resources/query-tests/queries/function_bin.json index 304e56504bad9..56a5ce889d93d 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_bin.json +++ 
b/sql/connect/common/src/test/resources/query-tests/queries/function_bin.json @@ -18,7 +18,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "b" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_bin.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_bin.proto.bin index e8d55fb8d6149..64790b75bc090 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_bin.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_bin.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_bit_and.json b/sql/connect/common/src/test/resources/query-tests/queries/function_bit_and.json index 83b2bcf599f7c..3d156cec87a17 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_bit_and.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_bit_and.json @@ -18,7 +18,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "a" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_bit_and.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_bit_and.proto.bin index ad81bec6f0865..af643d406b41e 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_bit_and.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_bit_and.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_bit_count.json b/sql/connect/common/src/test/resources/query-tests/queries/function_bit_count.json index d5c6b698f7f16..277244f769144 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_bit_count.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_bit_count.json @@ -18,7 +18,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "a" } - }] + 
}], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_bit_count.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_bit_count.proto.bin index 875e17d974e21..1eb39a6fb7346 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_bit_count.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_bit_count.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_bit_get.json b/sql/connect/common/src/test/resources/query-tests/queries/function_bit_get.json index 39425c5e3ffbc..9e2b2d901308f 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_bit_get.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_bit_get.json @@ -22,7 +22,8 @@ "literal": { "integer": 0 } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_bit_get.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_bit_get.proto.bin index cd0f4098374c0..e3769a795a2f4 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_bit_get.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_bit_get.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_bit_length.json b/sql/connect/common/src/test/resources/query-tests/queries/function_bit_length.json index df21871cb535d..1f09a52756b80 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_bit_length.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_bit_length.json @@ -18,7 +18,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "g" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_bit_length.proto.bin 
b/sql/connect/common/src/test/resources/query-tests/queries/function_bit_length.proto.bin index 860c2eaec0e85..13617e95afd73 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_bit_length.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_bit_length.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_bit_or.json b/sql/connect/common/src/test/resources/query-tests/queries/function_bit_or.json index c8e1b2acfe4e0..be21d9557e076 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_bit_or.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_bit_or.json @@ -18,7 +18,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "a" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_bit_or.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_bit_or.proto.bin index a52907474fb96..9e4ef6d43a175 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_bit_or.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_bit_or.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_bit_xor.json b/sql/connect/common/src/test/resources/query-tests/queries/function_bit_xor.json index 463e6fc5322f2..250d2d97c5b0b 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_bit_xor.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_bit_xor.json @@ -18,7 +18,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "a" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_bit_xor.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_bit_xor.proto.bin index c4a9a5e654f2b..33f0570e72f10 100644 Binary files 
a/sql/connect/common/src/test/resources/query-tests/queries/function_bit_xor.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_bit_xor.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_bitmap_bit_position.json b/sql/connect/common/src/test/resources/query-tests/queries/function_bitmap_bit_position.json index 8956c3f303fea..1e8a60bae7e3d 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_bitmap_bit_position.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_bitmap_bit_position.json @@ -18,7 +18,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "id" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_bitmap_bit_position.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_bitmap_bit_position.proto.bin index 8dcf7b1718d4b..849fa465d39db 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_bitmap_bit_position.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_bitmap_bit_position.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_bitmap_bucket_number.json b/sql/connect/common/src/test/resources/query-tests/queries/function_bitmap_bucket_number.json index 8956c3f303fea..1e8a60bae7e3d 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_bitmap_bucket_number.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_bitmap_bucket_number.json @@ -18,7 +18,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "id" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_bitmap_bucket_number.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_bitmap_bucket_number.proto.bin index 
8dcf7b1718d4b..849fa465d39db 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_bitmap_bucket_number.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_bitmap_bucket_number.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_bitmap_construct_agg.json b/sql/connect/common/src/test/resources/query-tests/queries/function_bitmap_construct_agg.json index 910702e90e0ed..aac1f57b5d9b6 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_bitmap_construct_agg.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_bitmap_construct_agg.json @@ -18,7 +18,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "id" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_bitmap_construct_agg.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_bitmap_construct_agg.proto.bin index deae4a3aea072..8c57d776b1d63 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_bitmap_construct_agg.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_bitmap_construct_agg.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_bitmap_count.json b/sql/connect/common/src/test/resources/query-tests/queries/function_bitmap_count.json index b2e9f11efbcbd..95095c7c28922 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_bitmap_count.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_bitmap_count.json @@ -18,7 +18,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "bytes" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_bitmap_count.proto.bin 
b/sql/connect/common/src/test/resources/query-tests/queries/function_bitmap_count.proto.bin index a568b2dae4208..216c51d3c4d29 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_bitmap_count.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_bitmap_count.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_bitmap_or_agg.json b/sql/connect/common/src/test/resources/query-tests/queries/function_bitmap_or_agg.json index 23e37246647b9..041ed9cb6330f 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_bitmap_or_agg.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_bitmap_or_agg.json @@ -18,7 +18,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "bytes" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_bitmap_or_agg.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_bitmap_or_agg.proto.bin index d27edc3766288..41309fc648cda 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_bitmap_or_agg.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_bitmap_or_agg.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_bitwise_not.json b/sql/connect/common/src/test/resources/query-tests/queries/function_bitwise_not.json index 7ddf73253e0a3..736751212fa96 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_bitwise_not.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_bitwise_not.json @@ -18,7 +18,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "a" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_bitwise_not.proto.bin 
b/sql/connect/common/src/test/resources/query-tests/queries/function_bitwise_not.proto.bin index bfaefb2a20075..5cfb94c1da38f 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_bitwise_not.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_bitwise_not.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_bool_and.json b/sql/connect/common/src/test/resources/query-tests/queries/function_bool_and.json index 9b58c898242d9..3e013976a418f 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_bool_and.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_bool_and.json @@ -18,7 +18,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "flag" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_bool_and.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_bool_and.proto.bin index 6ea3860027c86..e6e52551364e1 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_bool_and.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_bool_and.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_bool_or.json b/sql/connect/common/src/test/resources/query-tests/queries/function_bool_or.json index 763b019b05f1f..7cf57c57812fd 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_bool_or.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_bool_or.json @@ -18,7 +18,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "flag" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_bool_or.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_bool_or.proto.bin index dd928c8dc8a31..207083a8a12a0 
100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_bool_or.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_bool_or.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_bround.json b/sql/connect/common/src/test/resources/query-tests/queries/function_bround.json index 585a0befb224d..d42711c424c46 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_bround.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_bround.json @@ -22,7 +22,8 @@ "literal": { "integer": 2 } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_bround.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_bround.proto.bin index 8625ccb1a58f1..40e173d9df4a0 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_bround.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_bround.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_btrim.json b/sql/connect/common/src/test/resources/query-tests/queries/function_btrim.json index 3f35d627f9a54..8c3ec028f0a54 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_btrim.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_btrim.json @@ -18,7 +18,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "g" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_btrim.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_btrim.proto.bin index 200dac07a0bb7..e2cbbefb551a4 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_btrim.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_btrim.proto.bin 
differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_btrim_with_specified_trim_string.json b/sql/connect/common/src/test/resources/query-tests/queries/function_btrim_with_specified_trim_string.json index cf0476340ccb3..9f8b77f176f72 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_btrim_with_specified_trim_string.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_btrim_with_specified_trim_string.json @@ -22,7 +22,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "g" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_btrim_with_specified_trim_string.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_btrim_with_specified_trim_string.proto.bin index d7669c93b2b89..25e48c663b97e 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_btrim_with_specified_trim_string.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_btrim_with_specified_trim_string.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_bucket.json b/sql/connect/common/src/test/resources/query-tests/queries/function_bucket.json index 971660144a5bc..002222d2f7765 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_bucket.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_bucket.json @@ -22,7 +22,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "a" } - }] + }], + "isInternal": true } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_bucket.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_bucket.proto.bin index 1b389401f15e6..874d68be22a53 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_bucket.proto.bin and 
b/sql/connect/common/src/test/resources/query-tests/queries/function_bucket.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_call_function.json b/sql/connect/common/src/test/resources/query-tests/queries/function_call_function.json index 0d78dd471f20c..a4c83ac456721 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_call_function.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_call_function.json @@ -19,7 +19,8 @@ "unparsedIdentifier": "g" } }], - "isUserDefinedFunction": true + "isUserDefinedFunction": true, + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_call_function.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_call_function.proto.bin index aee05767813f9..bde6f21637bea 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_call_function.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_call_function.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_cardinality.json b/sql/connect/common/src/test/resources/query-tests/queries/function_cardinality.json index e2b3dd0428793..82b5dadd84073 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_cardinality.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_cardinality.json @@ -18,7 +18,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "f" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_cardinality.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_cardinality.proto.bin index 54c8cfe843433..232a89d09c6e8 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_cardinality.proto.bin and 
b/sql/connect/common/src/test/resources/query-tests/queries/function_cardinality.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_ceil.json b/sql/connect/common/src/test/resources/query-tests/queries/function_ceil.json index 5a9961ab47f55..d6f4769366fe7 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_ceil.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_ceil.json @@ -18,7 +18,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "b" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_ceil.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_ceil.proto.bin index 3761deb1663a2..2b18da5724b81 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_ceil.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_ceil.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_ceil_scale.json b/sql/connect/common/src/test/resources/query-tests/queries/function_ceil_scale.json index bda5e85924c30..7413bb77656f5 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_ceil_scale.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_ceil_scale.json @@ -22,7 +22,8 @@ "literal": { "integer": 2 } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_ceil_scale.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_ceil_scale.proto.bin index 8db402ac167e0..a658e6f879f89 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_ceil_scale.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_ceil_scale.proto.bin differ diff --git 
a/sql/connect/common/src/test/resources/query-tests/queries/function_ceiling.json b/sql/connect/common/src/test/resources/query-tests/queries/function_ceiling.json index 99726305e8524..878003697424f 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_ceiling.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_ceiling.json @@ -18,7 +18,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "b" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_ceiling.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_ceiling.proto.bin index cc91ac246a57c..57b7616c17930 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_ceiling.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_ceiling.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_ceiling_scale.json b/sql/connect/common/src/test/resources/query-tests/queries/function_ceiling_scale.json index c0b0742b12157..c1fdfb5dfad50 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_ceiling_scale.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_ceiling_scale.json @@ -22,7 +22,8 @@ "literal": { "integer": 2 } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_ceiling_scale.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_ceiling_scale.proto.bin index 30efc42b9d2bc..ba54d5d4d1edc 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_ceiling_scale.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_ceiling_scale.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_char.json 
b/sql/connect/common/src/test/resources/query-tests/queries/function_char.json index 593139a0a584a..70f6fbcd475f6 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_char.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_char.json @@ -18,7 +18,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "a" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_char.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_char.proto.bin index 21c3dad55657b..df7fb55ab4ab0 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_char.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_char.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_char_length.json b/sql/connect/common/src/test/resources/query-tests/queries/function_char_length.json index 3e408260d7020..94323cf8e727e 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_char_length.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_char_length.json @@ -18,7 +18,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "g" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_char_length.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_char_length.proto.bin index 7f290c6ddc623..93096e7ff4051 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_char_length.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_char_length.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_character_length.json b/sql/connect/common/src/test/resources/query-tests/queries/function_character_length.json index ad12dde8a956c..286b1e9d6040f 
100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_character_length.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_character_length.json @@ -18,7 +18,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "g" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_character_length.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_character_length.proto.bin index f1762971d4eca..12932bdb3087d 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_character_length.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_character_length.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_chr.json b/sql/connect/common/src/test/resources/query-tests/queries/function_chr.json index 28366f87e10d7..a7d551a94c194 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_chr.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_chr.json @@ -18,7 +18,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "a" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_chr.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_chr.proto.bin index dc665d294ecb6..7b51a92f63aae 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_chr.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_chr.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_coalesce.json b/sql/connect/common/src/test/resources/query-tests/queries/function_coalesce.json index 497922b5df75c..523f853552b5e 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_coalesce.json +++ 
b/sql/connect/common/src/test/resources/query-tests/queries/function_coalesce.json @@ -22,7 +22,8 @@ "literal": { "integer": 3 } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_coalesce.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_coalesce.proto.bin index ec871018489c2..6ceb52f4de47e 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_coalesce.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_coalesce.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_collate.json b/sql/connect/common/src/test/resources/query-tests/queries/function_collate.json index 8bb6c2c4c3726..f1af032978d9e 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_collate.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_collate.json @@ -22,7 +22,8 @@ "literal": { "string": "UNICODE" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_collate.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_collate.proto.bin index dda4a00a395b6..17ce43c3de332 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_collate.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_collate.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_collation.json b/sql/connect/common/src/test/resources/query-tests/queries/function_collation.json index dac04b3b9858f..50ad1268ea9d6 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_collation.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_collation.json @@ -18,7 +18,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "g" } - }] + }], + "isInternal": 
false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_collation.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_collation.proto.bin index 739994a486026..3cb87236be855 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_collation.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_collation.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_collect_list.json b/sql/connect/common/src/test/resources/query-tests/queries/function_collect_list.json index c5bae4baef352..81210507bc861 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_collect_list.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_collect_list.json @@ -18,7 +18,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "a" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_collect_list.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_collect_list.proto.bin index e3827b9f650ae..a66294b2c475d 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_collect_list.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_collect_list.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_collect_set.json b/sql/connect/common/src/test/resources/query-tests/queries/function_collect_set.json index 615386d050e14..4ebf94c214391 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_collect_set.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_collect_set.json @@ -18,7 +18,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "a" } - }] + }], + "isInternal": false } }] } diff --git 
a/sql/connect/common/src/test/resources/query-tests/queries/function_collect_set.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_collect_set.proto.bin index 5fb97f27d25b6..abbd935086a33 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_collect_set.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_collect_set.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_concat.json b/sql/connect/common/src/test/resources/query-tests/queries/function_concat.json index bad1ad6f3b90e..cd3702e86b3dc 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_concat.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_concat.json @@ -29,7 +29,8 @@ "literal": { "integer": 2 } - }] + }], + "isInternal": false } }, { "unresolvedFunction": { @@ -42,9 +43,11 @@ "literal": { "integer": 40 } - }] + }], + "isInternal": false } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_concat.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_concat.proto.bin index 7411f55f14747..87076f350fe42 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_concat.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_concat.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_concat_ws.json b/sql/connect/common/src/test/resources/query-tests/queries/function_concat_ws.json index b9ba89b42185c..9468c641c0a81 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_concat_ws.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_concat_ws.json @@ -30,7 +30,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "id" } - }] + }], + "isInternal": false } }] } diff --git 
a/sql/connect/common/src/test/resources/query-tests/queries/function_concat_ws.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_concat_ws.proto.bin index 2fbc4f7090448..34d0bbdeb8794 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_concat_ws.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_concat_ws.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_contains.json b/sql/connect/common/src/test/resources/query-tests/queries/function_contains.json index b7cb12d9aa9a3..3a10154044dab 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_contains.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_contains.json @@ -22,7 +22,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "g" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_contains.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_contains.proto.bin index 8864968a9dc3b..80eeae9cd02c2 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_contains.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_contains.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_conv.json b/sql/connect/common/src/test/resources/query-tests/queries/function_conv.json index c6734936bfcd1..9e65edfed6ee6 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_conv.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_conv.json @@ -26,7 +26,8 @@ "literal": { "integer": 16 } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_conv.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_conv.proto.bin 
index 373b997b79240..3161da1cbca42 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_conv.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_conv.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_convert_timezone_with_source_time_zone.json b/sql/connect/common/src/test/resources/query-tests/queries/function_convert_timezone_with_source_time_zone.json index b27d7e2b55fae..6d0b24f2c8c72 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_convert_timezone_with_source_time_zone.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_convert_timezone_with_source_time_zone.json @@ -26,7 +26,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "t" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_convert_timezone_with_source_time_zone.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_convert_timezone_with_source_time_zone.proto.bin index 8ef4e3bdce29e..26e096c4c3f20 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_convert_timezone_with_source_time_zone.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_convert_timezone_with_source_time_zone.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_convert_timezone_without_source_time_zone.json b/sql/connect/common/src/test/resources/query-tests/queries/function_convert_timezone_without_source_time_zone.json index b072c89d42bd1..40f599741a9d2 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_convert_timezone_without_source_time_zone.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_convert_timezone_without_source_time_zone.json @@ -22,7 +22,8 @@ "unresolvedAttribute": { "unparsedIdentifier": 
"t" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_convert_timezone_without_source_time_zone.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_convert_timezone_without_source_time_zone.proto.bin index c6d1db9b8fb1d..a7f5317512a32 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_convert_timezone_without_source_time_zone.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_convert_timezone_without_source_time_zone.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_corr.json b/sql/connect/common/src/test/resources/query-tests/queries/function_corr.json index 6fadb0385622b..c7c44457d4e0d 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_corr.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_corr.json @@ -22,7 +22,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "b" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_corr.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_corr.proto.bin index fdeeb4fd12d19..5aac92504c0bd 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_corr.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_corr.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_cos.json b/sql/connect/common/src/test/resources/query-tests/queries/function_cos.json index f7072dff03404..f9ee2077f33a0 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_cos.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_cos.json @@ -18,7 +18,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "b" } - }] + }], + "isInternal": false } }] } diff 
--git a/sql/connect/common/src/test/resources/query-tests/queries/function_cos.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_cos.proto.bin index 09fd198b097c0..f7592d6ab63c9 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_cos.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_cos.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_cosh.json b/sql/connect/common/src/test/resources/query-tests/queries/function_cosh.json index 3bcab61d37a0d..78b0d73b977ab 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_cosh.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_cosh.json @@ -18,7 +18,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "b" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_cosh.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_cosh.proto.bin index 54d5da8fabfa6..5619d48d20397 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_cosh.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_cosh.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_cot.json b/sql/connect/common/src/test/resources/query-tests/queries/function_cot.json index 62ce963fa8737..1e521c2e6d7f6 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_cot.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_cot.json @@ -18,7 +18,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "b" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_cot.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_cot.proto.bin index 
e79c32660a772..e8ba807d9506a 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_cot.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_cot.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_count.json b/sql/connect/common/src/test/resources/query-tests/queries/function_count.json index 126a0ca242c52..613ba5510828f 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_count.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_count.json @@ -18,7 +18,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "a" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_count.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_count.proto.bin index 6c87a809ad0c4..1e9b69069e05d 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_count.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_count.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_countDistinct.json b/sql/connect/common/src/test/resources/query-tests/queries/function_countDistinct.json index eb211ceb239aa..6a844d694ae8f 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_countDistinct.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_countDistinct.json @@ -23,7 +23,8 @@ "unparsedIdentifier": "g" } }], - "isDistinct": true + "isDistinct": true, + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_countDistinct.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_countDistinct.proto.bin index 591e2300ec689..b25c393cea048 100644 Binary files 
a/sql/connect/common/src/test/resources/query-tests/queries/function_countDistinct.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_countDistinct.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_count_if.json b/sql/connect/common/src/test/resources/query-tests/queries/function_count_if.json index 669477bbc5dd8..f19ce9ced1a5d 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_count_if.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_count_if.json @@ -25,9 +25,11 @@ "literal": { "integer": 0 } - }] + }], + "isInternal": false } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_count_if.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_count_if.proto.bin index 07c65ebaa4293..97306ae5253e4 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_count_if.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_count_if.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_count_min_sketch.json b/sql/connect/common/src/test/resources/query-tests/queries/function_count_min_sketch.json index 94be79dcc33e5..badc965380235 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_count_min_sketch.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_count_min_sketch.json @@ -30,7 +30,8 @@ "literal": { "integer": 11 } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_count_min_sketch.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_count_min_sketch.proto.bin index 11bcae8062e82..d2bc25513fde3 100644 Binary files 
a/sql/connect/common/src/test/resources/query-tests/queries/function_count_min_sketch.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_count_min_sketch.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_count_typed.json b/sql/connect/common/src/test/resources/query-tests/queries/function_count_typed.json index 1c5df90b79cd1..db67d7e86bd46 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_count_typed.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_count_typed.json @@ -18,7 +18,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "a" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_count_typed.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_count_typed.proto.bin index 44b613eb40c6f..852290d484be3 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_count_typed.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_count_typed.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_covar_pop.json b/sql/connect/common/src/test/resources/query-tests/queries/function_covar_pop.json index 3c4df70a5fbfc..294d055796e08 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_covar_pop.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_covar_pop.json @@ -22,7 +22,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "b" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_covar_pop.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_covar_pop.proto.bin index 4a7202f15e768..09939663ce3bf 100644 Binary files 
a/sql/connect/common/src/test/resources/query-tests/queries/function_covar_pop.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_covar_pop.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_covar_samp.json b/sql/connect/common/src/test/resources/query-tests/queries/function_covar_samp.json index 7c723069e4671..f366b2f62ae50 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_covar_samp.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_covar_samp.json @@ -22,7 +22,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "b" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_covar_samp.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_covar_samp.proto.bin index ebff687730e35..c8ef18e85fe67 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_covar_samp.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_covar_samp.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_crc32.json b/sql/connect/common/src/test/resources/query-tests/queries/function_crc32.json index 1892a9af85d97..7ab879e648a6c 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_crc32.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_crc32.json @@ -26,7 +26,8 @@ } } } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_crc32.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_crc32.proto.bin index 54ad14dedae4e..07cb4138077d1 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_crc32.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_crc32.proto.bin differ 
diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_csc.json b/sql/connect/common/src/test/resources/query-tests/queries/function_csc.json index 88504ed9c5280..68cc4e0096f4b 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_csc.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_csc.json @@ -18,7 +18,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "b" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_csc.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_csc.proto.bin index 0ed5022a73adf..8bf716cbab8a1 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_csc.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_csc.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_cume_dist.json b/sql/connect/common/src/test/resources/query-tests/queries/function_cume_dist.json index ac48841199075..a2978d3242ca6 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_cume_dist.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_cume_dist.json @@ -15,7 +15,8 @@ "window": { "windowFunction": { "unresolvedFunction": { - "functionName": "cume_dist" + "functionName": "cume_dist", + "isInternal": false } }, "partitionSpec": [{ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_cume_dist.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_cume_dist.proto.bin index 7578245aabe3a..1dea3404d5a55 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_cume_dist.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_cume_dist.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_curdate.json 
b/sql/connect/common/src/test/resources/query-tests/queries/function_curdate.json index c344f5271704d..5b49d10fb5c56 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_curdate.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_curdate.json @@ -13,7 +13,8 @@ }, "expressions": [{ "unresolvedFunction": { - "functionName": "curdate" + "functionName": "curdate", + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_curdate.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_curdate.proto.bin index 6ec58b57c2a7a..b307dc12a647e 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_curdate.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_curdate.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_current_catalog.json b/sql/connect/common/src/test/resources/query-tests/queries/function_current_catalog.json index 27c7b23111908..ff962a03b8053 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_current_catalog.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_current_catalog.json @@ -13,7 +13,8 @@ }, "expressions": [{ "unresolvedFunction": { - "functionName": "current_catalog" + "functionName": "current_catalog", + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_current_catalog.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_current_catalog.proto.bin index bb25a49935482..b1db0f9bd74d8 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_current_catalog.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_current_catalog.proto.bin differ diff --git 
a/sql/connect/common/src/test/resources/query-tests/queries/function_current_database.json b/sql/connect/common/src/test/resources/query-tests/queries/function_current_database.json index dfa59fd5fe50a..1624c30ce4000 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_current_database.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_current_database.json @@ -13,7 +13,8 @@ }, "expressions": [{ "unresolvedFunction": { - "functionName": "current_database" + "functionName": "current_database", + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_current_database.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_current_database.proto.bin index fdb11c9c8bd06..541891bc4418b 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_current_database.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_current_database.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_current_date.json b/sql/connect/common/src/test/resources/query-tests/queries/function_current_date.json index 6dab8c39d626c..3f3c9c7dba5c0 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_current_date.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_current_date.json @@ -13,7 +13,8 @@ }, "expressions": [{ "unresolvedFunction": { - "functionName": "current_date" + "functionName": "current_date", + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_current_date.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_current_date.proto.bin index f32c3f541c4c7..d6b94df786293 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_current_date.proto.bin and 
b/sql/connect/common/src/test/resources/query-tests/queries/function_current_date.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_current_schema.json b/sql/connect/common/src/test/resources/query-tests/queries/function_current_schema.json index 01d5126a74b39..f8ac56d6398f0 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_current_schema.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_current_schema.json @@ -13,7 +13,8 @@ }, "expressions": [{ "unresolvedFunction": { - "functionName": "current_schema" + "functionName": "current_schema", + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_current_schema.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_current_schema.proto.bin index 9687afe89a50b..e083896cdfeae 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_current_schema.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_current_schema.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_current_timestamp.json b/sql/connect/common/src/test/resources/query-tests/queries/function_current_timestamp.json index 16af5eb9ba084..bea08c2ee6af0 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_current_timestamp.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_current_timestamp.json @@ -13,7 +13,8 @@ }, "expressions": [{ "unresolvedFunction": { - "functionName": "current_timestamp" + "functionName": "current_timestamp", + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_current_timestamp.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_current_timestamp.proto.bin index 5a1f3de6c3a9a..8691a0468fa82 100644 Binary files 
a/sql/connect/common/src/test/resources/query-tests/queries/function_current_timestamp.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_current_timestamp.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_current_timezone.json b/sql/connect/common/src/test/resources/query-tests/queries/function_current_timezone.json index 082d7f5bae6f6..d3807c4dbbe92 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_current_timezone.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_current_timezone.json @@ -13,7 +13,8 @@ }, "expressions": [{ "unresolvedFunction": { - "functionName": "current_timezone" + "functionName": "current_timezone", + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_current_timezone.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_current_timezone.proto.bin index a780830516bc0..bf06690e07afc 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_current_timezone.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_current_timezone.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_current_user.json b/sql/connect/common/src/test/resources/query-tests/queries/function_current_user.json index 30ddb80f884c2..f7bf8d9f91509 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_current_user.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_current_user.json @@ -13,7 +13,8 @@ }, "expressions": [{ "unresolvedFunction": { - "functionName": "current_user" + "functionName": "current_user", + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_current_user.proto.bin 
b/sql/connect/common/src/test/resources/query-tests/queries/function_current_user.proto.bin index 0a79078fd7097..df4ea4e2cc4da 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_current_user.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_current_user.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_date_add.json b/sql/connect/common/src/test/resources/query-tests/queries/function_date_add.json index f81ad3335242c..1c022cdafbc3b 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_date_add.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_date_add.json @@ -22,7 +22,8 @@ "literal": { "integer": 2 } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_date_add.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_date_add.proto.bin index f4dbc16b05c1d..d1e296f485cb7 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_date_add.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_date_add.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_date_diff.json b/sql/connect/common/src/test/resources/query-tests/queries/function_date_diff.json index b6094ff8734d6..a4fc25f1bf0db 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_date_diff.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_date_diff.json @@ -33,9 +33,11 @@ "literal": { "integer": 10 } - }] + }], + "isInternal": false } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_date_diff.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_date_diff.proto.bin index 5621af09474d8..bd37bb76df8e8 
100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_date_diff.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_date_diff.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_date_format.json b/sql/connect/common/src/test/resources/query-tests/queries/function_date_format.json index 9b3d469ed4e98..6ef48b7f015a3 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_date_format.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_date_format.json @@ -22,7 +22,8 @@ "literal": { "string": "yyyy-MM-dd" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_date_format.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_date_format.proto.bin index 7226c20974b2a..f28036fdcc2d9 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_date_format.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_date_format.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_date_from_unix_date.json b/sql/connect/common/src/test/resources/query-tests/queries/function_date_from_unix_date.json index ada0747743b78..08769597e4d7c 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_date_from_unix_date.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_date_from_unix_date.json @@ -18,7 +18,8 @@ "literal": { "integer": 10 } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_date_from_unix_date.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_date_from_unix_date.proto.bin index 5cbd76eda9074..ba9b9047893d7 100644 Binary files 
a/sql/connect/common/src/test/resources/query-tests/queries/function_date_from_unix_date.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_date_from_unix_date.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_date_part.json b/sql/connect/common/src/test/resources/query-tests/queries/function_date_part.json index 5e8d075c4e2d0..51590b395c19a 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_date_part.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_date_part.json @@ -22,7 +22,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "d" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_date_part.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_date_part.proto.bin index 368ecb676c1fe..cddd5c1954888 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_date_part.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_date_part.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_date_sub.json b/sql/connect/common/src/test/resources/query-tests/queries/function_date_sub.json index f1dde0902a20a..2d74935b65010 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_date_sub.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_date_sub.json @@ -22,7 +22,8 @@ "literal": { "integer": 2 } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_date_sub.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_date_sub.proto.bin index 43b630c27ed45..a033be1ab6e41 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_date_sub.proto.bin and 
b/sql/connect/common/src/test/resources/query-tests/queries/function_date_sub.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_date_trunc.json b/sql/connect/common/src/test/resources/query-tests/queries/function_date_trunc.json index 363da9b9b9006..649c211af8465 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_date_trunc.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_date_trunc.json @@ -22,7 +22,8 @@ "literal": { "string": "minute" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_date_trunc.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_date_trunc.proto.bin index f037fb8d34a56..fad72e5dc4eb8 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_date_trunc.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_date_trunc.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_dateadd.json b/sql/connect/common/src/test/resources/query-tests/queries/function_dateadd.json index 2658c724d287a..e8d272bafa922 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_dateadd.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_dateadd.json @@ -22,7 +22,8 @@ "literal": { "integer": 2 } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_dateadd.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_dateadd.proto.bin index e72a77f0e2394..1dcbd5fc64ca2 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_dateadd.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_dateadd.proto.bin differ diff --git 
a/sql/connect/common/src/test/resources/query-tests/queries/function_datediff.json b/sql/connect/common/src/test/resources/query-tests/queries/function_datediff.json index b5ef560486d0d..e9e9dccf52f95 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_datediff.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_datediff.json @@ -33,9 +33,11 @@ "literal": { "integer": 10 } - }] + }], + "isInternal": false } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_datediff.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_datediff.proto.bin index 02e917b406838..bbcd3b621848a 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_datediff.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_datediff.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_datepart.json b/sql/connect/common/src/test/resources/query-tests/queries/function_datepart.json index cc4dca8674264..42b21d6e9c831 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_datepart.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_datepart.json @@ -22,7 +22,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "d" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_datepart.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_datepart.proto.bin index 9d58fd3474d70..470e4796515fe 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_datepart.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_datepart.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_day.json 
b/sql/connect/common/src/test/resources/query-tests/queries/function_day.json index c1e4b4d13fb7a..712ad56df109d 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_day.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_day.json @@ -18,7 +18,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "d" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_day.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_day.proto.bin index e72a4a354c31d..9115ae09ce8fc 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_day.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_day.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_dayname.json b/sql/connect/common/src/test/resources/query-tests/queries/function_dayname.json index 7898aa53deb89..a27513443fe4a 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_dayname.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_dayname.json @@ -18,7 +18,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "d" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_dayname.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_dayname.proto.bin index 2809f3b9b7a11..370737bbc1fb7 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_dayname.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_dayname.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_dayofmonth.json b/sql/connect/common/src/test/resources/query-tests/queries/function_dayofmonth.json index 3e453c1f7a652..76b0a6e03f27c 100644 --- 
a/sql/connect/common/src/test/resources/query-tests/queries/function_dayofmonth.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_dayofmonth.json @@ -18,7 +18,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "d" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_dayofmonth.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_dayofmonth.proto.bin index 3a2973e21e5a0..8a63ee68777e8 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_dayofmonth.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_dayofmonth.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_dayofweek.json b/sql/connect/common/src/test/resources/query-tests/queries/function_dayofweek.json index 74715de151e77..7cea8fdc7745d 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_dayofweek.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_dayofweek.json @@ -18,7 +18,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "d" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_dayofweek.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_dayofweek.proto.bin index fceea203c790e..6c9a6252a48ed 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_dayofweek.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_dayofweek.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_dayofyear.json b/sql/connect/common/src/test/resources/query-tests/queries/function_dayofyear.json index d23c6790a47dd..3579113015448 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_dayofyear.json +++ 
b/sql/connect/common/src/test/resources/query-tests/queries/function_dayofyear.json @@ -18,7 +18,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "d" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_dayofyear.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_dayofyear.proto.bin index a526b449ae0a4..ef1d60c231599 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_dayofyear.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_dayofyear.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_days.json b/sql/connect/common/src/test/resources/query-tests/queries/function_days.json index 9e20c48729a30..b9a73878304b6 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_days.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_days.json @@ -18,7 +18,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "a" } - }] + }], + "isInternal": true } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_days.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_days.proto.bin index b0a8472f8c4ff..b81c13479a36f 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_days.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_days.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_decode.json b/sql/connect/common/src/test/resources/query-tests/queries/function_decode.json index 6be60808e64f3..2aaae4466b5fc 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_decode.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_decode.json @@ -30,7 +30,8 @@ "literal": { "string": "UTF-8" } - }] + }], + "isInternal": false } 
}] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_decode.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_decode.proto.bin index 18b8bbcf6a01d..d71f018e81ee8 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_decode.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_decode.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_degrees.json b/sql/connect/common/src/test/resources/query-tests/queries/function_degrees.json index e096b07e4dc6e..f366faf9ac7c5 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_degrees.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_degrees.json @@ -18,7 +18,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "b" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_degrees.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_degrees.proto.bin index e2d264bb2e108..efe6bb3a93cca 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_degrees.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_degrees.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_dense_rank.json b/sql/connect/common/src/test/resources/query-tests/queries/function_dense_rank.json index 46c5e1eaddfc0..2699a863e9755 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_dense_rank.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_dense_rank.json @@ -15,7 +15,8 @@ "window": { "windowFunction": { "unresolvedFunction": { - "functionName": "dense_rank" + "functionName": "dense_rank", + "isInternal": false } }, "partitionSpec": [{ diff --git 
a/sql/connect/common/src/test/resources/query-tests/queries/function_dense_rank.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_dense_rank.proto.bin index 4597e63be8379..bd6861d0514e4 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_dense_rank.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_dense_rank.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_e.json b/sql/connect/common/src/test/resources/query-tests/queries/function_e.json index c99c04a6befdb..12f1ed4146bb1 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_e.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_e.json @@ -13,7 +13,8 @@ }, "expressions": [{ "unresolvedFunction": { - "functionName": "e" + "functionName": "e", + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_e.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_e.proto.bin index 49f6c12fbcc72..e08ad39fe02f7 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_e.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_e.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_element_at.json b/sql/connect/common/src/test/resources/query-tests/queries/function_element_at.json index ef5551440934c..562f84a0abfb3 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_element_at.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_element_at.json @@ -22,7 +22,8 @@ "literal": { "string": "bob" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_element_at.proto.bin 
b/sql/connect/common/src/test/resources/query-tests/queries/function_element_at.proto.bin index 993818c6cb4bf..6cbf58ebaaf63 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_element_at.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_element_at.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_elt.json b/sql/connect/common/src/test/resources/query-tests/queries/function_elt.json index fe7dd29f91a33..0573d25e70879 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_elt.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_elt.json @@ -26,7 +26,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "g" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_elt.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_elt.proto.bin index d719db6f89c37..429c39ca81b26 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_elt.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_elt.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_encode.json b/sql/connect/common/src/test/resources/query-tests/queries/function_encode.json index 92e95f2c946d0..51d0c871f3494 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_encode.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_encode.json @@ -22,7 +22,8 @@ "literal": { "string": "UTF-8" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_encode.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_encode.proto.bin index 9644825af470b..a996b0e73435b 100644 Binary files 
a/sql/connect/common/src/test/resources/query-tests/queries/function_encode.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_encode.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_endswith.json b/sql/connect/common/src/test/resources/query-tests/queries/function_endswith.json index 1f7943f5116ba..2577b62ccbca8 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_endswith.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_endswith.json @@ -22,7 +22,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "g" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_endswith.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_endswith.proto.bin index 2dfef1c6d86bc..89619bc8c38ba 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_endswith.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_endswith.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_equal_null.json b/sql/connect/common/src/test/resources/query-tests/queries/function_equal_null.json index bc53edfa25f5e..50963dd1c9d0a 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_equal_null.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_equal_null.json @@ -22,7 +22,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "g" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_equal_null.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_equal_null.proto.bin index 2855f3ebbf3c3..ad21a1e96b9de 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_equal_null.proto.bin and 
b/sql/connect/common/src/test/resources/query-tests/queries/function_equal_null.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_every.json b/sql/connect/common/src/test/resources/query-tests/queries/function_every.json index ffefb8cf1103f..31d78cf56c07e 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_every.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_every.json @@ -18,7 +18,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "flag" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_every.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_every.proto.bin index 1b28782200223..6ccfd9c879923 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_every.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_every.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_exists.json b/sql/connect/common/src/test/resources/query-tests/queries/function_exists.json index 3ae49d13c5fc6..caa2c6170f251 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_exists.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_exists.json @@ -31,14 +31,16 @@ "literal": { "integer": 10 } - }] + }], + "isInternal": false } }, "arguments": [{ "nameParts": ["x_1"] }] } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_exists.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_exists.proto.bin index d808227fdc659..ac30b86a396da 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_exists.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_exists.proto.bin differ diff --git 
a/sql/connect/common/src/test/resources/query-tests/queries/function_exp.json b/sql/connect/common/src/test/resources/query-tests/queries/function_exp.json index d317efef75eee..a814a7f3a4937 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_exp.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_exp.json @@ -18,7 +18,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "b" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_exp.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_exp.proto.bin index 7def20c94df00..0803af7c262c1 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_exp.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_exp.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_explode.json b/sql/connect/common/src/test/resources/query-tests/queries/function_explode.json index 35ad40ccdd04f..8c0094b3a39d5 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_explode.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_explode.json @@ -18,7 +18,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "e" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_explode.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_explode.proto.bin index 9c15f942bb11d..fb5d06b544709 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_explode.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_explode.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_explode_outer.json b/sql/connect/common/src/test/resources/query-tests/queries/function_explode_outer.json index 
efd7f4b524d47..12c753a38be70 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_explode_outer.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_explode_outer.json @@ -18,7 +18,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "e" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_explode_outer.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_explode_outer.proto.bin index 9f2cf9554dd15..8e9ecc3cfacb0 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_explode_outer.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_explode_outer.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_expm1.json b/sql/connect/common/src/test/resources/query-tests/queries/function_expm1.json index d425a6de709b7..70cd7b9284654 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_expm1.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_expm1.json @@ -18,7 +18,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "b" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_expm1.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_expm1.proto.bin index 3c310cb04ce3d..daa03048ab5b5 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_expm1.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_expm1.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_extract.json b/sql/connect/common/src/test/resources/query-tests/queries/function_extract.json index 6ccdb2f9d7531..851ae5de2741a 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_extract.json +++ 
b/sql/connect/common/src/test/resources/query-tests/queries/function_extract.json @@ -22,7 +22,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "d" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_extract.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_extract.proto.bin index 91553c3b94bcf..1d47c480d7708 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_extract.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_extract.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_factorial.json b/sql/connect/common/src/test/resources/query-tests/queries/function_factorial.json index 7f13a10480915..e762531e0be86 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_factorial.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_factorial.json @@ -25,9 +25,11 @@ "literal": { "integer": 10 } - }] + }], + "isInternal": false } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_factorial.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_factorial.proto.bin index ac776eb60d2b0..4863eee97ac1c 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_factorial.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_factorial.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_filter.json b/sql/connect/common/src/test/resources/query-tests/queries/function_filter.json index 1c71362f75247..6963b28386740 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_filter.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_filter.json @@ -31,14 +31,16 @@ "literal": { "integer": 
10 } - }] + }], + "isInternal": false } }, "arguments": [{ "nameParts": ["x_1"] }] } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_filter.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_filter.proto.bin index aa776b474a4d6..b669b9ff69c2b 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_filter.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_filter.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_filter_with_pair_input.json b/sql/connect/common/src/test/resources/query-tests/queries/function_filter_with_pair_input.json index f2b85c21af755..94f04a9111a88 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_filter_with_pair_input.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_filter_with_pair_input.json @@ -34,7 +34,8 @@ "literal": { "integer": 10 } - }] + }], + "isInternal": false } }, { "unresolvedFunction": { @@ -47,9 +48,11 @@ "literal": { "integer": 2 } - }] + }], + "isInternal": false } - }] + }], + "isInternal": false } }, "arguments": [{ @@ -58,7 +61,8 @@ "nameParts": ["y_2"] }] } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_filter_with_pair_input.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_filter_with_pair_input.proto.bin index 8cf5f2d65cf29..b927f1c623a44 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_filter_with_pair_input.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_filter_with_pair_input.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_find_in_set.json b/sql/connect/common/src/test/resources/query-tests/queries/function_find_in_set.json index 
538651b52c424..424357c82f13d 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_find_in_set.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_find_in_set.json @@ -22,7 +22,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "g" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_find_in_set.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_find_in_set.proto.bin index 26abfa0e394c3..662aec53059a7 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_find_in_set.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_find_in_set.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_first_value_with_ignore_nulls.json b/sql/connect/common/src/test/resources/query-tests/queries/function_first_value_with_ignore_nulls.json index af55fe44ae8ca..bab44ade0cfaa 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_first_value_with_ignore_nulls.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_first_value_with_ignore_nulls.json @@ -22,7 +22,8 @@ "literal": { "boolean": true } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_first_value_with_ignore_nulls.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_first_value_with_ignore_nulls.proto.bin index 7121c820aa737..6c72610c94265 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_first_value_with_ignore_nulls.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_first_value_with_ignore_nulls.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_first_value_with_respect_nulls.json 
b/sql/connect/common/src/test/resources/query-tests/queries/function_first_value_with_respect_nulls.json index 8276e35893feb..85ef747ef81ba 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_first_value_with_respect_nulls.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_first_value_with_respect_nulls.json @@ -18,7 +18,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "a" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_first_value_with_respect_nulls.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_first_value_with_respect_nulls.proto.bin index b843d52111528..b6c8d358976a8 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_first_value_with_respect_nulls.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_first_value_with_respect_nulls.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_first_with_ignore_nulls.json b/sql/connect/common/src/test/resources/query-tests/queries/function_first_with_ignore_nulls.json index dc33bad3c506a..ec4c9da731d65 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_first_with_ignore_nulls.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_first_with_ignore_nulls.json @@ -22,7 +22,8 @@ "literal": { "boolean": true } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_first_with_ignore_nulls.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_first_with_ignore_nulls.proto.bin index cb029dfd26be9..a995bb05e6f26 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_first_with_ignore_nulls.proto.bin and 
b/sql/connect/common/src/test/resources/query-tests/queries/function_first_with_ignore_nulls.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_first_with_respect_nulls.json b/sql/connect/common/src/test/resources/query-tests/queries/function_first_with_respect_nulls.json index 0e315cc6b1bce..244f2d1f9575e 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_first_with_respect_nulls.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_first_with_respect_nulls.json @@ -22,7 +22,8 @@ "literal": { "boolean": false } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_first_with_respect_nulls.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_first_with_respect_nulls.proto.bin index bf1d48903dfab..eb52e48309944 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_first_with_respect_nulls.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_first_with_respect_nulls.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_flatten.json b/sql/connect/common/src/test/resources/query-tests/queries/function_flatten.json index 1f04630fd5f31..3118b0620c0e3 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_flatten.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_flatten.json @@ -32,11 +32,14 @@ "literal": { "integer": 10 } - }] + }], + "isInternal": false } - }] + }], + "isInternal": false } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_flatten.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_flatten.proto.bin index 9a684850f9cfa..1d930ab05303c 100644 Binary files 
a/sql/connect/common/src/test/resources/query-tests/queries/function_flatten.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_flatten.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_floor.json b/sql/connect/common/src/test/resources/query-tests/queries/function_floor.json index 78924f5f33627..9bf149ea295d9 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_floor.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_floor.json @@ -18,7 +18,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "b" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_floor.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_floor.proto.bin index b52696ca4d00a..1f529ad20219f 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_floor.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_floor.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_floor_scale.json b/sql/connect/common/src/test/resources/query-tests/queries/function_floor_scale.json index 394621e4dd314..af37c822a21df 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_floor_scale.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_floor_scale.json @@ -22,7 +22,8 @@ "literal": { "integer": 2 } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_floor_scale.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_floor_scale.proto.bin index ee0665bab644c..f2510d8540374 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_floor_scale.proto.bin and 
b/sql/connect/common/src/test/resources/query-tests/queries/function_floor_scale.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_forall.json b/sql/connect/common/src/test/resources/query-tests/queries/function_forall.json index 4a4914d6a9b1d..a9084fe59bdb6 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_forall.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_forall.json @@ -31,14 +31,16 @@ "literal": { "integer": 10 } - }] + }], + "isInternal": false } }, "arguments": [{ "nameParts": ["x_1"] }] } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_forall.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_forall.proto.bin index 7fc2821694589..b72863b1db28b 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_forall.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_forall.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_format_number.json b/sql/connect/common/src/test/resources/query-tests/queries/function_format_number.json index daa648c0a599e..6f376efc9fa81 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_format_number.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_format_number.json @@ -22,7 +22,8 @@ "literal": { "integer": 1 } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_format_number.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_format_number.proto.bin index 81e2c4d5fd54d..2f156c4336703 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_format_number.proto.bin and 
b/sql/connect/common/src/test/resources/query-tests/queries/function_format_number.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_from_csv.json b/sql/connect/common/src/test/resources/query-tests/queries/function_from_csv.json index 798e79e6618f5..60242d45866aa 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_from_csv.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_from_csv.json @@ -33,9 +33,11 @@ "literal": { "string": "FAILFAST" } - }] + }], + "isInternal": false } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_from_csv.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_from_csv.proto.bin index 8acd3b619b41e..2a3a14f8914a1 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_from_csv.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_from_csv.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_from_json.json b/sql/connect/common/src/test/resources/query-tests/queries/function_from_json.json index ddfa91abca05e..850ae645c0497 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_from_json.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_from_json.json @@ -22,7 +22,8 @@ "literal": { "string": "{\"type\":\"struct\",\"fields\":[{\"name\":\"id\",\"type\":\"long\",\"nullable\":true,\"metadata\":{}},{\"name\":\"a\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}},{\"name\":\"b\",\"type\":\"double\",\"nullable\":true,\"metadata\":{}}]}" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_from_json.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_from_json.proto.bin index 
ad95d0f2b343d..5a1990c957290 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_from_json.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_from_json.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_from_json_with_json_schema.json b/sql/connect/common/src/test/resources/query-tests/queries/function_from_json_with_json_schema.json index ddfa91abca05e..850ae645c0497 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_from_json_with_json_schema.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_from_json_with_json_schema.json @@ -22,7 +22,8 @@ "literal": { "string": "{\"type\":\"struct\",\"fields\":[{\"name\":\"id\",\"type\":\"long\",\"nullable\":true,\"metadata\":{}},{\"name\":\"a\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}},{\"name\":\"b\",\"type\":\"double\",\"nullable\":true,\"metadata\":{}}]}" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_from_json_with_json_schema.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_from_json_with_json_schema.proto.bin index ad95d0f2b343d..5a1990c957290 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_from_json_with_json_schema.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_from_json_with_json_schema.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_from_unixtime.json b/sql/connect/common/src/test/resources/query-tests/queries/function_from_unixtime.json index 81d6608adb18f..bbe54e42d2fe8 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_from_unixtime.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_from_unixtime.json @@ -18,7 +18,8 @@ "literal": { "long": "1" } - }] 
+ }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_from_unixtime.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_from_unixtime.proto.bin index b1c34caaf62f0..5b92281e1f289 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_from_unixtime.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_from_unixtime.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_from_utc_timestamp.json b/sql/connect/common/src/test/resources/query-tests/queries/function_from_utc_timestamp.json index 5d63fd829f302..ce7d64ebdc541 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_from_utc_timestamp.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_from_utc_timestamp.json @@ -22,7 +22,8 @@ "literal": { "string": "-08:00" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_from_utc_timestamp.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_from_utc_timestamp.proto.bin index 34bf9c64f3a97..c562454e01d11 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_from_utc_timestamp.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_from_utc_timestamp.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_from_xml.json b/sql/connect/common/src/test/resources/query-tests/queries/function_from_xml.json index cfcd40a74b3a7..f137f14baac3a 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_from_xml.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_from_xml.json @@ -22,7 +22,8 @@ "literal": { "string": 
"{\"type\":\"struct\",\"fields\":[{\"name\":\"id\",\"type\":\"long\",\"nullable\":true,\"metadata\":{}},{\"name\":\"a\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}},{\"name\":\"b\",\"type\":\"double\",\"nullable\":true,\"metadata\":{}}]}" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_from_xml.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_from_xml.proto.bin index 1cc3a26c254fb..dc331f2b26074 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_from_xml.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_from_xml.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_from_xml_with_json_schema.json b/sql/connect/common/src/test/resources/query-tests/queries/function_from_xml_with_json_schema.json index cfcd40a74b3a7..f137f14baac3a 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_from_xml_with_json_schema.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_from_xml_with_json_schema.json @@ -22,7 +22,8 @@ "literal": { "string": "{\"type\":\"struct\",\"fields\":[{\"name\":\"id\",\"type\":\"long\",\"nullable\":true,\"metadata\":{}},{\"name\":\"a\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}},{\"name\":\"b\",\"type\":\"double\",\"nullable\":true,\"metadata\":{}}]}" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_from_xml_with_json_schema.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_from_xml_with_json_schema.proto.bin index 1cc3a26c254fb..dc331f2b26074 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_from_xml_with_json_schema.proto.bin and 
b/sql/connect/common/src/test/resources/query-tests/queries/function_from_xml_with_json_schema.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_get.json b/sql/connect/common/src/test/resources/query-tests/queries/function_get.json index 7a2a89447c079..fd8f86b0f518c 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_get.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_get.json @@ -22,7 +22,8 @@ "literal": { "integer": 2 } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_get.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_get.proto.bin index be40df955a407..9561814a9e2d2 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_get.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_get.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_get_json_object.json b/sql/connect/common/src/test/resources/query-tests/queries/function_get_json_object.json index 17adf9230a6eb..505bfcc07bf89 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_get_json_object.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_get_json_object.json @@ -22,7 +22,8 @@ "literal": { "string": "$.device_type" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_get_json_object.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_get_json_object.proto.bin index 08ad8f4f91bad..df7774ef54842 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_get_json_object.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_get_json_object.proto.bin differ diff --git 
a/sql/connect/common/src/test/resources/query-tests/queries/function_getbit.json b/sql/connect/common/src/test/resources/query-tests/queries/function_getbit.json index ef33382022a53..8bb777bfc37b2 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_getbit.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_getbit.json @@ -22,7 +22,8 @@ "literal": { "integer": 0 } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_getbit.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_getbit.proto.bin index 15575e4f7cbb9..b7684864503db 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_getbit.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_getbit.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_greatest.json b/sql/connect/common/src/test/resources/query-tests/queries/function_greatest.json index bf5d50edec84f..92414ef27da5b 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_greatest.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_greatest.json @@ -31,7 +31,8 @@ } } } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_greatest.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_greatest.proto.bin index 44d9d5f8cfb2d..a1ba220f29429 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_greatest.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_greatest.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_hash.json b/sql/connect/common/src/test/resources/query-tests/queries/function_hash.json index 6ef504a006457..04363f49f72b8 100644 --- 
a/sql/connect/common/src/test/resources/query-tests/queries/function_hash.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_hash.json @@ -22,7 +22,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "id" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_hash.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_hash.proto.bin index 284700c4c5ea9..dea61a10eeb52 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_hash.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_hash.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_hex.json b/sql/connect/common/src/test/resources/query-tests/queries/function_hex.json index af9d0dd298277..7cfc7a5647adf 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_hex.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_hex.json @@ -18,7 +18,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "a" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_hex.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_hex.proto.bin index 9d8c3b5e23584..52d2da3ea1e8b 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_hex.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_hex.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_histogram_numeric.json b/sql/connect/common/src/test/resources/query-tests/queries/function_histogram_numeric.json index 548b4977ddc50..2758e1a0638f6 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_histogram_numeric.json +++ 
b/sql/connect/common/src/test/resources/query-tests/queries/function_histogram_numeric.json @@ -22,7 +22,8 @@ "literal": { "integer": 10 } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_histogram_numeric.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_histogram_numeric.proto.bin index 81dbcd476ecbc..465557aa7f48b 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_histogram_numeric.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_histogram_numeric.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_hour.json b/sql/connect/common/src/test/resources/query-tests/queries/function_hour.json index 2621b9f81913c..e13d18f244903 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_hour.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_hour.json @@ -18,7 +18,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "t" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_hour.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_hour.proto.bin index 6cdb50364c133..f57e2333ba855 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_hour.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_hour.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_hours.json b/sql/connect/common/src/test/resources/query-tests/queries/function_hours.json index a72a8656362fd..d521c709a93d2 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_hours.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_hours.json @@ -18,7 +18,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "a" } - 
}] + }], + "isInternal": true } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_hours.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_hours.proto.bin index 6e8203b89e320..1c73b8d22fbb3 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_hours.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_hours.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_hypot.json b/sql/connect/common/src/test/resources/query-tests/queries/function_hypot.json index 2d0d6be0164bc..42fe4ac079293 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_hypot.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_hypot.json @@ -22,7 +22,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "b" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_hypot.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_hypot.proto.bin index 3ad07a2a1ee45..8dce8d0b02a00 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_hypot.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_hypot.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_ifnull.json b/sql/connect/common/src/test/resources/query-tests/queries/function_ifnull.json index 2a426fe6fff3a..577d45ddfe6e8 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_ifnull.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_ifnull.json @@ -22,7 +22,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "g" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_ifnull.proto.bin 
b/sql/connect/common/src/test/resources/query-tests/queries/function_ifnull.proto.bin index c1307c2be8caa..9840001010b01 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_ifnull.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_ifnull.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_ilike.json b/sql/connect/common/src/test/resources/query-tests/queries/function_ilike.json index 46b1b87e03246..aa97c83dba131 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_ilike.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_ilike.json @@ -22,7 +22,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "g" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_ilike.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_ilike.proto.bin index b1c50e3aaf4b2..8dd2e803b2234 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_ilike.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_ilike.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_ilike_with_escape.json b/sql/connect/common/src/test/resources/query-tests/queries/function_ilike_with_escape.json index 6392912efe85d..f3fcfd7a86b4e 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_ilike_with_escape.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_ilike_with_escape.json @@ -26,7 +26,8 @@ "literal": { "string": "/" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_ilike_with_escape.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_ilike_with_escape.proto.bin index de0d89f2c8cee..6a5714f5731c4 100644 
Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_ilike_with_escape.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_ilike_with_escape.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_initcap.json b/sql/connect/common/src/test/resources/query-tests/queries/function_initcap.json index 896bb3d0209da..733efb38bac41 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_initcap.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_initcap.json @@ -18,7 +18,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "g" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_initcap.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_initcap.proto.bin index 72df35bd9b387..9129dbf467c28 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_initcap.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_initcap.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_inline.json b/sql/connect/common/src/test/resources/query-tests/queries/function_inline.json index 4abdac736d0fe..f63e0184608ee 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_inline.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_inline.json @@ -21,9 +21,11 @@ "unresolvedAttribute": { "unparsedIdentifier": "f" } - }] + }], + "isInternal": false } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_inline.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_inline.proto.bin index 261e28e3acaa8..105edc74658df 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_inline.proto.bin 
and b/sql/connect/common/src/test/resources/query-tests/queries/function_inline.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_inline_outer.json b/sql/connect/common/src/test/resources/query-tests/queries/function_inline_outer.json index d74ee83eeff3e..8ad2752040c41 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_inline_outer.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_inline_outer.json @@ -21,9 +21,11 @@ "unresolvedAttribute": { "unparsedIdentifier": "f" } - }] + }], + "isInternal": false } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_inline_outer.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_inline_outer.proto.bin index d757e5afe3050..fc847de167c69 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_inline_outer.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_inline_outer.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_input_file_block_length.json b/sql/connect/common/src/test/resources/query-tests/queries/function_input_file_block_length.json index 2b478579f377d..69f7aae01de2b 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_input_file_block_length.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_input_file_block_length.json @@ -13,7 +13,8 @@ }, "expressions": [{ "unresolvedFunction": { - "functionName": "input_file_block_length" + "functionName": "input_file_block_length", + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_input_file_block_length.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_input_file_block_length.proto.bin index 55684ba7d1b02..6dd26fb322275 100644 Binary 
files a/sql/connect/common/src/test/resources/query-tests/queries/function_input_file_block_length.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_input_file_block_length.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_input_file_block_start.json b/sql/connect/common/src/test/resources/query-tests/queries/function_input_file_block_start.json index a85f58c3b9aaf..d05cfb4126397 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_input_file_block_start.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_input_file_block_start.json @@ -13,7 +13,8 @@ }, "expressions": [{ "unresolvedFunction": { - "functionName": "input_file_block_start" + "functionName": "input_file_block_start", + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_input_file_block_start.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_input_file_block_start.proto.bin index 6fa8027cc82e2..eed5dac21f88a 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_input_file_block_start.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_input_file_block_start.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_input_file_name.json b/sql/connect/common/src/test/resources/query-tests/queries/function_input_file_name.json index 47f2e461eba46..7801d2afa72b1 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_input_file_name.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_input_file_name.json @@ -13,7 +13,8 @@ }, "expressions": [{ "unresolvedFunction": { - "functionName": "input_file_name" + "functionName": "input_file_name", + "isInternal": false } }] } diff --git 
a/sql/connect/common/src/test/resources/query-tests/queries/function_input_file_name.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_input_file_name.proto.bin index c3c6414d5d881..b1eae731498e6 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_input_file_name.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_input_file_name.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_is_variant_null.json b/sql/connect/common/src/test/resources/query-tests/queries/function_is_variant_null.json index 7ae72f8f88e5c..2743ad7263a7f 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_is_variant_null.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_is_variant_null.json @@ -21,9 +21,11 @@ "unresolvedAttribute": { "unparsedIdentifier": "g" } - }] + }], + "isInternal": false } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_is_variant_null.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_is_variant_null.proto.bin index 4d3d2624609e7..2bf328f8a9db5 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_is_variant_null.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_is_variant_null.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_isnan.json b/sql/connect/common/src/test/resources/query-tests/queries/function_isnan.json index f594918ed930a..12d3d19d7797a 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_isnan.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_isnan.json @@ -18,7 +18,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "b" } - }] + }], + "isInternal": false } }] } diff --git 
a/sql/connect/common/src/test/resources/query-tests/queries/function_isnan.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_isnan.proto.bin index 1030abda5b8c2..8c2fad75be346 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_isnan.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_isnan.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_isnotnull.json b/sql/connect/common/src/test/resources/query-tests/queries/function_isnotnull.json index 6ec209e7c24fc..eb011fe4664eb 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_isnotnull.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_isnotnull.json @@ -18,7 +18,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "g" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_isnotnull.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_isnotnull.proto.bin index 1b37308d24753..5bd84491a2b94 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_isnotnull.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_isnotnull.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_isnull.json b/sql/connect/common/src/test/resources/query-tests/queries/function_isnull.json index 7443fc97f42cf..11608a0de2195 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_isnull.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_isnull.json @@ -18,7 +18,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "a" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_isnull.proto.bin 
b/sql/connect/common/src/test/resources/query-tests/queries/function_isnull.proto.bin index 3d1fbd4dedfe7..05217e9e9085b 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_isnull.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_isnull.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_java_method.json b/sql/connect/common/src/test/resources/query-tests/queries/function_java_method.json index 196dd4869577f..828ccaf2c570e 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_java_method.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_java_method.json @@ -26,7 +26,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "g" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_java_method.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_java_method.proto.bin index b5cd2ea0e929b..971395d4b3573 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_java_method.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_java_method.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_json_array_length.json b/sql/connect/common/src/test/resources/query-tests/queries/function_json_array_length.json index 36223a451e3f8..1789c0bef0840 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_json_array_length.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_json_array_length.json @@ -18,7 +18,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "g" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_json_array_length.proto.bin 
b/sql/connect/common/src/test/resources/query-tests/queries/function_json_array_length.proto.bin index 817c803d83033..79aefffa10332 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_json_array_length.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_json_array_length.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_json_object_keys.json b/sql/connect/common/src/test/resources/query-tests/queries/function_json_object_keys.json index f8667a1012a08..9f9f60134485c 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_json_object_keys.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_json_object_keys.json @@ -18,7 +18,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "g" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_json_object_keys.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_json_object_keys.proto.bin index 4be9477ec9185..023ce9ea65266 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_json_object_keys.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_json_object_keys.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_json_tuple.json b/sql/connect/common/src/test/resources/query-tests/queries/function_json_tuple.json index 32de63452c364..4f2b072db3e25 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_json_tuple.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_json_tuple.json @@ -30,7 +30,8 @@ "literal": { "string": "id" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_json_tuple.proto.bin 
b/sql/connect/common/src/test/resources/query-tests/queries/function_json_tuple.proto.bin index e51be42b38d34..60b1685ece0f1 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_json_tuple.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_json_tuple.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_kurtosis.json b/sql/connect/common/src/test/resources/query-tests/queries/function_kurtosis.json index 7399d7a6da388..386c3c8205046 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_kurtosis.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_kurtosis.json @@ -18,7 +18,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "a" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_kurtosis.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_kurtosis.proto.bin index 848a4842e2462..30cd079db28d1 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_kurtosis.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_kurtosis.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_lag.json b/sql/connect/common/src/test/resources/query-tests/queries/function_lag.json index dd1cba376f3c7..f0f38c35c0c5f 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_lag.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_lag.json @@ -35,7 +35,8 @@ "literal": { "boolean": true } - }] + }], + "isInternal": false } }, "partitionSpec": [{ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_lag.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_lag.proto.bin index 7fd85861fb8c8..3abeec8a65bfb 100644 Binary files 
a/sql/connect/common/src/test/resources/query-tests/queries/function_lag.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_lag.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_last_day.json b/sql/connect/common/src/test/resources/query-tests/queries/function_last_day.json index 2cb1635caf47e..cbd450c2859fc 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_last_day.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_last_day.json @@ -18,7 +18,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "t" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_last_day.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_last_day.proto.bin index 1afb5c02ae347..925a919daffc0 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_last_day.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_last_day.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_last_value_with_ignore_nulls.json b/sql/connect/common/src/test/resources/query-tests/queries/function_last_value_with_ignore_nulls.json index e78a456082cbd..18e55564d6ac0 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_last_value_with_ignore_nulls.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_last_value_with_ignore_nulls.json @@ -22,7 +22,8 @@ "literal": { "boolean": true } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_last_value_with_ignore_nulls.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_last_value_with_ignore_nulls.proto.bin index c04f8385995ee..0baa09d55bc20 100644 Binary files 
a/sql/connect/common/src/test/resources/query-tests/queries/function_last_value_with_ignore_nulls.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_last_value_with_ignore_nulls.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_last_value_with_respect_nulls.json b/sql/connect/common/src/test/resources/query-tests/queries/function_last_value_with_respect_nulls.json index cb509997e6533..d1388758fe8a6 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_last_value_with_respect_nulls.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_last_value_with_respect_nulls.json @@ -18,7 +18,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "a" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_last_value_with_respect_nulls.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_last_value_with_respect_nulls.proto.bin index cee9838b70438..80bf3dfcc9abb 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_last_value_with_respect_nulls.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_last_value_with_respect_nulls.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_last_with_ignore_nulls.json b/sql/connect/common/src/test/resources/query-tests/queries/function_last_with_ignore_nulls.json index 6d1be02c78545..cb147d6998478 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_last_with_ignore_nulls.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_last_with_ignore_nulls.json @@ -22,7 +22,8 @@ "literal": { "boolean": true } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_last_with_ignore_nulls.proto.bin 
b/sql/connect/common/src/test/resources/query-tests/queries/function_last_with_ignore_nulls.proto.bin index f6590582c6f5a..2477c1e58803d 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_last_with_ignore_nulls.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_last_with_ignore_nulls.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_last_with_respect_nulls.json b/sql/connect/common/src/test/resources/query-tests/queries/function_last_with_respect_nulls.json index f26e5887ed527..dd68e3d189c03 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_last_with_respect_nulls.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_last_with_respect_nulls.json @@ -22,7 +22,8 @@ "literal": { "boolean": false } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_last_with_respect_nulls.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_last_with_respect_nulls.proto.bin index 69221737be671..a4a02664b5030 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_last_with_respect_nulls.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_last_with_respect_nulls.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_lcase.json b/sql/connect/common/src/test/resources/query-tests/queries/function_lcase.json index a1610815b6c7d..a3a293bc7b1b8 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_lcase.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_lcase.json @@ -18,7 +18,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "g" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_lcase.proto.bin 
b/sql/connect/common/src/test/resources/query-tests/queries/function_lcase.proto.bin index d5627abb0a5d0..e18e7e6781e76 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_lcase.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_lcase.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_lead.json b/sql/connect/common/src/test/resources/query-tests/queries/function_lead.json index ef76586d381dd..aab3c54f48954 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_lead.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_lead.json @@ -32,7 +32,8 @@ "literal": { "boolean": true } - }] + }], + "isInternal": false } }, "partitionSpec": [{ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_lead.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_lead.proto.bin index 9bcdcdb3617a9..f72eea8071743 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_lead.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_lead.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_least.json b/sql/connect/common/src/test/resources/query-tests/queries/function_least.json index 403531c9f6958..d0bcb0b07e404 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_least.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_least.json @@ -31,7 +31,8 @@ } } } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_least.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_least.proto.bin index c9ead802a9616..bcb27b7e02114 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_least.proto.bin and 
b/sql/connect/common/src/test/resources/query-tests/queries/function_least.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_left.json b/sql/connect/common/src/test/resources/query-tests/queries/function_left.json index e629782ba6d5b..dbf9948ee3555 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_left.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_left.json @@ -22,7 +22,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "g" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_left.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_left.proto.bin index 497cf68194e88..c5b4ff7f56763 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_left.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_left.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_len.json b/sql/connect/common/src/test/resources/query-tests/queries/function_len.json index 884f875a961da..3b353abe2eac6 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_len.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_len.json @@ -18,7 +18,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "g" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_len.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_len.proto.bin index 939a6c9c3360b..f787d98c0698e 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_len.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_len.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_length.json 
b/sql/connect/common/src/test/resources/query-tests/queries/function_length.json index f2c3c69255897..cbb6cf10974ad 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_length.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_length.json @@ -18,7 +18,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "g" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_length.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_length.proto.bin index a14f94085b3b6..f4a53e9fa48d4 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_length.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_length.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_levenshtein.json b/sql/connect/common/src/test/resources/query-tests/queries/function_levenshtein.json index 10caaf184fee5..878dc8f564869 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_levenshtein.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_levenshtein.json @@ -22,7 +22,8 @@ "literal": { "string": "bob" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_levenshtein.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_levenshtein.proto.bin index 75b48541b7663..0742bd0585665 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_levenshtein.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_levenshtein.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_levenshtein_with_threshold.json b/sql/connect/common/src/test/resources/query-tests/queries/function_levenshtein_with_threshold.json index 
5cc30772e8e88..aceb63829aa8a 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_levenshtein_with_threshold.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_levenshtein_with_threshold.json @@ -26,7 +26,8 @@ "literal": { "integer": 2 } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_levenshtein_with_threshold.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_levenshtein_with_threshold.proto.bin index 22e1a3328756e..2ba96d65869c6 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_levenshtein_with_threshold.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_levenshtein_with_threshold.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_like.json b/sql/connect/common/src/test/resources/query-tests/queries/function_like.json index 3ce3431e50f8f..23b9578abf5b3 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_like.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_like.json @@ -22,7 +22,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "g" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_like.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_like.proto.bin index d9a13f5c79bce..2f27591f3d68e 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_like.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_like.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_like_with_escape.json b/sql/connect/common/src/test/resources/query-tests/queries/function_like_with_escape.json index 0313398f0ad60..04e27ea1de9e1 100644 --- 
a/sql/connect/common/src/test/resources/query-tests/queries/function_like_with_escape.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_like_with_escape.json @@ -26,7 +26,8 @@ "literal": { "string": "/" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_like_with_escape.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_like_with_escape.proto.bin index cc5fefe193fb9..0668e85576e04 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_like_with_escape.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_like_with_escape.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_ln.json b/sql/connect/common/src/test/resources/query-tests/queries/function_ln.json index ababbc52d088d..4fe4ccd2aa34d 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_ln.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_ln.json @@ -18,7 +18,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "b" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_ln.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_ln.proto.bin index ecb87a1fc4102..3296765cb86cd 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_ln.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_ln.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_localtimestamp.json b/sql/connect/common/src/test/resources/query-tests/queries/function_localtimestamp.json index 68281d2e6d9d1..5683069790afa 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_localtimestamp.json +++ 
b/sql/connect/common/src/test/resources/query-tests/queries/function_localtimestamp.json @@ -13,7 +13,8 @@ }, "expressions": [{ "unresolvedFunction": { - "functionName": "localtimestamp" + "functionName": "localtimestamp", + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_localtimestamp.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_localtimestamp.proto.bin index b1a9e70c7c802..e280db3af4ccd 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_localtimestamp.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_localtimestamp.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_locate.json b/sql/connect/common/src/test/resources/query-tests/queries/function_locate.json index 7939fdd2c7559..285374acfe075 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_locate.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_locate.json @@ -22,7 +22,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "g" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_locate.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_locate.proto.bin index cc7ced9957a52..e43b34bf0665f 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_locate.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_locate.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_locate_with_pos.json b/sql/connect/common/src/test/resources/query-tests/queries/function_locate_with_pos.json index 269f39701608a..87fa8050f4005 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_locate_with_pos.json +++ 
b/sql/connect/common/src/test/resources/query-tests/queries/function_locate_with_pos.json @@ -26,7 +26,8 @@ "literal": { "integer": 10 } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_locate_with_pos.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_locate_with_pos.proto.bin index 162ab0108c132..b11ee8f8ae770 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_locate_with_pos.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_locate_with_pos.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_log.json b/sql/connect/common/src/test/resources/query-tests/queries/function_log.json index ababbc52d088d..4fe4ccd2aa34d 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_log.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_log.json @@ -18,7 +18,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "b" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_log.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_log.proto.bin index ecb87a1fc4102..3296765cb86cd 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_log.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_log.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_log10.json b/sql/connect/common/src/test/resources/query-tests/queries/function_log10.json index 13292d83c4727..bced949b9aaf1 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_log10.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_log10.json @@ -18,7 +18,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "b" } - }] + }], + 
"isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_log10.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_log10.proto.bin index 22d4655a6efbd..1a363b7043dc3 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_log10.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_log10.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_log1p.json b/sql/connect/common/src/test/resources/query-tests/queries/function_log1p.json index 4e9e6847c3c36..95942299457da 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_log1p.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_log1p.json @@ -18,7 +18,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "a" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_log1p.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_log1p.proto.bin index 9a72c377b0cc4..6097a554cee23 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_log1p.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_log1p.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_log2.json b/sql/connect/common/src/test/resources/query-tests/queries/function_log2.json index ec29e154a0e1d..4fe3d22e03415 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_log2.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_log2.json @@ -18,7 +18,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "a" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_log2.proto.bin 
b/sql/connect/common/src/test/resources/query-tests/queries/function_log2.proto.bin index 34e3780650540..39aab70e5ac3e 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_log2.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_log2.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_log_with_base.json b/sql/connect/common/src/test/resources/query-tests/queries/function_log_with_base.json index 6bc2a4ec3335a..3d6e05077dd6c 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_log_with_base.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_log_with_base.json @@ -22,7 +22,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "b" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_log_with_base.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_log_with_base.proto.bin index 2e64e15ed5555..95f9352d73f93 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_log_with_base.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_log_with_base.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_lower.json b/sql/connect/common/src/test/resources/query-tests/queries/function_lower.json index f7fe5beba2c02..e2b4a5816e686 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_lower.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_lower.json @@ -18,7 +18,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "g" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_lower.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_lower.proto.bin index 7c736d93f7729..f9f4d930ebc23 
100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_lower.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_lower.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_lpad.json b/sql/connect/common/src/test/resources/query-tests/queries/function_lpad.json index b9f3e6700bfa4..7cdd2b75dadc6 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_lpad.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_lpad.json @@ -26,7 +26,8 @@ "literal": { "string": "-" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_lpad.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_lpad.proto.bin index 470ab1cc44add..8576f2b0be3b7 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_lpad.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_lpad.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_lpad_binary.json b/sql/connect/common/src/test/resources/query-tests/queries/function_lpad_binary.json index aeb39ba09ad20..60eaa163e632d 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_lpad_binary.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_lpad_binary.json @@ -26,7 +26,8 @@ "literal": { "binary": "DAoPDg==" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_lpad_binary.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_lpad_binary.proto.bin index b4acebb394c7a..a8651574a6bbb 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_lpad_binary.proto.bin and 
b/sql/connect/common/src/test/resources/query-tests/queries/function_lpad_binary.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_ltrim.json b/sql/connect/common/src/test/resources/query-tests/queries/function_ltrim.json index dd3b459520221..1c9ac2e57ffbb 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_ltrim.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_ltrim.json @@ -18,7 +18,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "g" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_ltrim.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_ltrim.proto.bin index 162b6a7337bb9..ed81a583ccaac 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_ltrim.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_ltrim.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_ltrim_with_pattern.json b/sql/connect/common/src/test/resources/query-tests/queries/function_ltrim_with_pattern.json index 01dc81bdae7bc..f67f689d0d950 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_ltrim_with_pattern.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_ltrim_with_pattern.json @@ -22,7 +22,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "g" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_ltrim_with_pattern.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_ltrim_with_pattern.proto.bin index 0cd62226c9716..6a9d3bf7b00ef 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_ltrim_with_pattern.proto.bin and 
b/sql/connect/common/src/test/resources/query-tests/queries/function_ltrim_with_pattern.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_make_date.json b/sql/connect/common/src/test/resources/query-tests/queries/function_make_date.json index a363298dd123a..0f78921e965e5 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_make_date.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_make_date.json @@ -26,7 +26,8 @@ "literal": { "integer": 14 } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_make_date.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_make_date.proto.bin index 0526825fccade..c47ea55dbe6cc 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_make_date.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_make_date.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_make_dt_interval.json b/sql/connect/common/src/test/resources/query-tests/queries/function_make_dt_interval.json index 24cd85d7ae442..66e5da74d5e45 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_make_dt_interval.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_make_dt_interval.json @@ -13,7 +13,8 @@ }, "expressions": [{ "unresolvedFunction": { - "functionName": "make_dt_interval" + "functionName": "make_dt_interval", + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_make_dt_interval.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_make_dt_interval.proto.bin index 09c5a25a10cc2..00bedb03e9dc4 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_make_dt_interval.proto.bin and 
b/sql/connect/common/src/test/resources/query-tests/queries/function_make_dt_interval.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_make_dt_interval_days.json b/sql/connect/common/src/test/resources/query-tests/queries/function_make_dt_interval_days.json index fb65f1fcd9def..9fe3d9dedbc83 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_make_dt_interval_days.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_make_dt_interval_days.json @@ -18,7 +18,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "a" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_make_dt_interval_days.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_make_dt_interval_days.proto.bin index 11c67d98f9f49..70cd3d296df6a 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_make_dt_interval_days.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_make_dt_interval_days.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_make_dt_interval_days_hours.json b/sql/connect/common/src/test/resources/query-tests/queries/function_make_dt_interval_days_hours.json index e08a9c3b083ee..f902694ef7774 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_make_dt_interval_days_hours.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_make_dt_interval_days_hours.json @@ -22,7 +22,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "a" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_make_dt_interval_days_hours.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_make_dt_interval_days_hours.proto.bin index bd16de042f61d..d8107489e1e78 100644 Binary files 
a/sql/connect/common/src/test/resources/query-tests/queries/function_make_dt_interval_days_hours.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_make_dt_interval_days_hours.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_make_dt_interval_days_hours_mins.json b/sql/connect/common/src/test/resources/query-tests/queries/function_make_dt_interval_days_hours_mins.json index a4b8c14538ae3..5a760c1b469fb 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_make_dt_interval_days_hours_mins.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_make_dt_interval_days_hours_mins.json @@ -26,7 +26,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "a" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_make_dt_interval_days_hours_mins.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_make_dt_interval_days_hours_mins.proto.bin index 7595205c6bb0e..89a249a8535fc 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_make_dt_interval_days_hours_mins.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_make_dt_interval_days_hours_mins.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_make_dt_interval_days_hours_mins_secs.json b/sql/connect/common/src/test/resources/query-tests/queries/function_make_dt_interval_days_hours_mins_secs.json index 20eaa7521d3d9..745012b755db3 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_make_dt_interval_days_hours_mins_secs.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_make_dt_interval_days_hours_mins_secs.json @@ -30,7 +30,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "b" } - }] + }], + "isInternal": false } }] } diff --git 
a/sql/connect/common/src/test/resources/query-tests/queries/function_make_dt_interval_days_hours_mins_secs.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_make_dt_interval_days_hours_mins_secs.proto.bin index 6db1bc8b51bea..ba6a2b4e10f82 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_make_dt_interval_days_hours_mins_secs.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_make_dt_interval_days_hours_mins_secs.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_make_interval.json b/sql/connect/common/src/test/resources/query-tests/queries/function_make_interval.json index cdbe616565287..eceeeddd4b15b 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_make_interval.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_make_interval.json @@ -13,7 +13,8 @@ }, "expressions": [{ "unresolvedFunction": { - "functionName": "make_interval" + "functionName": "make_interval", + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_make_interval.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_make_interval.proto.bin index 8d4327eeff426..c569945e7b4c1 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_make_interval.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_make_interval.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_make_interval_years.json b/sql/connect/common/src/test/resources/query-tests/queries/function_make_interval_years.json index e5afa5ec3349a..e49924238ced8 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_make_interval_years.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_make_interval_years.json @@ 
-18,7 +18,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "a" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_make_interval_years.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_make_interval_years.proto.bin index 7be990a47aba6..84ed402ba145f 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_make_interval_years.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_make_interval_years.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_make_interval_years_months.json b/sql/connect/common/src/test/resources/query-tests/queries/function_make_interval_years_months.json index 9de86b70c169e..a7f87a2015bc1 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_make_interval_years_months.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_make_interval_years_months.json @@ -22,7 +22,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "a" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_make_interval_years_months.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_make_interval_years_months.proto.bin index 219cc5a023d45..a886b4d11afd3 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_make_interval_years_months.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_make_interval_years_months.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_make_interval_years_months_weeks.json b/sql/connect/common/src/test/resources/query-tests/queries/function_make_interval_years_months_weeks.json index c387757c6f739..7ca71d529325b 100644 --- 
a/sql/connect/common/src/test/resources/query-tests/queries/function_make_interval_years_months_weeks.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_make_interval_years_months_weeks.json @@ -26,7 +26,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "a" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_make_interval_years_months_weeks.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_make_interval_years_months_weeks.proto.bin index 7fb48227f69c7..af3f2d4b4519f 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_make_interval_years_months_weeks.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_make_interval_years_months_weeks.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_make_interval_years_months_weeks_days.json b/sql/connect/common/src/test/resources/query-tests/queries/function_make_interval_years_months_weeks_days.json index 54274116ee951..d0741256492d1 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_make_interval_years_months_weeks_days.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_make_interval_years_months_weeks_days.json @@ -30,7 +30,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "a" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_make_interval_years_months_weeks_days.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_make_interval_years_months_weeks_days.proto.bin index 58d62b76ac5f9..0d3355c516bdd 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_make_interval_years_months_weeks_days.proto.bin and 
b/sql/connect/common/src/test/resources/query-tests/queries/function_make_interval_years_months_weeks_days.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_make_interval_years_months_weeks_days_hours.json b/sql/connect/common/src/test/resources/query-tests/queries/function_make_interval_years_months_weeks_days_hours.json index a6e343532ec2c..aecddb3dcb53f 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_make_interval_years_months_weeks_days_hours.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_make_interval_years_months_weeks_days_hours.json @@ -34,7 +34,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "a" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_make_interval_years_months_weeks_days_hours.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_make_interval_years_months_weeks_days_hours.proto.bin index 3133c2d497ea6..52b889c8ad81b 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_make_interval_years_months_weeks_days_hours.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_make_interval_years_months_weeks_days_hours.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_make_interval_years_months_weeks_days_hours_mins.json b/sql/connect/common/src/test/resources/query-tests/queries/function_make_interval_years_months_weeks_days_hours_mins.json index d921fda962896..b5ed4c2e8d4e8 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_make_interval_years_months_weeks_days_hours_mins.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_make_interval_years_months_weeks_days_hours_mins.json @@ -38,7 +38,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "a" } - }] + }], + "isInternal": false } }] 
} diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_make_interval_years_months_weeks_days_hours_mins.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_make_interval_years_months_weeks_days_hours_mins.proto.bin index cf2ad98b5c16c..7d06032ee8a45 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_make_interval_years_months_weeks_days_hours_mins.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_make_interval_years_months_weeks_days_hours_mins.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_make_interval_years_months_weeks_days_hours_mins_secs.json b/sql/connect/common/src/test/resources/query-tests/queries/function_make_interval_years_months_weeks_days_hours_mins_secs.json index 09f95bc933b10..372b13ce44621 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_make_interval_years_months_weeks_days_hours_mins_secs.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_make_interval_years_months_weeks_days_hours_mins_secs.json @@ -42,7 +42,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "b" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_make_interval_years_months_weeks_days_hours_mins_secs.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_make_interval_years_months_weeks_days_hours_mins_secs.proto.bin index dca680fd90b61..033d88d328ed1 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_make_interval_years_months_weeks_days_hours_mins_secs.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_make_interval_years_months_weeks_days_hours_mins_secs.proto.bin differ diff --git 
a/sql/connect/common/src/test/resources/query-tests/queries/function_make_timestamp_ltz_with_timezone.json b/sql/connect/common/src/test/resources/query-tests/queries/function_make_timestamp_ltz_with_timezone.json index a58259eefe742..7572d311648c0 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_make_timestamp_ltz_with_timezone.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_make_timestamp_ltz_with_timezone.json @@ -42,7 +42,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "g" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_make_timestamp_ltz_with_timezone.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_make_timestamp_ltz_with_timezone.proto.bin index 3eac6534c6510..ecc490af0f3c4 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_make_timestamp_ltz_with_timezone.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_make_timestamp_ltz_with_timezone.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_make_timestamp_ltz_without_timezone.json b/sql/connect/common/src/test/resources/query-tests/queries/function_make_timestamp_ltz_without_timezone.json index 5c87a856fc6c0..9e10c499ee9ee 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_make_timestamp_ltz_without_timezone.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_make_timestamp_ltz_without_timezone.json @@ -38,7 +38,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "b" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_make_timestamp_ltz_without_timezone.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_make_timestamp_ltz_without_timezone.proto.bin index 
b35f9fd474607..2baa0d4f269d4 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_make_timestamp_ltz_without_timezone.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_make_timestamp_ltz_without_timezone.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_make_timestamp_ntz.json b/sql/connect/common/src/test/resources/query-tests/queries/function_make_timestamp_ntz.json index 4cc4f1a11acdc..6241859ae9269 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_make_timestamp_ntz.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_make_timestamp_ntz.json @@ -38,7 +38,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "b" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_make_timestamp_ntz.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_make_timestamp_ntz.proto.bin index 5a6554443ceca..17dc37d99f63f 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_make_timestamp_ntz.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_make_timestamp_ntz.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_make_timestamp_with_timezone.json b/sql/connect/common/src/test/resources/query-tests/queries/function_make_timestamp_with_timezone.json index a7a5ff132c083..a8d427ae58c0c 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_make_timestamp_with_timezone.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_make_timestamp_with_timezone.json @@ -42,7 +42,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "g" } - }] + }], + "isInternal": false } }] } diff --git 
a/sql/connect/common/src/test/resources/query-tests/queries/function_make_timestamp_with_timezone.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_make_timestamp_with_timezone.proto.bin index 77c0d5961c804..660c4399e35f4 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_make_timestamp_with_timezone.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_make_timestamp_with_timezone.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_make_timestamp_without_timezone.json b/sql/connect/common/src/test/resources/query-tests/queries/function_make_timestamp_without_timezone.json index 286ed33f82e10..8426f3dd45143 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_make_timestamp_without_timezone.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_make_timestamp_without_timezone.json @@ -38,7 +38,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "b" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_make_timestamp_without_timezone.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_make_timestamp_without_timezone.proto.bin index f91efead687a8..348befe79dbf8 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_make_timestamp_without_timezone.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_make_timestamp_without_timezone.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_make_ym_interval.json b/sql/connect/common/src/test/resources/query-tests/queries/function_make_ym_interval.json index dc1848be0cc42..f37ca015eed91 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_make_ym_interval.json +++ 
b/sql/connect/common/src/test/resources/query-tests/queries/function_make_ym_interval.json @@ -13,7 +13,8 @@ }, "expressions": [{ "unresolvedFunction": { - "functionName": "make_ym_interval" + "functionName": "make_ym_interval", + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_make_ym_interval.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_make_ym_interval.proto.bin index eaffc7c237094..0c63f66caf9a0 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_make_ym_interval.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_make_ym_interval.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_make_ym_interval_years.json b/sql/connect/common/src/test/resources/query-tests/queries/function_make_ym_interval_years.json index d789064ad9b8e..81b70cf342205 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_make_ym_interval_years.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_make_ym_interval_years.json @@ -18,7 +18,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "a" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_make_ym_interval_years.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_make_ym_interval_years.proto.bin index 1938b7c53bdd1..7486df5d7530d 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_make_ym_interval_years.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_make_ym_interval_years.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_make_ym_interval_years_months.json b/sql/connect/common/src/test/resources/query-tests/queries/function_make_ym_interval_years_months.json index 
aeffbbb4a1a14..1eb67c528682e 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_make_ym_interval_years_months.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_make_ym_interval_years_months.json @@ -22,7 +22,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "a" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_make_ym_interval_years_months.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_make_ym_interval_years_months.proto.bin index f03f6ecce83de..5a85f5981b948 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_make_ym_interval_years_months.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_make_ym_interval_years_months.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_map.json b/sql/connect/common/src/test/resources/query-tests/queries/function_map.json index ca9d3bf2bcc71..830445d3facf8 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_map.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_map.json @@ -30,7 +30,8 @@ "literal": { "string": "dummy" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_map.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_map.proto.bin index 229a48b75131d..32b233fa939db 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_map.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_map.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_map_concat.json b/sql/connect/common/src/test/resources/query-tests/queries/function_map_concat.json index f56f6cee20ab0..6e076de494412 100644 --- 
a/sql/connect/common/src/test/resources/query-tests/queries/function_map_concat.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_map_concat.json @@ -55,11 +55,14 @@ }, "name": ["b"] } - }] + }], + "isInternal": false } - }] + }], + "isInternal": false } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_map_concat.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_map_concat.proto.bin index 0a76d3a1193ea..6c04979af6da7 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_map_concat.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_map_concat.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_map_contains_key.json b/sql/connect/common/src/test/resources/query-tests/queries/function_map_contains_key.json index 56833f9651023..c25e508806149 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_map_contains_key.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_map_contains_key.json @@ -22,7 +22,8 @@ "literal": { "string": "xyz" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_map_contains_key.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_map_contains_key.proto.bin index e517479020e16..09bd7fa7c4796 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_map_contains_key.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_map_contains_key.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_map_entries.json b/sql/connect/common/src/test/resources/query-tests/queries/function_map_entries.json index 0226506545010..310b3980931a1 100644 --- 
a/sql/connect/common/src/test/resources/query-tests/queries/function_map_entries.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_map_entries.json @@ -18,7 +18,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "f" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_map_entries.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_map_entries.proto.bin index f1451d4ad7ba4..1050436839a0a 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_map_entries.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_map_entries.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_map_filter.json b/sql/connect/common/src/test/resources/query-tests/queries/function_map_filter.json index b50e77c0bf8e9..48c7871244f19 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_map_filter.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_map_filter.json @@ -31,7 +31,8 @@ "literal": { "string": "baz" } - }] + }], + "isInternal": false } }, "arguments": [{ @@ -40,7 +41,8 @@ "nameParts": ["y_2"] }] } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_map_filter.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_map_filter.proto.bin index 7f3d0c31fd6fe..a6143e55099ff 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_map_filter.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_map_filter.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_map_from_arrays.json b/sql/connect/common/src/test/resources/query-tests/queries/function_map_from_arrays.json index 1eb1f7d2ef066..0ce0f1f547174 100644 --- 
a/sql/connect/common/src/test/resources/query-tests/queries/function_map_from_arrays.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_map_from_arrays.json @@ -25,7 +25,8 @@ "literal": { "integer": 2 } - }] + }], + "isInternal": false } }, { "unresolvedFunction": { @@ -38,9 +39,11 @@ "literal": { "string": "two" } - }] + }], + "isInternal": false } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_map_from_arrays.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_map_from_arrays.proto.bin index f5333b1c882bc..e85aba1a35ce7 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_map_from_arrays.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_map_from_arrays.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_map_from_entries.json b/sql/connect/common/src/test/resources/query-tests/queries/function_map_from_entries.json index 1e48a1c2082df..e505d884d85f6 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_map_from_entries.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_map_from_entries.json @@ -34,7 +34,8 @@ "unresolvedNamedLambdaVariable": { "nameParts": ["x_1"] } - }] + }], + "isInternal": false } }, "arguments": [{ @@ -43,9 +44,11 @@ "nameParts": ["y_2"] }] } - }] + }], + "isInternal": false } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_map_from_entries.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_map_from_entries.proto.bin index 0dd0d31350991..88d178e75c4fe 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_map_from_entries.proto.bin and 
b/sql/connect/common/src/test/resources/query-tests/queries/function_map_from_entries.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_map_keys.json b/sql/connect/common/src/test/resources/query-tests/queries/function_map_keys.json index 5af013295cd9f..1e25cd90ac88c 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_map_keys.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_map_keys.json @@ -18,7 +18,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "f" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_map_keys.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_map_keys.proto.bin index ee19968bacc2c..b45fde5acd6bd 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_map_keys.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_map_keys.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_map_values.json b/sql/connect/common/src/test/resources/query-tests/queries/function_map_values.json index 3c5eb651801dc..96ca6b6807963 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_map_values.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_map_values.json @@ -18,7 +18,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "f" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_map_values.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_map_values.proto.bin index 4cd7c488ada48..6708875252f51 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_map_values.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_map_values.proto.bin differ diff --git 
a/sql/connect/common/src/test/resources/query-tests/queries/function_map_zip_with.json b/sql/connect/common/src/test/resources/query-tests/queries/function_map_zip_with.json index d13bd8dce75f3..9e913743009d1 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_map_zip_with.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_map_zip_with.json @@ -53,7 +53,8 @@ } } } - }] + }], + "isInternal": false } }, "arguments": [{ @@ -64,7 +65,8 @@ "nameParts": ["z_3"] }] } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_map_zip_with.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_map_zip_with.proto.bin index 2770b083e32ef..65dba7ed7bbd1 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_map_zip_with.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_map_zip_with.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_mask.json b/sql/connect/common/src/test/resources/query-tests/queries/function_mask.json index c0473466a3e1c..f4c6dad9b6385 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_mask.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_mask.json @@ -18,7 +18,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "g" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_mask.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_mask.proto.bin index 5e94c2675937d..6397cc4fe4adc 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_mask.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_mask.proto.bin differ diff --git 
a/sql/connect/common/src/test/resources/query-tests/queries/function_mask_with_specific_upperChar.json b/sql/connect/common/src/test/resources/query-tests/queries/function_mask_with_specific_upperChar.json index 571d514e72ded..6dc93c69e10e8 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_mask_with_specific_upperChar.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_mask_with_specific_upperChar.json @@ -22,7 +22,8 @@ "literal": { "string": "X" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_mask_with_specific_upperChar.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_mask_with_specific_upperChar.proto.bin index 0f6c4b579c4f5..699d662fcca48 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_mask_with_specific_upperChar.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_mask_with_specific_upperChar.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_mask_with_specific_upperChar_lowerChar.json b/sql/connect/common/src/test/resources/query-tests/queries/function_mask_with_specific_upperChar_lowerChar.json index ae527d70cf162..671a19a8900af 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_mask_with_specific_upperChar_lowerChar.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_mask_with_specific_upperChar_lowerChar.json @@ -26,7 +26,8 @@ "literal": { "string": "x" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_mask_with_specific_upperChar_lowerChar.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_mask_with_specific_upperChar_lowerChar.proto.bin index 5a6b4d7caa60e..c754da066d573 100644 Binary files 
a/sql/connect/common/src/test/resources/query-tests/queries/function_mask_with_specific_upperChar_lowerChar.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_mask_with_specific_upperChar_lowerChar.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_mask_with_specific_upperChar_lowerChar_digitChar.json b/sql/connect/common/src/test/resources/query-tests/queries/function_mask_with_specific_upperChar_lowerChar_digitChar.json index e7fee11d3169e..f12f44b3ecb38 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_mask_with_specific_upperChar_lowerChar_digitChar.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_mask_with_specific_upperChar_lowerChar_digitChar.json @@ -30,7 +30,8 @@ "literal": { "string": "n" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_mask_with_specific_upperChar_lowerChar_digitChar.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_mask_with_specific_upperChar_lowerChar_digitChar.proto.bin index f0a2e7cb643af..473f724d4126c 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_mask_with_specific_upperChar_lowerChar_digitChar.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_mask_with_specific_upperChar_lowerChar_digitChar.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_mask_with_specific_upperChar_lowerChar_digitChar_otherChar.json b/sql/connect/common/src/test/resources/query-tests/queries/function_mask_with_specific_upperChar_lowerChar_digitChar_otherChar.json index d6076ae558bc7..1ddb661b636bb 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_mask_with_specific_upperChar_lowerChar_digitChar_otherChar.json +++ 
b/sql/connect/common/src/test/resources/query-tests/queries/function_mask_with_specific_upperChar_lowerChar_digitChar_otherChar.json @@ -34,7 +34,8 @@ "literal": { "string": "*" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_mask_with_specific_upperChar_lowerChar_digitChar_otherChar.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_mask_with_specific_upperChar_lowerChar_digitChar_otherChar.proto.bin index cb5f090361b20..bf641173435f3 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_mask_with_specific_upperChar_lowerChar_digitChar_otherChar.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_mask_with_specific_upperChar_lowerChar_digitChar_otherChar.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_max.json b/sql/connect/common/src/test/resources/query-tests/queries/function_max.json index b23dd9d14c643..1514b34f8b462 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_max.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_max.json @@ -18,7 +18,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "id" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_max.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_max.proto.bin index 788c9539b5767..d36b5d79ecbd9 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_max.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_max.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_max_by.json b/sql/connect/common/src/test/resources/query-tests/queries/function_max_by.json index da311e340cc50..1048a30325e5a 100644 --- 
a/sql/connect/common/src/test/resources/query-tests/queries/function_max_by.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_max_by.json @@ -22,7 +22,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "b" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_max_by.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_max_by.proto.bin index 284c2453af8bd..1f1832962cb34 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_max_by.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_max_by.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_md5.json b/sql/connect/common/src/test/resources/query-tests/queries/function_md5.json index e8718594b0be3..d954d60a9c68a 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_md5.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_md5.json @@ -26,7 +26,8 @@ } } } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_md5.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_md5.proto.bin index d3ec7c26a2ede..87ee03e940081 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_md5.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_md5.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_median.json b/sql/connect/common/src/test/resources/query-tests/queries/function_median.json index 7331454b9ecb0..a358a25d85705 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_median.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_median.json @@ -18,7 +18,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "a" } 
- }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_median.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_median.proto.bin index 59533e5be5992..5a80a2f7cd44b 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_median.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_median.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_min.json b/sql/connect/common/src/test/resources/query-tests/queries/function_min.json index 1b7266b6774e4..3fba2b795a224 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_min.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_min.json @@ -18,7 +18,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "a" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_min.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_min.proto.bin index b82f4c5309222..2ae3da3391fa5 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_min.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_min.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_min_by.json b/sql/connect/common/src/test/resources/query-tests/queries/function_min_by.json index d2478f5e81abe..6c9b99ad7d43d 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_min_by.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_min_by.json @@ -22,7 +22,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "b" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_min_by.proto.bin 
b/sql/connect/common/src/test/resources/query-tests/queries/function_min_by.proto.bin index ddc642b95000c..da76415ec74a0 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_min_by.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_min_by.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_minute.json b/sql/connect/common/src/test/resources/query-tests/queries/function_minute.json index 7c749cdff82f5..c94a8703d38e9 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_minute.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_minute.json @@ -18,7 +18,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "t" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_minute.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_minute.proto.bin index e81b7dad85331..b0b743773e902 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_minute.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_minute.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_mode.json b/sql/connect/common/src/test/resources/query-tests/queries/function_mode.json index 8e8183e9e0883..a4f3b601ad47a 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_mode.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_mode.json @@ -18,7 +18,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "a" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_mode.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_mode.proto.bin index dca0953a387b1..e3dfc96922e62 100644 Binary files 
a/sql/connect/common/src/test/resources/query-tests/queries/function_mode.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_mode.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_monotonically_increasing_id.json b/sql/connect/common/src/test/resources/query-tests/queries/function_monotonically_increasing_id.json index 0a14f1008976e..01ca4536c97f9 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_monotonically_increasing_id.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_monotonically_increasing_id.json @@ -13,7 +13,8 @@ }, "expressions": [{ "unresolvedFunction": { - "functionName": "monotonically_increasing_id" + "functionName": "monotonically_increasing_id", + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_monotonically_increasing_id.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_monotonically_increasing_id.proto.bin index 724ce3ac6904c..2e86a0566afaf 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_monotonically_increasing_id.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_monotonically_increasing_id.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_month.json b/sql/connect/common/src/test/resources/query-tests/queries/function_month.json index 7ea1e5d0375e9..6343ed28faa01 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_month.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_month.json @@ -18,7 +18,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "d" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_month.proto.bin 
b/sql/connect/common/src/test/resources/query-tests/queries/function_month.proto.bin index b97100a6fe2ec..1c09b5d1f26c2 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_month.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_month.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_monthname.json b/sql/connect/common/src/test/resources/query-tests/queries/function_monthname.json index c5ad3485252f1..7da09908cadaa 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_monthname.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_monthname.json @@ -18,7 +18,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "d" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_monthname.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_monthname.proto.bin index 4518bb8d74253..958b9b34dff86 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_monthname.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_monthname.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_months.json b/sql/connect/common/src/test/resources/query-tests/queries/function_months.json index 278bab76a6544..235893cf6cdf3 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_months.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_months.json @@ -18,7 +18,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "a" } - }] + }], + "isInternal": true } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_months.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_months.proto.bin index fdcd96750dc9c..1a689d77c7019 100644 Binary files 
a/sql/connect/common/src/test/resources/query-tests/queries/function_months.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_months.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_months_between.json b/sql/connect/common/src/test/resources/query-tests/queries/function_months_between.json index 0fa772d26cd41..f02f9c3ea416a 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_months_between.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_months_between.json @@ -16,13 +16,15 @@ "functionName": "months_between", "arguments": [{ "unresolvedFunction": { - "functionName": "current_date" + "functionName": "current_date", + "isInternal": false } }, { "unresolvedAttribute": { "unparsedIdentifier": "d" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_months_between.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_months_between.proto.bin index 22ddc1813e0fb..34b5c49c83375 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_months_between.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_months_between.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_months_between_with_roundoff.json b/sql/connect/common/src/test/resources/query-tests/queries/function_months_between_with_roundoff.json index d11bfbd7f2426..f03709aece83c 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_months_between_with_roundoff.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_months_between_with_roundoff.json @@ -16,7 +16,8 @@ "functionName": "months_between", "arguments": [{ "unresolvedFunction": { - "functionName": "current_date" + "functionName": "current_date", + "isInternal": false } }, { 
"unresolvedAttribute": { @@ -26,7 +27,8 @@ "literal": { "boolean": true } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_months_between_with_roundoff.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_months_between_with_roundoff.proto.bin index bf9c545911ffd..a072570756411 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_months_between_with_roundoff.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_months_between_with_roundoff.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_named_struct.json b/sql/connect/common/src/test/resources/query-tests/queries/function_named_struct.json index c4d92131ed06c..a23893ddb0fcf 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_named_struct.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_named_struct.json @@ -30,7 +30,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "id" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_named_struct.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_named_struct.proto.bin index b595cfc282036..87c310823426b 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_named_struct.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_named_struct.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_nanvl.json b/sql/connect/common/src/test/resources/query-tests/queries/function_nanvl.json index 69daab270c2b9..151c3d830716f 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_nanvl.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_nanvl.json @@ -22,7 +22,8 @@ 
"unresolvedAttribute": { "unparsedIdentifier": "a" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_nanvl.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_nanvl.proto.bin index f314a73dcae65..edddf04956a74 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_nanvl.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_nanvl.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_negate.json b/sql/connect/common/src/test/resources/query-tests/queries/function_negate.json index e269fabe44be1..96f9359193fbd 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_negate.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_negate.json @@ -18,7 +18,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "a" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_negate.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_negate.proto.bin index 9c56c111ceee6..f4d42e82e0c0d 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_negate.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_negate.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_negative.json b/sql/connect/common/src/test/resources/query-tests/queries/function_negative.json index e269fabe44be1..96f9359193fbd 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_negative.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_negative.json @@ -18,7 +18,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "a" } - }] + }], + "isInternal": false } }] } diff --git 
a/sql/connect/common/src/test/resources/query-tests/queries/function_negative.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_negative.proto.bin index 9c56c111ceee6..f4d42e82e0c0d 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_negative.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_negative.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_next_day.json b/sql/connect/common/src/test/resources/query-tests/queries/function_next_day.json index 486523dcad3ec..ad6f3bb22ff82 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_next_day.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_next_day.json @@ -22,7 +22,8 @@ "literal": { "string": "Mon" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_next_day.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_next_day.proto.bin index a97bd75f129db..8cece90ab671b 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_next_day.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_next_day.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_now.json b/sql/connect/common/src/test/resources/query-tests/queries/function_now.json index 98556585c3e31..1ceb0bd1366ff 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_now.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_now.json @@ -13,7 +13,8 @@ }, "expressions": [{ "unresolvedFunction": { - "functionName": "now" + "functionName": "now", + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_now.proto.bin 
b/sql/connect/common/src/test/resources/query-tests/queries/function_now.proto.bin index a8fcd67fa1982..f9ab22b653c09 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_now.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_now.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_nth_value.json b/sql/connect/common/src/test/resources/query-tests/queries/function_nth_value.json index 4c764a5d5603c..97f434a6d71ae 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_nth_value.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_nth_value.json @@ -28,7 +28,8 @@ "literal": { "boolean": true } - }] + }], + "isInternal": false } }, "partitionSpec": [{ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_nth_value.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_nth_value.proto.bin index f87e1695f22e3..cd6eeac2e054e 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_nth_value.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_nth_value.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_ntile.json b/sql/connect/common/src/test/resources/query-tests/queries/function_ntile.json index 2346a788b64bd..595cfe02b8631 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_ntile.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_ntile.json @@ -20,7 +20,8 @@ "literal": { "integer": 4 } - }] + }], + "isInternal": false } }, "partitionSpec": [{ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_ntile.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_ntile.proto.bin index d9ccd2e8a6007..fa7d6cac0bf17 100644 Binary files 
a/sql/connect/common/src/test/resources/query-tests/queries/function_ntile.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_ntile.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_nullif.json b/sql/connect/common/src/test/resources/query-tests/queries/function_nullif.json index 3892eb19fc52c..ac9f5620f9243 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_nullif.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_nullif.json @@ -22,7 +22,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "g" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_nullif.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_nullif.proto.bin index 9bbf5f4ccb8ac..0217381b686ff 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_nullif.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_nullif.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_nvl.json b/sql/connect/common/src/test/resources/query-tests/queries/function_nvl.json index 483448c26d114..1bf1f22bcad2b 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_nvl.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_nvl.json @@ -22,7 +22,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "g" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_nvl.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_nvl.proto.bin index 21a9b37eb65ec..663f7c714883c 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_nvl.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_nvl.proto.bin differ diff 
--git a/sql/connect/common/src/test/resources/query-tests/queries/function_nvl2.json b/sql/connect/common/src/test/resources/query-tests/queries/function_nvl2.json index 8db7f9ba6292c..408d6eba0a05b 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_nvl2.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_nvl2.json @@ -26,7 +26,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "g" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_nvl2.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_nvl2.proto.bin index 8b7f90bf27552..627a671a085e2 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_nvl2.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_nvl2.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_octet_length.json b/sql/connect/common/src/test/resources/query-tests/queries/function_octet_length.json index 7be9ac82662a4..bfd624216f70d 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_octet_length.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_octet_length.json @@ -18,7 +18,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "g" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_octet_length.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_octet_length.proto.bin index 484ebbb6487b0..3f94747ea2595 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_octet_length.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_octet_length.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_overlay.json 
b/sql/connect/common/src/test/resources/query-tests/queries/function_overlay.json index b580570f923a6..28cb276fe4cb1 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_overlay.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_overlay.json @@ -26,7 +26,8 @@ "literal": { "integer": 4 } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_overlay.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_overlay.proto.bin index 2110ae9c14610..43d75d4a07231 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_overlay.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_overlay.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_overlay_with_len.json b/sql/connect/common/src/test/resources/query-tests/queries/function_overlay_with_len.json index 99d5426c46fba..79f0be1011dab 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_overlay_with_len.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_overlay_with_len.json @@ -30,7 +30,8 @@ "literal": { "string": "3" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_overlay_with_len.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_overlay_with_len.proto.bin index 9a09d28d84fde..9521c75b4a83e 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_overlay_with_len.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_overlay_with_len.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_parse_json.json b/sql/connect/common/src/test/resources/query-tests/queries/function_parse_json.json index dfcf56c19223e..4a84cbe99b65d 
100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_parse_json.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_parse_json.json @@ -18,7 +18,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "g" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_parse_json.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_parse_json.proto.bin index a7187fa2c1af0..cea247c862173 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_parse_json.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_parse_json.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_parse_url.json b/sql/connect/common/src/test/resources/query-tests/queries/function_parse_url.json index e03b86c21eb94..e193266998299 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_parse_url.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_parse_url.json @@ -22,7 +22,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "g" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_parse_url.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_parse_url.proto.bin index 56917289c1ec9..f15b2316d3c2e 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_parse_url.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_parse_url.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_parse_url_with_key.json b/sql/connect/common/src/test/resources/query-tests/queries/function_parse_url_with_key.json index bd627911ef22d..862e3b5aa8d3a 100644 --- 
a/sql/connect/common/src/test/resources/query-tests/queries/function_parse_url_with_key.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_parse_url_with_key.json @@ -26,7 +26,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "g" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_parse_url_with_key.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_parse_url_with_key.proto.bin index 231622cbd8a6b..543b9dd6f43dc 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_parse_url_with_key.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_parse_url_with_key.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_percent_rank.json b/sql/connect/common/src/test/resources/query-tests/queries/function_percent_rank.json index d8778ec8cd81d..4770d3d81d6d5 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_percent_rank.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_percent_rank.json @@ -15,7 +15,8 @@ "window": { "windowFunction": { "unresolvedFunction": { - "functionName": "percent_rank" + "functionName": "percent_rank", + "isInternal": false } }, "partitionSpec": [{ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_percent_rank.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_percent_rank.proto.bin index d668f7e1504cb..2dcfbf3777ee9 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_percent_rank.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_percent_rank.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_percentile_approx.json 
b/sql/connect/common/src/test/resources/query-tests/queries/function_percentile_approx.json index 6289464de2a37..16f5f1f2b4b7b 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_percentile_approx.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_percentile_approx.json @@ -26,7 +26,8 @@ "literal": { "integer": 20 } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_percentile_approx.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_percentile_approx.proto.bin index f44ec86888f6c..159c21c647729 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_percentile_approx.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_percentile_approx.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_percentile_with_frequency.json b/sql/connect/common/src/test/resources/query-tests/queries/function_percentile_with_frequency.json index f57804426643d..fb548fd0233e1 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_percentile_with_frequency.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_percentile_with_frequency.json @@ -26,7 +26,8 @@ "literal": { "integer": 2 } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_percentile_with_frequency.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_percentile_with_frequency.proto.bin index 91d6279f9bd8c..13be09fc2901a 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_percentile_with_frequency.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_percentile_with_frequency.proto.bin differ diff --git 
a/sql/connect/common/src/test/resources/query-tests/queries/function_percentile_without_frequency.json b/sql/connect/common/src/test/resources/query-tests/queries/function_percentile_without_frequency.json index 44e2c98a4dc60..3af4be6aad5cd 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_percentile_without_frequency.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_percentile_without_frequency.json @@ -22,7 +22,8 @@ "literal": { "double": 0.3 } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_percentile_without_frequency.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_percentile_without_frequency.proto.bin index 45b807e5ffbd2..93156e49b4556 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_percentile_without_frequency.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_percentile_without_frequency.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_pi.json b/sql/connect/common/src/test/resources/query-tests/queries/function_pi.json index 46474dfd8e369..d73ca1d6ca691 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_pi.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_pi.json @@ -13,7 +13,8 @@ }, "expressions": [{ "unresolvedFunction": { - "functionName": "pi" + "functionName": "pi", + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_pi.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_pi.proto.bin index 14f018904bfb7..33fee15270257 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_pi.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_pi.proto.bin differ diff --git 
a/sql/connect/common/src/test/resources/query-tests/queries/function_pmod.json b/sql/connect/common/src/test/resources/query-tests/queries/function_pmod.json index 1dc2cb54cbb67..fa6edbcda84f8 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_pmod.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_pmod.json @@ -22,7 +22,8 @@ "literal": { "integer": 10 } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_pmod.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_pmod.proto.bin index a2bb94dbb5173..068b878b77ca5 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_pmod.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_pmod.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_posexplode.json b/sql/connect/common/src/test/resources/query-tests/queries/function_posexplode.json index f8a9db37e62be..261ee78e20b43 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_posexplode.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_posexplode.json @@ -18,7 +18,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "e" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_posexplode.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_posexplode.proto.bin index fc50f5f4c85b7..9b22124951e80 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_posexplode.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_posexplode.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_posexplode_outer.json 
b/sql/connect/common/src/test/resources/query-tests/queries/function_posexplode_outer.json index 0e8cd4c1509e1..71d11fd72cd8e 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_posexplode_outer.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_posexplode_outer.json @@ -18,7 +18,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "e" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_posexplode_outer.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_posexplode_outer.proto.bin index 19d700665e7f5..d771cb8c33739 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_posexplode_outer.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_posexplode_outer.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_position.json b/sql/connect/common/src/test/resources/query-tests/queries/function_position.json index 7b005e2bb8213..e71a363461f8d 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_position.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_position.json @@ -22,7 +22,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "g" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_position.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_position.proto.bin index 34b7e301fe943..db2530e0625ba 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_position.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_position.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_position_with_start.json 
b/sql/connect/common/src/test/resources/query-tests/queries/function_position_with_start.json index 2cd04992d1da8..0a1f3fc42a29a 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_position_with_start.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_position_with_start.json @@ -26,7 +26,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "a" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_position_with_start.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_position_with_start.proto.bin index b34eaf80f8866..4f092eae4057a 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_position_with_start.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_position_with_start.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_positive.json b/sql/connect/common/src/test/resources/query-tests/queries/function_positive.json index a8b3a2d6244bb..26f8ae17bbd19 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_positive.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_positive.json @@ -18,7 +18,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "a" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_positive.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_positive.proto.bin index 5507abce8caac..32e7859676cfe 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_positive.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_positive.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_pow.json 
b/sql/connect/common/src/test/resources/query-tests/queries/function_pow.json index 187636fb360c6..b9b24218fd99e 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_pow.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_pow.json @@ -22,7 +22,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "b" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_pow.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_pow.proto.bin index 6e1d3b06fe87a..52a249ada18f6 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_pow.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_pow.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_power.json b/sql/connect/common/src/test/resources/query-tests/queries/function_power.json index 187636fb360c6..b9b24218fd99e 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_power.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_power.json @@ -22,7 +22,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "b" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_power.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_power.proto.bin index 6e1d3b06fe87a..52a249ada18f6 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_power.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_power.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_printf.json b/sql/connect/common/src/test/resources/query-tests/queries/function_printf.json index 73ca595e8650b..c3ead5f1388b5 100644 --- 
a/sql/connect/common/src/test/resources/query-tests/queries/function_printf.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_printf.json @@ -26,7 +26,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "g" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_printf.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_printf.proto.bin index 3fb3862f44d91..b78d5046c98dc 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_printf.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_printf.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_product.json b/sql/connect/common/src/test/resources/query-tests/queries/function_product.json index 1dfb7f81912d3..802f3e77e8cd1 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_product.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_product.json @@ -18,7 +18,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "a" } - }] + }], + "isInternal": true } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_product.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_product.proto.bin index 8c3fbd31eb6b3..a7ff0061481b4 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_product.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_product.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_quarter.json b/sql/connect/common/src/test/resources/query-tests/queries/function_quarter.json index b95867e0be963..eb8e75f914318 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_quarter.json +++ 
b/sql/connect/common/src/test/resources/query-tests/queries/function_quarter.json @@ -18,7 +18,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "d" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_quarter.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_quarter.proto.bin index fdc2d96fb08ca..d9ad4e20a3aed 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_quarter.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_quarter.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_radians.json b/sql/connect/common/src/test/resources/query-tests/queries/function_radians.json index 837960dedc653..83f211272c123 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_radians.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_radians.json @@ -18,7 +18,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "b" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_radians.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_radians.proto.bin index 33a2521b22ac9..e371fb5947a5b 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_radians.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_radians.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_raise_error.json b/sql/connect/common/src/test/resources/query-tests/queries/function_raise_error.json index 5318466706bd8..6fd5cfe1194d3 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_raise_error.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_raise_error.json @@ -18,7 +18,8 @@ "literal": { "string": "kaboom" } 
- }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_raise_error.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_raise_error.proto.bin index 7fbd33b9869ca..678fc8cda7afb 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_raise_error.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_raise_error.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_rand_with_seed.json b/sql/connect/common/src/test/resources/query-tests/queries/function_rand_with_seed.json index 453ea54bd0ef3..67c1250a72ff8 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_rand_with_seed.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_rand_with_seed.json @@ -18,7 +18,8 @@ "literal": { "long": "133" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_rand_with_seed.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_rand_with_seed.proto.bin index 566a49d641293..cb15624497821 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_rand_with_seed.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_rand_with_seed.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_randn_with_seed.json b/sql/connect/common/src/test/resources/query-tests/queries/function_randn_with_seed.json index ef84f05c3e193..f043d22159b20 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_randn_with_seed.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_randn_with_seed.json @@ -18,7 +18,8 @@ "literal": { "long": "133" } - }] + }], + "isInternal": false } }] } diff --git 
a/sql/connect/common/src/test/resources/query-tests/queries/function_randn_with_seed.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_randn_with_seed.proto.bin index b0064842bf308..6f601e62c262b 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_randn_with_seed.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_randn_with_seed.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_random_with_seed.json b/sql/connect/common/src/test/resources/query-tests/queries/function_random_with_seed.json index 11238a43ec1a3..fc0c945b825ae 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_random_with_seed.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_random_with_seed.json @@ -18,7 +18,8 @@ "literal": { "integer": 1 } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_random_with_seed.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_random_with_seed.proto.bin index aa4208afedb88..918de15afa6b5 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_random_with_seed.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_random_with_seed.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_rank.json b/sql/connect/common/src/test/resources/query-tests/queries/function_rank.json index 93c8dc38d668a..905af83a134f2 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_rank.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_rank.json @@ -15,7 +15,8 @@ "window": { "windowFunction": { "unresolvedFunction": { - "functionName": "rank" + "functionName": "rank", + "isInternal": false } }, "partitionSpec": [{ diff --git 
a/sql/connect/common/src/test/resources/query-tests/queries/function_rank.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_rank.proto.bin index 3aef331fb1739..8d72dd7fddafa 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_rank.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_rank.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_reduce.json b/sql/connect/common/src/test/resources/query-tests/queries/function_reduce.json index 4928145bda572..cad612fbc66d2 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_reduce.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_reduce.json @@ -35,7 +35,8 @@ "unresolvedNamedLambdaVariable": { "nameParts": ["y_2"] } - }] + }], + "isInternal": false } }, "arguments": [{ @@ -55,7 +56,8 @@ "nameParts": ["x_3"] }] } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_reduce.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_reduce.proto.bin index 2532c111e3874..89868c4ea5ef6 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_reduce.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_reduce.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_reflect.json b/sql/connect/common/src/test/resources/query-tests/queries/function_reflect.json index 2b0fe7911150c..7ae607ccbeb92 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_reflect.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_reflect.json @@ -26,7 +26,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "g" } - }] + }], + "isInternal": false } }] } diff --git 
a/sql/connect/common/src/test/resources/query-tests/queries/function_reflect.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_reflect.proto.bin index 31c6c9bf13150..be0973b5020b9 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_reflect.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_reflect.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_regexp.json b/sql/connect/common/src/test/resources/query-tests/queries/function_regexp.json index 005d7264969f2..890dc31aa5a19 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_regexp.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_regexp.json @@ -22,7 +22,8 @@ "literal": { "string": "[a-z]+b" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_regexp.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_regexp.proto.bin index 0379829055998..c138f434647ff 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_regexp.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_regexp.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_regexp_count.json b/sql/connect/common/src/test/resources/query-tests/queries/function_regexp_count.json index 540f1821f50e4..23b90b66e6115 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_regexp_count.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_regexp_count.json @@ -22,7 +22,8 @@ "literal": { "string": "\\d+" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_regexp_count.proto.bin 
b/sql/connect/common/src/test/resources/query-tests/queries/function_regexp_count.proto.bin index 3afcfd8c21e7c..ec3970a0434ef 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_regexp_count.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_regexp_count.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_regexp_extract.json b/sql/connect/common/src/test/resources/query-tests/queries/function_regexp_extract.json index 5d9c7a5b4a5ab..e3b3650c16ba5 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_regexp_extract.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_regexp_extract.json @@ -26,7 +26,8 @@ "literal": { "integer": 1 } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_regexp_extract.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_regexp_extract.proto.bin index 32ba8b6dcb5e9..9fa17177b5be4 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_regexp_extract.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_regexp_extract.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_regexp_extract_all_with_regex_group_index.json b/sql/connect/common/src/test/resources/query-tests/queries/function_regexp_extract_all_with_regex_group_index.json index ebe2f581e3de2..04186ace547a0 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_regexp_extract_all_with_regex_group_index.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_regexp_extract_all_with_regex_group_index.json @@ -26,7 +26,8 @@ "literal": { "integer": 1 } - }] + }], + "isInternal": false } }] } diff --git 
a/sql/connect/common/src/test/resources/query-tests/queries/function_regexp_extract_all_with_regex_group_index.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_regexp_extract_all_with_regex_group_index.proto.bin index 2cf31e5f75f4f..12d2feefb602c 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_regexp_extract_all_with_regex_group_index.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_regexp_extract_all_with_regex_group_index.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_regexp_extract_all_without_regex_group_index.json b/sql/connect/common/src/test/resources/query-tests/queries/function_regexp_extract_all_without_regex_group_index.json index 84a2e378ed2e3..34459b011dcf3 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_regexp_extract_all_without_regex_group_index.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_regexp_extract_all_without_regex_group_index.json @@ -22,7 +22,8 @@ "literal": { "string": "(\\d+)([a-z]+)" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_regexp_extract_all_without_regex_group_index.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_regexp_extract_all_without_regex_group_index.proto.bin index 529cae91ce595..3fbcb69e8ae54 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_regexp_extract_all_without_regex_group_index.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_regexp_extract_all_without_regex_group_index.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_regexp_instr_with_regex_group_index.json b/sql/connect/common/src/test/resources/query-tests/queries/function_regexp_instr_with_regex_group_index.json index 
cb44dda5ba2c2..69d171eca7f55 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_regexp_instr_with_regex_group_index.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_regexp_instr_with_regex_group_index.json @@ -26,7 +26,8 @@ "literal": { "integer": 1 } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_regexp_instr_with_regex_group_index.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_regexp_instr_with_regex_group_index.proto.bin index 55cc77eb3cd1f..32649c83ecc75 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_regexp_instr_with_regex_group_index.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_regexp_instr_with_regex_group_index.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_regexp_instr_without_regex_group_index.json b/sql/connect/common/src/test/resources/query-tests/queries/function_regexp_instr_without_regex_group_index.json index eeab13abaa6da..e5ee8f177efdd 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_regexp_instr_without_regex_group_index.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_regexp_instr_without_regex_group_index.json @@ -22,7 +22,8 @@ "literal": { "string": "\\d+(a|b|m)" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_regexp_instr_without_regex_group_index.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_regexp_instr_without_regex_group_index.proto.bin index 3aee655d92c65..791af5111d72d 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_regexp_instr_without_regex_group_index.proto.bin and 
b/sql/connect/common/src/test/resources/query-tests/queries/function_regexp_instr_without_regex_group_index.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_regexp_like.json b/sql/connect/common/src/test/resources/query-tests/queries/function_regexp_like.json index 289fb3d9b4eab..4856ead38ac16 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_regexp_like.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_regexp_like.json @@ -22,7 +22,8 @@ "literal": { "string": "[a-z]+b" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_regexp_like.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_regexp_like.proto.bin index e7bb85bfa47d8..65e383f0e95f0 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_regexp_like.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_regexp_like.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_regexp_replace.json b/sql/connect/common/src/test/resources/query-tests/queries/function_regexp_replace.json index 83dd7a8569fd4..b6a237881c400 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_regexp_replace.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_regexp_replace.json @@ -26,7 +26,8 @@ "literal": { "string": "XXX" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_regexp_replace.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_regexp_replace.proto.bin index b7d3fde25cf85..6d7dd2cb762f6 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_regexp_replace.proto.bin and 
b/sql/connect/common/src/test/resources/query-tests/queries/function_regexp_replace.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_regexp_substr.json b/sql/connect/common/src/test/resources/query-tests/queries/function_regexp_substr.json index 067652959a94f..0e0c6ea3c4ff1 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_regexp_substr.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_regexp_substr.json @@ -22,7 +22,8 @@ "literal": { "string": "\\d{2}(a|b|m)" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_regexp_substr.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_regexp_substr.proto.bin index 43b987c612cd9..5c629e755a99b 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_regexp_substr.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_regexp_substr.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_regr_avgx.json b/sql/connect/common/src/test/resources/query-tests/queries/function_regr_avgx.json index 4fdc9b035d764..0c220a9401193 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_regr_avgx.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_regr_avgx.json @@ -22,7 +22,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "b" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_regr_avgx.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_regr_avgx.proto.bin index 5771d141728ad..e71110d6a3511 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_regr_avgx.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_regr_avgx.proto.bin 
differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_regr_avgy.json b/sql/connect/common/src/test/resources/query-tests/queries/function_regr_avgy.json index af225fdf5a895..a3ce82193c43a 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_regr_avgy.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_regr_avgy.json @@ -22,7 +22,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "b" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_regr_avgy.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_regr_avgy.proto.bin index 0a6dcf0106ac7..8c9084a77a0fb 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_regr_avgy.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_regr_avgy.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_regr_count.json b/sql/connect/common/src/test/resources/query-tests/queries/function_regr_count.json index 510fc78140a6e..cc51b8bd0a10e 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_regr_count.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_regr_count.json @@ -22,7 +22,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "b" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_regr_count.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_regr_count.proto.bin index b1eff9f4d0329..325137df60e1b 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_regr_count.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_regr_count.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_regr_intercept.json 
b/sql/connect/common/src/test/resources/query-tests/queries/function_regr_intercept.json index a8596615a2d7f..961fd09d0a7d3 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_regr_intercept.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_regr_intercept.json @@ -22,7 +22,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "b" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_regr_intercept.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_regr_intercept.proto.bin index b9a1c0eff8943..49ee2215109cd 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_regr_intercept.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_regr_intercept.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_regr_r2.json b/sql/connect/common/src/test/resources/query-tests/queries/function_regr_r2.json index 9f88c6ad41268..7d6d482dd2430 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_regr_r2.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_regr_r2.json @@ -22,7 +22,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "b" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_regr_r2.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_regr_r2.proto.bin index 0011348d3880a..aca6143f423a6 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_regr_r2.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_regr_r2.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_regr_slope.json b/sql/connect/common/src/test/resources/query-tests/queries/function_regr_slope.json index 
9503b2c6feff3..7d93ecaf46afb 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_regr_slope.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_regr_slope.json @@ -22,7 +22,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "b" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_regr_slope.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_regr_slope.proto.bin index 69c918a7861f2..ccfe35dbe4485 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_regr_slope.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_regr_slope.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_regr_sxx.json b/sql/connect/common/src/test/resources/query-tests/queries/function_regr_sxx.json index fb243c9989ecf..705e3c357a7f7 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_regr_sxx.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_regr_sxx.json @@ -22,7 +22,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "b" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_regr_sxx.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_regr_sxx.proto.bin index df31a2e6851f9..9ebce4d26e382 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_regr_sxx.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_regr_sxx.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_regr_sxy.json b/sql/connect/common/src/test/resources/query-tests/queries/function_regr_sxy.json index 459deaa391e8d..4c35e57128935 100644 --- 
a/sql/connect/common/src/test/resources/query-tests/queries/function_regr_sxy.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_regr_sxy.json @@ -22,7 +22,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "b" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_regr_sxy.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_regr_sxy.proto.bin index db51c0bc32a79..8d7b682aef3a2 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_regr_sxy.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_regr_sxy.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_regr_syy.json b/sql/connect/common/src/test/resources/query-tests/queries/function_regr_syy.json index 877fbc3aa7c51..624dfa2bf855d 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_regr_syy.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_regr_syy.json @@ -22,7 +22,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "b" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_regr_syy.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_regr_syy.proto.bin index 6452b277a6e27..0bf9aa5bcf263 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_regr_syy.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_regr_syy.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_replace.json b/sql/connect/common/src/test/resources/query-tests/queries/function_replace.json index 2f6df6833f368..730207dc6e7d3 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_replace.json +++ 
b/sql/connect/common/src/test/resources/query-tests/queries/function_replace.json @@ -22,7 +22,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "g" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_replace.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_replace.proto.bin index 0564f7ed57583..a1a5792013595 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_replace.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_replace.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_replace_with_specified_string.json b/sql/connect/common/src/test/resources/query-tests/queries/function_replace_with_specified_string.json index 2e91450552c19..4b4039e16e5b1 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_replace_with_specified_string.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_replace_with_specified_string.json @@ -26,7 +26,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "g" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_replace_with_specified_string.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_replace_with_specified_string.proto.bin index 136a6b31821af..8aea298529272 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_replace_with_specified_string.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_replace_with_specified_string.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_reverse.json b/sql/connect/common/src/test/resources/query-tests/queries/function_reverse.json index 93869adfbedca..3f028731c409e 100644 --- 
a/sql/connect/common/src/test/resources/query-tests/queries/function_reverse.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_reverse.json @@ -18,7 +18,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "e" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_reverse.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_reverse.proto.bin index dd7f2d5de513d..f03f20dbba7f0 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_reverse.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_reverse.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_right.json b/sql/connect/common/src/test/resources/query-tests/queries/function_right.json index 843f5be44a650..966e92157ed94 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_right.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_right.json @@ -22,7 +22,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "g" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_right.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_right.proto.bin index b8d0156c98132..d48b49b52e79e 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_right.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_right.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_rint.json b/sql/connect/common/src/test/resources/query-tests/queries/function_rint.json index ea5bcebf81d72..af368a5694875 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_rint.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_rint.json @@ -18,7 
+18,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "b" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_rint.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_rint.proto.bin index bd47adc8476fa..3f8c7c35ceec5 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_rint.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_rint.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_rlike.json b/sql/connect/common/src/test/resources/query-tests/queries/function_rlike.json index fe8480a0800d1..ec6188eb31ac6 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_rlike.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_rlike.json @@ -22,7 +22,8 @@ "literal": { "string": "[a-z]+b" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_rlike.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_rlike.proto.bin index 79bbbe92c7fdb..028162c3ddcab 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_rlike.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_rlike.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_round.json b/sql/connect/common/src/test/resources/query-tests/queries/function_round.json index 585a0befb224d..d42711c424c46 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_round.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_round.json @@ -22,7 +22,8 @@ "literal": { "integer": 2 } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_round.proto.bin 
b/sql/connect/common/src/test/resources/query-tests/queries/function_round.proto.bin index 8625ccb1a58f1..40e173d9df4a0 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_round.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_round.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_row_number.json b/sql/connect/common/src/test/resources/query-tests/queries/function_row_number.json index 3d5ac8afe3db3..9972a7e942c96 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_row_number.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_row_number.json @@ -15,7 +15,8 @@ "window": { "windowFunction": { "unresolvedFunction": { - "functionName": "row_number" + "functionName": "row_number", + "isInternal": false } }, "partitionSpec": [{ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_row_number.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_row_number.proto.bin index 90b4fcb27d3f1..4368883ca2e36 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_row_number.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_row_number.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_rpad.json b/sql/connect/common/src/test/resources/query-tests/queries/function_rpad.json index d9b78a0cfd7a9..8c9aaf8242e81 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_rpad.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_rpad.json @@ -26,7 +26,8 @@ "literal": { "string": "-" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_rpad.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_rpad.proto.bin index 
d4c355afee0b7..8f945f6329135 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_rpad.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_rpad.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_rpad_binary.json b/sql/connect/common/src/test/resources/query-tests/queries/function_rpad_binary.json index 0daaf1636f13d..3e04a1bb094b0 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_rpad_binary.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_rpad_binary.json @@ -26,7 +26,8 @@ "literal": { "binary": "CwoLDg==" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_rpad_binary.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_rpad_binary.proto.bin index c6f9f22146c61..8396bd5bc016e 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_rpad_binary.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_rpad_binary.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_rtrim.json b/sql/connect/common/src/test/resources/query-tests/queries/function_rtrim.json index 5fe66e8e33596..39980dbd802d8 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_rtrim.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_rtrim.json @@ -18,7 +18,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "g" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_rtrim.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_rtrim.proto.bin index 4320bf6ac397c..e11f621a033f6 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_rtrim.proto.bin and 
b/sql/connect/common/src/test/resources/query-tests/queries/function_rtrim.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_rtrim_with_pattern.json b/sql/connect/common/src/test/resources/query-tests/queries/function_rtrim_with_pattern.json index 0ac2401f9eacf..a41ad58bb73ba 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_rtrim_with_pattern.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_rtrim_with_pattern.json @@ -22,7 +22,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "g" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_rtrim_with_pattern.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_rtrim_with_pattern.proto.bin index 1332f5b330000..6755240b51672 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_rtrim_with_pattern.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_rtrim_with_pattern.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_schema_of_csv.json b/sql/connect/common/src/test/resources/query-tests/queries/function_schema_of_csv.json index 6df6438a1a9ca..0e6f8f1425d6e 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_schema_of_csv.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_schema_of_csv.json @@ -29,9 +29,11 @@ "literal": { "string": "|" } - }] + }], + "isInternal": false } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_schema_of_csv.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_schema_of_csv.proto.bin index 99475ddf30d11..6ee33f3a9e986 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_schema_of_csv.proto.bin and 
b/sql/connect/common/src/test/resources/query-tests/queries/function_schema_of_csv.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_schema_of_json.json b/sql/connect/common/src/test/resources/query-tests/queries/function_schema_of_json.json index 06110d326e1ef..5760bbfc038dc 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_schema_of_json.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_schema_of_json.json @@ -18,7 +18,8 @@ "literal": { "string": "[{\"col\":01}]" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_schema_of_json.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_schema_of_json.proto.bin index c4ca00e629262..f1934d780d8fd 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_schema_of_json.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_schema_of_json.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_schema_of_json_with_options.json b/sql/connect/common/src/test/resources/query-tests/queries/function_schema_of_json_with_options.json index ab05ffa940c50..bca6b670d8b5d 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_schema_of_json_with_options.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_schema_of_json_with_options.json @@ -29,9 +29,11 @@ "literal": { "string": "true" } - }] + }], + "isInternal": false } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_schema_of_json_with_options.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_schema_of_json_with_options.proto.bin index 482485501dd37..7b125c550aef4 100644 Binary files 
a/sql/connect/common/src/test/resources/query-tests/queries/function_schema_of_json_with_options.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_schema_of_json_with_options.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_schema_of_variant.json b/sql/connect/common/src/test/resources/query-tests/queries/function_schema_of_variant.json index c4ea467bc1a24..c0ab18fd4b4b4 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_schema_of_variant.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_schema_of_variant.json @@ -21,9 +21,11 @@ "unresolvedAttribute": { "unparsedIdentifier": "g" } - }] + }], + "isInternal": false } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_schema_of_variant.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_schema_of_variant.proto.bin index 0971460bf4112..ecea29dcd41ac 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_schema_of_variant.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_schema_of_variant.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_schema_of_variant_agg.json b/sql/connect/common/src/test/resources/query-tests/queries/function_schema_of_variant_agg.json index 19bf62f70b20f..c31f58aa3e320 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_schema_of_variant_agg.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_schema_of_variant_agg.json @@ -21,9 +21,11 @@ "unresolvedAttribute": { "unparsedIdentifier": "g" } - }] + }], + "isInternal": false } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_schema_of_variant_agg.proto.bin 
b/sql/connect/common/src/test/resources/query-tests/queries/function_schema_of_variant_agg.proto.bin index 68c872ef0d4d2..a6a3f5bccd26b 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_schema_of_variant_agg.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_schema_of_variant_agg.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_sec.json b/sql/connect/common/src/test/resources/query-tests/queries/function_sec.json index 1cab2239755ca..b54347f1488ab 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_sec.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_sec.json @@ -18,7 +18,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "b" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_sec.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_sec.proto.bin index 8760f57a6d4f0..f1b4e4f830a0c 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_sec.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_sec.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_second.json b/sql/connect/common/src/test/resources/query-tests/queries/function_second.json index c77a572b88aa0..5040147fa8b2b 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_second.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_second.json @@ -18,7 +18,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "t" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_second.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_second.proto.bin index 193c46e917ba2..196f57b276cad 100644 Binary files 
a/sql/connect/common/src/test/resources/query-tests/queries/function_second.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_second.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_sentences.json b/sql/connect/common/src/test/resources/query-tests/queries/function_sentences.json index 412ac0272dd57..a2ace14172722 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_sentences.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_sentences.json @@ -18,7 +18,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "g" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_sentences.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_sentences.proto.bin index 4b62f22574d32..0ce754a8375e6 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_sentences.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_sentences.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_sentences_with_language.json b/sql/connect/common/src/test/resources/query-tests/queries/function_sentences_with_language.json index 869e074ccd604..ffb3065d74494 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_sentences_with_language.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_sentences_with_language.json @@ -22,7 +22,8 @@ "literal": { "string": "en" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_sentences_with_language.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_sentences_with_language.proto.bin index 7514b380a1c82..bf003eecdcd40 100644 Binary files 
a/sql/connect/common/src/test/resources/query-tests/queries/function_sentences_with_language.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_sentences_with_language.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_sentences_with_language_and_country.json b/sql/connect/common/src/test/resources/query-tests/queries/function_sentences_with_language_and_country.json index 991b42faddb76..5a6cab4cfd609 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_sentences_with_language_and_country.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_sentences_with_language_and_country.json @@ -26,7 +26,8 @@ "literal": { "string": "US" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_sentences_with_language_and_country.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_sentences_with_language_and_country.proto.bin index 01c0136c6df16..ce9b087ef03bb 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_sentences_with_language_and_country.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_sentences_with_language_and_country.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_sequence.json b/sql/connect/common/src/test/resources/query-tests/queries/function_sequence.json index b8bd1b68c9a8f..e9bc5b437502d 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_sequence.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_sequence.json @@ -22,7 +22,8 @@ "literal": { "integer": 10 } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_sequence.proto.bin 
b/sql/connect/common/src/test/resources/query-tests/queries/function_sequence.proto.bin index 36f1980f4ec2b..190d2a8225059 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_sequence.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_sequence.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_session_user.json b/sql/connect/common/src/test/resources/query-tests/queries/function_session_user.json index 07afa4a77c1b9..03ed1976a708b 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_session_user.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_session_user.json @@ -13,7 +13,8 @@ }, "expressions": [{ "unresolvedFunction": { - "functionName": "session_user" + "functionName": "session_user", + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_session_user.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_session_user.proto.bin index 948e3eeed60ac..62904540ce677 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_session_user.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_session_user.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_session_window.json b/sql/connect/common/src/test/resources/query-tests/queries/function_session_window.json index 92995656bd265..6178d6a73ab77 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_session_window.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_session_window.json @@ -22,7 +22,8 @@ "literal": { "string": "10 minutes" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_session_window.proto.bin 
b/sql/connect/common/src/test/resources/query-tests/queries/function_session_window.proto.bin index 364ecdf2aaa28..b669a186b2e2b 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_session_window.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_session_window.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_sha.json b/sql/connect/common/src/test/resources/query-tests/queries/function_sha.json index 57c5cb5bbd270..2385eb652cc15 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_sha.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_sha.json @@ -18,7 +18,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "g" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_sha.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_sha.proto.bin index e99760e49222d..66d9291a14101 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_sha.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_sha.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_sha1.json b/sql/connect/common/src/test/resources/query-tests/queries/function_sha1.json index ce5014ac2f7e6..b19270ee3fdcb 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_sha1.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_sha1.json @@ -26,7 +26,8 @@ } } } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_sha1.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_sha1.proto.bin index 3fdfdb2a072de..004ef664ee8d8 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_sha1.proto.bin 
and b/sql/connect/common/src/test/resources/query-tests/queries/function_sha1.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_sha2.json b/sql/connect/common/src/test/resources/query-tests/queries/function_sha2.json index 5278d604e97b9..ed90b3b939ee6 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_sha2.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_sha2.json @@ -30,7 +30,8 @@ "literal": { "integer": 512 } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_sha2.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_sha2.proto.bin index 20a0ee1082ae2..bf16ad9677137 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_sha2.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_sha2.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_shiftleft.json b/sql/connect/common/src/test/resources/query-tests/queries/function_shiftleft.json index 12decd300ab03..c02f85d5d56b1 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_shiftleft.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_shiftleft.json @@ -22,7 +22,8 @@ "literal": { "integer": 2 } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_shiftleft.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_shiftleft.proto.bin index 94bfbc99fce2d..5ebdbbc9996ee 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_shiftleft.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_shiftleft.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_shiftright.json 
b/sql/connect/common/src/test/resources/query-tests/queries/function_shiftright.json index c2295c4abaaa2..eabafb977393a 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_shiftright.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_shiftright.json @@ -22,7 +22,8 @@ "literal": { "integer": 2 } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_shiftright.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_shiftright.proto.bin index 910d12f50d6a9..6a089c2ffa344 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_shiftright.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_shiftright.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_shiftrightunsigned.json b/sql/connect/common/src/test/resources/query-tests/queries/function_shiftrightunsigned.json index 875e26a5a5652..4b32899df264a 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_shiftrightunsigned.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_shiftrightunsigned.json @@ -22,7 +22,8 @@ "literal": { "integer": 2 } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_shiftrightunsigned.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_shiftrightunsigned.proto.bin index aba9c425dca96..d732f7244aa0e 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_shiftrightunsigned.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_shiftrightunsigned.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_sign.json b/sql/connect/common/src/test/resources/query-tests/queries/function_sign.json index 
34451969078b0..3491a453f6b68 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_sign.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_sign.json @@ -18,7 +18,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "b" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_sign.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_sign.proto.bin index ff866c97303ed..35083f8b9a89a 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_sign.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_sign.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_signum.json b/sql/connect/common/src/test/resources/query-tests/queries/function_signum.json index bcf6ad7eb174d..02ab0e364fd10 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_signum.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_signum.json @@ -18,7 +18,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "b" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_signum.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_signum.proto.bin index af52abfb7f25b..65c838e408540 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_signum.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_signum.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_sin.json b/sql/connect/common/src/test/resources/query-tests/queries/function_sin.json index cb5b0da073456..a6be1adb3249d 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_sin.json +++ 
b/sql/connect/common/src/test/resources/query-tests/queries/function_sin.json @@ -18,7 +18,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "b" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_sin.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_sin.proto.bin index a63f574fa59cb..1f746a3ab76f9 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_sin.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_sin.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_sinh.json b/sql/connect/common/src/test/resources/query-tests/queries/function_sinh.json index e0f46b428611e..c84ac26b64222 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_sinh.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_sinh.json @@ -18,7 +18,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "b" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_sinh.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_sinh.proto.bin index 2f17ab02a6d94..545a0749d7973 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_sinh.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_sinh.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_size.json b/sql/connect/common/src/test/resources/query-tests/queries/function_size.json index 37c9cd1ac1ba7..97a996e23a790 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_size.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_size.json @@ -18,7 +18,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "f" } - }] + }], + "isInternal": false } }] } diff --git 
a/sql/connect/common/src/test/resources/query-tests/queries/function_size.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_size.proto.bin index a8ae600a3dd7a..4f80765b81564 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_size.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_size.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_skewness.json b/sql/connect/common/src/test/resources/query-tests/queries/function_skewness.json index 4b14c8d5ca79c..4ed304e15c67e 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_skewness.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_skewness.json @@ -18,7 +18,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "a" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_skewness.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_skewness.proto.bin index 889f96b2d2a39..73a8e1e4d1998 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_skewness.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_skewness.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_slice.json b/sql/connect/common/src/test/resources/query-tests/queries/function_slice.json index b0a63248784ea..c9229b9487c7d 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_slice.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_slice.json @@ -26,7 +26,8 @@ "literal": { "integer": 5 } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_slice.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_slice.proto.bin index 
620a006f775d6..923214f8cf339 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_slice.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_slice.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_some.json b/sql/connect/common/src/test/resources/query-tests/queries/function_some.json index bd6e28468e357..7b440a8d7e17a 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_some.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_some.json @@ -18,7 +18,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "flag" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_some.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_some.proto.bin index 0293719148506..43a3ab993a3a4 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_some.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_some.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_sort_array.json b/sql/connect/common/src/test/resources/query-tests/queries/function_sort_array.json index b42bede5cd172..24ffc7c2d387e 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_sort_array.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_sort_array.json @@ -22,7 +22,8 @@ "literal": { "boolean": true } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_sort_array.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_sort_array.proto.bin index 994048af2afc4..0414c104be556 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_sort_array.proto.bin and 
b/sql/connect/common/src/test/resources/query-tests/queries/function_sort_array.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_spark_partition_id.json b/sql/connect/common/src/test/resources/query-tests/queries/function_spark_partition_id.json index 851745b32ebe0..0e428e3e199d2 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_spark_partition_id.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_spark_partition_id.json @@ -13,7 +13,8 @@ }, "expressions": [{ "unresolvedFunction": { - "functionName": "spark_partition_id" + "functionName": "spark_partition_id", + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_spark_partition_id.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_spark_partition_id.proto.bin index df99cd64e7203..843ca4273b7d8 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_spark_partition_id.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_spark_partition_id.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_split.json b/sql/connect/common/src/test/resources/query-tests/queries/function_split.json index 001d44dcaaf6e..a00e18d77628c 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_split.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_split.json @@ -22,7 +22,8 @@ "literal": { "string": ";" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_split.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_split.proto.bin index cab0bde7b6da2..d0da01bba86dd 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_split.proto.bin and 
b/sql/connect/common/src/test/resources/query-tests/queries/function_split.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_split_part.json b/sql/connect/common/src/test/resources/query-tests/queries/function_split_part.json index 81ced1555d3e4..a9c6a3ec9e2d0 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_split_part.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_split_part.json @@ -26,7 +26,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "a" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_split_part.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_split_part.proto.bin index 2c1948f20dc22..fde88be6654b0 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_split_part.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_split_part.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_split_using_columns.json b/sql/connect/common/src/test/resources/query-tests/queries/function_split_using_columns.json index 98ef0e54e6211..5cdf413daf98c 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_split_using_columns.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_split_using_columns.json @@ -22,7 +22,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "g" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_split_using_columns.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_split_using_columns.proto.bin index a87702f83d1bd..e5a712f11a74e 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_split_using_columns.proto.bin and 
b/sql/connect/common/src/test/resources/query-tests/queries/function_split_using_columns.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_split_with_limit.json b/sql/connect/common/src/test/resources/query-tests/queries/function_split_with_limit.json index 45a7588838ff8..0e2f5ac83e77c 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_split_with_limit.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_split_with_limit.json @@ -26,7 +26,8 @@ "literal": { "integer": 10 } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_split_with_limit.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_split_with_limit.proto.bin index 497297fad8715..adc0a101eaf19 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_split_with_limit.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_split_with_limit.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_split_with_limit_using_columns.json b/sql/connect/common/src/test/resources/query-tests/queries/function_split_with_limit_using_columns.json index 138f9d70b2c85..73b7355de15e7 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_split_with_limit_using_columns.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_split_with_limit_using_columns.json @@ -26,7 +26,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "a" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_split_with_limit_using_columns.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_split_with_limit_using_columns.proto.bin index 04e24be40e9d8..a25da7d145181 100644 Binary files 
a/sql/connect/common/src/test/resources/query-tests/queries/function_split_with_limit_using_columns.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_split_with_limit_using_columns.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_sqrt.json b/sql/connect/common/src/test/resources/query-tests/queries/function_sqrt.json index f9a2b76520c13..6ca6327142558 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_sqrt.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_sqrt.json @@ -18,7 +18,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "b" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_sqrt.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_sqrt.proto.bin index e98e3bdfdb665..8b2ed11bd0e28 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_sqrt.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_sqrt.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_stack.json b/sql/connect/common/src/test/resources/query-tests/queries/function_stack.json index 14865c72df228..f714739b185cf 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_stack.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_stack.json @@ -30,7 +30,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "g" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_stack.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_stack.proto.bin index 5e5e12478d682..f578d0d2bb952 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_stack.proto.bin and 
b/sql/connect/common/src/test/resources/query-tests/queries/function_stack.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_startswith.json b/sql/connect/common/src/test/resources/query-tests/queries/function_startswith.json index ce2b0ac658c4a..4fb08d9de4760 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_startswith.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_startswith.json @@ -22,7 +22,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "g" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_startswith.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_startswith.proto.bin index 2f09e8095f5a0..f46d166a7c554 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_startswith.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_startswith.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_std.json b/sql/connect/common/src/test/resources/query-tests/queries/function_std.json index cbdb4ea9e5e83..5bb7ed27a5a74 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_std.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_std.json @@ -18,7 +18,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "a" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_std.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_std.proto.bin index 7e34b0427c23b..26b1f4e167534 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_std.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_std.proto.bin differ diff --git 
a/sql/connect/common/src/test/resources/query-tests/queries/function_stddev.json b/sql/connect/common/src/test/resources/query-tests/queries/function_stddev.json index 1403817886ca0..d27469b26a7fe 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_stddev.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_stddev.json @@ -18,7 +18,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "a" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_stddev.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_stddev.proto.bin index 8d214eea8e74e..b24c06af4fd04 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_stddev.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_stddev.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_stddev_pop.json b/sql/connect/common/src/test/resources/query-tests/queries/function_stddev_pop.json index 35e3a08b219f8..038b6e1fbb70a 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_stddev_pop.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_stddev_pop.json @@ -18,7 +18,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "a" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_stddev_pop.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_stddev_pop.proto.bin index b679f55014f97..e311b7d0311e8 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_stddev_pop.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_stddev_pop.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_stddev_samp.json 
b/sql/connect/common/src/test/resources/query-tests/queries/function_stddev_samp.json index 17cd0fd5e5976..3f7b829e4821f 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_stddev_samp.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_stddev_samp.json @@ -18,7 +18,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "a" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_stddev_samp.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_stddev_samp.proto.bin index 9f22eba5e39aa..7f888c1c07d77 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_stddev_samp.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_stddev_samp.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_str_to_map.json b/sql/connect/common/src/test/resources/query-tests/queries/function_str_to_map.json index 2cfd095f8fe62..3c640499aba54 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_str_to_map.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_str_to_map.json @@ -18,7 +18,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "g" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_str_to_map.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_str_to_map.proto.bin index 9732a829513a8..7c125c71e5150 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_str_to_map.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_str_to_map.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_str_to_map_with_pair_and_keyValue_delimiter.json 
b/sql/connect/common/src/test/resources/query-tests/queries/function_str_to_map_with_pair_and_keyValue_delimiter.json index 228c939a43ef2..c3b52657efd2e 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_str_to_map_with_pair_and_keyValue_delimiter.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_str_to_map_with_pair_and_keyValue_delimiter.json @@ -22,7 +22,8 @@ "literal": { "string": "," } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_str_to_map_with_pair_and_keyValue_delimiter.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_str_to_map_with_pair_and_keyValue_delimiter.proto.bin index 069c15db9af76..e65abe3472b91 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_str_to_map_with_pair_and_keyValue_delimiter.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_str_to_map_with_pair_and_keyValue_delimiter.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_str_to_map_with_pair_delimiter.json b/sql/connect/common/src/test/resources/query-tests/queries/function_str_to_map_with_pair_delimiter.json index 7e02c7f13d2ec..2af5fcbb3fbf2 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_str_to_map_with_pair_delimiter.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_str_to_map_with_pair_delimiter.json @@ -26,7 +26,8 @@ "literal": { "string": "\u003d" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_str_to_map_with_pair_delimiter.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_str_to_map_with_pair_delimiter.proto.bin index 86a9d15b6512d..4e90cc32e37aa 100644 Binary files 
a/sql/connect/common/src/test/resources/query-tests/queries/function_str_to_map_with_pair_delimiter.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_str_to_map_with_pair_delimiter.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_struct.json b/sql/connect/common/src/test/resources/query-tests/queries/function_struct.json index ba950215a2591..f88910dc3f494 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_struct.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_struct.json @@ -22,7 +22,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "d" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_struct.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_struct.proto.bin index 079c2be3c52e5..90e12eb597175 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_struct.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_struct.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_substr.json b/sql/connect/common/src/test/resources/query-tests/queries/function_substr.json index ef6d225821c37..510d501b5c9b0 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_substr.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_substr.json @@ -22,7 +22,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "a" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_substr.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_substr.proto.bin index 934201c433381..6b0871916a8a2 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_substr.proto.bin and 
b/sql/connect/common/src/test/resources/query-tests/queries/function_substr.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_substr_with_len.json b/sql/connect/common/src/test/resources/query-tests/queries/function_substr_with_len.json index d8492899d69bc..b09ef5ed2723e 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_substr_with_len.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_substr_with_len.json @@ -26,7 +26,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "a" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_substr_with_len.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_substr_with_len.proto.bin index 0fab03c025061..e5a3de2dc6c58 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_substr_with_len.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_substr_with_len.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_substring.json b/sql/connect/common/src/test/resources/query-tests/queries/function_substring.json index 84a70cf1c0236..5590cd2660922 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_substring.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_substring.json @@ -26,7 +26,8 @@ "literal": { "integer": 5 } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_substring.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_substring.proto.bin index d302cd95c7434..bdb3a13e5d9c3 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_substring.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_substring.proto.bin differ 
diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_substring_index.json b/sql/connect/common/src/test/resources/query-tests/queries/function_substring_index.json index dc81d925957cd..a5396bb478197 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_substring_index.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_substring_index.json @@ -26,7 +26,8 @@ "literal": { "integer": 5 } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_substring_index.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_substring_index.proto.bin index 192bb2e300dc3..9ac474c32e1bd 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_substring_index.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_substring_index.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_substring_using_columns.json b/sql/connect/common/src/test/resources/query-tests/queries/function_substring_using_columns.json index ba28b1c7f5700..8f01512673cf8 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_substring_using_columns.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_substring_using_columns.json @@ -26,7 +26,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "b" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_substring_using_columns.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_substring_using_columns.proto.bin index f14b44ef5a501..838e1d9a8bb90 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_substring_using_columns.proto.bin and 
b/sql/connect/common/src/test/resources/query-tests/queries/function_substring_using_columns.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_sum.json b/sql/connect/common/src/test/resources/query-tests/queries/function_sum.json index e9526a20b67fb..28b4ea5bbe856 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_sum.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_sum.json @@ -18,7 +18,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "a" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_sum.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_sum.proto.bin index 0e347bbc0a167..05c69d6f94029 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_sum.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_sum.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_sum_distinct.json b/sql/connect/common/src/test/resources/query-tests/queries/function_sum_distinct.json index 4614cf99ad3a6..0de8f3d36c22f 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_sum_distinct.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_sum_distinct.json @@ -19,7 +19,8 @@ "unparsedIdentifier": "a" } }], - "isDistinct": true + "isDistinct": true, + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_sum_distinct.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_sum_distinct.proto.bin index b4cf704391a4d..6c345201d8eea 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_sum_distinct.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_sum_distinct.proto.bin differ diff --git 
a/sql/connect/common/src/test/resources/query-tests/queries/function_tan.json b/sql/connect/common/src/test/resources/query-tests/queries/function_tan.json index ead160a7e3ac2..38ca851765599 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_tan.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_tan.json @@ -18,7 +18,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "b" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_tan.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_tan.proto.bin index d674dc033b2cd..ba28964c9befb 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_tan.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_tan.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_tanh.json b/sql/connect/common/src/test/resources/query-tests/queries/function_tanh.json index bcd12c664427e..e9e3996bc5aaa 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_tanh.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_tanh.json @@ -18,7 +18,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "b" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_tanh.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_tanh.proto.bin index 21c28c3ef88e6..d635c5020a53f 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_tanh.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_tanh.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_timestamp_add.json b/sql/connect/common/src/test/resources/query-tests/queries/function_timestamp_add.json index 
8fd71bb36d85e..c23a5c3bfa129 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_timestamp_add.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_timestamp_add.json @@ -26,7 +26,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "t" } - }] + }], + "isInternal": true } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_timestamp_add.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_timestamp_add.proto.bin index 5ab8ec531e073..142672a0929e8 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_timestamp_add.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_timestamp_add.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_timestamp_diff.json b/sql/connect/common/src/test/resources/query-tests/queries/function_timestamp_diff.json index 635cbb45460e6..c779b0936dc63 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_timestamp_diff.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_timestamp_diff.json @@ -26,7 +26,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "t" } - }] + }], + "isInternal": true } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_timestamp_diff.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_timestamp_diff.proto.bin index 3a81fd8b318c0..c2053f46f6a55 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_timestamp_diff.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_timestamp_diff.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_timestamp_micros.json b/sql/connect/common/src/test/resources/query-tests/queries/function_timestamp_micros.json index e43aa6d7115bd..985a23c536e1c 100644 --- 
a/sql/connect/common/src/test/resources/query-tests/queries/function_timestamp_micros.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_timestamp_micros.json @@ -18,7 +18,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "x" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_timestamp_micros.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_timestamp_micros.proto.bin index c8ca8eedef3c0..f17fc48d4418b 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_timestamp_micros.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_timestamp_micros.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_timestamp_millis.json b/sql/connect/common/src/test/resources/query-tests/queries/function_timestamp_millis.json index afcdf42d7b3be..52389f7fe5fab 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_timestamp_millis.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_timestamp_millis.json @@ -18,7 +18,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "x" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_timestamp_millis.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_timestamp_millis.proto.bin index bbe401c39f3d1..f63af6ecb1fb8 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_timestamp_millis.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_timestamp_millis.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_timestamp_seconds.json b/sql/connect/common/src/test/resources/query-tests/queries/function_timestamp_seconds.json index e6892d17708b3..526d22229facf 100644 --- 
a/sql/connect/common/src/test/resources/query-tests/queries/function_timestamp_seconds.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_timestamp_seconds.json @@ -18,7 +18,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "x" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_timestamp_seconds.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_timestamp_seconds.proto.bin index 102afbdda9021..0c7647735eed5 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_timestamp_seconds.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_timestamp_seconds.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_to_binary.json b/sql/connect/common/src/test/resources/query-tests/queries/function_to_binary.json index 156c3a5b3ca65..fcf1d35f42169 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_to_binary.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_to_binary.json @@ -18,7 +18,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "g" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_to_binary.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_to_binary.proto.bin index a1da0e6e2eda1..2b02883e03a30 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_to_binary.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_to_binary.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_to_binary_with_format.json b/sql/connect/common/src/test/resources/query-tests/queries/function_to_binary_with_format.json index 8c78cc6f8b99f..325e92437d515 100644 --- 
a/sql/connect/common/src/test/resources/query-tests/queries/function_to_binary_with_format.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_to_binary_with_format.json @@ -22,7 +22,8 @@ "literal": { "string": "utf-8" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_to_binary_with_format.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_to_binary_with_format.proto.bin index 2f2364e5abab1..5be46a049a535 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_to_binary_with_format.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_to_binary_with_format.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_to_char.json b/sql/connect/common/src/test/resources/query-tests/queries/function_to_char.json index 404a89a87ecb2..117955fa60c3a 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_to_char.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_to_char.json @@ -22,7 +22,8 @@ "literal": { "string": "$99.99" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_to_char.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_to_char.proto.bin index 087e212c39f4e..6ed0c2cdde8a9 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_to_char.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_to_char.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_to_csv.json b/sql/connect/common/src/test/resources/query-tests/queries/function_to_csv.json index 6b3856f5ac0af..cfbce992619b8 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_to_csv.json +++ 
b/sql/connect/common/src/test/resources/query-tests/queries/function_to_csv.json @@ -29,9 +29,11 @@ "literal": { "string": "|" } - }] + }], + "isInternal": false } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_to_csv.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_to_csv.proto.bin index a3017643a330a..318966eb2b58b 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_to_csv.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_to_csv.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_to_date.json b/sql/connect/common/src/test/resources/query-tests/queries/function_to_date.json index 8b9d50aa578b8..0da88a6158438 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_to_date.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_to_date.json @@ -18,7 +18,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "s" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_to_date.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_to_date.proto.bin index 59178487eef58..bfd79f65053c8 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_to_date.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_to_date.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_to_date_with_format.json b/sql/connect/common/src/test/resources/query-tests/queries/function_to_date_with_format.json index 48ae80d1e70ed..1f6250ec0656b 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_to_date_with_format.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_to_date_with_format.json @@ -22,7 
+22,8 @@ "literal": { "string": "yyyy-MM-dd" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_to_date_with_format.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_to_date_with_format.proto.bin index 2641d660ff69f..8118ff7ee4705 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_to_date_with_format.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_to_date_with_format.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_to_json.json b/sql/connect/common/src/test/resources/query-tests/queries/function_to_json.json index 7ceeb9d113cd3..0482c5cba9500 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_to_json.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_to_json.json @@ -29,9 +29,11 @@ "literal": { "string": "dd/MM/yyyy" } - }] + }], + "isInternal": false } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_to_json.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_to_json.proto.bin index c9461c1aa961c..f3d94b476135a 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_to_json.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_to_json.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_to_number.json b/sql/connect/common/src/test/resources/query-tests/queries/function_to_number.json index abb71e80a769c..8df8436bf647d 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_to_number.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_to_number.json @@ -22,7 +22,8 @@ "literal": { "string": "$99.99" } - }] + }], + "isInternal": false } }] } diff 
--git a/sql/connect/common/src/test/resources/query-tests/queries/function_to_number.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_to_number.proto.bin index 189c73553c5db..0d160ed3239c1 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_to_number.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_to_number.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_to_timestamp.json b/sql/connect/common/src/test/resources/query-tests/queries/function_to_timestamp.json index 323c57e2ef58a..4e9013a1c0ff2 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_to_timestamp.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_to_timestamp.json @@ -18,7 +18,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "s" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_to_timestamp.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_to_timestamp.proto.bin index ec6bd64f98187..1c8f18761c5c6 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_to_timestamp.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_to_timestamp.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_to_timestamp_ltz.json b/sql/connect/common/src/test/resources/query-tests/queries/function_to_timestamp_ltz.json index 59a79f39eb612..c994b0968d099 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_to_timestamp_ltz.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_to_timestamp_ltz.json @@ -18,7 +18,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "g" } - }] + }], + "isInternal": false } }] } diff --git 
a/sql/connect/common/src/test/resources/query-tests/queries/function_to_timestamp_ltz.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_to_timestamp_ltz.proto.bin index 9cabae3e75657..45d6d1549d98a 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_to_timestamp_ltz.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_to_timestamp_ltz.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_to_timestamp_ltz_with_format.json b/sql/connect/common/src/test/resources/query-tests/queries/function_to_timestamp_ltz_with_format.json index 08cb9c153f77f..6ff1dca87de6a 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_to_timestamp_ltz_with_format.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_to_timestamp_ltz_with_format.json @@ -22,7 +22,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "g" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_to_timestamp_ltz_with_format.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_to_timestamp_ltz_with_format.proto.bin index 22fd3d07dfc43..6aebdbd4f8667 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_to_timestamp_ltz_with_format.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_to_timestamp_ltz_with_format.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_to_timestamp_ntz.json b/sql/connect/common/src/test/resources/query-tests/queries/function_to_timestamp_ntz.json index 6808047ef2094..53678e814da88 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_to_timestamp_ntz.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_to_timestamp_ntz.json @@ -18,7 +18,8 @@ 
"unresolvedAttribute": { "unparsedIdentifier": "g" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_to_timestamp_ntz.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_to_timestamp_ntz.proto.bin index 5cd4cfddbd164..9a0c00065da25 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_to_timestamp_ntz.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_to_timestamp_ntz.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_to_timestamp_ntz_with_format.json b/sql/connect/common/src/test/resources/query-tests/queries/function_to_timestamp_ntz_with_format.json index 03e38801bfa56..240c53dd5c31c 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_to_timestamp_ntz_with_format.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_to_timestamp_ntz_with_format.json @@ -22,7 +22,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "g" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_to_timestamp_ntz_with_format.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_to_timestamp_ntz_with_format.proto.bin index 3a5d3dd970200..f38ad9460ff52 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_to_timestamp_ntz_with_format.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_to_timestamp_ntz_with_format.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_to_timestamp_with_format.json b/sql/connect/common/src/test/resources/query-tests/queries/function_to_timestamp_with_format.json index 30f34528319c7..1988d2fb5a863 100644 --- 
a/sql/connect/common/src/test/resources/query-tests/queries/function_to_timestamp_with_format.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_to_timestamp_with_format.json @@ -22,7 +22,8 @@ "literal": { "string": "yyyy-MM-dd HH:mm:ss.SSSS" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_to_timestamp_with_format.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_to_timestamp_with_format.proto.bin index 9c2d6d354ca73..939f9151de2a4 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_to_timestamp_with_format.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_to_timestamp_with_format.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_to_unix_timestamp.json b/sql/connect/common/src/test/resources/query-tests/queries/function_to_unix_timestamp.json index 15a42b814a629..6a1c5f3677e7f 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_to_unix_timestamp.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_to_unix_timestamp.json @@ -18,7 +18,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "g" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_to_unix_timestamp.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_to_unix_timestamp.proto.bin index 1c70f303e6fc2..2887468cea205 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_to_unix_timestamp.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_to_unix_timestamp.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_to_unix_timestamp_with_format.json 
b/sql/connect/common/src/test/resources/query-tests/queries/function_to_unix_timestamp_with_format.json index d6f4280d4464e..ce072b6395620 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_to_unix_timestamp_with_format.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_to_unix_timestamp_with_format.json @@ -22,7 +22,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "g" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_to_unix_timestamp_with_format.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_to_unix_timestamp_with_format.proto.bin index 141ff1fa320d5..803ec8311f552 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_to_unix_timestamp_with_format.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_to_unix_timestamp_with_format.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_to_utc_timestamp.json b/sql/connect/common/src/test/resources/query-tests/queries/function_to_utc_timestamp.json index 015fbb5cf534a..fbd86a28a12f3 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_to_utc_timestamp.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_to_utc_timestamp.json @@ -22,7 +22,8 @@ "literal": { "string": "-04:00" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_to_utc_timestamp.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_to_utc_timestamp.proto.bin index b2b65089604a2..bfaf5d2af8a9c 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_to_utc_timestamp.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_to_utc_timestamp.proto.bin differ diff --git 
a/sql/connect/common/src/test/resources/query-tests/queries/function_to_varchar.json b/sql/connect/common/src/test/resources/query-tests/queries/function_to_varchar.json index 3694a68dc8f5c..732eb4b426dde 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_to_varchar.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_to_varchar.json @@ -22,7 +22,8 @@ "literal": { "string": "$99.99" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_to_varchar.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_to_varchar.proto.bin index 005c9ab064c9b..b912951a75519 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_to_varchar.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_to_varchar.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_transform.json b/sql/connect/common/src/test/resources/query-tests/queries/function_transform.json index 3ad6fe9435644..c8ec0608b13af 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_transform.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_transform.json @@ -31,14 +31,16 @@ "literal": { "integer": 1 } - }] + }], + "isInternal": false } }, "arguments": [{ "nameParts": ["x_1"] }] } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_transform.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_transform.proto.bin index 266b093f7a99b..e4efe8865b124 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_transform.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_transform.proto.bin differ diff --git 
a/sql/connect/common/src/test/resources/query-tests/queries/function_transform_keys.json b/sql/connect/common/src/test/resources/query-tests/queries/function_transform_keys.json index 86349f460adaa..ed7487a58d3b8 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_transform_keys.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_transform_keys.json @@ -40,7 +40,8 @@ } } } - }] + }], + "isInternal": false } }, "arguments": [{ @@ -49,7 +50,8 @@ "nameParts": ["y_2"] }] } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_transform_keys.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_transform_keys.proto.bin index 827b6f273ceea..e04eabb98195f 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_transform_keys.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_transform_keys.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_transform_values.json b/sql/connect/common/src/test/resources/query-tests/queries/function_transform_values.json index 02aeca229ce5d..6e76f8a0554f9 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_transform_values.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_transform_values.json @@ -41,7 +41,8 @@ "nameParts": ["y_2"] }] } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_transform_values.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_transform_values.proto.bin index b4a653ff77a5d..e24a6320c40b3 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_transform_values.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_transform_values.proto.bin differ diff 
--git a/sql/connect/common/src/test/resources/query-tests/queries/function_transform_with_index.json b/sql/connect/common/src/test/resources/query-tests/queries/function_transform_with_index.json index df5e15b44fdd3..49dec8db7da73 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_transform_with_index.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_transform_with_index.json @@ -31,7 +31,8 @@ "unresolvedNamedLambdaVariable": { "nameParts": ["y_2"] } - }] + }], + "isInternal": false } }, "arguments": [{ @@ -40,7 +41,8 @@ "nameParts": ["y_2"] }] } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_transform_with_index.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_transform_with_index.proto.bin index e502c18dcd9e8..30b1901f42f58 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_transform_with_index.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_transform_with_index.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_translate.json b/sql/connect/common/src/test/resources/query-tests/queries/function_translate.json index 93d155c2857fb..ad5f98152258e 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_translate.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_translate.json @@ -26,7 +26,8 @@ "literal": { "string": "bar" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_translate.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_translate.proto.bin index 1ce32c8d2843e..ec9c556cfef09 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_translate.proto.bin and 
b/sql/connect/common/src/test/resources/query-tests/queries/function_translate.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_trim.json b/sql/connect/common/src/test/resources/query-tests/queries/function_trim.json index d2700174bca3d..a7925c2c7b5d2 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_trim.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_trim.json @@ -18,7 +18,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "g" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_trim.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_trim.proto.bin index d5f4f21510fc6..0ea9051f33837 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_trim.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_trim.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_trim_with_pattern.json b/sql/connect/common/src/test/resources/query-tests/queries/function_trim_with_pattern.json index fc3281c921531..cb566a6b98dfe 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_trim_with_pattern.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_trim_with_pattern.json @@ -22,7 +22,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "g" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_trim_with_pattern.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_trim_with_pattern.proto.bin index 2136b55656212..4423ab1a02a37 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_trim_with_pattern.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_trim_with_pattern.proto.bin 
differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_trunc.json b/sql/connect/common/src/test/resources/query-tests/queries/function_trunc.json index 4c596cd863261..01e528241eedd 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_trunc.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_trunc.json @@ -22,7 +22,8 @@ "literal": { "string": "mm" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_trunc.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_trunc.proto.bin index cdcee95af6344..acfd2bb94483b 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_trunc.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_trunc.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_try_add.json b/sql/connect/common/src/test/resources/query-tests/queries/function_try_add.json index 80300b5b5778a..2a34feb64a6d5 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_try_add.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_try_add.json @@ -22,7 +22,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "a" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_try_add.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_try_add.proto.bin index c1cb613b3943f..f9efaff46f3ec 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_try_add.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_try_add.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_try_aes_decrypt.json 
b/sql/connect/common/src/test/resources/query-tests/queries/function_try_aes_decrypt.json index 80e10f4786a81..fd2ba4bd66bdf 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_try_aes_decrypt.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_try_aes_decrypt.json @@ -22,7 +22,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "g" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_try_aes_decrypt.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_try_aes_decrypt.proto.bin index c2a477e5320c7..cc94446a39c8d 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_try_aes_decrypt.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_try_aes_decrypt.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_try_aes_decrypt_with_mode.json b/sql/connect/common/src/test/resources/query-tests/queries/function_try_aes_decrypt_with_mode.json index cc4ea4bfe5fb9..360742eaf0b69 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_try_aes_decrypt_with_mode.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_try_aes_decrypt_with_mode.json @@ -26,7 +26,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "g" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_try_aes_decrypt_with_mode.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_try_aes_decrypt_with_mode.proto.bin index 22919795e3e6a..b1fde39e2b632 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_try_aes_decrypt_with_mode.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_try_aes_decrypt_with_mode.proto.bin differ diff --git 
a/sql/connect/common/src/test/resources/query-tests/queries/function_try_aes_decrypt_with_mode_padding.json b/sql/connect/common/src/test/resources/query-tests/queries/function_try_aes_decrypt_with_mode_padding.json index 1f1fc777959a2..f02b625bd9e6a 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_try_aes_decrypt_with_mode_padding.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_try_aes_decrypt_with_mode_padding.json @@ -30,7 +30,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "g" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_try_aes_decrypt_with_mode_padding.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_try_aes_decrypt_with_mode_padding.proto.bin index b16d49e2428a2..1774a4dfaba19 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_try_aes_decrypt_with_mode_padding.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_try_aes_decrypt_with_mode_padding.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_try_aes_decrypt_with_mode_padding_aad.json b/sql/connect/common/src/test/resources/query-tests/queries/function_try_aes_decrypt_with_mode_padding_aad.json index b7e7cd41bda8f..b0c434231378b 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_try_aes_decrypt_with_mode_padding_aad.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_try_aes_decrypt_with_mode_padding_aad.json @@ -34,7 +34,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "g" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_try_aes_decrypt_with_mode_padding_aad.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_try_aes_decrypt_with_mode_padding_aad.proto.bin index 
d406961d5ccfc..6f75571c79ff3 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_try_aes_decrypt_with_mode_padding_aad.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_try_aes_decrypt_with_mode_padding_aad.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_try_avg.json b/sql/connect/common/src/test/resources/query-tests/queries/function_try_avg.json index 1216f4b5c635f..9ac56e71ad8b6 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_try_avg.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_try_avg.json @@ -18,7 +18,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "a" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_try_avg.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_try_avg.proto.bin index 8ab7a5d19e380..1b378b28e9ca7 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_try_avg.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_try_avg.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_try_divide.json b/sql/connect/common/src/test/resources/query-tests/queries/function_try_divide.json index d7d012756e62f..d7bd1d47d1b2a 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_try_divide.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_try_divide.json @@ -22,7 +22,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "a" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_try_divide.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_try_divide.proto.bin index 05c8d4a193adb..3ba2e35029a78 100644 Binary files 
a/sql/connect/common/src/test/resources/query-tests/queries/function_try_divide.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_try_divide.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_try_element_at_array.json b/sql/connect/common/src/test/resources/query-tests/queries/function_try_element_at_array.json index c2651e4ad7253..8fb878d1358f2 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_try_element_at_array.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_try_element_at_array.json @@ -22,7 +22,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "a" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_try_element_at_array.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_try_element_at_array.proto.bin index b86d5efd4096b..1fc1045fc085e 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_try_element_at_array.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_try_element_at_array.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_try_element_at_map.json b/sql/connect/common/src/test/resources/query-tests/queries/function_try_element_at_map.json index c4e5bc2f415ee..e200f3fa9d278 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_try_element_at_map.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_try_element_at_map.json @@ -22,7 +22,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "g" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_try_element_at_map.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_try_element_at_map.proto.bin index 
2f6c54f2fa5ec..1f8fdb10899a6 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_try_element_at_map.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_try_element_at_map.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_try_make_interval_years.json b/sql/connect/common/src/test/resources/query-tests/queries/function_try_make_interval_years.json index a7a2348496040..4c3bafee572d1 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_try_make_interval_years.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_try_make_interval_years.json @@ -18,7 +18,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "a" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_try_make_interval_years.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_try_make_interval_years.proto.bin index d459b6e8ec677..acb1952d621d7 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_try_make_interval_years.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_try_make_interval_years.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_try_make_interval_years_months.json b/sql/connect/common/src/test/resources/query-tests/queries/function_try_make_interval_years_months.json index 14aaa41ee2cb5..4508e5610dc81 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_try_make_interval_years_months.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_try_make_interval_years_months.json @@ -22,7 +22,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "a" } - }] + }], + "isInternal": false } }] } diff --git 
a/sql/connect/common/src/test/resources/query-tests/queries/function_try_make_interval_years_months.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_try_make_interval_years_months.proto.bin index 5123b995417ba..f12d4affbff93 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_try_make_interval_years_months.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_try_make_interval_years_months.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_try_make_interval_years_months_weeks.json b/sql/connect/common/src/test/resources/query-tests/queries/function_try_make_interval_years_months_weeks.json index a6ac2f27e3dc5..672acdf6b2d3d 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_try_make_interval_years_months_weeks.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_try_make_interval_years_months_weeks.json @@ -26,7 +26,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "a" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_try_make_interval_years_months_weeks.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_try_make_interval_years_months_weeks.proto.bin index cecfca97f7e20..555eb56ff0438 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_try_make_interval_years_months_weeks.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_try_make_interval_years_months_weeks.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_try_make_interval_years_months_weeks_days.json b/sql/connect/common/src/test/resources/query-tests/queries/function_try_make_interval_years_months_weeks_days.json index c9d4f1d4d2f1f..70782ac30ea49 100644 --- 
a/sql/connect/common/src/test/resources/query-tests/queries/function_try_make_interval_years_months_weeks_days.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_try_make_interval_years_months_weeks_days.json @@ -30,7 +30,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "a" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_try_make_interval_years_months_weeks_days.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_try_make_interval_years_months_weeks_days.proto.bin index 423172405c397..b9d6de2d00662 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_try_make_interval_years_months_weeks_days.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_try_make_interval_years_months_weeks_days.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_try_make_interval_years_months_weeks_days_hours.json b/sql/connect/common/src/test/resources/query-tests/queries/function_try_make_interval_years_months_weeks_days_hours.json index 7f2a42f01db45..fda1b3ea21a36 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_try_make_interval_years_months_weeks_days_hours.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_try_make_interval_years_months_weeks_days_hours.json @@ -34,7 +34,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "a" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_try_make_interval_years_months_weeks_days_hours.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_try_make_interval_years_months_weeks_days_hours.proto.bin index 71259b402aa51..3f477ddeab7db 100644 Binary files 
a/sql/connect/common/src/test/resources/query-tests/queries/function_try_make_interval_years_months_weeks_days_hours.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_try_make_interval_years_months_weeks_days_hours.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_try_make_interval_years_months_weeks_days_hours_mins.json b/sql/connect/common/src/test/resources/query-tests/queries/function_try_make_interval_years_months_weeks_days_hours_mins.json index 35ab05a90b3cd..b2db53de2c58e 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_try_make_interval_years_months_weeks_days_hours_mins.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_try_make_interval_years_months_weeks_days_hours_mins.json @@ -38,7 +38,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "a" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_try_make_interval_years_months_weeks_days_hours_mins.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_try_make_interval_years_months_weeks_days_hours_mins.proto.bin index f8cf29d15aabf..72dd0ffd9d539 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_try_make_interval_years_months_weeks_days_hours_mins.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_try_make_interval_years_months_weeks_days_hours_mins.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_try_make_interval_years_months_weeks_days_hours_mins_secs.json b/sql/connect/common/src/test/resources/query-tests/queries/function_try_make_interval_years_months_weeks_days_hours_mins_secs.json index 2f9c1d019359b..5a1389ff2665e 100644 --- 
a/sql/connect/common/src/test/resources/query-tests/queries/function_try_make_interval_years_months_weeks_days_hours_mins_secs.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_try_make_interval_years_months_weeks_days_hours_mins_secs.json @@ -42,7 +42,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "b" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_try_make_interval_years_months_weeks_days_hours_mins_secs.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_try_make_interval_years_months_weeks_days_hours_mins_secs.proto.bin index d7343a059b53d..90bf07eb4e9e3 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_try_make_interval_years_months_weeks_days_hours_mins_secs.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_try_make_interval_years_months_weeks_days_hours_mins_secs.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_try_make_timestamp_ltz_with_timezone.json b/sql/connect/common/src/test/resources/query-tests/queries/function_try_make_timestamp_ltz_with_timezone.json index 179f6e06988fc..36559a50d7aa4 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_try_make_timestamp_ltz_with_timezone.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_try_make_timestamp_ltz_with_timezone.json @@ -42,7 +42,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "g" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_try_make_timestamp_ltz_with_timezone.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_try_make_timestamp_ltz_with_timezone.proto.bin index d0c60ba1c7bf8..34dcc6bf46092 100644 Binary files 
a/sql/connect/common/src/test/resources/query-tests/queries/function_try_make_timestamp_ltz_with_timezone.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_try_make_timestamp_ltz_with_timezone.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_try_make_timestamp_ltz_without_timezone.json b/sql/connect/common/src/test/resources/query-tests/queries/function_try_make_timestamp_ltz_without_timezone.json index 29aa2096c2273..36e121b976dbf 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_try_make_timestamp_ltz_without_timezone.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_try_make_timestamp_ltz_without_timezone.json @@ -38,7 +38,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "b" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_try_make_timestamp_ltz_without_timezone.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_try_make_timestamp_ltz_without_timezone.proto.bin index 9caf6f6ba5285..c0eaf743e46fe 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_try_make_timestamp_ltz_without_timezone.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_try_make_timestamp_ltz_without_timezone.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_try_make_timestamp_ntz.json b/sql/connect/common/src/test/resources/query-tests/queries/function_try_make_timestamp_ntz.json index 6b8d31d0c58e5..b131e0d07fdea 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_try_make_timestamp_ntz.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_try_make_timestamp_ntz.json @@ -38,7 +38,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "b" } - }] + }], + "isInternal": false } }] } diff --git 
a/sql/connect/common/src/test/resources/query-tests/queries/function_try_make_timestamp_ntz.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_try_make_timestamp_ntz.proto.bin index 7d7e2a8029def..a350a6861ed9a 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_try_make_timestamp_ntz.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_try_make_timestamp_ntz.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_try_make_timestamp_with_timezone.json b/sql/connect/common/src/test/resources/query-tests/queries/function_try_make_timestamp_with_timezone.json index 79e11efc20d41..b2fe4db0dabab 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_try_make_timestamp_with_timezone.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_try_make_timestamp_with_timezone.json @@ -42,7 +42,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "g" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_try_make_timestamp_with_timezone.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_try_make_timestamp_with_timezone.proto.bin index 53b9839cf8c1f..8519bccb5a1a7 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_try_make_timestamp_with_timezone.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_try_make_timestamp_with_timezone.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_try_make_timestamp_without_timezone.json b/sql/connect/common/src/test/resources/query-tests/queries/function_try_make_timestamp_without_timezone.json index 39ce728a38862..549071a8a964c 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_try_make_timestamp_without_timezone.json +++ 
b/sql/connect/common/src/test/resources/query-tests/queries/function_try_make_timestamp_without_timezone.json @@ -38,7 +38,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "b" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_try_make_timestamp_without_timezone.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_try_make_timestamp_without_timezone.proto.bin index 74918d42f89c6..2d8c06fb5a2a5 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_try_make_timestamp_without_timezone.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_try_make_timestamp_without_timezone.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_try_multiply.json b/sql/connect/common/src/test/resources/query-tests/queries/function_try_multiply.json index df22654c82031..0838efb2c1eb3 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_try_multiply.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_try_multiply.json @@ -22,7 +22,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "a" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_try_multiply.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_try_multiply.proto.bin index 8912423235e0b..703d897792bcb 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_try_multiply.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_try_multiply.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_try_parse_json.json b/sql/connect/common/src/test/resources/query-tests/queries/function_try_parse_json.json index 91177eb4a5857..fca2a95a83ab8 100644 --- 
a/sql/connect/common/src/test/resources/query-tests/queries/function_try_parse_json.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_try_parse_json.json @@ -18,7 +18,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "g" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_try_parse_json.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_try_parse_json.proto.bin index cc1f159cfd78c..a97f0801944be 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_try_parse_json.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_try_parse_json.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_try_parse_url.json b/sql/connect/common/src/test/resources/query-tests/queries/function_try_parse_url.json index b9603d5af2634..0b4e011421d6a 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_try_parse_url.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_try_parse_url.json @@ -22,7 +22,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "g" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_try_parse_url.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_try_parse_url.proto.bin index 696c4ddde519c..080be30a46d23 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_try_parse_url.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_try_parse_url.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_try_parse_url_with_key.json b/sql/connect/common/src/test/resources/query-tests/queries/function_try_parse_url_with_key.json index 137ed4bd9bc80..d173704b73354 100644 --- 
a/sql/connect/common/src/test/resources/query-tests/queries/function_try_parse_url_with_key.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_try_parse_url_with_key.json @@ -26,7 +26,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "g" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_try_parse_url_with_key.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_try_parse_url_with_key.proto.bin index f4a13872e3c8f..b39bd2cb28cbb 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_try_parse_url_with_key.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_try_parse_url_with_key.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_try_reflect.json b/sql/connect/common/src/test/resources/query-tests/queries/function_try_reflect.json index de3fae90c2c4b..0d787f37493b6 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_try_reflect.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_try_reflect.json @@ -26,7 +26,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "g" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_try_reflect.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_try_reflect.proto.bin index e38e0e5c06548..5971e17d1041b 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_try_reflect.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_try_reflect.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_try_subtract.json b/sql/connect/common/src/test/resources/query-tests/queries/function_try_subtract.json index f3a5df24cce88..d1bae052a945c 100644 --- 
a/sql/connect/common/src/test/resources/query-tests/queries/function_try_subtract.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_try_subtract.json @@ -22,7 +22,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "a" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_try_subtract.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_try_subtract.proto.bin index f0cb5f5027873..4951a8e3c5fd6 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_try_subtract.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_try_subtract.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_try_sum.json b/sql/connect/common/src/test/resources/query-tests/queries/function_try_sum.json index 41e93d1fcf956..7961eb8eb5596 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_try_sum.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_try_sum.json @@ -18,7 +18,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "a" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_try_sum.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_try_sum.proto.bin index dce7d9df359c9..6e4cbe5a01090 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_try_sum.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_try_sum.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_try_to_binary.json b/sql/connect/common/src/test/resources/query-tests/queries/function_try_to_binary.json index 9b57b6b26b562..164304ef4b0ae 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_try_to_binary.json +++ 
b/sql/connect/common/src/test/resources/query-tests/queries/function_try_to_binary.json @@ -22,7 +22,8 @@ "literal": { "string": "format" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_try_to_binary.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_try_to_binary.proto.bin index 28b7059160757..3a1b90ca42f70 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_try_to_binary.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_try_to_binary.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_try_to_binary_without_format.json b/sql/connect/common/src/test/resources/query-tests/queries/function_try_to_binary_without_format.json index 2498ff9a7872f..6c676dc702a35 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_try_to_binary_without_format.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_try_to_binary_without_format.json @@ -18,7 +18,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "g" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_try_to_binary_without_format.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_try_to_binary_without_format.proto.bin index 682eb1821a3a1..4a7ed3da5738c 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_try_to_binary_without_format.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_try_to_binary_without_format.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_try_to_number.json b/sql/connect/common/src/test/resources/query-tests/queries/function_try_to_number.json index 44e894743dfc8..f206393079de7 100644 --- 
a/sql/connect/common/src/test/resources/query-tests/queries/function_try_to_number.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_try_to_number.json @@ -22,7 +22,8 @@ "literal": { "string": "99,999" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_try_to_number.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_try_to_number.proto.bin index c2eba8a19d5df..2eb9ff68df8fe 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_try_to_number.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_try_to_number.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_try_to_timestamp.json b/sql/connect/common/src/test/resources/query-tests/queries/function_try_to_timestamp.json index d00967823a33c..e0a532043e00c 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_try_to_timestamp.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_try_to_timestamp.json @@ -22,7 +22,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "g" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_try_to_timestamp.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_try_to_timestamp.proto.bin index 4f0300d48a6fc..ff61c6147cd84 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_try_to_timestamp.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_try_to_timestamp.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_try_to_timestamp_without_format.json b/sql/connect/common/src/test/resources/query-tests/queries/function_try_to_timestamp_without_format.json index 4fdfc38ca539b..8589656d5ed52 100644 --- 
a/sql/connect/common/src/test/resources/query-tests/queries/function_try_to_timestamp_without_format.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_try_to_timestamp_without_format.json @@ -18,7 +18,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "g" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_try_to_timestamp_without_format.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_try_to_timestamp_without_format.proto.bin index 91a4156e305f6..72a6868870487 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_try_to_timestamp_without_format.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_try_to_timestamp_without_format.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_try_url_decode.json b/sql/connect/common/src/test/resources/query-tests/queries/function_try_url_decode.json index d51704c8f62e2..e73de5e669362 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_try_url_decode.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_try_url_decode.json @@ -18,7 +18,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "g" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_try_url_decode.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_try_url_decode.proto.bin index 3e84921b12206..e15e5e2d902da 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_try_url_decode.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_try_url_decode.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_try_variant_get.json 
b/sql/connect/common/src/test/resources/query-tests/queries/function_try_variant_get.json index 9a4a4e25f19e6..b8ce69c4bcf39 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_try_variant_get.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_try_variant_get.json @@ -21,7 +21,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "g" } - }] + }], + "isInternal": false } }, { "literal": { @@ -31,7 +32,8 @@ "literal": { "string": "int" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_try_variant_get.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_try_variant_get.proto.bin index b16bbf4c7a4e9..82584c937aaa7 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_try_variant_get.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_try_variant_get.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_typeof.json b/sql/connect/common/src/test/resources/query-tests/queries/function_typeof.json index 7a6fcfcbcf898..7f9808d9fd947 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_typeof.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_typeof.json @@ -18,7 +18,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "g" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_typeof.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_typeof.proto.bin index a042a6e8d7607..585d98f767904 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_typeof.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_typeof.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_ucase.json 
b/sql/connect/common/src/test/resources/query-tests/queries/function_ucase.json index 7193142acdb6f..5580e31e26ffa 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_ucase.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_ucase.json @@ -18,7 +18,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "g" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_ucase.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_ucase.proto.bin index 3e17a01d4b1f5..8a2b70936e0f9 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_ucase.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_ucase.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_unbase64.json b/sql/connect/common/src/test/resources/query-tests/queries/function_unbase64.json index 6af2a00ed160e..af85c7b64779c 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_unbase64.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_unbase64.json @@ -18,7 +18,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "g" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_unbase64.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_unbase64.proto.bin index f37ceb91bf42b..f446e0ad73f45 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_unbase64.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_unbase64.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_unhex.json b/sql/connect/common/src/test/resources/query-tests/queries/function_unhex.json index 7c409d023f76a..1cea642cc9c68 100644 --- 
a/sql/connect/common/src/test/resources/query-tests/queries/function_unhex.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_unhex.json @@ -18,7 +18,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "a" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_unhex.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_unhex.proto.bin index fbac2821fdb07..757eca2dc04d5 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_unhex.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_unhex.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_unix_date.json b/sql/connect/common/src/test/resources/query-tests/queries/function_unix_date.json index 1a7ae09f46dad..2e617c4e6b8b1 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_unix_date.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_unix_date.json @@ -25,9 +25,11 @@ "literal": { "string": "yyyy-MM-dd" } - }] + }], + "isInternal": false } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_unix_date.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_unix_date.proto.bin index 9c05e42bfad30..6ed08ec71d76a 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_unix_date.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_unix_date.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_unix_micros.json b/sql/connect/common/src/test/resources/query-tests/queries/function_unix_micros.json index 07f5cd1d53dbd..f7bb6d9ba6264 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_unix_micros.json +++ 
b/sql/connect/common/src/test/resources/query-tests/queries/function_unix_micros.json @@ -25,9 +25,11 @@ "literal": { "string": "yyyy-MM-dd HH:mm:ss.SSSS" } - }] + }], + "isInternal": false } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_unix_micros.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_unix_micros.proto.bin index c3f44d766f8b1..2574acb3c8d95 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_unix_micros.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_unix_micros.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_unix_millis.json b/sql/connect/common/src/test/resources/query-tests/queries/function_unix_millis.json index aac02cc807aa0..4a7c077e88bc1 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_unix_millis.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_unix_millis.json @@ -25,9 +25,11 @@ "literal": { "string": "yyyy-MM-dd HH:mm:ss.SSSS" } - }] + }], + "isInternal": false } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_unix_millis.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_unix_millis.proto.bin index f0456e03e3fc1..1865aac8a7340 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_unix_millis.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_unix_millis.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_unix_seconds.json b/sql/connect/common/src/test/resources/query-tests/queries/function_unix_seconds.json index 428cb26cd9c86..dc3fc3fae0c05 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_unix_seconds.json +++ 
b/sql/connect/common/src/test/resources/query-tests/queries/function_unix_seconds.json @@ -25,9 +25,11 @@ "literal": { "string": "yyyy-MM-dd HH:mm:ss.SSSS" } - }] + }], + "isInternal": false } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_unix_seconds.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_unix_seconds.proto.bin index fdaf50e7322bb..2ca04f640cda4 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_unix_seconds.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_unix_seconds.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_unix_timestamp.json b/sql/connect/common/src/test/resources/query-tests/queries/function_unix_timestamp.json index e590f7778f2ea..0780a83d74088 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_unix_timestamp.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_unix_timestamp.json @@ -16,9 +16,11 @@ "functionName": "unix_timestamp", "arguments": [{ "unresolvedFunction": { - "functionName": "current_timestamp" + "functionName": "current_timestamp", + "isInternal": false } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_unix_timestamp.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_unix_timestamp.proto.bin index cb3d967ae0123..6f8cf9115629f 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_unix_timestamp.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_unix_timestamp.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_unix_timestamp_with_format.json b/sql/connect/common/src/test/resources/query-tests/queries/function_unix_timestamp_with_format.json 
index d2e087a5d8a24..93699abb33a7b 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_unix_timestamp_with_format.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_unix_timestamp_with_format.json @@ -22,7 +22,8 @@ "literal": { "string": "yyyy-MM-dd HH:mm:ss.SSSS" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_unix_timestamp_with_format.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_unix_timestamp_with_format.proto.bin index ddfcdff63d11a..f838c9dd31912 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_unix_timestamp_with_format.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_unix_timestamp_with_format.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_upper.json b/sql/connect/common/src/test/resources/query-tests/queries/function_upper.json index 208ee9231a13c..36f1f0258ca82 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_upper.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_upper.json @@ -18,7 +18,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "g" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_upper.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_upper.proto.bin index 5ddbfce96e71b..a7d6be43571a0 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_upper.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_upper.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_url_decode.json b/sql/connect/common/src/test/resources/query-tests/queries/function_url_decode.json index d4cdeeb6c48c5..2e8003e4e9a9f 100644 --- 
a/sql/connect/common/src/test/resources/query-tests/queries/function_url_decode.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_url_decode.json @@ -18,7 +18,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "g" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_url_decode.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_url_decode.proto.bin index e347e73c3aef1..ff6fb3793e671 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_url_decode.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_url_decode.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_url_encode.json b/sql/connect/common/src/test/resources/query-tests/queries/function_url_encode.json index 5d221e0fea6f4..9df8a4683ea65 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_url_encode.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_url_encode.json @@ -18,7 +18,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "g" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_url_encode.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_url_encode.proto.bin index 9313fb8249859..eb9e31e63697b 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_url_encode.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_url_encode.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_user.json b/sql/connect/common/src/test/resources/query-tests/queries/function_user.json index aaf3de9ba034e..c3ab4ae4be94c 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_user.json +++ 
b/sql/connect/common/src/test/resources/query-tests/queries/function_user.json @@ -13,7 +13,8 @@ }, "expressions": [{ "unresolvedFunction": { - "functionName": "user" + "functionName": "user", + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_user.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_user.proto.bin index dbd64cae9f360..17b0d6a4ecc90 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_user.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_user.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_var_pop.json b/sql/connect/common/src/test/resources/query-tests/queries/function_var_pop.json index 9c74ce4a984f8..d91c325ef41ba 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_var_pop.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_var_pop.json @@ -18,7 +18,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "a" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_var_pop.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_var_pop.proto.bin index 7ca6e8d3b811b..603e856366f10 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_var_pop.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_var_pop.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_var_samp.json b/sql/connect/common/src/test/resources/query-tests/queries/function_var_samp.json index 979313dd0510d..8132510e61129 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_var_samp.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_var_samp.json @@ -18,7 +18,8 @@ "unresolvedAttribute": { 
"unparsedIdentifier": "a" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_var_samp.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_var_samp.proto.bin index 9bd042ad339e7..99a8d28ec0e72 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_var_samp.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_var_samp.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_variance.json b/sql/connect/common/src/test/resources/query-tests/queries/function_variance.json index 90a97c3becf4d..4bfce573c50c5 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_variance.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_variance.json @@ -18,7 +18,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "a" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_variance.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_variance.proto.bin index fd494fc496391..e3c0f8512c0c6 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_variance.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_variance.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_variant_get.json b/sql/connect/common/src/test/resources/query-tests/queries/function_variant_get.json index ab0acd29d505b..d44a94f365b56 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_variant_get.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_variant_get.json @@ -21,7 +21,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "g" } - }] + }], + "isInternal": false } }, { "literal": { @@ -31,7 +32,8 @@ "literal": { "string": 
"int" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_variant_get.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_variant_get.proto.bin index fe9b76bb97c4a..f1ac3c8c0ad63 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_variant_get.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_variant_get.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_weekday.json b/sql/connect/common/src/test/resources/query-tests/queries/function_weekday.json index b757700291752..82f37d343207e 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_weekday.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_weekday.json @@ -18,7 +18,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "d" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_weekday.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_weekday.proto.bin index 1954103269eb2..cc47e1928103a 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_weekday.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_weekday.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_weekofyear.json b/sql/connect/common/src/test/resources/query-tests/queries/function_weekofyear.json index 3f46a98569e24..de4ce19a12b5e 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_weekofyear.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_weekofyear.json @@ -18,7 +18,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "d" } - }] + }], + "isInternal": false } }] } diff --git 
a/sql/connect/common/src/test/resources/query-tests/queries/function_weekofyear.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_weekofyear.proto.bin index ec9b22522360e..e5c742732e76d 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_weekofyear.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_weekofyear.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_window.json b/sql/connect/common/src/test/resources/query-tests/queries/function_window.json index bdcb6a398800f..95bde679468e0 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_window.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_window.json @@ -30,7 +30,8 @@ "literal": { "string": "0 second" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_window.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_window.proto.bin index 8cffcc1e9f673..303b9673c8ab5 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_window.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_window.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_window_time.json b/sql/connect/common/src/test/resources/query-tests/queries/function_window_time.json index 4809ea21261c4..2107ac2f12ecd 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_window_time.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_window_time.json @@ -35,7 +35,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "wt" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_window_time.proto.bin 
b/sql/connect/common/src/test/resources/query-tests/queries/function_window_time.proto.bin index c143520df08ce..3de3a3e156a2c 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_window_time.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_window_time.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_xpath.json b/sql/connect/common/src/test/resources/query-tests/queries/function_xpath.json index 3dea90a13653d..fd36b378137c6 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_xpath.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_xpath.json @@ -22,7 +22,8 @@ "literal": { "string": "a/b/text()" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_xpath.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_xpath.proto.bin index aabfc76f8a7e1..d07c5d50fc3a4 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_xpath.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_xpath.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_xpath_boolean.json b/sql/connect/common/src/test/resources/query-tests/queries/function_xpath_boolean.json index 793d459ec165b..0749c2d422314 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_xpath_boolean.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_xpath_boolean.json @@ -22,7 +22,8 @@ "literal": { "string": "a/b" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_xpath_boolean.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_xpath_boolean.proto.bin index 544caab4ecc5b..e70d2e5c3edd6 100644 Binary files 
a/sql/connect/common/src/test/resources/query-tests/queries/function_xpath_boolean.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_xpath_boolean.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_xpath_double.json b/sql/connect/common/src/test/resources/query-tests/queries/function_xpath_double.json index f88a06641b8f4..d1c77ae96a86b 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_xpath_double.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_xpath_double.json @@ -22,7 +22,8 @@ "literal": { "string": "a/b" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_xpath_double.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_xpath_double.proto.bin index 9c4ea31712021..de580971683b3 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_xpath_double.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_xpath_double.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_xpath_float.json b/sql/connect/common/src/test/resources/query-tests/queries/function_xpath_float.json index 94932891225d7..b13e12a2d7e9f 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_xpath_float.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_xpath_float.json @@ -22,7 +22,8 @@ "literal": { "string": "a/b" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_xpath_float.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_xpath_float.proto.bin index 32dfbc00cfa44..4285deba56d7c 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_xpath_float.proto.bin and 
b/sql/connect/common/src/test/resources/query-tests/queries/function_xpath_float.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_xpath_int.json b/sql/connect/common/src/test/resources/query-tests/queries/function_xpath_int.json index 0dcef00ed20d4..2a55744cb38c9 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_xpath_int.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_xpath_int.json @@ -22,7 +22,8 @@ "literal": { "string": "a/b" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_xpath_int.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_xpath_int.proto.bin index e6298b37dbe36..afe3b10e4cd86 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_xpath_int.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_xpath_int.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_xpath_long.json b/sql/connect/common/src/test/resources/query-tests/queries/function_xpath_long.json index c740d2bad4f5f..3d4d9267a6a50 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_xpath_long.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_xpath_long.json @@ -22,7 +22,8 @@ "literal": { "string": "a/b" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_xpath_long.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_xpath_long.proto.bin index d240600eabbae..7cb6efd6ab2d0 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_xpath_long.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_xpath_long.proto.bin differ diff --git 
a/sql/connect/common/src/test/resources/query-tests/queries/function_xpath_number.json b/sql/connect/common/src/test/resources/query-tests/queries/function_xpath_number.json index b164bb6a32ac7..cf1303b54d160 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_xpath_number.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_xpath_number.json @@ -22,7 +22,8 @@ "literal": { "string": "a/b" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_xpath_number.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_xpath_number.proto.bin index b967d3e55cc5f..c589c8ecc775a 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_xpath_number.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_xpath_number.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_xpath_short.json b/sql/connect/common/src/test/resources/query-tests/queries/function_xpath_short.json index 5d3a3e9983707..4aa5e3aae7fc9 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_xpath_short.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_xpath_short.json @@ -22,7 +22,8 @@ "literal": { "string": "a/b" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_xpath_short.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_xpath_short.proto.bin index 9ae27bd973853..f407b525cdfa7 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_xpath_short.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_xpath_short.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_xpath_string.json 
b/sql/connect/common/src/test/resources/query-tests/queries/function_xpath_string.json index 26e4130ae2c4b..94fad0de2851f 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_xpath_string.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_xpath_string.json @@ -22,7 +22,8 @@ "literal": { "string": "a/b" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_xpath_string.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_xpath_string.proto.bin index 5384301238b1e..c31ae5065a513 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_xpath_string.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_xpath_string.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_xxhash64.json b/sql/connect/common/src/test/resources/query-tests/queries/function_xxhash64.json index c20739d09ff10..5000f3b164766 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_xxhash64.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_xxhash64.json @@ -30,7 +30,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "g" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_xxhash64.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_xxhash64.proto.bin index 414c76fc5ce7f..de84e70acef5e 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_xxhash64.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_xxhash64.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_year.json b/sql/connect/common/src/test/resources/query-tests/queries/function_year.json index b8a4ee5a16525..9fadb5c411b9b 100644 --- 
a/sql/connect/common/src/test/resources/query-tests/queries/function_year.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_year.json @@ -18,7 +18,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "d" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_year.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_year.proto.bin index 623bc9ac6d81f..91bbfdc180efc 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_year.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_year.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_years.json b/sql/connect/common/src/test/resources/query-tests/queries/function_years.json index 2e87307320271..7b0ab3d287ece 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_years.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_years.json @@ -18,7 +18,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "a" } - }] + }], + "isInternal": true } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_years.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_years.proto.bin index 30c25423fd563..575f56951f017 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_years.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_years.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_zip_with.json b/sql/connect/common/src/test/resources/query-tests/queries/function_zip_with.json index 660ca1931137e..66d53c39742ba 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_zip_with.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_zip_with.json @@ -35,7 
+35,8 @@ "unresolvedNamedLambdaVariable": { "nameParts": ["y_2"] } - }] + }], + "isInternal": false } }, "arguments": [{ @@ -44,7 +45,8 @@ "nameParts": ["y_2"] }] } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_zip_with.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_zip_with.proto.bin index edbfe197af4dc..1bf478358f357 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_zip_with.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_zip_with.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/groupby_agg.json b/sql/connect/common/src/test/resources/query-tests/queries/groupby_agg.json index b7b4c98518e6b..e36ad1de4960d 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/groupby_agg.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/groupby_agg.json @@ -25,7 +25,8 @@ "unparsedIdentifier": "a", "planId": "0" } - }] + }], + "isInternal": false } }, { "unresolvedFunction": { @@ -35,7 +36,8 @@ "unparsedIdentifier": "b", "planId": "0" } - }] + }], + "isInternal": false } }, { "unresolvedFunction": { @@ -45,7 +47,8 @@ "unparsedIdentifier": "b", "planId": "0" } - }] + }], + "isInternal": false } }, { "unresolvedFunction": { @@ -55,7 +58,8 @@ "unparsedIdentifier": "b", "planId": "0" } - }] + }], + "isInternal": false } }, { "unresolvedFunction": { @@ -65,7 +69,8 @@ "unparsedIdentifier": "b", "planId": "0" } - }] + }], + "isInternal": false } }, { "unresolvedFunction": { @@ -75,7 +80,8 @@ "unparsedIdentifier": "b", "planId": "0" } - }] + }], + "isInternal": false } }, { "unresolvedFunction": { @@ -84,7 +90,8 @@ "unresolvedStar": { "planId": "0" } - }] + }], + "isInternal": false } }, { "unresolvedFunction": { @@ -94,7 +101,8 @@ "unparsedIdentifier": "a", "planId": "0" } - }] + }], + "isInternal": false } }] } diff --git 
a/sql/connect/common/src/test/resources/query-tests/queries/groupby_agg.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/groupby_agg.proto.bin index d7b1b94ed04a2..22eacb3a01b03 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/groupby_agg.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/groupby_agg.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/groupby_agg_columns.json b/sql/connect/common/src/test/resources/query-tests/queries/groupby_agg_columns.json index e61616786158e..cf42aabd68160 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/groupby_agg_columns.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/groupby_agg_columns.json @@ -24,7 +24,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "a" } - }] + }], + "isInternal": false } }, { "unresolvedFunction": { @@ -33,7 +34,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "b" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/groupby_agg_columns.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/groupby_agg_columns.proto.bin index d6daa1cc31f7d..a12bd0699df1f 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/groupby_agg_columns.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/groupby_agg_columns.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/groupby_agg_string.json b/sql/connect/common/src/test/resources/query-tests/queries/groupby_agg_string.json index 285c13f4bc8b3..e5261b8a18a09 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/groupby_agg_string.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/groupby_agg_string.json @@ -31,7 +31,8 @@ "unparsedIdentifier": "a", "planId": "0" } - }] + }], + "isInternal": false } }, { "unresolvedFunction": { 
@@ -41,7 +42,8 @@ "unparsedIdentifier": "a", "planId": "0" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/groupby_agg_string.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/groupby_agg_string.proto.bin index 674d506fa4a07..93bb39f16d1a6 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/groupby_agg_string.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/groupby_agg_string.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/groupby_avg.json b/sql/connect/common/src/test/resources/query-tests/queries/groupby_avg.json index 0ded46cf6cc7c..4110779d80c56 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/groupby_avg.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/groupby_avg.json @@ -25,7 +25,8 @@ "unparsedIdentifier": "a", "planId": "0" } - }] + }], + "isInternal": false } }, { "unresolvedFunction": { @@ -35,7 +36,8 @@ "unparsedIdentifier": "b", "planId": "0" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/groupby_avg.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/groupby_avg.proto.bin index 444b0c3853f16..d43d816891aa6 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/groupby_avg.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/groupby_avg.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/groupby_count.json b/sql/connect/common/src/test/resources/query-tests/queries/groupby_count.json index f92e22493e07b..04d6e91f9c7cf 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/groupby_count.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/groupby_count.json @@ -26,7 +26,8 @@ "literal": { "integer": 1 } - }] + }], + "isInternal": false } }, 
"name": ["count"] diff --git a/sql/connect/common/src/test/resources/query-tests/queries/groupby_count.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/groupby_count.proto.bin index 5bb539195df9a..fd009c3a636ca 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/groupby_count.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/groupby_count.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/groupby_max.json b/sql/connect/common/src/test/resources/query-tests/queries/groupby_max.json index ed186ff713519..643f6c0676a30 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/groupby_max.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/groupby_max.json @@ -25,7 +25,8 @@ "unparsedIdentifier": "a", "planId": "0" } - }] + }], + "isInternal": false } }, { "unresolvedFunction": { @@ -35,7 +36,8 @@ "unparsedIdentifier": "b", "planId": "0" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/groupby_max.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/groupby_max.proto.bin index 11cd163e91738..7bb503d75e92f 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/groupby_max.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/groupby_max.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/groupby_mean.json b/sql/connect/common/src/test/resources/query-tests/queries/groupby_mean.json index 0ded46cf6cc7c..4110779d80c56 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/groupby_mean.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/groupby_mean.json @@ -25,7 +25,8 @@ "unparsedIdentifier": "a", "planId": "0" } - }] + }], + "isInternal": false } }, { "unresolvedFunction": { @@ -35,7 +36,8 @@ "unparsedIdentifier": "b", "planId": "0" } - }] + 
}], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/groupby_mean.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/groupby_mean.proto.bin index 444b0c3853f16..d43d816891aa6 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/groupby_mean.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/groupby_mean.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/groupby_min.json b/sql/connect/common/src/test/resources/query-tests/queries/groupby_min.json index 8c0ad283cb0a4..3d7546547d98f 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/groupby_min.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/groupby_min.json @@ -25,7 +25,8 @@ "unparsedIdentifier": "a", "planId": "0" } - }] + }], + "isInternal": false } }, { "unresolvedFunction": { @@ -35,7 +36,8 @@ "unparsedIdentifier": "b", "planId": "0" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/groupby_min.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/groupby_min.proto.bin index 2bc985a1fe9f3..cd0488b381612 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/groupby_min.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/groupby_min.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/groupby_sum.json b/sql/connect/common/src/test/resources/query-tests/queries/groupby_sum.json index 788b964491c6a..61133d3fe4321 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/groupby_sum.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/groupby_sum.json @@ -25,7 +25,8 @@ "unparsedIdentifier": "a", "planId": "0" } - }] + }], + "isInternal": false } }, { "unresolvedFunction": { @@ -35,7 +36,8 @@ "unparsedIdentifier": "b", "planId": "0" } - 
}] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/groupby_sum.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/groupby_sum.proto.bin index e92041399cbca..d5f8bc8c47698 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/groupby_sum.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/groupby_sum.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/groupingSets.json b/sql/connect/common/src/test/resources/query-tests/queries/groupingSets.json index 6e84824ec7a3a..a81c24a9fa077 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/groupingSets.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/groupingSets.json @@ -25,7 +25,8 @@ "unparsedIdentifier": "a", "planId": "0" } - }] + }], + "isInternal": false } }, { "unresolvedFunction": { @@ -35,7 +36,8 @@ "unparsedIdentifier": "a", "planId": "0" } - }] + }], + "isInternal": false } }], "groupingSets": [{ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/groupingSets.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/groupingSets.proto.bin index ce0294096706e..48d51737b9a41 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/groupingSets.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/groupingSets.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/grouping_and_grouping_id.json b/sql/connect/common/src/test/resources/query-tests/queries/grouping_and_grouping_id.json index 8ff81d95d2988..c1c1bcdf0dc87 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/grouping_and_grouping_id.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/grouping_and_grouping_id.json @@ -30,7 +30,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "a" } - }] + }], + "isInternal": false } }, 
{ "unresolvedFunction": { @@ -39,7 +40,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "b" } - }] + }], + "isInternal": false } }, { "unresolvedFunction": { @@ -52,7 +54,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "b" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/grouping_and_grouping_id.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/grouping_and_grouping_id.proto.bin index d1dded43ddf99..41e1a426e25b9 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/grouping_and_grouping_id.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/grouping_and_grouping_id.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/hll_sketch_agg.json b/sql/connect/common/src/test/resources/query-tests/queries/hll_sketch_agg.json index 52f66cf2dc6b9..914f304f56a2e 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/hll_sketch_agg.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/hll_sketch_agg.json @@ -18,7 +18,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "bytes" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/hll_sketch_agg.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/hll_sketch_agg.proto.bin index 68b74817c3268..ba473ed2e2855 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/hll_sketch_agg.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/hll_sketch_agg.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/hll_sketch_agg_with_columnName.json b/sql/connect/common/src/test/resources/query-tests/queries/hll_sketch_agg_with_columnName.json index 52f66cf2dc6b9..914f304f56a2e 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/hll_sketch_agg_with_columnName.json +++ 
b/sql/connect/common/src/test/resources/query-tests/queries/hll_sketch_agg_with_columnName.json @@ -18,7 +18,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "bytes" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/hll_sketch_agg_with_columnName.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/hll_sketch_agg_with_columnName.proto.bin index 68b74817c3268..ba473ed2e2855 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/hll_sketch_agg_with_columnName.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/hll_sketch_agg_with_columnName.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/hll_sketch_agg_with_columnName_lgConfigK_int.json b/sql/connect/common/src/test/resources/query-tests/queries/hll_sketch_agg_with_columnName_lgConfigK_int.json index fbd4ca05d9e99..590187f7fe63b 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/hll_sketch_agg_with_columnName_lgConfigK_int.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/hll_sketch_agg_with_columnName_lgConfigK_int.json @@ -22,7 +22,8 @@ "literal": { "integer": 0 } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/hll_sketch_agg_with_columnName_lgConfigK_int.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/hll_sketch_agg_with_columnName_lgConfigK_int.proto.bin index bea4e1a642ab2..5b83337b80bee 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/hll_sketch_agg_with_columnName_lgConfigK_int.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/hll_sketch_agg_with_columnName_lgConfigK_int.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/hll_sketch_agg_with_column_lgConfigK.json 
b/sql/connect/common/src/test/resources/query-tests/queries/hll_sketch_agg_with_column_lgConfigK.json index fbd4ca05d9e99..590187f7fe63b 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/hll_sketch_agg_with_column_lgConfigK.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/hll_sketch_agg_with_column_lgConfigK.json @@ -22,7 +22,8 @@ "literal": { "integer": 0 } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/hll_sketch_agg_with_column_lgConfigK.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/hll_sketch_agg_with_column_lgConfigK.proto.bin index bea4e1a642ab2..5b83337b80bee 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/hll_sketch_agg_with_column_lgConfigK.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/hll_sketch_agg_with_column_lgConfigK.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/hll_sketch_agg_with_column_lgConfigK_int.json b/sql/connect/common/src/test/resources/query-tests/queries/hll_sketch_agg_with_column_lgConfigK_int.json index fbd4ca05d9e99..590187f7fe63b 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/hll_sketch_agg_with_column_lgConfigK_int.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/hll_sketch_agg_with_column_lgConfigK_int.json @@ -22,7 +22,8 @@ "literal": { "integer": 0 } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/hll_sketch_agg_with_column_lgConfigK_int.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/hll_sketch_agg_with_column_lgConfigK_int.proto.bin index bea4e1a642ab2..5b83337b80bee 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/hll_sketch_agg_with_column_lgConfigK_int.proto.bin and 
b/sql/connect/common/src/test/resources/query-tests/queries/hll_sketch_agg_with_column_lgConfigK_int.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/hll_union_agg.json b/sql/connect/common/src/test/resources/query-tests/queries/hll_union_agg.json index 74b3e7c4a7410..216afd0f5975a 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/hll_union_agg.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/hll_union_agg.json @@ -18,7 +18,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "bytes" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/hll_union_agg.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/hll_union_agg.proto.bin index e19b476247a24..309e80a86ee38 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/hll_union_agg.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/hll_union_agg.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/hll_union_agg_with_columnName.json b/sql/connect/common/src/test/resources/query-tests/queries/hll_union_agg_with_columnName.json index 74b3e7c4a7410..216afd0f5975a 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/hll_union_agg_with_columnName.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/hll_union_agg_with_columnName.json @@ -18,7 +18,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "bytes" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/hll_union_agg_with_columnName.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/hll_union_agg_with_columnName.proto.bin index e19b476247a24..309e80a86ee38 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/hll_union_agg_with_columnName.proto.bin and 
b/sql/connect/common/src/test/resources/query-tests/queries/hll_union_agg_with_columnName.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/hll_union_agg_with_columnName_allowDifferentLgConfigK_boolean.json b/sql/connect/common/src/test/resources/query-tests/queries/hll_union_agg_with_columnName_allowDifferentLgConfigK_boolean.json index bb6413a94ced3..e733e086af5d6 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/hll_union_agg_with_columnName_allowDifferentLgConfigK_boolean.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/hll_union_agg_with_columnName_allowDifferentLgConfigK_boolean.json @@ -22,7 +22,8 @@ "literal": { "boolean": false } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/hll_union_agg_with_columnName_allowDifferentLgConfigK_boolean.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/hll_union_agg_with_columnName_allowDifferentLgConfigK_boolean.proto.bin index 4f7f236583949..c71e656127200 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/hll_union_agg_with_columnName_allowDifferentLgConfigK_boolean.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/hll_union_agg_with_columnName_allowDifferentLgConfigK_boolean.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/hll_union_agg_with_column_allowDifferentLgConfigK.json b/sql/connect/common/src/test/resources/query-tests/queries/hll_union_agg_with_column_allowDifferentLgConfigK.json index bb6413a94ced3..e733e086af5d6 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/hll_union_agg_with_column_allowDifferentLgConfigK.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/hll_union_agg_with_column_allowDifferentLgConfigK.json @@ -22,7 +22,8 @@ "literal": { "boolean": false } - }] + }], + "isInternal": false } }] } diff --git 
a/sql/connect/common/src/test/resources/query-tests/queries/hll_union_agg_with_column_allowDifferentLgConfigK.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/hll_union_agg_with_column_allowDifferentLgConfigK.proto.bin index 4f7f236583949..c71e656127200 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/hll_union_agg_with_column_allowDifferentLgConfigK.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/hll_union_agg_with_column_allowDifferentLgConfigK.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/hll_union_agg_with_column_allowDifferentLgConfigK_boolean.json b/sql/connect/common/src/test/resources/query-tests/queries/hll_union_agg_with_column_allowDifferentLgConfigK_boolean.json index bb6413a94ced3..e733e086af5d6 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/hll_union_agg_with_column_allowDifferentLgConfigK_boolean.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/hll_union_agg_with_column_allowDifferentLgConfigK_boolean.json @@ -22,7 +22,8 @@ "literal": { "boolean": false } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/hll_union_agg_with_column_allowDifferentLgConfigK_boolean.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/hll_union_agg_with_column_allowDifferentLgConfigK_boolean.proto.bin index 4f7f236583949..c71e656127200 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/hll_union_agg_with_column_allowDifferentLgConfigK_boolean.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/hll_union_agg_with_column_allowDifferentLgConfigK_boolean.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/join_condition.json b/sql/connect/common/src/test/resources/query-tests/queries/join_condition.json index 993cd98a7dd16..7151d0420f6b5 100644 --- 
a/sql/connect/common/src/test/resources/query-tests/queries/join_condition.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/join_condition.json @@ -46,7 +46,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "r.id" } - }] + }], + "isInternal": false } }, "joinType": "JOIN_TYPE_LEFT_ANTI" diff --git a/sql/connect/common/src/test/resources/query-tests/queries/join_condition.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/join_condition.proto.bin index 1d11fe5e75bcc..4784998b49cca 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/join_condition.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/join_condition.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/join_inner_condition.json b/sql/connect/common/src/test/resources/query-tests/queries/join_inner_condition.json index 527338c56ae60..9308d6babdb25 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/join_inner_condition.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/join_inner_condition.json @@ -46,7 +46,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "r.a" } - }] + }], + "isInternal": false } }, "joinType": "JOIN_TYPE_INNER" diff --git a/sql/connect/common/src/test/resources/query-tests/queries/join_inner_condition.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/join_inner_condition.proto.bin index 5d3de55da9cf8..a49cc6ef30806 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/join_inner_condition.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/join_inner_condition.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/pivot.json b/sql/connect/common/src/test/resources/query-tests/queries/pivot.json index 2af86606b9fcb..f085d1a43b678 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/pivot.json +++ 
b/sql/connect/common/src/test/resources/query-tests/queries/pivot.json @@ -24,7 +24,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "b" } - }] + }], + "isInternal": false } }], "pivot": { diff --git a/sql/connect/common/src/test/resources/query-tests/queries/pivot.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/pivot.proto.bin index f545179e84968..73c88bf97535e 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/pivot.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/pivot.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/pivot_without_column_values.json b/sql/connect/common/src/test/resources/query-tests/queries/pivot_without_column_values.json index aa043613795c4..9d5b22bce6e89 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/pivot_without_column_values.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/pivot_without_column_values.json @@ -24,7 +24,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "b" } - }] + }], + "isInternal": false } }], "pivot": { diff --git a/sql/connect/common/src/test/resources/query-tests/queries/pivot_without_column_values.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/pivot_without_column_values.proto.bin index 588b56f247e07..d722db0e17ea9 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/pivot_without_column_values.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/pivot_without_column_values.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/rollup_column.json b/sql/connect/common/src/test/resources/query-tests/queries/rollup_column.json index 1102db18830bd..146904dc898e3 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/rollup_column.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/rollup_column.json @@ -30,7 +30,8 @@ 
"literal": { "integer": 1 } - }] + }], + "isInternal": false } }, "name": ["count"] diff --git a/sql/connect/common/src/test/resources/query-tests/queries/rollup_column.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/rollup_column.proto.bin index 64dbb597c3650..8949050821a12 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/rollup_column.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/rollup_column.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/rollup_string.json b/sql/connect/common/src/test/resources/query-tests/queries/rollup_string.json index 5082051031f81..6fe3659064e79 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/rollup_string.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/rollup_string.json @@ -32,7 +32,8 @@ "literal": { "integer": 1 } - }] + }], + "isInternal": false } }, "name": ["count"] diff --git a/sql/connect/common/src/test/resources/query-tests/queries/rollup_string.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/rollup_string.proto.bin index 63fdead641dad..3843ae77a9bfc 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/rollup_string.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/rollup_string.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/select_typed_1-arg.json b/sql/connect/common/src/test/resources/query-tests/queries/select_typed_1-arg.json index 90ef62c5f415b..e3dcf84ae9c39 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/select_typed_1-arg.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/select_typed_1-arg.json @@ -28,11 +28,14 @@ "unresolvedAttribute": { "unparsedIdentifier": "a" } - }] + }], + "isInternal": false } - }] + }], + "isInternal": false } - }] + }], + "isInternal": false } }] } diff --git 
a/sql/connect/common/src/test/resources/query-tests/queries/select_typed_1-arg.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/select_typed_1-arg.proto.bin index 2273a16d4e6a8..b72a7233c4c04 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/select_typed_1-arg.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/select_typed_1-arg.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/select_typed_2-arg.json b/sql/connect/common/src/test/resources/query-tests/queries/select_typed_2-arg.json index c9c6c75235694..d144dcf8b8af5 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/select_typed_2-arg.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/select_typed_2-arg.json @@ -22,7 +22,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "a" } - }] + }], + "isInternal": false } }, { "unresolvedFunction": { @@ -35,7 +36,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "b" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/select_typed_2-arg.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/select_typed_2-arg.proto.bin index 37f3915cd8d18..18763400b4abb 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/select_typed_2-arg.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/select_typed_2-arg.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/select_typed_3-arg.json b/sql/connect/common/src/test/resources/query-tests/queries/select_typed_3-arg.json index 23850dcb136ef..55b64d26d4904 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/select_typed_3-arg.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/select_typed_3-arg.json @@ -22,7 +22,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "a" } - }] + }], + "isInternal": false 
} }, { "unresolvedFunction": { @@ -35,7 +36,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "b" } - }] + }], + "isInternal": false } }, { "unresolvedFunction": { @@ -48,7 +50,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "a" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/select_typed_3-arg.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/select_typed_3-arg.proto.bin index b3b56953a8586..d535835523de3 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/select_typed_3-arg.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/select_typed_3-arg.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/select_typed_4-arg.json b/sql/connect/common/src/test/resources/query-tests/queries/select_typed_4-arg.json index 2bbdb60794db5..da0adf605f977 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/select_typed_4-arg.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/select_typed_4-arg.json @@ -22,7 +22,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "a" } - }] + }], + "isInternal": false } }, { "unresolvedFunction": { @@ -35,7 +36,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "a" } - }] + }], + "isInternal": false } }, { "unresolvedFunction": { @@ -48,7 +50,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "a" } - }] + }], + "isInternal": false } }, { "unresolvedFunction": { @@ -61,7 +64,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "a" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/select_typed_4-arg.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/select_typed_4-arg.proto.bin index bacccff22ae0a..50197b862ad14 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/select_typed_4-arg.proto.bin and 
b/sql/connect/common/src/test/resources/query-tests/queries/select_typed_4-arg.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/select_typed_5-arg.json b/sql/connect/common/src/test/resources/query-tests/queries/select_typed_5-arg.json index 4f57c0ef82145..196a91b9fbb81 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/select_typed_5-arg.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/select_typed_5-arg.json @@ -22,7 +22,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "a" } - }] + }], + "isInternal": false } }, { "unresolvedFunction": { @@ -35,7 +36,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "b" } - }] + }], + "isInternal": false } }, { "unresolvedFunction": { @@ -48,7 +50,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "a" } - }] + }], + "isInternal": false } }, { "unresolvedFunction": { @@ -61,7 +64,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "a" } - }] + }], + "isInternal": false } }, { "unresolvedFunction": { @@ -74,7 +78,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "b" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/select_typed_5-arg.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/select_typed_5-arg.proto.bin index 2c51e2088885f..e2ff25edd34cd 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/select_typed_5-arg.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/select_typed_5-arg.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/toJSON.json b/sql/connect/common/src/test/resources/query-tests/queries/toJSON.json index 9a99a18853cf1..9faba08d9792c 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/toJSON.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/toJSON.json @@ -21,9 +21,11 @@ "unresolvedStar": { "planId": "0" } - }] + }], + 
"isInternal": false } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/toJSON.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/toJSON.proto.bin index e930ee76aae97..0cd2c3d35c6b4 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/toJSON.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/toJSON.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/to_avro_with_schema.json b/sql/connect/common/src/test/resources/query-tests/queries/to_avro_with_schema.json index 6079e13bbfc93..b632fba4a0192 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/to_avro_with_schema.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/to_avro_with_schema.json @@ -22,7 +22,8 @@ "literal": { "string": "{\"type\": \"int\", \"name\": \"id\"}" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/to_avro_with_schema.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/to_avro_with_schema.proto.bin index 2843fbb67fecf..6c3907802968c 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/to_avro_with_schema.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/to_avro_with_schema.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/to_avro_without_schema.json b/sql/connect/common/src/test/resources/query-tests/queries/to_avro_without_schema.json index fa19d2120b94f..dd289a6abcc16 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/to_avro_without_schema.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/to_avro_without_schema.json @@ -18,7 +18,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "id" } - }] + }], + "isInternal": false } }] } diff --git 
a/sql/connect/common/src/test/resources/query-tests/queries/to_avro_without_schema.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/to_avro_without_schema.proto.bin index 4e7251125e4ce..59bce6aac25c7 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/to_avro_without_schema.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/to_avro_without_schema.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/to_protobuf_messageClassName.json b/sql/connect/common/src/test/resources/query-tests/queries/to_protobuf_messageClassName.json index 921c1b800a089..e71bddc0b19a0 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/to_protobuf_messageClassName.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/to_protobuf_messageClassName.json @@ -22,7 +22,8 @@ "literal": { "string": "org.apache.spark.connect.proto.StorageLevel" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/to_protobuf_messageClassName.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/to_protobuf_messageClassName.proto.bin index 5cc7c49882c03..f49e6d227ddcd 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/to_protobuf_messageClassName.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/to_protobuf_messageClassName.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/to_protobuf_messageClassName_descFilePath.json b/sql/connect/common/src/test/resources/query-tests/queries/to_protobuf_messageClassName_descFilePath.json index 0843b469384e0..c6ccee6f35c3f 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/to_protobuf_messageClassName_descFilePath.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/to_protobuf_messageClassName_descFilePath.json @@ -26,7 +26,8 @@ "literal": { 
"binary": "CvwBCgxjb21tb24ucHJvdG8SDXNwYXJrLmNvbm5lY3QisAEKDFN0b3JhZ2VMZXZlbBIZCgh1c2VfZGlzaxgBIAEoCFIHdXNlRGlzaxIdCgp1c2VfbWVtb3J5GAIgASgIUgl1c2VNZW1vcnkSIAoMdXNlX29mZl9oZWFwGAMgASgIUgp1c2VPZmZIZWFwEiIKDGRlc2VyaWFsaXplZBgEIAEoCFIMZGVzZXJpYWxpemVkEiAKC3JlcGxpY2F0aW9uGAUgASgFUgtyZXBsaWNhdGlvbkIiCh5vcmcuYXBhY2hlLnNwYXJrLmNvbm5lY3QucHJvdG9QAWIGcHJvdG8z" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/to_protobuf_messageClassName_descFilePath.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/to_protobuf_messageClassName_descFilePath.proto.bin index c3fe14aef47da..c30bc963ce0eb 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/to_protobuf_messageClassName_descFilePath.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/to_protobuf_messageClassName_descFilePath.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/to_protobuf_messageClassName_descFilePath_options.json b/sql/connect/common/src/test/resources/query-tests/queries/to_protobuf_messageClassName_descFilePath_options.json index 76307b3141f7f..bc676b7aa5b1c 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/to_protobuf_messageClassName_descFilePath_options.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/to_protobuf_messageClassName_descFilePath_options.json @@ -37,9 +37,11 @@ "literal": { "string": "2" } - }] + }], + "isInternal": false } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/to_protobuf_messageClassName_descFilePath_options.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/to_protobuf_messageClassName_descFilePath_options.proto.bin index a387611c1ad55..971d6b358711c 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/to_protobuf_messageClassName_descFilePath_options.proto.bin and 
b/sql/connect/common/src/test/resources/query-tests/queries/to_protobuf_messageClassName_descFilePath_options.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/to_protobuf_messageClassName_options.json b/sql/connect/common/src/test/resources/query-tests/queries/to_protobuf_messageClassName_options.json index 8787f0fc15d77..fa3d57a251cb6 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/to_protobuf_messageClassName_options.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/to_protobuf_messageClassName_options.json @@ -33,9 +33,11 @@ "literal": { "string": "2" } - }] + }], + "isInternal": false } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/to_protobuf_messageClassName_options.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/to_protobuf_messageClassName_options.proto.bin index 9ef8348446ad4..7f955b1013fe1 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/to_protobuf_messageClassName_options.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/to_protobuf_messageClassName_options.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/where_column.json b/sql/connect/common/src/test/resources/query-tests/queries/where_column.json index bef80a7e6ed5a..15cca60f2a407 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/where_column.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/where_column.json @@ -22,7 +22,8 @@ "literal": { "long": "1" } - }] + }], + "isInternal": false } } } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/where_column.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/where_column.proto.bin index e472ed0715b62..ef377b7044366 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/where_column.proto.bin and 
b/sql/connect/common/src/test/resources/query-tests/queries/where_column.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/width_bucket.json b/sql/connect/common/src/test/resources/query-tests/queries/width_bucket.json index 93d3b5297d9e1..3e7c112776655 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/width_bucket.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/width_bucket.json @@ -30,7 +30,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "a" } - }] + }], + "isInternal": false } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/width_bucket.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/width_bucket.proto.bin index f212e97bc1c5a..2667912763248 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/width_bucket.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/width_bucket.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/window.json b/sql/connect/common/src/test/resources/query-tests/queries/window.json index 23fd5c1556ec5..ad9555f3ae898 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/window.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/window.json @@ -20,7 +20,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "id" } - }] + }], + "isInternal": false } }, "partitionSpec": [{ @@ -42,7 +43,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "id" } - }] + }], + "isInternal": false } }, "partitionSpec": [{ @@ -64,7 +66,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "id" } - }] + }], + "isInternal": false } }, "orderSpec": [{ @@ -94,7 +97,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "id" } - }] + }], + "isInternal": false } }, "orderSpec": [{ @@ -124,7 +128,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "id" } - }] + }], + "isInternal": false } }, "orderSpec": [{ @@ -163,7 +168,8 @@ 
"unresolvedAttribute": { "unparsedIdentifier": "id" } - }] + }], + "isInternal": false } }, "orderSpec": [{ @@ -202,7 +208,8 @@ "unresolvedAttribute": { "unparsedIdentifier": "id" } - }] + }], + "isInternal": false } } } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/window.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/window.proto.bin index a89c0d6a6a3f4..01616601af0ea 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/window.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/window.proto.bin differ diff --git a/sql/connect/server/src/main/scala/org/apache/spark/sql/connect/execution/ExecuteGrpcResponseSender.scala b/sql/connect/server/src/main/scala/org/apache/spark/sql/connect/execution/ExecuteGrpcResponseSender.scala index 051093fcad277..21b5e057fb77e 100644 --- a/sql/connect/server/src/main/scala/org/apache/spark/sql/connect/execution/ExecuteGrpcResponseSender.scala +++ b/sql/connect/server/src/main/scala/org/apache/spark/sql/connect/execution/ExecuteGrpcResponseSender.scala @@ -241,14 +241,13 @@ private[connect] class ExecuteGrpcResponseSender[T <: Message]( // The state of interrupted, response and lastIndex are changed under executionObserver // monitor, and will notify upon state change. if (response.isEmpty) { + var timeout = Math.max(1, deadlineTimeMillis - System.currentTimeMillis()) // Wake up more frequently to send the progress updates. val progressTimeout = executeHolder.sessionHolder.session.sessionState.conf .getConf(CONNECT_PROGRESS_REPORT_INTERVAL) // If the progress feature is disabled, wait for the deadline. 
- val timeout = if (progressTimeout > 0) { - progressTimeout - } else { - Math.max(1, deadlineTimeMillis - System.currentTimeMillis()) + if (progressTimeout > 0L) { + timeout = Math.min(progressTimeout, timeout) } logTrace(s"Wait for response to become available with timeout=$timeout ms.") executionObserver.responseLock.wait(timeout) @@ -291,7 +290,7 @@ private[connect] class ExecuteGrpcResponseSender[T <: Message]( assert(finished == false) } else { // If it wasn't sent, time deadline must have been reached before stream became available, - // or it was intterupted. Will exit in the next loop iterattion. + // or it was interrupted. Will exit in the next loop iterattion. assert(deadlineLimitReached || interrupted) } } else if (streamFinished) { diff --git a/sql/connect/server/src/main/scala/org/apache/spark/sql/connect/execution/ExecuteThreadRunner.scala b/sql/connect/server/src/main/scala/org/apache/spark/sql/connect/execution/ExecuteThreadRunner.scala index d27f390a23f95..05e3395a53169 100644 --- a/sql/connect/server/src/main/scala/org/apache/spark/sql/connect/execution/ExecuteThreadRunner.scala +++ b/sql/connect/server/src/main/scala/org/apache/spark/sql/connect/execution/ExecuteThreadRunner.scala @@ -245,7 +245,7 @@ private[connect] class ExecuteThreadRunner(executeHolder: ExecuteHolder) extends .createObservedMetricsResponse( executeHolder.sessionHolder.sessionId, executeHolder.sessionHolder.serverSessionId, - executeHolder.request.getPlan.getRoot.getCommon.getPlanId, + executeHolder.allObservationAndPlanIds, observedMetrics ++ accumulatedInPython)) } diff --git a/sql/connect/server/src/main/scala/org/apache/spark/sql/connect/execution/SparkConnectPlanExecution.scala b/sql/connect/server/src/main/scala/org/apache/spark/sql/connect/execution/SparkConnectPlanExecution.scala index c0fd00b2eeaa7..5e3499573e9d9 100644 --- a/sql/connect/server/src/main/scala/org/apache/spark/sql/connect/execution/SparkConnectPlanExecution.scala +++ 
b/sql/connect/server/src/main/scala/org/apache/spark/sql/connect/execution/SparkConnectPlanExecution.scala @@ -77,8 +77,10 @@ private[execution] class SparkConnectPlanExecution(executeHolder: ExecuteHolder) responseObserver.onNext(createSchemaResponse(request.getSessionId, dataframe.schema)) processAsArrowBatches(dataframe, responseObserver, executeHolder) responseObserver.onNext(MetricGenerator.createMetricsResponse(sessionHolder, dataframe)) - createObservedMetricsResponse(request.getSessionId, dataframe).foreach( - responseObserver.onNext) + createObservedMetricsResponse( + request.getSessionId, + executeHolder.allObservationAndPlanIds, + dataframe).foreach(responseObserver.onNext) } type Batch = (Array[Byte], Long) @@ -255,6 +257,7 @@ private[execution] class SparkConnectPlanExecution(executeHolder: ExecuteHolder) private def createObservedMetricsResponse( sessionId: String, + observationAndPlanIds: Map[String, Long], dataframe: DataFrame): Option[ExecutePlanResponse] = { val observedMetrics = dataframe.queryExecution.observedMetrics.collect { case (name, row) if !executeHolder.observations.contains(name) => @@ -264,13 +267,12 @@ private[execution] class SparkConnectPlanExecution(executeHolder: ExecuteHolder) name -> values } if (observedMetrics.nonEmpty) { - val planId = executeHolder.request.getPlan.getRoot.getCommon.getPlanId Some( SparkConnectPlanExecution .createObservedMetricsResponse( sessionId, sessionHolder.serverSessionId, - planId, + observationAndPlanIds, observedMetrics)) } else None } @@ -280,17 +282,17 @@ object SparkConnectPlanExecution { def createObservedMetricsResponse( sessionId: String, serverSessionId: String, - planId: Long, + observationAndPlanIds: Map[String, Long], metrics: Map[String, Seq[(Option[String], Any)]]): ExecutePlanResponse = { val observedMetrics = metrics.map { case (name, values) => val metrics = ExecutePlanResponse.ObservedMetrics .newBuilder() .setName(name) - .setPlanId(planId) values.foreach { case (key, value) => 
metrics.addValues(toLiteralProto(value)) key.foreach(metrics.addKeys) } + observationAndPlanIds.get(name).foreach(metrics.setPlanId) metrics.build() } // Prepare a response with the observed metrics. diff --git a/sql/connect/server/src/main/scala/org/apache/spark/sql/connect/ml/MLCache.scala b/sql/connect/server/src/main/scala/org/apache/spark/sql/connect/ml/MLCache.scala new file mode 100644 index 0000000000000..a036f8b67350d --- /dev/null +++ b/sql/connect/server/src/main/scala/org/apache/spark/sql/connect/ml/MLCache.scala @@ -0,0 +1,70 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.sql.connect.ml + +import java.util.UUID +import java.util.concurrent.ConcurrentHashMap + +import org.apache.spark.internal.Logging + +/** + * MLCache is for caching ML objects, typically for models and summaries evaluated by a model. 
+ */ +private[connect] class MLCache extends Logging { + private val cachedModel: ConcurrentHashMap[String, Object] = + new ConcurrentHashMap[String, Object]() + + /** + * Cache an object into a map of MLCache, and return its key + * @param obj + * the object to be cached + * @return + * the key + */ + def register(obj: Object): String = { + val objectId = UUID.randomUUID().toString + cachedModel.put(objectId, obj) + objectId + } + + /** + * Get the object by the key + * @param refId + * the key used to look up the corresponding object + * @return + * the cached object + */ + def get(refId: String): Object = { + cachedModel.get(refId) + } + + /** + * Remove the object from MLCache + * @param refId + * the key used to look up the corresponding object + */ + def remove(refId: String): Unit = { + cachedModel.remove(refId) + } + + /** + * Clear all the caches + */ + def clear(): Unit = { + cachedModel.clear() + } +} diff --git a/sql/connect/server/src/main/scala/org/apache/spark/sql/connect/ml/MLException.scala b/sql/connect/server/src/main/scala/org/apache/spark/sql/connect/ml/MLException.scala new file mode 100644 index 0000000000000..eb88bf9169d3d --- /dev/null +++ b/sql/connect/server/src/main/scala/org/apache/spark/sql/connect/ml/MLException.scala @@ -0,0 +1,32 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.connect.ml + +import org.apache.spark.SparkException + +private[spark] case class MlUnsupportedException(message: String) + extends SparkException( + errorClass = "CONNECT_ML.UNSUPPORTED_EXCEPTION", + messageParameters = Map("message" -> message), + cause = null) + +private[spark] case class MLAttributeNotAllowedException(attribute: String) + extends SparkException( + errorClass = "CONNECT_ML.ATTRIBUTE_NOT_ALLOWED", + messageParameters = Map("attribute" -> attribute), + cause = null) diff --git a/sql/connect/server/src/main/scala/org/apache/spark/sql/connect/ml/MLHandler.scala b/sql/connect/server/src/main/scala/org/apache/spark/sql/connect/ml/MLHandler.scala new file mode 100644 index 0000000000000..b4bc6bfdc66b4 --- /dev/null +++ b/sql/connect/server/src/main/scala/org/apache/spark/sql/connect/ml/MLHandler.scala @@ -0,0 +1,268 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.connect.ml + +import scala.jdk.CollectionConverters.CollectionHasAsScala + +import org.apache.spark.connect.proto +import org.apache.spark.internal.Logging +import org.apache.spark.ml.{Estimator, Model} +import org.apache.spark.ml.param.ParamMap +import org.apache.spark.ml.util.{MLWritable, Summary} +import org.apache.spark.sql.DataFrame +import org.apache.spark.sql.connect.common.LiteralValueProtoConverter +import org.apache.spark.sql.connect.ml.Serializer.deserializeMethodArguments +import org.apache.spark.sql.connect.service.SessionHolder + +private case class Method( + name: String, + argValues: Array[Object] = Array.empty, + argClasses: Array[Class[_]] = Array.empty) + +/** + * Helper function to get the attribute from an object by reflection + */ +private class AttributeHelper( + val sessionHolder: SessionHolder, + val objRef: String, + val methods: Array[Method]) { + protected lazy val instance = sessionHolder.mlCache.get(objRef) + // Get the attribute by reflection + def getAttribute: Any = { + assert(methods.length >= 1) + methods.foldLeft(instance) { (obj, m) => + if (m.argValues.isEmpty) { + MLUtils.invokeMethodAllowed(obj, m.name) + } else { + MLUtils.invokeMethodAllowed(obj, m.name, m.argValues, m.argClasses) + } + } + } +} + +// Model specific attribute helper with transform supported +private class ModelAttributeHelper( + sessionHolder: SessionHolder, + objRef: String, + methods: Array[Method]) + extends AttributeHelper(sessionHolder, objRef, methods) { + + def transform(relation: proto.MlRelation.Transform): DataFrame = { + // Create a copied model to avoid concurrently modify model params. 
+ val model = instance.asInstanceOf[Model[_]] + val copiedModel = model.copy(ParamMap.empty).asInstanceOf[Model[_]] + MLUtils.setInstanceParams(copiedModel, relation.getParams) + val inputDF = MLUtils.parseRelationProto(relation.getInput, sessionHolder) + copiedModel.transform(inputDF) + } +} + +private object AttributeHelper { + def parseMethods( + sessionHolder: SessionHolder, + methodsProto: Array[proto.Fetch.Method] = Array.empty): Array[Method] = { + methodsProto.map { m => + val (argValues, argClasses) = + deserializeMethodArguments(m.getArgsList.asScala.toArray, sessionHolder).unzip + Method(m.getMethod, argValues, argClasses) + } + } + def apply( + sessionHolder: SessionHolder, + objId: String, + methodsProto: Array[proto.Fetch.Method] = Array.empty): AttributeHelper = { + new AttributeHelper(sessionHolder, objId, parseMethods(sessionHolder, methodsProto)) + } +} + +private object ModelAttributeHelper { + def apply( + sessionHolder: SessionHolder, + objId: String, + methodsProto: Array[proto.Fetch.Method] = Array.empty): ModelAttributeHelper = { + new ModelAttributeHelper( + sessionHolder, + objId, + AttributeHelper.parseMethods(sessionHolder, methodsProto)) + } +} + +// MLHandler is a utility to group all ML operations +private[connect] object MLHandler extends Logging { + def handleMlCommand( + sessionHolder: SessionHolder, + mlCommand: proto.MlCommand): proto.MlCommandResult = { + + val mlCache = sessionHolder.mlCache + + mlCommand.getCommandCase match { + case proto.MlCommand.CommandCase.FIT => + val fitCmd = mlCommand.getFit + val estimatorProto = fitCmd.getEstimator + assert(estimatorProto.getType == proto.MlOperator.OperatorType.ESTIMATOR) + + val dataset = MLUtils.parseRelationProto(fitCmd.getDataset, sessionHolder) + val estimator = MLUtils.getEstimator(estimatorProto, Some(fitCmd.getParams)) + val model = estimator.fit(dataset).asInstanceOf[Model[_]] + val id = mlCache.register(model) + proto.MlCommandResult + .newBuilder() + .setOperatorInfo( + 
proto.MlCommandResult.MlOperatorInfo + .newBuilder() + .setObjRef(proto.ObjectRef.newBuilder().setId(id))) + .build() + + case proto.MlCommand.CommandCase.FETCH => + val helper = AttributeHelper( + sessionHolder, + mlCommand.getFetch.getObjRef.getId, + mlCommand.getFetch.getMethodsList.asScala.toArray) + val attrResult = helper.getAttribute + attrResult match { + case s: Summary => + val id = mlCache.register(s) + proto.MlCommandResult.newBuilder().setSummary(id).build() + case _ => + val param = Serializer.serializeParam(attrResult) + proto.MlCommandResult.newBuilder().setParam(param).build() + } + + case proto.MlCommand.CommandCase.DELETE => + val objId = mlCommand.getDelete.getObjRef.getId + var result = false + if (!objId.contains(".")) { + mlCache.remove(objId) + result = true + } + proto.MlCommandResult + .newBuilder() + .setParam( + proto.Param + .newBuilder() + .setLiteral(LiteralValueProtoConverter.toLiteralProto(result)) + .build()) + .build() + + case proto.MlCommand.CommandCase.WRITE => + mlCommand.getWrite.getTypeCase match { + case proto.MlCommand.Write.TypeCase.OBJ_REF => // save a model + val objId = mlCommand.getWrite.getObjRef.getId + val model = mlCache.get(objId).asInstanceOf[Model[_]] + val copiedModel = model.copy(ParamMap.empty).asInstanceOf[Model[_]] + MLUtils.setInstanceParams(copiedModel, mlCommand.getWrite.getParams) + + copiedModel match { + case m: MLWritable => MLUtils.write(m, mlCommand.getWrite) + case other => throw MlUnsupportedException(s"$other is not writable") + } + + // save an estimator/evaluator/transformer + case proto.MlCommand.Write.TypeCase.OPERATOR => + val writer = mlCommand.getWrite + if (writer.getOperator.getType == proto.MlOperator.OperatorType.ESTIMATOR) { + val estimator = MLUtils.getEstimator(writer.getOperator, Some(writer.getParams)) + estimator match { + case m: MLWritable => MLUtils.write(m, mlCommand.getWrite) + case other => throw MlUnsupportedException(s"Estimator $other is not writable") + } + } else { + 
throw MlUnsupportedException(s"${writer.getOperator.getName} not supported") + } + + case other => throw MlUnsupportedException(s"$other not supported") + } + proto.MlCommandResult.newBuilder().build() + + case proto.MlCommand.CommandCase.READ => + val operator = mlCommand.getRead.getOperator + val name = operator.getName + val path = mlCommand.getRead.getPath + + if (operator.getType == proto.MlOperator.OperatorType.MODEL) { + val model = MLUtils.load(name, path).asInstanceOf[Model[_]] + val id = mlCache.register(model) + proto.MlCommandResult + .newBuilder() + .setOperatorInfo( + proto.MlCommandResult.MlOperatorInfo + .newBuilder() + .setObjRef(proto.ObjectRef.newBuilder().setId(id)) + .setUid(model.uid) + .setParams(Serializer.serializeParams(model))) + .build() + + } else if (operator.getType == proto.MlOperator.OperatorType.ESTIMATOR) { + val estimator = MLUtils.load(name, path).asInstanceOf[Estimator[_]] + proto.MlCommandResult + .newBuilder() + .setOperatorInfo( + proto.MlCommandResult.MlOperatorInfo + .newBuilder() + .setName(name) + .setUid(estimator.uid) + .setParams(Serializer.serializeParams(estimator))) + .build() + } else { + throw MlUnsupportedException(s"${operator.getType} not supported") + } + + case other => throw MlUnsupportedException(s"$other not supported") + } + } + + def transformMLRelation(relation: proto.MlRelation, sessionHolder: SessionHolder): DataFrame = { + relation.getMlTypeCase match { + // Ml transform + case proto.MlRelation.MlTypeCase.TRANSFORM => + relation.getTransform.getOperatorCase match { + // transform for a new ML transformer + case proto.MlRelation.Transform.OperatorCase.TRANSFORMER => + val transformProto = relation.getTransform + assert( + transformProto.getTransformer.getType == + proto.MlOperator.OperatorType.TRANSFORMER) + val dataset = MLUtils.parseRelationProto(transformProto.getInput, sessionHolder) + val transformer = MLUtils.getTransformer(transformProto) + transformer.transform(dataset) + + // transform on a 
cached model + case proto.MlRelation.Transform.OperatorCase.OBJ_REF => + val helper = + ModelAttributeHelper( + sessionHolder, + relation.getTransform.getObjRef.getId, + Array.empty) + helper.transform(relation.getTransform) + + case other => throw new IllegalArgumentException(s"$other not supported") + } + + // Get the attribute from a cached object which could be a model or summary + case proto.MlRelation.MlTypeCase.FETCH => + val helper = AttributeHelper( + sessionHolder, + relation.getFetch.getObjRef.getId, + relation.getFetch.getMethodsList.asScala.toArray) + helper.getAttribute.asInstanceOf[DataFrame] + + case other => throw MlUnsupportedException(s"$other not supported") + } + } + +} diff --git a/sql/connect/server/src/main/scala/org/apache/spark/sql/connect/ml/MLUtils.scala b/sql/connect/server/src/main/scala/org/apache/spark/sql/connect/ml/MLUtils.scala new file mode 100644 index 0000000000000..72c86401eb631 --- /dev/null +++ b/sql/connect/server/src/main/scala/org/apache/spark/sql/connect/ml/MLUtils.scala @@ -0,0 +1,353 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.connect.ml + +import java.util.ServiceLoader + +import scala.collection.immutable.HashSet +import scala.jdk.CollectionConverters._ + +import org.apache.commons.lang3.reflect.MethodUtils.invokeMethod + +import org.apache.spark.connect.proto +import org.apache.spark.ml.{Estimator, Transformer} +import org.apache.spark.ml.linalg.{Matrices, Matrix, Vector, Vectors} +import org.apache.spark.ml.param.Params +import org.apache.spark.ml.util.MLWritable +import org.apache.spark.sql.{DataFrame, Dataset} +import org.apache.spark.sql.connect.common.LiteralValueProtoConverter +import org.apache.spark.sql.connect.planner.SparkConnectPlanner +import org.apache.spark.sql.connect.service.SessionHolder +import org.apache.spark.util.{SparkClassUtils, Utils} + +private[ml] object MLUtils { + + /** + * Load the registered ML operators via ServiceLoader + * + * @param mlCls + * the operator class + * @return + * a Map with name and class + */ + private def loadOperators(mlCls: Class[_]): Map[String, Class[_]] = { + val loader = Utils.getContextOrSparkClassLoader + val serviceLoader = ServiceLoader.load(mlCls, loader) + val providers = serviceLoader.asScala.toList + providers.map(est => est.getClass.getName -> est.getClass).toMap + } + + private lazy val estimators = loadOperators(classOf[Estimator[_]]) + + private lazy val transformers = loadOperators(classOf[Transformer]) + + def deserializeVector(vector: proto.Vector): Vector = { + if (vector.hasDense) { + val values = vector.getDense.getValueList.asScala.map(_.toDouble).toArray + Vectors.dense(values) + } else { + val size = vector.getSparse.getSize + val indices = vector.getSparse.getIndexList.asScala.map(_.toInt).toArray + val values = vector.getSparse.getValueList.asScala.map(_.toDouble).toArray + Vectors.sparse(size, indices, values) + } + } + + def deserializeMatrix(matrix: proto.Matrix): Matrix = { + if (matrix.hasDense) { + val values = 
matrix.getDense.getValueList.asScala.map(_.toDouble).toArray + Matrices.dense(matrix.getDense.getNumRows, matrix.getDense.getNumCols, values) + } else { + val sparse = matrix.getSparse + val colPtrs = sparse.getColptrList.asScala.map(_.toInt).toArray + val rowIndices = sparse.getRowIndexList.asScala.map(_.toInt).toArray + val values = sparse.getValueList.asScala.map(_.toDouble).toArray + Matrices.sparse(sparse.getNumRows, sparse.getNumCols, colPtrs, rowIndices, values) + } + } + + /** + * Set the parameters to the ML instance + * + * @param instance + * an ML operator + * @param params + * the parameters of the ML operator + */ + def setInstanceParams(instance: Params, params: proto.MlParams): Unit = { + params.getParamsMap.asScala.foreach { case (name, paramProto) => + val p = instance.getParam(name) + val value = if (paramProto.hasLiteral) { + reconcileParam( + p.paramValueClassTag.runtimeClass, + LiteralValueProtoConverter.toCatalystValue(paramProto.getLiteral)) + } else if (paramProto.hasVector) { + deserializeVector(paramProto.getVector) + } else if (paramProto.hasMatrix) { + deserializeMatrix(paramProto.getMatrix) + } else { + throw MlUnsupportedException(s"Unsupported parameter type for ${name}") + } + instance.set(p, value) + } + } + + /** + * Convert the array from Object[] to Array[_] + * @param elementType + * the element type of the array + * @param array + * to be reconciled + * @return + * the reconciled array + */ + private def reconcileArray(elementType: Class[_], array: Array[_]): Array[_] = { + if (elementType == classOf[Byte]) { + array.map(_.asInstanceOf[Byte]) + } else if (elementType == classOf[Short]) { + array.map(_.asInstanceOf[Short]) + } else if (elementType == classOf[Int]) { + array.map(_.asInstanceOf[Int]) + } else if (elementType == classOf[Long]) { + array.map(_.asInstanceOf[Long]) + } else if (elementType == classOf[Float]) { + array.map(_.asInstanceOf[Float]) + } else if (elementType == classOf[Double]) { + 
array.map(_.asInstanceOf[Double]) + } else if (elementType == classOf[String]) { + array.map(_.asInstanceOf[String]) + } else { + throw MlUnsupportedException( + s"array element type unsupported, " + + s"found ${elementType.getName}") + } + } + + /** + * Reconcile the parameter value given the provided parameter type. Currently, support + * byte/short/int/long/float/double/string and array. Note that, array of array is not supported + * yet. + */ + private def reconcileParam(paramType: Class[_], value: Any): Any = { + // Some cases the param type might be mismatched with the value type. + // Because in python side we only have int / float type for numeric params. + // e.g.: + // param type is Int but client sends a Long type. + // param type is Long but client sends a Int type. + // param type is Float but client sends a Double type. + // param type is Array[Int] but client sends a Array[Long] type. + // param type is Array[Float] but client sends a Array[Double] type. + // param type is Array[Array[Int]] but client sends a Array[Array[Long]] type. + // param type is Array[Array[Float]] but client sends a Array[Array[Double]] type. 
+ if (paramType == classOf[Byte]) { + value.asInstanceOf[java.lang.Number].byteValue() + } else if (paramType == classOf[Short]) { + value.asInstanceOf[java.lang.Number].shortValue() + } else if (paramType == classOf[Int]) { + value.asInstanceOf[java.lang.Number].intValue() + } else if (paramType == classOf[Long]) { + value.asInstanceOf[java.lang.Number].longValue() + } else if (paramType == classOf[Float]) { + value.asInstanceOf[java.lang.Number].floatValue() + } else if (paramType == classOf[Double]) { + value.asInstanceOf[java.lang.Number].doubleValue() + } else if (paramType == classOf[Boolean]) { + value.asInstanceOf[Boolean] + } else if (paramType == classOf[String]) { + value.asInstanceOf[String] + } else if (paramType.isArray) { + val compType = paramType.getComponentType + if (compType.isArray) { + throw MlUnsupportedException(s"Array of array unsupported") + } else { + val array = value.asInstanceOf[Array[_]].map { e => + reconcileParam(compType, e) + } + reconcileArray(compType, array) + } + } else { + throw MlUnsupportedException(s"Unsupported parameter type, found ${paramType.getName}") + } + } + + def parseRelationProto(relation: proto.Relation, sessionHolder: SessionHolder): DataFrame = { + val planner = new SparkConnectPlanner(sessionHolder) + val plan = planner.transformRelation(relation) + Dataset.ofRows(sessionHolder.session, plan) + } + + /** + * Get the instance according to the provided proto information. + * + * @param name + * The name of the instance (either estimator or transformer). + * @param uid + * The unique identifier for the instance. + * @param instanceMap + * A map of instance names to constructors. + * @param params + * Optional parameters for the instance. + * @tparam T + * The type of the instance (Estimator or Transformer). + * @return + * The instance of the requested type. + * @throws MlUnsupportedException + * If the instance is not supported. 
+ */ + private def getInstance[T]( + name: String, + uid: String, + instanceMap: Map[String, Class[_]], + params: Option[proto.MlParams]): T = { + if (instanceMap.isEmpty || !instanceMap.contains(name)) { + throw MlUnsupportedException(s"Unsupported ML operator, found $name") + } + + val instance = instanceMap(name) + .getConstructor(classOf[String]) + .newInstance(uid) + .asInstanceOf[T] + + // Set parameters for the instance if they are provided + params.foreach(p => MLUtils.setInstanceParams(instance.asInstanceOf[Params], p)) + instance + } + + /** + * Get the Estimator instance according to the proto information + * + * @param operator + * MlOperator information + * @param params + * The optional parameters of the estimator + * @return + * the estimator + */ + def getEstimator(operator: proto.MlOperator, params: Option[proto.MlParams]): Estimator[_] = { + val name = operator.getName + val uid = operator.getUid + getInstance[Estimator[_]](name, uid, estimators, params) + } + + /** + * Get the transformer instance according to the transform proto + * + * @param transformProto + * transform proto + * @return + * a transformer + */ + def getTransformer(transformProto: proto.MlRelation.Transform): Transformer = { + val name = transformProto.getTransformer.getName + val uid = transformProto.getTransformer.getUid + val params = transformProto.getParams + getInstance[Transformer](name, uid, transformers, Some(params)) + } + + /** + * Call "load: function on the ML operator given the operator name + * + * @param className + * the ML operator name + * @param path + * the path to be loaded + * @return + * the ML instance + */ + def load(className: String, path: String): Object = { + val loadedMethod = SparkClassUtils.classForName(className).getMethod("load", classOf[String]) + loadedMethod.invoke(null, path) + } + + // Since we're using reflection way to get the attribute, in order not to + // leave a security hole, we define an allowed attribute list that can be 
accessed. + // The attributes could be retrieved from the corresponding python class + private lazy val ALLOWED_ATTRIBUTES = HashSet( + "toString", + "numFeatures", + "predict", // PredictionModel + "numClasses", + "predictRaw", // ClassificationModel + "predictProbability", // ProbabilisticClassificationModel + "coefficients", + "intercept", + "coefficientMatrix", + "interceptVector", // LogisticRegressionModel + "summary", + "hasSummary", + "evaluate", // LogisticRegressionModel + "predictions", + "predictionCol", + "labelCol", + "weightCol", + "labels", // _ClassificationSummary + "truePositiveRateByLabel", + "falsePositiveRateByLabel", // _ClassificationSummary + "precisionByLabel", + "recallByLabel", + "fMeasureByLabel", + "accuracy", // _ClassificationSummary + "weightedTruePositiveRate", + "weightedFalsePositiveRate", // _ClassificationSummary + "weightedRecall", + "weightedPrecision", + "weightedFMeasure", // _ClassificationSummary + "scoreCol", + "roc", + "areaUnderROC", + "pr", + "fMeasureByThreshold", // _BinaryClassificationSummary + "precisionByThreshold", + "recallByThreshold", // _BinaryClassificationSummary + "probabilityCol", + "featuresCol", // LogisticRegressionSummary + "objectiveHistory", + "totalIterations" // _TrainingSummary + ) + + def invokeMethodAllowed(obj: Object, methodName: String): Object = { + if (!ALLOWED_ATTRIBUTES.contains(methodName)) { + throw MLAttributeNotAllowedException(methodName) + } + invokeMethod(obj, methodName) + } + + def invokeMethodAllowed( + obj: Object, + methodName: String, + args: Array[Object], + parameterTypes: Array[Class[_]]): Object = { + if (!ALLOWED_ATTRIBUTES.contains(methodName)) { + throw MLAttributeNotAllowedException(methodName) + } + invokeMethod(obj, methodName, args, parameterTypes) + } + + def write(instance: MLWritable, writeProto: proto.MlCommand.Write): Unit = { + val writer = if (writeProto.getShouldOverwrite) { + instance.write.overwrite() + } else { + instance.write + } + val path = 
writeProto.getPath + val options = writeProto.getOptionsMap + options.forEach((k, v) => writer.option(k, v)) + writer.save(path) + } + +} diff --git a/sql/connect/server/src/main/scala/org/apache/spark/sql/connect/ml/Serializer.scala b/sql/connect/server/src/main/scala/org/apache/spark/sql/connect/ml/Serializer.scala new file mode 100644 index 0000000000000..ad6735997f834 --- /dev/null +++ b/sql/connect/server/src/main/scala/org/apache/spark/sql/connect/ml/Serializer.scala @@ -0,0 +1,151 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.connect.ml + +import org.apache.spark.connect.proto +import org.apache.spark.ml.linalg.{DenseMatrix, DenseVector, SparseMatrix, SparseVector} +import org.apache.spark.ml.param.Params +import org.apache.spark.sql.Dataset +import org.apache.spark.sql.connect.common.LiteralValueProtoConverter +import org.apache.spark.sql.connect.service.SessionHolder + +private[ml] object Serializer { + + /** + * Serialize the ML parameters, currently support Vector/Matrix and literals + * @param data + * the value of parameter + * @return + * proto.Param + */ + def serializeParam(data: Any): proto.Param = { + data match { + case v: DenseVector => + val denseBuilder = proto.Vector.Dense.newBuilder() + v.values.foreach(denseBuilder.addValue) + proto.Param + .newBuilder() + .setVector(proto.Vector.newBuilder().setDense(denseBuilder)) + .build() + case v: SparseVector => + val sparseBuilder = proto.Vector.Sparse.newBuilder().setSize(v.size) + v.indices.foreach(sparseBuilder.addIndex) + v.values.foreach(sparseBuilder.addValue) + proto.Param + .newBuilder() + .setVector(proto.Vector.newBuilder().setSparse(sparseBuilder)) + .build() + case v: DenseMatrix => + val denseBuilder = proto.Matrix.Dense.newBuilder() + v.values.foreach(denseBuilder.addValue) + denseBuilder.setNumCols(v.numCols) + denseBuilder.setNumRows(v.numRows) + denseBuilder.setIsTransposed(v.isTransposed) + proto.Param + .newBuilder() + .setMatrix(proto.Matrix.newBuilder().setDense(denseBuilder)) + .build() + case v: SparseMatrix => + val sparseBuilder = proto.Matrix.Sparse + .newBuilder() + .setNumCols(v.numCols) + .setNumRows(v.numRows) + v.values.foreach(sparseBuilder.addValue) + v.colPtrs.foreach(sparseBuilder.addColptr) + v.rowIndices.foreach(sparseBuilder.addRowIndex) + proto.Param + .newBuilder() + .setMatrix(proto.Matrix.newBuilder().setSparse(sparseBuilder)) + .build() + case _: Byte | _: Short | _: Int | _: Long | _: Float | _: Double | _: Boolean | _: String | + _: Array[_] => + 
proto.Param + .newBuilder() + .setLiteral(LiteralValueProtoConverter.toLiteralProto(data)) + .build() + + case other => throw MlUnsupportedException(s"$other not supported") + } + } + + def deserializeMethodArguments( + args: Array[proto.Fetch.Method.Args], + sessionHolder: SessionHolder): Array[(Object, Class[_])] = { + args.map { arg => + if (arg.hasParam) { + val param = arg.getParam + if (param.hasLiteral) { + param.getLiteral.getLiteralTypeCase match { + case proto.Expression.Literal.LiteralTypeCase.INTEGER => + (param.getLiteral.getInteger.asInstanceOf[Object], classOf[Int]) + case proto.Expression.Literal.LiteralTypeCase.FLOAT => + (param.getLiteral.getFloat.toDouble.asInstanceOf[Object], classOf[Double]) + case proto.Expression.Literal.LiteralTypeCase.STRING => + (param.getLiteral.getString, classOf[String]) + case proto.Expression.Literal.LiteralTypeCase.DOUBLE => + (param.getLiteral.getDouble.asInstanceOf[Object], classOf[Double]) + case proto.Expression.Literal.LiteralTypeCase.BOOLEAN => + (param.getLiteral.getBoolean.asInstanceOf[Object], classOf[Boolean]) + case other => + throw MlUnsupportedException(s"$other not supported") + } + } else if (param.hasVector) { + val vector = MLUtils.deserializeVector(param.getVector) + val vectorType = if (param.getVector.hasDense) { + classOf[DenseVector] + } else { + classOf[SparseVector] + } + (vector, vectorType) + } else if (param.hasMatrix) { + val matrix = MLUtils.deserializeMatrix(param.getMatrix) + val matrixType = if (param.getMatrix.hasDense) { + classOf[DenseMatrix] + } else { + classOf[SparseMatrix] + } + (matrix, matrixType) + } else { + throw MlUnsupportedException(s"$param not supported") + } + } else if (arg.hasInput) { + (MLUtils.parseRelationProto(arg.getInput, sessionHolder), classOf[Dataset[_]]) + } else { + throw MlUnsupportedException(s"$arg not supported") + } + } + } + + /** + * Serialize an instance of "Params" which could be estimator/model/evaluator ... 
+ * @param instance + * of Params + * @return + * proto.MlParams + */ + def serializeParams(instance: Params): proto.MlParams = { + val builder = proto.MlParams.newBuilder() + instance.params.foreach { param => + if (instance.isSet(param)) { + val v = serializeParam(instance.get(param).get) + builder.putParams(param.name, v) + } + } + builder.build() + } +} diff --git a/sql/connect/server/src/main/scala/org/apache/spark/sql/connect/planner/SparkConnectPlanner.scala b/sql/connect/server/src/main/scala/org/apache/spark/sql/connect/planner/SparkConnectPlanner.scala index 979fd83612e7b..94a1ab1618086 100644 --- a/sql/connect/server/src/main/scala/org/apache/spark/sql/connect/planner/SparkConnectPlanner.scala +++ b/sql/connect/server/src/main/scala/org/apache/spark/sql/connect/planner/SparkConnectPlanner.scala @@ -43,9 +43,9 @@ import org.apache.spark.connect.proto.WriteStreamOperationStart.TriggerCase import org.apache.spark.internal.{Logging, LogKeys, MDC} import org.apache.spark.internal.LogKeys.{DATAFRAME_ID, SESSION_ID} import org.apache.spark.resource.{ExecutorResourceRequest, ResourceProfile, TaskResourceProfile, TaskResourceRequest} -import org.apache.spark.sql.{Dataset, Encoders, ForeachWriter, Observation, RelationalGroupedDataset, Row, SparkSession} +import org.apache.spark.sql.{Column, Dataset, Encoders, ForeachWriter, Observation, RelationalGroupedDataset, Row, SparkSession} import org.apache.spark.sql.catalyst.{expressions, AliasIdentifier, FunctionIdentifier, QueryPlanningTracker} -import org.apache.spark.sql.catalyst.analysis.{FunctionRegistry, GlobalTempView, LocalTempView, MultiAlias, NameParameterizedQuery, PosParameterizedQuery, UnresolvedAlias, UnresolvedAttribute, UnresolvedDataFrameStar, UnresolvedDeserializer, UnresolvedExtractValue, UnresolvedFunction, UnresolvedRegex, UnresolvedRelation, UnresolvedStar, UnresolvedTableValuedFunction, UnresolvedTranspose} +import org.apache.spark.sql.catalyst.analysis.{FunctionRegistry, GlobalTempView, 
LazyExpression, LocalTempView, MultiAlias, NameParameterizedQuery, PosParameterizedQuery, UnresolvedAlias, UnresolvedAttribute, UnresolvedDataFrameStar, UnresolvedDeserializer, UnresolvedExtractValue, UnresolvedFunction, UnresolvedPlanId, UnresolvedRegex, UnresolvedRelation, UnresolvedStar, UnresolvedStarWithColumns, UnresolvedStarWithColumnsRenames, UnresolvedSubqueryColumnAliases, UnresolvedTableValuedFunction, UnresolvedTranspose} import org.apache.spark.sql.catalyst.encoders.{encoderFor, AgnosticEncoder, ExpressionEncoder, RowEncoder} import org.apache.spark.sql.catalyst.encoders.AgnosticEncoders.UnboundRowEncoder import org.apache.spark.sql.catalyst.expressions._ @@ -55,11 +55,13 @@ import org.apache.spark.sql.catalyst.plans.{Cross, FullOuter, Inner, JoinType, L import org.apache.spark.sql.catalyst.plans.logical import org.apache.spark.sql.catalyst.plans.logical.{AppendColumns, Assignment, CoGroup, CollectMetrics, CommandResult, Deduplicate, DeduplicateWithinWatermark, DeleteAction, DeserializeToObject, Except, FlatMapGroupsWithState, InsertAction, InsertStarAction, Intersect, JoinWith, LocalRelation, LogicalGroupState, LogicalPlan, MapGroups, MapPartitions, MergeAction, Project, Sample, SerializeFromObject, Sort, SubqueryAlias, TypedFilter, Union, Unpivot, UnresolvedHint, UpdateAction, UpdateStarAction} import org.apache.spark.sql.catalyst.streaming.InternalOutputModes -import org.apache.spark.sql.catalyst.trees.CurrentOrigin +import org.apache.spark.sql.catalyst.trees.{CurrentOrigin, TreePattern} import org.apache.spark.sql.catalyst.types.DataTypeUtils import org.apache.spark.sql.catalyst.util.{CaseInsensitiveMap, CharVarcharUtils} -import org.apache.spark.sql.connect.common.{DataTypeProtoConverter, ForeachWriterPacket, InvalidPlanInput, LiteralValueProtoConverter, StorageLevelProtoConverter, StreamingListenerPacket, UdfPacket} +import org.apache.spark.sql.classic.ClassicConversions._ +import org.apache.spark.sql.connect.common.{DataTypeProtoConverter, 
ForeachWriterPacket, InvalidCommandInput, InvalidPlanInput, LiteralValueProtoConverter, StorageLevelProtoConverter, StreamingListenerPacket, UdfPacket} import org.apache.spark.sql.connect.config.Connect.CONNECT_GRPC_ARROW_MAX_BATCH_SIZE +import org.apache.spark.sql.connect.ml.MLHandler import org.apache.spark.sql.connect.plugin.SparkConnectPluginRegistry import org.apache.spark.sql.connect.service.{ExecuteHolder, SessionHolder, SparkConnectService} import org.apache.spark.sql.connect.utils.MetricGenerator @@ -76,8 +78,7 @@ import org.apache.spark.sql.execution.stat.StatFunctions import org.apache.spark.sql.execution.streaming.GroupStateImpl.groupStateTimeoutFromString import org.apache.spark.sql.execution.streaming.StreamingQueryWrapper import org.apache.spark.sql.expressions.{Aggregator, ReduceAggregator, SparkUserDefinedFunction, UserDefinedAggregator, UserDefinedFunction} -import org.apache.spark.sql.internal.{CatalogImpl, MergeIntoWriterImpl, TypedAggUtils} -import org.apache.spark.sql.internal.ExpressionUtils.column +import org.apache.spark.sql.internal.{CatalogImpl, MergeIntoWriterImpl, TypedAggUtils, UserDefinedFunctionUtils} import org.apache.spark.sql.streaming.{GroupStateTimeout, OutputMode, StreamingQuery, StreamingQueryListener, StreamingQueryProgress, Trigger} import org.apache.spark.sql.types._ import org.apache.spark.sql.util.CaseInsensitiveStringMap @@ -85,11 +86,6 @@ import org.apache.spark.storage.CacheId import org.apache.spark.util.ArrayImplicits._ import org.apache.spark.util.Utils -final case class InvalidCommandInput( - private val message: String = "", - private val cause: Throwable = null) - extends Exception(message, cause) - class SparkConnectPlanner( val sessionHolder: SessionHolder, val executeHolderOpt: Option[ExecuteHolder] = None) @@ -106,7 +102,7 @@ class SparkConnectPlanner( @Since("4.0.0") @DeveloperApi def session: SparkSession = sessionHolder.session - import sessionHolder.session.RichColumn + import 
sessionHolder.session.toRichColumn private[connect] def parser = session.sessionState.sqlParser @@ -159,15 +155,15 @@ class SparkConnectPlanner( case proto.Relation.RelTypeCase.TAIL => transformTail(rel.getTail) case proto.Relation.RelTypeCase.JOIN => transformJoinOrJoinWith(rel.getJoin) case proto.Relation.RelTypeCase.AS_OF_JOIN => transformAsOfJoin(rel.getAsOfJoin) + case proto.Relation.RelTypeCase.LATERAL_JOIN => transformLateralJoin(rel.getLateralJoin) case proto.Relation.RelTypeCase.DEDUPLICATE => transformDeduplicate(rel.getDeduplicate) case proto.Relation.RelTypeCase.SET_OP => transformSetOperation(rel.getSetOp) case proto.Relation.RelTypeCase.SORT => transformSort(rel.getSort) case proto.Relation.RelTypeCase.DROP => transformDrop(rel.getDrop) case proto.Relation.RelTypeCase.AGGREGATE => transformAggregate(rel.getAggregate) case proto.Relation.RelTypeCase.SQL => transformSql(rel.getSql) - case proto.Relation.RelTypeCase.WITH_RELATIONS - if isValidSQLWithRefs(rel.getWithRelations) => - transformSqlWithRefs(rel.getWithRelations) + case proto.Relation.RelTypeCase.WITH_RELATIONS => + transformWithRelations(rel.getWithRelations) case proto.Relation.RelTypeCase.LOCAL_RELATION => transformLocalRelation(rel.getLocalRelation) case proto.Relation.RelTypeCase.SAMPLE => transformSample(rel.getSample) @@ -228,6 +224,10 @@ class SparkConnectPlanner( // Catalog API (internal-only) case proto.Relation.RelTypeCase.CATALOG => transformCatalog(rel.getCatalog) + // ML Relation + case proto.Relation.RelTypeCase.ML_RELATION => + MLHandler.transformMLRelation(rel.getMlRelation, sessionHolder).logicalPlan + // Handle plugins for Spark Connect Relation types. 
case proto.Relation.RelTypeCase.EXTENSION => transformRelationPlugin(rel.getExtension) @@ -554,7 +554,7 @@ class SparkConnectPlanner( .ofRows(session, transformRelation(rel.getInput)) .stat .sampleBy( - col = column(transformExpression(rel.getCol)), + col = Column(transformExpression(rel.getCol)), fractions = fractions.toMap, seed = if (rel.hasSeed) rel.getSeed else Utils.random.nextLong) .logicalPlan @@ -562,7 +562,7 @@ class SparkConnectPlanner( private def transformToSchema(rel: proto.ToSchema): LogicalPlan = { val schema = transformDataType(rel.getSchema) - assert(schema.isInstanceOf[StructType]) + assertPlan(schema.isInstanceOf[StructType]) Dataset .ofRows(session, transformRelation(rel.getInput)) @@ -571,10 +571,9 @@ class SparkConnectPlanner( } private def transformToDF(rel: proto.ToDF): LogicalPlan = { - Dataset - .ofRows(session, transformRelation(rel.getInput)) - .toDF(rel.getColumnNamesList.asScala.toSeq: _*) - .logicalPlan + UnresolvedSubqueryColumnAliases( + rel.getColumnNamesList.asScala.toSeq, + transformRelation(rel.getInput)) } private def transformMapPartitions(rel: proto.MapPartitions): LogicalPlan = { @@ -646,17 +645,17 @@ class SparkConnectPlanner( val pythonUdf = transformPythonUDF(commonUdf) val cols = rel.getGroupingExpressionsList.asScala.toSeq.map(expr => - column(transformExpression(expr))) + Column(transformExpression(expr))) val group = Dataset .ofRows(session, transformRelation(rel.getInput)) .groupBy(cols: _*) pythonUdf.evalType match { case PythonEvalType.SQL_GROUPED_MAP_PANDAS_UDF => - group.flatMapGroupsInPandas(column(pythonUdf)).logicalPlan + group.flatMapGroupsInPandas(Column(pythonUdf)).logicalPlan case PythonEvalType.SQL_GROUPED_MAP_ARROW_UDF => - group.flatMapGroupsInArrow(column(pythonUdf)).logicalPlan + group.flatMapGroupsInArrow(Column(pythonUdf)).logicalPlan case _ => throw InvalidPlanInput( @@ -672,7 +671,8 @@ class SparkConnectPlanner( private def transformTypedGroupMap( rel: proto.GroupMap, commonUdf: 
proto.CommonInlineUserDefinedFunction): LogicalPlan = { - val udf = TypedScalaUdf(commonUdf) + val unpackedUdf = unpackUdf(commonUdf) + val udf = TypedScalaUdf(unpackedUdf, None) val ds = UntypedKeyValueGroupedDataset( rel.getInput, rel.getGroupingExpressionsList, @@ -702,6 +702,18 @@ class SparkConnectPlanner( InternalOutputModes(rel.getOutputMode) } + val stateSchema = DataTypeProtoConverter.toCatalystType(rel.getStateSchema) match { + case s: StructType => s + case other => + throw InvalidPlanInput( + s"Invalid state schema dataType $other for flatMapGroupsWithState") + } + val stateEncoder = TypedScalaUdf.encoderFor( + // the state agnostic encoder is the second element in the input encoders. + unpackedUdf.inputEncoders.tail.head, + "state", + Some(DataTypeUtils.toAttributes(stateSchema))) + val flatMapGroupsWithState = if (hasInitialState) { new FlatMapGroupsWithState( udf.function @@ -711,7 +723,7 @@ class SparkConnectPlanner( ds.groupingAttributes, ds.dataAttributes, udf.outputObjAttr, - initialDs.vEncoder.asInstanceOf[ExpressionEncoder[Any]], + stateEncoder.asInstanceOf[ExpressionEncoder[Any]], outputMode, rel.getIsMapGroupsWithState, timeoutConf, @@ -730,7 +742,7 @@ class SparkConnectPlanner( ds.groupingAttributes, ds.dataAttributes, udf.outputObjAttr, - initialDs.vEncoder.asInstanceOf[ExpressionEncoder[Any]], + stateEncoder.asInstanceOf[ExpressionEncoder[Any]], outputMode, rel.getIsMapGroupsWithState, timeoutConf, @@ -765,10 +777,10 @@ class SparkConnectPlanner( case proto.CommonInlineUserDefinedFunction.FunctionCase.PYTHON_UDF => val inputCols = rel.getInputGroupingExpressionsList.asScala.toSeq.map(expr => - column(transformExpression(expr))) + Column(transformExpression(expr))) val otherCols = rel.getOtherGroupingExpressionsList.asScala.toSeq.map(expr => - column(transformExpression(expr))) + Column(transformExpression(expr))) val input = Dataset .ofRows(session, transformRelation(rel.getInput)) @@ -783,10 +795,10 @@ class SparkConnectPlanner( 
pythonUdf.evalType match { case PythonEvalType.SQL_COGROUPED_MAP_PANDAS_UDF => - input.flatMapCoGroupsInPandas(other, pythonUdf).logicalPlan + input.flatMapCoGroupsInPandas(other, Column(pythonUdf)).logicalPlan case PythonEvalType.SQL_COGROUPED_MAP_ARROW_UDF => - input.flatMapCoGroupsInArrow(other, pythonUdf).logicalPlan + input.flatMapCoGroupsInArrow(other, Column(pythonUdf)).logicalPlan case _ => throw InvalidPlanInput( @@ -838,9 +850,10 @@ class SparkConnectPlanner( kEncoder: ExpressionEncoder[_], vEncoder: ExpressionEncoder[_], analyzed: LogicalPlan, - dataAttributes: Seq[Attribute], + analyzedData: LogicalPlan, groupingAttributes: Seq[Attribute], sortOrder: Seq[SortOrder]) { + val dataAttributes: Seq[Attribute] = analyzedData.output val valueDeserializer: Expression = UnresolvedDeserializer(vEncoder.deserializer, dataAttributes) } @@ -880,18 +893,20 @@ class SparkConnectPlanner( logicalPlan: LogicalPlan, groupingExprs: java.util.List[proto.Expression], sortOrder: Seq[SortOrder]): UntypedKeyValueGroupedDataset = { - assert(groupingExprs.size() >= 1) + val analyzed = session.sessionState.executePlan(logicalPlan).analyzed + + assertPlan(groupingExprs.size() >= 1) val dummyFunc = TypedScalaUdf(groupingExprs.get(0), None) val groupExprs = groupingExprs.asScala.toSeq.drop(1).map(expr => transformExpression(expr)) val (qe, aliasedGroupings) = - RelationalGroupedDataset.handleGroupingExpression(logicalPlan, session, groupExprs) + RelationalGroupedDataset.handleGroupingExpression(analyzed, session, groupExprs) UntypedKeyValueGroupedDataset( dummyFunc.outEnc, dummyFunc.inEnc, qe.analyzed, - logicalPlan.output, + analyzed, aliasedGroupings, sortOrder) } @@ -900,20 +915,22 @@ class SparkConnectPlanner( logicalPlan: LogicalPlan, groupingExprs: java.util.List[proto.Expression], sortOrder: Seq[SortOrder]): UntypedKeyValueGroupedDataset = { - assert(groupingExprs.size() == 1) - val groupFunc = TypedScalaUdf(groupingExprs.get(0), Some(logicalPlan.output)) + val analyzed = 
session.sessionState.executePlan(logicalPlan).analyzed + + assertPlan(groupingExprs.size() == 1) + val groupFunc = TypedScalaUdf(groupingExprs.get(0), Some(analyzed.output)) val vEnc = groupFunc.inEnc val kEnc = groupFunc.outEnc - val withGroupingKey = AppendColumns(groupFunc.function, vEnc, kEnc, logicalPlan) + val withGroupingKey = AppendColumns(groupFunc.function, vEnc, kEnc, analyzed) // The input logical plan of KeyValueGroupedDataset need to be executed and analyzed - val analyzed = session.sessionState.executePlan(withGroupingKey).analyzed + val withGroupingKeyAnalyzed = session.sessionState.executePlan(withGroupingKey).analyzed UntypedKeyValueGroupedDataset( kEnc, vEnc, + withGroupingKeyAnalyzed, analyzed, - logicalPlan.output, withGroupingKey.newColumns, sortOrder) } @@ -948,19 +965,23 @@ class SparkConnectPlanner( } } - def apply( - commonUdf: proto.CommonInlineUserDefinedFunction, - inputAttrs: Option[Seq[Attribute]] = None): TypedScalaUdf = { - val udf = unpackUdf(commonUdf) + def apply(udf: UdfPacket, inputAttrs: Option[Seq[Attribute]]): TypedScalaUdf = { // There might be more than one inputs, but we only interested in the first one. // Most typed API takes one UDF input. // For the few that takes more than one inputs, e.g. grouping function mapping UDFs, // the first input which is the key of the grouping function. 
- assert(udf.inputEncoders.nonEmpty) + assertPlan(udf.inputEncoders.nonEmpty) val inEnc = udf.inputEncoders.head // single input encoder or key encoder TypedScalaUdf(udf.function, udf.outputEncoder, inEnc, inputAttrs) } + def apply( + commonUdf: proto.CommonInlineUserDefinedFunction, + inputAttrs: Option[Seq[Attribute]] = None): TypedScalaUdf = { + val udf = unpackUdf(commonUdf) + apply(udf, inputAttrs) + } + def encoderFor( encoder: AgnosticEncoder[_], errorType: String, @@ -982,7 +1003,7 @@ class SparkConnectPlanner( private def transformApplyInPandasWithState(rel: proto.ApplyInPandasWithState): LogicalPlan = { val pythonUdf = transformPythonUDF(rel.getFunc) val cols = - rel.getGroupingExpressionsList.asScala.toSeq.map(expr => column(transformExpression(expr))) + rel.getGroupingExpressionsList.asScala.toSeq.map(expr => Column(transformExpression(expr))) val outputSchema = parseSchema(rel.getOutputSchema) @@ -992,7 +1013,7 @@ class SparkConnectPlanner( .ofRows(session, transformRelation(rel.getInput)) .groupBy(cols: _*) .applyInPandasWithState( - column(pythonUdf), + Column(pythonUdf), outputSchema, stateSchema, rel.getOutputMode, @@ -1049,25 +1070,21 @@ class SparkConnectPlanner( } private def transformWithColumnsRenamed(rel: proto.WithColumnsRenamed): LogicalPlan = { - if (rel.getRenamesCount > 0) { - val (colNames, newColNames) = rel.getRenamesList.asScala.toSeq.map { rename => + val (colNames, newColNames) = if (rel.getRenamesCount > 0) { + rel.getRenamesList.asScala.toSeq.map { rename => (rename.getColName, rename.getNewColName) }.unzip - Dataset - .ofRows(session, transformRelation(rel.getInput)) - .withColumnsRenamed(colNames, newColNames) - .logicalPlan } else { // for backward compatibility - Dataset - .ofRows(session, transformRelation(rel.getInput)) - .withColumnsRenamed(rel.getRenameColumnsMapMap) - .logicalPlan + rel.getRenameColumnsMapMap.asScala.toSeq.unzip } + Project( + Seq(UnresolvedStarWithColumnsRenames(existingNames = colNames, newNames = 
newColNames)), + transformRelation(rel.getInput)) } private def transformWithColumns(rel: proto.WithColumns): LogicalPlan = { - val (colNames, cols, metadata) = + val (colNames, exprs, metadata) = rel.getAliasesList.asScala.toSeq.map { alias => if (alias.getNameCount != 1) { throw InvalidPlanInput(s"""WithColumns require column name only contains one name part, @@ -1080,13 +1097,16 @@ class SparkConnectPlanner( Metadata.empty } - (alias.getName(0), column(transformExpression(alias.getExpr)), metadata) + (alias.getName(0), transformExpression(alias.getExpr), metadata) }.unzip3 - Dataset - .ofRows(session, transformRelation(rel.getInput)) - .withColumns(colNames, cols, metadata) - .logicalPlan + Project( + Seq( + UnresolvedStarWithColumns( + colNames = colNames, + exprs = exprs, + explicitMetadata = Some(metadata))), + transformRelation(rel.getInput)) } private def transformWithWatermark(rel: proto.WithWatermark): LogicalPlan = { @@ -1142,7 +1162,7 @@ class SparkConnectPlanner( private def transformUnpivot(rel: proto.Unpivot): LogicalPlan = { val ids = rel.getIdsList.asScala.toArray.map { expr => - column(transformExpression(expr)) + Column(transformExpression(expr)) } if (!rel.hasValues) { @@ -1155,7 +1175,7 @@ class SparkConnectPlanner( transformRelation(rel.getInput)) } else { val values = rel.getValues.getValuesList.asScala.toArray.map { expr => - column(transformExpression(expr)) + Column(transformExpression(expr)) } Unpivot( @@ -1184,20 +1204,20 @@ class SparkConnectPlanner( private def transformCollectMetrics(rel: proto.CollectMetrics, planId: Long): LogicalPlan = { val metrics = rel.getMetricsList.asScala.toSeq.map { expr => - column(transformExpression(expr)) + Column(transformExpression(expr)) } val name = rel.getName val input = transformRelation(rel.getInput) if (input.isStreaming || executeHolderOpt.isEmpty) { - CollectMetrics(name, metrics.map(_.named), transformRelation(rel.getInput), planId) + CollectMetrics(name, metrics.map(_.named), input, planId) 
} else { // TODO this might be too complex for no good reason. It might // be easier to inspect the plan after it completes. val observation = Observation(name) session.observationManager.register(observation, planId) executeHolderOpt.get.addObservation(name, observation) - CollectMetrics(name, metrics.map(_.named), transformRelation(rel.getInput), planId) + CollectMetrics(name, metrics.map(_.named), input, planId) } } @@ -1435,7 +1455,7 @@ class SparkConnectPlanner( } private def transformFilter(rel: proto.Filter): LogicalPlan = { - assert(rel.hasInput) + assertPlan(rel.hasInput) val baseRel = transformRelation(rel.getInput) val cond = rel.getCondition if (isTypedScalaUdfExpr(cond)) { @@ -1462,8 +1482,9 @@ class SparkConnectPlanner( private def transformTypedFilter( fun: proto.CommonInlineUserDefinedFunction, child: LogicalPlan): TypedFilter = { - val udf = TypedScalaUdf(fun, Some(child.output)) - TypedFilter(udf.function, child)(udf.inEnc) + val analyzed = session.sessionState.executePlan(child).analyzed + val udf = TypedScalaUdf(fun, Some(analyzed.output)) + TypedFilter(udf.function, analyzed)(udf.inEnc) } private def transformProject(rel: proto.Project): LogicalPlan = { @@ -1473,11 +1494,19 @@ class SparkConnectPlanner( logical.OneRowRelation() } + val logicalPlan = + if (rel.getExpressionsList.asScala.toSeq.exists( + _.getExprTypeCase == proto.Expression.ExprTypeCase.TYPED_AGGREGATE_EXPRESSION)) { + session.sessionState.executePlan(baseRel).analyzed + } else { + baseRel + } + val projection = rel.getExpressionsList.asScala.toSeq - .map(transformExpression(_, Some(baseRel))) + .map(transformExpression(_, Some(logicalPlan))) .map(toNamedExpression) - logical.Project(projectList = projection, child = baseRel) + logical.Project(projectList = projection, child = logicalPlan) } /** @@ -1561,6 +1590,10 @@ class SparkConnectPlanner( transformMergeAction(exp.getMergeAction) case proto.Expression.ExprTypeCase.TYPED_AGGREGATE_EXPRESSION => 
transformTypedAggregateExpression(exp.getTypedAggregateExpression, baseRelationOpt) + case proto.Expression.ExprTypeCase.LAZY_EXPRESSION => + transformLazyExpression(exp.getLazyExpression) + case proto.Expression.ExprTypeCase.SUBQUERY_EXPRESSION => + transformSubqueryExpression(exp.getSubqueryExpression) case _ => throw InvalidPlanInput( s"Expression with ID: ${exp.getExprTypeCase.getNumber} is not supported") @@ -1634,14 +1667,18 @@ class SparkConnectPlanner( fun.getArgumentsList.asScala.map(transformExpression).toSeq, isDistinct = fun.getIsDistinct) } else { - // Spark Connect historically used the global namespace to lookup a couple of internal - // functions (e.g. product, collect_top_k, unwrap_udt, ...). In Spark 4 we moved these - // functions to a dedicated namespace, however in order to stay backwards compatible we still - // need to allow connect to use the global namespace. Here we check if a function is - // registered in the internal function registry, and we reroute the lookup to the internal - // registry. val name = fun.getFunctionName - val internal = FunctionRegistry.internal.functionExists(FunctionIdentifier(name)) + val internal = if (fun.hasIsInternal) { + fun.getIsInternal + } else { + // Spark Connect historically used the global namespace to look up a couple of internal + // functions (e.g. product, collect_top_k, unwrap_udt, ...). In Spark 4 we moved these + // functions to a dedicated namespace, however in order to stay backwards compatible we + // still need to allow Connect to use the global namespace. Here we check if a function is + // registered in the internal function registry, and we reroute the lookup to the internal + // registry. 
+ FunctionRegistry.internal.functionExists(FunctionIdentifier(name)) + } UnresolvedFunction( name :: Nil, fun.getArgumentsList.asScala.map(transformExpression).toSeq, @@ -1723,40 +1760,42 @@ class SparkConnectPlanner( } /** - * Translates a Scala user-defined function from proto to the Catalyst expression. + * Translates a Scala user-defined function or aggregator from proto to the corresponding + * Catalyst expression. * * @param fun - * Proto representation of the Scala user-defined function. + * Proto representation of the Scala user-defined function or aggregator. * @return - * ScalaUDF. + * An expression, either a ScalaUDF or a ScalaAggregator. */ private def transformScalaUDF(fun: proto.CommonInlineUserDefinedFunction): Expression = { - val udf = fun.getScalarScalaUdf - val udfPacket = unpackUdf(fun) - if (udf.getAggregate) { - ScalaAggregator( - transformScalaFunction(fun).asInstanceOf[UserDefinedAggregator[Any, Any, Any]], - fun.getArgumentsList.asScala.map(transformExpression).toSeq) - .toAggregateExpression() - } else { - ScalaUDF( - function = udfPacket.function, - dataType = transformDataType(udf.getOutputType), - children = fun.getArgumentsList.asScala.map(transformExpression).toSeq, - inputEncoders = udfPacket.inputEncoders.map(e => Try(ExpressionEncoder(e)).toOption), - outputEncoder = Option(ExpressionEncoder(udfPacket.outputEncoder)), - udfName = Option(fun.getFunctionName), - nullable = udf.getNullable, - udfDeterministic = fun.getDeterministic) + val children = fun.getArgumentsList.asScala.map(transformExpression).toSeq + transformScalaFunction(fun) match { + case udf: SparkUserDefinedFunction => + UserDefinedFunctionUtils.toScalaUDF(udf, children) + case uda: UserDefinedAggregator[_, _, _] => + ScalaAggregator(uda, children).toAggregateExpression() + case other => + throw InvalidPlanInput( + s"Unsupported UserDefinedFunction implementation: ${other.getClass}") } } + /** + * Translates a Scala user-defined function or aggregator. 
from proto to a UserDefinedFunction. + * + * @param fun + * Proto representation of the Scala user-defined function or aggregator. + * @return + * A concrete UserDefinedFunction implementation, either a SparkUserDefinedFunction or a + * UserDefinedAggregator. + */ private def transformScalaFunction( fun: proto.CommonInlineUserDefinedFunction): UserDefinedFunction = { val udf = fun.getScalarScalaUdf val udfPacket = unpackUdf(fun) if (udf.getAggregate) { - assert(udfPacket.inputEncoders.size == 1, "UDAF should have exactly one input encoder") + assertPlan(udfPacket.inputEncoders.size == 1, "UDAF should have exactly one input encoder") UserDefinedAggregator( aggregator = udfPacket.function.asInstanceOf[Aggregator[Any, Any, Any]], inputEncoder = ExpressionEncoder(udfPacket.inputEncoders.head), @@ -2074,7 +2113,7 @@ class SparkConnectPlanner( } private def transformJoin(rel: proto.Join): LogicalPlan = { - assert(rel.hasLeft && rel.hasRight, "Both join sides must be present") + assertPlan(rel.hasLeft && rel.hasRight, "Both join sides must be present") if (rel.hasJoinCondition && rel.getUsingColumnsCount > 0) { throw InvalidPlanInput( s"Using columns or join conditions cannot be set at the same time in Join") @@ -2112,10 +2151,10 @@ class SparkConnectPlanner( private def transformAsOfJoin(rel: proto.AsOfJoin): LogicalPlan = { val left = Dataset.ofRows(session, transformRelation(rel.getLeft)) val right = Dataset.ofRows(session, transformRelation(rel.getRight)) - val leftAsOf = column(transformExpression(rel.getLeftAsOf)) - val rightAsOf = column(transformExpression(rel.getRightAsOf)) + val leftAsOf = Column(transformExpression(rel.getLeftAsOf)) + val rightAsOf = Column(transformExpression(rel.getRightAsOf)) val joinType = rel.getJoinType - val tolerance = if (rel.hasTolerance) column(transformExpression(rel.getTolerance)) else null + val tolerance = if (rel.hasTolerance) Column(transformExpression(rel.getTolerance)) else null val allowExactMatches = 
rel.getAllowExactMatches val direction = rel.getDirection @@ -2131,7 +2170,7 @@ class SparkConnectPlanner( allowExactMatches = allowExactMatches, direction = direction) } else { - val joinExprs = if (rel.hasJoinExpr) column(transformExpression(rel.getJoinExpr)) else null + val joinExprs = if (rel.hasJoinExpr) Column(transformExpression(rel.getJoinExpr)) else null left.joinAsOf( other = right, leftAsOf = leftAsOf, @@ -2145,8 +2184,21 @@ class SparkConnectPlanner( joined.logicalPlan } + private def transformLateralJoin(rel: proto.LateralJoin): LogicalPlan = { + assertPlan(rel.hasLeft && rel.hasRight, "Both join sides must be present") + val joinCondition = + if (rel.hasJoinCondition) Some(transformExpression(rel.getJoinCondition)) else None + val joinType = transformJoinType( + if (rel.getJoinType != null) rel.getJoinType else proto.Join.JoinType.JOIN_TYPE_INNER) + logical.LateralJoin( + left = transformRelation(rel.getLeft), + right = LateralSubquery(transformRelation(rel.getRight)), + joinType = joinType, + condition = joinCondition) + } + private def transformSort(sort: proto.Sort): LogicalPlan = { - assert(sort.getOrderCount > 0, "'order' must be present and contain elements.") + assertPlan(sort.getOrderCount > 0, "'order' must be present and contain elements.") logical.Sort( child = transformRelation(sort.getInput), global = sort.getIsGlobal, @@ -2172,7 +2224,7 @@ class SparkConnectPlanner( private def transformDrop(rel: proto.Drop): LogicalPlan = { var output = Dataset.ofRows(session, transformRelation(rel.getInput)) if (rel.getColumnsCount > 0) { - val cols = rel.getColumnsList.asScala.toSeq.map(expr => column(transformExpression(expr))) + val cols = rel.getColumnsList.asScala.toSeq.map(expr => Column(transformExpression(expr))) output = output.drop(cols.head, cols.tail: _*) } if (rel.getColumnNamesCount > 0) { @@ -2202,7 +2254,7 @@ class SparkConnectPlanner( val keyColumn = TypedAggUtils.aggKeyColumn(ds.kEncoder, ds.groupingAttributes) val namedColumns = 
rel.getAggregateExpressionsList.asScala.toSeq - .map(expr => transformExpressionWithTypedReduceExpression(expr, input)) + .map(expr => transformExpressionWithTypedReduceExpression(expr, ds.analyzedData)) .map(toNamedExpression) logical.Aggregate(ds.groupingAttributes, keyColumn +: namedColumns, ds.analyzed) } @@ -2213,9 +2265,17 @@ class SparkConnectPlanner( } val input = transformRelation(rel.getInput) + val logicalPlan = + if (rel.getAggregateExpressionsList.asScala.toSeq.exists( + _.getExprTypeCase == proto.Expression.ExprTypeCase.TYPED_AGGREGATE_EXPRESSION)) { + session.sessionState.executePlan(input).analyzed + } else { + input + } + val groupingExprs = rel.getGroupingExpressionsList.asScala.toSeq.map(transformExpression) val aggExprs = rel.getAggregateExpressionsList.asScala.toSeq - .map(expr => transformExpressionWithTypedReduceExpression(expr, input)) + .map(expr => transformExpressionWithTypedReduceExpression(expr, logicalPlan)) val aliasedAgg = (groupingExprs ++ aggExprs).map(toNamedExpression) rel.getGroupType match { @@ -2223,19 +2283,19 @@ class SparkConnectPlanner( logical.Aggregate( groupingExpressions = groupingExprs, aggregateExpressions = aliasedAgg, - child = input) + child = logicalPlan) case proto.Aggregate.GroupType.GROUP_TYPE_ROLLUP => logical.Aggregate( groupingExpressions = Seq(Rollup(groupingExprs.map(Seq(_)))), aggregateExpressions = aliasedAgg, - child = input) + child = logicalPlan) case proto.Aggregate.GroupType.GROUP_TYPE_CUBE => logical.Aggregate( groupingExpressions = Seq(Cube(groupingExprs.map(Seq(_)))), aggregateExpressions = aliasedAgg, - child = input) + child = logicalPlan) case proto.Aggregate.GroupType.GROUP_TYPE_PIVOT => if (!rel.hasPivot) { @@ -2247,7 +2307,7 @@ class SparkConnectPlanner( rel.getPivot.getValuesList.asScala.toSeq.map(transformLiteral) } else { RelationalGroupedDataset - .collectPivotValues(Dataset.ofRows(session, input), column(pivotExpr)) + .collectPivotValues(Dataset.ofRows(session, logicalPlan), 
Column(pivotExpr)) .map(expressions.Literal.apply) } logical.Pivot( @@ -2255,7 +2315,7 @@ class SparkConnectPlanner( pivotColumn = pivotExpr, pivotValues = valueExprs, aggregates = aggExprs, - child = input) + child = logicalPlan) case proto.Aggregate.GroupType.GROUP_TYPE_GROUPING_SETS => val groupingSetsExprs = rel.getGroupingSetsList.asScala.toSeq.map { getGroupingSets => @@ -2267,7 +2327,7 @@ class SparkConnectPlanner( groupingSets = groupingSetsExprs, userGivenGroupByExprs = groupingExprs)), aggregateExpressions = aliasedAgg, - child = input) + child = logicalPlan) case other => throw InvalidPlanInput(s"Unknown Group Type $other") } @@ -2276,10 +2336,8 @@ class SparkConnectPlanner( private def transformTypedReduceExpression( fun: proto.Expression.UnresolvedFunction, dataAttributes: Seq[Attribute]): Expression = { - assert(fun.getFunctionName == "reduce") - if (fun.getArgumentsCount != 1) { - throw InvalidPlanInput("reduce requires single child expression") - } + assertPlan(fun.getFunctionName == "reduce") + assertPlan(fun.getArgumentsCount == 1, "reduce requires single child expression") val udf = fun.getArgumentsList.asScala match { case collection.Seq(e) if e.hasCommonInlineUserDefinedFunction && @@ -2309,10 +2367,10 @@ class SparkConnectPlanner( expr: proto.TypedAggregateExpression, baseRelationOpt: Option[LogicalPlan]): AggregateExpression = { val udf = expr.getScalarScalaUdf - assert(udf.getAggregate) + assertPlan(udf.getAggregate) val udfPacket = unpackScalaUDF[UdfPacket](udf) - assert(udfPacket.inputEncoders.size == 1, "UDAF should have exactly one input encoder") + assertPlan(udfPacket.inputEncoders.size == 1, "UDAF should have exactly one input encoder") val aggregator = udfPacket.function.asInstanceOf[Aggregator[Any, Any, Any]] val tae = @@ -2345,17 +2403,17 @@ class SparkConnectPlanner( }.toSeq action.getActionType match { case proto.MergeAction.ActionType.ACTION_TYPE_DELETE => - assert(assignments.isEmpty, "Delete action should not have 
assignment.") + assertPlan(assignments.isEmpty, "Delete action should not have assignment.") DeleteAction(condition) case proto.MergeAction.ActionType.ACTION_TYPE_INSERT => InsertAction(condition, assignments) case proto.MergeAction.ActionType.ACTION_TYPE_INSERT_STAR => - assert(assignments.isEmpty, "InsertStar action should not have assignment.") + assertPlan(assignments.isEmpty, "InsertStar action should not have assignment.") InsertStarAction(condition) case proto.MergeAction.ActionType.ACTION_TYPE_UPDATE => UpdateAction(condition, assignments) case proto.MergeAction.ActionType.ACTION_TYPE_UPDATE_STAR => - assert(assignments.isEmpty, "UpdateStar action should not have assignment.") + assertPlan(assignments.isEmpty, "UpdateStar action should not have assignment.") UpdateStarAction(condition) case _ => throw InvalidPlanInput(s"Unsupported merge action type ${action.getActionType}.") @@ -2407,11 +2465,27 @@ class SparkConnectPlanner( handleRemoveCachedRemoteRelationCommand(command.getRemoveCachedRemoteRelationCommand) case proto.Command.CommandTypeCase.MERGE_INTO_TABLE_COMMAND => handleMergeIntoTableCommand(command.getMergeIntoTableCommand) + case proto.Command.CommandTypeCase.ML_COMMAND => + handleMlCommand(command.getMlCommand, responseObserver) case _ => throw new UnsupportedOperationException(s"$command not supported.") } } + private def handleMlCommand( + command: proto.MlCommand, + responseObserver: StreamObserver[proto.ExecutePlanResponse]): Unit = { + val result = MLHandler.handleMlCommand(sessionHolder, command) + executeHolder.eventsManager.postFinished() + responseObserver.onNext( + proto.ExecutePlanResponse + .newBuilder() + .setSessionId(sessionId) + .setServerSideSessionId(sessionHolder.serverSessionId) + .setMlCommandResult(result) + .build()) + } + private def handleSqlCommand( command: SqlCommand, responseObserver: StreamObserver[ExecutePlanResponse]): Unit = { @@ -2574,12 +2648,12 @@ class SparkConnectPlanner( if (!namedArguments.isEmpty) { 
session.sql( sql.getQuery, - namedArguments.asScala.toMap.transform((_, e) => column(transformExpression(e))), + namedArguments.asScala.toMap.transform((_, e) => Column(transformExpression(e))), tracker) } else if (!posArguments.isEmpty) { session.sql( sql.getQuery, - posArguments.asScala.map(e => column(transformExpression(e))).toArray, + posArguments.asScala.map(e => Column(transformExpression(e))).toArray, tracker) } else if (!args.isEmpty) { session.sql( @@ -2710,6 +2784,7 @@ class SparkConnectPlanner( name = tableIdentifier, userSpecifiedColumns = Nil, comment = None, + collation = None, properties = Map.empty, originalText = None, plan = transformRelation(createView.getInput), @@ -2830,7 +2905,7 @@ class SparkConnectPlanner( if (writeOperation.getPartitioningColumnsCount > 0) { val names = writeOperation.getPartitioningColumnsList.asScala .map(transformExpression) - .map(column) + .map(Column(_)) .toSeq w.partitionedBy(names.head, names.tail: _*) } @@ -2848,7 +2923,7 @@ class SparkConnectPlanner( w.create() } case proto.WriteOperationV2.Mode.MODE_OVERWRITE => - w.overwrite(column(transformExpression(writeOperation.getOverwriteCondition))) + w.overwrite(Column(transformExpression(writeOperation.getOverwriteCondition))) case proto.WriteOperationV2.Mode.MODE_OVERWRITE_PARTITIONS => w.overwritePartitions() case proto.WriteOperationV2.Mode.MODE_APPEND => @@ -2946,10 +3021,9 @@ class SparkConnectPlanner( fn case StreamingForeachFunction.FunctionCase.SCALA_FUNCTION => - val scalaFn = Utils.deserialize[StreamingForeachBatchHelper.ForeachBatchFnType]( + StreamingForeachBatchHelper.scalaForeachBatchWrapper( writeOp.getForeachBatch.getScalaFunction.getPayload.toByteArray, - Utils.getContextOrSparkClassLoader) - StreamingForeachBatchHelper.scalaForeachBatchWrapper(scalaFn, sessionHolder) + sessionHolder) case StreamingForeachFunction.FunctionCase.FUNCTION_NOT_SET => throw InvalidPlanInput("Unexpected foreachBatch function") // Unreachable @@ -3410,7 +3484,7 @@ class 
SparkConnectPlanner( val sourceDs = Dataset.ofRows(session, transformRelation(cmd.getSourceTablePlan)) val mergeInto = sourceDs - .mergeInto(cmd.getTargetTableName, column(transformExpression(cmd.getMergeCondition))) + .mergeInto(cmd.getTargetTableName, Column(transformExpression(cmd.getMergeCondition))) .asInstanceOf[MergeIntoWriterImpl[Row]] mergeInto.matchedActions ++= matchedActions mergeInto.notMatchedActions ++= notMatchedActions @@ -3567,7 +3641,7 @@ class SparkConnectPlanner( getCreateExternalTable: proto.CreateExternalTable): LogicalPlan = { val schema = if (getCreateExternalTable.hasSchema) { val struct = transformDataType(getCreateExternalTable.getSchema) - assert(struct.isInstanceOf[StructType]) + assertPlan(struct.isInstanceOf[StructType]) struct.asInstanceOf[StructType] } else { new StructType @@ -3597,7 +3671,7 @@ class SparkConnectPlanner( private def transformCreateTable(getCreateTable: proto.CreateTable): LogicalPlan = { val schema = if (getCreateTable.hasSchema) { val struct = transformDataType(getCreateTable.getSchema) - assert(struct.isInstanceOf[StructType]) + assertPlan(struct.isInstanceOf[StructType]) struct.asInstanceOf[StructType] } else { new StructType @@ -3709,4 +3783,61 @@ class SparkConnectPlanner( session.catalog.listCatalogs().logicalPlan } } + + private def transformLazyExpression(getLazyExpression: proto.LazyExpression): Expression = { + LazyExpression(transformExpression(getLazyExpression.getChild)) + } + + private def transformSubqueryExpression( + getSubqueryExpression: proto.SubqueryExpression): Expression = { + val planId = getSubqueryExpression.getPlanId + getSubqueryExpression.getSubqueryType match { + case proto.SubqueryExpression.SubqueryType.SUBQUERY_TYPE_SCALAR => + UnresolvedScalarSubqueryPlanId(planId) + case proto.SubqueryExpression.SubqueryType.SUBQUERY_TYPE_EXISTS => + UnresolvedExistsPlanId(planId) + case other => throw InvalidPlanInput(s"Unknown SubqueryType $other") + } + } + + private def 
transformWithRelations(getWithRelations: proto.WithRelations): LogicalPlan = { + if (isValidSQLWithRefs(getWithRelations)) { + transformSqlWithRefs(getWithRelations) + } else { + // Wrap the plan to keep the original planId. + val plan = Project(Seq(UnresolvedStar(None)), transformRelation(getWithRelations.getRoot)) + + val relations = getWithRelations.getReferencesList.asScala.map { ref => + if (ref.hasCommon && ref.getCommon.hasPlanId) { + val planId = ref.getCommon.getPlanId + val plan = transformRelation(ref) + planId -> plan + } else { + throw InvalidPlanInput("Invalid WithRelation reference") + } + }.toMap + + val missingPlanIds = mutable.Set.empty[Long] + val withRelations = plan + .transformAllExpressionsWithPruning(_.containsPattern(TreePattern.UNRESOLVED_PLAN_ID)) { + case u: UnresolvedPlanId => + if (relations.contains(u.planId)) { + u.withPlan(relations(u.planId)) + } else { + missingPlanIds += u.planId + u + } + } + assertPlan( + missingPlanIds.isEmpty, + "Missing relation in WithRelations: " + + s"${missingPlanIds.mkString("(", ", ", ")")} not in " + + s"${relations.keys.mkString("(", ", ", ")")}") + withRelations + } + } + + private def assertPlan(assertion: Boolean, message: => String = ""): Unit = { + if (!assertion) throw InvalidPlanInput(message) + } } diff --git a/sql/connect/server/src/main/scala/org/apache/spark/sql/connect/planner/StreamingForeachBatchHelper.scala b/sql/connect/server/src/main/scala/org/apache/spark/sql/connect/planner/StreamingForeachBatchHelper.scala index df883a5c86814..ab6bed7152c09 100644 --- a/sql/connect/server/src/main/scala/org/apache/spark/sql/connect/planner/StreamingForeachBatchHelper.scala +++ b/sql/connect/server/src/main/scala/org/apache/spark/sql/connect/planner/StreamingForeachBatchHelper.scala @@ -27,12 +27,15 @@ import scala.util.control.NonFatal import org.apache.spark.SparkException import org.apache.spark.api.python.{PythonException, PythonWorkerUtils, SimplePythonFunction, SpecialLengths, 
StreamingPythonRunner} import org.apache.spark.internal.{Logging, MDC} -import org.apache.spark.internal.LogKeys.{DATAFRAME_ID, QUERY_ID, RUN_ID, SESSION_ID} -import org.apache.spark.sql.DataFrame +import org.apache.spark.internal.LogKeys.{DATAFRAME_ID, QUERY_ID, RUN_ID_STRING, SESSION_ID} +import org.apache.spark.sql.{DataFrame, Dataset} +import org.apache.spark.sql.catalyst.encoders.{AgnosticEncoder, AgnosticEncoders} +import org.apache.spark.sql.connect.common.ForeachWriterPacket import org.apache.spark.sql.connect.service.SessionHolder import org.apache.spark.sql.connect.service.SparkConnectService import org.apache.spark.sql.streaming.StreamingQuery import org.apache.spark.sql.streaming.StreamingQueryListener +import org.apache.spark.util.Utils /** * A helper class for handling ForeachBatch related functionality in Spark Connect servers @@ -88,13 +91,31 @@ object StreamingForeachBatchHelper extends Logging { * DataFrame, so the user code actually runs with legacy DataFrame and session.. */ def scalaForeachBatchWrapper( - fn: ForeachBatchFnType, + payloadBytes: Array[Byte], sessionHolder: SessionHolder): ForeachBatchFnType = { + val foreachBatchPkt = + Utils.deserialize[ForeachWriterPacket](payloadBytes, Utils.getContextOrSparkClassLoader) + val fn = foreachBatchPkt.foreachWriter.asInstanceOf[(Dataset[Any], Long) => Unit] + val encoder = foreachBatchPkt.datasetEncoder.asInstanceOf[AgnosticEncoder[Any]] // TODO(SPARK-44462): Set up Spark Connect session. // Do we actually need this for the first version? dataFrameCachingWrapper( (args: FnArgsWithId) => { - fn(args.df, args.batchId) // dfId is not used, see hack comment above. + // dfId is not used, see hack comment above. + try { + val ds = if (AgnosticEncoders.UnboundRowEncoder == encoder) { + // When the dataset is a DataFrame (Dataset[Row]). + args.df.asInstanceOf[Dataset[Any]] + } else { + // Recover the Dataset from the DataFrame using the encoder. 
+ Dataset.apply(args.df.sparkSession, args.df.logicalPlan)(encoder) + } + fn(ds, args.batchId) + } catch { + case t: Throwable => + logError(s"Calling foreachBatch fn failed", t) + throw t + } }, sessionHolder) } @@ -203,7 +224,7 @@ object StreamingForeachBatchHelper extends Logging { Option(cleanerCache.remove(key)).foreach { cleaner => logInfo( log"Cleaning up runner for queryId ${MDC(QUERY_ID, key.queryId)} " + - log"runId ${MDC(RUN_ID, key.runId)}.") + log"runId ${MDC(RUN_ID_STRING, key.runId)}.") cleaner.close() } } diff --git a/sql/connect/server/src/main/scala/org/apache/spark/sql/connect/service/ExecuteEventsManager.scala b/sql/connect/server/src/main/scala/org/apache/spark/sql/connect/service/ExecuteEventsManager.scala index faa7582d169f1..61cd95621d156 100644 --- a/sql/connect/server/src/main/scala/org/apache/spark/sql/connect/service/ExecuteEventsManager.scala +++ b/sql/connect/server/src/main/scala/org/apache/spark/sql/connect/service/ExecuteEventsManager.scala @@ -145,13 +145,19 @@ case class ExecuteEventsManager(executeHolder: ExecuteHolder, clock: Clock) { * * @param analyzedPlan * The analyzed plan generated by the Connect request plan. None when the request does not - * generate a plan. + * generate a Spark plan or analysis fails. + * @param parsedPlan + * The parsed plan generated by the Connect request plan. None when the request does not + * generate a Spark plan or analysis succeeds. 
*/ - def postAnalyzed(analyzedPlan: Option[LogicalPlan] = None): Unit = { + def postAnalyzed( + analyzedPlan: Option[LogicalPlan] = None, + parsedPlan: Option[LogicalPlan] = None): Unit = { assertStatus(List(ExecuteStatus.Started, ExecuteStatus.Analyzed), ExecuteStatus.Analyzed) val event = SparkListenerConnectOperationAnalyzed(jobTag, operationId, clock.getTimeMillis()) event.analyzedPlan = analyzedPlan + event.parsedPlan = parsedPlan listenerBus.post(event) } @@ -251,6 +257,12 @@ case class ExecuteEventsManager(executeHolder: ExecuteHolder, clock: Clock) { postAnalyzed(Some(analyzedPlan)) } + override def analysisFailed( + tracker: QueryPlanningTracker, + parsedPlan: LogicalPlan): Unit = { + postAnalyzed(parsedPlan = Some(parsedPlan)) + } + def readyForExecution(tracker: QueryPlanningTracker): Unit = postReadyForExecution() })) } @@ -341,9 +353,15 @@ case class SparkListenerConnectOperationAnalyzed( extraTags: Map[String, String] = Map.empty) extends SparkListenerEvent { + /** + * Parsed Spark plan generated by the Connect request. None when the Connect request does not + * generate a Spark plan or analysis succeeds. + */ + @JsonIgnore var parsedPlan: Option[LogicalPlan] = None + /** + * Analyzed Spark plan generated by the Connect request. None when the Connect request does not - * generate a Spark plan. + * generate a Spark plan or analysis fails. 
*/ @JsonIgnore var analyzedPlan: Option[LogicalPlan] = None } diff --git a/sql/connect/server/src/main/scala/org/apache/spark/sql/connect/service/ExecuteHolder.scala b/sql/connect/server/src/main/scala/org/apache/spark/sql/connect/service/ExecuteHolder.scala index 821ddb2c85d58..94638151f7f18 100644 --- a/sql/connect/server/src/main/scala/org/apache/spark/sql/connect/service/ExecuteHolder.scala +++ b/sql/connect/server/src/main/scala/org/apache/spark/sql/connect/service/ExecuteHolder.scala @@ -22,6 +22,8 @@ import java.util.concurrent.atomic.AtomicBoolean import scala.collection.mutable import scala.jdk.CollectionConverters._ +import com.google.protobuf.GeneratedMessage + import org.apache.spark.SparkEnv import org.apache.spark.connect.proto import org.apache.spark.internal.Logging @@ -81,6 +83,10 @@ private[connect] class ExecuteHolder( val observations: mutable.Map[String, Observation] = mutable.Map.empty + lazy val allObservationAndPlanIds: Map[String, Long] = { + ExecuteHolder.collectAllObservationAndPlanIds(request.getPlan).toMap + } + private val runner: ExecuteThreadRunner = new ExecuteThreadRunner(this) /** System.currentTimeMillis when this ExecuteHolder was created. 
*/ @@ -289,6 +295,26 @@ private[connect] class ExecuteHolder( def operationId: String = key.operationId } +private object ExecuteHolder { + private def collectAllObservationAndPlanIds( + planOrMessage: GeneratedMessage, + collected: mutable.Map[String, Long] = mutable.Map.empty): mutable.Map[String, Long] = { + planOrMessage match { + case relation: proto.Relation if relation.hasCollectMetrics => + collected += relation.getCollectMetrics.getName -> relation.getCommon.getPlanId + collectAllObservationAndPlanIds(relation.getCollectMetrics.getInput, collected) + case _ => + planOrMessage.getAllFields.values().asScala.foreach { + case message: GeneratedMessage => + collectAllObservationAndPlanIds(message, collected) + case _ => + // not a message (probably a primitive type), do nothing + } + } + collected + } +} + /** Used to identify ExecuteHolder jobTag among SparkContext.SPARK_JOB_TAGS. */ object ExecuteJobTag { private val prefix = "SparkConnect_OperationTag" diff --git a/sql/connect/server/src/main/scala/org/apache/spark/sql/connect/service/SessionHolder.scala b/sql/connect/server/src/main/scala/org/apache/spark/sql/connect/service/SessionHolder.scala index 5dced7acfb0d2..5b56b7079a897 100644 --- a/sql/connect/server/src/main/scala/org/apache/spark/sql/connect/service/SessionHolder.scala +++ b/sql/connect/server/src/main/scala/org/apache/spark/sql/connect/service/SessionHolder.scala @@ -37,6 +37,7 @@ import org.apache.spark.sql.SparkSession import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.connect.common.InvalidPlanInput import org.apache.spark.sql.connect.config.Connect +import org.apache.spark.sql.connect.ml.MLCache import org.apache.spark.sql.connect.planner.PythonStreamingQueryListener import org.apache.spark.sql.connect.planner.StreamingForeachBatchHelper import org.apache.spark.sql.connect.service.ExecuteKey @@ -111,6 +112,9 @@ case class SessionHolder(userId: String, sessionId: String, session: SparkSessio 
private[spark] lazy val dataFrameCache: ConcurrentMap[String, DataFrame] = new ConcurrentHashMap() + // ML model cache + private[connect] lazy val mlCache = new MLCache() + // Mapping from id to StreamingQueryListener. Used for methods like removeListener() in // StreamingQueryManager. private lazy val listenerCache: ConcurrentMap[String, StreamingQueryListener] = @@ -301,7 +305,7 @@ case class SessionHolder(userId: String, sessionId: String, session: SparkSessio // Clean up all artifacts. // Note: there can be concurrent AddArtifact calls still adding something. - artifactManager.cleanUpResources() + artifactManager.close() // Clean up running streaming queries. // Note: there can be concurrent streaming queries being started. @@ -322,6 +326,8 @@ case class SessionHolder(userId: String, sessionId: String, session: SparkSessio // remove all executions and no new executions will be added in the meanwhile. SparkConnectService.executionManager.removeAllExecutionsForSession(this.key) + mlCache.clear() + eventManager.postClosed() } diff --git a/sql/connect/server/src/main/scala/org/apache/spark/sql/connect/service/SparkConnectAddArtifactsHandler.scala b/sql/connect/server/src/main/scala/org/apache/spark/sql/connect/service/SparkConnectAddArtifactsHandler.scala index 72403016404c8..3ba79402e99ef 100644 --- a/sql/connect/server/src/main/scala/org/apache/spark/sql/connect/service/SparkConnectAddArtifactsHandler.scala +++ b/sql/connect/server/src/main/scala/org/apache/spark/sql/connect/service/SparkConnectAddArtifactsHandler.scala @@ -87,8 +87,8 @@ class SparkConnectAddArtifactsHandler(val responseObserver: StreamObserver[AddAr ErrorUtils.handleError( "addArtifacts.onNext", responseObserver, - holder.userId, - holder.sessionId, + req.getUserContext.getUserId, + req.getSessionId, None, false, Some(() => { diff --git a/sql/connect/server/src/main/scala/org/apache/spark/sql/connect/service/SparkConnectAnalyzeHandler.scala 
b/sql/connect/server/src/main/scala/org/apache/spark/sql/connect/service/SparkConnectAnalyzeHandler.scala index 6c5d95ac67d3d..8ca021c5be39e 100644 --- a/sql/connect/server/src/main/scala/org/apache/spark/sql/connect/service/SparkConnectAnalyzeHandler.scala +++ b/sql/connect/server/src/main/scala/org/apache/spark/sql/connect/service/SparkConnectAnalyzeHandler.scala @@ -27,6 +27,7 @@ import org.apache.spark.sql.Dataset import org.apache.spark.sql.connect.common.{DataTypeProtoConverter, InvalidPlanInput, StorageLevelProtoConverter} import org.apache.spark.sql.connect.planner.SparkConnectPlanner import org.apache.spark.sql.execution.{CodegenMode, CostMode, ExtendedMode, FormattedMode, SimpleMode} +import org.apache.spark.sql.types.{DataType, StructType} import org.apache.spark.util.ArrayImplicits._ private[connect] class SparkConnectAnalyzeHandler( @@ -206,6 +207,17 @@ private[connect] class SparkConnectAnalyzeHandler( .setStorageLevel(StorageLevelProtoConverter.toConnectProtoType(storageLevel)) .build()) + case proto.AnalyzePlanRequest.AnalyzeCase.JSON_TO_DDL => + val ddl = DataType + .fromJson(request.getJsonToDdl.getJsonString) + .asInstanceOf[StructType] + .toDDL + builder.setJsonToDdl( + proto.AnalyzePlanResponse.JsonToDDL + .newBuilder() + .setDdlString(ddl) + .build()) + case other => throw InvalidPlanInput(s"Unknown Analyze Method $other!") } diff --git a/sql/connect/server/src/main/scala/org/apache/spark/sql/connect/service/SparkConnectConfigHandler.scala b/sql/connect/server/src/main/scala/org/apache/spark/sql/connect/service/SparkConnectConfigHandler.scala index c5e484e022bc4..06bc24b6ccae6 100644 --- a/sql/connect/server/src/main/scala/org/apache/spark/sql/connect/service/SparkConnectConfigHandler.scala +++ b/sql/connect/server/src/main/scala/org/apache/spark/sql/connect/service/SparkConnectConfigHandler.scala @@ -73,11 +73,21 @@ class SparkConnectConfigHandler(responseObserver: StreamObserver[proto.ConfigRes private def handleSet( operation: 
proto.ConfigRequest.Set, conf: RuntimeConfig): proto.ConfigResponse.Builder = { + val silent = operation.hasSilent && operation.getSilent val builder = proto.ConfigResponse.newBuilder() operation.getPairsList.asScala.iterator.foreach { pair => val (key, value) = SparkConnectConfigHandler.toKeyValue(pair) - conf.set(key, value.orNull) - getWarning(key).foreach(builder.addWarnings) + try { + conf.set(key, value.orNull) + getWarning(key).foreach(builder.addWarnings) + } catch { + case e: Throwable => + if (silent) { + builder.addWarnings(s"Failed to set $key to $value due to ${e.getMessage}") + } else { + throw e + } + } } builder } diff --git a/sql/connect/server/src/main/scala/org/apache/spark/sql/connect/service/SparkConnectReleaseSessionHandler.scala b/sql/connect/server/src/main/scala/org/apache/spark/sql/connect/service/SparkConnectReleaseSessionHandler.scala index ec7a7f3bd242c..c36f07fc67f8f 100644 --- a/sql/connect/server/src/main/scala/org/apache/spark/sql/connect/service/SparkConnectReleaseSessionHandler.scala +++ b/sql/connect/server/src/main/scala/org/apache/spark/sql/connect/service/SparkConnectReleaseSessionHandler.scala @@ -37,7 +37,8 @@ class SparkConnectReleaseSessionHandler( val maybeSession = SparkConnectService.sessionManager.getIsolatedSessionIfPresent(key) maybeSession.foreach(f => responseBuilder.setServerSideSessionId(f.serverSessionId)) - SparkConnectService.sessionManager.closeSession(key) + val allowReconnect = v.getAllowReconnect + SparkConnectService.sessionManager.closeSession(key, allowReconnect) responseObserver.onNext(responseBuilder.build()) responseObserver.onCompleted() diff --git a/sql/connect/server/src/main/scala/org/apache/spark/sql/connect/service/SparkConnectSessionManager.scala b/sql/connect/server/src/main/scala/org/apache/spark/sql/connect/service/SparkConnectSessionManager.scala index a306856efa33c..c59fd02a829ae 100644 --- 
a/sql/connect/server/src/main/scala/org/apache/spark/sql/connect/service/SparkConnectSessionManager.scala +++ b/sql/connect/server/src/main/scala/org/apache/spark/sql/connect/service/SparkConnectSessionManager.scala @@ -134,7 +134,9 @@ class SparkConnectSessionManager extends Logging { } // Removes session from sessionStore and returns it. - private def removeSessionHolder(key: SessionKey): Option[SessionHolder] = { + private def removeSessionHolder( + key: SessionKey, + allowReconnect: Boolean = false): Option[SessionHolder] = { var sessionHolder: Option[SessionHolder] = None // The session holder should remain in the session store until it is added to the closed session @@ -144,9 +146,11 @@ class SparkConnectSessionManager extends Logging { sessionHolder = Option(sessionStore.get(key)) sessionHolder.foreach { s => - // Put into closedSessionsCache to prevent the same session from being recreated by - // getOrCreateIsolatedSession. - closedSessionsCache.put(s.key, s.getSessionHolderInfo) + if (!allowReconnect) { + // Put into closedSessionsCache to prevent the same session from being recreated by + // getOrCreateIsolatedSession when reconnection isn't allowed. + closedSessionsCache.put(s.key, s.getSessionHolderInfo) + } // Then, remove the session holder from the session store. sessionStore.remove(key) @@ -154,17 +158,21 @@ class SparkConnectSessionManager extends Logging { sessionHolder } - // Shut downs the session after removing. - private def shutdownSessionHolder(sessionHolder: SessionHolder): Unit = { + // Shuts down the session after removing. + private def shutdownSessionHolder( + sessionHolder: SessionHolder, + allowReconnect: Boolean = false): Unit = { sessionHolder.close() - // Update in closedSessionsCache: above it wasn't updated with closedTime etc. yet. - closedSessionsCache.put(sessionHolder.key, sessionHolder.getSessionHolderInfo) + if (!allowReconnect) { + // Update in closedSessionsCache: above it wasn't updated with closedTime etc. yet. 
+ closedSessionsCache.put(sessionHolder.key, sessionHolder.getSessionHolderInfo) + } } - def closeSession(key: SessionKey): Unit = { - val sessionHolder = removeSessionHolder(key) + def closeSession(key: SessionKey, allowReconnect: Boolean = false): Unit = { + val sessionHolder = removeSessionHolder(key, allowReconnect) // Rest of the cleanup: the session cannot be accessed anymore by getOrCreateIsolatedSession. - sessionHolder.foreach(shutdownSessionHolder(_)) + sessionHolder.foreach(shutdownSessionHolder(_, allowReconnect)) } private[connect] def shutdown(): Unit = { @@ -289,8 +297,10 @@ class SparkConnectSessionManager extends Logging { * Used for testing */ private[connect] def invalidateAllSessions(): Unit = { - periodicMaintenance(defaultInactiveTimeoutMs = 0L, ignoreCustomTimeout = true) - assert(sessionStore.isEmpty) + sessionStore.forEach((key, sessionHolder) => { + removeSessionHolder(key) + shutdownSessionHolder(sessionHolder) + }) closedSessionsCache.invalidateAll() } diff --git a/sql/connect/server/src/test/scala/org/apache/spark/sql/connect/SparkConnectServerTest.scala b/sql/connect/server/src/test/scala/org/apache/spark/sql/connect/SparkConnectServerTest.scala index b04c42a730785..3c857554dc756 100644 --- a/sql/connect/server/src/test/scala/org/apache/spark/sql/connect/SparkConnectServerTest.scala +++ b/sql/connect/server/src/test/scala/org/apache/spark/sql/connect/SparkConnectServerTest.scala @@ -128,6 +128,17 @@ trait SparkConnectServerTest extends SharedSparkSession { req.build() } + protected def buildReleaseSessionRequest( + sessionId: String = defaultSessionId, + allowReconnect: Boolean = false) = { + proto.ReleaseSessionRequest + .newBuilder() + .setUserContext(userContext) + .setSessionId(sessionId) + .setAllowReconnect(allowReconnect) + .build() + } + protected def buildPlan(query: String) = { proto.Plan.newBuilder().setRoot(dsl.sql(query)).build() } diff --git a/sql/connect/server/src/test/scala/org/apache/spark/sql/connect/ml/MLSuite.scala 
b/sql/connect/server/src/test/scala/org/apache/spark/sql/connect/ml/MLSuite.scala new file mode 100644 index 0000000000000..bea7072b2034e --- /dev/null +++ b/sql/connect/server/src/test/scala/org/apache/spark/sql/connect/ml/MLSuite.scala @@ -0,0 +1,390 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.connect.ml + +import java.io.File + +import org.apache.spark.SparkFunSuite +import org.apache.spark.connect.proto +import org.apache.spark.ml.classification.LogisticRegressionModel +import org.apache.spark.ml.linalg.{Vectors, VectorUDT} +import org.apache.spark.ml.param._ +import org.apache.spark.ml.util.Identifiable +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.catalyst.expressions.UnsafeProjection +import org.apache.spark.sql.catalyst.types.DataTypeUtils +import org.apache.spark.sql.connect.SparkConnectTestUtils +import org.apache.spark.sql.connect.planner.SparkConnectPlanTest +import org.apache.spark.sql.types.{FloatType, Metadata, StructField, StructType} +import org.apache.spark.util.Utils + +trait FakeArrayParams extends Params { + final val arrayString: StringArrayParam = + new StringArrayParam(this, "arrayString", "array string") + + final def getArrayString: Array[String] = $(arrayString) + + final val arrayDouble: DoubleArrayParam = + new DoubleArrayParam(this, "arrayDouble", "array double") + + final def getArrayDouble: Array[Double] = $(arrayDouble) + + final val arrayInt: IntArrayParam = new IntArrayParam(this, "arrayInt", "array int") + + final def getArrayInt: Array[Int] = $(arrayInt) + + final val int: IntParam = new IntParam(this, "int", "int") + + final def getInt: Int = $(int) + + final val float: FloatParam = new FloatParam(this, "float", "float") + + final def getFloat: Float = $(float) + + final val boolean: BooleanParam = new BooleanParam(this, "boolean", "boolean") + + final def getBoolean: Boolean = $(boolean) + + final val double: DoubleParam = new DoubleParam(this, "double", "double") + + final def getDouble: Double = $(double) +} + +class FakedML(override val uid: String) extends FakeArrayParams { + def this() = this(Identifiable.randomUID("FakedML")) + + override def copy(extra: ParamMap): Params = this +} + +class MLSuite extends SparkFunSuite with 
SparkConnectPlanTest { + + def createLocalRelationProto: proto.Relation = { + val udt = new VectorUDT() + val rows = Seq( + InternalRow(1.0f, udt.serialize(Vectors.dense(Array(1.0, 2.0)))), + InternalRow(1.0f, udt.serialize(Vectors.dense(Array(2.0, -1.0)))), + InternalRow(0.0f, udt.serialize(Vectors.dense(Array(-3.0, -2.0)))), + InternalRow(0.0f, udt.serialize(Vectors.dense(Array(-1.0, -2.0))))) + + val schema = StructType( + Seq( + StructField("label", FloatType), + StructField("features", new VectorUDT(), false, Metadata.empty))) + + val inputRows = rows.map { row => + val proj = UnsafeProjection.create(schema) + proj(row).copy() + } + createLocalRelationProto(DataTypeUtils.toAttributes(schema), inputRows, "UTC", Some(schema)) + } + + test("reconcileParam") { + val fakedML = new FakedML + val params = proto.MlParams + .newBuilder() + .putParams( + "boolean", + proto.Param + .newBuilder() + .setLiteral(proto.Expression.Literal.newBuilder().setBoolean(true)) + .build()) + .putParams( + "double", + proto.Param + .newBuilder() + .setLiteral(proto.Expression.Literal.newBuilder().setDouble(1.0)) + .build()) + .putParams( + "int", + proto.Param + .newBuilder() + .setLiteral(proto.Expression.Literal.newBuilder().setInteger(10)) + .build()) + .putParams( + "float", + proto.Param + .newBuilder() + .setLiteral(proto.Expression.Literal.newBuilder().setFloat(10.0f)) + .build()) + .putParams( + "arrayString", + proto.Param + .newBuilder() + .setLiteral( + proto.Expression.Literal + .newBuilder() + .setArray( + proto.Expression.Literal.Array + .newBuilder() + .setElementType(proto.DataType + .newBuilder() + .setString(proto.DataType.String.getDefaultInstance) + .build()) + .addElements(proto.Expression.Literal.newBuilder().setString("hello")) + .addElements(proto.Expression.Literal.newBuilder().setString("world")) + .build()) + .build()) + .build()) + .putParams( + "arrayInt", + proto.Param + .newBuilder() + .setLiteral( + proto.Expression.Literal + .newBuilder() + .setArray( + 
proto.Expression.Literal.Array + .newBuilder() + .setElementType(proto.DataType + .newBuilder() + .setInteger(proto.DataType.Integer.getDefaultInstance) + .build()) + .addElements(proto.Expression.Literal.newBuilder().setInteger(1)) + .addElements(proto.Expression.Literal.newBuilder().setInteger(2)) + .build()) + .build()) + .build()) + .putParams( + "arrayDouble", + proto.Param + .newBuilder() + .setLiteral( + proto.Expression.Literal + .newBuilder() + .setArray( + proto.Expression.Literal.Array + .newBuilder() + .setElementType(proto.DataType + .newBuilder() + .setDouble(proto.DataType.Double.getDefaultInstance) + .build()) + .addElements(proto.Expression.Literal.newBuilder().setDouble(11.0)) + .addElements(proto.Expression.Literal.newBuilder().setDouble(12.0)) + .build()) + .build()) + .build()) + .build() + MLUtils.setInstanceParams(fakedML, params) + assert(fakedML.getInt === 10) + assert(fakedML.getFloat === 10.0) + assert(fakedML.getArrayInt === Array(1, 2)) + assert(fakedML.getArrayDouble === Array(11.0, 12.0)) + assert(fakedML.getArrayString === Array("hello", "world")) + assert(fakedML.getBoolean === true) + assert(fakedML.getDouble === 1.0) + } + + test("LogisticRegression works") { + val sessionHolder = SparkConnectTestUtils.createDummySessionHolder(spark) + + def verifyModel(modelId: String, hasSummary: Boolean = false): Unit = { + val model = sessionHolder.mlCache.get(modelId) + // Model is cached + assert(model != null) + assert(model.isInstanceOf[LogisticRegressionModel]) + val lrModel = model.asInstanceOf[LogisticRegressionModel] + assert(lrModel.getMaxIter === 2) + + // Fetch double attribute + val interceptCommand = proto.MlCommand + .newBuilder() + .setFetch( + proto.Fetch + .newBuilder() + .setObjRef(proto.ObjectRef.newBuilder().setId(modelId)) + .addMethods(proto.Fetch.Method.newBuilder().setMethod("intercept"))) + .build() + val interceptResult = MLHandler.handleMlCommand(sessionHolder, interceptCommand) + 
assert(interceptResult.getParam.getLiteral.getDouble === lrModel.intercept) + + // Fetch Vector attribute + val coefficientsCommand = proto.MlCommand + .newBuilder() + .setFetch( + proto.Fetch + .newBuilder() + .setObjRef(proto.ObjectRef.newBuilder().setId(modelId)) + .addMethods(proto.Fetch.Method.newBuilder().setMethod("coefficients"))) + .build() + val coefficientsResult = MLHandler.handleMlCommand(sessionHolder, coefficientsCommand) + val deserializedCoefficients = + MLUtils.deserializeVector(coefficientsResult.getParam.getVector) + assert(deserializedCoefficients === lrModel.coefficients) + + // Fetch Matrix attribute + val coefficientsMatrixCommand = proto.MlCommand + .newBuilder() + .setFetch( + proto.Fetch + .newBuilder() + .setObjRef(proto.ObjectRef.newBuilder().setId(modelId)) + .addMethods(proto.Fetch.Method.newBuilder().setMethod("coefficientMatrix"))) + .build() + val coefficientsMatrixResult = + MLHandler.handleMlCommand(sessionHolder, coefficientsMatrixCommand) + val deserializedCoefficientsMatrix = + MLUtils.deserializeMatrix(coefficientsMatrixResult.getParam.getMatrix) + assert(lrModel.coefficientMatrix === deserializedCoefficientsMatrix) + + // Predict with sparse vector + val sparseVector = Vectors.dense(Array(0.0, 2.0)).toSparse + val predictCommand = proto.MlCommand + .newBuilder() + .setFetch( + proto.Fetch + .newBuilder() + .setObjRef(proto.ObjectRef.newBuilder().setId(modelId)) + .addMethods( + proto.Fetch.Method + .newBuilder() + .setMethod("predict") + .addArgs(proto.Fetch.Method.Args + .newBuilder() + .setParam(Serializer.serializeParam(sparseVector))))) + .build() + val predictResult = MLHandler.handleMlCommand(sessionHolder, predictCommand) + val predictValue = predictResult.getParam.getLiteral.getDouble + assert(lrModel.predict(sparseVector) === predictValue) + + // The loaded model doesn't have summary + if (hasSummary) { + // Fetch summary attribute + val accuracyCommand = proto.MlCommand + .newBuilder() + .setFetch( + proto.Fetch + 
.newBuilder() + .setObjRef(proto.ObjectRef.newBuilder().setId(modelId)) + .addMethods(proto.Fetch.Method.newBuilder().setMethod("summary")) + .addMethods(proto.Fetch.Method.newBuilder().setMethod("accuracy"))) + .build() + val accuracyResult = MLHandler.handleMlCommand(sessionHolder, accuracyCommand) + assert(lrModel.summary.accuracy === accuracyResult.getParam.getLiteral.getDouble) + + val weightedFMeasureCommand = proto.MlCommand + .newBuilder() + .setFetch( + proto.Fetch + .newBuilder() + .setObjRef(proto.ObjectRef.newBuilder().setId(modelId)) + .addMethods(proto.Fetch.Method.newBuilder().setMethod("summary")) + .addMethods( + proto.Fetch.Method + .newBuilder() + .setMethod("weightedFMeasure") + .addArgs(proto.Fetch.Method.Args + .newBuilder() + .setParam(Serializer.serializeParam(2.5))))) + .build() + val weightedFMeasureResult = + MLHandler.handleMlCommand(sessionHolder, weightedFMeasureCommand) + assert( + lrModel.summary.weightedFMeasure(2.5) === + weightedFMeasureResult.getParam.getLiteral.getDouble) + } + } + + try { + val fitCommand = proto.MlCommand + .newBuilder() + .setFit( + proto.MlCommand.Fit + .newBuilder() + .setDataset(createLocalRelationProto) + .setEstimator( + proto.MlOperator + .newBuilder() + .setName("org.apache.spark.ml.classification.LogisticRegression") + .setUid("LogisticRegression") + .setType(proto.MlOperator.OperatorType.ESTIMATOR)) + .setParams( + proto.MlParams + .newBuilder() + .putParams( + "maxIter", + proto.Param + .newBuilder() + .setLiteral(proto.Expression.Literal + .newBuilder() + .setInteger(2)) + .build()))) + .build() + val fitResult = MLHandler.handleMlCommand(sessionHolder, fitCommand) + val modelId = fitResult.getOperatorInfo.getObjRef.getId + + verifyModel(modelId, true) + + // read/write + val tempDir = Utils.createTempDir(namePrefix = this.getClass.getName) + try { + val path = new File(tempDir, Identifiable.randomUID("LogisticRegression")).getPath + val writeCmd = proto.MlCommand + .newBuilder() + .setWrite( + 
proto.MlCommand.Write + .newBuilder() + .setPath(path) + .setObjRef(proto.ObjectRef.newBuilder().setId(modelId))) + .build() + MLHandler.handleMlCommand(sessionHolder, writeCmd) + + val readCmd = proto.MlCommand + .newBuilder() + .setRead( + proto.MlCommand.Read + .newBuilder() + .setOperator( + proto.MlOperator + .newBuilder() + .setName("org.apache.spark.ml.classification.LogisticRegressionModel") + .setType(proto.MlOperator.OperatorType.MODEL)) + .setPath(path)) + .build() + + val readResult = MLHandler.handleMlCommand(sessionHolder, readCmd) + verifyModel(readResult.getOperatorInfo.getObjRef.getId) + + } finally { + Utils.deleteRecursively(tempDir) + } + + } finally { + sessionHolder.mlCache.clear() + } + } + + test("Exception: Unsupported ML operator") { + intercept[MlUnsupportedException] { + val sessionHolder = SparkConnectTestUtils.createDummySessionHolder(spark) + val command = proto.MlCommand + .newBuilder() + .setFit( + proto.MlCommand.Fit + .newBuilder() + .setDataset(createLocalRelationProto) + .setEstimator( + proto.MlOperator + .newBuilder() + .setName("org.apache.spark.ml.NotExistingML") + .setUid("FakedUid") + .setType(proto.MlOperator.OperatorType.ESTIMATOR))) + .build() + MLHandler.handleMlCommand(sessionHolder, command) + } + } +} diff --git a/sql/connect/server/src/test/scala/org/apache/spark/sql/connect/planner/SparkConnectPlannerSuite.scala b/sql/connect/server/src/test/scala/org/apache/spark/sql/connect/planner/SparkConnectPlannerSuite.scala index e44d3eacc66df..55c492f511049 100644 --- a/sql/connect/server/src/test/scala/org/apache/spark/sql/connect/planner/SparkConnectPlannerSuite.scala +++ b/sql/connect/server/src/test/scala/org/apache/spark/sql/connect/planner/SparkConnectPlannerSuite.scala @@ -26,9 +26,10 @@ import org.apache.spark.connect.proto import org.apache.spark.connect.proto.Expression.{Alias, ExpressionString, UnresolvedStar} import org.apache.spark.sql.{AnalysisException, Dataset, Row} import 
org.apache.spark.sql.catalyst.InternalRow -import org.apache.spark.sql.catalyst.analysis.UnresolvedRelation +import org.apache.spark.sql.catalyst.analysis.{UnresolvedAlias, UnresolvedFunction, UnresolvedRelation} import org.apache.spark.sql.catalyst.expressions.{AttributeReference, UnsafeProjection} import org.apache.spark.sql.catalyst.plans.logical +import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.catalyst.types.DataTypeUtils import org.apache.spark.sql.connect.SparkConnectTestUtils import org.apache.spark.sql.connect.common.InvalidPlanInput @@ -88,7 +89,8 @@ trait SparkConnectPlanTest extends SharedSparkSession { def createLocalRelationProto( attrs: Seq[AttributeReference], data: Seq[InternalRow], - timeZoneId: String = "UTC"): proto.Relation = { + timeZoneId: String = "UTC", + schema: Option[StructType] = None): proto.Relation = { val localRelationBuilder = proto.LocalRelation.newBuilder() val bytes = ArrowConverters @@ -102,6 +104,7 @@ trait SparkConnectPlanTest extends SharedSparkSession { .next() localRelationBuilder.setData(ByteString.copyFrom(bytes)) + schema.foreach(s => localRelationBuilder.setSchema(s.json)) proto.Relation.newBuilder().setLocalRelation(localRelationBuilder.build()).build() } } @@ -274,7 +277,7 @@ class SparkConnectPlannerSuite extends SparkFunSuite with SparkConnectPlanTest { test("Simple Join") { val incompleteJoin = proto.Relation.newBuilder.setJoin(proto.Join.newBuilder.setLeft(readRel)).build() - intercept[AssertionError](transform(incompleteJoin)) + intercept[InvalidPlanInput](transform(incompleteJoin)) // Join type JOIN_TYPE_UNSPECIFIED is not supported. 
intercept[InvalidPlanInput] { @@ -503,26 +506,27 @@ class SparkConnectPlannerSuite extends SparkFunSuite with SparkConnectPlanTest { } test("Test duplicated names in WithColumns") { - intercept[AnalysisException] { - transform( - proto.Relation - .newBuilder() - .setWithColumns( - proto.WithColumns - .newBuilder() - .setInput(readRel) - .addAliases(proto.Expression.Alias + val logical = transform( + proto.Relation + .newBuilder() + .setWithColumns( + proto.WithColumns + .newBuilder() + .setInput(readRel) + .addAliases( + proto.Expression.Alias .newBuilder() .addName("test") .setExpr(proto.Expression.newBuilder .setLiteral(proto.Expression.Literal.newBuilder.setInteger(32)))) - .addAliases(proto.Expression.Alias - .newBuilder() - .addName("test") - .setExpr(proto.Expression.newBuilder - .setLiteral(proto.Expression.Literal.newBuilder.setInteger(32))))) - .build()) - } + .addAliases(proto.Expression.Alias + .newBuilder() + .addName("test") + .setExpr(proto.Expression.newBuilder + .setLiteral(proto.Expression.Literal.newBuilder.setInteger(32))))) + .build()) + + intercept[AnalysisException](Dataset.ofRows(spark, logical)) } test("Test multi nameparts for column names in WithColumns") { @@ -884,4 +888,36 @@ class SparkConnectPlannerSuite extends SparkFunSuite with SparkConnectPlanTest { intercept[AnalysisException](Dataset.ofRows(spark, logical)) } + + test("Internal functions") { + def getProjectRelationWithFn(name: String, isInternal: Option[Boolean]): proto.Relation = { + val fn = proto.Expression.UnresolvedFunction.newBuilder.setFunctionName(name) + isInternal.foreach(fn.setIsInternal) + val proj = proto.Project.newBuilder + .setInput(readRel) + .addExpressions(proto.Expression.newBuilder.setUnresolvedFunction(fn)) + proto.Relation.newBuilder.setProject(proj).build() + } + + def getUnresolvedFunction(plan: LogicalPlan): UnresolvedFunction = + plan.expressions.head.asInstanceOf[UnresolvedAlias].child.asInstanceOf[UnresolvedFunction] + + // "bloom_filter_agg" is an 
internal function. + val plan1 = transform(getProjectRelationWithFn("bloom_filter_agg", isInternal = None)) + val fn1 = getUnresolvedFunction(plan1) + assert(fn1.nameParts.head == "bloom_filter_agg") + assert(fn1.isInternal) + + // "abcde" is not an internal function. + val plan2 = transform(getProjectRelationWithFn("abcde", isInternal = None)) + val fn2 = getUnresolvedFunction(plan2) + assert(fn2.nameParts.head == "abcde") + assert(!fn2.isInternal) + + // "abcde" is not an internal function but we could set it to be internal. + val plan3 = transform(getProjectRelationWithFn("abcde", isInternal = Some(true))) + val fn3 = getUnresolvedFunction(plan3) + assert(fn3.nameParts.head == "abcde") + assert(fn3.isInternal) + } } diff --git a/sql/connect/server/src/test/scala/org/apache/spark/sql/connect/planner/SparkConnectProtoSuite.scala b/sql/connect/server/src/test/scala/org/apache/spark/sql/connect/planner/SparkConnectProtoSuite.scala index cad7fe6370827..1a86ced3a2ac9 100644 --- a/sql/connect/server/src/test/scala/org/apache/spark/sql/connect/planner/SparkConnectProtoSuite.scala +++ b/sql/connect/server/src/test/scala/org/apache/spark/sql/connect/planner/SparkConnectProtoSuite.scala @@ -32,7 +32,7 @@ import org.apache.spark.sql.catalyst.expressions.{AttributeReference, GenericInt import org.apache.spark.sql.catalyst.plans.{FullOuter, Inner, LeftAnti, LeftOuter, LeftSemi, PlanTest, RightOuter} import org.apache.spark.sql.catalyst.plans.logical.{CollectMetrics, Distinct, LocalRelation, LogicalPlan} import org.apache.spark.sql.catalyst.types.DataTypeUtils -import org.apache.spark.sql.connect.common.InvalidPlanInput +import org.apache.spark.sql.connect.common.{InvalidCommandInput, InvalidPlanInput} import org.apache.spark.sql.connect.common.LiteralValueProtoConverter.toLiteralProto import org.apache.spark.sql.connect.dsl.MockRemoteSession import org.apache.spark.sql.connect.dsl.commands._ diff --git 
a/sql/connect/server/src/test/scala/org/apache/spark/sql/connect/planner/SparkConnectServiceSuite.scala b/sql/connect/server/src/test/scala/org/apache/spark/sql/connect/planner/SparkConnectServiceSuite.scala index d6d137e6d91aa..5e88725691656 100644 --- a/sql/connect/server/src/test/scala/org/apache/spark/sql/connect/planner/SparkConnectServiceSuite.scala +++ b/sql/connect/server/src/test/scala/org/apache/spark/sql/connect/planner/SparkConnectServiceSuite.scala @@ -919,7 +919,8 @@ class SparkConnectServiceSuite } class MockSparkListener() extends SparkListener { val semaphoreStarted = new Semaphore(0) - var executeHolder = Option.empty[ExecuteHolder] + // Accessed by multiple threads in parallel. + @volatile var executeHolder = Option.empty[ExecuteHolder] override def onOtherEvent(event: SparkListenerEvent): Unit = { event match { case e: SparkListenerConnectOperationStarted => diff --git a/sql/connect/server/src/test/scala/org/apache/spark/sql/connect/service/SparkConnectServiceE2ESuite.scala b/sql/connect/server/src/test/scala/org/apache/spark/sql/connect/service/SparkConnectServiceE2ESuite.scala index f86298a8b5b98..f24560259a883 100644 --- a/sql/connect/server/src/test/scala/org/apache/spark/sql/connect/service/SparkConnectServiceE2ESuite.scala +++ b/sql/connect/server/src/test/scala/org/apache/spark/sql/connect/service/SparkConnectServiceE2ESuite.scala @@ -245,4 +245,26 @@ class SparkConnectServiceE2ESuite extends SparkConnectServerTest { assert(queryError.getMessage.contains("INVALID_HANDLE.SESSION_CHANGED")) } } + + test("Client is allowed to reconnect to released session if allow_reconnect is set") { + withRawBlockingStub { stub => + val sessionId = UUID.randomUUID.toString() + val iter = + stub.executePlan( + buildExecutePlanRequest( + buildPlan("select * from range(1000000)"), + sessionId = sessionId)) + iter.hasNext // guarantees the request was received by server. 
+ + stub.releaseSession(buildReleaseSessionRequest(sessionId, allowReconnect = true)) + + val iter2 = + stub.executePlan( + buildExecutePlanRequest( + buildPlan("select * from range(1000000)"), + sessionId = sessionId)) + // guarantees the request was received by server. No exception should be thrown on reuse + iter2.hasNext + } + } } diff --git a/sql/connect/shims/src/main/scala/org/apache/spark/shims.scala b/sql/connect/shims/src/main/scala/org/apache/spark/shims.scala index ad8771a03b287..9c5fb515580a7 100644 --- a/sql/connect/shims/src/main/scala/org/apache/spark/shims.scala +++ b/sql/connect/shims/src/main/scala/org/apache/spark/shims.scala @@ -32,7 +32,6 @@ package rdd { package sql { class ExperimentalMethods class SparkSessionExtensions - class SQLContext package execution { class QueryExecution diff --git a/sql/core/benchmarks/AggregateBenchmark-jdk21-results.txt b/sql/core/benchmarks/AggregateBenchmark-jdk21-results.txt index 50a31e7e73bb1..0c14099f23b73 100644 --- a/sql/core/benchmarks/AggregateBenchmark-jdk21-results.txt +++ b/sql/core/benchmarks/AggregateBenchmark-jdk21-results.txt @@ -2,147 +2,147 @@ aggregate without grouping ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor agg w/o group: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -agg w/o group wholestage off 35098 35975 1240 59.8 16.7 1.0X -agg w/o group wholestage on 2835 2844 9 739.9 1.4 12.4X +agg w/o group wholestage off 35412 35848 616 59.2 16.9 1.0X +agg w/o group wholestage on 2849 2853 5 736.1 1.4 12.4X ================================================================================================ stat functions 
================================================================================================ -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor stddev: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -stddev wholestage off 4102 4138 51 25.6 39.1 1.0X -stddev wholestage on 974 983 6 107.6 9.3 4.2X +stddev wholestage off 4533 4585 74 23.1 43.2 1.0X +stddev wholestage on 987 991 4 106.3 9.4 4.6X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor kurtosis: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -kurtosis wholestage off 21188 21367 253 4.9 202.1 1.0X -kurtosis wholestage on 992 993 2 105.7 9.5 21.4X +kurtosis wholestage off 21221 21417 277 4.9 202.4 1.0X +kurtosis wholestage on 995 1000 4 105.4 9.5 21.3X ================================================================================================ aggregate with linear keys ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Aggregate w keys: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -codegen = F 6757 6835 110 12.4 80.5 1.0X -codegen = T, hashmap = F 3850 4003 160 21.8 45.9 1.8X -codegen = T, row-based hashmap = T 1222 1238 15 68.6 14.6 5.5X -codegen = T, 
vectorized hashmap = T 804 814 9 104.3 9.6 8.4X +codegen = F 6703 6748 64 12.5 79.9 1.0X +codegen = T, hashmap = F 3746 3777 36 22.4 44.7 1.8X +codegen = T, row-based hashmap = T 1226 1233 5 68.4 14.6 5.5X +codegen = T, vectorized hashmap = T 812 820 12 103.4 9.7 8.3X ================================================================================================ aggregate with randomized keys ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Aggregate w keys: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -codegen = F 7331 7374 60 11.4 87.4 1.0X -codegen = T, hashmap = F 4664 4687 24 18.0 55.6 1.6X -codegen = T, row-based hashmap = T 1620 1627 7 51.8 19.3 4.5X -codegen = T, vectorized hashmap = T 1113 1171 72 75.4 13.3 6.6X +codegen = F 7454 7484 42 11.3 88.9 1.0X +codegen = T, hashmap = F 4485 4585 87 18.7 53.5 1.7X +codegen = T, row-based hashmap = T 1676 1685 9 50.1 20.0 4.4X +codegen = T, vectorized hashmap = T 1061 1110 85 79.0 12.7 7.0X ================================================================================================ aggregate with string key ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Aggregate w string key: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -codegen = F 2485 2510 35 8.4 118.5 1.0X -codegen = T, hashmap = F 1519 1529 10 13.8 72.4 
1.6X -codegen = T, row-based hashmap = T 994 1010 16 21.1 47.4 2.5X -codegen = T, vectorized hashmap = T 804 815 11 26.1 38.3 3.1X +codegen = F 2296 2326 43 9.1 109.5 1.0X +codegen = T, hashmap = F 1499 1512 12 14.0 71.5 1.5X +codegen = T, row-based hashmap = T 996 1013 21 21.1 47.5 2.3X +codegen = T, vectorized hashmap = T 798 801 2 26.3 38.1 2.9X ================================================================================================ aggregate with decimal key ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Aggregate w decimal key: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -codegen = F 2018 2050 46 10.4 96.2 1.0X -codegen = T, hashmap = F 1305 1318 18 16.1 62.2 1.5X -codegen = T, row-based hashmap = T 499 505 6 42.0 23.8 4.0X -codegen = T, vectorized hashmap = T 313 317 4 67.0 14.9 6.4X +codegen = F 2042 2096 76 10.3 97.4 1.0X +codegen = T, hashmap = F 1362 1374 18 15.4 64.9 1.5X +codegen = T, row-based hashmap = T 479 501 17 43.8 22.8 4.3X +codegen = T, vectorized hashmap = T 312 326 13 67.3 14.9 6.6X ================================================================================================ aggregate with multiple key types ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Aggregate w multiple keys: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ 
-codegen = F 4453 4457 6 4.7 212.3 1.0X -codegen = T, hashmap = F 2320 2333 20 9.0 110.6 1.9X -codegen = T, row-based hashmap = T 1821 1826 6 11.5 86.8 2.4X -codegen = T, vectorized hashmap = T 1600 1652 74 13.1 76.3 2.8X +codegen = F 4229 4241 16 5.0 201.7 1.0X +codegen = T, hashmap = F 2316 2320 6 9.1 110.4 1.8X +codegen = T, row-based hashmap = T 1819 1827 11 11.5 86.8 2.3X +codegen = T, vectorized hashmap = T 1518 1519 1 13.8 72.4 2.8X ================================================================================================ max function bytecode size of wholestagecodegen ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor max function bytecode size: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -codegen = F 358 384 22 1.8 545.8 1.0X -codegen = T, hugeMethodLimit = 10000 134 160 24 4.9 204.1 2.7X -codegen = T, hugeMethodLimit = 1500 129 145 16 5.1 196.1 2.8X +codegen = F 368 389 23 1.8 561.7 1.0X +codegen = T, hugeMethodLimit = 10000 143 162 16 4.6 218.9 2.6X +codegen = T, hugeMethodLimit = 1500 140 154 12 4.7 214.1 2.6X ================================================================================================ cube ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor cube: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -cube wholestage off 1962 1973 16 2.7 374.2 
1.0X -cube wholestage on 1054 1075 24 5.0 201.0 1.9X +cube wholestage off 2051 2074 33 2.6 391.2 1.0X +cube wholestage on 1065 1078 10 4.9 203.1 1.9X ================================================================================================ hash and BytesToBytesMap ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor BytesToBytesMap: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -UnsafeRowhash 146 147 3 143.7 7.0 1.0X -murmur3 hash 53 54 1 392.4 2.5 2.7X -fast hash 24 24 0 887.7 1.1 6.2X -arrayEqual 136 136 0 153.9 6.5 1.1X -Java HashMap (Long) 62 72 8 338.3 3.0 2.4X -Java HashMap (two ints) 85 88 2 245.8 4.1 1.7X -Java HashMap (UnsafeRow) 492 495 2 42.6 23.5 0.3X -LongToUnsafeRowMap (opt=false) 350 354 3 59.9 16.7 0.4X -LongToUnsafeRowMap (opt=true) 79 82 5 263.9 3.8 1.8X -BytesToBytesMap (off Heap) 459 471 12 45.7 21.9 0.3X -BytesToBytesMap (on Heap) 466 468 2 45.0 22.2 0.3X -Aggregate HashMap 30 30 2 697.8 1.4 4.9X +UnsafeRowhash 146 147 1 143.2 7.0 1.0X +murmur3 hash 54 55 4 390.7 2.6 2.7X +fast hash 24 24 0 883.4 1.1 6.2X +arrayEqual 137 137 0 153.2 6.5 1.1X +Java HashMap (Long) 61 67 7 344.8 2.9 2.4X +Java HashMap (two ints) 76 79 2 275.1 3.6 1.9X +Java HashMap (UnsafeRow) 531 533 2 39.5 25.3 0.3X +LongToUnsafeRowMap (opt=false) 346 349 4 60.6 16.5 0.4X +LongToUnsafeRowMap (opt=true) 80 80 1 262.5 3.8 1.8X +BytesToBytesMap (off Heap) 440 443 4 47.6 21.0 0.3X +BytesToBytesMap (on Heap) 456 459 4 46.0 21.7 0.3X +Aggregate HashMap 30 31 0 689.1 1.5 4.8X diff --git a/sql/core/benchmarks/AggregateBenchmark-results.txt b/sql/core/benchmarks/AggregateBenchmark-results.txt index f1118da89122d..73ca9abe75a6b 100644 --- 
a/sql/core/benchmarks/AggregateBenchmark-results.txt +++ b/sql/core/benchmarks/AggregateBenchmark-results.txt @@ -2,147 +2,147 @@ aggregate without grouping ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor agg w/o group: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -agg w/o group wholestage off 37435 38685 1769 56.0 17.9 1.0X -agg w/o group wholestage on 3364 3369 3 623.4 1.6 11.1X +agg w/o group wholestage off 39769 40576 1142 52.7 19.0 1.0X +agg w/o group wholestage on 3381 3388 13 620.3 1.6 11.8X ================================================================================================ stat functions ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor stddev: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -stddev wholestage off 4461 4505 63 23.5 42.5 1.0X -stddev wholestage on 976 980 3 107.5 9.3 4.6X +stddev wholestage off 4522 4526 7 23.2 43.1 1.0X +stddev wholestage on 987 992 7 106.2 9.4 4.6X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor kurtosis: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ 
-kurtosis wholestage off 20698 20799 143 5.1 197.4 1.0X -kurtosis wholestage on 990 992 2 105.9 9.4 20.9X +kurtosis wholestage off 21070 21074 5 5.0 200.9 1.0X +kurtosis wholestage on 994 998 4 105.5 9.5 21.2X ================================================================================================ aggregate with linear keys ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Aggregate w keys: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -codegen = F 6646 6677 43 12.6 79.2 1.0X -codegen = T, hashmap = F 4024 4116 118 20.8 48.0 1.7X -codegen = T, row-based hashmap = T 1240 1255 13 67.7 14.8 5.4X -codegen = T, vectorized hashmap = T 816 838 14 102.8 9.7 8.1X +codegen = F 7074 7120 64 11.9 84.3 1.0X +codegen = T, hashmap = F 3968 4028 63 21.1 47.3 1.8X +codegen = T, row-based hashmap = T 1211 1214 3 69.3 14.4 5.8X +codegen = T, vectorized hashmap = T 827 846 15 101.5 9.9 8.6X ================================================================================================ aggregate with randomized keys ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Aggregate w keys: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -codegen = F 7348 7361 18 11.4 87.6 1.0X -codegen = T, hashmap = F 4766 4799 30 17.6 56.8 1.5X -codegen = T, row-based hashmap = T 1712 1734 23 49.0 20.4 
4.3X -codegen = T, vectorized hashmap = T 1052 1057 5 79.7 12.5 7.0X +codegen = F 7673 7686 18 10.9 91.5 1.0X +codegen = T, hashmap = F 4857 4875 20 17.3 57.9 1.6X +codegen = T, row-based hashmap = T 1702 1710 14 49.3 20.3 4.5X +codegen = T, vectorized hashmap = T 1077 1115 48 77.9 12.8 7.1X ================================================================================================ aggregate with string key ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Aggregate w string key: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -codegen = F 2303 2306 3 9.1 109.8 1.0X -codegen = T, hashmap = F 1467 1472 7 14.3 70.0 1.6X -codegen = T, row-based hashmap = T 989 998 9 21.2 47.2 2.3X -codegen = T, vectorized hashmap = T 794 799 4 26.4 37.9 2.9X +codegen = F 2478 2506 40 8.5 118.2 1.0X +codegen = T, hashmap = F 1510 1517 10 13.9 72.0 1.6X +codegen = T, row-based hashmap = T 1008 1019 19 20.8 48.1 2.5X +codegen = T, vectorized hashmap = T 779 790 8 26.9 37.1 3.2X ================================================================================================ aggregate with decimal key ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Aggregate w decimal key: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -codegen = F 2026 2039 18 10.4 96.6 1.0X -codegen = T, hashmap = F 1349 
1352 4 15.5 64.3 1.5X -codegen = T, row-based hashmap = T 437 445 6 47.9 20.9 4.6X -codegen = T, vectorized hashmap = T 316 322 5 66.4 15.1 6.4X +codegen = F 2118 2182 92 9.9 101.0 1.0X +codegen = T, hashmap = F 1307 1334 38 16.0 62.3 1.6X +codegen = T, row-based hashmap = T 433 436 2 48.4 20.7 4.9X +codegen = T, vectorized hashmap = T 320 327 7 65.6 15.2 6.6X ================================================================================================ aggregate with multiple key types ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Aggregate w multiple keys: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -codegen = F 4168 4201 47 5.0 198.7 1.0X -codegen = T, hashmap = F 2412 2418 8 8.7 115.0 1.7X -codegen = T, row-based hashmap = T 1661 1663 2 12.6 79.2 2.5X -codegen = T, vectorized hashmap = T 1606 1610 5 13.1 76.6 2.6X +codegen = F 4200 4204 7 5.0 200.3 1.0X +codegen = T, hashmap = F 2346 2384 53 8.9 111.9 1.8X +codegen = T, row-based hashmap = T 1869 1875 9 11.2 89.1 2.2X +codegen = T, vectorized hashmap = T 1608 1612 6 13.0 76.7 2.6X ================================================================================================ max function bytecode size of wholestagecodegen ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor max function bytecode size: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
------------------------------------------------------------------------------------------------------------------------ -codegen = F 385 401 10 1.7 587.4 1.0X -codegen = T, hugeMethodLimit = 10000 141 157 12 4.7 214.5 2.7X -codegen = T, hugeMethodLimit = 1500 127 141 11 5.2 193.9 3.0X +codegen = F 394 430 27 1.7 600.5 1.0X +codegen = T, hugeMethodLimit = 10000 140 156 15 4.7 213.5 2.8X +codegen = T, hugeMethodLimit = 1500 136 144 6 4.8 207.1 2.9X ================================================================================================ cube ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor cube: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -cube wholestage off 1943 1948 7 2.7 370.6 1.0X -cube wholestage on 1110 1130 17 4.7 211.7 1.8X +cube wholestage off 1971 2004 47 2.7 376.0 1.0X +cube wholestage on 1129 1158 25 4.6 215.3 1.7X ================================================================================================ hash and BytesToBytesMap ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor BytesToBytesMap: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -UnsafeRowhash 203 204 2 103.4 9.7 1.0X -murmur3 hash 68 69 1 308.6 3.2 3.0X -fast hash 71 71 0 296.3 3.4 2.9X -arrayEqual 144 145 1 145.7 6.9 1.4X -Java HashMap (Long) 66 69 4 318.6 3.1 3.1X -Java HashMap 
(two ints) 80 84 10 263.7 3.8 2.5X -Java HashMap (UnsafeRow) 532 536 3 39.4 25.4 0.4X -LongToUnsafeRowMap (opt=false) 335 337 1 62.6 16.0 0.6X -LongToUnsafeRowMap (opt=true) 78 78 1 269.7 3.7 2.6X -BytesToBytesMap (off Heap) 484 487 3 43.4 23.1 0.4X -BytesToBytesMap (on Heap) 484 491 5 43.4 23.1 0.4X -Aggregate HashMap 30 31 1 690.1 1.4 6.7X +UnsafeRowhash 204 204 1 102.9 9.7 1.0X +murmur3 hash 69 70 0 301.7 3.3 2.9X +fast hash 71 72 1 294.3 3.4 2.9X +arrayEqual 144 145 1 145.1 6.9 1.4X +Java HashMap (Long) 66 69 5 319.8 3.1 3.1X +Java HashMap (two ints) 85 87 2 247.5 4.0 2.4X +Java HashMap (UnsafeRow) 547 554 12 38.3 26.1 0.4X +LongToUnsafeRowMap (opt=false) 347 348 1 60.4 16.6 0.6X +LongToUnsafeRowMap (opt=true) 74 74 1 285.3 3.5 2.8X +BytesToBytesMap (off Heap) 487 490 2 43.0 23.2 0.4X +BytesToBytesMap (on Heap) 511 514 2 41.0 24.4 0.4X +Aggregate HashMap 30 30 0 703.2 1.4 6.8X diff --git a/sql/core/benchmarks/AnsiIntervalSortBenchmark-jdk21-results.txt b/sql/core/benchmarks/AnsiIntervalSortBenchmark-jdk21-results.txt index dd6aabd2695fd..5bbbe6c90a83d 100644 --- a/sql/core/benchmarks/AnsiIntervalSortBenchmark-jdk21-results.txt +++ b/sql/core/benchmarks/AnsiIntervalSortBenchmark-jdk21-results.txt @@ -1,28 +1,28 @@ -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor year month interval one column: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ---------------------------------------------------------------------------------------------------------------------------- -year month interval one column enable radix 23157 23546 354 4.3 231.6 1.0X -year month interval one column disable radix 33035 33049 14 3.0 330.3 0.7X +year month interval one column enable radix 23762 24352 802 4.2 237.6 1.0X +year month interval one column disable radix 33034 33062 30 3.0 330.3 0.7X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure 
+OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor year month interval two columns: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ----------------------------------------------------------------------------------------------------------------------------- -year month interval two columns enable radix 33726 33825 89 3.0 337.3 1.0X -year month interval two columns disable radix 33759 34063 472 3.0 337.6 1.0X +year month interval two columns enable radix 33122 33540 609 3.0 331.2 1.0X +year month interval two columns disable radix 34128 34610 427 2.9 341.3 1.0X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor day time interval one columns: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------- -day time interval one columns enable radix 23123 23169 61 4.3 231.2 1.0X -day time interval one columns disable radix 34121 34201 96 2.9 341.2 0.7X +day time interval one columns enable radix 21937 22337 373 4.6 219.4 1.0X +day time interval one columns disable radix 32194 32530 557 3.1 321.9 0.7X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor day time interval two columns: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------- -day time interval two columns enable radix 35022 35250 212 2.9 350.2 1.0X -day time interval two columns disable radix 35240 35498 224 2.8 352.4 1.0X +day time interval two columns enable radix 33037 33149 97 3.0 330.4 1.0X +day time interval two columns disable radix 32944 33036 103 3.0 
329.4 1.0X diff --git a/sql/core/benchmarks/AnsiIntervalSortBenchmark-results.txt b/sql/core/benchmarks/AnsiIntervalSortBenchmark-results.txt index e8aadd025df2d..67acf452919e7 100644 --- a/sql/core/benchmarks/AnsiIntervalSortBenchmark-results.txt +++ b/sql/core/benchmarks/AnsiIntervalSortBenchmark-results.txt @@ -1,28 +1,28 @@ -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor year month interval one column: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ---------------------------------------------------------------------------------------------------------------------------- -year month interval one column enable radix 22561 22685 121 4.4 225.6 1.0X -year month interval one column disable radix 32247 32353 132 3.1 322.5 0.7X +year month interval one column enable radix 22822 22918 117 4.4 228.2 1.0X +year month interval one column disable radix 32739 33177 697 3.1 327.4 0.7X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor year month interval two columns: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ----------------------------------------------------------------------------------------------------------------------------- -year month interval two columns enable radix 33236 33446 207 3.0 332.4 1.0X -year month interval two columns disable radix 34800 34873 63 2.9 348.0 1.0X +year month interval two columns enable radix 33341 33730 549 3.0 333.4 1.0X +year month interval two columns disable radix 33557 33961 565 3.0 335.6 1.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor day time interval one columns: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
--------------------------------------------------------------------------------------------------------------------------- -day time interval one columns enable radix 21978 22020 66 4.5 219.8 1.0X -day time interval one columns disable radix 33183 33211 38 3.0 331.8 0.7X +day time interval one columns enable radix 21059 21193 133 4.7 210.6 1.0X +day time interval one columns disable radix 32196 32724 660 3.1 322.0 0.7X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor day time interval two columns: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------- -day time interval two columns enable radix 34526 34670 185 2.9 345.3 1.0X -day time interval two columns disable radix 35632 35826 191 2.8 356.3 1.0X +day time interval two columns enable radix 33887 34787 793 3.0 338.9 1.0X +day time interval two columns disable radix 35163 35274 181 2.8 351.6 1.0X diff --git a/sql/core/benchmarks/Base64Benchmark-jdk21-results.txt b/sql/core/benchmarks/Base64Benchmark-jdk21-results.txt index 52092328fd576..ab6e5283dba89 100644 --- a/sql/core/benchmarks/Base64Benchmark-jdk21-results.txt +++ b/sql/core/benchmarks/Base64Benchmark-jdk21-results.txt @@ -1,56 +1,56 @@ -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor encode for 1: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -java 1974 2002 47 10.1 98.7 1.0X -apache 10784 10862 90 1.9 539.2 0.2X +java 2144 2180 49 9.3 107.2 1.0X +apache 11251 11315 62 1.8 562.6 0.2X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure 
+OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor encode for 3: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -java 2431 2445 12 8.2 121.6 1.0X -apache 12049 12094 41 1.7 602.5 0.2X +java 2446 2450 4 8.2 122.3 1.0X +apache 12449 12535 76 1.6 622.4 0.2X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor encode for 5: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -java 2857 2867 14 7.0 142.8 1.0X -apache 13281 13344 56 1.5 664.0 0.2X +java 2873 2875 3 7.0 143.7 1.0X +apache 13571 13613 41 1.5 678.5 0.2X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor encode for 7: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -java 2943 2973 50 6.8 147.1 1.0X -apache 14384 14421 32 1.4 719.2 0.2X +java 2910 2918 9 6.9 145.5 1.0X +apache 14577 14593 25 1.4 728.9 0.2X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor decode for 1: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -java 3435 3439 4 5.8 171.7 1.0X -apache 12572 12615 40 1.6 628.6 0.3X +java 3169 3171 2 6.3 158.5 1.0X +apache 12500 12611 100 1.6 625.0 0.3X -OpenJDK 64-Bit Server VM 
21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor decode for 3: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -java 4040 4052 11 5.0 202.0 1.0X -apache 14274 14363 120 1.4 713.7 0.3X +java 4313 4314 2 4.6 215.7 1.0X +apache 14491 14571 70 1.4 724.5 0.3X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor decode for 5: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -java 4756 4767 17 4.2 237.8 1.0X -apache 16291 16304 20 1.2 814.6 0.3X +java 4932 4951 24 4.1 246.6 1.0X +apache 15728 15782 49 1.3 786.4 0.3X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor decode for 7: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -java 5200 5209 8 3.8 260.0 1.0X -apache 17434 17540 101 1.1 871.7 0.3X +java 5328 5330 2 3.8 266.4 1.0X +apache 17182 17223 36 1.2 859.1 0.3X diff --git a/sql/core/benchmarks/Base64Benchmark-results.txt b/sql/core/benchmarks/Base64Benchmark-results.txt index 3e8d7e2727c34..12f54feeed1d5 100644 --- a/sql/core/benchmarks/Base64Benchmark-results.txt +++ b/sql/core/benchmarks/Base64Benchmark-results.txt @@ -1,56 +1,56 @@ -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor encode for 1: Best Time(ms) Avg Time(ms) Stdev(ms) 
Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -java 2287 2338 50 8.7 114.3 1.0X -apache 10870 10993 126 1.8 543.5 0.2X +java 2097 2144 64 9.5 104.9 1.0X +apache 11350 11380 29 1.8 567.5 0.2X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor encode for 3: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -java 2930 2945 21 6.8 146.5 1.0X -apache 12069 12172 108 1.7 603.4 0.2X +java 2624 2631 11 7.6 131.2 1.0X +apache 12395 12421 40 1.6 619.7 0.2X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor encode for 5: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -java 3403 3410 7 5.9 170.1 1.0X -apache 13236 13327 139 1.5 661.8 0.3X +java 3196 3212 14 6.3 159.8 1.0X +apache 13591 13708 167 1.5 679.6 0.2X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor encode for 7: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -java 3914 3947 29 5.1 195.7 1.0X -apache 14411 14441 27 1.4 720.6 0.3X +java 3665 3679 12 5.5 183.3 1.0X +apache 14907 14947 42 1.3 745.4 0.2X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor 
decode for 1: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -java 3572 3580 9 5.6 178.6 1.0X -apache 12652 12656 7 1.6 632.6 0.3X +java 3319 3362 37 6.0 166.0 1.0X +apache 12471 12532 64 1.6 623.5 0.3X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor decode for 3: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -java 4918 4919 2 4.1 245.9 1.0X -apache 14579 14601 20 1.4 728.9 0.3X +java 4068 4068 0 4.9 203.4 1.0X +apache 14651 14708 53 1.4 732.5 0.3X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor decode for 5: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -java 5939 5949 10 3.4 296.9 1.0X -apache 16626 16675 58 1.2 831.3 0.4X +java 5456 5463 12 3.7 272.8 1.0X +apache 16405 16460 69 1.2 820.2 0.3X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor decode for 7: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -java 6666 6672 5 3.0 333.3 1.0X -apache 18901 18922 24 1.1 945.0 0.4X +java 6111 6119 8 3.3 305.5 1.0X +apache 17824 17959 117 1.1 891.2 0.3X diff --git a/sql/core/benchmarks/BloomFilterBenchmark-jdk21-results.txt 
b/sql/core/benchmarks/BloomFilterBenchmark-jdk21-results.txt index 5cf56352fa761..2d1b73ac4e241 100644 --- a/sql/core/benchmarks/BloomFilterBenchmark-jdk21-results.txt +++ b/sql/core/benchmarks/BloomFilterBenchmark-jdk21-results.txt @@ -2,195 +2,195 @@ ORC Write ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Write 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Without bloom filter 8070 8132 88 12.4 80.7 1.0X -With bloom filter 10025 10082 81 10.0 100.2 0.8X +Without bloom filter 7949 7971 31 12.6 79.5 1.0X +With bloom filter 9864 9897 47 10.1 98.6 0.8X ================================================================================================ ORC Read ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Read a row from 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Without bloom filter, blocksize: 2097152 882 890 7 113.4 8.8 1.0X -With bloom filter, blocksize: 2097152 567 577 10 176.4 5.7 1.6X +Without bloom filter, blocksize: 2097152 895 941 40 111.7 9.0 1.0X +With bloom filter, blocksize: 2097152 838 876 34 119.3 8.4 1.1X ================================================================================================ ORC Read ================================================================================================ -OpenJDK 64-Bit 
Server VM 21.0.5+11-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Read a row from 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Without bloom filter, blocksize: 4194304 810 836 22 123.4 8.1 1.0X -With bloom filter, blocksize: 4194304 550 568 22 181.8 5.5 1.5X +Without bloom filter, blocksize: 4194304 1345 1348 4 74.3 13.5 1.0X +With bloom filter, blocksize: 4194304 835 842 6 119.7 8.4 1.6X ================================================================================================ ORC Read ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Read a row from 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Without bloom filter, blocksize: 6291456 823 836 11 121.5 8.2 1.0X -With bloom filter, blocksize: 6291456 540 563 17 185.3 5.4 1.5X +Without bloom filter, blocksize: 6291456 1344 1351 11 74.4 13.4 1.0X +With bloom filter, blocksize: 6291456 814 827 12 122.8 8.1 1.7X ================================================================================================ ORC Read ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Read a row from 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
------------------------------------------------------------------------------------------------------------------------ -Without bloom filter, blocksize: 8388608 797 821 21 125.5 8.0 1.0X -With bloom filter, blocksize: 8388608 533 553 23 187.5 5.3 1.5X +Without bloom filter, blocksize: 8388608 1352 1363 16 74.0 13.5 1.0X +With bloom filter, blocksize: 8388608 812 819 11 123.1 8.1 1.7X ================================================================================================ ORC Read ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Read a row from 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------- -Without bloom filter, blocksize: 12582912 859 876 15 116.4 8.6 1.0X -With bloom filter, blocksize: 12582912 545 576 22 183.4 5.5 1.6X +Without bloom filter, blocksize: 12582912 1347 1356 13 74.3 13.5 1.0X +With bloom filter, blocksize: 12582912 816 830 20 122.6 8.2 1.7X ================================================================================================ ORC Read ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Read a row from 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------- -Without bloom filter, blocksize: 16777216 810 841 26 123.4 8.1 1.0X -With bloom filter, blocksize: 16777216 554 575 15 180.5 5.5 1.5X +Without bloom filter, blocksize: 
16777216 1322 1322 0 75.7 13.2 1.0X +With bloom filter, blocksize: 16777216 793 803 10 126.1 7.9 1.7X ================================================================================================ ORC Read ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Read a row from 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------- -Without bloom filter, blocksize: 33554432 845 852 7 118.4 8.4 1.0X -With bloom filter, blocksize: 33554432 545 564 16 183.4 5.5 1.5X +Without bloom filter, blocksize: 33554432 1315 1324 12 76.0 13.2 1.0X +With bloom filter, blocksize: 33554432 790 810 32 126.6 7.9 1.7X ================================================================================================ Parquet Write ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Write 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------- -Without bloom filter 12141 12156 21 8.2 121.4 1.0X -With bloom filter 21175 21296 172 4.7 211.7 0.6X -With adaptive bloom filter & 3 candidates 20846 20897 71 4.8 208.5 0.6X -With adaptive bloom filter & 5 candidates 20731 20989 365 4.8 207.3 0.6X -With adaptive bloom filter & 9 candidates 23208 23264 79 4.3 232.1 0.5X -With adaptive bloom filter & 15 candidates 23293 23349 78 4.3 232.9 0.5X +Without bloom filter 10145 10239 133 9.9 101.4 1.0X +With bloom 
filter 21381 21403 32 4.7 213.8 0.5X +With adaptive bloom filter & 3 candidates 21248 21357 154 4.7 212.5 0.5X +With adaptive bloom filter & 5 candidates 21353 21396 60 4.7 213.5 0.5X +With adaptive bloom filter & 9 candidates 21141 21175 48 4.7 211.4 0.5X +With adaptive bloom filter & 15 candidates 21062 21121 83 4.7 210.6 0.5X ================================================================================================ Parquet Read ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Read a row from 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Without bloom filter, blocksize: 2097152 451 502 37 221.9 4.5 1.0X -With bloom filter, blocksize: 2097152 174 186 12 573.8 1.7 2.6X +Without bloom filter, blocksize: 2097152 426 455 27 234.8 4.3 1.0X +With bloom filter, blocksize: 2097152 182 188 6 550.1 1.8 2.3X ================================================================================================ Parquet Read ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Read a row from 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Without bloom filter, blocksize: 4194304 404 409 4 247.6 4.0 1.0X -With bloom filter, blocksize: 4194304 139 150 7 719.2 1.4 2.9X +Without bloom filter, blocksize: 4194304 406 416 9 246.5 4.1 1.0X +With bloom filter, blocksize: 
4194304 129 135 4 772.6 1.3 3.1X ================================================================================================ Parquet Read ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Read a row from 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Without bloom filter, blocksize: 6291456 416 423 7 240.5 4.2 1.0X -With bloom filter, blocksize: 6291456 141 152 10 709.9 1.4 3.0X +Without bloom filter, blocksize: 6291456 405 409 3 247.0 4.0 1.0X +With bloom filter, blocksize: 6291456 133 142 7 749.6 1.3 3.0X ================================================================================================ Parquet Read ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Read a row from 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Without bloom filter, blocksize: 8388608 419 432 10 238.6 4.2 1.0X -With bloom filter, blocksize: 8388608 210 223 7 476.2 2.1 2.0X +Without bloom filter, blocksize: 8388608 413 423 9 242.3 4.1 1.0X +With bloom filter, blocksize: 8388608 162 169 5 616.2 1.6 2.5X ================================================================================================ Parquet Read ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 
6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Read a row from 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------- -Without bloom filter, blocksize: 12582912 422 430 9 236.8 4.2 1.0X -With bloom filter, blocksize: 12582912 325 330 4 307.2 3.3 1.3X +Without bloom filter, blocksize: 12582912 419 436 24 238.5 4.2 1.0X +With bloom filter, blocksize: 12582912 345 355 7 289.7 3.5 1.2X ================================================================================================ Parquet Read ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Read a row from 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------- -Without bloom filter, blocksize: 16777216 420 436 22 238.3 4.2 1.0X -With bloom filter, blocksize: 16777216 398 428 29 251.2 4.0 1.1X +Without bloom filter, blocksize: 16777216 455 469 12 219.8 4.5 1.0X +With bloom filter, blocksize: 16777216 353 425 44 283.5 3.5 1.3X ================================================================================================ Parquet Read ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Read a row from 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
------------------------------------------------------------------------------------------------------------------------- -Without bloom filter, blocksize: 33554432 428 439 9 233.5 4.3 1.0X -With bloom filter, blocksize: 33554432 430 441 15 232.4 4.3 1.0X +Without bloom filter, blocksize: 33554432 448 459 7 223.4 4.5 1.0X +With bloom filter, blocksize: 33554432 419 429 8 238.5 4.2 1.1X diff --git a/sql/core/benchmarks/BloomFilterBenchmark-results.txt b/sql/core/benchmarks/BloomFilterBenchmark-results.txt index 286df98479f97..f01ad4e47f807 100644 --- a/sql/core/benchmarks/BloomFilterBenchmark-results.txt +++ b/sql/core/benchmarks/BloomFilterBenchmark-results.txt @@ -2,195 +2,195 @@ ORC Write ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Write 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Without bloom filter 8021 8137 165 12.5 80.2 1.0X -With bloom filter 10132 10186 76 9.9 101.3 0.8X +Without bloom filter 7507 7623 165 13.3 75.1 1.0X +With bloom filter 9512 9543 45 10.5 95.1 0.8X ================================================================================================ ORC Read ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Read a row from 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Without bloom filter, blocksize: 2097152 876 
940 61 114.2 8.8 1.0X -With bloom filter, blocksize: 2097152 588 618 21 169.9 5.9 1.5X +Without bloom filter, blocksize: 2097152 867 880 11 115.3 8.7 1.0X +With bloom filter, blocksize: 2097152 604 641 38 165.5 6.0 1.4X ================================================================================================ ORC Read ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Read a row from 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Without bloom filter, blocksize: 4194304 837 839 2 119.4 8.4 1.0X -With bloom filter, blocksize: 4194304 579 601 34 172.7 5.8 1.4X +Without bloom filter, blocksize: 4194304 823 839 21 121.5 8.2 1.0X +With bloom filter, blocksize: 4194304 558 564 8 179.1 5.6 1.5X ================================================================================================ ORC Read ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Read a row from 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Without bloom filter, blocksize: 6291456 787 797 9 127.0 7.9 1.0X -With bloom filter, blocksize: 6291456 532 548 12 188.1 5.3 1.5X +Without bloom filter, blocksize: 6291456 810 813 3 123.5 8.1 1.0X +With bloom filter, blocksize: 6291456 524 550 20 190.7 5.2 1.5X 
================================================================================================ ORC Read ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Read a row from 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Without bloom filter, blocksize: 8388608 796 799 4 125.7 8.0 1.0X -With bloom filter, blocksize: 8388608 534 548 10 187.1 5.3 1.5X +Without bloom filter, blocksize: 8388608 798 803 7 125.4 8.0 1.0X +With bloom filter, blocksize: 8388608 560 604 32 178.7 5.6 1.4X ================================================================================================ ORC Read ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Read a row from 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------- -Without bloom filter, blocksize: 12582912 836 839 3 119.7 8.4 1.0X -With bloom filter, blocksize: 12582912 517 544 19 193.4 5.2 1.6X +Without bloom filter, blocksize: 12582912 837 843 8 119.5 8.4 1.0X +With bloom filter, blocksize: 12582912 537 548 10 186.1 5.4 1.6X ================================================================================================ ORC Read ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 
17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Read a row from 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------- -Without bloom filter, blocksize: 16777216 793 796 4 126.1 7.9 1.0X -With bloom filter, blocksize: 16777216 570 574 5 175.3 5.7 1.4X +Without bloom filter, blocksize: 16777216 804 806 4 124.5 8.0 1.0X +With bloom filter, blocksize: 16777216 514 526 9 194.6 5.1 1.6X ================================================================================================ ORC Read ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Read a row from 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------- -Without bloom filter, blocksize: 33554432 784 794 12 127.5 7.8 1.0X -With bloom filter, blocksize: 33554432 565 587 27 177.1 5.6 1.4X +Without bloom filter, blocksize: 33554432 801 807 7 124.8 8.0 1.0X +With bloom filter, blocksize: 33554432 520 551 28 192.5 5.2 1.5X ================================================================================================ Parquet Write ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Write 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------- 
-Without bloom filter 11173 11180 11 9.0 111.7 1.0X -With bloom filter 19387 19485 138 5.2 193.9 0.6X -With adaptive bloom filter & 3 candidates 19252 19395 202 5.2 192.5 0.6X -With adaptive bloom filter & 5 candidates 19204 19337 188 5.2 192.0 0.6X -With adaptive bloom filter & 9 candidates 19267 19380 160 5.2 192.7 0.6X -With adaptive bloom filter & 15 candidates 19144 19184 57 5.2 191.4 0.6X +Without bloom filter 10073 10250 250 9.9 100.7 1.0X +With bloom filter 13981 14127 206 7.2 139.8 0.7X +With adaptive bloom filter & 3 candidates 13992 14059 94 7.1 139.9 0.7X +With adaptive bloom filter & 5 candidates 14691 14804 160 6.8 146.9 0.7X +With adaptive bloom filter & 9 candidates 14634 14805 242 6.8 146.3 0.7X +With adaptive bloom filter & 15 candidates 14698 14727 41 6.8 147.0 0.7X ================================================================================================ Parquet Read ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Read a row from 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Without bloom filter, blocksize: 2097152 447 476 24 223.6 4.5 1.0X -With bloom filter, blocksize: 2097152 177 185 5 565.6 1.8 2.5X +Without bloom filter, blocksize: 2097152 438 457 17 228.1 4.4 1.0X +With bloom filter, blocksize: 2097152 173 182 9 576.9 1.7 2.5X ================================================================================================ Parquet Read ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 
7763 64-Core Processor Read a row from 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Without bloom filter, blocksize: 4194304 424 440 14 236.0 4.2 1.0X -With bloom filter, blocksize: 4194304 127 135 7 790.4 1.3 3.3X +Without bloom filter, blocksize: 4194304 416 423 7 240.4 4.2 1.0X +With bloom filter, blocksize: 4194304 116 124 7 864.6 1.2 3.6X ================================================================================================ Parquet Read ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Read a row from 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Without bloom filter, blocksize: 6291456 423 439 16 236.2 4.2 1.0X -With bloom filter, blocksize: 6291456 130 139 9 768.6 1.3 3.3X +Without bloom filter, blocksize: 6291456 412 421 13 242.9 4.1 1.0X +With bloom filter, blocksize: 6291456 145 153 6 687.4 1.5 2.8X ================================================================================================ Parquet Read ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Read a row from 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Without bloom filter, blocksize: 8388608 426 435 7 
235.0 4.3 1.0X -With bloom filter, blocksize: 8388608 204 214 6 489.3 2.0 2.1X +Without bloom filter, blocksize: 8388608 417 423 5 240.0 4.2 1.0X +With bloom filter, blocksize: 8388608 158 164 5 634.4 1.6 2.6X ================================================================================================ Parquet Read ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Read a row from 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------- -Without bloom filter, blocksize: 12582912 426 447 23 234.5 4.3 1.0X -With bloom filter, blocksize: 12582912 295 306 8 339.2 2.9 1.4X +Without bloom filter, blocksize: 12582912 413 414 2 242.4 4.1 1.0X +With bloom filter, blocksize: 12582912 312 318 5 320.5 3.1 1.3X ================================================================================================ Parquet Read ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Read a row from 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------- -Without bloom filter, blocksize: 16777216 427 441 9 234.0 4.3 1.0X -With bloom filter, blocksize: 16777216 372 392 12 268.5 3.7 1.1X +Without bloom filter, blocksize: 16777216 418 424 4 239.1 4.2 1.0X +With bloom filter, blocksize: 16777216 368 417 62 271.5 3.7 1.1X 
================================================================================================ Parquet Read ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Read a row from 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------- -Without bloom filter, blocksize: 33554432 508 524 14 197.0 5.1 1.0X -With bloom filter, blocksize: 33554432 439 463 31 227.7 4.4 1.2X +Without bloom filter, blocksize: 33554432 485 503 21 206.3 4.8 1.0X +With bloom filter, blocksize: 33554432 429 457 41 233.0 4.3 1.1X diff --git a/sql/core/benchmarks/BuiltInDataSourceWriteBenchmark-jdk21-results.txt b/sql/core/benchmarks/BuiltInDataSourceWriteBenchmark-jdk21-results.txt index ac33c0edbcd24..3bb4debe8d59f 100644 --- a/sql/core/benchmarks/BuiltInDataSourceWriteBenchmark-jdk21-results.txt +++ b/sql/core/benchmarks/BuiltInDataSourceWriteBenchmark-jdk21-results.txt @@ -2,69 +2,69 @@ Parquet writer benchmark ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Parquet(PARQUET_1_0) writer benchmark: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Output Single Int Column 1630 1688 82 9.7 103.6 1.0X -Output Single Double Column 1848 1854 10 8.5 117.5 0.9X -Output Int and String Column 4604 4635 44 3.4 292.7 0.4X -Output Partitions 3399 3432 46 4.6 216.1 0.5X -Output Buckets 4919 4925 9 3.2 312.7 0.3X 
+Output Single Int Column 1793 1815 30 8.8 114.0 1.0X +Output Single Double Column 1935 2005 99 8.1 123.0 0.9X +Output Int and String Column 4319 4531 299 3.6 274.6 0.4X +Output Partitions 3240 3261 29 4.9 206.0 0.6X +Output Buckets 4415 4427 17 3.6 280.7 0.4X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Parquet(PARQUET_2_0) writer benchmark: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Output Single Int Column 1917 1930 19 8.2 121.9 1.0X -Output Single Double Column 1739 1765 35 9.0 110.6 1.1X -Output Int and String Column 5231 5240 13 3.0 332.6 0.4X -Output Partitions 3531 3537 9 4.5 224.5 0.5X -Output Buckets 4815 4816 1 3.3 306.1 0.4X +Output Single Int Column 2013 2013 1 7.8 128.0 1.0X +Output Single Double Column 1892 1899 9 8.3 120.3 1.1X +Output Int and String Column 5133 5137 6 3.1 326.3 0.4X +Output Partitions 3584 3586 3 4.4 227.8 0.6X +Output Buckets 4639 4641 4 3.4 294.9 0.4X ================================================================================================ ORC writer benchmark ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor ORC writer benchmark: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Output Single Int Column 1145 1150 8 13.7 72.8 1.0X -Output Single Double Column 1775 1788 18 8.9 112.8 0.6X -Output Int and String Column 4092 4104 17 3.8 260.2 0.3X -Output Partitions 2516 2532 22 6.3 160.0 0.5X -Output Buckets 3555 3574 26 
4.4 226.0 0.3X +Output Single Int Column 1036 1039 4 15.2 65.9 1.0X +Output Single Double Column 1709 1719 14 9.2 108.6 0.6X +Output Int and String Column 3780 3818 54 4.2 240.3 0.3X +Output Partitions 2598 2600 3 6.1 165.2 0.4X +Output Buckets 3551 3566 21 4.4 225.8 0.3X ================================================================================================ JSON writer benchmark ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor JSON writer benchmark: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Output Single Int Column 1710 1721 15 9.2 108.7 1.0X -Output Single Double Column 2405 2421 22 6.5 152.9 0.7X -Output Int and String Column 4262 4274 18 3.7 271.0 0.4X -Output Partitions 3190 3211 30 4.9 202.8 0.5X -Output Buckets 4134 4160 36 3.8 262.8 0.4X +Output Single Int Column 1586 1590 6 9.9 100.9 1.0X +Output Single Double Column 2260 2270 13 7.0 143.7 0.7X +Output Int and String Column 4163 4179 23 3.8 264.7 0.4X +Output Partitions 3109 3131 31 5.1 197.7 0.5X +Output Buckets 4002 4011 12 3.9 254.5 0.4X ================================================================================================ CSV writer benchmark ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor CSV writer benchmark: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Output Single Int Column 
3871 3895 34 4.1 246.1 1.0X -Output Single Double Column 4379 4382 4 3.6 278.4 0.9X -Output Int and String Column 6820 6835 21 2.3 433.6 0.6X -Output Partitions 5555 5573 26 2.8 353.2 0.7X -Output Buckets 6679 6696 24 2.4 424.6 0.6X +Output Single Int Column 3556 3577 28 4.4 226.1 1.0X +Output Single Double Column 4048 4062 21 3.9 257.3 0.9X +Output Int and String Column 6714 6719 7 2.3 426.9 0.5X +Output Partitions 5340 5353 19 2.9 339.5 0.7X +Output Buckets 6447 6466 26 2.4 409.9 0.6X diff --git a/sql/core/benchmarks/BuiltInDataSourceWriteBenchmark-results.txt b/sql/core/benchmarks/BuiltInDataSourceWriteBenchmark-results.txt index 56c83e0940856..5c3d4bad772a9 100644 --- a/sql/core/benchmarks/BuiltInDataSourceWriteBenchmark-results.txt +++ b/sql/core/benchmarks/BuiltInDataSourceWriteBenchmark-results.txt @@ -2,69 +2,69 @@ Parquet writer benchmark ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Parquet(PARQUET_1_0) writer benchmark: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Output Single Int Column 1736 1765 40 9.1 110.4 1.0X -Output Single Double Column 1840 1879 56 8.6 117.0 0.9X -Output Int and String Column 4395 4435 57 3.6 279.4 0.4X -Output Partitions 3279 3373 132 4.8 208.5 0.5X -Output Buckets 4598 4602 6 3.4 292.3 0.4X +Output Single Int Column 1738 1772 48 9.1 110.5 1.0X +Output Single Double Column 1821 1838 25 8.6 115.8 1.0X +Output Int and String Column 4749 4776 39 3.3 301.9 0.4X +Output Partitions 3238 3272 48 4.9 205.9 0.5X +Output Buckets 4428 4461 46 3.6 281.5 0.4X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 
6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Parquet(PARQUET_2_0) writer benchmark: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Output Single Int Column 1790 1801 15 8.8 113.8 1.0X -Output Single Double Column 1857 1868 17 8.5 118.0 1.0X -Output Int and String Column 4717 4735 26 3.3 299.9 0.4X -Output Partitions 3187 3212 35 4.9 202.6 0.6X -Output Buckets 4353 4358 6 3.6 276.8 0.4X +Output Single Int Column 1961 1992 44 8.0 124.7 1.0X +Output Single Double Column 1876 1878 2 8.4 119.3 1.0X +Output Int and String Column 4831 4837 8 3.3 307.2 0.4X +Output Partitions 3430 3431 2 4.6 218.1 0.6X +Output Buckets 4226 4231 6 3.7 268.7 0.5X ================================================================================================ ORC writer benchmark ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor ORC writer benchmark: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Output Single Int Column 941 953 20 16.7 59.8 1.0X -Output Single Double Column 1563 1569 8 10.1 99.4 0.6X -Output Int and String Column 3838 3868 43 4.1 244.0 0.2X -Output Partitions 2514 2542 40 6.3 159.8 0.4X -Output Buckets 3554 3555 2 4.4 225.9 0.3X +Output Single Int Column 1076 1084 12 14.6 68.4 1.0X +Output Single Double Column 1808 1811 4 8.7 114.9 0.6X +Output Int and String Column 3981 4036 78 4.0 253.1 0.3X +Output Partitions 2561 2602 59 6.1 162.8 0.4X +Output Buckets 3521 3557 51 4.5 223.9 0.3X ================================================================================================ JSON 
writer benchmark ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor JSON writer benchmark: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Output Single Int Column 1606 1613 10 9.8 102.1 1.0X -Output Single Double Column 2245 2257 17 7.0 142.7 0.7X -Output Int and String Column 3818 3837 26 4.1 242.7 0.4X -Output Partitions 3154 3181 38 5.0 200.5 0.5X -Output Buckets 4123 4132 12 3.8 262.1 0.4X +Output Single Int Column 1624 1643 27 9.7 103.2 1.0X +Output Single Double Column 2272 2275 4 6.9 144.5 0.7X +Output Int and String Column 3996 4048 73 3.9 254.1 0.4X +Output Partitions 3045 3054 13 5.2 193.6 0.5X +Output Buckets 3899 3903 6 4.0 247.9 0.4X ================================================================================================ CSV writer benchmark ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor CSV writer benchmark: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Output Single Int Column 3260 3286 36 4.8 207.3 1.0X -Output Single Double Column 4065 4076 15 3.9 258.4 0.8X -Output Int and String Column 6295 6310 21 2.5 400.2 0.5X -Output Partitions 5151 5177 37 3.1 327.5 0.6X -Output Buckets 6173 6209 51 2.5 392.5 0.5X +Output Single Int Column 3445 3446 2 4.6 219.0 1.0X +Output Single Double Column 3674 3717 61 4.3 233.6 0.9X +Output Int and String Column 
6085 6090 7 2.6 386.9 0.6X +Output Partitions 5107 5123 22 3.1 324.7 0.7X +Output Buckets 6098 6123 35 2.6 387.7 0.6X diff --git a/sql/core/benchmarks/ByteArrayBenchmark-jdk21-results.txt b/sql/core/benchmarks/ByteArrayBenchmark-jdk21-results.txt index c650aa1efbb32..9320ff01601c4 100644 --- a/sql/core/benchmarks/ByteArrayBenchmark-jdk21-results.txt +++ b/sql/core/benchmarks/ByteArrayBenchmark-jdk21-results.txt @@ -2,26 +2,26 @@ byte array comparisons ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Byte Array compareTo: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -2-7 byte 254 257 1 257.8 3.9 1.0X -8-16 byte 409 437 37 160.3 6.2 0.6X -16-32 byte 415 416 1 158.0 6.3 0.6X -512-1024 byte 540 542 1 121.3 8.2 0.5X -512 byte slow 1524 1553 23 43.0 23.3 0.2X -2-7 byte 313 314 1 209.5 4.8 0.8X +2-7 byte 256 259 2 256.2 3.9 1.0X +8-16 byte 412 438 34 159.1 6.3 0.6X +16-32 byte 409 411 3 160.1 6.2 0.6X +512-1024 byte 544 546 1 120.6 8.3 0.5X +512 byte slow 1543 1570 23 42.5 23.5 0.2X +2-7 byte 315 316 1 208.2 4.8 0.8X ================================================================================================ byte array equals ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Byte Array equals: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Byte Array equals 538 541 8 297.6 
3.4 1.0X +Byte Array equals 548 551 1 291.9 3.4 1.0X diff --git a/sql/core/benchmarks/ByteArrayBenchmark-results.txt b/sql/core/benchmarks/ByteArrayBenchmark-results.txt index 723af23b06a3f..d76d86ce54c31 100644 --- a/sql/core/benchmarks/ByteArrayBenchmark-results.txt +++ b/sql/core/benchmarks/ByteArrayBenchmark-results.txt @@ -2,26 +2,26 @@ byte array comparisons ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Byte Array compareTo: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -2-7 byte 258 259 1 254.2 3.9 1.0X -8-16 byte 392 402 11 167.4 6.0 0.7X -16-32 byte 396 398 1 165.4 6.0 0.7X -512-1024 byte 519 523 2 126.4 7.9 0.5X -512 byte slow 3255 3273 13 20.1 49.7 0.1X -2-7 byte 249 250 1 263.0 3.8 1.0X +2-7 byte 259 264 3 253.3 3.9 1.0X +8-16 byte 427 462 23 153.5 6.5 0.6X +16-32 byte 485 486 1 135.3 7.4 0.5X +512-1024 byte 609 612 3 107.7 9.3 0.4X +512 byte slow 1485 1518 30 44.1 22.7 0.2X +2-7 byte 299 301 1 218.8 4.6 0.9X ================================================================================================ byte array equals ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Byte Array equals: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Byte Array equals 555 556 1 288.3 3.5 1.0X +Byte Array equals 550 552 3 290.8 3.4 1.0X diff --git 
a/sql/core/benchmarks/CSVBenchmark-jdk21-results.txt b/sql/core/benchmarks/CSVBenchmark-jdk21-results.txt index cc0b3cdaffd11..8ef55135b58f9 100644 --- a/sql/core/benchmarks/CSVBenchmark-jdk21-results.txt +++ b/sql/core/benchmarks/CSVBenchmark-jdk21-results.txt @@ -2,76 +2,76 @@ Benchmark to measure CSV read/write performance ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Parsing quoted values: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -One quoted string 25656 25710 55 0.0 513115.4 1.0X +One quoted string 24592 24650 50 0.0 491842.8 1.0X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Wide rows with 1000 columns: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Select 1000 columns 59317 59851 631 0.0 59316.9 1.0X -Select 100 columns 22419 22524 133 0.0 22419.0 2.6X -Select one column 18736 18821 95 0.1 18736.0 3.2X -count() 4289 4377 88 0.2 4289.5 13.8X -Select 100 columns, one bad input field 27081 27108 26 0.0 27080.9 2.2X -Select 100 columns, corrupt record field 30668 30949 319 0.0 30668.3 1.9X +Select 1000 columns 58745 59095 571 0.0 58745.1 1.0X +Select 100 columns 21111 21163 55 0.0 21111.4 2.8X +Select one column 17328 17405 112 0.1 17328.0 3.4X +count() 3655 4076 698 0.3 3654.7 16.1X +Select 100 columns, one bad input field 25285 25302 16 0.0 25284.7 2.3X +Select 100 columns, corrupt record field 28990 29050 52 0.0 28989.8 2.0X -OpenJDK 64-Bit Server 
VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Count a dataset with 10 columns: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Select 10 columns + count() 10795 10819 21 0.9 1079.5 1.0X -Select 1 column + count() 7409 7416 8 1.3 740.9 1.5X -count() 1712 1714 1 5.8 171.2 6.3X +Select 10 columns + count() 11213 11266 46 0.9 1121.3 1.0X +Select 1 column + count() 7751 7770 18 1.3 775.1 1.4X +count() 1657 1659 2 6.0 165.7 6.8X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Write dates and timestamps: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Create a dataset of timestamps 859 861 2 11.6 85.9 1.0X -to_csv(timestamp) 6073 6115 62 1.6 607.3 0.1X -write timestamps to files 6478 6487 7 1.5 647.8 0.1X -Create a dataset of dates 974 981 11 10.3 97.4 0.9X -to_csv(date) 4516 4523 9 2.2 451.6 0.2X -write dates to files 4714 4723 9 2.1 471.4 0.2X +Create a dataset of timestamps 877 888 17 11.4 87.7 1.0X +to_csv(timestamp) 5444 5471 34 1.8 544.4 0.2X +write timestamps to files 6094 6122 40 1.6 609.4 0.1X +Create a dataset of dates 1067 1076 14 9.4 106.7 0.8X +to_csv(date) 4115 4127 21 2.4 411.5 0.2X +write dates to files 4389 4456 59 2.3 438.9 0.2X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Read dates and timestamps: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
----------------------------------------------------------------------------------------------------------------------------------------------------- -read timestamp text from files 1167 1177 11 8.6 116.7 1.0X -read timestamps from files 9490 9517 29 1.1 949.0 0.1X -infer timestamps from files 19176 19254 112 0.5 1917.6 0.1X -read date text from files 1133 1149 23 8.8 113.3 1.0X -read date from files 8327 8344 30 1.2 832.7 0.1X -infer date from files 17583 17672 77 0.6 1758.3 0.1X -timestamp strings 1310 1318 7 7.6 131.0 0.9X -parse timestamps from Dataset[String] 11767 11853 85 0.8 1176.7 0.1X -infer timestamps from Dataset[String] 21178 21486 268 0.5 2117.8 0.1X -date strings 1602 1610 8 6.2 160.2 0.7X -parse dates from Dataset[String] 10041 10114 112 1.0 1004.1 0.1X -from_csv(timestamp) 10377 10493 115 1.0 1037.7 0.1X -from_csv(date) 9618 9622 3 1.0 961.8 0.1X -infer error timestamps from Dataset[String] with default format 11925 11968 40 0.8 1192.5 0.1X -infer error timestamps from Dataset[String] with user-provided format 11724 11807 72 0.9 1172.4 0.1X -infer error timestamps from Dataset[String] with legacy format 11781 11879 86 0.8 1178.1 0.1X +read timestamp text from files 1210 1214 4 8.3 121.0 1.0X +read timestamps from files 12528 12534 9 0.8 1252.8 0.1X +infer timestamps from files 24564 24614 48 0.4 2456.4 0.0X +read date text from files 1120 1125 6 8.9 112.0 1.1X +read date from files 11502 11540 35 0.9 1150.2 0.1X +infer date from files 23415 23704 263 0.4 2341.5 0.1X +timestamp strings 1205 1208 3 8.3 120.5 1.0X +parse timestamps from Dataset[String] 13589 13639 48 0.7 1358.9 0.1X +infer timestamps from Dataset[String] 25468 25568 115 0.4 2546.8 0.0X +date strings 1561 1565 4 6.4 156.1 0.8X +parse dates from Dataset[String] 12235 12255 25 0.8 1223.5 0.1X +from_csv(timestamp) 11514 11596 71 0.9 1151.4 0.1X +from_csv(date) 10604 10621 26 0.9 1060.4 0.1X +infer error timestamps from Dataset[String] with default format 14746 14761 14 0.7 1474.6 0.1X 
+infer error timestamps from Dataset[String] with user-provided format 14743 14787 56 0.7 1474.3 0.1X +infer error timestamps from Dataset[String] with legacy format 14690 14731 56 0.7 1469.0 0.1X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Filters pushdown: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -w/o filters 4681 4704 32 0.0 46811.8 1.0X -pushdown disabled 4660 4679 28 0.0 46601.3 1.0X -w/ filters 762 778 16 0.1 7623.6 6.1X +w/o filters 4813 4844 53 0.0 48133.6 1.0X +pushdown disabled 4615 4624 16 0.0 46145.3 1.0X +w/ filters 804 806 3 0.1 8035.4 6.0X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Interval: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Read as Intervals 781 785 7 0.4 2602.2 1.0X -Read Raw Strings 291 294 3 1.0 969.3 2.7X +Read as Intervals 815 816 1 0.4 2716.7 1.0X +Read Raw Strings 331 337 6 0.9 1104.2 2.5X diff --git a/sql/core/benchmarks/CSVBenchmark-results.txt b/sql/core/benchmarks/CSVBenchmark-results.txt index 5626bbfb08fbd..e96f233c77793 100644 --- a/sql/core/benchmarks/CSVBenchmark-results.txt +++ b/sql/core/benchmarks/CSVBenchmark-results.txt @@ -2,76 +2,76 @@ Benchmark to measure CSV read/write performance ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Parsing quoted values: Best Time(ms) Avg Time(ms) 
Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -One quoted string 25766 25929 155 0.0 515313.0 1.0X +One quoted string 24513 24556 44 0.0 490253.1 1.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Wide rows with 1000 columns: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Select 1000 columns 51465 51922 628 0.0 51465.3 1.0X -Select 100 columns 21796 21940 152 0.0 21796.0 2.4X -Select one column 18651 18703 52 0.1 18651.2 2.8X -count() 3342 3448 103 0.3 3341.9 15.4X -Select 100 columns, one bad input field 27416 27481 60 0.0 27416.2 1.9X -Select 100 columns, corrupt record field 30540 30699 138 0.0 30539.8 1.7X +Select 1000 columns 55661 56102 621 0.0 55661.4 1.0X +Select 100 columns 21761 22015 246 0.0 21761.5 2.6X +Select one column 18450 18504 72 0.1 18449.8 3.0X +count() 3329 3412 72 0.3 3329.0 16.7X +Select 100 columns, one bad input field 27253 27287 48 0.0 27252.5 2.0X +Select 100 columns, corrupt record field 30624 30679 90 0.0 30624.5 1.8X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Count a dataset with 10 columns: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Select 10 columns + count() 9495 9525 26 1.1 949.5 1.0X -Select 1 column + count() 6922 6961 52 1.4 692.2 1.4X -count() 1742 1752 9 5.7 174.2 5.5X +Select 10 columns + count() 9849 9871 24 1.0 984.9 1.0X +Select 1 column + count() 7065 7075 15 1.4 706.5 1.4X +count() 
1738 1741 4 5.8 173.8 5.7X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Write dates and timestamps: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Create a dataset of timestamps 912 958 65 11.0 91.2 1.0X -to_csv(timestamp) 7089 7112 31 1.4 708.9 0.1X -write timestamps to files 7242 7267 22 1.4 724.2 0.1X -Create a dataset of dates 1157 1185 38 8.6 115.7 0.8X -to_csv(date) 5034 5080 65 2.0 503.4 0.2X -write dates to files 5089 5107 29 2.0 508.9 0.2X +Create a dataset of timestamps 821 825 5 12.2 82.1 1.0X +to_csv(timestamp) 6711 6729 20 1.5 671.1 0.1X +write timestamps to files 6843 6858 25 1.5 684.3 0.1X +Create a dataset of dates 939 942 3 10.6 93.9 0.9X +to_csv(date) 4684 4697 21 2.1 468.4 0.2X +write dates to files 4479 4495 13 2.2 447.9 0.2X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Read dates and timestamps: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ----------------------------------------------------------------------------------------------------------------------------------------------------- -read timestamp text from files 1228 1233 4 8.1 122.8 1.0X -read timestamps from files 10598 10626 30 0.9 1059.8 0.1X -infer timestamps from files 21159 21181 19 0.5 2115.9 0.1X -read date text from files 1148 1151 3 8.7 114.8 1.1X -read date from files 10147 10180 35 1.0 1014.7 0.1X -infer date from files 21078 21110 47 0.5 2107.8 0.1X -timestamp strings 1354 1366 21 7.4 135.4 0.9X -parse timestamps from Dataset[String] 12127 12153 23 0.8 1212.7 0.1X -infer timestamps from Dataset[String] 22539 22566 27 0.4 2253.9 0.1X -date strings 1857 1862 5 5.4 185.7 0.7X 
-parse dates from Dataset[String] 11906 11931 30 0.8 1190.6 0.1X -from_csv(timestamp) 10716 10744 37 0.9 1071.6 0.1X -from_csv(date) 11123 11140 15 0.9 1112.3 0.1X -infer error timestamps from Dataset[String] with default format 12274 12281 9 0.8 1227.4 0.1X -infer error timestamps from Dataset[String] with user-provided format 12281 12304 26 0.8 1228.1 0.1X -infer error timestamps from Dataset[String] with legacy format 12300 12307 9 0.8 1230.0 0.1X +read timestamp text from files 1190 1197 6 8.4 119.0 1.0X +read timestamps from files 10627 10667 52 0.9 1062.7 0.1X +infer timestamps from files 21086 21135 69 0.5 2108.6 0.1X +read date text from files 1081 1084 5 9.2 108.1 1.1X +read date from files 10254 10265 12 1.0 1025.4 0.1X +infer date from files 20908 20924 18 0.5 2090.8 0.1X +timestamp strings 1173 1175 3 8.5 117.3 1.0X +parse timestamps from Dataset[String] 12413 12473 57 0.8 1241.3 0.1X +infer timestamps from Dataset[String] 22801 22829 42 0.4 2280.1 0.1X +date strings 1653 1657 4 6.1 165.3 0.7X +parse dates from Dataset[String] 12033 12057 25 0.8 1203.3 0.1X +from_csv(timestamp) 10339 10355 18 1.0 1033.9 0.1X +from_csv(date) 10554 10561 11 0.9 1055.4 0.1X +infer error timestamps from Dataset[String] with default format 12871 12878 10 0.8 1287.1 0.1X +infer error timestamps from Dataset[String] with user-provided format 12845 12867 33 0.8 1284.5 0.1X +infer error timestamps from Dataset[String] with legacy format 12872 12904 37 0.8 1287.2 0.1X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Filters pushdown: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -w/o filters 4058 4061 2 0.0 40583.1 1.0X -pushdown disabled 4092 4099 10 0.0 40924.1 1.0X -w/ filters 699 705 8 0.1 6990.7 5.8X +w/o filters 4281 
4288 7 0.0 42807.9 1.0X +pushdown disabled 4059 4067 10 0.0 40590.4 1.1X +w/ filters 764 775 13 0.1 7640.9 5.6X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Interval: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Read as Intervals 737 742 9 0.4 2456.8 1.0X -Read Raw Strings 294 300 10 1.0 979.5 2.5X +Read as Intervals 709 716 8 0.4 2364.5 1.0X +Read Raw Strings 295 298 3 1.0 984.4 2.4X diff --git a/sql/core/benchmarks/CharVarcharBenchmark-jdk21-results.txt b/sql/core/benchmarks/CharVarcharBenchmark-jdk21-results.txt index 47781a2cc6e1f..2093ce53a24d3 100644 --- a/sql/core/benchmarks/CharVarcharBenchmark-jdk21-results.txt +++ b/sql/core/benchmarks/CharVarcharBenchmark-jdk21-results.txt @@ -2,121 +2,121 @@ Char Varchar Write Side Perf w/o Tailing Spaces ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Write with length 5: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -write string with length 5 6905 7223 322 5.8 172.6 1.0X -write char with length 5 10769 10842 66 3.7 269.2 0.6X -write varchar with length 5 7615 7654 35 5.3 190.4 0.9X +write string with length 5 7118 7215 151 5.6 178.0 1.0X +write char with length 5 12527 12590 97 3.2 313.2 0.6X +write varchar with length 5 7976 8046 82 5.0 199.4 0.9X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor 
Write with length 10: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -write string with length 10 3624 3637 21 5.5 181.2 1.0X -write char with length 10 6455 6488 33 3.1 322.7 0.6X -write varchar with length 10 3802 3861 79 5.3 190.1 1.0X +write string with length 10 3745 3751 6 5.3 187.2 1.0X +write char with length 10 6606 6702 83 3.0 330.3 0.6X +write varchar with length 10 3782 3810 25 5.3 189.1 1.0X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Write with length 20: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -write string with length 20 1770 1784 17 5.6 177.0 1.0X -write char with length 20 4741 4751 13 2.1 474.1 0.4X -write varchar with length 20 1921 1926 6 5.2 192.1 0.9X +write string with length 20 1774 1777 5 5.6 177.4 1.0X +write char with length 20 4778 4829 46 2.1 477.8 0.4X +write varchar with length 20 2047 2048 1 4.9 204.7 0.9X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Write with length 40: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -write string with length 40 945 955 13 5.3 189.0 1.0X -write char with length 40 3669 3697 26 1.4 733.7 0.3X -write varchar with length 40 1024 1029 5 4.9 204.7 0.9X +write string with length 40 931 950 17 5.4 186.3 1.0X +write char with length 40 3709 3713 3 1.3 741.9 0.3X +write varchar with length 40 1065 1068 2 4.7 213.0 0.9X -OpenJDK 64-Bit Server VM 
21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Write with length 60: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -write string with length 60 648 671 29 5.1 194.5 1.0X -write char with length 60 3258 3278 17 1.0 977.5 0.2X -write varchar with length 60 726 738 12 4.6 217.8 0.9X +write string with length 60 662 675 14 5.0 198.5 1.0X +write char with length 60 3355 3360 5 1.0 1006.5 0.2X +write varchar with length 60 680 691 11 4.9 204.1 1.0X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Write with length 80: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -write string with length 80 522 526 6 4.8 208.7 1.0X -write char with length 80 3151 3173 31 0.8 1260.3 0.2X -write varchar with length 80 555 564 8 4.5 222.2 0.9X +write string with length 80 500 505 5 5.0 200.1 1.0X +write char with length 80 3147 3173 22 0.8 1258.9 0.2X +write varchar with length 80 541 549 9 4.6 216.3 0.9X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Write with length 100: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -write string with length 100 423 450 26 4.7 211.3 1.0X -write char with length 100 3057 3067 14 0.7 1528.3 0.1X -write varchar with length 100 472 478 6 4.2 235.9 0.9X +write string with length 100 403 417 13 5.0 201.6 1.0X +write char 
with length 100 3107 3113 10 0.6 1553.5 0.1X +write varchar with length 100 440 447 8 4.5 219.8 0.9X ================================================================================================ Char Varchar Write Side Perf w/ Tailing Spaces ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Write with length 5: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -write string with length 5 10664 10748 114 3.8 266.6 1.0X -write char with length 5 13099 13173 91 3.1 327.5 0.8X -write varchar with length 5 12595 12606 10 3.2 314.9 0.8X +write string with length 5 10431 10499 102 3.8 260.8 1.0X +write char with length 5 12396 12404 9 3.2 309.9 0.8X +write varchar with length 5 12858 12889 49 3.1 321.5 0.8X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Write with length 10: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -write string with length 10 5412 5423 14 3.7 270.6 1.0X -write char with length 10 8402 8405 5 2.4 420.1 0.6X -write varchar with length 10 8000 8031 31 2.5 400.0 0.7X +write string with length 10 5537 5556 18 3.6 276.8 1.0X +write char with length 10 8103 8104 2 2.5 405.1 0.7X +write varchar with length 10 8414 8427 12 2.4 420.7 0.7X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Write with length 20: Best Time(ms) Avg Time(ms) Stdev(ms) 
Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -write string with length 20 3326 3331 5 3.0 332.6 1.0X -write char with length 20 5576 5586 9 1.8 557.6 0.6X -write varchar with length 20 5699 5708 9 1.8 569.9 0.6X +write string with length 20 3560 3566 7 2.8 356.0 1.0X +write char with length 20 5738 5741 3 1.7 573.8 0.6X +write varchar with length 20 5787 5803 22 1.7 578.7 0.6X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Write with length 40: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -write string with length 40 2210 2214 4 2.3 441.9 1.0X -write char with length 40 4306 4306 0 1.2 861.2 0.5X -write varchar with length 40 4509 4524 16 1.1 901.8 0.5X +write string with length 40 2352 2364 11 2.1 470.4 1.0X +write char with length 40 4431 4449 27 1.1 886.2 0.5X +write varchar with length 40 4461 4473 11 1.1 892.2 0.5X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Write with length 60: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -write string with length 60 1894 1901 6 1.8 568.2 1.0X -write char with length 60 4088 4093 5 0.8 1226.4 0.5X -write varchar with length 60 3982 3987 5 0.8 1194.5 0.5X +write string with length 60 1923 1928 4 1.7 577.0 1.0X +write char with length 60 4005 4013 12 0.8 1201.6 0.5X +write varchar with length 60 4028 4033 5 0.8 1208.3 0.5X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server 
VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Write with length 80: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -write string with length 80 1785 1793 6 1.4 714.2 1.0X -write char with length 80 3937 3952 14 0.6 1574.6 0.5X -write varchar with length 80 3942 3959 24 0.6 1576.9 0.5X +write string with length 80 1969 1986 15 1.3 787.5 1.0X +write char with length 80 3999 4009 16 0.6 1599.5 0.5X +write varchar with length 80 4030 4040 9 0.6 1612.1 0.5X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Write with length 100: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -write string with length 100 1629 1640 10 1.2 814.6 1.0X -write char with length 100 3686 3693 12 0.5 1842.9 0.4X -write varchar with length 100 3905 3921 15 0.5 1952.6 0.4X +write string with length 100 1616 1626 15 1.2 808.2 1.0X +write char with length 100 3851 3852 1 0.5 1925.7 0.4X +write varchar with length 100 3841 3858 18 0.5 1920.7 0.4X diff --git a/sql/core/benchmarks/CharVarcharBenchmark-results.txt b/sql/core/benchmarks/CharVarcharBenchmark-results.txt index 03a64c6904e9c..8b8a7fe89d469 100644 --- a/sql/core/benchmarks/CharVarcharBenchmark-results.txt +++ b/sql/core/benchmarks/CharVarcharBenchmark-results.txt @@ -2,121 +2,121 @@ Char Varchar Write Side Perf w/o Tailing Spaces ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Write with length 5: Best Time(ms) 
Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -write string with length 5 6760 7092 292 5.9 169.0 1.0X -write char with length 5 9848 9929 87 4.1 246.2 0.7X -write varchar with length 5 7633 7676 37 5.2 190.8 0.9X +write string with length 5 6706 6744 33 6.0 167.7 1.0X +write char with length 5 10593 10612 31 3.8 264.8 0.6X +write varchar with length 5 8187 8238 84 4.9 204.7 0.8X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Write with length 10: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -write string with length 10 3611 3629 27 5.5 180.5 1.0X -write char with length 10 6130 6165 50 3.3 306.5 0.6X -write varchar with length 10 3742 3772 26 5.3 187.1 1.0X +write string with length 10 3968 3985 21 5.0 198.4 1.0X +write char with length 10 6729 6805 67 3.0 336.5 0.6X +write varchar with length 10 3987 4047 101 5.0 199.4 1.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Write with length 20: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -write string with length 20 1775 1786 11 5.6 177.5 1.0X -write char with length 20 4560 4562 2 2.2 456.0 0.4X -write varchar with length 20 1923 1933 8 5.2 192.3 0.9X +write string with length 20 1968 1985 18 5.1 196.8 1.0X +write char with length 20 4800 4832 36 2.1 480.0 0.4X +write varchar with length 20 2016 2025 13 5.0 201.6 1.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 
6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Write with length 40: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -write string with length 40 935 958 30 5.3 187.1 1.0X -write char with length 40 3475 3480 4 1.4 695.0 0.3X -write varchar with length 40 1019 1038 18 4.9 203.9 0.9X +write string with length 40 1042 1044 2 4.8 208.4 1.0X +write char with length 40 3794 3804 13 1.3 758.9 0.3X +write varchar with length 40 1124 1129 7 4.4 224.8 0.9X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Write with length 60: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -write string with length 60 663 666 4 5.0 199.0 1.0X -write char with length 60 3240 3248 10 1.0 972.1 0.2X -write varchar with length 60 711 715 5 4.7 213.2 0.9X +write string with length 60 690 696 6 4.8 207.1 1.0X +write char with length 60 3430 3443 18 1.0 1029.0 0.2X +write varchar with length 60 759 770 9 4.4 227.8 0.9X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Write with length 80: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -write string with length 80 533 544 10 4.7 213.2 1.0X -write char with length 80 3024 3028 5 0.8 1209.6 0.2X -write varchar with length 80 560 561 1 4.5 223.9 1.0X +write string with length 80 560 569 11 4.5 223.8 1.0X +write char with length 80 3212 3222 
11 0.8 1284.8 0.2X +write varchar with length 80 607 612 6 4.1 242.7 0.9X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Write with length 100: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -write string with length 100 460 464 4 4.3 230.0 1.0X -write char with length 100 2973 2975 1 0.7 1486.7 0.2X -write varchar with length 100 483 486 3 4.1 241.4 1.0X +write string with length 100 437 444 11 4.6 218.4 1.0X +write char with length 100 3106 3109 5 0.6 1552.8 0.1X +write varchar with length 100 483 494 13 4.1 241.5 0.9X ================================================================================================ Char Varchar Write Side Perf w/ Tailing Spaces ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Write with length 5: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -write string with length 5 8798 8827 25 4.5 219.9 1.0X -write char with length 5 11984 11999 19 3.3 299.6 0.7X -write varchar with length 5 12379 12401 20 3.2 309.5 0.7X +write string with length 5 9537 9571 32 4.2 238.4 1.0X +write char with length 5 12811 12868 70 3.1 320.3 0.7X +write varchar with length 5 12857 12877 24 3.1 321.4 0.7X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Write with length 10: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
------------------------------------------------------------------------------------------------------------------------ -write string with length 10 5290 5307 19 3.8 264.5 1.0X -write char with length 10 7536 7538 2 2.7 376.8 0.7X -write varchar with length 10 7489 7519 39 2.7 374.5 0.7X +write string with length 10 5556 5559 3 3.6 277.8 1.0X +write char with length 10 8349 8354 6 2.4 417.5 0.7X +write varchar with length 10 7827 7844 23 2.6 391.4 0.7X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Write with length 20: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -write string with length 20 3213 3218 6 3.1 321.3 1.0X -write char with length 20 5570 5578 7 1.8 557.0 0.6X -write varchar with length 20 5245 5261 15 1.9 524.5 0.6X +write string with length 20 3488 3499 10 2.9 348.8 1.0X +write char with length 20 5847 5854 7 1.7 584.7 0.6X +write varchar with length 20 5408 5409 1 1.8 540.8 0.6X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Write with length 40: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -write string with length 40 2121 2125 7 2.4 424.2 1.0X -write char with length 40 4399 4419 17 1.1 879.8 0.5X -write varchar with length 40 4118 4124 5 1.2 823.7 0.5X +write string with length 40 2430 2434 6 2.1 486.1 1.0X +write char with length 40 4492 4494 3 1.1 898.4 0.5X +write varchar with length 40 4131 4140 8 1.2 826.1 0.6X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 
6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Write with length 60: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -write string with length 60 1884 1891 6 1.8 565.1 1.0X -write char with length 60 3939 3941 4 0.8 1181.6 0.5X -write varchar with length 60 3584 3591 5 0.9 1075.3 0.5X +write string with length 60 1900 1906 5 1.8 570.0 1.0X +write char with length 60 4153 4155 3 0.8 1245.8 0.5X +write varchar with length 60 3659 3660 2 0.9 1097.7 0.5X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Write with length 80: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -write string with length 80 1838 1842 3 1.4 735.4 1.0X -write char with length 80 3823 3835 13 0.7 1529.1 0.5X -write varchar with length 80 3454 3456 2 0.7 1381.8 0.5X +write string with length 80 1816 1821 5 1.4 726.5 1.0X +write char with length 80 4030 4050 18 0.6 1611.9 0.5X +write varchar with length 80 3744 3758 12 0.7 1497.6 0.5X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Write with length 100: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -write string with length 100 1635 1643 10 1.2 817.3 1.0X -write char with length 100 3697 3704 6 0.5 1848.3 0.4X -write varchar with length 100 3355 3375 18 0.6 1677.3 0.5X +write string with length 100 1674 1687 14 1.2 836.9 1.0X +write char with length 100 3922 3927 7 0.5 1961.0 0.4X +write varchar with 
length 100 3503 3505 3 0.6 1751.4 0.5X diff --git a/sql/core/benchmarks/CollationBenchmark-jdk21-results.txt b/sql/core/benchmarks/CollationBenchmark-jdk21-results.txt index 88db9ebfa1e34..8a1599b3cfe42 100644 --- a/sql/core/benchmarks/CollationBenchmark-jdk21-results.txt +++ b/sql/core/benchmarks/CollationBenchmark-jdk21-results.txt @@ -1,88 +1,88 @@ -OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1018-aws -Intel(R) Xeon(R) Platinum 8252C CPU @ 3.80GHz +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure +AMD EPYC 7763 64-Core Processor collation unit benchmarks - equalsFunction: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative time -------------------------------------------------------------------------------------------------------------------------- -UTF8_BINARY 1193 1194 1 0.1 11929.0 1.0X -UTF8_LCASE 2717 2721 6 0.0 27168.5 2.3X -UNICODE 17991 17993 2 0.0 179913.6 15.1X -UNICODE_CI 17837 17842 7 0.0 178369.9 15.0X +UTF8_BINARY 1360 1360 1 0.1 13597.4 1.0X +UTF8_LCASE 2411 2417 9 0.0 24106.7 1.8X +UNICODE 16945 16969 34 0.0 169452.6 12.5X +UNICODE_CI 16645 16671 36 0.0 166452.8 12.2X -OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1018-aws -Intel(R) Xeon(R) Platinum 8252C CPU @ 3.80GHz +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure +AMD EPYC 7763 64-Core Processor collation unit benchmarks - compareFunction: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative time --------------------------------------------------------------------------------------------------------------------------- -UTF8_BINARY 1523 1523 0 0.1 15233.9 1.0X -UTF8_LCASE 2441 2441 0 0.0 24407.9 1.6X -UNICODE 17875 17884 13 0.0 178749.6 11.7X -UNICODE_CI 17701 17703 2 0.0 177013.8 11.6X +UTF8_BINARY 1751 1753 2 0.1 17513.9 1.0X +UTF8_LCASE 2571 2573 3 0.0 25712.7 1.5X +UNICODE 16594 16625 44 0.0 165935.1 9.5X +UNICODE_CI 16422 16423 3 0.0 164215.1 9.4X -OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 
6.8.0-1018-aws -Intel(R) Xeon(R) Platinum 8252C CPU @ 3.80GHz +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure +AMD EPYC 7763 64-Core Processor collation unit benchmarks - hashFunction: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative time ------------------------------------------------------------------------------------------------------------------------ -UTF8_BINARY 2660 2666 9 0.0 26601.1 1.0X -UTF8_LCASE 5013 5016 3 0.0 50134.0 1.9X -UNICODE 75622 75623 1 0.0 756217.3 28.4X -UNICODE_CI 63036 63042 9 0.0 630360.9 23.7X +UTF8_BINARY 2817 2824 9 0.0 28170.1 1.0X +UTF8_LCASE 5427 5428 1 0.0 54268.5 1.9X +UNICODE 70045 70096 72 0.0 700450.7 24.9X +UNICODE_CI 56364 56433 97 0.0 563641.8 20.0X -OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1018-aws -Intel(R) Xeon(R) Platinum 8252C CPU @ 3.80GHz +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure +AMD EPYC 7763 64-Core Processor collation unit benchmarks - contains: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative time ------------------------------------------------------------------------------------------------------------------------ -UTF8_BINARY 2121 2122 0 0.0 21214.2 1.0X -UTF8_LCASE 27635 27636 1 0.0 276347.7 13.0X -UNICODE 523746 524012 376 0.0 5237460.5 246.9X -UNICODE_CI 520134 520227 131 0.0 5201343.3 245.2X +UTF8_BINARY 1644 1645 1 0.1 16440.3 1.0X +UTF8_LCASE 14804 14846 59 0.0 148037.2 9.0X +UNICODE 308825 309294 663 0.0 3088250.5 187.8X +UNICODE_CI 310637 312537 2688 0.0 3106367.6 188.9X -OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1018-aws -Intel(R) Xeon(R) Platinum 8252C CPU @ 3.80GHz +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure +AMD EPYC 7763 64-Core Processor collation unit benchmarks - startsWith: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative time ------------------------------------------------------------------------------------------------------------------------ 
-UTF8_BINARY 2767 2769 4 0.0 27666.3 1.0X -UTF8_LCASE 26861 26861 1 0.0 268606.4 9.7X -UNICODE 518540 518815 389 0.0 5185401.3 187.4X -UNICODE_CI 521156 521261 148 0.0 5211559.5 188.4X +UTF8_BINARY 1941 1942 1 0.1 19412.9 1.0X +UTF8_LCASE 10354 10409 78 0.0 103535.9 5.3X +UNICODE 309786 310124 478 0.0 3097864.6 159.6X +UNICODE_CI 313038 313960 1303 0.0 3130382.9 161.3X -OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1018-aws -Intel(R) Xeon(R) Platinum 8252C CPU @ 3.80GHz +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure +AMD EPYC 7763 64-Core Processor collation unit benchmarks - endsWith: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative time ------------------------------------------------------------------------------------------------------------------------ -UTF8_BINARY 2919 2921 3 0.0 29190.2 1.0X -UTF8_LCASE 26862 26862 1 0.0 268618.0 9.2X -UNICODE 504534 504927 556 0.0 5045340.3 172.8X -UNICODE_CI 506542 506565 32 0.0 5065423.0 173.5X +UTF8_BINARY 1958 1961 4 0.1 19579.3 1.0X +UTF8_LCASE 10329 10332 5 0.0 103285.8 5.3X +UNICODE 323944 328005 5743 0.0 3239437.8 165.5X +UNICODE_CI 332646 333139 697 0.0 3326457.7 169.9X -OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1018-aws -Intel(R) Xeon(R) Platinum 8252C CPU @ 3.80GHz +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure +AMD EPYC 7763 64-Core Processor collation unit benchmarks - initCap using impl execICU: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative time -------------------------------------------------------------------------------------------------------------------------------------- -UNICODE 419 425 5 0.2 4189.2 1.0X -UNICODE_CI 416 426 6 0.2 4163.2 1.0X +UNICODE 370 371 1 0.3 3698.1 1.0X +UNICODE_CI 370 370 1 0.3 3696.5 1.0X -OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1018-aws -Intel(R) Xeon(R) Platinum 8252C CPU @ 3.80GHz +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure +AMD EPYC 7763 
64-Core Processor collation unit benchmarks - initCap using impl execBinaryICU: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative time -------------------------------------------------------------------------------------------------------------------------------------------- -UTF8_BINARY 575 576 0 0.2 5754.0 1.0X -UTF8_LCASE 575 576 1 0.2 5747.8 1.0X -UNICODE 576 576 0 0.2 5761.5 1.0X -UNICODE_CI 576 578 2 0.2 5758.0 1.0X +UTF8_BINARY 592 593 1 0.2 5915.6 1.0X +UTF8_LCASE 593 593 1 0.2 5926.8 1.0X +UNICODE 591 593 1 0.2 5912.9 1.0X +UNICODE_CI 593 594 1 0.2 5934.1 1.0X -OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1018-aws -Intel(R) Xeon(R) Platinum 8252C CPU @ 3.80GHz +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure +AMD EPYC 7763 64-Core Processor collation unit benchmarks - initCap using impl execBinary: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative time ----------------------------------------------------------------------------------------------------------------------------------------- -UTF8_BINARY 159 159 1 0.6 1587.6 1.0X -UTF8_LCASE 159 159 0 0.6 1586.6 1.0X -UNICODE 158 159 1 0.6 1584.9 1.0X -UNICODE_CI 159 160 1 0.6 1586.1 1.0X +UTF8_BINARY 105 109 10 0.9 1054.8 1.0X +UTF8_LCASE 105 106 1 0.9 1053.3 1.0X +UNICODE 105 106 1 0.9 1054.2 1.0X +UNICODE_CI 105 106 0 1.0 1051.1 1.0X -OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1018-aws -Intel(R) Xeon(R) Platinum 8252C CPU @ 3.80GHz +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure +AMD EPYC 7763 64-Core Processor collation unit benchmarks - initCap using impl execLowercase: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative time -------------------------------------------------------------------------------------------------------------------------------------------- -UTF8_BINARY 397 405 5 0.3 3974.4 1.0X -UTF8_LCASE 401 405 5 0.2 4009.5 1.0X -UNICODE 395 399 3 0.3 3953.9 1.0X -UNICODE_CI 395 400 3 0.3 
3952.0 1.0X +UTF8_BINARY 370 371 1 0.3 3698.3 1.0X +UTF8_LCASE 370 371 1 0.3 3697.7 1.0X +UNICODE 369 370 1 0.3 3692.7 1.0X +UNICODE_CI 370 371 1 0.3 3697.2 1.0X diff --git a/sql/core/benchmarks/CollationBenchmark-results.txt b/sql/core/benchmarks/CollationBenchmark-results.txt index 8402a2db6d869..cbd0727ce92e4 100644 --- a/sql/core/benchmarks/CollationBenchmark-results.txt +++ b/sql/core/benchmarks/CollationBenchmark-results.txt @@ -1,88 +1,88 @@ -OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1018-aws -Intel(R) Xeon(R) Platinum 8252C CPU @ 3.80GHz +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure +AMD EPYC 7763 64-Core Processor collation unit benchmarks - equalsFunction: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative time -------------------------------------------------------------------------------------------------------------------------- -UTF8_BINARY 1223 1224 1 0.1 12231.5 1.0X -UTF8_LCASE 3280 3281 1 0.0 32803.3 2.7X -UNICODE 17207 17207 0 0.0 172065.7 14.1X -UNICODE_CI 16560 16565 7 0.0 165604.3 13.5X +UTF8_BINARY 1380 1381 1 0.1 13801.3 1.0X +UTF8_LCASE 3334 3336 3 0.0 33337.1 2.4X +UNICODE 19004 19005 1 0.0 190039.6 13.8X +UNICODE_CI 18686 18699 18 0.0 186856.4 13.5X -OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1018-aws -Intel(R) Xeon(R) Platinum 8252C CPU @ 3.80GHz +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure +AMD EPYC 7763 64-Core Processor collation unit benchmarks - compareFunction: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative time --------------------------------------------------------------------------------------------------------------------------- -UTF8_BINARY 1656 1657 0 0.1 16564.0 1.0X -UTF8_LCASE 3320 3321 0 0.0 33203.0 2.0X -UNICODE 16392 16393 2 0.0 163921.3 9.9X -UNICODE_CI 16314 16319 6 0.0 163143.3 9.8X +UTF8_BINARY 1739 1739 0 0.1 17392.0 1.0X +UTF8_LCASE 4175 4175 0 0.0 41745.1 2.4X +UNICODE 20212 20220 11 0.0 202124.2 11.6X 
+UNICODE_CI 20078 20086 11 0.0 200782.6 11.5X -OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1018-aws -Intel(R) Xeon(R) Platinum 8252C CPU @ 3.80GHz +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure +AMD EPYC 7763 64-Core Processor collation unit benchmarks - hashFunction: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative time ------------------------------------------------------------------------------------------------------------------------ -UTF8_BINARY 2812 2813 1 0.0 28119.0 1.0X -UTF8_LCASE 5682 5685 4 0.0 56823.2 2.0X -UNICODE 71678 71685 10 0.0 716777.4 25.5X -UNICODE_CI 60660 60670 15 0.0 606597.4 21.6X +UTF8_BINARY 3112 3115 4 0.0 31119.4 1.0X +UTF8_LCASE 6348 6354 9 0.0 63477.9 2.0X +UNICODE 67421 67436 22 0.0 674208.7 21.7X +UNICODE_CI 54039 54056 24 0.0 540394.5 17.4X -OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1018-aws -Intel(R) Xeon(R) Platinum 8252C CPU @ 3.80GHz +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure +AMD EPYC 7763 64-Core Processor collation unit benchmarks - contains: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative time ------------------------------------------------------------------------------------------------------------------------ -UTF8_BINARY 2528 2528 1 0.0 25276.8 1.0X -UTF8_LCASE 28034 28050 24 0.0 280335.5 11.1X -UNICODE 521518 521690 242 0.0 5215184.7 206.3X -UNICODE_CI 508188 508312 176 0.0 5081880.5 201.0X +UTF8_BINARY 1677 1678 1 0.1 16768.6 1.0X +UTF8_LCASE 17476 17480 5 0.0 174760.6 10.4X +UNICODE 324829 324937 153 0.0 3248290.7 193.7X +UNICODE_CI 317534 317742 294 0.0 3175340.9 189.4X -OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1018-aws -Intel(R) Xeon(R) Platinum 8252C CPU @ 3.80GHz +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure +AMD EPYC 7763 64-Core Processor collation unit benchmarks - startsWith: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative time 
------------------------------------------------------------------------------------------------------------------------ -UTF8_BINARY 2772 2774 4 0.0 27715.0 1.0X -UTF8_LCASE 27387 27390 4 0.0 273872.8 9.9X -UNICODE 501025 501076 72 0.0 5010249.5 180.8X -UNICODE_CI 506654 506666 16 0.0 5066544.6 182.8X +UTF8_BINARY 2040 2041 1 0.0 20400.9 1.0X +UTF8_LCASE 17099 17100 1 0.0 170991.2 8.4X +UNICODE 314251 314484 330 0.0 3142508.7 154.0X +UNICODE_CI 319313 319690 533 0.0 3193131.6 156.5X -OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1018-aws -Intel(R) Xeon(R) Platinum 8252C CPU @ 3.80GHz +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure +AMD EPYC 7763 64-Core Processor collation unit benchmarks - endsWith: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative time ------------------------------------------------------------------------------------------------------------------------ -UTF8_BINARY 2886 2888 3 0.0 28858.9 1.0X -UTF8_LCASE 27433 27445 17 0.0 274326.2 9.5X -UNICODE 501068 501186 168 0.0 5010676.2 173.6X -UNICODE_CI 506619 506655 52 0.0 5066185.6 175.6X +UTF8_BINARY 2077 2077 1 0.0 20765.6 1.0X +UTF8_LCASE 16903 16905 2 0.0 169034.4 8.1X +UNICODE 326824 328355 2165 0.0 3268239.0 157.4X +UNICODE_CI 334072 334237 233 0.0 3340722.1 160.9X -OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1018-aws -Intel(R) Xeon(R) Platinum 8252C CPU @ 3.80GHz +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure +AMD EPYC 7763 64-Core Processor collation unit benchmarks - initCap using impl execICU: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative time -------------------------------------------------------------------------------------------------------------------------------------- -UNICODE 407 411 4 0.2 4065.4 1.0X -UNICODE_CI 419 423 3 0.2 4194.1 1.0X +UNICODE 301 301 0 0.3 3006.6 1.0X +UNICODE_CI 300 301 1 0.3 3003.5 1.0X -OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1018-aws 
-Intel(R) Xeon(R) Platinum 8252C CPU @ 3.80GHz +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure +AMD EPYC 7763 64-Core Processor collation unit benchmarks - initCap using impl execBinaryICU: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative time -------------------------------------------------------------------------------------------------------------------------------------------- -UTF8_BINARY 564 565 2 0.2 5639.2 1.0X -UTF8_LCASE 563 563 0 0.2 5629.0 1.0X -UNICODE 563 565 2 0.2 5634.3 1.0X -UNICODE_CI 564 564 0 0.2 5640.9 1.0X +UTF8_BINARY 599 600 1 0.2 5992.2 1.0X +UTF8_LCASE 599 600 1 0.2 5994.4 1.0X +UNICODE 599 600 1 0.2 5985.1 1.0X +UNICODE_CI 597 598 1 0.2 5971.7 1.0X -OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1018-aws -Intel(R) Xeon(R) Platinum 8252C CPU @ 3.80GHz +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure +AMD EPYC 7763 64-Core Processor collation unit benchmarks - initCap using impl execBinary: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative time ----------------------------------------------------------------------------------------------------------------------------------------- -UTF8_BINARY 165 166 1 0.6 1647.3 1.0X -UTF8_LCASE 165 165 1 0.6 1646.7 1.0X -UNICODE 165 165 1 0.6 1646.5 1.0X -UNICODE_CI 165 166 1 0.6 1648.7 1.0X +UTF8_BINARY 184 185 1 0.5 1844.8 1.0X +UTF8_LCASE 185 185 0 0.5 1847.3 1.0X +UNICODE 184 185 1 0.5 1844.9 1.0X +UNICODE_CI 185 185 0 0.5 1845.6 1.0X -OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1018-aws -Intel(R) Xeon(R) Platinum 8252C CPU @ 3.80GHz +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure +AMD EPYC 7763 64-Core Processor collation unit benchmarks - initCap using impl execLowercase: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative time -------------------------------------------------------------------------------------------------------------------------------------------- 
-UTF8_BINARY 391 399 6 0.3 3912.1 1.0X -UTF8_LCASE 389 399 7 0.3 3894.2 1.0X -UNICODE 383 391 6 0.3 3828.6 1.0X -UNICODE_CI 383 387 2 0.3 3833.0 1.0X +UTF8_BINARY 324 325 1 0.3 3242.0 1.0X +UTF8_LCASE 325 326 2 0.3 3251.5 1.0X +UNICODE 325 326 1 0.3 3251.9 1.0X +UNICODE_CI 324 326 1 0.3 3242.6 1.0X diff --git a/sql/core/benchmarks/CollationNonASCIIBenchmark-jdk21-results.txt b/sql/core/benchmarks/CollationNonASCIIBenchmark-jdk21-results.txt index 4da64ade11d68..ffdd34f6aaa8c 100644 --- a/sql/core/benchmarks/CollationNonASCIIBenchmark-jdk21-results.txt +++ b/sql/core/benchmarks/CollationNonASCIIBenchmark-jdk21-results.txt @@ -1,88 +1,88 @@ -OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1018-aws -Intel(R) Xeon(R) Platinum 8252C CPU @ 3.80GHz +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure +AMD EPYC 7763 64-Core Processor collation unit benchmarks - equalsFunction: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative time -------------------------------------------------------------------------------------------------------------------------- -UTF8_BINARY 156 156 0 0.3 3887.8 1.0X -UTF8_LCASE 9717 9729 18 0.0 242914.7 62.5X -UNICODE 5026 5027 2 0.0 125640.1 32.3X -UNICODE_CI 4969 4972 4 0.0 124224.9 32.0X +UTF8_BINARY 171 172 1 0.2 4282.8 1.0X +UTF8_LCASE 7012 7018 9 0.0 175288.2 40.9X +UNICODE 5206 5207 0 0.0 130157.7 30.4X +UNICODE_CI 5220 5220 0 0.0 130499.0 30.5X -OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1018-aws -Intel(R) Xeon(R) Platinum 8252C CPU @ 3.80GHz +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure +AMD EPYC 7763 64-Core Processor collation unit benchmarks - compareFunction: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative time --------------------------------------------------------------------------------------------------------------------------- -UTF8_BINARY 279 279 0 0.1 6969.5 1.0X -UTF8_LCASE 9624 9628 5 0.0 240611.6 34.5X -UNICODE 5243 5244 0 0.0 131080.1 
18.8X -UNICODE_CI 5173 5173 0 0.0 129322.8 18.6X +UTF8_BINARY 315 316 1 0.1 7871.4 1.0X +UTF8_LCASE 7036 7038 4 0.0 175888.2 22.3X +UNICODE 5343 5344 1 0.0 133571.1 17.0X +UNICODE_CI 5284 5284 0 0.0 132104.2 16.8X -OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1018-aws -Intel(R) Xeon(R) Platinum 8252C CPU @ 3.80GHz +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure +AMD EPYC 7763 64-Core Processor collation unit benchmarks - hashFunction: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative time ------------------------------------------------------------------------------------------------------------------------ -UTF8_BINARY 383 383 0 0.1 9576.7 1.0X -UTF8_LCASE 4927 4931 6 0.0 123170.3 12.9X -UNICODE 17244 17261 24 0.0 431096.6 45.0X -UNICODE_CI 12968 12970 3 0.0 324194.1 33.9X +UTF8_BINARY 382 383 1 0.1 9557.2 1.0X +UTF8_LCASE 3587 3592 6 0.0 89683.4 9.4X +UNICODE 15310 15322 16 0.0 382753.7 40.0X +UNICODE_CI 12531 12543 17 0.0 313269.1 32.8X -OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1018-aws -Intel(R) Xeon(R) Platinum 8252C CPU @ 3.80GHz +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure +AMD EPYC 7763 64-Core Processor collation unit benchmarks - contains: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative time ------------------------------------------------------------------------------------------------------------------------ -UTF8_BINARY 535 536 2 0.1 13371.6 1.0X -UTF8_LCASE 9479 9480 2 0.0 236964.5 17.7X -UNICODE 93629 93676 66 0.0 2340726.5 175.1X -UNICODE_CI 93222 93309 124 0.0 2330541.2 174.3X +UTF8_BINARY 350 350 0 0.1 8742.0 1.0X +UTF8_LCASE 9013 9020 11 0.0 225317.3 25.8X +UNICODE 58338 58368 43 0.0 1458444.2 166.8X +UNICODE_CI 58821 58928 152 0.0 1470530.7 168.2X -OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1018-aws -Intel(R) Xeon(R) Platinum 8252C CPU @ 3.80GHz +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure +AMD EPYC 7763 64-Core 
Processor collation unit benchmarks - startsWith: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative time ------------------------------------------------------------------------------------------------------------------------ -UTF8_BINARY 430 431 1 0.1 10755.8 1.0X -UTF8_LCASE 6550 6551 2 0.0 163753.7 15.2X -UNICODE 87435 87467 45 0.0 2185886.8 203.2X -UNICODE_CI 90113 90255 201 0.0 2252836.0 209.5X +UTF8_BINARY 290 291 1 0.1 7250.9 1.0X +UTF8_LCASE 5323 5325 3 0.0 133079.8 18.4X +UNICODE 57879 57976 138 0.0 1446968.2 199.6X +UNICODE_CI 59098 59188 127 0.0 1477459.9 203.8X -OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1018-aws -Intel(R) Xeon(R) Platinum 8252C CPU @ 3.80GHz +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure +AMD EPYC 7763 64-Core Processor collation unit benchmarks - endsWith: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative time ------------------------------------------------------------------------------------------------------------------------ -UTF8_BINARY 455 456 2 0.1 11369.5 1.0X -UTF8_LCASE 7108 7115 9 0.0 177705.2 15.6X -UNICODE 101835 101866 43 0.0 2545883.9 223.9X -UNICODE_CI 100962 101026 91 0.0 2524045.2 222.0X +UTF8_BINARY 300 302 1 0.1 7507.7 1.0X +UTF8_LCASE 5310 5318 11 0.0 132754.5 17.7X +UNICODE 64787 64833 65 0.0 1619680.3 215.7X +UNICODE_CI 64384 64419 50 0.0 1609603.5 214.4X -OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1018-aws -Intel(R) Xeon(R) Platinum 8252C CPU @ 3.80GHz +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure +AMD EPYC 7763 64-Core Processor collation unit benchmarks - initCap using impl execICU: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative time -------------------------------------------------------------------------------------------------------------------------------------- -UNICODE 254 255 1 0.2 6346.5 1.0X -UNICODE_CI 254 254 0 0.2 6348.1 1.0X +UNICODE 214 215 1 0.2 5339.5 1.0X +UNICODE_CI 214 215 0 
0.2 5355.7 1.0X -OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1018-aws -Intel(R) Xeon(R) Platinum 8252C CPU @ 3.80GHz +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure +AMD EPYC 7763 64-Core Processor collation unit benchmarks - initCap using impl execBinaryICU: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative time -------------------------------------------------------------------------------------------------------------------------------------------- -UTF8_BINARY 322 323 1 0.1 8046.3 1.0X -UTF8_LCASE 322 324 2 0.1 8059.0 1.0X -UNICODE 322 323 1 0.1 8050.7 1.0X -UNICODE_CI 322 325 4 0.1 8062.4 1.0X +UTF8_BINARY 318 318 1 0.1 7946.2 1.0X +UTF8_LCASE 318 319 1 0.1 7945.3 1.0X +UNICODE 318 319 1 0.1 7950.9 1.0X +UNICODE_CI 317 318 1 0.1 7931.1 1.0X -OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1018-aws -Intel(R) Xeon(R) Platinum 8252C CPU @ 3.80GHz +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure +AMD EPYC 7763 64-Core Processor collation unit benchmarks - initCap using impl execBinary: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative time ----------------------------------------------------------------------------------------------------------------------------------------- -UTF8_BINARY 119 120 1 0.3 2972.1 1.0X -UTF8_LCASE 119 120 1 0.3 2971.9 1.0X -UNICODE 119 120 1 0.3 2970.3 1.0X -UNICODE_CI 119 120 1 0.3 2968.6 1.0X +UTF8_BINARY 84 85 0 0.5 2101.1 1.0X +UTF8_LCASE 84 85 1 0.5 2097.7 1.0X +UNICODE 84 85 1 0.5 2106.4 1.0X +UNICODE_CI 84 85 1 0.5 2111.5 1.0X -OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1018-aws -Intel(R) Xeon(R) Platinum 8252C CPU @ 3.80GHz +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure +AMD EPYC 7763 64-Core Processor collation unit benchmarks - initCap using impl execLowercase: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative time 
-------------------------------------------------------------------------------------------------------------------------------------------- -UTF8_BINARY 254 255 1 0.2 6345.2 1.0X -UTF8_LCASE 254 255 0 0.2 6351.8 1.0X -UNICODE 254 255 0 0.2 6352.9 1.0X -UNICODE_CI 254 254 0 0.2 6341.2 1.0X +UTF8_BINARY 214 215 2 0.2 5342.3 1.0X +UTF8_LCASE 214 215 1 0.2 5348.9 1.0X +UNICODE 214 215 1 0.2 5349.8 1.0X +UNICODE_CI 214 215 0 0.2 5354.4 1.0X diff --git a/sql/core/benchmarks/CollationNonASCIIBenchmark-results.txt b/sql/core/benchmarks/CollationNonASCIIBenchmark-results.txt index fba59f3893e22..fa21ecbbb6593 100644 --- a/sql/core/benchmarks/CollationNonASCIIBenchmark-results.txt +++ b/sql/core/benchmarks/CollationNonASCIIBenchmark-results.txt @@ -1,88 +1,88 @@ -OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1018-aws -Intel(R) Xeon(R) Platinum 8252C CPU @ 3.80GHz +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure +AMD EPYC 7763 64-Core Processor collation unit benchmarks - equalsFunction: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative time -------------------------------------------------------------------------------------------------------------------------- -UTF8_BINARY 125 126 1 0.3 3128.6 1.0X -UTF8_LCASE 10335 10345 14 0.0 258377.4 82.6X -UNICODE 5604 5610 8 0.0 140110.8 44.8X -UNICODE_CI 5570 5577 9 0.0 139252.7 44.5X +UTF8_BINARY 141 146 3 0.3 3523.3 1.0X +UTF8_LCASE 7725 7753 40 0.0 193120.1 54.8X +UNICODE 5788 5824 51 0.0 144696.8 41.1X +UNICODE_CI 5997 6002 7 0.0 149920.7 42.6X -OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1018-aws -Intel(R) Xeon(R) Platinum 8252C CPU @ 3.80GHz +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure +AMD EPYC 7763 64-Core Processor collation unit benchmarks - compareFunction: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative time 
--------------------------------------------------------------------------------------------------------------------------- -UTF8_BINARY 293 294 2 0.1 7326.8 1.0X -UTF8_LCASE 10035 10035 1 0.0 250865.2 34.2X -UNICODE 5578 5580 3 0.0 139455.8 19.0X -UNICODE_CI 5539 5541 2 0.0 138483.8 18.9X +UTF8_BINARY 337 346 5 0.1 8433.8 1.0X +UTF8_LCASE 7829 7852 33 0.0 195727.0 23.2X +UNICODE 6096 6116 29 0.0 152404.8 18.1X +UNICODE_CI 6112 6131 26 0.0 152805.7 18.1X -OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1018-aws -Intel(R) Xeon(R) Platinum 8252C CPU @ 3.80GHz +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure +AMD EPYC 7763 64-Core Processor collation unit benchmarks - hashFunction: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative time ------------------------------------------------------------------------------------------------------------------------ -UTF8_BINARY 388 388 0 0.1 9699.6 1.0X -UTF8_LCASE 4965 4967 3 0.0 124121.3 12.8X -UNICODE 15750 15753 5 0.0 393740.9 40.6X -UNICODE_CI 12509 12511 2 0.0 312735.5 32.2X +UTF8_BINARY 452 455 3 0.1 11306.0 1.0X +UTF8_LCASE 3968 3990 32 0.0 99194.0 8.8X +UNICODE 15247 15296 69 0.0 381186.5 33.7X +UNICODE_CI 12374 12397 32 0.0 309347.5 27.4X -OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1018-aws -Intel(R) Xeon(R) Platinum 8252C CPU @ 3.80GHz +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure +AMD EPYC 7763 64-Core Processor collation unit benchmarks - contains: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative time ------------------------------------------------------------------------------------------------------------------------ -UTF8_BINARY 421 422 2 0.1 10512.9 1.0X -UTF8_LCASE 10793 10796 5 0.0 269819.0 25.7X -UNICODE 94324 94330 9 0.0 2358090.9 224.3X -UNICODE_CI 91647 91748 143 0.0 2291174.6 217.9X +UTF8_BINARY 435 446 7 0.1 10881.1 1.0X +UTF8_LCASE 10346 10366 29 0.0 258656.4 23.8X +UNICODE 78521 78598 110 0.0 1963015.5 
180.4X +UNICODE_CI 80810 81202 554 0.0 2020241.0 185.7X -OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1018-aws -Intel(R) Xeon(R) Platinum 8252C CPU @ 3.80GHz +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure +AMD EPYC 7763 64-Core Processor collation unit benchmarks - startsWith: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative time ------------------------------------------------------------------------------------------------------------------------ -UTF8_BINARY 452 453 0 0.1 11307.9 1.0X -UTF8_LCASE 6871 6872 2 0.0 171782.0 15.2X -UNICODE 90881 90924 60 0.0 2272034.5 200.9X -UNICODE_CI 91333 91363 42 0.0 2283331.3 201.9X +UTF8_BINARY 321 324 3 0.1 8021.8 1.0X +UTF8_LCASE 5970 5976 10 0.0 149242.0 18.6X +UNICODE 86151 86522 525 0.0 2153773.0 268.5X +UNICODE_CI 89308 90327 1441 0.0 2232710.9 278.3X -OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1018-aws -Intel(R) Xeon(R) Platinum 8252C CPU @ 3.80GHz +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure +AMD EPYC 7763 64-Core Processor collation unit benchmarks - endsWith: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative time ------------------------------------------------------------------------------------------------------------------------ -UTF8_BINARY 451 452 2 0.1 11268.1 1.0X -UTF8_LCASE 6685 6686 2 0.0 167120.8 14.8X -UNICODE 99387 99484 138 0.0 2484672.5 220.5X -UNICODE_CI 98525 98597 101 0.0 2463132.9 218.6X +UTF8_BINARY 310 314 3 0.1 7741.5 1.0X +UTF8_LCASE 5707 5711 5 0.0 142683.3 18.4X +UNICODE 91242 95109 5469 0.0 2281057.2 294.7X +UNICODE_CI 91446 92305 1215 0.0 2286138.3 295.3X -OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1018-aws -Intel(R) Xeon(R) Platinum 8252C CPU @ 3.80GHz +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure +AMD EPYC 7763 64-Core Processor collation unit benchmarks - initCap using impl execICU: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative time 
-------------------------------------------------------------------------------------------------------------------------------------- -UNICODE 231 232 0 0.2 5784.5 1.0X -UNICODE_CI 231 232 1 0.2 5780.4 1.0X +UNICODE 298 300 2 0.1 7454.2 1.0X +UNICODE_CI 300 301 1 0.1 7500.2 1.0X -OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1018-aws -Intel(R) Xeon(R) Platinum 8252C CPU @ 3.80GHz +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure +AMD EPYC 7763 64-Core Processor collation unit benchmarks - initCap using impl execBinaryICU: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative time -------------------------------------------------------------------------------------------------------------------------------------------- -UTF8_BINARY 312 314 1 0.1 7811.2 1.0X -UTF8_LCASE 313 314 2 0.1 7822.9 1.0X -UNICODE 313 314 1 0.1 7815.5 1.0X -UNICODE_CI 313 315 4 0.1 7825.7 1.0X +UTF8_BINARY 343 346 3 0.1 8576.7 1.0X +UTF8_LCASE 343 345 2 0.1 8582.7 1.0X +UNICODE 344 348 2 0.1 8607.4 1.0X +UNICODE_CI 340 345 3 0.1 8493.8 1.0X -OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1018-aws -Intel(R) Xeon(R) Platinum 8252C CPU @ 3.80GHz +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure +AMD EPYC 7763 64-Core Processor collation unit benchmarks - initCap using impl execBinary: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative time ----------------------------------------------------------------------------------------------------------------------------------------- -UTF8_BINARY 132 133 0 0.3 3302.0 1.0X -UTF8_LCASE 132 132 0 0.3 3297.5 1.0X -UNICODE 132 133 1 0.3 3296.9 1.0X -UNICODE_CI 132 132 0 0.3 3298.1 1.0X +UTF8_BINARY 130 132 1 0.3 3245.3 1.0X +UTF8_LCASE 129 132 1 0.3 3235.2 1.0X +UNICODE 129 133 1 0.3 3231.9 1.0X +UNICODE_CI 131 133 1 0.3 3274.8 1.0X -OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1018-aws -Intel(R) Xeon(R) Platinum 8252C CPU @ 3.80GHz +OpenJDK 64-Bit Server VM 
17.0.13+11-LTS on Linux 6.8.0-1017-azure +AMD EPYC 7763 64-Core Processor collation unit benchmarks - initCap using impl execLowercase: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative time -------------------------------------------------------------------------------------------------------------------------------------------- -UTF8_BINARY 231 231 0 0.2 5770.4 1.0X -UTF8_LCASE 231 232 1 0.2 5776.4 1.0X -UNICODE 231 231 0 0.2 5767.5 1.0X -UNICODE_CI 231 232 1 0.2 5770.2 1.0X +UTF8_BINARY 294 296 1 0.1 7348.6 1.0X +UTF8_LCASE 296 299 2 0.1 7390.7 1.0X +UNICODE 298 300 2 0.1 7461.3 1.0X +UNICODE_CI 297 299 2 0.1 7421.1 1.0X diff --git a/sql/core/benchmarks/ColumnarBatchBenchmark-jdk21-results.txt b/sql/core/benchmarks/ColumnarBatchBenchmark-jdk21-results.txt index e6d3fa3dfbe5e..40b7cf00b6669 100644 --- a/sql/core/benchmarks/ColumnarBatchBenchmark-jdk21-results.txt +++ b/sql/core/benchmarks/ColumnarBatchBenchmark-jdk21-results.txt @@ -2,58 +2,58 @@ Int Read/Write ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Int Read/Write: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Java Array 122 123 1 2676.8 0.4 1.0X -ByteBuffer Unsafe 194 201 8 1685.2 0.6 0.6X -ByteBuffer API 501 503 2 653.5 1.5 0.2X -DirectByteBuffer 418 419 1 784.8 1.3 0.3X -Unsafe Buffer 154 154 0 2134.6 0.5 0.8X -Column(on heap) 123 123 1 2668.6 0.4 1.0X -Column(off heap) 154 154 1 2134.0 0.5 0.8X -Column(off heap direct) 154 154 1 2128.0 0.5 0.8X -UnsafeRow (on heap) 432 433 2 758.6 1.3 0.3X -UnsafeRow (off heap) 294 295 1 1116.1 0.9 0.4X -Column On Heap Append 336 337 2 976.5 1.0 0.4X +Java Array 123 123 0 2664.8 0.4 1.0X +ByteBuffer 
Unsafe 188 194 8 1742.2 0.6 0.7X +ByteBuffer API 429 429 1 764.2 1.3 0.3X +DirectByteBuffer 420 421 2 780.9 1.3 0.3X +Unsafe Buffer 154 156 5 2124.5 0.5 0.8X +Column(on heap) 124 124 0 2646.3 0.4 1.0X +Column(off heap) 155 155 0 2117.5 0.5 0.8X +Column(off heap direct) 155 155 0 2115.9 0.5 0.8X +UnsafeRow (on heap) 452 452 0 725.7 1.4 0.3X +UnsafeRow (off heap) 296 297 0 1106.8 0.9 0.4X +Column On Heap Append 312 315 3 1048.8 1.0 0.4X ================================================================================================ Boolean Read/Write ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Boolean Read/Write: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Bitset 430 431 2 780.3 1.3 1.0X -Byte Array 249 250 2 1348.7 0.7 1.7X +Bitset 432 433 1 776.0 1.3 1.0X +Byte Array 250 251 1 1341.9 0.7 1.7X ================================================================================================ String Read/Write ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor String Read/Write: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -On Heap 121 122 1 134.9 7.4 1.0X -Off Heap 523 535 9 31.3 31.9 0.2X +On Heap 126 131 9 129.5 7.7 1.0X +Off Heap 475 495 18 34.5 29.0 0.3X ================================================================================================ Array Vector 
Read ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Array Vector Read: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -On Heap Read Size Only 87 88 1 1878.9 0.5 1.0X -Off Heap Read Size Only 425 425 0 385.7 2.6 0.2X -On Heap Read Elements 2464 2467 5 66.5 15.0 0.0X -Off Heap Read Elements 2409 2412 5 68.0 14.7 0.0X +On Heap Read Size Only 86 87 0 1898.0 0.5 1.0X +Off Heap Read Size Only 307 308 1 533.7 1.9 0.3X +On Heap Read Elements 2385 2398 20 68.7 14.6 0.0X +Off Heap Read Elements 2606 2608 3 62.9 15.9 0.0X diff --git a/sql/core/benchmarks/ColumnarBatchBenchmark-results.txt b/sql/core/benchmarks/ColumnarBatchBenchmark-results.txt index ea5edb89dcfe6..96a39bdadeeaf 100644 --- a/sql/core/benchmarks/ColumnarBatchBenchmark-results.txt +++ b/sql/core/benchmarks/ColumnarBatchBenchmark-results.txt @@ -2,58 +2,58 @@ Int Read/Write ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Int Read/Write: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Java Array 174 175 1 1883.1 0.5 1.0X -ByteBuffer Unsafe 278 282 6 1177.8 0.8 0.6X -ByteBuffer API 508 509 1 645.6 1.5 0.3X -DirectByteBuffer 468 469 1 700.7 1.4 0.4X -Unsafe Buffer 159 161 1 2057.9 0.5 1.1X -Column(on heap) 170 171 0 1923.5 0.5 1.0X -Column(off heap) 162 162 0 2023.8 0.5 1.1X -Column(off heap direct) 157 158 1 2083.7 
0.5 1.1X -UnsafeRow (on heap) 436 436 1 751.7 1.3 0.4X -UnsafeRow (off heap) 314 321 14 1042.5 1.0 0.6X -Column On Heap Append 361 362 1 906.5 1.1 0.5X +Java Array 175 177 5 1871.5 0.5 1.0X +ByteBuffer Unsafe 279 280 1 1174.5 0.9 0.6X +ByteBuffer API 510 511 1 642.1 1.6 0.3X +DirectByteBuffer 470 471 1 697.0 1.4 0.4X +Unsafe Buffer 162 163 1 2020.2 0.5 1.1X +Column(on heap) 171 172 0 1911.5 0.5 1.0X +Column(off heap) 163 163 0 2012.6 0.5 1.1X +Column(off heap direct) 158 158 0 2076.1 0.5 1.1X +UnsafeRow (on heap) 439 439 0 747.1 1.3 0.4X +UnsafeRow (off heap) 315 318 2 1040.4 1.0 0.6X +Column On Heap Append 363 364 1 901.8 1.1 0.5X ================================================================================================ Boolean Read/Write ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Boolean Read/Write: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Bitset 451 452 1 744.8 1.3 1.0X -Byte Array 288 289 2 1163.9 0.9 1.6X +Bitset 454 455 1 739.1 1.4 1.0X +Byte Array 263 264 1 1274.3 0.8 1.7X ================================================================================================ String Read/Write ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor String Read/Write: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -On Heap 193 236 33 85.0 11.8 1.0X -Off Heap 
424 440 15 38.7 25.9 0.5X +On Heap 139 141 2 118.0 8.5 1.0X +Off Heap 382 391 12 42.9 23.3 0.4X ================================================================================================ Array Vector Read ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Array Vector Read: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -On Heap Read Size Only 90 90 1 1826.1 0.5 1.0X -Off Heap Read Size Only 85 85 1 1927.9 0.5 1.1X -On Heap Read Elements 2177 2178 1 75.3 13.3 0.0X -Off Heap Read Elements 2732 2735 4 60.0 16.7 0.0X +On Heap Read Size Only 87 87 0 1883.4 0.5 1.0X +Off Heap Read Size Only 85 86 0 1918.9 0.5 1.0X +On Heap Read Elements 2428 2430 3 67.5 14.8 0.0X +Off Heap Read Elements 2956 2958 3 55.4 18.0 0.0X diff --git a/sql/core/benchmarks/CompressionSchemeBenchmark-jdk21-results.txt b/sql/core/benchmarks/CompressionSchemeBenchmark-jdk21-results.txt index 3338d6b4df0eb..588ce854d1858 100644 --- a/sql/core/benchmarks/CompressionSchemeBenchmark-jdk21-results.txt +++ b/sql/core/benchmarks/CompressionSchemeBenchmark-jdk21-results.txt @@ -2,136 +2,136 @@ Compression Scheme Benchmark ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor BOOLEAN Encode: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -PassThrough(1.000) 1 1 0 46950.3 0.0 1.0X -RunLengthEncoding(2.517) 983 989 6 
68.2 14.7 0.0X -BooleanBitSet(0.125) 233 234 1 287.8 3.5 0.0X +PassThrough(1.000) 1 1 0 46470.3 0.0 1.0X +RunLengthEncoding(2.515) 1110 1283 245 60.5 16.5 0.0X +BooleanBitSet(0.125) 285 286 1 235.2 4.3 0.0X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor BOOLEAN Decode: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -PassThrough 210 211 1 319.3 3.1 1.0X -RunLengthEncoding 598 605 10 112.3 8.9 0.4X -BooleanBitSet 696 699 3 96.5 10.4 0.3X +PassThrough 211 213 1 317.5 3.1 1.0X +RunLengthEncoding 601 601 0 111.7 9.0 0.4X +BooleanBitSet 672 672 0 99.9 10.0 0.3X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor SHORT Encode (Lower Skew): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -PassThrough(1.000) 3 3 0 23190.9 0.0 1.0X -RunLengthEncoding(1.495) 1229 1229 1 54.6 18.3 0.0X +PassThrough(1.000) 3 3 0 23145.6 0.0 1.0X +RunLengthEncoding(1.489) 1079 1079 0 62.2 16.1 0.0X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor SHORT Decode (Lower Skew): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -PassThrough 667 668 1 100.6 9.9 1.0X -RunLengthEncoding 1030 1032 3 65.2 15.3 0.6X +PassThrough 795 796 2 84.5 11.8 1.0X +RunLengthEncoding 990 991 1 67.8 14.8 0.8X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 
6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor SHORT Encode (Higher Skew): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -PassThrough(1.000) 3 3 0 23427.9 0.0 1.0X -RunLengthEncoding(2.000) 1234 1234 0 54.4 18.4 0.0X +PassThrough(1.000) 3 3 0 23322.1 0.0 1.0X +RunLengthEncoding(2.000) 1116 1117 2 60.1 16.6 0.0X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor SHORT Decode (Higher Skew): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -PassThrough 665 666 1 100.9 9.9 1.0X -RunLengthEncoding 1007 1007 0 66.6 15.0 0.7X +PassThrough 796 811 16 84.3 11.9 1.0X +RunLengthEncoding 956 957 1 70.2 14.2 0.8X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor INT Encode (Lower Skew): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -PassThrough(1.000) 6 6 0 11347.4 0.1 1.0X -RunLengthEncoding(0.997) 1072 1073 1 62.6 16.0 0.0X -DictionaryEncoding(0.500) 378 378 0 177.5 5.6 0.0X -IntDelta(0.250) 139 141 3 481.6 2.1 0.0X +PassThrough(1.000) 6 6 0 11601.5 0.1 1.0X +RunLengthEncoding(1.004) 1011 1011 1 66.4 15.1 0.0X +DictionaryEncoding(0.500) 335 335 0 200.4 5.0 0.0X +IntDelta(0.250) 110 111 0 607.5 1.6 0.1X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor INT 
Decode (Lower Skew): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -PassThrough 644 646 2 104.2 9.6 1.0X -RunLengthEncoding 1181 1182 2 56.8 17.6 0.5X -DictionaryEncoding 521 522 0 128.8 7.8 1.2X -IntDelta 498 499 2 134.7 7.4 1.3X +PassThrough 647 647 0 103.7 9.6 1.0X +RunLengthEncoding 1213 1214 1 55.3 18.1 0.5X +DictionaryEncoding 526 526 0 127.7 7.8 1.2X +IntDelta 501 503 2 133.9 7.5 1.3X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor INT Encode (Higher Skew): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -PassThrough(1.000) 6 6 0 11261.2 0.1 1.0X -RunLengthEncoding(1.329) 1128 1129 2 59.5 16.8 0.0X -DictionaryEncoding(0.501) 378 379 2 177.5 5.6 0.0X -IntDelta(0.250) 125 125 0 536.8 1.9 0.0X +PassThrough(1.000) 6 6 0 11080.2 0.1 1.0X +RunLengthEncoding(1.339) 1048 1051 5 64.1 15.6 0.0X +DictionaryEncoding(0.501) 337 339 1 199.0 5.0 0.0X +IntDelta(0.250) 110 111 0 607.5 1.6 0.1X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor INT Decode (Higher Skew): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -PassThrough 711 712 1 94.3 10.6 1.0X -RunLengthEncoding 1150 1154 5 58.4 17.1 0.6X -DictionaryEncoding 651 655 4 103.0 9.7 1.1X -IntDelta 520 573 59 129.1 7.7 1.4X +PassThrough 710 712 2 94.5 10.6 1.0X +RunLengthEncoding 1188 1190 3 56.5 17.7 0.6X +DictionaryEncoding 659 663 7 101.9 9.8 1.1X +IntDelta 524 526 3 128.1 7.8 
1.4X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor LONG Encode (Lower Skew): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -PassThrough(1.000) 13 13 0 5052.7 0.2 1.0X -RunLengthEncoding(0.748) 1072 1073 0 62.6 16.0 0.0X -DictionaryEncoding(0.250) 521 521 0 128.8 7.8 0.0X -LongDelta(0.125) 110 110 0 609.1 1.6 0.1X +PassThrough(1.000) 16 19 0 4302.0 0.2 1.0X +RunLengthEncoding(0.760) 1066 1066 0 63.0 15.9 0.0X +DictionaryEncoding(0.250) 404 405 2 166.2 6.0 0.0X +LongDelta(0.125) 111 111 0 605.5 1.7 0.1X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor LONG Decode (Lower Skew): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -PassThrough 771 774 3 87.1 11.5 1.0X -RunLengthEncoding 1232 1233 1 54.5 18.4 0.6X -DictionaryEncoding 720 724 6 93.2 10.7 1.1X -LongDelta 541 543 3 124.1 8.1 1.4X +PassThrough 774 776 2 86.7 11.5 1.0X +RunLengthEncoding 1240 1241 2 54.1 18.5 0.6X +DictionaryEncoding 714 717 4 93.9 10.6 1.1X +LongDelta 543 545 2 123.6 8.1 1.4X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor LONG Encode (Higher Skew): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -PassThrough(1.000) 13 13 0 5054.5 0.2 1.0X -RunLengthEncoding(1.007) 1110 1111 1 60.4 16.5 0.0X -DictionaryEncoding(0.251) 533 534 2 126.0 7.9 
0.0X -LongDelta(0.125) 111 112 0 605.2 1.7 0.1X +PassThrough(1.000) 18 18 0 3770.3 0.3 1.0X +RunLengthEncoding(1.002) 1095 1098 4 61.3 16.3 0.0X +DictionaryEncoding(0.251) 404 405 2 166.0 6.0 0.0X +LongDelta(0.125) 111 111 0 603.7 1.7 0.2X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor LONG Decode (Higher Skew): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -PassThrough 769 770 0 87.2 11.5 1.0X -RunLengthEncoding 1234 1236 4 54.4 18.4 0.6X -DictionaryEncoding 721 723 3 93.0 10.7 1.1X -LongDelta 669 672 3 100.2 10.0 1.1X +PassThrough 774 777 4 86.7 11.5 1.0X +RunLengthEncoding 1217 1218 1 55.2 18.1 0.6X +DictionaryEncoding 715 719 6 93.9 10.7 1.1X +LongDelta 671 672 2 100.1 10.0 1.2X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor STRING Encode: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -PassThrough(1.000) 20 20 0 3376.5 0.3 1.0X -RunLengthEncoding(0.892) 2013 2014 1 33.3 30.0 0.0X -DictionaryEncoding(0.167) 1687 1691 6 39.8 25.1 0.0X +PassThrough(1.000) 20 23 4 3349.0 0.3 1.0X +RunLengthEncoding(0.893) 1852 1854 3 36.2 27.6 0.0X +DictionaryEncoding(0.167) 2101 2111 14 31.9 31.3 0.0X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor STRING Decode: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ 
-PassThrough 1776 1776 1 37.8 26.5 1.0X -RunLengthEncoding 2518 2518 0 26.7 37.5 0.7X -DictionaryEncoding 2028 2030 4 33.1 30.2 0.9X +PassThrough 1654 1675 31 40.6 24.6 1.0X +RunLengthEncoding 2501 2505 6 26.8 37.3 0.7X +DictionaryEncoding 2028 2030 2 33.1 30.2 0.8X diff --git a/sql/core/benchmarks/CompressionSchemeBenchmark-results.txt b/sql/core/benchmarks/CompressionSchemeBenchmark-results.txt index c56288558bd5f..4b1206ab2e105 100644 --- a/sql/core/benchmarks/CompressionSchemeBenchmark-results.txt +++ b/sql/core/benchmarks/CompressionSchemeBenchmark-results.txt @@ -2,136 +2,136 @@ Compression Scheme Benchmark ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor BOOLEAN Encode: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -PassThrough(1.000) 1 1 0 47046.4 0.0 1.0X -RunLengthEncoding(2.514) 882 883 0 76.0 13.1 0.0X -BooleanBitSet(0.125) 234 235 0 286.3 3.5 0.0X +PassThrough(1.000) 2 2 0 43967.6 0.0 1.0X +RunLengthEncoding(2.492) 900 901 1 74.6 13.4 0.0X +BooleanBitSet(0.125) 292 292 0 229.9 4.4 0.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor BOOLEAN Decode: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -PassThrough 167 168 1 402.1 2.5 1.0X -RunLengthEncoding 532 534 1 126.1 7.9 0.3X -BooleanBitSet 663 665 2 101.2 9.9 0.3X +PassThrough 168 169 1 400.0 2.5 1.0X +RunLengthEncoding 551 555 5 121.7 8.2 0.3X +BooleanBitSet 639 640 1 105.0 9.5 0.3X -OpenJDK 64-Bit 
Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor SHORT Encode (Lower Skew): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -PassThrough(1.000) 3 3 0 23535.9 0.0 1.0X -RunLengthEncoding(1.501) 1218 1219 1 55.1 18.2 0.0X +PassThrough(1.000) 3 3 0 23004.2 0.0 1.0X +RunLengthEncoding(1.488) 1039 1040 1 64.6 15.5 0.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor SHORT Decode (Lower Skew): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -PassThrough 710 712 2 94.6 10.6 1.0X -RunLengthEncoding 1043 1055 18 64.4 15.5 0.7X +PassThrough 548 561 9 122.5 8.2 1.0X +RunLengthEncoding 970 972 2 69.2 14.5 0.6X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor SHORT Encode (Higher Skew): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -PassThrough(1.000) 3 3 0 24666.2 0.0 1.0X -RunLengthEncoding(2.012) 1157 1159 3 58.0 17.2 0.0X +PassThrough(1.000) 3 3 0 23244.9 0.0 1.0X +RunLengthEncoding(2.018) 1070 1070 1 62.7 15.9 0.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor SHORT Decode (Higher Skew): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
------------------------------------------------------------------------------------------------------------------------ -PassThrough 675 675 0 99.5 10.1 1.0X -RunLengthEncoding 1021 1024 4 65.7 15.2 0.7X +PassThrough 543 544 0 123.5 8.1 1.0X +RunLengthEncoding 930 931 2 72.2 13.9 0.6X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor INT Encode (Lower Skew): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -PassThrough(1.000) 6 6 0 11233.4 0.1 1.0X -RunLengthEncoding(1.002) 1012 1021 12 66.3 15.1 0.0X -DictionaryEncoding(0.500) 386 387 1 174.1 5.7 0.0X -IntDelta(0.250) 115 115 1 585.5 1.7 0.1X +PassThrough(1.000) 6 6 0 11412.0 0.1 1.0X +RunLengthEncoding(1.006) 997 1000 3 67.3 14.9 0.0X +DictionaryEncoding(0.500) 374 374 1 179.6 5.6 0.0X +IntDelta(0.250) 110 110 1 609.3 1.6 0.1X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor INT Decode (Lower Skew): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -PassThrough 644 647 3 104.3 9.6 1.0X -RunLengthEncoding 1194 1194 0 56.2 17.8 0.5X -DictionaryEncoding 502 504 2 133.7 7.5 1.3X -IntDelta 457 458 1 146.9 6.8 1.4X +PassThrough 626 627 1 107.2 9.3 1.0X +RunLengthEncoding 1041 1042 2 64.5 15.5 0.6X +DictionaryEncoding 524 527 2 128.0 7.8 1.2X +IntDelta 460 460 1 146.0 6.8 1.4X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor INT Encode (Higher Skew): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per 
Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -PassThrough(1.000) 6 6 0 11739.3 0.1 1.0X -RunLengthEncoding(1.336) 1040 1040 1 64.5 15.5 0.0X -DictionaryEncoding(0.501) 387 388 1 173.2 5.8 0.0X -IntDelta(0.250) 115 115 1 585.4 1.7 0.0X +PassThrough(1.000) 6 6 0 11296.0 0.1 1.0X +RunLengthEncoding(1.338) 1018 1018 0 65.9 15.2 0.0X +DictionaryEncoding(0.501) 374 374 0 179.4 5.6 0.0X +IntDelta(0.250) 110 110 0 609.1 1.6 0.1X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor INT Decode (Higher Skew): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -PassThrough 727 729 3 92.3 10.8 1.0X -RunLengthEncoding 1178 1182 5 57.0 17.6 0.6X -DictionaryEncoding 687 690 3 97.7 10.2 1.1X -IntDelta 480 482 2 139.7 7.2 1.5X +PassThrough 689 691 2 97.4 10.3 1.0X +RunLengthEncoding 1093 1094 1 61.4 16.3 0.6X +DictionaryEncoding 543 544 1 123.6 8.1 1.3X +IntDelta 597 599 2 112.5 8.9 1.2X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor LONG Encode (Lower Skew): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -PassThrough(1.000) 13 13 0 5037.6 0.2 1.0X -RunLengthEncoding(0.750) 1017 1019 3 66.0 15.2 0.0X -DictionaryEncoding(0.250) 442 443 2 152.0 6.6 0.0X -LongDelta(0.125) 110 110 1 609.8 1.6 0.1X +PassThrough(1.000) 18 18 0 3771.2 0.3 1.0X +RunLengthEncoding(0.756) 1058 1059 2 63.5 15.8 0.0X +DictionaryEncoding(0.250) 441 442 1 152.2 6.6 0.0X +LongDelta(0.125) 111 111 0 604.4 1.7 0.2X -OpenJDK 
64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor LONG Decode (Lower Skew): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -PassThrough 755 758 3 88.8 11.3 1.0X -RunLengthEncoding 1216 1216 0 55.2 18.1 0.6X -DictionaryEncoding 774 774 0 86.8 11.5 1.0X -LongDelta 485 488 2 138.4 7.2 1.6X +PassThrough 713 715 2 94.2 10.6 1.0X +RunLengthEncoding 1192 1192 0 56.3 17.8 0.6X +DictionaryEncoding 686 689 3 97.8 10.2 1.0X +LongDelta 523 526 3 128.4 7.8 1.4X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor LONG Encode (Higher Skew): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -PassThrough(1.000) 13 13 0 5032.6 0.2 1.0X -RunLengthEncoding(1.003) 1033 1035 3 65.0 15.4 0.0X -DictionaryEncoding(0.251) 444 446 3 151.1 6.6 0.0X -LongDelta(0.125) 147 147 1 457.3 2.2 0.1X +PassThrough(1.000) 13 14 0 4998.3 0.2 1.0X +RunLengthEncoding(1.000) 1073 1076 4 62.5 16.0 0.0X +DictionaryEncoding(0.251) 442 442 0 151.8 6.6 0.0X +LongDelta(0.125) 111 112 1 602.7 1.7 0.1X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor LONG Decode (Higher Skew): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -PassThrough 753 755 2 89.1 11.2 1.0X -RunLengthEncoding 1225 1227 3 54.8 18.3 0.6X -DictionaryEncoding 773 774 0 86.8 11.5 1.0X -LongDelta 672 675 6 
99.9 10.0 1.1X +PassThrough 712 714 2 94.3 10.6 1.0X +RunLengthEncoding 1163 1165 2 57.7 17.3 0.6X +DictionaryEncoding 685 686 2 97.9 10.2 1.0X +LongDelta 609 610 2 110.1 9.1 1.2X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor STRING Encode: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -PassThrough(1.000) 20 26 2 3350.9 0.3 1.0X -RunLengthEncoding(0.887) 1812 1813 2 37.0 27.0 0.0X -DictionaryEncoding(0.167) 2262 2263 1 29.7 33.7 0.0X +PassThrough(1.000) 27 27 0 2518.1 0.4 1.0X +RunLengthEncoding(0.892) 1819 1821 2 36.9 27.1 0.0X +DictionaryEncoding(0.167) 2071 2072 1 32.4 30.9 0.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor STRING Decode: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -PassThrough 1645 1650 7 40.8 24.5 1.0X -RunLengthEncoding 2281 2284 4 29.4 34.0 0.7X -DictionaryEncoding 1845 1847 3 36.4 27.5 0.9X +PassThrough 1448 1471 33 46.3 21.6 1.0X +RunLengthEncoding 2222 2227 6 30.2 33.1 0.7X +DictionaryEncoding 1998 2010 17 33.6 29.8 0.7X diff --git a/sql/core/benchmarks/ConstantColumnVectorBenchmark-jdk21-results.txt b/sql/core/benchmarks/ConstantColumnVectorBenchmark-jdk21-results.txt index c53ca57d7242e..d3aa5cb8235de 100644 --- a/sql/core/benchmarks/ConstantColumnVectorBenchmark-jdk21-results.txt +++ b/sql/core/benchmarks/ConstantColumnVectorBenchmark-jdk21-results.txt @@ -1,280 +1,280 @@ -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 
64-Core Processor Test write with StringType, row length = 1: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------------------------------------- -ConstantColumnVector 1 1 0 365410.9 0.0 1.0X -OnHeapColumnVector 3342 3368 36 122.6 8.2 0.0X -OffHeapColumnVector 5519 5519 0 74.2 13.5 0.0X +ConstantColumnVector 1 1 0 372657.0 0.0 1.0X +OnHeapColumnVector 2898 2899 1 141.3 7.1 0.0X +OffHeapColumnVector 5566 5569 4 73.6 13.6 0.0X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Test write with StringType, row length = 5: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------------------------------------- -ConstantColumnVector 1 1 0 399028.5 0.0 1.0X -OnHeapColumnVector 4031 4035 6 101.6 9.8 0.0X -OffHeapColumnVector 4792 4796 6 85.5 11.7 0.0X +ConstantColumnVector 1 1 0 423940.2 0.0 1.0X +OnHeapColumnVector 4102 4103 1 99.9 10.0 0.0X +OffHeapColumnVector 4885 4901 22 83.8 11.9 0.0X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Test write with StringType, row length = 10: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------- -ConstantColumnVector 1 1 0 399083.0 0.0 1.0X -OnHeapColumnVector 4041 4043 4 101.4 9.9 0.0X -OffHeapColumnVector 4684 4701 25 87.5 11.4 0.0X +ConstantColumnVector 1 1 0 423996.4 0.0 1.0X +OnHeapColumnVector 4284 4291 10 95.6 10.5 0.0X +OffHeapColumnVector 5062 5071 13 80.9 12.4 0.0X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 
21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Test write with StringType, row length = 15: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------- -ConstantColumnVector 1 1 0 399168.5 0.0 1.0X -OnHeapColumnVector 4762 4762 0 86.0 11.6 0.0X -OffHeapColumnVector 5314 5316 3 77.1 13.0 0.0X +ConstantColumnVector 1 1 0 423912.6 0.0 1.0X +OnHeapColumnVector 4176 4186 14 98.1 10.2 0.0X +OffHeapColumnVector 4728 4736 11 86.6 11.5 0.0X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Test write with StringType, row length = 20: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------- -ConstantColumnVector 1 1 0 399059.2 0.0 1.0X -OnHeapColumnVector 8010 8011 0 51.1 19.6 0.0X -OffHeapColumnVector 5170 5183 19 79.2 12.6 0.0X +ConstantColumnVector 1 1 0 423965.7 0.0 1.0X +OnHeapColumnVector 4527 4529 2 90.5 11.1 0.0X +OffHeapColumnVector 5110 5116 10 80.2 12.5 0.0X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Test write with StringType, row length = 30: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------- -ConstantColumnVector 1 1 0 399074.8 0.0 1.0X -OnHeapColumnVector 4366 4366 0 93.8 10.7 0.0X -OffHeapColumnVector 4960 4963 4 82.6 12.1 0.0X +ConstantColumnVector 1 1 0 424082.0 0.0 1.0X +OnHeapColumnVector 4366 4372 9 93.8 10.7 0.0X +OffHeapColumnVector 5147 5152 8 79.6 12.6 0.0X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS 
on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Test write with IntegerType: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -ConstantColumnVector 1 1 0 632717.8 0.0 1.0X -OnHeapColumnVector 16 16 0 25522.9 0.0 0.0X -OffHeapColumnVector 65 65 0 6306.1 0.2 0.0X +ConstantColumnVector 1 1 0 664350.5 0.0 1.0X +OnHeapColumnVector 16 16 0 25444.2 0.0 0.0X +OffHeapColumnVector 65 65 0 6275.3 0.2 0.0X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Test write with LongType: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -ConstantColumnVector 1 1 0 577697.2 0.0 1.0X -OnHeapColumnVector 33 33 0 12488.2 0.1 0.0X -OffHeapColumnVector 66 66 1 6198.2 0.2 0.0X +ConstantColumnVector 1 1 0 632713.9 0.0 1.0X +OnHeapColumnVector 33 34 0 12422.6 0.1 0.0X +OffHeapColumnVector 67 68 1 6094.1 0.2 0.0X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Test write with FloatType: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -ConstantColumnVector 1 1 0 442449.7 0.0 1.0X -OnHeapColumnVector 16 16 0 25047.7 0.0 0.1X -OffHeapColumnVector 127 128 0 3216.3 0.3 0.0X +ConstantColumnVector 1 1 0 457739.0 0.0 1.0X +OnHeapColumnVector 16 16 0 25107.7 0.0 0.1X +OffHeapColumnVector 129 129 0 3177.6 0.3 0.0X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit 
Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Test write with DoubleType: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -ConstantColumnVector 1 1 0 491627.0 0.0 1.0X -OnHeapColumnVector 33 33 0 12493.3 0.1 0.0X -OffHeapColumnVector 129 129 0 3184.4 0.3 0.0X +ConstantColumnVector 1 1 0 530954.4 0.0 1.0X +OnHeapColumnVector 34 34 0 12039.3 0.1 0.0X +OffHeapColumnVector 129 129 0 3168.0 0.3 0.0X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Test read with StringType, row length = 1: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------- -ConstantColumnVector 0 0 0 405143422.4 0.0 1.0X -OnHeapColumnVector 309 310 1 1324.1 0.8 0.0X -OffHeapColumnVector 3767 3768 1 108.7 9.2 0.0X +ConstantColumnVector 0 0 0 13274135.5 0.0 1.0X +OnHeapColumnVector 105 106 1 3884.1 0.3 0.0X +OffHeapColumnVector 6540 6543 4 62.6 16.0 0.0X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Test read with StringType, row length = 5: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------- -ConstantColumnVector 0 0 0 405143422.4 0.0 1.0X -OnHeapColumnVector 4118 4123 7 99.5 10.1 0.0X -OffHeapColumnVector 3746 3755 13 109.3 9.1 0.0X +ConstantColumnVector 0 0 0 13274135.5 0.0 1.0X +OnHeapColumnVector 4074 4075 0 100.5 9.9 0.0X +OffHeapColumnVector 6602 6610 12 62.0 16.1 0.0X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 
6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Test read with StringType, row length = 10: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------------------------------------- -ConstantColumnVector 0 0 0 405143422.4 0.0 1.0X -OnHeapColumnVector 4114 4115 2 99.6 10.0 0.0X -OffHeapColumnVector 3744 3763 27 109.4 9.1 0.0X +ConstantColumnVector 0 0 0 13274135.5 0.0 1.0X +OnHeapColumnVector 4052 4056 6 101.1 9.9 0.0X +OffHeapColumnVector 6534 6537 5 62.7 16.0 0.0X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Test read with StringType, row length = 15: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------------------------------------- -ConstantColumnVector 0 0 0 405143422.4 0.0 1.0X -OnHeapColumnVector 4107 4122 21 99.7 10.0 0.0X -OffHeapColumnVector 3763 3779 21 108.8 9.2 0.0X +ConstantColumnVector 0 0 0 13274135.5 0.0 1.0X +OnHeapColumnVector 4056 4058 3 101.0 9.9 0.0X +OffHeapColumnVector 6536 6541 7 62.7 16.0 0.0X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Test read with StringType, row length = 20: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------------------------------------- -ConstantColumnVector 0 0 0 405143422.4 0.0 1.0X -OnHeapColumnVector 4102 4104 4 99.9 10.0 0.0X -OffHeapColumnVector 3820 3824 7 107.2 9.3 0.0X +ConstantColumnVector 0 0 0 13274135.5 0.0 1.0X +OnHeapColumnVector 4046 4053 9 101.2 9.9 0.0X +OffHeapColumnVector 6530 6531 1 62.7 
15.9 0.0X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Test read with StringType, row length = 30: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------------------------------------- -ConstantColumnVector 0 0 0 405143422.4 0.0 1.0X -OnHeapColumnVector 4246 4248 2 96.5 10.4 0.0X -OffHeapColumnVector 3743 3777 48 109.4 9.1 0.0X +ConstantColumnVector 0 0 0 13274135.5 0.0 1.0X +OnHeapColumnVector 4059 4061 3 100.9 9.9 0.0X +OffHeapColumnVector 6537 6538 2 62.7 16.0 0.0X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Test read with IntegerType: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -ConstantColumnVector 0 0 0 26549131.4 0.0 1.0X -OnHeapColumnVector 1 1 0 492066.4 0.0 0.0X -OffHeapColumnVector 889 890 2 461.0 2.2 0.0X +ConstantColumnVector 0 0 0 13274135.5 0.0 1.0X +OnHeapColumnVector 1 1 0 474473.3 0.0 0.0X +OffHeapColumnVector 893 894 1 458.6 2.2 0.0X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Test read with LongType: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -ConstantColumnVector 1939 1940 2 211.3 4.7 1.0X -OnHeapColumnVector 2075 2089 19 197.4 5.1 0.9X -OffHeapColumnVector 2601 2603 2 157.5 6.3 0.7X +ConstantColumnVector 1953 1955 2 209.7 4.8 1.0X +OnHeapColumnVector 2072 2077 7 197.7 5.1 0.9X +OffHeapColumnVector 2604 
2608 5 157.3 6.4 0.8X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Test read with FloatType: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -ConstantColumnVector 1846 1848 2 221.9 4.5 1.0X -OnHeapColumnVector 2099 2101 4 195.2 5.1 0.9X -OffHeapColumnVector 2613 2638 35 156.7 6.4 0.7X +ConstantColumnVector 1845 1846 2 222.0 4.5 1.0X +OnHeapColumnVector 2101 2103 4 195.0 5.1 0.9X +OffHeapColumnVector 2613 2615 3 156.8 6.4 0.7X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Test read with DoubleType: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -ConstantColumnVector 1986 1987 1 206.3 4.8 1.0X -OnHeapColumnVector 2120 2121 1 193.2 5.2 0.9X -OffHeapColumnVector 2753 2753 0 148.8 6.7 0.7X +ConstantColumnVector 1985 1986 1 206.3 4.8 1.0X +OnHeapColumnVector 2120 2123 4 193.2 5.2 0.9X +OffHeapColumnVector 2758 2762 5 148.5 6.7 0.7X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Test write and read with StringType, row length = 1: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ----------------------------------------------------------------------------------------------------------------------------------- -ConstantColumnVector 1943 1943 0 210.8 4.7 1.0X -OnHeapColumnVector 5899 5903 5 69.4 14.4 0.3X -OffHeapColumnVector 5086 5089 5 80.5 12.4 0.4X +ConstantColumnVector 1948 1952 5 210.2 4.8 1.0X +OnHeapColumnVector 5737 5746 13 
71.4 14.0 0.3X +OffHeapColumnVector 8493 8494 2 48.2 20.7 0.2X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Test write and read with StringType, row length = 5: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ----------------------------------------------------------------------------------------------------------------------------------- -ConstantColumnVector 1943 1943 1 210.8 4.7 1.0X -OnHeapColumnVector 5919 5922 5 69.2 14.5 0.3X -OffHeapColumnVector 5089 5096 10 80.5 12.4 0.4X +ConstantColumnVector 1950 1951 1 210.0 4.8 1.0X +OnHeapColumnVector 5657 5657 1 72.4 13.8 0.3X +OffHeapColumnVector 8500 8502 3 48.2 20.8 0.2X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Test write and read with StringType, row length = 10: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------------ -ConstantColumnVector 1940 1946 9 211.1 4.7 1.0X -OnHeapColumnVector 5901 5907 8 69.4 14.4 0.3X -OffHeapColumnVector 5132 5142 14 79.8 12.5 0.4X +ConstantColumnVector 1948 1949 1 210.3 4.8 1.0X +OnHeapColumnVector 5765 5765 1 71.0 14.1 0.3X +OffHeapColumnVector 8512 8533 29 48.1 20.8 0.2X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Test write and read with StringType, row length = 15: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------------ -ConstantColumnVector 1943 1944 2 210.8 4.7 1.0X -OnHeapColumnVector 5913 5914 1 69.3 14.4 0.3X 
-OffHeapColumnVector 5133 5159 37 79.8 12.5 0.4X +ConstantColumnVector 1949 1950 0 210.1 4.8 1.0X +OnHeapColumnVector 5660 5670 15 72.4 13.8 0.3X +OffHeapColumnVector 8502 8505 4 48.2 20.8 0.2X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Test write and read with StringType, row length = 20: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------------ -ConstantColumnVector 1945 1949 6 210.6 4.7 1.0X -OnHeapColumnVector 5954 5955 2 68.8 14.5 0.3X -OffHeapColumnVector 5081 5083 3 80.6 12.4 0.4X +ConstantColumnVector 1952 1956 6 209.9 4.8 1.0X +OnHeapColumnVector 5742 5745 3 71.3 14.0 0.3X +OffHeapColumnVector 8555 8574 26 47.9 20.9 0.2X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Test write and read with StringType, row length = 30: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------------ -ConstantColumnVector 1956 1957 2 209.4 4.8 1.0X -OnHeapColumnVector 5956 5997 58 68.8 14.5 0.3X -OffHeapColumnVector 5076 5077 1 80.7 12.4 0.4X +ConstantColumnVector 1956 1957 0 209.4 4.8 1.0X +OnHeapColumnVector 5657 5661 4 72.4 13.8 0.3X +OffHeapColumnVector 8523 8539 23 48.1 20.8 0.2X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Test write and read with IntegerType: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ 
-ConstantColumnVector 888 888 0 461.4 2.2 1.0X -OnHeapColumnVector 889 890 1 461.0 2.2 1.0X -OffHeapColumnVector 888 889 1 461.3 2.2 1.0X +ConstantColumnVector 892 892 1 459.3 2.2 1.0X +OnHeapColumnVector 1020 1021 1 401.5 2.5 0.9X +OffHeapColumnVector 892 893 1 459.0 2.2 1.0X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Test write and read with LongType: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -ConstantColumnVector 2850 2850 0 143.7 7.0 1.0X -OnHeapColumnVector 2978 2978 1 137.6 7.3 1.0X -OffHeapColumnVector 2977 2978 1 137.6 7.3 1.0X +ConstantColumnVector 2866 2869 4 142.9 7.0 1.0X +OnHeapColumnVector 2993 2994 0 136.8 7.3 1.0X +OffHeapColumnVector 2991 2993 3 137.0 7.3 1.0X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Test write and read with FloatType: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -ConstantColumnVector 2867 2872 6 142.9 7.0 1.0X -OnHeapColumnVector 2993 2994 1 136.8 7.3 1.0X -OffHeapColumnVector 2991 2995 5 136.9 7.3 1.0X +ConstantColumnVector 2877 2892 21 142.4 7.0 1.0X +OnHeapColumnVector 3135 3136 3 130.7 7.7 0.9X +OffHeapColumnVector 3012 3013 1 136.0 7.4 1.0X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Test write and read with DoubleType: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
------------------------------------------------------------------------------------------------------------------------ -ConstantColumnVector 3009 3011 3 136.1 7.3 1.0X -OnHeapColumnVector 3137 3139 3 130.6 7.7 1.0X -OffHeapColumnVector 3141 3142 2 130.4 7.7 1.0X +ConstantColumnVector 2381 2381 0 172.1 5.8 1.0X +OnHeapColumnVector 3157 3158 3 129.8 7.7 0.8X +OffHeapColumnVector 3148 3149 1 130.1 7.7 0.8X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Test isNull with StringType: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ ConstantColumnVector 0 0 0 405143422.4 0.0 1.0X -OnHeapColumnVector 0 0 0 3321197.8 0.0 0.0X +OnHeapColumnVector 0 0 0 3321413.2 0.0 0.0X OffHeapColumnVector 0 0 0 405143422.4 0.0 1.0X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Test isNull with IntegerType: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ ConstantColumnVector 0 0 0 405143422.4 0.0 1.0X -OnHeapColumnVector 0 0 0 3321197.8 0.0 0.0X +OnHeapColumnVector 0 0 0 3321440.2 0.0 0.0X OffHeapColumnVector 0 0 0 405143422.4 0.0 1.0X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Test isNull with LongType: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ ConstantColumnVector 0 0 0 405143422.4 0.0 1.0X -OnHeapColumnVector 0 0 0 
3321197.8 0.0 0.0X +OnHeapColumnVector 0 0 0 3321440.2 0.0 0.0X OffHeapColumnVector 0 0 0 405143422.4 0.0 1.0X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Test isNull with FloatType: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ ConstantColumnVector 0 0 0 405143422.4 0.0 1.0X -OnHeapColumnVector 0 0 0 3321197.8 0.0 0.0X +OnHeapColumnVector 0 0 0 3321440.2 0.0 0.0X OffHeapColumnVector 0 0 0 405143422.4 0.0 1.0X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Test isNull with DoubleType: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ ConstantColumnVector 0 0 0 405143422.4 0.0 1.0X -OnHeapColumnVector 0 0 0 3321467.1 0.0 0.0X +OnHeapColumnVector 0 0 0 3321440.2 0.0 0.0X OffHeapColumnVector 0 0 0 405143422.4 0.0 1.0X diff --git a/sql/core/benchmarks/ConstantColumnVectorBenchmark-results.txt b/sql/core/benchmarks/ConstantColumnVectorBenchmark-results.txt index c381cbab325fc..39aedf6270830 100644 --- a/sql/core/benchmarks/ConstantColumnVectorBenchmark-results.txt +++ b/sql/core/benchmarks/ConstantColumnVectorBenchmark-results.txt @@ -1,280 +1,280 @@ -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Test write with StringType, row length = 1: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------------------------------------- 
-ConstantColumnVector 1 1 0 329000.5 0.0 1.0X -OnHeapColumnVector 2882 2884 3 142.1 7.0 0.0X -OffHeapColumnVector 3380 3382 2 121.2 8.3 0.0X +ConstantColumnVector 1 1 0 324095.0 0.0 1.0X +OnHeapColumnVector 2813 2814 2 145.6 6.9 0.0X +OffHeapColumnVector 3407 3412 7 120.2 8.3 0.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Test write with StringType, row length = 5: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------------------------------------- -ConstantColumnVector 1 1 0 387377.7 0.0 1.0X -OnHeapColumnVector 3661 3670 12 111.9 8.9 0.0X -OffHeapColumnVector 4386 4388 3 93.4 10.7 0.0X +ConstantColumnVector 1 1 0 382856.0 0.0 1.0X +OnHeapColumnVector 4041 4044 4 101.4 9.9 0.0X +OffHeapColumnVector 4288 4289 1 95.5 10.5 0.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Test write with StringType, row length = 10: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------- -ConstantColumnVector 1 1 0 387399.7 0.0 1.0X -OnHeapColumnVector 3915 3918 4 104.6 9.6 0.0X -OffHeapColumnVector 4559 4560 2 89.8 11.1 0.0X +ConstantColumnVector 1 1 0 383128.5 0.0 1.0X +OnHeapColumnVector 4013 4014 2 102.1 9.8 0.0X +OffHeapColumnVector 4353 4355 2 94.1 10.6 0.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Test write with StringType, row length = 15: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
--------------------------------------------------------------------------------------------------------------------------- -ConstantColumnVector 1 1 0 387538.2 0.0 1.0X -OnHeapColumnVector 3628 3632 6 112.9 8.9 0.0X -OffHeapColumnVector 4489 4490 2 91.2 11.0 0.0X +ConstantColumnVector 1 1 0 383052.9 0.0 1.0X +OnHeapColumnVector 3818 3820 3 107.3 9.3 0.0X +OffHeapColumnVector 4644 4645 1 88.2 11.3 0.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Test write with StringType, row length = 20: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------- -ConstantColumnVector 1 1 0 387487.6 0.0 1.0X -OnHeapColumnVector 4219 4222 5 97.1 10.3 0.0X -OffHeapColumnVector 4701 4702 2 87.1 11.5 0.0X +ConstantColumnVector 1 1 0 383078.3 0.0 1.0X +OnHeapColumnVector 4128 4139 16 99.2 10.1 0.0X +OffHeapColumnVector 4602 4605 4 89.0 11.2 0.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Test write with StringType, row length = 30: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------- -ConstantColumnVector 1 1 0 387405.9 0.0 1.0X -OnHeapColumnVector 4336 4342 8 94.5 10.6 0.0X -OffHeapColumnVector 4376 4376 0 93.6 10.7 0.0X +ConstantColumnVector 1 1 0 383207.7 0.0 1.0X +OnHeapColumnVector 4274 4280 8 95.8 10.4 0.0X +OffHeapColumnVector 4583 4584 2 89.4 11.2 0.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Test write with IntegerType: Best Time(ms) Avg Time(ms) 
Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -ConstantColumnVector 1 1 0 699323.4 0.0 1.0X -OnHeapColumnVector 16 16 0 25587.5 0.0 0.0X -OffHeapColumnVector 65 65 0 6320.1 0.2 0.0X +ConstantColumnVector 1 1 0 699303.1 0.0 1.0X +OnHeapColumnVector 16 16 0 25461.3 0.0 0.0X +OffHeapColumnVector 66 66 1 6242.8 0.2 0.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Test write with LongType: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -ConstantColumnVector 1 1 0 664355.9 0.0 1.0X -OnHeapColumnVector 33 34 0 12331.0 0.1 0.0X -OffHeapColumnVector 67 67 0 6114.9 0.2 0.0X +ConstantColumnVector 1 1 0 664337.6 0.0 1.0X +OnHeapColumnVector 34 34 0 12100.9 0.1 0.0X +OffHeapColumnVector 68 69 0 5986.6 0.2 0.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Test write with FloatType: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -ConstantColumnVector 1 1 0 553059.9 0.0 1.0X -OnHeapColumnVector 16 16 0 25179.1 0.0 0.0X -OffHeapColumnVector 127 127 0 3217.6 0.3 0.0X +ConstantColumnVector 1 1 0 553053.1 0.0 1.0X +OnHeapColumnVector 16 16 0 25009.2 0.0 0.0X +OffHeapColumnVector 128 128 0 3191.8 0.3 0.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Test write with DoubleType: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) 
Relative ------------------------------------------------------------------------------------------------------------------------ -ConstantColumnVector 1 1 0 510543.7 0.0 1.0X -OnHeapColumnVector 34 34 0 12081.9 0.1 0.0X -OffHeapColumnVector 128 129 0 3191.4 0.3 0.0X +ConstantColumnVector 1 1 0 510537.3 0.0 1.0X +OnHeapColumnVector 34 35 0 11938.3 0.1 0.0X +OffHeapColumnVector 129 130 0 3165.4 0.3 0.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Test read with StringType, row length = 1: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------- -ConstantColumnVector 1041 1041 1 393.6 2.5 1.0X -OnHeapColumnVector 2191 2191 0 186.9 5.3 0.5X -OffHeapColumnVector 4378 4379 1 93.6 10.7 0.2X +ConstantColumnVector 1051 1051 1 389.8 2.6 1.0X +OnHeapColumnVector 2133 2135 2 192.0 5.2 0.5X +OffHeapColumnVector 4374 4376 2 93.6 10.7 0.2X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Test read with StringType, row length = 5: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------- -ConstantColumnVector 826 827 2 496.1 2.0 1.0X -OnHeapColumnVector 4856 4859 4 84.4 11.9 0.2X -OffHeapColumnVector 4645 4667 32 88.2 11.3 0.2X +ConstantColumnVector 836 836 0 490.1 2.0 1.0X +OnHeapColumnVector 4993 4994 0 82.0 12.2 0.2X +OffHeapColumnVector 4488 4489 1 91.3 11.0 0.2X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Test read with StringType, row length = 10: Best Time(ms) 
Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------------------------------------- -ConstantColumnVector 828 828 1 494.9 2.0 1.0X -OnHeapColumnVector 4917 4918 2 83.3 12.0 0.2X -OffHeapColumnVector 4624 4631 9 88.6 11.3 0.2X +ConstantColumnVector 835 836 2 490.6 2.0 1.0X +OnHeapColumnVector 5030 5032 3 81.4 12.3 0.2X +OffHeapColumnVector 4509 4513 5 90.8 11.0 0.2X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Test read with StringType, row length = 15: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------------------------------------- -ConstantColumnVector 826 828 2 495.9 2.0 1.0X -OnHeapColumnVector 4914 4917 5 83.4 12.0 0.2X -OffHeapColumnVector 4635 4637 3 88.4 11.3 0.2X +ConstantColumnVector 838 840 3 489.1 2.0 1.0X +OnHeapColumnVector 5039 5045 8 81.3 12.3 0.2X +OffHeapColumnVector 4522 4523 2 90.6 11.0 0.2X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Test read with StringType, row length = 20: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------------------------------------- -ConstantColumnVector 827 829 2 495.1 2.0 1.0X -OnHeapColumnVector 4931 4932 1 83.1 12.0 0.2X -OffHeapColumnVector 4642 4644 2 88.2 11.3 0.2X +ConstantColumnVector 833 836 3 491.5 2.0 1.0X +OnHeapColumnVector 5044 5045 0 81.2 12.3 0.2X +OffHeapColumnVector 4500 4502 3 91.0 11.0 0.2X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Test 
read with StringType, row length = 30: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------------------------------------- -ConstantColumnVector 826 827 1 496.0 2.0 1.0X -OnHeapColumnVector 4908 4921 19 83.5 12.0 0.2X -OffHeapColumnVector 4627 4628 1 88.5 11.3 0.2X +ConstantColumnVector 835 836 1 490.4 2.0 1.0X +OnHeapColumnVector 5040 5042 3 81.3 12.3 0.2X +OffHeapColumnVector 4499 4499 1 91.0 11.0 0.2X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Test read with IntegerType: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -ConstantColumnVector 1811 1811 0 226.2 4.4 1.0X -OnHeapColumnVector 2128 2130 4 192.5 5.2 0.9X -OffHeapColumnVector 2340 2343 3 175.0 5.7 0.8X +ConstantColumnVector 2605 2605 1 157.3 6.4 1.0X +OnHeapColumnVector 2723 2724 2 150.4 6.6 1.0X +OffHeapColumnVector 2729 2730 1 150.1 6.7 1.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Test read with LongType: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -ConstantColumnVector 0 0 0 2657221.1 0.0 1.0X -OnHeapColumnVector 0 0 0 1022070.8 0.0 0.4X -OffHeapColumnVector 691 692 1 592.6 1.7 0.0X +ConstantColumnVector 0 0 0 1476302.0 0.0 1.0X +OnHeapColumnVector 0 0 0 1022060.6 0.0 0.7X +OffHeapColumnVector 767 767 0 534.2 1.9 0.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core 
Processor Test read with FloatType: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -ConstantColumnVector 0 0 0 2214485.0 0.0 1.0X -OnHeapColumnVector 0 1 0 949064.3 0.0 0.4X -OffHeapColumnVector 767 769 3 533.8 1.9 0.0X +ConstantColumnVector 0 0 0 1660780.7 0.0 1.0X +OnHeapColumnVector 0 0 0 1022032.6 0.0 0.6X +OffHeapColumnVector 766 767 1 534.8 1.9 0.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Test read with DoubleType: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -ConstantColumnVector 0 0 0 1022070.8 0.0 1.0X -OnHeapColumnVector 1 1 0 738160.3 0.0 0.7X -OffHeapColumnVector 762 762 0 537.5 1.9 0.0X +ConstantColumnVector 0 0 0 1476307.4 0.0 1.0X +OnHeapColumnVector 0 0 0 1022058.1 0.0 0.7X +OffHeapColumnVector 767 767 0 534.2 1.9 0.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Test write and read with StringType, row length = 1: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ----------------------------------------------------------------------------------------------------------------------------------- -ConstantColumnVector 0 0 0 1660794.1 0.0 1.0X -OnHeapColumnVector 3784 3785 2 108.3 9.2 0.0X -OffHeapColumnVector 3768 3782 20 108.7 9.2 0.0X +ConstantColumnVector 0 0 0 1021777.6 0.0 1.0X +OnHeapColumnVector 3918 3923 7 104.5 9.6 0.0X +OffHeapColumnVector 3743 3752 12 109.4 9.1 0.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD 
EPYC 7763 64-Core Processor Test write and read with StringType, row length = 5: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ----------------------------------------------------------------------------------------------------------------------------------- -ConstantColumnVector 0 0 0 1660794.1 0.0 1.0X -OnHeapColumnVector 3788 3808 28 108.1 9.2 0.0X -OffHeapColumnVector 3680 3687 10 111.3 9.0 0.0X +ConstantColumnVector 0 0 0 857165.6 0.0 1.0X +OnHeapColumnVector 3933 3938 7 104.1 9.6 0.0X +OffHeapColumnVector 3737 3748 16 109.6 9.1 0.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Test write and read with StringType, row length = 10: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------------ -ConstantColumnVector 0 0 0 1660794.1 0.0 1.0X -OnHeapColumnVector 3804 3807 5 107.7 9.3 0.0X -OffHeapColumnVector 3712 3713 1 110.3 9.1 0.0X +ConstantColumnVector 0 0 0 857165.6 0.0 1.0X +OnHeapColumnVector 3930 3930 1 104.2 9.6 0.0X +OffHeapColumnVector 3736 3736 1 109.6 9.1 0.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Test write and read with StringType, row length = 15: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------------ -ConstantColumnVector 0 0 0 1660794.1 0.0 1.0X -OnHeapColumnVector 3801 3802 2 107.8 9.3 0.0X -OffHeapColumnVector 3704 3704 1 110.6 9.0 0.0X +ConstantColumnVector 0 0 0 857165.6 0.0 1.0X +OnHeapColumnVector 3922 3923 1 104.4 9.6 0.0X +OffHeapColumnVector 3742 3743 1 109.5 9.1 0.0X -OpenJDK 64-Bit Server VM 
17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Test write and read with StringType, row length = 20: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------------ -ConstantColumnVector 0 0 0 1660794.1 0.0 1.0X -OnHeapColumnVector 3795 3797 2 107.9 9.3 0.0X -OffHeapColumnVector 3703 3715 16 110.6 9.0 0.0X +ConstantColumnVector 0 0 0 857165.6 0.0 1.0X +OnHeapColumnVector 3920 3926 8 104.5 9.6 0.0X +OffHeapColumnVector 3745 3753 12 109.4 9.1 0.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Test write and read with StringType, row length = 30: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------------ -ConstantColumnVector 0 0 0 1660794.1 0.0 1.0X -OnHeapColumnVector 3794 3797 4 108.0 9.3 0.0X -OffHeapColumnVector 3719 3720 1 110.1 9.1 0.0X +ConstantColumnVector 0 0 0 857183.5 0.0 1.0X +OnHeapColumnVector 3920 3926 9 104.5 9.6 0.0X +OffHeapColumnVector 3723 3725 3 110.0 9.1 0.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Test write and read with IntegerType: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -ConstantColumnVector 3673 3675 3 111.5 9.0 1.0X -OnHeapColumnVector 2448 2450 3 167.3 6.0 1.5X -OffHeapColumnVector 2585 2585 1 158.5 6.3 1.4X +ConstantColumnVector 3097 3099 3 132.3 7.6 1.0X +OnHeapColumnVector 2732 2733 1 
149.9 6.7 1.1X +OffHeapColumnVector 2741 2742 1 149.4 6.7 1.1X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Test write and read with LongType: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -ConstantColumnVector 0 0 0 2657221.1 0.0 1.0X -OnHeapColumnVector 651 652 1 629.3 1.6 0.0X -OffHeapColumnVector 691 692 1 592.4 1.7 0.0X +ConstantColumnVector 765 766 1 535.4 1.9 1.0X +OnHeapColumnVector 774 774 1 529.3 1.9 1.0X +OffHeapColumnVector 830 831 2 493.6 2.0 0.9X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Test write and read with FloatType: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -ConstantColumnVector 887 888 1 461.6 2.2 1.0X -OnHeapColumnVector 764 764 0 535.9 1.9 1.2X -OffHeapColumnVector 762 763 1 537.5 1.9 1.2X +ConstantColumnVector 765 768 3 535.2 1.9 1.0X +OnHeapColumnVector 772 773 1 530.4 1.9 1.0X +OffHeapColumnVector 831 832 1 492.7 2.0 0.9X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Test write and read with DoubleType: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -ConstantColumnVector 761 761 0 538.5 1.9 1.0X -OnHeapColumnVector 765 765 1 535.7 1.9 1.0X -OffHeapColumnVector 763 763 1 537.2 1.9 1.0X +ConstantColumnVector 892 893 1 459.2 2.2 1.0X +OnHeapColumnVector 774 775 1 
528.9 1.9 1.2X +OffHeapColumnVector 831 831 0 493.0 2.0 1.1X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Test isNull with StringType: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ ConstantColumnVector 0 0 0 409190809.2 0.0 1.0X -OnHeapColumnVector 0 0 0 2211973.6 0.0 0.0X +OnHeapColumnVector 0 0 0 2211949.7 0.0 0.0X OffHeapColumnVector 0 0 0 409190809.2 0.0 1.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Test isNull with IntegerType: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ ConstantColumnVector 0 0 0 409190809.2 0.0 1.0X -OnHeapColumnVector 0 0 0 2211985.5 0.0 0.0X +OnHeapColumnVector 0 0 0 2211949.7 0.0 0.0X OffHeapColumnVector 0 0 0 409190809.2 0.0 1.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Test isNull with LongType: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ ConstantColumnVector 0 0 0 409190809.2 0.0 1.0X -OnHeapColumnVector 0 0 0 2211985.5 0.0 0.0X +OnHeapColumnVector 0 0 0 2211949.7 0.0 0.0X OffHeapColumnVector 0 0 0 409190809.2 0.0 1.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Test isNull with FloatType: Best Time(ms) Avg Time(ms) Stdev(ms) 
Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ ConstantColumnVector 0 0 0 409190809.2 0.0 1.0X -OnHeapColumnVector 0 0 0 2211985.5 0.0 0.0X +OnHeapColumnVector 0 0 0 2211949.7 0.0 0.0X OffHeapColumnVector 0 0 0 409190809.2 0.0 1.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Test isNull with DoubleType: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ ConstantColumnVector 0 0 0 409190809.2 0.0 1.0X -OnHeapColumnVector 0 0 0 2211985.5 0.0 0.0X +OnHeapColumnVector 0 0 0 2211949.7 0.0 0.0X OffHeapColumnVector 0 0 0 409190809.2 0.0 1.0X diff --git a/sql/core/benchmarks/DataSourceReadBenchmark-jdk21-results.txt b/sql/core/benchmarks/DataSourceReadBenchmark-jdk21-results.txt index bdc453db1735d..ea578d9f6d8aa 100644 --- a/sql/core/benchmarks/DataSourceReadBenchmark-jdk21-results.txt +++ b/sql/core/benchmarks/DataSourceReadBenchmark-jdk21-results.txt @@ -2,437 +2,437 @@ SQL Single Numeric Column Scan ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor SQL Single BOOLEAN Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 10214 10246 45 1.5 649.4 1.0X -SQL Json 7831 7865 48 2.0 497.9 1.3X -SQL Json with UnsafeRow 8565 8571 8 1.8 544.6 1.2X -SQL Parquet Vectorized: DataPageV1 81 96 11 193.3 5.2 125.6X -SQL Parquet Vectorized: DataPageV2 201 210 8 78.4 
12.8 50.9X -SQL Parquet MR: DataPageV1 1794 1818 34 8.8 114.1 5.7X -SQL Parquet MR: DataPageV2 1650 1651 1 9.5 104.9 6.2X -SQL ORC Vectorized 120 132 8 130.5 7.7 84.8X -SQL ORC MR 1447 1453 9 10.9 92.0 7.1X - -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +SQL CSV 10281 10314 46 1.5 653.6 1.0X +SQL Json 7952 8108 220 2.0 505.6 1.3X +SQL Json with UnsafeRow 9090 9092 3 1.7 577.9 1.1X +SQL Parquet Vectorized: DataPageV1 82 94 10 192.5 5.2 125.8X +SQL Parquet Vectorized: DataPageV2 92 99 8 171.9 5.8 112.3X +SQL Parquet MR: DataPageV1 1701 1728 38 9.2 108.2 6.0X +SQL Parquet MR: DataPageV2 1594 1607 19 9.9 101.3 6.5X +SQL ORC Vectorized 137 142 6 114.9 8.7 75.1X +SQL ORC MR 1464 1465 2 10.7 93.1 7.0X + +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Parquet Reader Single BOOLEAN Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------- -ParquetReader Vectorized: DataPageV1 84 86 1 187.3 5.3 1.0X -ParquetReader Vectorized: DataPageV2 208 211 4 75.7 13.2 0.4X -ParquetReader Vectorized -> Row: DataPageV1 72 73 1 219.2 4.6 1.2X -ParquetReader Vectorized -> Row: DataPageV2 199 201 4 79.2 12.6 0.4X +ParquetReader Vectorized: DataPageV1 84 86 2 186.8 5.4 1.0X +ParquetReader Vectorized: DataPageV2 100 101 1 157.9 6.3 0.8X +ParquetReader Vectorized -> Row: DataPageV1 73 74 1 216.3 4.6 1.2X +ParquetReader Vectorized -> Row: DataPageV2 90 91 1 175.2 5.7 0.9X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor SQL Single TINYINT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 9574 9607 46 1.6 608.7 
1.0X -SQL Json 8719 8757 55 1.8 554.3 1.1X -SQL Json with UnsafeRow 9120 9130 13 1.7 579.9 1.0X -SQL Parquet Vectorized: DataPageV1 95 101 5 164.9 6.1 100.4X -SQL Parquet Vectorized: DataPageV2 95 104 8 165.3 6.0 100.6X -SQL Parquet MR: DataPageV1 1927 1938 15 8.2 122.5 5.0X -SQL Parquet MR: DataPageV2 1792 1851 84 8.8 114.0 5.3X -SQL ORC Vectorized 110 118 7 143.1 7.0 87.1X -SQL ORC MR 1579 1582 4 10.0 100.4 6.1X - -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +SQL CSV 9866 9904 53 1.6 627.3 1.0X +SQL Json 9122 9125 5 1.7 579.9 1.1X +SQL Json with UnsafeRow 10109 10124 20 1.6 642.7 1.0X +SQL Parquet Vectorized: DataPageV1 96 104 8 163.5 6.1 102.6X +SQL Parquet Vectorized: DataPageV2 98 111 8 160.7 6.2 100.8X +SQL Parquet MR: DataPageV1 1870 1883 19 8.4 118.9 5.3X +SQL Parquet MR: DataPageV2 1857 1895 54 8.5 118.1 5.3X +SQL ORC Vectorized 139 149 15 113.1 8.8 70.9X +SQL ORC MR 1588 1591 4 9.9 101.0 6.2X + +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Parquet Reader Single TINYINT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------- -ParquetReader Vectorized: DataPageV1 80 83 2 196.0 5.1 1.0X -ParquetReader Vectorized: DataPageV2 81 83 1 194.9 5.1 1.0X -ParquetReader Vectorized -> Row: DataPageV1 44 46 2 353.7 2.8 1.8X -ParquetReader Vectorized -> Row: DataPageV2 45 46 1 352.4 2.8 1.8X +ParquetReader Vectorized: DataPageV1 82 84 2 191.5 5.2 1.0X +ParquetReader Vectorized: DataPageV2 85 98 7 184.5 5.4 1.0X +ParquetReader Vectorized -> Row: DataPageV1 46 51 6 341.6 2.9 1.8X +ParquetReader Vectorized -> Row: DataPageV2 46 50 5 339.9 2.9 1.8X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor SQL Single SMALLINT Column Scan: Best Time(ms) 
Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 10409 10436 39 1.5 661.8 1.0X -SQL Json 8942 8944 4 1.8 568.5 1.2X -SQL Json with UnsafeRow 9693 9697 5 1.6 616.3 1.1X -SQL Parquet Vectorized: DataPageV1 118 134 17 133.8 7.5 88.5X -SQL Parquet Vectorized: DataPageV2 139 152 16 113.5 8.8 75.1X -SQL Parquet MR: DataPageV1 2019 2054 50 7.8 128.4 5.2X -SQL Parquet MR: DataPageV2 2011 2011 0 7.8 127.9 5.2X -SQL ORC Vectorized 140 148 8 112.1 8.9 74.2X -SQL ORC MR 1818 1825 10 8.7 115.6 5.7X - -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +SQL CSV 10575 10586 14 1.5 672.4 1.0X +SQL Json 9463 9503 57 1.7 601.6 1.1X +SQL Json with UnsafeRow 10388 10399 15 1.5 660.5 1.0X +SQL Parquet Vectorized: DataPageV1 118 131 14 133.4 7.5 89.7X +SQL Parquet Vectorized: DataPageV2 140 183 19 112.4 8.9 75.6X +SQL Parquet MR: DataPageV1 2010 2013 4 7.8 127.8 5.3X +SQL Parquet MR: DataPageV2 2018 2038 28 7.8 128.3 5.2X +SQL ORC Vectorized 139 172 28 113.3 8.8 76.2X +SQL ORC MR 1687 1701 20 9.3 107.3 6.3X + +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Parquet Reader Single SMALLINT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------- -ParquetReader Vectorized: DataPageV1 138 146 9 113.8 8.8 1.0X -ParquetReader Vectorized: DataPageV2 169 176 10 93.2 10.7 0.8X -ParquetReader Vectorized -> Row: DataPageV1 134 139 5 117.0 8.5 1.0X -ParquetReader Vectorized -> Row: DataPageV2 183 186 5 86.1 11.6 0.8X +ParquetReader Vectorized: DataPageV1 149 155 5 105.3 9.5 1.0X +ParquetReader Vectorized: DataPageV2 178 184 7 88.2 11.3 0.8X +ParquetReader Vectorized -> Row: DataPageV1 135 140 5 116.9 8.6 1.1X +ParquetReader Vectorized -> Row: DataPageV2 166 
176 10 95.0 10.5 0.9X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor SQL Single INT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 11428 11435 9 1.4 726.6 1.0X -SQL Json 9048 9074 37 1.7 575.2 1.3X -SQL Json with UnsafeRow 9790 9800 14 1.6 622.4 1.2X -SQL Parquet Vectorized: DataPageV1 97 110 13 162.2 6.2 117.8X -SQL Parquet Vectorized: DataPageV2 176 197 18 89.2 11.2 64.8X -SQL Parquet MR: DataPageV1 1974 1978 6 8.0 125.5 5.8X -SQL Parquet MR: DataPageV2 2028 2031 5 7.8 128.9 5.6X -SQL ORC Vectorized 177 201 27 89.0 11.2 64.6X -SQL ORC MR 2053 2059 9 7.7 130.5 5.6X - -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +SQL CSV 11729 11735 10 1.3 745.7 1.0X +SQL Json 9804 9835 43 1.6 623.3 1.2X +SQL Json with UnsafeRow 10754 10760 9 1.5 683.7 1.1X +SQL Parquet Vectorized: DataPageV1 97 113 14 162.9 6.1 121.5X +SQL Parquet Vectorized: DataPageV2 176 191 12 89.3 11.2 66.6X +SQL Parquet MR: DataPageV1 1949 1973 34 8.1 123.9 6.0X +SQL Parquet MR: DataPageV2 2019 2034 21 7.8 128.4 5.8X +SQL ORC Vectorized 180 190 17 87.6 11.4 65.3X +SQL ORC MR 1692 1707 22 9.3 107.5 6.9X + +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Parquet Reader Single INT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------- -ParquetReader Vectorized: DataPageV1 158 162 5 99.5 10.0 1.0X -ParquetReader Vectorized: DataPageV2 237 248 18 66.4 15.1 0.7X -ParquetReader Vectorized -> Row: DataPageV1 128 134 7 122.5 8.2 1.2X -ParquetReader Vectorized -> Row: DataPageV2 209 216 6 75.3 13.3 0.8X +ParquetReader 
Vectorized: DataPageV1 130 138 6 120.9 8.3 1.0X +ParquetReader Vectorized: DataPageV2 214 219 6 73.6 13.6 0.6X +ParquetReader Vectorized -> Row: DataPageV1 129 133 5 122.0 8.2 1.0X +ParquetReader Vectorized -> Row: DataPageV2 225 246 24 69.8 14.3 0.6X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor SQL Single BIGINT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 11758 11763 8 1.3 747.6 1.0X -SQL Json 9255 9264 12 1.7 588.4 1.3X -SQL Json with UnsafeRow 9871 9876 6 1.6 627.6 1.2X -SQL Parquet Vectorized: DataPageV1 286 308 13 54.9 18.2 41.1X -SQL Parquet Vectorized: DataPageV2 238 269 14 66.0 15.2 49.3X -SQL Parquet MR: DataPageV1 2493 2494 1 6.3 158.5 4.7X -SQL Parquet MR: DataPageV2 2053 2054 2 7.7 130.5 5.7X -SQL ORC Vectorized 165 174 10 95.5 10.5 71.4X -SQL ORC MR 1821 1822 1 8.6 115.8 6.5X - -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +SQL CSV 11573 11671 139 1.4 735.8 1.0X +SQL Json 9549 9558 12 1.6 607.1 1.2X +SQL Json with UnsafeRow 10532 10532 1 1.5 669.6 1.1X +SQL Parquet Vectorized: DataPageV1 279 300 17 56.3 17.8 41.4X +SQL Parquet Vectorized: DataPageV2 248 272 11 63.5 15.7 46.7X +SQL Parquet MR: DataPageV1 2453 2454 2 6.4 156.0 4.7X +SQL Parquet MR: DataPageV2 1991 1997 8 7.9 126.6 5.8X +SQL ORC Vectorized 166 179 12 94.5 10.6 69.5X +SQL ORC MR 1773 1776 4 8.9 112.7 6.5X + +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Parquet Reader Single BIGINT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------- -ParquetReader Vectorized: DataPageV1 305 313 10 51.6 19.4 
1.0X -ParquetReader Vectorized: DataPageV2 258 270 15 60.8 16.4 1.2X -ParquetReader Vectorized -> Row: DataPageV1 317 319 3 49.6 20.2 1.0X -ParquetReader Vectorized -> Row: DataPageV2 254 268 9 61.9 16.2 1.2X +ParquetReader Vectorized: DataPageV1 306 309 3 51.5 19.4 1.0X +ParquetReader Vectorized: DataPageV2 278 284 6 56.5 17.7 1.1X +ParquetReader Vectorized -> Row: DataPageV1 317 323 6 49.6 20.2 1.0X +ParquetReader Vectorized -> Row: DataPageV2 262 272 9 60.1 16.6 1.2X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor SQL Single FLOAT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 11470 11490 28 1.4 729.3 1.0X -SQL Json 10456 10495 56 1.5 664.8 1.1X -SQL Json with UnsafeRow 11508 11514 10 1.4 731.6 1.0X -SQL Parquet Vectorized: DataPageV1 85 101 17 185.0 5.4 134.9X -SQL Parquet Vectorized: DataPageV2 84 96 12 187.7 5.3 136.9X -SQL Parquet MR: DataPageV1 2003 2039 51 7.9 127.3 5.7X -SQL Parquet MR: DataPageV2 1969 1969 1 8.0 125.2 5.8X -SQL ORC Vectorized 239 248 14 65.9 15.2 48.0X -SQL ORC MR 1782 1791 13 8.8 113.3 6.4X - -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +SQL CSV 11778 11795 24 1.3 748.8 1.0X +SQL Json 11267 11356 127 1.4 716.3 1.0X +SQL Json with UnsafeRow 12181 12204 32 1.3 774.5 1.0X +SQL Parquet Vectorized: DataPageV1 84 99 14 187.8 5.3 140.6X +SQL Parquet Vectorized: DataPageV2 83 96 16 189.9 5.3 142.2X +SQL Parquet MR: DataPageV1 2002 2005 4 7.9 127.3 5.9X +SQL Parquet MR: DataPageV2 1943 1971 40 8.1 123.5 6.1X +SQL ORC Vectorized 220 243 21 71.6 14.0 53.6X +SQL ORC MR 1680 1688 11 9.4 106.8 7.0X + +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Parquet Reader Single FLOAT Column Scan: Best Time(ms) Avg 
Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------- -ParquetReader Vectorized: DataPageV1 145 153 12 108.7 9.2 1.0X -ParquetReader Vectorized: DataPageV2 143 149 7 110.0 9.1 1.0X -ParquetReader Vectorized -> Row: DataPageV1 136 143 8 115.2 8.7 1.1X -ParquetReader Vectorized -> Row: DataPageV2 135 141 6 116.3 8.6 1.1X +ParquetReader Vectorized: DataPageV1 135 152 37 116.7 8.6 1.0X +ParquetReader Vectorized: DataPageV2 132 138 6 119.0 8.4 1.0X +ParquetReader Vectorized -> Row: DataPageV1 129 135 5 121.8 8.2 1.0X +ParquetReader Vectorized -> Row: DataPageV2 145 147 2 108.3 9.2 0.9X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor SQL Single DOUBLE Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 11799 11829 43 1.3 750.1 1.0X -SQL Json 11125 11128 3 1.4 707.3 1.1X -SQL Json with UnsafeRow 11800 11815 22 1.3 750.2 1.0X -SQL Parquet Vectorized: DataPageV1 266 288 20 59.1 16.9 44.4X -SQL Parquet Vectorized: DataPageV2 263 286 14 59.7 16.8 44.8X -SQL Parquet MR: DataPageV1 2457 2472 22 6.4 156.2 4.8X -SQL Parquet MR: DataPageV2 2414 2423 13 6.5 153.5 4.9X -SQL ORC Vectorized 576 581 9 27.3 36.6 20.5X -SQL ORC MR 2192 2197 7 7.2 139.4 5.4X - -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +SQL CSV 12383 12385 2 1.3 787.3 1.0X +SQL Json 11720 11726 8 1.3 745.1 1.1X +SQL Json with UnsafeRow 12528 12562 47 1.3 796.5 1.0X +SQL Parquet Vectorized: DataPageV1 279 301 19 56.3 17.8 44.3X +SQL Parquet Vectorized: DataPageV2 267 288 14 58.9 17.0 46.3X +SQL Parquet MR: DataPageV1 2421 2431 14 6.5 154.0 5.1X +SQL Parquet MR: DataPageV2 2354 2382 39 6.7 149.7 5.3X +SQL ORC 
Vectorized 585 598 16 26.9 37.2 21.2X +SQL ORC MR 2199 2199 0 7.2 139.8 5.6X + +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Parquet Reader Single DOUBLE Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------- -ParquetReader Vectorized: DataPageV1 341 346 6 46.2 21.6 1.0X -ParquetReader Vectorized: DataPageV2 351 358 5 44.8 22.3 1.0X -ParquetReader Vectorized -> Row: DataPageV1 324 331 6 48.5 20.6 1.0X -ParquetReader Vectorized -> Row: DataPageV2 323 326 4 48.7 20.5 1.1X +ParquetReader Vectorized: DataPageV1 334 342 8 47.1 21.2 1.0X +ParquetReader Vectorized: DataPageV2 334 338 5 47.1 21.2 1.0X +ParquetReader Vectorized -> Row: DataPageV1 333 336 5 47.2 21.2 1.0X +ParquetReader Vectorized -> Row: DataPageV2 335 338 2 46.9 21.3 1.0X ================================================================================================ SQL Single Numeric Column Scan in Struct ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor SQL Single TINYINT Column Scan in Struct: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------------------- -SQL ORC MR 2295 2333 53 6.9 145.9 1.0X -SQL ORC Vectorized (Nested Column Disabled) 2261 2268 10 7.0 143.8 1.0X -SQL ORC Vectorized (Nested Column Enabled) 128 136 11 122.7 8.2 17.9X -SQL Parquet MR: DataPageV1 2378 2387 13 6.6 151.2 1.0X -SQL Parquet Vectorized: DataPageV1 (Nested Column Disabled) 2801 2804 5 5.6 178.1 0.8X -SQL Parquet Vectorized: DataPageV1 (Nested Column Enabled) 103 119 19 152.5 6.6 
22.3X -SQL Parquet MR: DataPageV2 2295 2312 25 6.9 145.9 1.0X -SQL Parquet Vectorized: DataPageV2 (Nested Column Disabled) 2669 2679 14 5.9 169.7 0.9X -SQL Parquet Vectorized: DataPageV2 (Nested Column Enabled) 104 114 13 150.9 6.6 22.0X - -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +SQL ORC MR 2168 2196 39 7.3 137.9 1.0X +SQL ORC Vectorized (Nested Column Disabled) 2168 2173 7 7.3 137.8 1.0X +SQL ORC Vectorized (Nested Column Enabled) 146 152 12 107.9 9.3 14.9X +SQL Parquet MR: DataPageV1 2344 2367 33 6.7 149.0 0.9X +SQL Parquet Vectorized: DataPageV1 (Nested Column Disabled) 2798 2805 9 5.6 177.9 0.8X +SQL Parquet Vectorized: DataPageV1 (Nested Column Enabled) 107 126 20 147.6 6.8 20.4X +SQL Parquet MR: DataPageV2 2289 2318 41 6.9 145.5 0.9X +SQL Parquet Vectorized: DataPageV2 (Nested Column Disabled) 2687 2690 5 5.9 170.8 0.8X +SQL Parquet Vectorized: DataPageV2 (Nested Column Enabled) 105 117 18 149.6 6.7 20.6X + +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor SQL Single SMALLINT Column Scan in Struct: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------------------- -SQL ORC MR 2265 2302 52 6.9 144.0 1.0X -SQL ORC Vectorized (Nested Column Disabled) 2221 2276 78 7.1 141.2 1.0X -SQL ORC Vectorized (Nested Column Enabled) 261 274 21 60.3 16.6 8.7X -SQL Parquet MR: DataPageV1 2435 2440 6 6.5 154.8 0.9X -SQL Parquet Vectorized: DataPageV1 (Nested Column Disabled) 2972 2982 15 5.3 188.9 0.8X -SQL Parquet Vectorized: DataPageV1 (Nested Column Enabled) 110 121 14 143.6 7.0 20.7X -SQL Parquet MR: DataPageV2 2429 2437 12 6.5 154.4 0.9X -SQL Parquet Vectorized: DataPageV2 (Nested Column Disabled) 2882 2884 4 5.5 183.2 0.8X -SQL Parquet Vectorized: DataPageV2 (Nested Column Enabled) 142 160 21 110.5 9.0 15.9X - -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 
6.5.0-1025-azure +SQL ORC MR 2155 2166 15 7.3 137.0 1.0X +SQL ORC Vectorized (Nested Column Disabled) 2139 2150 17 7.4 136.0 1.0X +SQL ORC Vectorized (Nested Column Enabled) 276 283 12 57.0 17.6 7.8X +SQL Parquet MR: DataPageV1 2477 2489 17 6.4 157.5 0.9X +SQL Parquet Vectorized: DataPageV1 (Nested Column Disabled) 2995 3013 26 5.3 190.4 0.7X +SQL Parquet Vectorized: DataPageV1 (Nested Column Enabled) 113 148 19 139.1 7.2 19.1X +SQL Parquet MR: DataPageV2 2394 2401 10 6.6 152.2 0.9X +SQL Parquet Vectorized: DataPageV2 (Nested Column Disabled) 2943 2994 73 5.3 187.1 0.7X +SQL Parquet Vectorized: DataPageV2 (Nested Column Enabled) 139 159 21 113.1 8.8 15.5X + +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor SQL Single INT Column Scan in Struct: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------------------- -SQL ORC MR 2182 2205 32 7.2 138.7 1.0X -SQL ORC Vectorized (Nested Column Disabled) 2192 2223 45 7.2 139.3 1.0X -SQL ORC Vectorized (Nested Column Enabled) 284 293 14 55.4 18.1 7.7X -SQL Parquet MR: DataPageV1 2445 2464 26 6.4 155.4 0.9X -SQL Parquet Vectorized: DataPageV1 (Nested Column Disabled) 3037 3038 2 5.2 193.1 0.7X -SQL Parquet Vectorized: DataPageV1 (Nested Column Enabled) 113 116 2 138.7 7.2 19.2X -SQL Parquet MR: DataPageV2 2437 2448 17 6.5 154.9 0.9X -SQL Parquet Vectorized: DataPageV2 (Nested Column Disabled) 3036 3037 1 5.2 193.0 0.7X -SQL Parquet Vectorized: DataPageV2 (Nested Column Enabled) 265 271 5 59.4 16.8 8.2X - -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +SQL ORC MR 2146 2196 72 7.3 136.4 1.0X +SQL ORC Vectorized (Nested Column Disabled) 2099 2111 17 7.5 133.5 1.0X +SQL ORC Vectorized (Nested Column Enabled) 302 322 17 52.1 19.2 7.1X +SQL Parquet MR: DataPageV1 2420 2446 36 6.5 153.9 0.9X +SQL Parquet Vectorized: DataPageV1 
(Nested Column Disabled) 2844 2849 6 5.5 180.8 0.8X +SQL Parquet Vectorized: DataPageV1 (Nested Column Enabled) 106 118 13 148.3 6.7 20.2X +SQL Parquet MR: DataPageV2 2372 2383 14 6.6 150.8 0.9X +SQL Parquet Vectorized: DataPageV2 (Nested Column Disabled) 2871 2880 12 5.5 182.5 0.7X +SQL Parquet Vectorized: DataPageV2 (Nested Column Enabled) 267 279 15 58.8 17.0 8.0X + +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor SQL Single BIGINT Column Scan in Struct: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------------------- -SQL ORC MR 2185 2193 12 7.2 138.9 1.0X -SQL ORC Vectorized (Nested Column Disabled) 2202 2216 19 7.1 140.0 1.0X -SQL ORC Vectorized (Nested Column Enabled) 283 298 14 55.5 18.0 7.7X -SQL Parquet MR: DataPageV1 2872 2882 14 5.5 182.6 0.8X -SQL Parquet Vectorized: DataPageV1 (Nested Column Disabled) 3376 3392 23 4.7 214.7 0.6X -SQL Parquet Vectorized: DataPageV1 (Nested Column Enabled) 320 329 8 49.2 20.3 6.8X -SQL Parquet MR: DataPageV2 2512 2518 9 6.3 159.7 0.9X -SQL Parquet Vectorized: DataPageV2 (Nested Column Disabled) 3009 3010 2 5.2 191.3 0.7X -SQL Parquet Vectorized: DataPageV2 (Nested Column Enabled) 291 298 11 54.1 18.5 7.5X - -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +SQL ORC MR 2138 2162 35 7.4 135.9 1.0X +SQL ORC Vectorized (Nested Column Disabled) 2118 2125 10 7.4 134.6 1.0X +SQL ORC Vectorized (Nested Column Enabled) 305 310 4 51.5 19.4 7.0X +SQL Parquet MR: DataPageV1 2786 2802 23 5.6 177.1 0.8X +SQL Parquet Vectorized: DataPageV1 (Nested Column Disabled) 3459 3460 1 4.5 219.9 0.6X +SQL Parquet Vectorized: DataPageV1 (Nested Column Enabled) 323 328 5 48.7 20.5 6.6X +SQL Parquet MR: DataPageV2 2403 2419 22 6.5 152.8 0.9X +SQL Parquet Vectorized: DataPageV2 (Nested Column Disabled) 2896 2921 35 5.4 184.1 0.7X +SQL Parquet 
Vectorized: DataPageV2 (Nested Column Enabled) 269 296 17 58.4 17.1 7.9X + +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor SQL Single FLOAT Column Scan in Struct: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------------------- -SQL ORC MR 2205 2207 4 7.1 140.2 1.0X -SQL ORC Vectorized (Nested Column Disabled) 2238 2243 7 7.0 142.3 1.0X -SQL ORC Vectorized (Nested Column Enabled) 346 374 27 45.5 22.0 6.4X -SQL Parquet MR: DataPageV1 2463 2465 2 6.4 156.6 0.9X -SQL Parquet Vectorized: DataPageV1 (Nested Column Disabled) 3059 3060 2 5.1 194.5 0.7X -SQL Parquet Vectorized: DataPageV1 (Nested Column Enabled) 91 109 17 172.3 5.8 24.2X -SQL Parquet MR: DataPageV2 2419 2446 37 6.5 153.8 0.9X -SQL Parquet Vectorized: DataPageV2 (Nested Column Disabled) 3078 3084 9 5.1 195.7 0.7X -SQL Parquet Vectorized: DataPageV2 (Nested Column Enabled) 91 108 16 172.9 5.8 24.2X - -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +SQL ORC MR 2198 2213 21 7.2 139.7 1.0X +SQL ORC Vectorized (Nested Column Disabled) 2184 2219 49 7.2 138.9 1.0X +SQL ORC Vectorized (Nested Column Enabled) 360 374 25 43.7 22.9 6.1X +SQL Parquet MR: DataPageV1 2434 2445 16 6.5 154.7 0.9X +SQL Parquet Vectorized: DataPageV1 (Nested Column Disabled) 3003 3008 7 5.2 191.0 0.7X +SQL Parquet Vectorized: DataPageV1 (Nested Column Enabled) 106 115 6 148.1 6.8 20.7X +SQL Parquet MR: DataPageV2 2354 2357 4 6.7 149.7 0.9X +SQL Parquet Vectorized: DataPageV2 (Nested Column Disabled) 2847 2860 17 5.5 181.0 0.8X +SQL Parquet Vectorized: DataPageV2 (Nested Column Enabled) 91 103 6 171.9 5.8 24.0X + +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor SQL Single DOUBLE Column Scan in Struct: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
------------------------------------------------------------------------------------------------------------------------------------------- -SQL ORC MR 2639 2643 6 6.0 167.8 1.0X -SQL ORC Vectorized (Nested Column Disabled) 2676 2677 1 5.9 170.1 1.0X -SQL ORC Vectorized (Nested Column Enabled) 700 703 4 22.5 44.5 3.8X -SQL Parquet MR: DataPageV1 2949 2962 17 5.3 187.5 0.9X -SQL Parquet Vectorized: DataPageV1 (Nested Column Disabled) 3307 3315 12 4.8 210.2 0.8X -SQL Parquet Vectorized: DataPageV1 (Nested Column Enabled) 310 324 17 50.7 19.7 8.5X -SQL Parquet MR: DataPageV2 2785 2810 36 5.6 177.0 0.9X -SQL Parquet Vectorized: DataPageV2 (Nested Column Disabled) 3269 3269 1 4.8 207.8 0.8X -SQL Parquet Vectorized: DataPageV2 (Nested Column Enabled) 310 324 11 50.7 19.7 8.5X +SQL ORC MR 2598 2614 23 6.1 165.2 1.0X +SQL ORC Vectorized (Nested Column Disabled) 2566 2583 24 6.1 163.1 1.0X +SQL ORC Vectorized (Nested Column Enabled) 713 720 11 22.0 45.4 3.6X +SQL Parquet MR: DataPageV1 2767 2850 119 5.7 175.9 0.9X +SQL Parquet Vectorized: DataPageV1 (Nested Column Disabled) 3267 3280 18 4.8 207.7 0.8X +SQL Parquet Vectorized: DataPageV1 (Nested Column Enabled) 284 294 16 55.4 18.1 9.1X +SQL Parquet MR: DataPageV2 2713 2727 20 5.8 172.5 1.0X +SQL Parquet Vectorized: DataPageV2 (Nested Column Disabled) 3235 3237 2 4.9 205.7 0.8X +SQL Parquet Vectorized: DataPageV2 (Nested Column Enabled) 281 314 23 55.9 17.9 9.2X ================================================================================================ SQL Nested Column Scan ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor SQL Nested Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
------------------------------------------------------------------------------------------------------------------------------------------- -SQL ORC MR 12995 13153 131 0.1 12393.4 1.0X -SQL ORC Vectorized (Nested Column Disabled) 13011 13181 142 0.1 12408.4 1.0X -SQL ORC Vectorized (Nested Column Enabled) 7084 7096 11 0.1 6755.6 1.8X -SQL Parquet MR: DataPageV1 9427 9453 27 0.1 8990.6 1.4X -SQL Parquet Vectorized: DataPageV1 (Nested Column Disabled) 9722 9802 39 0.1 9271.2 1.3X -SQL Parquet Vectorized: DataPageV1 (Nested Column Enabled) 5931 6030 41 0.2 5656.2 2.2X -SQL Parquet MR: DataPageV2 9704 9744 59 0.1 9254.3 1.3X -SQL Parquet Vectorized: DataPageV2 (Nested Column Disabled) 10391 10496 55 0.1 9909.7 1.3X -SQL Parquet Vectorized: DataPageV2 (Nested Column Enabled) 5687 5729 23 0.2 5423.2 2.3X +SQL ORC MR 13204 13257 72 0.1 12592.7 1.0X +SQL ORC Vectorized (Nested Column Disabled) 13023 13064 43 0.1 12419.4 1.0X +SQL ORC Vectorized (Nested Column Enabled) 7170 7182 15 0.1 6837.7 1.8X +SQL Parquet MR: DataPageV1 9320 9408 68 0.1 8887.8 1.4X +SQL Parquet Vectorized: DataPageV1 (Nested Column Disabled) 9632 9684 27 0.1 9186.0 1.4X +SQL Parquet Vectorized: DataPageV1 (Nested Column Enabled) 5954 6007 35 0.2 5678.3 2.2X +SQL Parquet MR: DataPageV2 9823 9976 213 0.1 9368.0 1.3X +SQL Parquet Vectorized: DataPageV2 (Nested Column Disabled) 10198 10460 203 0.1 9725.6 1.3X +SQL Parquet Vectorized: DataPageV2 (Nested Column Enabled) 5708 5778 39 0.2 5443.9 2.3X ================================================================================================ Int and String Scan ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Int and String Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
------------------------------------------------------------------------------------------------------------------------ -SQL CSV 10241 10290 70 1.0 976.6 1.0X -SQL Json 9827 9840 19 1.1 937.1 1.0X -SQL Parquet Vectorized: DataPageV1 1711 1736 35 6.1 163.2 6.0X -SQL Parquet Vectorized: DataPageV2 1912 1916 6 5.5 182.3 5.4X -SQL Parquet MR: DataPageV1 4027 4028 1 2.6 384.1 2.5X -SQL Parquet MR: DataPageV2 3967 3967 1 2.6 378.3 2.6X -SQL ORC Vectorized 1819 1845 37 5.8 173.5 5.6X -SQL ORC MR 3460 3468 11 3.0 330.0 3.0X +SQL CSV 10885 10952 95 1.0 1038.0 1.0X +SQL Json 10052 10073 30 1.0 958.6 1.1X +SQL Parquet Vectorized: DataPageV1 1759 1768 13 6.0 167.7 6.2X +SQL Parquet Vectorized: DataPageV2 1974 1974 1 5.3 188.2 5.5X +SQL Parquet MR: DataPageV1 3896 3902 9 2.7 371.6 2.8X +SQL Parquet MR: DataPageV2 3869 3895 36 2.7 369.0 2.8X +SQL ORC Vectorized 1823 1848 35 5.8 173.8 6.0X +SQL ORC MR 3507 3524 24 3.0 334.4 3.1X ================================================================================================ Repeated String Scan ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Repeated String: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 5689 5724 49 1.8 542.6 1.0X -SQL Json 6157 6173 22 1.7 587.1 0.9X -SQL Parquet Vectorized: DataPageV1 465 470 4 22.5 44.4 12.2X -SQL Parquet Vectorized: DataPageV2 459 460 1 22.8 43.8 12.4X -SQL Parquet MR: DataPageV1 1551 1558 10 6.8 147.9 3.7X -SQL Parquet MR: DataPageV2 1501 1506 7 7.0 143.2 3.8X -SQL ORC Vectorized 366 369 3 28.7 34.9 15.5X -SQL ORC MR 1703 1740 51 6.2 162.4 3.3X +SQL CSV 5972 5973 2 1.8 569.6 1.0X +SQL Json 6515 6538 32 1.6 621.4 0.9X +SQL Parquet 
Vectorized: DataPageV1 481 499 18 21.8 45.9 12.4X +SQL Parquet Vectorized: DataPageV2 482 495 16 21.8 46.0 12.4X +SQL Parquet MR: DataPageV1 1603 1624 29 6.5 152.9 3.7X +SQL Parquet MR: DataPageV2 1543 1554 16 6.8 147.2 3.9X +SQL ORC Vectorized 378 383 5 27.7 36.1 15.8X +SQL ORC MR 1747 1750 3 6.0 166.6 3.4X ================================================================================================ Partitioned Table Scan ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Partitioned Table: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------------- -Data column - CSV 10920 10985 92 1.4 694.2 1.0X -Data column - Json 9064 9065 2 1.7 576.3 1.2X -Data column - Parquet Vectorized: DataPageV1 117 124 6 134.6 7.4 93.5X -Data column - Parquet Vectorized: DataPageV2 223 239 14 70.5 14.2 49.0X -Data column - Parquet MR: DataPageV1 2287 2295 12 6.9 145.4 4.8X -Data column - Parquet MR: DataPageV2 2302 2305 4 6.8 146.4 4.7X -Data column - ORC Vectorized 179 191 20 87.9 11.4 61.0X -Data column - ORC MR 2135 2161 36 7.4 135.8 5.1X -Partition column - CSV 3806 3806 0 4.1 242.0 2.9X -Partition column - Json 8340 8352 16 1.9 530.3 1.3X -Partition column - Parquet Vectorized: DataPageV1 30 34 6 529.7 1.9 367.7X -Partition column - Parquet Vectorized: DataPageV2 29 34 6 549.7 1.8 381.6X -Partition column - Parquet MR: DataPageV1 1425 1435 14 11.0 90.6 7.7X -Partition column - Parquet MR: DataPageV2 1414 1428 20 11.1 89.9 7.7X -Partition column - ORC Vectorized 30 33 5 525.5 1.9 364.8X -Partition column - ORC MR 1284 1293 13 12.3 81.6 8.5X -Both columns - CSV 11211 11232 30 1.4 712.8 1.0X -Both columns - Json 9167 9184 24 1.7 582.8 1.2X 
-Both columns - Parquet Vectorized: DataPageV1 153 167 13 102.5 9.8 71.2X -Both columns - Parquet Vectorized: DataPageV2 267 298 31 58.8 17.0 40.8X -Both columns - Parquet MR: DataPageV1 2567 2611 62 6.1 163.2 4.3X -Both columns - Parquet MR: DataPageV2 2647 2659 17 5.9 168.3 4.1X -Both columns - ORC Vectorized 178 200 26 88.3 11.3 61.3X -Both columns - ORC MR 2119 2131 17 7.4 134.7 5.2X +Data column - CSV 11700 11730 43 1.3 743.9 1.0X +Data column - Json 9276 9304 40 1.7 589.8 1.3X +Data column - Parquet Vectorized: DataPageV1 102 131 25 154.9 6.5 115.2X +Data column - Parquet Vectorized: DataPageV2 220 252 37 71.6 14.0 53.3X +Data column - Parquet MR: DataPageV1 2276 2345 97 6.9 144.7 5.1X +Data column - Parquet MR: DataPageV2 2205 2216 15 7.1 140.2 5.3X +Data column - ORC Vectorized 178 189 13 88.4 11.3 65.8X +Data column - ORC MR 1942 1952 14 8.1 123.5 6.0X +Partition column - CSV 3761 3778 24 4.2 239.1 3.1X +Partition column - Json 8482 8581 141 1.9 539.3 1.4X +Partition column - Parquet Vectorized: DataPageV1 30 37 8 528.0 1.9 392.7X +Partition column - Parquet Vectorized: DataPageV2 28 35 7 561.2 1.8 417.5X +Partition column - Parquet MR: DataPageV1 1184 1185 2 13.3 75.3 9.9X +Partition column - Parquet MR: DataPageV2 1179 1228 69 13.3 74.9 9.9X +Partition column - ORC Vectorized 30 33 6 531.9 1.9 395.6X +Partition column - ORC MR 1209 1211 3 13.0 76.9 9.7X +Both columns - CSV 11640 11652 17 1.4 740.0 1.0X +Both columns - Json 9733 9757 34 1.6 618.8 1.2X +Both columns - Parquet Vectorized: DataPageV1 141 162 15 111.2 9.0 82.7X +Both columns - Parquet Vectorized: DataPageV2 269 288 24 58.4 17.1 43.4X +Both columns - Parquet MR: DataPageV1 2487 2500 18 6.3 158.1 4.7X +Both columns - Parquet MR: DataPageV2 2441 2489 68 6.4 155.2 4.8X +Both columns - ORC Vectorized 203 214 16 77.6 12.9 57.8X +Both columns - ORC MR 2001 2006 7 7.9 127.2 5.8X ================================================================================================ String with Nulls Scan 
================================================================================================ -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor String with Nulls Scan (0.0%): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 7385 7393 11 1.4 704.3 1.0X -SQL Json 8624 8638 20 1.2 822.4 0.9X -SQL Parquet Vectorized: DataPageV1 1123 1130 10 9.3 107.1 6.6X -SQL Parquet Vectorized: DataPageV2 1398 1403 6 7.5 133.3 5.3X -SQL Parquet MR: DataPageV1 3770 3795 35 2.8 359.6 2.0X -SQL Parquet MR: DataPageV2 3738 3769 43 2.8 356.5 2.0X -ParquetReader Vectorized: DataPageV1 753 760 7 13.9 71.8 9.8X -ParquetReader Vectorized: DataPageV2 1084 1095 16 9.7 103.3 6.8X -SQL ORC Vectorized 818 836 23 12.8 78.1 9.0X -SQL ORC MR 2885 2904 27 3.6 275.1 2.6X - -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +SQL CSV 7656 7673 24 1.4 730.1 1.0X +SQL Json 8974 8995 30 1.2 855.8 0.9X +SQL Parquet Vectorized: DataPageV1 1114 1143 41 9.4 106.2 6.9X +SQL Parquet Vectorized: DataPageV2 1477 1501 34 7.1 140.8 5.2X +SQL Parquet MR: DataPageV1 3613 3614 1 2.9 344.6 2.1X +SQL Parquet MR: DataPageV2 3877 3877 0 2.7 369.7 2.0X +ParquetReader Vectorized: DataPageV1 765 773 12 13.7 72.9 10.0X +ParquetReader Vectorized: DataPageV2 1109 1130 30 9.5 105.8 6.9X +SQL ORC Vectorized 841 851 18 12.5 80.2 9.1X +SQL ORC MR 2849 2862 19 3.7 271.7 2.7X + +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor String with Nulls Scan (50.0%): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 5899 5900 2 1.8 562.6 1.0X -SQL Json 7189 7199 14 1.5 685.6 
0.8X -SQL Parquet Vectorized: DataPageV1 737 756 22 14.2 70.3 8.0X -SQL Parquet Vectorized: DataPageV2 1004 1035 45 10.4 95.7 5.9X -SQL Parquet MR: DataPageV1 2744 2752 12 3.8 261.6 2.2X -SQL Parquet MR: DataPageV2 2917 2923 8 3.6 278.2 2.0X -ParquetReader Vectorized: DataPageV1 719 734 19 14.6 68.6 8.2X -ParquetReader Vectorized: DataPageV2 950 957 12 11.0 90.6 6.2X -SQL ORC Vectorized 986 1002 22 10.6 94.1 6.0X -SQL ORC MR 2840 2866 36 3.7 270.9 2.1X - -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +SQL CSV 5670 5681 16 1.8 540.7 1.0X +SQL Json 7363 7363 1 1.4 702.2 0.8X +SQL Parquet Vectorized: DataPageV1 747 770 35 14.0 71.2 7.6X +SQL Parquet Vectorized: DataPageV2 981 1019 53 10.7 93.6 5.8X +SQL Parquet MR: DataPageV1 2684 2693 13 3.9 256.0 2.1X +SQL Parquet MR: DataPageV2 2820 2830 14 3.7 269.0 2.0X +ParquetReader Vectorized: DataPageV1 697 706 11 15.1 66.4 8.1X +ParquetReader Vectorized: DataPageV2 920 935 20 11.4 87.8 6.2X +SQL ORC Vectorized 976 1000 35 10.7 93.1 5.8X +SQL ORC MR 2670 2690 28 3.9 254.6 2.1X + +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor String with Nulls Scan (95.0%): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 3951 3956 7 2.7 376.8 1.0X -SQL Json 4888 4888 1 2.1 466.1 0.8X -SQL Parquet Vectorized: DataPageV1 173 193 11 60.5 16.5 22.8X -SQL Parquet Vectorized: DataPageV2 194 199 3 54.0 18.5 20.3X -SQL Parquet MR: DataPageV1 1666 1672 8 6.3 158.9 2.4X -SQL Parquet MR: DataPageV2 1626 1633 10 6.5 155.0 2.4X -ParquetReader Vectorized: DataPageV1 174 178 5 60.2 16.6 22.7X -ParquetReader Vectorized: DataPageV2 201 203 2 52.1 19.2 19.6X -SQL ORC Vectorized 328 331 4 32.0 31.2 12.1X -SQL ORC MR 1633 1636 3 6.4 155.8 2.4X +SQL CSV 4208 4236 40 2.5 401.3 1.0X +SQL Json 5288 5295 11 2.0 504.3 0.8X +SQL Parquet Vectorized: 
DataPageV1 165 174 6 63.7 15.7 25.6X +SQL Parquet Vectorized: DataPageV2 194 198 5 54.1 18.5 21.7X +SQL Parquet MR: DataPageV1 1693 1697 5 6.2 161.5 2.5X +SQL Parquet MR: DataPageV2 1668 1686 25 6.3 159.0 2.5X +ParquetReader Vectorized: DataPageV1 155 157 2 67.6 14.8 27.1X +ParquetReader Vectorized: DataPageV2 184 186 2 56.9 17.6 22.8X +SQL ORC Vectorized 327 340 17 32.1 31.2 12.9X +SQL ORC MR 1521 1538 23 6.9 145.1 2.8X ================================================================================================ Single Column Scan From Wide Columns ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Single Column Scan from 10 columns: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 1259 1261 2 0.8 1201.0 1.0X -SQL Json 1688 1695 9 0.6 1610.1 0.7X -SQL Parquet Vectorized: DataPageV1 24 29 6 43.9 22.8 52.7X -SQL Parquet Vectorized: DataPageV2 32 36 6 32.8 30.5 39.4X -SQL Parquet MR: DataPageV1 169 176 6 6.2 161.2 7.5X -SQL Parquet MR: DataPageV2 157 165 7 6.7 149.6 8.0X -SQL ORC Vectorized 29 35 6 36.2 27.6 43.5X -SQL ORC MR 132 140 6 7.9 126.2 9.5X - -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +SQL CSV 1322 1325 4 0.8 1261.2 1.0X +SQL Json 1758 1767 13 0.6 1676.5 0.8X +SQL Parquet Vectorized: DataPageV1 24 28 6 44.0 22.7 55.5X +SQL Parquet Vectorized: DataPageV2 33 36 6 32.2 31.1 40.6X +SQL Parquet MR: DataPageV1 154 160 5 6.8 147.1 8.6X +SQL Parquet MR: DataPageV2 163 166 2 6.4 155.6 8.1X +SQL ORC Vectorized 28 33 6 37.8 26.5 47.6X +SQL ORC MR 134 141 5 7.8 127.7 9.9X + +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Single Column Scan from 
50 columns: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 2656 2659 4 0.4 2533.4 1.0X -SQL Json 6186 6199 19 0.2 5899.5 0.4X -SQL Parquet Vectorized: DataPageV1 27 33 7 39.1 25.6 99.1X -SQL Parquet Vectorized: DataPageV2 35 40 6 30.3 33.0 76.9X -SQL Parquet MR: DataPageV1 170 176 6 6.2 162.4 15.6X -SQL Parquet MR: DataPageV2 163 173 10 6.5 155.0 16.3X -SQL ORC Vectorized 33 38 6 32.3 31.0 81.7X -SQL ORC MR 137 145 8 7.7 130.4 19.4X - -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +SQL CSV 2634 2639 7 0.4 2511.9 1.0X +SQL Json 5624 5655 44 0.2 5363.7 0.5X +SQL Parquet Vectorized: DataPageV1 27 33 7 39.4 25.4 99.0X +SQL Parquet Vectorized: DataPageV2 34 41 7 30.5 32.8 76.6X +SQL Parquet MR: DataPageV1 158 167 6 6.6 150.9 16.6X +SQL Parquet MR: DataPageV2 153 159 6 6.9 145.9 17.2X +SQL ORC Vectorized 31 35 6 34.0 29.4 85.3X +SQL ORC MR 131 137 5 8.0 124.6 20.2X + +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Single Column Scan from 100 columns: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 4422 4439 25 0.2 4217.1 1.0X -SQL Json 11222 11248 37 0.1 10702.2 0.4X -SQL Parquet Vectorized: DataPageV1 35 41 6 30.2 33.1 127.5X -SQL Parquet Vectorized: DataPageV2 42 46 6 25.0 40.0 105.5X -SQL Parquet MR: DataPageV1 182 191 8 5.8 173.8 24.3X -SQL Parquet MR: DataPageV2 182 185 2 5.8 173.6 24.3X -SQL ORC Vectorized 39 44 5 27.0 37.0 114.0X -SQL ORC MR 148 159 6 7.1 141.2 29.9X +SQL CSV 4252 4309 81 0.2 4054.8 1.0X +SQL Json 10496 10648 215 0.1 10009.3 0.4X +SQL Parquet Vectorized: DataPageV1 35 48 10 30.2 33.1 122.4X +SQL Parquet Vectorized: DataPageV2 42 46 6 25.2 39.6 102.4X +SQL Parquet MR: DataPageV1 167 177 
7 6.3 159.5 25.4X +SQL Parquet MR: DataPageV2 177 182 6 5.9 168.4 24.1X +SQL ORC Vectorized 38 44 7 27.9 35.9 113.1X +SQL ORC MR 138 146 8 7.6 131.3 30.9X diff --git a/sql/core/benchmarks/DataSourceReadBenchmark-results.txt b/sql/core/benchmarks/DataSourceReadBenchmark-results.txt index d2180ecb771d5..948694d17066c 100644 --- a/sql/core/benchmarks/DataSourceReadBenchmark-results.txt +++ b/sql/core/benchmarks/DataSourceReadBenchmark-results.txt @@ -2,437 +2,437 @@ SQL Single Numeric Column Scan ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor SQL Single BOOLEAN Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 10580 10590 15 1.5 672.6 1.0X -SQL Json 8244 8399 219 1.9 524.2 1.3X -SQL Json with UnsafeRow 9338 9354 22 1.7 593.7 1.1X -SQL Parquet Vectorized: DataPageV1 103 117 7 152.2 6.6 102.4X -SQL Parquet Vectorized: DataPageV2 105 116 8 149.7 6.7 100.7X -SQL Parquet MR: DataPageV1 1871 1932 87 8.4 118.9 5.7X -SQL Parquet MR: DataPageV2 1762 1767 8 8.9 112.0 6.0X -SQL ORC Vectorized 142 151 6 110.8 9.0 74.5X -SQL ORC MR 1697 1702 7 9.3 107.9 6.2X - -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +SQL CSV 11082 11121 55 1.4 704.6 1.0X +SQL Json 8235 8413 252 1.9 523.6 1.3X +SQL Json with UnsafeRow 9534 9547 17 1.6 606.2 1.2X +SQL Parquet Vectorized: DataPageV1 99 114 9 158.3 6.3 111.6X +SQL Parquet Vectorized: DataPageV2 99 106 5 158.1 6.3 111.4X +SQL Parquet MR: DataPageV1 1781 1787 9 8.8 113.2 6.2X +SQL Parquet MR: DataPageV2 1685 1760 106 9.3 107.1 6.6X +SQL ORC Vectorized 139 145 4 112.9 8.9 79.5X +SQL ORC MR 1447 1449 3 10.9 92.0 7.7X + +OpenJDK 64-Bit Server VM 17.0.13+11-LTS 
on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Parquet Reader Single BOOLEAN Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------- -ParquetReader Vectorized: DataPageV1 94 96 3 167.8 6.0 1.0X -ParquetReader Vectorized: DataPageV2 101 103 1 155.0 6.4 0.9X -ParquetReader Vectorized -> Row: DataPageV1 74 76 2 211.8 4.7 1.3X -ParquetReader Vectorized -> Row: DataPageV2 83 84 2 190.4 5.3 1.1X +ParquetReader Vectorized: DataPageV1 88 90 1 178.8 5.6 1.0X +ParquetReader Vectorized: DataPageV2 95 96 1 165.3 6.0 0.9X +ParquetReader Vectorized -> Row: DataPageV1 73 74 1 214.9 4.7 1.2X +ParquetReader Vectorized -> Row: DataPageV2 81 82 1 193.4 5.2 1.1X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor SQL Single TINYINT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 11731 11783 74 1.3 745.8 1.0X -SQL Json 9315 9364 69 1.7 592.2 1.3X -SQL Json with UnsafeRow 10241 10246 7 1.5 651.1 1.1X -SQL Parquet Vectorized: DataPageV1 113 120 7 138.9 7.2 103.6X -SQL Parquet Vectorized: DataPageV2 111 118 6 142.1 7.0 106.0X -SQL Parquet MR: DataPageV1 1992 2010 26 7.9 126.6 5.9X -SQL Parquet MR: DataPageV2 1918 1939 29 8.2 122.0 6.1X -SQL ORC Vectorized 112 120 6 139.9 7.1 104.4X -SQL ORC MR 1643 1647 5 9.6 104.5 7.1X - -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +SQL CSV 10768 10815 67 1.5 684.6 1.0X +SQL Json 9495 9518 33 1.7 603.7 1.1X +SQL Json with UnsafeRow 10257 10262 7 1.5 652.1 1.0X +SQL Parquet Vectorized: DataPageV1 91 100 10 173.0 5.8 118.4X +SQL Parquet Vectorized: DataPageV2 90 99 10 175.0 5.7 119.8X +SQL Parquet MR: 
DataPageV1 1839 1839 0 8.6 116.9 5.9X +SQL Parquet MR: DataPageV2 1807 1816 13 8.7 114.9 6.0X +SQL ORC Vectorized 114 118 3 138.1 7.2 94.5X +SQL ORC MR 1485 1485 0 10.6 94.4 7.3X + +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Parquet Reader Single TINYINT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------- -ParquetReader Vectorized: DataPageV1 83 85 2 190.1 5.3 1.0X -ParquetReader Vectorized: DataPageV2 83 84 2 189.8 5.3 1.0X -ParquetReader Vectorized -> Row: DataPageV1 62 63 1 254.3 3.9 1.3X -ParquetReader Vectorized -> Row: DataPageV2 62 64 2 253.5 3.9 1.3X +ParquetReader Vectorized: DataPageV1 68 69 1 232.8 4.3 1.0X +ParquetReader Vectorized: DataPageV2 68 70 2 232.1 4.3 1.0X +ParquetReader Vectorized -> Row: DataPageV1 46 48 2 338.4 3.0 1.5X +ParquetReader Vectorized -> Row: DataPageV2 46 48 2 340.2 2.9 1.5X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor SQL Single SMALLINT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 12442 12467 35 1.3 791.1 1.0X -SQL Json 9536 9578 58 1.6 606.3 1.3X -SQL Json with UnsafeRow 10484 10484 1 1.5 666.5 1.2X -SQL Parquet Vectorized: DataPageV1 110 115 3 142.4 7.0 112.6X -SQL Parquet Vectorized: DataPageV2 139 144 5 112.9 8.9 89.3X -SQL Parquet MR: DataPageV1 2082 2122 57 7.6 132.4 6.0X -SQL Parquet MR: DataPageV2 2050 2071 30 7.7 130.3 6.1X -SQL ORC Vectorized 143 148 4 110.2 9.1 87.2X -SQL ORC MR 1722 1723 1 9.1 109.5 7.2X - -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +SQL CSV 11107 11122 21 1.4 706.2 1.0X +SQL Json 9895 9916 
30 1.6 629.1 1.1X +SQL Json with UnsafeRow 10606 10615 13 1.5 674.3 1.0X +SQL Parquet Vectorized: DataPageV1 100 107 6 157.6 6.3 111.3X +SQL Parquet Vectorized: DataPageV2 129 135 6 122.2 8.2 86.3X +SQL Parquet MR: DataPageV1 1978 1980 3 8.0 125.8 5.6X +SQL Parquet MR: DataPageV2 1877 1894 24 8.4 119.3 5.9X +SQL ORC Vectorized 138 143 4 113.9 8.8 80.5X +SQL ORC MR 1570 1572 2 10.0 99.8 7.1X + +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Parquet Reader Single SMALLINT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------- -ParquetReader Vectorized: DataPageV1 140 144 3 112.3 8.9 1.0X -ParquetReader Vectorized: DataPageV2 168 170 2 93.8 10.7 0.8X -ParquetReader Vectorized -> Row: DataPageV1 138 140 3 114.1 8.8 1.0X -ParquetReader Vectorized -> Row: DataPageV2 166 167 2 95.0 10.5 0.8X +ParquetReader Vectorized: DataPageV1 144 145 2 109.2 9.2 1.0X +ParquetReader Vectorized: DataPageV2 172 174 2 91.4 10.9 0.8X +ParquetReader Vectorized -> Row: DataPageV1 136 138 2 115.4 8.7 1.1X +ParquetReader Vectorized -> Row: DataPageV2 168 170 3 93.7 10.7 0.9X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor SQL Single INT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 13427 13451 33 1.2 853.7 1.0X -SQL Json 10000 10014 20 1.6 635.8 1.3X -SQL Json with UnsafeRow 10816 10829 18 1.5 687.7 1.2X -SQL Parquet Vectorized: DataPageV1 121 126 3 130.5 7.7 111.4X -SQL Parquet Vectorized: DataPageV2 197 203 12 79.7 12.5 68.0X -SQL Parquet MR: DataPageV1 2149 2246 137 7.3 136.7 6.2X -SQL Parquet MR: DataPageV2 2058 2072 
19 7.6 130.9 6.5X -SQL ORC Vectorized 159 165 6 98.8 10.1 84.3X -SQL ORC MR 1868 1869 1 8.4 118.8 7.2X - -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +SQL CSV 12198 12209 16 1.3 775.5 1.0X +SQL Json 10381 10390 13 1.5 660.0 1.2X +SQL Json with UnsafeRow 11101 11149 69 1.4 705.8 1.1X +SQL Parquet Vectorized: DataPageV1 106 109 3 147.9 6.8 114.7X +SQL Parquet Vectorized: DataPageV2 181 186 8 87.1 11.5 67.5X +SQL Parquet MR: DataPageV1 2004 2004 0 7.9 127.4 6.1X +SQL Parquet MR: DataPageV2 1962 1976 20 8.0 124.7 6.2X +SQL ORC Vectorized 146 149 3 107.9 9.3 83.7X +SQL ORC MR 1583 1585 3 9.9 100.7 7.7X + +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Parquet Reader Single INT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------- -ParquetReader Vectorized: DataPageV1 142 144 1 110.5 9.0 1.0X -ParquetReader Vectorized: DataPageV2 215 219 7 73.0 13.7 0.7X -ParquetReader Vectorized -> Row: DataPageV1 141 142 1 111.9 8.9 1.0X -ParquetReader Vectorized -> Row: DataPageV2 212 213 1 74.2 13.5 0.7X +ParquetReader Vectorized: DataPageV1 146 147 1 107.8 9.3 1.0X +ParquetReader Vectorized: DataPageV2 216 217 1 73.0 13.7 0.7X +ParquetReader Vectorized -> Row: DataPageV1 139 142 6 113.1 8.8 1.0X +ParquetReader Vectorized -> Row: DataPageV2 211 214 4 74.4 13.4 0.7X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor SQL Single BIGINT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 13182 13188 8 1.2 838.1 1.0X -SQL Json 10134 10141 10 1.6 644.3 1.3X -SQL Json with UnsafeRow 10915 10920 7 1.4 
693.9 1.2X -SQL Parquet Vectorized: DataPageV1 281 285 4 55.9 17.9 46.8X -SQL Parquet Vectorized: DataPageV2 176 181 4 89.2 11.2 74.8X -SQL Parquet MR: DataPageV1 2659 2694 49 5.9 169.1 5.0X -SQL Parquet MR: DataPageV2 2191 2194 5 7.2 139.3 6.0X -SQL ORC Vectorized 144 151 4 109.2 9.2 91.5X -SQL ORC MR 1814 1887 103 8.7 115.4 7.3X - -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +SQL CSV 12198 12205 10 1.3 775.5 1.0X +SQL Json 10391 10400 13 1.5 660.6 1.2X +SQL Json with UnsafeRow 11102 11110 12 1.4 705.8 1.1X +SQL Parquet Vectorized: DataPageV1 280 284 3 56.3 17.8 43.6X +SQL Parquet Vectorized: DataPageV2 175 179 4 90.0 11.1 69.8X +SQL Parquet MR: DataPageV1 2379 2432 75 6.6 151.2 5.1X +SQL Parquet MR: DataPageV2 1910 1917 11 8.2 121.4 6.4X +SQL ORC Vectorized 127 132 5 124.2 8.1 96.3X +SQL ORC MR 1701 1717 23 9.2 108.1 7.2X + +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Parquet Reader Single BIGINT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------- -ParquetReader Vectorized: DataPageV1 316 317 1 49.8 20.1 1.0X -ParquetReader Vectorized: DataPageV2 214 218 7 73.5 13.6 1.5X -ParquetReader Vectorized -> Row: DataPageV1 338 343 8 46.6 21.5 0.9X -ParquetReader Vectorized -> Row: DataPageV2 234 236 2 67.1 14.9 1.3X +ParquetReader Vectorized: DataPageV1 335 337 2 47.0 21.3 1.0X +ParquetReader Vectorized: DataPageV2 217 231 9 72.4 13.8 1.5X +ParquetReader Vectorized -> Row: DataPageV1 347 353 6 45.4 22.0 1.0X +ParquetReader Vectorized -> Row: DataPageV2 243 248 4 64.6 15.5 1.4X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor SQL Single FLOAT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
------------------------------------------------------------------------------------------------------------------------ -SQL CSV 13724 13734 14 1.1 872.5 1.0X -SQL Json 11883 11914 45 1.3 755.5 1.2X -SQL Json with UnsafeRow 12737 12740 4 1.2 809.8 1.1X -SQL Parquet Vectorized: DataPageV1 86 97 10 183.4 5.5 160.0X -SQL Parquet Vectorized: DataPageV2 94 107 8 168.1 5.9 146.7X -SQL Parquet MR: DataPageV1 2291 2295 6 6.9 145.7 6.0X -SQL Parquet MR: DataPageV2 2156 2157 2 7.3 137.1 6.4X -SQL ORC Vectorized 258 270 11 60.9 16.4 53.1X -SQL ORC MR 1903 1908 7 8.3 121.0 7.2X - -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +SQL CSV 12624 12635 16 1.2 802.6 1.0X +SQL Json 11911 11924 19 1.3 757.3 1.1X +SQL Json with UnsafeRow 12643 12657 21 1.2 803.8 1.0X +SQL Parquet Vectorized: DataPageV1 90 98 8 175.2 5.7 140.6X +SQL Parquet Vectorized: DataPageV2 90 103 10 174.5 5.7 140.0X +SQL Parquet MR: DataPageV1 2018 2022 5 7.8 128.3 6.3X +SQL Parquet MR: DataPageV2 1947 1965 25 8.1 123.8 6.5X +SQL ORC Vectorized 251 268 16 62.6 16.0 50.2X +SQL ORC MR 1729 1732 4 9.1 109.9 7.3X + +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Parquet Reader Single FLOAT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------- -ParquetReader Vectorized: DataPageV1 141 141 0 111.9 8.9 1.0X -ParquetReader Vectorized: DataPageV2 152 155 4 103.2 9.7 0.9X -ParquetReader Vectorized -> Row: DataPageV1 152 157 4 103.2 9.7 0.9X -ParquetReader Vectorized -> Row: DataPageV2 152 156 6 103.6 9.7 0.9X +ParquetReader Vectorized: DataPageV1 139 142 2 113.0 8.8 1.0X +ParquetReader Vectorized: DataPageV2 156 158 4 101.0 9.9 0.9X +ParquetReader Vectorized -> Row: DataPageV1 151 153 3 104.3 9.6 0.9X +ParquetReader Vectorized -> Row: DataPageV2 150 153 4 104.5 9.6 0.9X -OpenJDK 64-Bit Server VM 
17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor SQL Single DOUBLE Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 13926 13932 8 1.1 885.4 1.0X -SQL Json 12135 12148 19 1.3 771.5 1.1X -SQL Json with UnsafeRow 12983 13003 29 1.2 825.4 1.1X -SQL Parquet Vectorized: DataPageV1 292 298 7 53.9 18.5 47.7X -SQL Parquet Vectorized: DataPageV2 292 297 4 53.9 18.5 47.7X -SQL Parquet MR: DataPageV1 2769 2775 9 5.7 176.1 5.0X -SQL Parquet MR: DataPageV2 2619 2623 6 6.0 166.5 5.3X -SQL ORC Vectorized 632 649 18 24.9 40.2 22.0X -SQL ORC MR 2386 2405 27 6.6 151.7 5.8X - -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +SQL CSV 12881 13006 176 1.2 819.0 1.0X +SQL Json 12083 12109 37 1.3 768.2 1.1X +SQL Json with UnsafeRow 12697 12731 49 1.2 807.2 1.0X +SQL Parquet Vectorized: DataPageV1 281 286 9 56.0 17.9 45.8X +SQL Parquet Vectorized: DataPageV2 280 287 5 56.2 17.8 46.0X +SQL Parquet MR: DataPageV1 2442 2490 67 6.4 155.3 5.3X +SQL Parquet MR: DataPageV2 2356 2370 19 6.7 149.8 5.5X +SQL ORC Vectorized 639 643 3 24.6 40.7 20.1X +SQL ORC MR 2155 2161 7 7.3 137.0 6.0X + +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Parquet Reader Single DOUBLE Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------- -ParquetReader Vectorized: DataPageV1 357 360 2 44.0 22.7 1.0X -ParquetReader Vectorized: DataPageV2 356 359 2 44.1 22.7 1.0X -ParquetReader Vectorized -> Row: DataPageV1 365 371 7 43.1 23.2 1.0X -ParquetReader Vectorized -> Row: DataPageV2 367 370 4 42.9 23.3 1.0X +ParquetReader Vectorized: DataPageV1 334 335 2 47.1 21.2 
1.0X +ParquetReader Vectorized: DataPageV2 335 339 3 46.9 21.3 1.0X +ParquetReader Vectorized -> Row: DataPageV1 346 353 7 45.4 22.0 1.0X +ParquetReader Vectorized -> Row: DataPageV2 346 351 4 45.4 22.0 1.0X ================================================================================================ SQL Single Numeric Column Scan in Struct ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor SQL Single TINYINT Column Scan in Struct: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------------------- -SQL ORC MR 2238 2269 44 7.0 142.3 1.0X -SQL ORC Vectorized (Nested Column Disabled) 2290 2319 42 6.9 145.6 1.0X -SQL ORC Vectorized (Nested Column Enabled) 129 144 34 121.9 8.2 17.3X -SQL Parquet MR: DataPageV1 2487 2501 20 6.3 158.1 0.9X -SQL Parquet Vectorized: DataPageV1 (Nested Column Disabled) 3250 3274 35 4.8 206.6 0.7X -SQL Parquet Vectorized: DataPageV1 (Nested Column Enabled) 112 118 7 140.2 7.1 19.9X -SQL Parquet MR: DataPageV2 2368 2393 35 6.6 150.5 0.9X -SQL Parquet Vectorized: DataPageV2 (Nested Column Disabled) 3091 3118 37 5.1 196.5 0.7X -SQL Parquet Vectorized: DataPageV2 (Nested Column Enabled) 113 118 7 139.2 7.2 19.8X - -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +SQL ORC MR 2062 2069 10 7.6 131.1 1.0X +SQL ORC Vectorized (Nested Column Disabled) 2068 2085 24 7.6 131.5 1.0X +SQL ORC Vectorized (Nested Column Enabled) 119 132 28 132.3 7.6 17.3X +SQL Parquet MR: DataPageV1 2402 2421 27 6.5 152.7 0.9X +SQL Parquet Vectorized: DataPageV1 (Nested Column Disabled) 2910 2913 5 5.4 185.0 0.7X +SQL Parquet Vectorized: DataPageV1 (Nested Column Enabled) 102 108 5 153.9 6.5 20.2X +SQL Parquet 
MR: DataPageV2 2340 2361 29 6.7 148.8 0.9X +SQL Parquet Vectorized: DataPageV2 (Nested Column Disabled) 2765 2774 12 5.7 175.8 0.7X +SQL Parquet Vectorized: DataPageV2 (Nested Column Enabled) 100 104 4 157.4 6.4 20.6X + +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor SQL Single SMALLINT Column Scan in Struct: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------------------- -SQL ORC MR 2156 2195 55 7.3 137.0 1.0X -SQL ORC Vectorized (Nested Column Disabled) 2174 2191 24 7.2 138.2 1.0X -SQL ORC Vectorized (Nested Column Enabled) 259 264 4 60.6 16.5 8.3X -SQL Parquet MR: DataPageV1 2617 2631 20 6.0 166.4 0.8X -SQL Parquet Vectorized: DataPageV1 (Nested Column Disabled) 3209 3215 8 4.9 204.0 0.7X -SQL Parquet Vectorized: DataPageV1 (Nested Column Enabled) 171 182 14 92.0 10.9 12.6X -SQL Parquet MR: DataPageV2 2463 2498 50 6.4 156.6 0.9X -SQL Parquet Vectorized: DataPageV2 (Nested Column Disabled) 3018 3023 6 5.2 191.9 0.7X -SQL Parquet Vectorized: DataPageV2 (Nested Column Enabled) 287 303 14 54.8 18.2 7.5X - -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +SQL ORC MR 2092 2099 10 7.5 133.0 1.0X +SQL ORC Vectorized (Nested Column Disabled) 2093 2110 24 7.5 133.1 1.0X +SQL ORC Vectorized (Nested Column Enabled) 280 286 7 56.2 17.8 7.5X +SQL Parquet MR: DataPageV1 2341 2354 18 6.7 148.9 0.9X +SQL Parquet Vectorized: DataPageV1 (Nested Column Disabled) 2922 2926 6 5.4 185.8 0.7X +SQL Parquet Vectorized: DataPageV1 (Nested Column Enabled) 144 150 4 109.3 9.1 14.5X +SQL Parquet MR: DataPageV2 2276 2287 16 6.9 144.7 0.9X +SQL Parquet Vectorized: DataPageV2 (Nested Column Disabled) 2828 2831 4 5.6 179.8 0.7X +SQL Parquet Vectorized: DataPageV2 (Nested Column Enabled) 266 283 18 59.2 16.9 7.9X + +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD 
EPYC 7763 64-Core Processor SQL Single INT Column Scan in Struct: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------------------- -SQL ORC MR 2301 2367 94 6.8 146.3 1.0X -SQL ORC Vectorized (Nested Column Disabled) 2417 2421 6 6.5 153.7 1.0X -SQL ORC Vectorized (Nested Column Enabled) 282 288 4 55.7 17.9 8.2X -SQL Parquet MR: DataPageV1 2681 2694 18 5.9 170.5 0.9X -SQL Parquet Vectorized: DataPageV1 (Nested Column Disabled) 3187 3213 36 4.9 202.6 0.7X -SQL Parquet Vectorized: DataPageV1 (Nested Column Enabled) 152 159 6 103.8 9.6 15.2X -SQL Parquet MR: DataPageV2 2636 2650 20 6.0 167.6 0.9X -SQL Parquet Vectorized: DataPageV2 (Nested Column Disabled) 3077 3089 17 5.1 195.6 0.7X -SQL Parquet Vectorized: DataPageV2 (Nested Column Enabled) 287 304 31 54.7 18.3 8.0X - -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +SQL ORC MR 2149 2172 33 7.3 136.6 1.0X +SQL ORC Vectorized (Nested Column Disabled) 2074 2107 47 7.6 131.8 1.0X +SQL ORC Vectorized (Nested Column Enabled) 274 282 8 57.4 17.4 7.8X +SQL Parquet MR: DataPageV1 2363 2370 10 6.7 150.2 0.9X +SQL Parquet Vectorized: DataPageV1 (Nested Column Disabled) 2863 2898 49 5.5 182.0 0.8X +SQL Parquet Vectorized: DataPageV1 (Nested Column Enabled) 120 134 7 130.8 7.6 17.9X +SQL Parquet MR: DataPageV2 2301 2318 23 6.8 146.3 0.9X +SQL Parquet Vectorized: DataPageV2 (Nested Column Disabled) 2729 2763 48 5.8 173.5 0.8X +SQL Parquet Vectorized: DataPageV2 (Nested Column Enabled) 267 274 6 58.9 17.0 8.1X + +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor SQL Single BIGINT Column Scan in Struct: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------------------- -SQL ORC MR 2419 2419 
1 6.5 153.8 1.0X -SQL ORC Vectorized (Nested Column Disabled) 2342 2392 71 6.7 148.9 1.0X -SQL ORC Vectorized (Nested Column Enabled) 285 291 4 55.2 18.1 8.5X -SQL Parquet MR: DataPageV1 2915 2931 23 5.4 185.4 0.8X -SQL Parquet Vectorized: DataPageV1 (Nested Column Disabled) 3405 3418 19 4.6 216.5 0.7X -SQL Parquet Vectorized: DataPageV1 (Nested Column Enabled) 321 346 15 49.0 20.4 7.5X -SQL Parquet MR: DataPageV2 2554 2570 24 6.2 162.4 0.9X -SQL Parquet Vectorized: DataPageV2 (Nested Column Disabled) 2943 2954 15 5.3 187.1 0.8X -SQL Parquet Vectorized: DataPageV2 (Nested Column Enabled) 210 215 3 74.9 13.4 11.5X - -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +SQL ORC MR 2115 2121 8 7.4 134.5 1.0X +SQL ORC Vectorized (Nested Column Disabled) 2103 2130 37 7.5 133.7 1.0X +SQL ORC Vectorized (Nested Column Enabled) 270 305 76 58.2 17.2 7.8X +SQL Parquet MR: DataPageV1 2791 2796 8 5.6 177.4 0.8X +SQL Parquet Vectorized: DataPageV1 (Nested Column Disabled) 3190 3211 29 4.9 202.8 0.7X +SQL Parquet Vectorized: DataPageV1 (Nested Column Enabled) 307 315 9 51.2 19.5 6.9X +SQL Parquet MR: DataPageV2 2447 2458 15 6.4 155.6 0.9X +SQL Parquet Vectorized: DataPageV2 (Nested Column Disabled) 2786 2804 26 5.6 177.1 0.8X +SQL Parquet Vectorized: DataPageV2 (Nested Column Enabled) 201 208 10 78.4 12.8 10.5X + +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor SQL Single FLOAT Column Scan in Struct: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------------------- -SQL ORC MR 2457 2629 243 6.4 156.2 1.0X -SQL ORC Vectorized (Nested Column Disabled) 2469 2481 17 6.4 157.0 1.0X -SQL ORC Vectorized (Nested Column Enabled) 354 368 10 44.4 22.5 6.9X -SQL Parquet MR: DataPageV1 2592 2592 1 6.1 164.8 0.9X -SQL Parquet Vectorized: DataPageV1 (Nested Column Disabled) 3017 3022 7 5.2 
191.8 0.8X -SQL Parquet Vectorized: DataPageV1 (Nested Column Enabled) 104 123 14 151.0 6.6 23.6X -SQL Parquet MR: DataPageV2 2511 2554 61 6.3 159.6 1.0X -SQL Parquet Vectorized: DataPageV2 (Nested Column Disabled) 2932 2964 44 5.4 186.4 0.8X -SQL Parquet Vectorized: DataPageV2 (Nested Column Enabled) 102 106 7 154.9 6.5 24.2X - -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +SQL ORC MR 2200 2240 56 7.1 139.9 1.0X +SQL ORC Vectorized (Nested Column Disabled) 2211 2231 28 7.1 140.5 1.0X +SQL ORC Vectorized (Nested Column Enabled) 356 376 18 44.2 22.6 6.2X +SQL Parquet MR: DataPageV1 2249 2280 43 7.0 143.0 1.0X +SQL Parquet Vectorized: DataPageV1 (Nested Column Disabled) 2676 2677 2 5.9 170.1 0.8X +SQL Parquet Vectorized: DataPageV1 (Nested Column Enabled) 93 121 14 169.6 5.9 23.7X +SQL Parquet MR: DataPageV2 2244 2258 19 7.0 142.7 1.0X +SQL Parquet Vectorized: DataPageV2 (Nested Column Disabled) 2605 2631 37 6.0 165.6 0.8X +SQL Parquet Vectorized: DataPageV2 (Nested Column Enabled) 104 121 15 151.7 6.6 21.2X + +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor SQL Single DOUBLE Column Scan in Struct: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------------------- -SQL ORC MR 2921 2933 17 5.4 185.7 1.0X -SQL ORC Vectorized (Nested Column Disabled) 2929 2950 30 5.4 186.2 1.0X -SQL ORC Vectorized (Nested Column Enabled) 790 793 4 19.9 50.2 3.7X -SQL Parquet MR: DataPageV1 2944 2952 12 5.3 187.2 1.0X -SQL Parquet Vectorized: DataPageV1 (Nested Column Disabled) 3441 3485 62 4.6 218.8 0.8X -SQL Parquet Vectorized: DataPageV1 (Nested Column Enabled) 300 313 12 52.5 19.0 9.8X -SQL Parquet MR: DataPageV2 2922 2972 71 5.4 185.8 1.0X -SQL Parquet Vectorized: DataPageV2 (Nested Column Disabled) 3389 3393 7 4.6 215.4 0.9X -SQL Parquet Vectorized: DataPageV2 (Nested Column 
Enabled) 299 306 7 52.6 19.0 9.8X +SQL ORC MR 2579 2580 1 6.1 164.0 1.0X +SQL ORC Vectorized (Nested Column Disabled) 2595 2624 40 6.1 165.0 1.0X +SQL ORC Vectorized (Nested Column Enabled) 748 767 23 21.0 47.5 3.4X +SQL Parquet MR: DataPageV1 2668 2686 26 5.9 169.6 1.0X +SQL Parquet Vectorized: DataPageV1 (Nested Column Disabled) 3358 3363 7 4.7 213.5 0.8X +SQL Parquet Vectorized: DataPageV1 (Nested Column Enabled) 291 302 12 54.0 18.5 8.9X +SQL Parquet MR: DataPageV2 2652 2655 4 5.9 168.6 1.0X +SQL Parquet Vectorized: DataPageV2 (Nested Column Disabled) 3264 3284 28 4.8 207.5 0.8X +SQL Parquet Vectorized: DataPageV2 (Nested Column Enabled) 292 299 7 53.9 18.5 8.8X ================================================================================================ SQL Nested Column Scan ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor SQL Nested Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------------------- -SQL ORC MR 14325 14522 154 0.1 13661.0 1.0X -SQL ORC Vectorized (Nested Column Disabled) 14107 14392 251 0.1 13453.2 1.0X -SQL ORC Vectorized (Nested Column Enabled) 7445 7470 16 0.1 7099.8 1.9X -SQL Parquet MR: DataPageV1 8992 9032 32 0.1 8575.8 1.6X -SQL Parquet Vectorized: DataPageV1 (Nested Column Disabled) 9615 9741 77 0.1 9169.2 1.5X -SQL Parquet Vectorized: DataPageV1 (Nested Column Enabled) 6242 6391 95 0.2 5952.4 2.3X -SQL Parquet MR: DataPageV2 10019 10415 264 0.1 9555.2 1.4X -SQL Parquet Vectorized: DataPageV2 (Nested Column Disabled) 10273 10371 146 0.1 9796.8 1.4X -SQL Parquet Vectorized: DataPageV2 (Nested Column Enabled) 5730 5779 33 0.2 5464.9 2.5X +SQL ORC MR 12979 13152 162 0.1 
12377.6 1.0X +SQL ORC Vectorized (Nested Column Disabled) 12920 12989 53 0.1 12321.6 1.0X +SQL ORC Vectorized (Nested Column Enabled) 7225 7249 17 0.1 6890.2 1.8X +SQL Parquet MR: DataPageV1 8620 8655 24 0.1 8221.1 1.5X +SQL Parquet Vectorized: DataPageV1 (Nested Column Disabled) 8972 8983 7 0.1 8556.5 1.4X +SQL Parquet Vectorized: DataPageV1 (Nested Column Enabled) 5756 5799 31 0.2 5489.8 2.3X +SQL Parquet MR: DataPageV2 9485 9514 18 0.1 9045.5 1.4X +SQL Parquet Vectorized: DataPageV2 (Nested Column Disabled) 9765 9805 19 0.1 9312.8 1.3X +SQL Parquet Vectorized: DataPageV2 (Nested Column Enabled) 5567 5600 19 0.2 5309.3 2.3X ================================================================================================ Int and String Scan ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Int and String Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 12003 12156 217 0.9 1144.7 1.0X -SQL Json 10706 10711 7 1.0 1021.0 1.1X -SQL Parquet Vectorized: DataPageV1 1800 1807 10 5.8 171.6 6.7X -SQL Parquet Vectorized: DataPageV2 1923 1930 10 5.5 183.4 6.2X -SQL Parquet MR: DataPageV1 4008 4018 14 2.6 382.2 3.0X -SQL Parquet MR: DataPageV2 4075 4082 10 2.6 388.7 2.9X -SQL ORC Vectorized 1903 1925 30 5.5 181.5 6.3X -SQL ORC MR 3934 3949 21 2.7 375.2 3.1X +SQL CSV 11208 11255 67 0.9 1068.9 1.0X +SQL Json 10457 10487 41 1.0 997.3 1.1X +SQL Parquet Vectorized: DataPageV1 1820 1834 20 5.8 173.5 6.2X +SQL Parquet Vectorized: DataPageV2 1917 1918 1 5.5 182.8 5.8X +SQL Parquet MR: DataPageV1 3975 3976 1 2.6 379.1 2.8X +SQL Parquet MR: DataPageV2 3974 3994 28 2.6 379.0 2.8X +SQL ORC Vectorized 1939 1944 7 5.4 184.9 5.8X 
+SQL ORC MR 3490 3502 17 3.0 332.8 3.2X ================================================================================================ Repeated String Scan ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Repeated String: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 7254 7268 20 1.4 691.8 1.0X -SQL Json 6959 6959 1 1.5 663.6 1.0X -SQL Parquet Vectorized: DataPageV1 477 482 6 22.0 45.5 15.2X -SQL Parquet Vectorized: DataPageV2 475 488 21 22.1 45.3 15.3X -SQL Parquet MR: DataPageV1 1778 1780 3 5.9 169.6 4.1X -SQL Parquet MR: DataPageV2 1723 1726 5 6.1 164.3 4.2X -SQL ORC Vectorized 396 409 22 26.5 37.7 18.3X -SQL ORC MR 1884 1905 30 5.6 179.6 3.9X +SQL CSV 6355 6393 55 1.7 606.0 1.0X +SQL Json 6798 6811 17 1.5 648.4 0.9X +SQL Parquet Vectorized: DataPageV1 517 522 4 20.3 49.3 12.3X +SQL Parquet Vectorized: DataPageV2 511 521 13 20.5 48.7 12.4X +SQL Parquet MR: DataPageV1 1725 1746 30 6.1 164.5 3.7X +SQL Parquet MR: DataPageV2 1631 1650 27 6.4 155.5 3.9X +SQL ORC Vectorized 371 377 4 28.3 35.3 17.1X +SQL ORC MR 1701 1713 17 6.2 162.2 3.7X ================================================================================================ Partitioned Table Scan ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Partitioned Table: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
--------------------------------------------------------------------------------------------------------------------------------- -Data column - CSV 13466 13514 67 1.2 856.2 1.0X -Data column - Json 10162 10191 42 1.5 646.1 1.3X -Data column - Parquet Vectorized: DataPageV1 119 134 10 132.4 7.6 113.3X -Data column - Parquet Vectorized: DataPageV2 294 302 13 53.6 18.7 45.9X -Data column - Parquet MR: DataPageV1 2489 2627 195 6.3 158.2 5.4X -Data column - Parquet MR: DataPageV2 2443 2466 33 6.4 155.3 5.5X -Data column - ORC Vectorized 187 200 11 84.0 11.9 71.9X -Data column - ORC MR 2306 2313 9 6.8 146.6 5.8X -Partition column - CSV 3790 3809 27 4.2 241.0 3.6X -Partition column - Json 8570 8579 12 1.8 544.9 1.6X -Partition column - Parquet Vectorized: DataPageV1 35 38 3 444.9 2.2 380.9X -Partition column - Parquet Vectorized: DataPageV2 35 38 3 452.0 2.2 387.0X -Partition column - Parquet MR: DataPageV1 1411 1422 15 11.1 89.7 9.5X -Partition column - Parquet MR: DataPageV2 1396 1435 54 11.3 88.8 9.6X -Partition column - ORC Vectorized 36 39 3 432.0 2.3 369.9X -Partition column - ORC MR 1503 1514 16 10.5 95.6 9.0X -Both columns - CSV 13408 13425 24 1.2 852.5 1.0X -Both columns - Json 10284 10301 24 1.5 653.9 1.3X -Both columns - Parquet Vectorized: DataPageV1 154 182 24 101.8 9.8 87.2X -Both columns - Parquet Vectorized: DataPageV2 341 350 17 46.1 21.7 39.5X -Both columns - Parquet MR: DataPageV1 2465 2490 35 6.4 156.7 5.5X -Both columns - Parquet MR: DataPageV2 2450 2489 55 6.4 155.8 5.5X -Both columns - ORC Vectorized 220 245 19 71.4 14.0 61.1X -Both columns - ORC MR 2333 2334 1 6.7 148.4 5.8X +Data column - CSV 12083 12159 107 1.3 768.2 1.0X +Data column - Json 10115 10122 10 1.6 643.1 1.2X +Data column - Parquet Vectorized: DataPageV1 102 107 4 154.0 6.5 118.3X +Data column - Parquet Vectorized: DataPageV2 237 242 4 66.3 15.1 50.9X +Data column - Parquet MR: DataPageV1 2228 2369 199 7.1 141.7 5.4X +Data column - Parquet MR: DataPageV2 2196 2201 7 7.2 139.6 5.5X 
+Data column - ORC Vectorized 138 142 4 113.7 8.8 87.4X +Data column - ORC MR 1925 1944 27 8.2 122.4 6.3X +Partition column - CSV 3593 3619 37 4.4 228.4 3.4X +Partition column - Json 8708 8717 13 1.8 553.6 1.4X +Partition column - Parquet Vectorized: DataPageV1 29 32 4 549.4 1.8 422.0X +Partition column - Parquet Vectorized: DataPageV2 28 32 4 554.1 1.8 425.7X +Partition column - Parquet MR: DataPageV1 1173 1183 14 13.4 74.6 10.3X +Partition column - Parquet MR: DataPageV2 1168 1176 11 13.5 74.3 10.3X +Partition column - ORC Vectorized 30 36 6 525.7 1.9 403.8X +Partition column - ORC MR 1210 1211 1 13.0 76.9 10.0X +Both columns - CSV 12007 12141 189 1.3 763.4 1.0X +Both columns - Json 10312 10333 29 1.5 655.6 1.2X +Both columns - Parquet Vectorized: DataPageV1 136 157 21 115.5 8.7 88.7X +Both columns - Parquet Vectorized: DataPageV2 279 310 24 56.4 17.7 43.3X +Both columns - Parquet MR: DataPageV1 2345 2361 23 6.7 149.1 5.2X +Both columns - Parquet MR: DataPageV2 2257 2309 74 7.0 143.5 5.4X +Both columns - ORC Vectorized 183 211 19 85.8 11.6 65.9X +Both columns - ORC MR 2075 2086 15 7.6 131.9 5.8X ================================================================================================ String with Nulls Scan ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor String with Nulls Scan (0.0%): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 8487 8504 24 1.2 809.4 1.0X -SQL Json 9230 9236 9 1.1 880.3 0.9X -SQL Parquet Vectorized: DataPageV1 1279 1294 20 8.2 122.0 6.6X -SQL Parquet Vectorized: DataPageV2 1327 1382 78 7.9 126.5 6.4X -SQL Parquet MR: DataPageV1 3655 3662 10 2.9 348.6 2.3X -SQL Parquet MR: 
DataPageV2 3708 3742 47 2.8 353.6 2.3X -ParquetReader Vectorized: DataPageV1 837 838 1 12.5 79.8 10.1X -ParquetReader Vectorized: DataPageV2 898 900 3 11.7 85.7 9.4X -SQL ORC Vectorized 970 1025 77 10.8 92.5 8.7X -SQL ORC MR 3092 3123 44 3.4 294.9 2.7X - -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +SQL CSV 7818 7868 70 1.3 745.6 1.0X +SQL Json 9376 9390 20 1.1 894.1 0.8X +SQL Parquet Vectorized: DataPageV1 1240 1242 3 8.5 118.2 6.3X +SQL Parquet Vectorized: DataPageV2 1301 1302 2 8.1 124.1 6.0X +SQL Parquet MR: DataPageV1 3359 3365 9 3.1 320.3 2.3X +SQL Parquet MR: DataPageV2 3670 3674 6 2.9 350.0 2.1X +ParquetReader Vectorized: DataPageV1 830 834 3 12.6 79.2 9.4X +ParquetReader Vectorized: DataPageV2 896 898 2 11.7 85.4 8.7X +SQL ORC Vectorized 867 887 32 12.1 82.7 9.0X +SQL ORC MR 2905 2907 3 3.6 277.1 2.7X + +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor String with Nulls Scan (50.0%): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 6254 6267 18 1.7 596.4 1.0X -SQL Json 7852 7857 8 1.3 748.8 0.8X -SQL Parquet Vectorized: DataPageV1 889 907 17 11.8 84.8 7.0X -SQL Parquet Vectorized: DataPageV2 978 983 8 10.7 93.3 6.4X -SQL Parquet MR: DataPageV1 2939 2948 14 3.6 280.2 2.1X -SQL Parquet MR: DataPageV2 3175 3189 20 3.3 302.8 2.0X -ParquetReader Vectorized: DataPageV1 756 761 7 13.9 72.1 8.3X -ParquetReader Vectorized: DataPageV2 853 858 5 12.3 81.3 7.3X -SQL ORC Vectorized 1024 1027 4 10.2 97.6 6.1X -SQL ORC MR 2930 2933 4 3.6 279.4 2.1X - -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +SQL CSV 6074 6083 13 1.7 579.2 1.0X +SQL Json 7930 7931 2 1.3 756.2 0.8X +SQL Parquet Vectorized: DataPageV1 862 888 22 12.2 82.2 7.0X +SQL Parquet Vectorized: DataPageV2 951 959 9 11.0 90.7 6.4X +SQL Parquet MR: DataPageV1 2636 2703 95 4.0 
251.4 2.3X +SQL Parquet MR: DataPageV2 2697 2706 13 3.9 257.2 2.3X +ParquetReader Vectorized: DataPageV1 758 765 10 13.8 72.3 8.0X +ParquetReader Vectorized: DataPageV2 824 826 4 12.7 78.6 7.4X +SQL ORC Vectorized 982 993 10 10.7 93.6 6.2X +SQL ORC MR 2763 2774 15 3.8 263.5 2.2X + +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor String with Nulls Scan (95.0%): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 4333 4340 10 2.4 413.3 1.0X -SQL Json 5440 5448 11 1.9 518.8 0.8X -SQL Parquet Vectorized: DataPageV1 166 173 10 63.1 15.8 26.1X -SQL Parquet Vectorized: DataPageV2 184 187 3 56.9 17.6 23.5X -SQL Parquet MR: DataPageV1 1846 1854 11 5.7 176.0 2.3X -SQL Parquet MR: DataPageV2 1813 1815 2 5.8 172.9 2.4X -ParquetReader Vectorized: DataPageV1 171 174 4 61.2 16.3 25.3X -ParquetReader Vectorized: DataPageV2 190 191 1 55.2 18.1 22.8X -SQL ORC Vectorized 308 310 1 34.0 29.4 14.1X -SQL ORC MR 1700 1707 10 6.2 162.2 2.5X +SQL CSV 4577 4595 25 2.3 436.5 1.0X +SQL Json 5583 5595 16 1.9 532.5 0.8X +SQL Parquet Vectorized: DataPageV1 165 171 7 63.5 15.7 27.7X +SQL Parquet Vectorized: DataPageV2 179 182 2 58.7 17.0 25.6X +SQL Parquet MR: DataPageV1 1643 1652 13 6.4 156.7 2.8X +SQL Parquet MR: DataPageV2 1603 1604 1 6.5 152.9 2.9X +ParquetReader Vectorized: DataPageV1 170 172 4 61.9 16.2 27.0X +ParquetReader Vectorized: DataPageV2 184 185 1 56.9 17.6 24.8X +SQL ORC Vectorized 317 322 4 33.1 30.2 14.4X +SQL ORC MR 1585 1598 18 6.6 151.1 2.9X ================================================================================================ Single Column Scan From Wide Columns ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 
6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Single Column Scan from 10 columns: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 1183 1208 34 0.9 1128.7 1.0X -SQL Json 1836 1837 2 0.6 1750.6 0.6X -SQL Parquet Vectorized: DataPageV1 28 31 3 37.5 26.6 42.4X -SQL Parquet Vectorized: DataPageV2 38 42 5 27.8 35.9 31.4X -SQL Parquet MR: DataPageV1 185 189 3 5.7 176.1 6.4X -SQL Parquet MR: DataPageV2 180 188 10 5.8 171.9 6.6X -SQL ORC Vectorized 33 36 3 31.4 31.8 35.5X -SQL ORC MR 167 175 5 6.3 159.1 7.1X - -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +SQL CSV 1232 1233 0 0.9 1175.3 1.0X +SQL Json 1765 1775 14 0.6 1683.6 0.7X +SQL Parquet Vectorized: DataPageV1 24 27 4 43.1 23.2 50.7X +SQL Parquet Vectorized: DataPageV2 33 36 4 31.8 31.4 37.4X +SQL Parquet MR: DataPageV1 156 160 3 6.7 148.6 7.9X +SQL Parquet MR: DataPageV2 151 156 3 6.9 144.1 8.2X +SQL ORC Vectorized 29 32 4 35.7 28.0 42.0X +SQL ORC MR 124 129 4 8.5 118.1 10.0X + +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Single Column Scan from 50 columns: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 2586 2589 4 0.4 2466.4 1.0X -SQL Json 6706 6852 207 0.2 6395.3 0.4X -SQL Parquet Vectorized: DataPageV1 32 35 3 33.0 30.3 81.5X -SQL Parquet Vectorized: DataPageV2 42 47 6 25.0 40.0 61.7X -SQL Parquet MR: DataPageV1 187 193 4 5.6 178.3 13.8X -SQL Parquet MR: DataPageV2 181 186 5 5.8 172.2 14.3X -SQL ORC Vectorized 38 41 3 27.3 36.7 67.2X -SQL ORC MR 171 178 11 6.1 163.5 15.1X - -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +SQL CSV 2667 2671 6 0.4 2543.6 1.0X +SQL Json 6256 6274 25 0.2 5966.2 0.4X +SQL Parquet Vectorized: 
DataPageV1 27 29 4 38.9 25.7 99.0X +SQL Parquet Vectorized: DataPageV2 36 39 4 29.3 34.1 74.5X +SQL Parquet MR: DataPageV1 160 166 5 6.5 152.9 16.6X +SQL Parquet MR: DataPageV2 155 160 4 6.8 147.9 17.2X +SQL ORC Vectorized 33 38 6 32.0 31.3 81.3X +SQL ORC MR 127 131 4 8.2 121.3 21.0X + +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Single Column Scan from 100 columns: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 4290 4320 42 0.2 4091.7 1.0X -SQL Json 12544 12642 139 0.1 11963.0 0.3X -SQL Parquet Vectorized: DataPageV1 41 45 5 25.7 38.9 105.2X -SQL Parquet Vectorized: DataPageV2 50 57 9 20.8 48.0 85.3X -SQL Parquet MR: DataPageV1 199 205 4 5.3 189.9 21.5X -SQL Parquet MR: DataPageV2 196 200 2 5.3 187.0 21.9X -SQL ORC Vectorized 46 49 4 22.6 44.2 92.5X -SQL ORC MR 181 185 3 5.8 172.7 23.7X +SQL CSV 4437 4470 46 0.2 4231.7 1.0X +SQL Json 11849 12082 329 0.1 11300.4 0.4X +SQL Parquet Vectorized: DataPageV1 34 38 5 30.8 32.4 130.5X +SQL Parquet Vectorized: DataPageV2 43 47 6 24.5 40.9 103.5X +SQL Parquet MR: DataPageV1 169 174 3 6.2 161.3 26.2X +SQL Parquet MR: DataPageV2 167 172 5 6.3 159.0 26.6X +SQL ORC Vectorized 38 41 4 27.3 36.7 115.4X +SQL ORC MR 134 138 3 7.8 127.5 33.2X diff --git a/sql/core/benchmarks/DatasetBenchmark-jdk21-results.txt b/sql/core/benchmarks/DatasetBenchmark-jdk21-results.txt index a98af93289208..80f7753f53541 100644 --- a/sql/core/benchmarks/DatasetBenchmark-jdk21-results.txt +++ b/sql/core/benchmarks/DatasetBenchmark-jdk21-results.txt @@ -2,45 +2,45 @@ Dataset Benchmark ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor back-to-back map 
long: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -RDD 6410 6514 147 15.6 64.1 1.0X -DataFrame 1121 1133 17 89.2 11.2 5.7X -Dataset 1691 1698 10 59.1 16.9 3.8X +RDD 6419 6615 277 15.6 64.2 1.0X +DataFrame 1215 1219 6 82.3 12.2 5.3X +Dataset 1694 1698 7 59.0 16.9 3.8X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor back-to-back map: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -RDD 7313 7329 23 13.7 73.1 1.0X -DataFrame 2721 2764 60 36.7 27.2 2.7X -Dataset 6563 6672 155 15.2 65.6 1.1X +RDD 7768 7769 1 12.9 77.7 1.0X +DataFrame 2860 2869 13 35.0 28.6 2.7X +Dataset 7013 7025 17 14.3 70.1 1.1X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor back-to-back filter Long: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -RDD 3870 3894 35 25.8 38.7 1.0X -DataFrame 723 733 11 138.3 7.2 5.4X -Dataset 1534 1566 45 65.2 15.3 2.5X +RDD 4387 4430 61 22.8 43.9 1.0X +DataFrame 755 782 28 132.4 7.6 5.8X +Dataset 1571 1581 14 63.7 15.7 2.8X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor back-to-back filter: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -RDD 1967 1996 41 50.8 19.7 1.0X 
-DataFrame 116 126 10 864.5 1.2 17.0X -Dataset 2234 2273 55 44.8 22.3 0.9X +RDD 2090 2092 3 47.9 20.9 1.0X +DataFrame 106 121 8 941.0 1.1 19.7X +Dataset 2335 2438 146 42.8 23.4 0.9X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor aggregate: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -RDD sum 1470 1504 48 68.0 14.7 1.0X -DataFrame sum 66 84 13 1506.4 0.7 22.1X -Dataset sum using Aggregator 1929 1944 21 51.8 19.3 0.8X -Dataset complex Aggregator 4979 5163 260 20.1 49.8 0.3X +RDD sum 1419 1424 7 70.5 14.2 1.0X +DataFrame sum 57 70 11 1765.6 0.6 25.1X +Dataset sum using Aggregator 1942 1973 44 51.5 19.4 0.7X +Dataset complex Aggregator 5348 5593 347 18.7 53.5 0.3X diff --git a/sql/core/benchmarks/DatasetBenchmark-results.txt b/sql/core/benchmarks/DatasetBenchmark-results.txt index 78e90cf783593..6cd94dd233f80 100644 --- a/sql/core/benchmarks/DatasetBenchmark-results.txt +++ b/sql/core/benchmarks/DatasetBenchmark-results.txt @@ -2,45 +2,45 @@ Dataset Benchmark ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor back-to-back map long: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -RDD 6908 6969 86 14.5 69.1 1.0X -DataFrame 1286 1300 21 77.8 12.9 5.4X -Dataset 1763 1778 21 56.7 17.6 3.9X +RDD 5780 5868 124 17.3 57.8 1.0X +DataFrame 1150 1157 9 86.9 11.5 5.0X +Dataset 1584 1664 114 63.1 15.8 3.7X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 
64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor back-to-back map: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -RDD 8250 8274 34 12.1 82.5 1.0X -DataFrame 2867 2868 2 34.9 28.7 2.9X -Dataset 6939 6971 45 14.4 69.4 1.2X +RDD 6968 7114 205 14.4 69.7 1.0X +DataFrame 2743 2753 15 36.5 27.4 2.5X +Dataset 7436 7456 29 13.4 74.4 0.9X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor back-to-back filter Long: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -RDD 4265 4343 110 23.4 42.7 1.0X -DataFrame 712 763 45 140.5 7.1 6.0X -Dataset 1722 1732 14 58.1 17.2 2.5X +RDD 4086 4184 139 24.5 40.9 1.0X +DataFrame 663 692 48 150.7 6.6 6.2X +Dataset 1515 1528 18 66.0 15.1 2.7X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor back-to-back filter: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -RDD 2250 2275 36 44.4 22.5 1.0X -DataFrame 115 126 10 873.3 1.1 19.7X -Dataset 2441 2459 25 41.0 24.4 0.9X +RDD 2091 2111 29 47.8 20.9 1.0X +DataFrame 112 127 13 892.0 1.1 18.7X +Dataset 2456 2476 29 40.7 24.6 0.9X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor aggregate: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
------------------------------------------------------------------------------------------------------------------------ -RDD sum 1434 1444 14 69.7 14.3 1.0X -DataFrame sum 67 80 10 1483.2 0.7 21.3X -Dataset sum using Aggregator 2083 2146 90 48.0 20.8 0.7X -Dataset complex Aggregator 5100 5116 23 19.6 51.0 0.3X +RDD sum 1426 1474 68 70.1 14.3 1.0X +DataFrame sum 68 79 11 1478.3 0.7 21.1X +Dataset sum using Aggregator 2117 2130 19 47.2 21.2 0.7X +Dataset complex Aggregator 5121 5237 165 19.5 51.2 0.3X diff --git a/sql/core/benchmarks/DateTimeBenchmark-jdk21-results.txt b/sql/core/benchmarks/DateTimeBenchmark-jdk21-results.txt index c230aea8da606..e4b39687c4080 100644 --- a/sql/core/benchmarks/DateTimeBenchmark-jdk21-results.txt +++ b/sql/core/benchmarks/DateTimeBenchmark-jdk21-results.txt @@ -2,460 +2,460 @@ datetime +/- interval ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor datetime +/- interval: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -date + interval(m) 1185 1217 45 8.4 118.5 1.0X -date + interval(m, d) 1166 1188 31 8.6 116.6 1.0X -date + interval(m, d, ms) 3784 3794 13 2.6 378.4 0.3X -date - interval(m) 1098 1101 3 9.1 109.8 1.1X -date - interval(m, d) 1119 1128 13 8.9 111.9 1.1X -date - interval(m, d, ms) 3792 3799 9 2.6 379.2 0.3X -timestamp + interval(m) 1516 1522 8 6.6 151.6 0.8X -timestamp + interval(m, d) 1571 1573 3 6.4 157.1 0.8X -timestamp + interval(m, d, ms) 1716 1717 2 5.8 171.6 0.7X -timestamp - interval(m) 1503 1505 2 6.7 150.3 0.8X -timestamp - interval(m, d) 1557 1558 1 6.4 155.7 0.8X -timestamp - interval(m, d, ms) 1714 1716 3 5.8 171.4 0.7X +date + interval(m) 1127 1142 21 8.9 112.7 1.0X 
+date + interval(m, d) 1098 1100 3 9.1 109.8 1.0X +date + interval(m, d, ms) 3853 3857 5 2.6 385.3 0.3X +date - interval(m) 1084 1089 7 9.2 108.4 1.0X +date - interval(m, d) 1104 1106 4 9.1 110.4 1.0X +date - interval(m, d, ms) 3908 3946 54 2.6 390.8 0.3X +timestamp + interval(m) 1985 1988 4 5.0 198.5 0.6X +timestamp + interval(m, d) 2020 2026 9 5.0 202.0 0.6X +timestamp + interval(m, d, ms) 2106 2122 22 4.7 210.6 0.5X +timestamp - interval(m) 1935 1938 4 5.2 193.5 0.6X +timestamp - interval(m, d) 1973 1981 11 5.1 197.3 0.6X +timestamp - interval(m, d, ms) 2100 2110 14 4.8 210.0 0.5X ================================================================================================ Extract components ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor cast to timestamp: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -cast to timestamp wholestage off 200 202 3 49.9 20.0 1.0X -cast to timestamp wholestage on 220 231 9 45.5 22.0 0.9X +cast to timestamp wholestage off 199 202 4 50.3 19.9 1.0X +cast to timestamp wholestage on 219 223 3 45.7 21.9 0.9X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor year of timestamp: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -year of timestamp wholestage off 707 709 3 14.2 70.7 1.0X -year of timestamp wholestage on 718 721 4 13.9 71.8 1.0X +year of timestamp wholestage off 695 697 3 14.4 69.5 1.0X +year of timestamp wholestage on 705 713 10 
14.2 70.5 1.0X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor quarter of timestamp: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -quarter of timestamp wholestage off 743 744 2 13.5 74.3 1.0X -quarter of timestamp wholestage on 747 754 9 13.4 74.7 1.0X +quarter of timestamp wholestage off 752 758 10 13.3 75.2 1.0X +quarter of timestamp wholestage on 790 795 4 12.7 79.0 1.0X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor month of timestamp: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -month of timestamp wholestage off 720 724 5 13.9 72.0 1.0X -month of timestamp wholestage on 729 731 2 13.7 72.9 1.0X +month of timestamp wholestage off 709 716 9 14.1 70.9 1.0X +month of timestamp wholestage on 722 727 5 13.8 72.2 1.0X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor weekofyear of timestamp: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -weekofyear of timestamp wholestage off 1098 1098 0 9.1 109.8 1.0X -weekofyear of timestamp wholestage on 1141 1151 17 8.8 114.1 1.0X +weekofyear of timestamp wholestage off 1113 1113 1 9.0 111.3 1.0X +weekofyear of timestamp wholestage on 1137 1151 22 8.8 113.7 1.0X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on 
Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor day of timestamp: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -day of timestamp wholestage off 732 737 7 13.7 73.2 1.0X -day of timestamp wholestage on 756 760 3 13.2 75.6 1.0X +day of timestamp wholestage off 724 726 2 13.8 72.4 1.0X +day of timestamp wholestage on 766 779 9 13.1 76.6 0.9X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor dayofyear of timestamp: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -dayofyear of timestamp wholestage off 767 770 4 13.0 76.7 1.0X -dayofyear of timestamp wholestage on 780 785 3 12.8 78.0 1.0X +dayofyear of timestamp wholestage off 751 753 3 13.3 75.1 1.0X +dayofyear of timestamp wholestage on 763 767 4 13.1 76.3 1.0X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor dayofmonth of timestamp: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -dayofmonth of timestamp wholestage off 755 763 11 13.2 75.5 1.0X -dayofmonth of timestamp wholestage on 758 764 7 13.2 75.8 1.0X +dayofmonth of timestamp wholestage off 742 744 3 13.5 74.2 1.0X +dayofmonth of timestamp wholestage on 777 782 5 12.9 77.7 1.0X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor dayofweek of timestamp: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per 
Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -dayofweek of timestamp wholestage off 886 889 5 11.3 88.6 1.0X -dayofweek of timestamp wholestage on 933 943 9 10.7 93.3 0.9X +dayofweek of timestamp wholestage off 885 887 2 11.3 88.5 1.0X +dayofweek of timestamp wholestage on 974 976 1 10.3 97.4 0.9X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor weekday of timestamp: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -weekday of timestamp wholestage off 822 822 0 12.2 82.2 1.0X -weekday of timestamp wholestage on 839 845 9 11.9 83.9 1.0X +weekday of timestamp wholestage off 818 821 5 12.2 81.8 1.0X +weekday of timestamp wholestage on 901 911 9 11.1 90.1 0.9X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor hour of timestamp: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -hour of timestamp wholestage off 549 558 13 18.2 54.9 1.0X -hour of timestamp wholestage on 564 567 2 17.7 56.4 1.0X +hour of timestamp wholestage off 545 547 2 18.4 54.5 1.0X +hour of timestamp wholestage on 617 626 9 16.2 61.7 0.9X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor minute of timestamp: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -minute 
of timestamp wholestage off 547 549 2 18.3 54.7 1.0X -minute of timestamp wholestage on 561 567 4 17.8 56.1 1.0X +minute of timestamp wholestage off 542 550 11 18.4 54.2 1.0X +minute of timestamp wholestage on 556 565 13 18.0 55.6 1.0X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor second of timestamp: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -second of timestamp wholestage off 552 555 5 18.1 55.2 1.0X -second of timestamp wholestage on 561 564 3 17.8 56.1 1.0X +second of timestamp wholestage off 541 546 8 18.5 54.1 1.0X +second of timestamp wholestage on 560 566 4 17.8 56.0 1.0X ================================================================================================ Current date and time ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor current_date: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -current_date wholestage off 179 179 0 55.9 17.9 1.0X -current_date wholestage on 224 228 6 44.7 22.4 0.8X +current_date wholestage off 176 178 3 56.7 17.6 1.0X +current_date wholestage on 216 223 8 46.3 21.6 0.8X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor current_timestamp: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
------------------------------------------------------------------------------------------------------------------------ -current_timestamp wholestage off 176 181 6 56.7 17.6 1.0X -current_timestamp wholestage on 236 241 5 42.3 23.6 0.7X +current_timestamp wholestage off 182 190 12 55.1 18.2 1.0X +current_timestamp wholestage on 232 240 8 43.1 23.2 0.8X ================================================================================================ Date arithmetic ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor cast to date: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -cast to date wholestage off 639 641 2 15.6 63.9 1.0X -cast to date wholestage on 717 721 6 14.0 71.7 0.9X +cast to date wholestage off 638 643 7 15.7 63.8 1.0X +cast to date wholestage on 714 720 7 14.0 71.4 0.9X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor last_day: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -last_day wholestage off 794 797 3 12.6 79.4 1.0X -last_day wholestage on 817 821 4 12.2 81.7 1.0X +last_day wholestage off 773 777 6 12.9 77.3 1.0X +last_day wholestage on 809 811 1 12.4 80.9 1.0X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor next_day: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
------------------------------------------------------------------------------------------------------------------------ -next_day wholestage off 724 728 6 13.8 72.4 1.0X -next_day wholestage on 744 747 3 13.4 74.4 1.0X +next_day wholestage off 715 751 51 14.0 71.5 1.0X +next_day wholestage on 735 739 7 13.6 73.5 1.0X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor date_add: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -date_add wholestage off 676 679 4 14.8 67.6 1.0X -date_add wholestage on 700 704 3 14.3 70.0 1.0X +date_add wholestage off 669 681 16 14.9 66.9 1.0X +date_add wholestage on 684 689 4 14.6 68.4 1.0X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor date_sub: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -date_sub wholestage off 684 684 1 14.6 68.4 1.0X -date_sub wholestage on 698 701 3 14.3 69.8 1.0X +date_sub wholestage off 670 670 1 14.9 67.0 1.0X +date_sub wholestage on 684 694 14 14.6 68.4 1.0X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor add_months: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -add_months wholestage off 983 984 2 10.2 98.3 1.0X -add_months wholestage on 1069 1074 7 9.4 106.9 0.9X +add_months wholestage off 960 962 3 10.4 96.0 1.0X +add_months wholestage on 1058 1065 
5 9.5 105.8 0.9X ================================================================================================ Formatting dates ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor format date: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -format date wholestage off 3043 3062 27 3.3 304.3 1.0X -format date wholestage on 3118 3133 14 3.2 311.8 1.0X +format date wholestage off 3175 3213 53 3.1 317.5 1.0X +format date wholestage on 3134 3152 22 3.2 313.4 1.0X ================================================================================================ Formatting timestamps ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor from_unixtime: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -from_unixtime wholestage off 2560 2560 0 3.9 256.0 1.0X -from_unixtime wholestage on 2594 2653 37 3.9 259.4 1.0X +from_unixtime wholestage off 2599 2601 3 3.8 259.9 1.0X +from_unixtime wholestage on 2682 2692 10 3.7 268.2 1.0X ================================================================================================ Convert timestamps ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core 
Processor from_utc_timestamp: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -from_utc_timestamp wholestage off 641 642 2 15.6 64.1 1.0X -from_utc_timestamp wholestage on 767 770 3 13.0 76.7 0.8X +from_utc_timestamp wholestage off 623 625 2 16.0 62.3 1.0X +from_utc_timestamp wholestage on 751 753 2 13.3 75.1 0.8X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor to_utc_timestamp: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -to_utc_timestamp wholestage off 809 812 5 12.4 80.9 1.0X -to_utc_timestamp wholestage on 882 889 7 11.3 88.2 0.9X +to_utc_timestamp wholestage off 769 772 4 13.0 76.9 1.0X +to_utc_timestamp wholestage on 826 828 1 12.1 82.6 0.9X ================================================================================================ Intervals ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor cast interval: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -cast interval wholestage off 225 242 24 44.5 22.5 1.0X -cast interval wholestage on 225 226 2 44.5 22.5 1.0X +cast interval wholestage off 227 233 9 44.0 22.7 1.0X +cast interval wholestage on 217 223 8 46.1 21.7 1.0X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core 
Processor datediff: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -datediff wholestage off 1226 1229 3 8.2 122.6 1.0X -datediff wholestage on 1220 1224 3 8.2 122.0 1.0X +datediff wholestage off 1231 1234 4 8.1 123.1 1.0X +datediff wholestage on 1225 1230 8 8.2 122.5 1.0X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor months_between: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -months_between wholestage off 3576 3582 8 2.8 357.6 1.0X -months_between wholestage on 3568 3581 23 2.8 356.8 1.0X +months_between wholestage off 3561 3561 1 2.8 356.1 1.0X +months_between wholestage on 3597 3607 7 2.8 359.7 1.0X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor window: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -window wholestage off 383 395 18 2.6 382.5 1.0X -window wholestage on 634 657 25 1.6 633.8 0.6X +window wholestage off 389 396 11 2.6 388.6 1.0X +window wholestage on 669 685 15 1.5 668.8 0.6X ================================================================================================ Truncation ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor date_trunc YEAR: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) 
Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -date_trunc YEAR wholestage off 1862 1863 0 5.4 186.2 1.0X -date_trunc YEAR wholestage on 1867 1875 6 5.4 186.7 1.0X +date_trunc YEAR wholestage off 1902 1902 0 5.3 190.2 1.0X +date_trunc YEAR wholestage on 1909 1915 6 5.2 190.9 1.0X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor date_trunc YYYY: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -date_trunc YYYY wholestage off 1864 1867 4 5.4 186.4 1.0X -date_trunc YYYY wholestage on 1865 1871 4 5.4 186.5 1.0X +date_trunc YYYY wholestage off 1908 1908 1 5.2 190.8 1.0X +date_trunc YYYY wholestage on 1909 1911 1 5.2 190.9 1.0X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor date_trunc YY: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -date_trunc YY wholestage off 1867 1869 3 5.4 186.7 1.0X -date_trunc YY wholestage on 1867 1874 5 5.4 186.7 1.0X +date_trunc YY wholestage off 1899 1901 2 5.3 189.9 1.0X +date_trunc YY wholestage on 1907 1918 6 5.2 190.7 1.0X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor date_trunc MON: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -date_trunc MON wholestage off 1897 1904 10 5.3 189.7 1.0X 
-date_trunc MON wholestage on 1857 1862 5 5.4 185.7 1.0X +date_trunc MON wholestage off 1925 1926 2 5.2 192.5 1.0X +date_trunc MON wholestage on 1887 1899 14 5.3 188.7 1.0X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor date_trunc MONTH: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -date_trunc MONTH wholestage off 1901 1901 1 5.3 190.1 1.0X -date_trunc MONTH wholestage on 1858 1863 4 5.4 185.8 1.0X +date_trunc MONTH wholestage off 1930 1932 2 5.2 193.0 1.0X +date_trunc MONTH wholestage on 1890 1895 6 5.3 189.0 1.0X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor date_trunc MM: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -date_trunc MM wholestage off 1890 1895 7 5.3 189.0 1.0X -date_trunc MM wholestage on 1858 1861 2 5.4 185.8 1.0X +date_trunc MM wholestage off 1928 1930 2 5.2 192.8 1.0X +date_trunc MM wholestage on 1889 1895 5 5.3 188.9 1.0X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor date_trunc DAY: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -date_trunc DAY wholestage off 1232 1234 2 8.1 123.2 1.0X -date_trunc DAY wholestage on 1330 1336 4 7.5 133.0 0.9X +date_trunc DAY wholestage off 1216 1219 5 8.2 121.6 1.0X +date_trunc DAY wholestage on 1183 1187 3 8.5 118.3 1.0X -OpenJDK 64-Bit Server VM 
21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor date_trunc DD: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -date_trunc DD wholestage off 1231 1233 4 8.1 123.1 1.0X -date_trunc DD wholestage on 1334 1337 5 7.5 133.4 0.9X +date_trunc DD wholestage off 1216 1217 2 8.2 121.6 1.0X +date_trunc DD wholestage on 1185 1194 17 8.4 118.5 1.0X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor date_trunc HOUR: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -date_trunc HOUR wholestage off 1200 1201 3 8.3 120.0 1.0X -date_trunc HOUR wholestage on 1162 1168 5 8.6 116.2 1.0X +date_trunc HOUR wholestage off 1212 1215 4 8.2 121.2 1.0X +date_trunc HOUR wholestage on 1169 1174 5 8.6 116.9 1.0X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor date_trunc MINUTE: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -date_trunc MINUTE wholestage off 1206 1209 5 8.3 120.6 1.0X -date_trunc MINUTE wholestage on 1170 1174 3 8.5 117.0 1.0X +date_trunc MINUTE wholestage off 1233 1233 0 8.1 123.3 1.0X +date_trunc MINUTE wholestage on 1199 1204 3 8.3 119.9 1.0X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor date_trunc SECOND: Best Time(ms) Avg Time(ms) 
Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -date_trunc SECOND wholestage off 289 289 0 34.6 28.9 1.0X -date_trunc SECOND wholestage on 264 271 4 37.8 26.4 1.1X +date_trunc SECOND wholestage off 307 309 3 32.6 30.7 1.0X +date_trunc SECOND wholestage on 281 283 1 35.6 28.1 1.1X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor date_trunc WEEK: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -date_trunc WEEK wholestage off 1788 1794 8 5.6 178.8 1.0X -date_trunc WEEK wholestage on 1753 1756 3 5.7 175.3 1.0X +date_trunc WEEK wholestage off 1810 1810 0 5.5 181.0 1.0X +date_trunc WEEK wholestage on 1775 1789 16 5.6 177.5 1.0X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor date_trunc QUARTER: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -date_trunc QUARTER wholestage off 2320 2323 4 4.3 232.0 1.0X -date_trunc QUARTER wholestage on 2324 2349 54 4.3 232.4 1.0X +date_trunc QUARTER wholestage off 2367 2370 4 4.2 236.7 1.0X +date_trunc QUARTER wholestage on 2414 2419 6 4.1 241.4 1.0X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor trunc year: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -trunc year 
wholestage off 849 851 2 11.8 84.9 1.0X -trunc year wholestage on 829 832 3 12.1 82.9 1.0X +trunc year wholestage off 873 876 4 11.5 87.3 1.0X +trunc year wholestage on 847 852 8 11.8 84.7 1.0X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor trunc yyyy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -trunc yyyy wholestage off 850 853 4 11.8 85.0 1.0X -trunc yyyy wholestage on 829 843 20 12.1 82.9 1.0X +trunc yyyy wholestage off 870 882 16 11.5 87.0 1.0X +trunc yyyy wholestage on 844 846 3 11.9 84.4 1.0X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor trunc yy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -trunc yy wholestage off 851 851 0 11.8 85.1 1.0X -trunc yy wholestage on 827 831 4 12.1 82.7 1.0X +trunc yy wholestage off 869 870 1 11.5 86.9 1.0X +trunc yy wholestage on 844 850 7 11.9 84.4 1.0X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor trunc mon: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -trunc mon wholestage off 815 817 3 12.3 81.5 1.0X -trunc mon wholestage on 809 812 4 12.4 80.9 1.0X +trunc mon wholestage off 836 840 5 12.0 83.6 1.0X +trunc mon wholestage on 810 815 7 12.3 81.0 1.0X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on 
Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor trunc month: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -trunc month wholestage off 815 817 4 12.3 81.5 1.0X -trunc month wholestage on 806 809 2 12.4 80.6 1.0X +trunc month wholestage off 833 835 3 12.0 83.3 1.0X +trunc month wholestage on 812 815 2 12.3 81.2 1.0X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor trunc mm: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -trunc mm wholestage off 812 813 1 12.3 81.2 1.0X -trunc mm wholestage on 805 810 4 12.4 80.5 1.0X +trunc mm wholestage off 838 840 3 11.9 83.8 1.0X +trunc mm wholestage on 815 818 2 12.3 81.5 1.0X ================================================================================================ Parsing ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor to timestamp str: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -to timestamp str wholestage off 97 99 2 10.3 96.9 1.0X -to timestamp str wholestage on 103 106 3 9.7 102.9 0.9X +to timestamp str wholestage off 106 107 1 9.4 106.2 1.0X +to timestamp str wholestage on 101 102 1 9.9 100.7 1.1X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor 
to_timestamp: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -to_timestamp wholestage off 678 679 2 1.5 677.5 1.0X -to_timestamp wholestage on 676 680 3 1.5 676.1 1.0X +to_timestamp wholestage off 681 683 3 1.5 680.9 1.0X +to_timestamp wholestage on 680 684 3 1.5 680.0 1.0X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor to_unix_timestamp: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -to_unix_timestamp wholestage off 677 682 6 1.5 677.3 1.0X -to_unix_timestamp wholestage on 669 672 2 1.5 669.1 1.0X +to_unix_timestamp wholestage off 662 662 0 1.5 662.1 1.0X +to_unix_timestamp wholestage on 658 661 2 1.5 657.9 1.0X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor to date str: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -to date str wholestage off 133 135 2 7.5 133.1 1.0X -to date str wholestage on 126 131 3 7.9 126.4 1.1X +to date str wholestage off 133 138 7 7.5 133.4 1.0X +to date str wholestage on 129 132 3 7.7 129.4 1.0X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor to_date: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -to_date wholestage off 716 723 10 1.4 
716.2 1.0X -to_date wholestage on 690 693 3 1.4 690.4 1.0X +to_date wholestage off 676 679 4 1.5 676.2 1.0X +to_date wholestage on 672 674 2 1.5 671.6 1.0X ================================================================================================ Conversion from/to external types ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor To/from Java's date-time: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -From java.sql.Date 282 284 3 17.8 56.3 1.0X -From java.time.LocalDate 265 276 12 18.8 53.1 1.1X -Collect java.sql.Date 1145 1206 97 4.4 229.0 0.2X -Collect java.time.LocalDate 959 1050 100 5.2 191.7 0.3X -From java.sql.Timestamp 229 245 22 21.9 45.7 1.2X -From java.time.Instant 173 176 6 28.9 34.5 1.6X -Collect longs 910 960 73 5.5 182.0 0.3X -Collect java.sql.Timestamp 920 1118 173 5.4 183.9 0.3X -Collect java.time.Instant 877 967 79 5.7 175.3 0.3X -java.sql.Date to Hive string 3960 4078 184 1.3 792.0 0.1X -java.time.LocalDate to Hive string 3039 3117 128 1.6 607.8 0.1X -java.sql.Timestamp to Hive string 6521 6619 162 0.8 1304.1 0.0X -java.time.Instant to Hive string 4252 4346 91 1.2 850.4 0.1X +From java.sql.Date 283 285 2 17.6 56.7 1.0X +From java.time.LocalDate 265 267 1 18.8 53.1 1.1X +Collect java.sql.Date 1215 1255 34 4.1 243.1 0.2X +Collect java.time.LocalDate 1061 1141 113 4.7 212.1 0.3X +From java.sql.Timestamp 232 248 14 21.5 46.4 1.2X +From java.time.Instant 196 203 11 25.5 39.2 1.4X +Collect longs 811 1029 201 6.2 162.2 0.3X +Collect java.sql.Timestamp 904 1113 183 5.5 180.8 0.3X +Collect java.time.Instant 943 1080 143 5.3 188.7 0.3X +java.sql.Date to Hive string 4049 4296 215 1.2 809.8 0.1X 
+java.time.LocalDate to Hive string 3393 3476 72 1.5 678.6 0.1X +java.sql.Timestamp to Hive string 6599 6712 98 0.8 1319.7 0.0X +java.time.Instant to Hive string 4387 4475 83 1.1 877.3 0.1X diff --git a/sql/core/benchmarks/DateTimeBenchmark-results.txt b/sql/core/benchmarks/DateTimeBenchmark-results.txt index 176cc1dc361b8..91e954cec7253 100644 --- a/sql/core/benchmarks/DateTimeBenchmark-results.txt +++ b/sql/core/benchmarks/DateTimeBenchmark-results.txt @@ -2,460 +2,460 @@ datetime +/- interval ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor datetime +/- interval: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -date + interval(m) 1253 1259 9 8.0 125.3 1.0X -date + interval(m, d) 1258 1260 2 7.9 125.8 1.0X -date + interval(m, d, ms) 3904 3923 26 2.6 390.4 0.3X -date - interval(m) 1201 1214 18 8.3 120.1 1.0X -date - interval(m, d) 1236 1238 4 8.1 123.6 1.0X -date - interval(m, d, ms) 3983 3987 5 2.5 398.3 0.3X -timestamp + interval(m) 1846 1852 9 5.4 184.6 0.7X -timestamp + interval(m, d) 1919 1932 18 5.2 191.9 0.7X -timestamp + interval(m, d, ms) 2264 2273 12 4.4 226.4 0.6X -timestamp - interval(m) 2025 2027 3 4.9 202.5 0.6X -timestamp - interval(m, d) 2097 2104 10 4.8 209.7 0.6X -timestamp - interval(m, d, ms) 2265 2270 8 4.4 226.5 0.6X +date + interval(m) 1218 1236 26 8.2 121.8 1.0X +date + interval(m, d) 1194 1214 28 8.4 119.4 1.0X +date + interval(m, d, ms) 3975 3982 11 2.5 397.5 0.3X +date - interval(m) 1140 1149 13 8.8 114.0 1.1X +date - interval(m, d) 1180 1180 1 8.5 118.0 1.0X +date - interval(m, d, ms) 4014 4017 4 2.5 401.4 0.3X +timestamp + interval(m) 1801 1819 25 5.6 180.1 0.7X +timestamp + interval(m, 
d) 1857 1861 5 5.4 185.7 0.7X +timestamp + interval(m, d, ms) 2184 2185 1 4.6 218.4 0.6X +timestamp - interval(m) 1950 1952 3 5.1 195.0 0.6X +timestamp - interval(m, d) 2016 2027 15 5.0 201.6 0.6X +timestamp - interval(m, d, ms) 2173 2174 1 4.6 217.3 0.6X ================================================================================================ Extract components ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor cast to timestamp: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -cast to timestamp wholestage off 193 195 3 51.9 19.3 1.0X -cast to timestamp wholestage on 213 220 7 47.0 21.3 0.9X +cast to timestamp wholestage off 200 201 1 49.9 20.0 1.0X +cast to timestamp wholestage on 210 220 8 47.5 21.0 1.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor year of timestamp: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -year of timestamp wholestage off 828 832 5 12.1 82.8 1.0X -year of timestamp wholestage on 855 865 11 11.7 85.5 1.0X +year of timestamp wholestage off 821 827 8 12.2 82.1 1.0X +year of timestamp wholestage on 825 828 4 12.1 82.5 1.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor quarter of timestamp: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
------------------------------------------------------------------------------------------------------------------------ -quarter of timestamp wholestage off 854 854 0 11.7 85.4 1.0X -quarter of timestamp wholestage on 884 893 9 11.3 88.4 1.0X +quarter of timestamp wholestage off 872 876 5 11.5 87.2 1.0X +quarter of timestamp wholestage on 843 846 4 11.9 84.3 1.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor month of timestamp: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -month of timestamp wholestage off 833 834 1 12.0 83.3 1.0X -month of timestamp wholestage on 845 849 4 11.8 84.5 1.0X +month of timestamp wholestage off 820 821 1 12.2 82.0 1.0X +month of timestamp wholestage on 827 829 3 12.1 82.7 1.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor weekofyear of timestamp: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -weekofyear of timestamp wholestage off 1210 1214 5 8.3 121.0 1.0X -weekofyear of timestamp wholestage on 1255 1266 12 8.0 125.5 1.0X +weekofyear of timestamp wholestage off 1207 1208 2 8.3 120.7 1.0X +weekofyear of timestamp wholestage on 1221 1224 3 8.2 122.1 1.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor day of timestamp: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ 
-day of timestamp wholestage off 828 830 3 12.1 82.8 1.0X -day of timestamp wholestage on 847 854 12 11.8 84.7 1.0X +day of timestamp wholestage off 821 826 7 12.2 82.1 1.0X +day of timestamp wholestage on 823 831 6 12.2 82.3 1.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor dayofyear of timestamp: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -dayofyear of timestamp wholestage off 854 855 2 11.7 85.4 1.0X -dayofyear of timestamp wholestage on 913 921 5 10.9 91.3 0.9X +dayofyear of timestamp wholestage off 871 872 1 11.5 87.1 1.0X +dayofyear of timestamp wholestage on 858 861 4 11.7 85.8 1.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor dayofmonth of timestamp: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -dayofmonth of timestamp wholestage off 849 854 8 11.8 84.9 1.0X -dayofmonth of timestamp wholestage on 848 859 9 11.8 84.8 1.0X +dayofmonth of timestamp wholestage off 832 834 3 12.0 83.2 1.0X +dayofmonth of timestamp wholestage on 823 826 3 12.1 82.3 1.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor dayofweek of timestamp: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -dayofweek of timestamp wholestage off 984 989 7 10.2 98.4 1.0X -dayofweek of timestamp wholestage on 1026 1038 7 9.7 
102.6 1.0X +dayofweek of timestamp wholestage off 969 972 5 10.3 96.9 1.0X +dayofweek of timestamp wholestage on 976 978 2 10.3 97.6 1.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor weekday of timestamp: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -weekday of timestamp wholestage off 944 945 2 10.6 94.4 1.0X -weekday of timestamp wholestage on 978 985 7 10.2 97.8 1.0X +weekday of timestamp wholestage off 941 943 3 10.6 94.1 1.0X +weekday of timestamp wholestage on 926 930 2 10.8 92.6 1.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor hour of timestamp: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -hour of timestamp wholestage off 601 604 3 16.6 60.1 1.0X -hour of timestamp wholestage on 609 613 4 16.4 60.9 1.0X +hour of timestamp wholestage off 605 610 7 16.5 60.5 1.0X +hour of timestamp wholestage on 610 613 4 16.4 61.0 1.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor minute of timestamp: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -minute of timestamp wholestage off 603 606 5 16.6 60.3 1.0X -minute of timestamp wholestage on 609 622 21 16.4 60.9 1.0X +minute of timestamp wholestage off 600 603 5 16.7 60.0 1.0X +minute of timestamp wholestage on 609 610 2 16.4 60.9 1.0X -OpenJDK 
64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor second of timestamp: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -second of timestamp wholestage off 603 604 1 16.6 60.3 1.0X -second of timestamp wholestage on 612 617 5 16.3 61.2 1.0X +second of timestamp wholestage off 604 611 10 16.6 60.4 1.0X +second of timestamp wholestage on 608 610 3 16.5 60.8 1.0X ================================================================================================ Current date and time ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor current_date: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -current_date wholestage off 188 190 2 53.1 18.8 1.0X -current_date wholestage on 213 217 3 47.0 21.3 0.9X +current_date wholestage off 183 184 1 54.6 18.3 1.0X +current_date wholestage on 216 218 3 46.3 21.6 0.8X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor current_timestamp: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -current_timestamp wholestage off 185 189 6 54.0 18.5 1.0X -current_timestamp wholestage on 225 228 2 44.4 22.5 0.8X +current_timestamp wholestage off 192 205 19 52.2 19.2 1.0X +current_timestamp wholestage on 220 231 9 
45.4 22.0 0.9X ================================================================================================ Date arithmetic ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor cast to date: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -cast to date wholestage off 754 757 5 13.3 75.4 1.0X -cast to date wholestage on 771 777 6 13.0 77.1 1.0X +cast to date wholestage off 862 863 1 11.6 86.2 1.0X +cast to date wholestage on 876 893 22 11.4 87.6 1.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor last_day: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -last_day wholestage off 854 855 1 11.7 85.4 1.0X -last_day wholestage on 868 871 3 11.5 86.8 1.0X +last_day wholestage off 967 970 4 10.3 96.7 1.0X +last_day wholestage on 982 985 3 10.2 98.2 1.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor next_day: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -next_day wholestage off 782 783 2 12.8 78.2 1.0X -next_day wholestage on 811 818 9 12.3 81.1 1.0X +next_day wholestage off 888 892 5 11.3 88.8 1.0X +next_day wholestage on 899 901 2 11.1 89.9 1.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure 
+OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor date_add: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -date_add wholestage off 735 736 1 13.6 73.5 1.0X -date_add wholestage on 754 759 8 13.3 75.4 1.0X +date_add wholestage off 843 843 1 11.9 84.3 1.0X +date_add wholestage on 875 882 12 11.4 87.5 1.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor date_sub: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -date_sub wholestage off 740 741 0 13.5 74.0 1.0X -date_sub wholestage on 753 757 6 13.3 75.3 1.0X +date_sub wholestage off 842 845 5 11.9 84.2 1.0X +date_sub wholestage on 876 883 6 11.4 87.6 1.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor add_months: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -add_months wholestage off 1142 1143 1 8.8 114.2 1.0X -add_months wholestage on 1138 1149 14 8.8 113.8 1.0X +add_months wholestage off 1182 1185 4 8.5 118.2 1.0X +add_months wholestage on 1205 1210 5 8.3 120.5 1.0X ================================================================================================ Formatting dates ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 
64-Core Processor format date: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -format date wholestage off 3591 3598 9 2.8 359.1 1.0X -format date wholestage on 3704 3724 25 2.7 370.4 1.0X +format date wholestage off 4003 4012 14 2.5 400.3 1.0X +format date wholestage on 4044 4048 5 2.5 404.4 1.0X ================================================================================================ Formatting timestamps ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor from_unixtime: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -from_unixtime wholestage off 3881 3886 7 2.6 388.1 1.0X -from_unixtime wholestage on 3844 4051 117 2.6 384.4 1.0X +from_unixtime wholestage off 4055 4059 5 2.5 405.5 1.0X +from_unixtime wholestage on 4081 4091 8 2.5 408.1 1.0X ================================================================================================ Convert timestamps ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor from_utc_timestamp: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -from_utc_timestamp wholestage off 673 677 6 14.9 67.3 1.0X -from_utc_timestamp wholestage on 782 788 6 12.8 78.2 0.9X +from_utc_timestamp wholestage off 675 
685 14 14.8 67.5 1.0X +from_utc_timestamp wholestage on 802 810 7 12.5 80.2 0.8X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor to_utc_timestamp: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -to_utc_timestamp wholestage off 1038 1038 0 9.6 103.8 1.0X -to_utc_timestamp wholestage on 1025 1031 5 9.8 102.5 1.0X +to_utc_timestamp wholestage off 1054 1055 1 9.5 105.4 1.0X +to_utc_timestamp wholestage on 1073 1076 3 9.3 107.3 1.0X ================================================================================================ Intervals ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor cast interval: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -cast interval wholestage off 250 257 10 40.1 25.0 1.0X -cast interval wholestage on 215 220 6 46.6 21.5 1.2X +cast interval wholestage off 218 221 4 45.8 21.8 1.0X +cast interval wholestage on 217 221 4 46.2 21.7 1.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor datediff: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -datediff wholestage off 1340 1342 4 7.5 134.0 1.0X -datediff wholestage on 1389 1395 5 7.2 138.9 1.0X +datediff wholestage off 1492 1500 12 6.7 149.2 
1.0X +datediff wholestage on 1514 1517 2 6.6 151.4 1.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor months_between: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -months_between wholestage off 3444 3450 8 2.9 344.4 1.0X -months_between wholestage on 3439 3453 14 2.9 343.9 1.0X +months_between wholestage off 3634 3641 10 2.8 363.4 1.0X +months_between wholestage on 3577 3583 4 2.8 357.7 1.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor window: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -window wholestage off 427 449 31 2.3 427.2 1.0X -window wholestage on 656 690 23 1.5 655.8 0.7X +window wholestage off 445 445 0 2.2 445.3 1.0X +window wholestage on 645 660 12 1.6 645.0 0.7X ================================================================================================ Truncation ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor date_trunc YEAR: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -date_trunc YEAR wholestage off 1828 1834 8 5.5 182.8 1.0X -date_trunc YEAR wholestage on 1776 1780 6 5.6 177.6 1.0X +date_trunc YEAR wholestage off 1870 1870 0 5.3 187.0 1.0X +date_trunc YEAR wholestage on 
1837 1843 8 5.4 183.7 1.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor date_trunc YYYY: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -date_trunc YYYY wholestage off 1847 1849 2 5.4 184.7 1.0X -date_trunc YYYY wholestage on 1774 1781 6 5.6 177.4 1.0X +date_trunc YYYY wholestage off 1867 1870 5 5.4 186.7 1.0X +date_trunc YYYY wholestage on 1841 1844 4 5.4 184.1 1.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor date_trunc YY: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -date_trunc YY wholestage off 1843 1844 1 5.4 184.3 1.0X -date_trunc YY wholestage on 1778 1781 2 5.6 177.8 1.0X +date_trunc YY wholestage off 1868 1871 4 5.4 186.8 1.0X +date_trunc YY wholestage on 1838 1842 4 5.4 183.8 1.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor date_trunc MON: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -date_trunc MON wholestage off 1857 1861 6 5.4 185.7 1.0X -date_trunc MON wholestage on 1786 1791 6 5.6 178.6 1.0X +date_trunc MON wholestage off 1961 1964 4 5.1 196.1 1.0X +date_trunc MON wholestage on 1880 1884 3 5.3 188.0 1.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor 
date_trunc MONTH: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -date_trunc MONTH wholestage off 1860 1871 15 5.4 186.0 1.0X -date_trunc MONTH wholestage on 1782 1789 4 5.6 178.2 1.0X +date_trunc MONTH wholestage off 1966 1966 1 5.1 196.6 1.0X +date_trunc MONTH wholestage on 1881 1884 3 5.3 188.1 1.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor date_trunc MM: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -date_trunc MM wholestage off 1860 1868 11 5.4 186.0 1.0X -date_trunc MM wholestage on 1789 1792 4 5.6 178.9 1.0X +date_trunc MM wholestage off 1966 1971 7 5.1 196.6 1.0X +date_trunc MM wholestage on 1881 1885 4 5.3 188.1 1.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor date_trunc DAY: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -date_trunc DAY wholestage off 1328 1338 14 7.5 132.8 1.0X -date_trunc DAY wholestage on 1281 1286 5 7.8 128.1 1.0X +date_trunc DAY wholestage off 1318 1319 1 7.6 131.8 1.0X +date_trunc DAY wholestage on 1278 1284 5 7.8 127.8 1.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor date_trunc DD: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
------------------------------------------------------------------------------------------------------------------------ -date_trunc DD wholestage off 1330 1335 6 7.5 133.0 1.0X -date_trunc DD wholestage on 1277 1280 2 7.8 127.7 1.0X +date_trunc DD wholestage off 1310 1312 2 7.6 131.0 1.0X +date_trunc DD wholestage on 1280 1285 6 7.8 128.0 1.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor date_trunc HOUR: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -date_trunc HOUR wholestage off 1342 1347 7 7.5 134.2 1.0X -date_trunc HOUR wholestage on 1281 1285 3 7.8 128.1 1.0X +date_trunc HOUR wholestage off 1325 1328 5 7.5 132.5 1.0X +date_trunc HOUR wholestage on 1288 1294 4 7.8 128.8 1.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor date_trunc MINUTE: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -date_trunc MINUTE wholestage off 1344 1346 2 7.4 134.4 1.0X -date_trunc MINUTE wholestage on 1306 1310 3 7.7 130.6 1.0X +date_trunc MINUTE wholestage off 1335 1339 5 7.5 133.5 1.0X +date_trunc MINUTE wholestage on 1316 1321 4 7.6 131.6 1.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor date_trunc SECOND: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -date_trunc SECOND wholestage off 331 334 4 30.2 33.1 1.0X 
-date_trunc SECOND wholestage on 278 282 6 35.9 27.8 1.2X +date_trunc SECOND wholestage off 317 321 6 31.6 31.7 1.0X +date_trunc SECOND wholestage on 276 279 5 36.3 27.6 1.2X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor date_trunc WEEK: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -date_trunc WEEK wholestage off 1742 1746 5 5.7 174.2 1.0X -date_trunc WEEK wholestage on 1688 1692 7 5.9 168.8 1.0X +date_trunc WEEK wholestage off 1812 1816 6 5.5 181.2 1.0X +date_trunc WEEK wholestage on 1764 1768 3 5.7 176.4 1.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor date_trunc QUARTER: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -date_trunc QUARTER wholestage off 2385 2385 1 4.2 238.5 1.0X -date_trunc QUARTER wholestage on 2479 2495 32 4.0 247.9 1.0X +date_trunc QUARTER wholestage off 2664 2666 2 3.8 266.4 1.0X +date_trunc QUARTER wholestage on 2670 2684 16 3.7 267.0 1.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor trunc year: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -trunc year wholestage off 1025 1025 1 9.8 102.5 1.0X -trunc year wholestage on 995 1003 8 10.0 99.5 1.0X +trunc year wholestage off 1123 1123 0 8.9 112.3 1.0X +trunc year wholestage on 1082 1085 3 9.2 108.2 1.0X -OpenJDK 64-Bit 
Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor trunc yyyy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -trunc yyyy wholestage off 1024 1027 4 9.8 102.4 1.0X -trunc yyyy wholestage on 995 999 4 10.1 99.5 1.0X +trunc yyyy wholestage off 1119 1119 1 8.9 111.9 1.0X +trunc yyyy wholestage on 1081 1092 16 9.3 108.1 1.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor trunc yy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -trunc yy wholestage off 1026 1026 0 9.8 102.6 1.0X -trunc yy wholestage on 999 1001 2 10.0 99.9 1.0X +trunc yy wholestage off 1121 1122 2 8.9 112.1 1.0X +trunc yy wholestage on 1083 1088 10 9.2 108.3 1.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor trunc mon: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -trunc mon wholestage off 998 1000 3 10.0 99.8 1.0X -trunc mon wholestage on 952 953 1 10.5 95.2 1.0X +trunc mon wholestage off 1110 1123 18 9.0 111.0 1.0X +trunc mon wholestage on 1052 1061 12 9.5 105.2 1.1X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor trunc month: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
------------------------------------------------------------------------------------------------------------------------ -trunc month wholestage off 999 1000 1 10.0 99.9 1.0X -trunc month wholestage on 951 961 18 10.5 95.1 1.1X +trunc month wholestage off 1102 1105 4 9.1 110.2 1.0X +trunc month wholestage on 1054 1057 2 9.5 105.4 1.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor trunc mm: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -trunc mm wholestage off 1001 1003 3 10.0 100.1 1.0X -trunc mm wholestage on 951 953 2 10.5 95.1 1.1X +trunc mm wholestage off 1103 1103 0 9.1 110.3 1.0X +trunc mm wholestage on 1056 1067 12 9.5 105.6 1.0X ================================================================================================ Parsing ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor to timestamp str: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -to timestamp str wholestage off 104 113 12 9.6 104.0 1.0X -to timestamp str wholestage on 100 103 3 10.0 99.5 1.0X +to timestamp str wholestage off 97 98 2 10.3 96.7 1.0X +to timestamp str wholestage on 99 102 4 10.1 99.1 1.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor to_timestamp: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
------------------------------------------------------------------------------------------------------------------------ -to_timestamp wholestage off 760 763 3 1.3 760.4 1.0X -to_timestamp wholestage on 757 766 12 1.3 757.5 1.0X +to_timestamp wholestage off 721 721 1 1.4 720.6 1.0X +to_timestamp wholestage on 724 728 4 1.4 723.9 1.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor to_unix_timestamp: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -to_unix_timestamp wholestage off 754 754 0 1.3 753.6 1.0X -to_unix_timestamp wholestage on 742 743 2 1.3 742.0 1.0X +to_unix_timestamp wholestage off 731 731 0 1.4 730.6 1.0X +to_unix_timestamp wholestage on 731 734 4 1.4 731.3 1.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor to date str: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -to date str wholestage off 137 141 6 7.3 137.0 1.0X -to date str wholestage on 130 136 3 7.7 130.0 1.1X +to date str wholestage off 133 134 2 7.5 132.8 1.0X +to date str wholestage on 131 134 3 7.6 131.0 1.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor to_date: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -to_date wholestage off 655 656 1 1.5 655.0 1.0X -to_date wholestage on 637 642 5 1.6 636.8 1.0X +to_date wholestage 
off 648 648 0 1.5 647.9 1.0X +to_date wholestage on 640 643 2 1.6 640.0 1.0X ================================================================================================ Conversion from/to external types ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor To/from Java's date-time: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -From java.sql.Date 266 269 3 18.8 53.2 1.0X -From java.time.LocalDate 286 294 12 17.5 57.2 0.9X -Collect java.sql.Date 1152 1337 171 4.3 230.5 0.2X -Collect java.time.LocalDate 962 1123 144 5.2 192.4 0.3X -From java.sql.Timestamp 197 202 5 25.3 39.5 1.3X -From java.time.Instant 176 196 33 28.3 35.3 1.5X -Collect longs 847 1023 198 5.9 169.4 0.3X -Collect java.sql.Timestamp 1160 1208 80 4.3 232.0 0.2X -Collect java.time.Instant 1083 1158 78 4.6 216.6 0.2X -java.sql.Date to Hive string 4114 4175 91 1.2 822.8 0.1X -java.time.LocalDate to Hive string 3656 3737 98 1.4 731.2 0.1X -java.sql.Timestamp to Hive string 6474 6727 243 0.8 1294.8 0.0X -java.time.Instant to Hive string 5303 5420 117 0.9 1060.6 0.1X +From java.sql.Date 281 282 2 17.8 56.1 1.0X +From java.time.LocalDate 280 283 4 17.8 56.0 1.0X +Collect java.sql.Date 1328 1427 118 3.8 265.6 0.2X +Collect java.time.LocalDate 984 1125 124 5.1 196.7 0.3X +From java.sql.Timestamp 199 204 6 25.1 39.8 1.4X +From java.time.Instant 181 183 2 27.7 36.1 1.6X +Collect longs 945 998 46 5.3 189.0 0.3X +Collect java.sql.Timestamp 1008 1209 196 5.0 201.6 0.3X +Collect java.time.Instant 822 1017 206 6.1 164.4 0.3X +java.sql.Date to Hive string 3880 4013 125 1.3 775.9 0.1X +java.time.LocalDate to Hive string 3584 3632 45 1.4 716.8 0.1X +java.sql.Timestamp to 
Hive string 6366 6433 58 0.8 1273.2 0.0X +java.time.Instant to Hive string 5133 5224 113 1.0 1026.7 0.1X diff --git a/sql/core/benchmarks/DateTimeRebaseBenchmark-jdk21-results.txt b/sql/core/benchmarks/DateTimeRebaseBenchmark-jdk21-results.txt index c15fb78f2f165..fbe7b36bffa60 100644 --- a/sql/core/benchmarks/DateTimeRebaseBenchmark-jdk21-results.txt +++ b/sql/core/benchmarks/DateTimeRebaseBenchmark-jdk21-results.txt @@ -2,153 +2,153 @@ Rebasing dates/timestamps in Parquet datasource ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Save DATE to parquet: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -after 1582, noop 11193 11193 0 8.9 111.9 1.0X -before 1582, noop 7070 7070 0 14.1 70.7 1.6X -after 1582, rebase EXCEPTION 19836 19836 0 5.0 198.4 0.6X -after 1582, rebase LEGACY 19368 19368 0 5.2 193.7 0.6X -after 1582, rebase CORRECTED 19627 19627 0 5.1 196.3 0.6X -before 1582, rebase LEGACY 16301 16301 0 6.1 163.0 0.7X -before 1582, rebase CORRECTED 15612 15612 0 6.4 156.1 0.7X - -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +after 1582, noop 11144 11144 0 9.0 111.4 1.0X +before 1582, noop 7066 7066 0 14.2 70.7 1.6X +after 1582, rebase EXCEPTION 19440 19440 0 5.1 194.4 0.6X +after 1582, rebase LEGACY 19280 19280 0 5.2 192.8 0.6X +after 1582, rebase CORRECTED 19431 19431 0 5.1 194.3 0.6X +before 1582, rebase LEGACY 15530 15530 0 6.4 155.3 0.7X +before 1582, rebase CORRECTED 15717 15717 0 6.4 157.2 0.7X + +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Load DATE from parquet: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
------------------------------------------------------------------------------------------------------------------------ -after 1582, vec off, rebase EXCEPTION 11601 11657 95 8.6 116.0 1.0X -after 1582, vec off, rebase LEGACY 11671 11751 72 8.6 116.7 1.0X -after 1582, vec off, rebase CORRECTED 11593 11632 67 8.6 115.9 1.0X -after 1582, vec on, rebase EXCEPTION 2394 2427 33 41.8 23.9 4.8X -after 1582, vec on, rebase LEGACY 2466 2489 38 40.6 24.7 4.7X -after 1582, vec on, rebase CORRECTED 2487 2500 18 40.2 24.9 4.7X -before 1582, vec off, rebase LEGACY 11937 11951 14 8.4 119.4 1.0X -before 1582, vec off, rebase CORRECTED 11542 11600 60 8.7 115.4 1.0X -before 1582, vec on, rebase LEGACY 2708 2729 30 36.9 27.1 4.3X -before 1582, vec on, rebase CORRECTED 2436 2445 8 41.1 24.4 4.8X - -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +after 1582, vec off, rebase EXCEPTION 11704 11746 52 8.5 117.0 1.0X +after 1582, vec off, rebase LEGACY 11525 11573 66 8.7 115.2 1.0X +after 1582, vec off, rebase CORRECTED 11505 11532 25 8.7 115.0 1.0X +after 1582, vec on, rebase EXCEPTION 2347 2370 21 42.6 23.5 5.0X +after 1582, vec on, rebase LEGACY 2450 2453 2 40.8 24.5 4.8X +after 1582, vec on, rebase CORRECTED 2431 2446 17 41.1 24.3 4.8X +before 1582, vec off, rebase LEGACY 11748 11779 39 8.5 117.5 1.0X +before 1582, vec off, rebase CORRECTED 11591 11630 33 8.6 115.9 1.0X +before 1582, vec on, rebase LEGACY 2781 2786 6 36.0 27.8 4.2X +before 1582, vec on, rebase CORRECTED 2420 2425 7 41.3 24.2 4.8X + +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Save TIMESTAMP_INT96 to parquet: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -after 1900, noop 2403 2403 0 41.6 24.0 1.0X -before 1900, noop 2443 2443 0 40.9 24.4 1.0X -after 1900, rebase EXCEPTION 12805 12805 0 7.8 128.1 0.2X -after 
1900, rebase LEGACY 12529 12529 0 8.0 125.3 0.2X -after 1900, rebase CORRECTED 12474 12474 0 8.0 124.7 0.2X -before 1900, rebase LEGACY 14628 14628 0 6.8 146.3 0.2X -before 1900, rebase CORRECTED 12601 12601 0 7.9 126.0 0.2X - -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +after 1900, noop 2518 2518 0 39.7 25.2 1.0X +before 1900, noop 2375 2375 0 42.1 23.8 1.1X +after 1900, rebase EXCEPTION 13654 13654 0 7.3 136.5 0.2X +after 1900, rebase LEGACY 13187 13187 0 7.6 131.9 0.2X +after 1900, rebase CORRECTED 13174 13174 0 7.6 131.7 0.2X +before 1900, rebase LEGACY 15129 15129 0 6.6 151.3 0.2X +before 1900, rebase CORRECTED 13438 13438 0 7.4 134.4 0.2X + +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Load TIMESTAMP_INT96 from parquet: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -after 1900, vec off, rebase EXCEPTION 15545 15658 99 6.4 155.4 1.0X -after 1900, vec off, rebase LEGACY 15927 15945 25 6.3 159.3 1.0X -after 1900, vec off, rebase CORRECTED 15558 15620 54 6.4 155.6 1.0X -after 1900, vec on, rebase EXCEPTION 4050 4074 34 24.7 40.5 3.8X -after 1900, vec on, rebase LEGACY 4024 4059 32 24.9 40.2 3.9X -after 1900, vec on, rebase CORRECTED 4062 4074 17 24.6 40.6 3.8X -before 1900, vec off, rebase LEGACY 18219 18234 22 5.5 182.2 0.9X -before 1900, vec off, rebase CORRECTED 15584 15633 45 6.4 155.8 1.0X -before 1900, vec on, rebase LEGACY 6080 6106 23 16.4 60.8 2.6X -before 1900, vec on, rebase CORRECTED 4045 4057 14 24.7 40.4 3.8X - -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +after 1900, vec off, rebase EXCEPTION 16824 16877 53 5.9 168.2 1.0X +after 1900, vec off, rebase LEGACY 16304 16337 31 6.1 163.0 1.0X +after 1900, vec off, rebase CORRECTED 16164 16239 76 6.2 161.6 1.0X +after 1900, vec on, rebase EXCEPTION 4041 4045 7 24.7 40.4 
4.2X +after 1900, vec on, rebase LEGACY 4015 4039 36 24.9 40.2 4.2X +after 1900, vec on, rebase CORRECTED 4012 4040 27 24.9 40.1 4.2X +before 1900, vec off, rebase LEGACY 18457 18537 71 5.4 184.6 0.9X +before 1900, vec off, rebase CORRECTED 16232 16269 55 6.2 162.3 1.0X +before 1900, vec on, rebase LEGACY 6106 6120 17 16.4 61.1 2.8X +before 1900, vec on, rebase CORRECTED 4018 4053 48 24.9 40.2 4.2X + +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Save TIMESTAMP_MICROS to parquet: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -after 1900, noop 2378 2378 0 42.0 23.8 1.0X -before 1900, noop 2426 2426 0 41.2 24.3 1.0X -after 1900, rebase EXCEPTION 14475 14475 0 6.9 144.8 0.2X -after 1900, rebase LEGACY 13685 13685 0 7.3 136.8 0.2X -after 1900, rebase CORRECTED 13448 13448 0 7.4 134.5 0.2X -before 1900, rebase LEGACY 15085 15085 0 6.6 150.8 0.2X -before 1900, rebase CORRECTED 13668 13668 0 7.3 136.7 0.2X - -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +after 1900, noop 2412 2412 0 41.5 24.1 1.0X +before 1900, noop 2413 2413 0 41.4 24.1 1.0X +after 1900, rebase EXCEPTION 11406 11406 0 8.8 114.1 0.2X +after 1900, rebase LEGACY 11249 11249 0 8.9 112.5 0.2X +after 1900, rebase CORRECTED 11318 11318 0 8.8 113.2 0.2X +before 1900, rebase LEGACY 13104 13104 0 7.6 131.0 0.2X +before 1900, rebase CORRECTED 11269 11269 0 8.9 112.7 0.2X + +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Load TIMESTAMP_MICROS from parquet: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -after 1900, vec off, rebase EXCEPTION 14595 14621 23 6.9 146.0 1.0X -after 1900, vec off, rebase LEGACY 14689 14699 
12 6.8 146.9 1.0X -after 1900, vec off, rebase CORRECTED 14626 14648 25 6.8 146.3 1.0X -after 1900, vec on, rebase EXCEPTION 3732 3745 14 26.8 37.3 3.9X -after 1900, vec on, rebase LEGACY 3753 3771 29 26.6 37.5 3.9X -after 1900, vec on, rebase CORRECTED 3714 3734 23 26.9 37.1 3.9X -before 1900, vec off, rebase LEGACY 17073 17151 107 5.9 170.7 0.9X -before 1900, vec off, rebase CORRECTED 14575 14613 33 6.9 145.8 1.0X -before 1900, vec on, rebase LEGACY 5581 5602 34 17.9 55.8 2.6X -before 1900, vec on, rebase CORRECTED 3680 3698 30 27.2 36.8 4.0X - -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +after 1900, vec off, rebase EXCEPTION 15047 15100 59 6.6 150.5 1.0X +after 1900, vec off, rebase LEGACY 14965 15033 59 6.7 149.7 1.0X +after 1900, vec off, rebase CORRECTED 15041 15064 35 6.6 150.4 1.0X +after 1900, vec on, rebase EXCEPTION 3714 3737 21 26.9 37.1 4.1X +after 1900, vec on, rebase LEGACY 3748 3766 26 26.7 37.5 4.0X +after 1900, vec on, rebase CORRECTED 3733 3743 14 26.8 37.3 4.0X +before 1900, vec off, rebase LEGACY 17297 17350 52 5.8 173.0 0.9X +before 1900, vec off, rebase CORRECTED 14977 14993 24 6.7 149.8 1.0X +before 1900, vec on, rebase LEGACY 5709 5720 10 17.5 57.1 2.6X +before 1900, vec on, rebase CORRECTED 3696 3717 19 27.1 37.0 4.1X + +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Save TIMESTAMP_MILLIS to parquet: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -after 1900, noop 2418 2418 0 41.4 24.2 1.0X -before 1900, noop 2413 2413 0 41.5 24.1 1.0X -after 1900, rebase EXCEPTION 11749 11749 0 8.5 117.5 0.2X -after 1900, rebase LEGACY 11757 11757 0 8.5 117.6 0.2X -after 1900, rebase CORRECTED 12081 12081 0 8.3 120.8 0.2X -before 1900, rebase LEGACY 13503 13503 0 7.4 135.0 0.2X -before 1900, rebase CORRECTED 11649 11649 0 8.6 116.5 0.2X - 
-OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +after 1900, noop 2382 2382 0 42.0 23.8 1.0X +before 1900, noop 2414 2414 0 41.4 24.1 1.0X +after 1900, rebase EXCEPTION 11542 11542 0 8.7 115.4 0.2X +after 1900, rebase LEGACY 11074 11074 0 9.0 110.7 0.2X +after 1900, rebase CORRECTED 11275 11275 0 8.9 112.7 0.2X +before 1900, rebase LEGACY 13166 13166 0 7.6 131.7 0.2X +before 1900, rebase CORRECTED 11341 11341 0 8.8 113.4 0.2X + +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Load TIMESTAMP_MILLIS from parquet: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -after 1900, vec off, rebase EXCEPTION 14322 14376 53 7.0 143.2 1.0X -after 1900, vec off, rebase LEGACY 14453 14478 22 6.9 144.5 1.0X -after 1900, vec off, rebase CORRECTED 14429 14523 94 6.9 144.3 1.0X -after 1900, vec on, rebase EXCEPTION 4876 4914 42 20.5 48.8 2.9X -after 1900, vec on, rebase LEGACY 4418 4465 41 22.6 44.2 3.2X -after 1900, vec on, rebase CORRECTED 4876 4909 50 20.5 48.8 2.9X -before 1900, vec off, rebase LEGACY 17196 17238 45 5.8 172.0 0.8X -before 1900, vec off, rebase CORRECTED 14462 14509 57 6.9 144.6 1.0X -before 1900, vec on, rebase LEGACY 6120 6135 16 16.3 61.2 2.3X -before 1900, vec on, rebase CORRECTED 4887 4929 42 20.5 48.9 2.9X +after 1900, vec off, rebase EXCEPTION 14250 14288 40 7.0 142.5 1.0X +after 1900, vec off, rebase LEGACY 14235 14315 82 7.0 142.4 1.0X +after 1900, vec off, rebase CORRECTED 14284 14304 25 7.0 142.8 1.0X +after 1900, vec on, rebase EXCEPTION 4925 4941 27 20.3 49.2 2.9X +after 1900, vec on, rebase LEGACY 4489 4499 10 22.3 44.9 3.2X +after 1900, vec on, rebase CORRECTED 4916 4943 33 20.3 49.2 2.9X +before 1900, vec off, rebase LEGACY 16801 16813 11 6.0 168.0 0.8X +before 1900, vec off, rebase CORRECTED 14259 14307 50 7.0 142.6 1.0X +before 1900, vec on, 
rebase LEGACY 5958 5966 7 16.8 59.6 2.4X +before 1900, vec on, rebase CORRECTED 4900 4920 19 20.4 49.0 2.9X ================================================================================================ Rebasing dates/timestamps in ORC datasource ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Save DATE to ORC: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -after 1582, noop 12614 12614 0 7.9 126.1 1.0X -before 1582, noop 6620 6620 0 15.1 66.2 1.9X -after 1582 17066 17066 0 5.9 170.7 0.7X -before 1582 10573 10573 0 9.5 105.7 1.2X +after 1582, noop 10754 10754 0 9.3 107.5 1.0X +before 1582, noop 6783 6783 0 14.7 67.8 1.6X +after 1582 15425 15425 0 6.5 154.2 0.7X +before 1582 10856 10856 0 9.2 108.6 1.0X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Load DATE from ORC: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -after 1582, vec off 8422 8523 131 11.9 84.2 1.0X -after 1582, vec on 2386 2401 20 41.9 23.9 3.5X -before 1582, vec off 8447 8474 42 11.8 84.5 1.0X -before 1582, vec on 2526 2542 24 39.6 25.3 3.3X +after 1582, vec off 8437 8530 90 11.9 84.4 1.0X +after 1582, vec on 2419 2430 10 41.3 24.2 3.5X +before 1582, vec off 8505 8526 19 11.8 85.1 1.0X +before 1582, vec on 2557 2566 11 39.1 25.6 3.3X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor 
Save TIMESTAMP to ORC: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -after 1900, noop 2336 2336 0 42.8 23.4 1.0X -before 1900, noop 2309 2309 0 43.3 23.1 1.0X -after 1900 9646 9646 0 10.4 96.5 0.2X -before 1900 12150 12150 0 8.2 121.5 0.2X +after 1900, noop 2308 2308 0 43.3 23.1 1.0X +before 1900, noop 2302 2302 0 43.4 23.0 1.0X +after 1900 9526 9526 0 10.5 95.3 0.2X +before 1900 11558 11558 0 8.7 115.6 0.2X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Load TIMESTAMP from ORC: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -after 1900, vec off 9781 9845 86 10.2 97.8 1.0X -after 1900, vec on 3778 3792 13 26.5 37.8 2.6X -before 1900, vec off 11757 11781 21 8.5 117.6 0.8X -before 1900, vec on 5490 5511 21 18.2 54.9 1.8X +after 1900, vec off 10757 10772 13 9.3 107.6 1.0X +after 1900, vec on 3892 3899 11 25.7 38.9 2.8X +before 1900, vec off 13141 13195 52 7.6 131.4 0.8X +before 1900, vec on 6226 6301 129 16.1 62.3 1.7X diff --git a/sql/core/benchmarks/DateTimeRebaseBenchmark-results.txt b/sql/core/benchmarks/DateTimeRebaseBenchmark-results.txt index 249b478e772a8..eed620cdeced6 100644 --- a/sql/core/benchmarks/DateTimeRebaseBenchmark-results.txt +++ b/sql/core/benchmarks/DateTimeRebaseBenchmark-results.txt @@ -2,153 +2,153 @@ Rebasing dates/timestamps in Parquet datasource ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Save DATE to parquet: Best Time(ms) Avg 
Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -after 1582, noop 11170 11170 0 9.0 111.7 1.0X -before 1582, noop 6505 6505 0 15.4 65.0 1.7X -after 1582, rebase EXCEPTION 19873 19873 0 5.0 198.7 0.6X -after 1582, rebase LEGACY 19726 19726 0 5.1 197.3 0.6X -after 1582, rebase CORRECTED 19931 19931 0 5.0 199.3 0.6X -before 1582, rebase LEGACY 15590 15590 0 6.4 155.9 0.7X -before 1582, rebase CORRECTED 15523 15523 0 6.4 155.2 0.7X - -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +after 1582, noop 13169 13169 0 7.6 131.7 1.0X +before 1582, noop 7787 7787 0 12.8 77.9 1.7X +after 1582, rebase EXCEPTION 21399 21399 0 4.7 214.0 0.6X +after 1582, rebase LEGACY 21530 21530 0 4.6 215.3 0.6X +after 1582, rebase CORRECTED 21579 21579 0 4.6 215.8 0.6X +before 1582, rebase LEGACY 16095 16095 0 6.2 160.9 0.8X +before 1582, rebase CORRECTED 16011 16011 0 6.2 160.1 0.8X + +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Load DATE from parquet: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -after 1582, vec off, rebase EXCEPTION 11571 11596 22 8.6 115.7 1.0X -after 1582, vec off, rebase LEGACY 11892 11909 27 8.4 118.9 1.0X -after 1582, vec off, rebase CORRECTED 11681 11724 47 8.6 116.8 1.0X -after 1582, vec on, rebase EXCEPTION 2516 2530 13 39.7 25.2 4.6X -after 1582, vec on, rebase LEGACY 2555 2563 8 39.1 25.5 4.5X -after 1582, vec on, rebase CORRECTED 2487 2503 22 40.2 24.9 4.7X -before 1582, vec off, rebase LEGACY 11947 11996 69 8.4 119.5 1.0X -before 1582, vec off, rebase CORRECTED 11792 11821 41 8.5 117.9 1.0X -before 1582, vec on, rebase LEGACY 2826 2856 25 35.4 28.3 4.1X -before 1582, vec on, rebase CORRECTED 2465 2489 21 40.6 24.6 4.7X - -OpenJDK 
64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +after 1582, vec off, rebase EXCEPTION 11598 11638 62 8.6 116.0 1.0X +after 1582, vec off, rebase LEGACY 11716 11730 17 8.5 117.2 1.0X +after 1582, vec off, rebase CORRECTED 11556 11616 52 8.7 115.6 1.0X +after 1582, vec on, rebase EXCEPTION 2528 2537 10 39.6 25.3 4.6X +after 1582, vec on, rebase LEGACY 2564 2569 6 39.0 25.6 4.5X +after 1582, vec on, rebase CORRECTED 2487 2534 44 40.2 24.9 4.7X +before 1582, vec off, rebase LEGACY 11740 11799 56 8.5 117.4 1.0X +before 1582, vec off, rebase CORRECTED 11606 11656 50 8.6 116.1 1.0X +before 1582, vec on, rebase LEGACY 2840 2871 27 35.2 28.4 4.1X +before 1582, vec on, rebase CORRECTED 2401 2429 31 41.6 24.0 4.8X + +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Save TIMESTAMP_INT96 to parquet: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -after 1900, noop 2340 2340 0 42.7 23.4 1.0X -before 1900, noop 2284 2284 0 43.8 22.8 1.0X -after 1900, rebase EXCEPTION 13230 13230 0 7.6 132.3 0.2X -after 1900, rebase LEGACY 13238 13238 0 7.6 132.4 0.2X -after 1900, rebase CORRECTED 13264 13264 0 7.5 132.6 0.2X -before 1900, rebase LEGACY 15216 15216 0 6.6 152.2 0.2X -before 1900, rebase CORRECTED 13382 13382 0 7.5 133.8 0.2X - -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +after 1900, noop 2304 2304 0 43.4 23.0 1.0X +before 1900, noop 2242 2242 0 44.6 22.4 1.0X +after 1900, rebase EXCEPTION 13198 13198 0 7.6 132.0 0.2X +after 1900, rebase LEGACY 12894 12894 0 7.8 128.9 0.2X +after 1900, rebase CORRECTED 12991 12991 0 7.7 129.9 0.2X +before 1900, rebase LEGACY 14288 14288 0 7.0 142.9 0.2X +before 1900, rebase CORRECTED 12614 12614 0 7.9 126.1 0.2X + +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Load TIMESTAMP_INT96 
from parquet: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -after 1900, vec off, rebase EXCEPTION 16170 16195 22 6.2 161.7 1.0X -after 1900, vec off, rebase LEGACY 16720 16755 31 6.0 167.2 1.0X -after 1900, vec off, rebase CORRECTED 16152 16213 54 6.2 161.5 1.0X -after 1900, vec on, rebase EXCEPTION 4090 4101 18 24.4 40.9 4.0X -after 1900, vec on, rebase LEGACY 4114 4144 33 24.3 41.1 3.9X -after 1900, vec on, rebase CORRECTED 4158 4191 28 24.0 41.6 3.9X -before 1900, vec off, rebase LEGACY 18554 18584 31 5.4 185.5 0.9X -before 1900, vec off, rebase CORRECTED 16192 16267 84 6.2 161.9 1.0X -before 1900, vec on, rebase LEGACY 6256 6271 22 16.0 62.6 2.6X -before 1900, vec on, rebase CORRECTED 4074 4104 27 24.5 40.7 4.0X - -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +after 1900, vec off, rebase EXCEPTION 15238 15263 23 6.6 152.4 1.0X +after 1900, vec off, rebase LEGACY 14777 14793 22 6.8 147.8 1.0X +after 1900, vec off, rebase CORRECTED 14578 14650 107 6.9 145.8 1.0X +after 1900, vec on, rebase EXCEPTION 4051 4103 67 24.7 40.5 3.8X +after 1900, vec on, rebase LEGACY 4097 4123 34 24.4 41.0 3.7X +after 1900, vec on, rebase CORRECTED 4080 4092 16 24.5 40.8 3.7X +before 1900, vec off, rebase LEGACY 17402 17431 26 5.7 174.0 0.9X +before 1900, vec off, rebase CORRECTED 15337 15394 51 6.5 153.4 1.0X +before 1900, vec on, rebase LEGACY 6180 6197 17 16.2 61.8 2.5X +before 1900, vec on, rebase CORRECTED 4082 4094 14 24.5 40.8 3.7X + +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Save TIMESTAMP_MICROS to parquet: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -after 1900, noop 2390 2390 0 41.8 23.9 1.0X -before 1900, noop 2291 
2291 0 43.6 22.9 1.0X -after 1900, rebase EXCEPTION 12537 12537 0 8.0 125.4 0.2X -after 1900, rebase LEGACY 12047 12047 0 8.3 120.5 0.2X -after 1900, rebase CORRECTED 12151 12151 0 8.2 121.5 0.2X -before 1900, rebase LEGACY 13960 13960 0 7.2 139.6 0.2X -before 1900, rebase CORRECTED 11985 11985 0 8.3 119.9 0.2X - -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +after 1900, noop 2285 2285 0 43.8 22.9 1.0X +before 1900, noop 2287 2287 0 43.7 22.9 1.0X +after 1900, rebase EXCEPTION 12295 12295 0 8.1 122.9 0.2X +after 1900, rebase LEGACY 11653 11653 0 8.6 116.5 0.2X +after 1900, rebase CORRECTED 11718 11718 0 8.5 117.2 0.2X +before 1900, rebase LEGACY 13462 13462 0 7.4 134.6 0.2X +before 1900, rebase CORRECTED 11886 11886 0 8.4 118.9 0.2X + +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Load TIMESTAMP_MICROS from parquet: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -after 1900, vec off, rebase EXCEPTION 14977 15018 36 6.7 149.8 1.0X -after 1900, vec off, rebase LEGACY 14924 14960 33 6.7 149.2 1.0X -after 1900, vec off, rebase CORRECTED 14965 14994 26 6.7 149.7 1.0X -after 1900, vec on, rebase EXCEPTION 3810 3819 8 26.2 38.1 3.9X -after 1900, vec on, rebase LEGACY 3829 3835 8 26.1 38.3 3.9X -after 1900, vec on, rebase CORRECTED 3785 3837 47 26.4 37.9 4.0X -before 1900, vec off, rebase LEGACY 17323 17343 19 5.8 173.2 0.9X -before 1900, vec off, rebase CORRECTED 14933 14962 26 6.7 149.3 1.0X -before 1900, vec on, rebase LEGACY 5763 5783 17 17.4 57.6 2.6X -before 1900, vec on, rebase CORRECTED 3798 3817 32 26.3 38.0 3.9X - -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +after 1900, vec off, rebase EXCEPTION 14824 14877 57 6.7 148.2 1.0X +after 1900, vec off, rebase LEGACY 14876 14899 26 6.7 148.8 1.0X +after 1900, vec off, rebase CORRECTED 
14924 14947 24 6.7 149.2 1.0X +after 1900, vec on, rebase EXCEPTION 3813 3817 5 26.2 38.1 3.9X +after 1900, vec on, rebase LEGACY 3829 3855 28 26.1 38.3 3.9X +after 1900, vec on, rebase CORRECTED 3803 3811 11 26.3 38.0 3.9X +before 1900, vec off, rebase LEGACY 17141 17177 53 5.8 171.4 0.9X +before 1900, vec off, rebase CORRECTED 14916 14936 26 6.7 149.2 1.0X +before 1900, vec on, rebase LEGACY 5638 5656 15 17.7 56.4 2.6X +before 1900, vec on, rebase CORRECTED 3792 3820 43 26.4 37.9 3.9X + +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Save TIMESTAMP_MILLIS to parquet: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -after 1900, noop 2309 2309 0 43.3 23.1 1.0X -before 1900, noop 2358 2358 0 42.4 23.6 1.0X -after 1900, rebase EXCEPTION 11266 11266 0 8.9 112.7 0.2X -after 1900, rebase LEGACY 11582 11582 0 8.6 115.8 0.2X -after 1900, rebase CORRECTED 11555 11555 0 8.7 115.5 0.2X -before 1900, rebase LEGACY 13600 13600 0 7.4 136.0 0.2X -before 1900, rebase CORRECTED 12113 12113 0 8.3 121.1 0.2X - -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +after 1900, noop 2283 2283 0 43.8 22.8 1.0X +before 1900, noop 2286 2286 0 43.8 22.9 1.0X +after 1900, rebase EXCEPTION 11040 11040 0 9.1 110.4 0.2X +after 1900, rebase LEGACY 11421 11421 0 8.8 114.2 0.2X +after 1900, rebase CORRECTED 11132 11132 0 9.0 111.3 0.2X +before 1900, rebase LEGACY 13097 13097 0 7.6 131.0 0.2X +before 1900, rebase CORRECTED 11359 11359 0 8.8 113.6 0.2X + +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Load TIMESTAMP_MILLIS from parquet: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -after 1900, vec off, 
rebase EXCEPTION 15076 15125 46 6.6 150.8 1.0X -after 1900, vec off, rebase LEGACY 15480 15491 16 6.5 154.8 1.0X -after 1900, vec off, rebase CORRECTED 15171 15189 17 6.6 151.7 1.0X -after 1900, vec on, rebase EXCEPTION 3976 4001 28 25.2 39.8 3.8X -after 1900, vec on, rebase LEGACY 4582 4609 46 21.8 45.8 3.3X -after 1900, vec on, rebase CORRECTED 3934 3953 29 25.4 39.3 3.8X -before 1900, vec off, rebase LEGACY 17602 17644 37 5.7 176.0 0.9X -before 1900, vec off, rebase CORRECTED 15201 15238 34 6.6 152.0 1.0X -before 1900, vec on, rebase LEGACY 6306 6311 6 15.9 63.1 2.4X -before 1900, vec on, rebase CORRECTED 3926 3961 50 25.5 39.3 3.8X +after 1900, vec off, rebase EXCEPTION 15017 15053 38 6.7 150.2 1.0X +after 1900, vec off, rebase LEGACY 14941 15013 75 6.7 149.4 1.0X +after 1900, vec off, rebase CORRECTED 15057 15070 17 6.6 150.6 1.0X +after 1900, vec on, rebase EXCEPTION 3942 3949 7 25.4 39.4 3.8X +after 1900, vec on, rebase LEGACY 4605 4628 26 21.7 46.1 3.3X +after 1900, vec on, rebase CORRECTED 4002 4027 22 25.0 40.0 3.8X +before 1900, vec off, rebase LEGACY 17121 17169 47 5.8 171.2 0.9X +before 1900, vec off, rebase CORRECTED 15086 15132 42 6.6 150.9 1.0X +before 1900, vec on, rebase LEGACY 6262 6271 10 16.0 62.6 2.4X +before 1900, vec on, rebase CORRECTED 3942 3960 24 25.4 39.4 3.8X ================================================================================================ Rebasing dates/timestamps in ORC datasource ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Save DATE to ORC: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -after 1582, noop 11208 11208 0 8.9 112.1 1.0X -before 1582, noop 6567 6567 0 
15.2 65.7 1.7X -after 1582 15130 15130 0 6.6 151.3 0.7X -before 1582 10992 10992 0 9.1 109.9 1.0X +after 1582, noop 13322 13322 0 7.5 133.2 1.0X +before 1582, noop 7967 7967 0 12.6 79.7 1.7X +after 1582 17193 17193 0 5.8 171.9 0.8X +before 1582 12729 12729 0 7.9 127.3 1.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Load DATE from ORC: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -after 1582, vec off 8770 8777 8 11.4 87.7 1.0X -after 1582, vec on 2445 2478 31 40.9 24.5 3.6X -before 1582, vec off 8820 8896 106 11.3 88.2 1.0X -before 1582, vec on 2580 2615 37 38.8 25.8 3.4X +after 1582, vec off 8797 8843 71 11.4 88.0 1.0X +after 1582, vec on 2457 2469 14 40.7 24.6 3.6X +before 1582, vec off 8555 8572 16 11.7 85.5 1.0X +before 1582, vec on 2613 2621 11 38.3 26.1 3.4X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Save TIMESTAMP to ORC: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -after 1900, noop 2221 2221 0 45.0 22.2 1.0X -before 1900, noop 2218 2218 0 45.1 22.2 1.0X -after 1900 9916 9916 0 10.1 99.2 0.2X -before 1900 12130 12130 0 8.2 121.3 0.2X +after 1900, noop 2182 2182 0 45.8 21.8 1.0X +before 1900, noop 2169 2169 0 46.1 21.7 1.0X +after 1900 10099 10099 0 9.9 101.0 0.2X +before 1900 12162 12162 0 8.2 121.6 0.2X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Load TIMESTAMP from ORC: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) 
Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -after 1900, vec off 10569 11038 802 9.5 105.7 1.0X -after 1900, vec on 4361 4415 55 22.9 43.6 2.4X -before 1900, vec off 12223 12227 7 8.2 122.2 0.9X -before 1900, vec on 6103 6136 30 16.4 61.0 1.7X +after 1900, vec off 9898 9923 26 10.1 99.0 1.0X +after 1900, vec on 4013 4048 55 24.9 40.1 2.5X +before 1900, vec off 11962 11980 18 8.4 119.6 0.8X +before 1900, vec on 5608 5635 43 17.8 56.1 1.8X diff --git a/sql/core/benchmarks/EncodeBenchmark-jdk21-results.txt b/sql/core/benchmarks/EncodeBenchmark-jdk21-results.txt index d74eb426cf341..a7aebf3e61025 100644 --- a/sql/core/benchmarks/EncodeBenchmark-jdk21-results.txt +++ b/sql/core/benchmarks/EncodeBenchmark-jdk21-results.txt @@ -1,8 +1,8 @@ -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor encode: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -UTF-32 47715 47833 167 0.2 4771.5 1.0X -UTF-16 57379 57408 42 0.2 5737.9 0.8X -UTF-8 2840 2872 45 3.5 284.0 16.8X +UTF-32 64447 64482 50 0.2 6444.7 1.0X +UTF-16 60035 60070 49 0.2 6003.5 1.1X +UTF-8 33512 33524 16 0.3 3351.2 1.9X diff --git a/sql/core/benchmarks/EncodeBenchmark-results.txt b/sql/core/benchmarks/EncodeBenchmark-results.txt index 5fdbbf72d7e77..bd888d90c17de 100644 --- a/sql/core/benchmarks/EncodeBenchmark-results.txt +++ b/sql/core/benchmarks/EncodeBenchmark-results.txt @@ -1,8 +1,8 @@ -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor encode: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
------------------------------------------------------------------------------------------------------------------------ -UTF-32 29962 30019 81 0.3 2996.2 1.0X -UTF-16 47699 47702 3 0.2 4769.9 0.6X -UTF-8 3112 3154 59 3.2 311.2 9.6X +UTF-32 33442 33457 21 0.3 3344.2 1.0X +UTF-16 50707 50731 35 0.2 5070.7 0.7X +UTF-8 30829 30847 25 0.3 3082.9 1.1X diff --git a/sql/core/benchmarks/ExternalAppendOnlyUnsafeRowArrayBenchmark-jdk21-results.txt b/sql/core/benchmarks/ExternalAppendOnlyUnsafeRowArrayBenchmark-jdk21-results.txt index 08f3d54f5ae81..3bc77b17102fe 100644 --- a/sql/core/benchmarks/ExternalAppendOnlyUnsafeRowArrayBenchmark-jdk21-results.txt +++ b/sql/core/benchmarks/ExternalAppendOnlyUnsafeRowArrayBenchmark-jdk21-results.txt @@ -2,44 +2,44 @@ WITHOUT SPILL ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Array with 100000 rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -ArrayBuffer 2456 2456 0 41.7 24.0 1.0X -ExternalAppendOnlyUnsafeRowArray 3572 3595 33 28.7 34.9 0.7X +ArrayBuffer 2569 2579 14 39.9 25.1 1.0X +ExternalAppendOnlyUnsafeRowArray 3494 3513 27 29.3 34.1 0.7X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Array with 1000 rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -ArrayBuffer 5511 5519 11 47.6 21.0 1.0X -ExternalAppendOnlyUnsafeRowArray 12331 12382 73 21.3 47.0 0.4X +ArrayBuffer 5447 5500 75 48.1 20.8 1.0X 
+ExternalAppendOnlyUnsafeRowArray 11886 11907 29 22.1 45.3 0.5X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Array with 30000 rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -ArrayBuffer 10731 10759 39 45.8 21.8 1.0X -ExternalAppendOnlyUnsafeRowArray 18516 18568 72 26.5 37.7 0.6X +ArrayBuffer 10664 10664 1 46.1 21.7 1.0X +ExternalAppendOnlyUnsafeRowArray 17290 17397 151 28.4 35.2 0.6X ================================================================================================ WITH SPILL ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Spilling with 1000 rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -UnsafeExternalSorter 8284 8328 63 31.6 31.6 1.0X -ExternalAppendOnlyUnsafeRowArray 6615 6624 14 39.6 25.2 1.3X +UnsafeExternalSorter 8436 8440 6 31.1 32.2 1.0X +ExternalAppendOnlyUnsafeRowArray 6686 6713 39 39.2 25.5 1.3X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Spilling with 10000 rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -UnsafeExternalSorter 5 5 0 32.8 30.5 1.0X -ExternalAppendOnlyUnsafeRowArray 4 4 0 38.5 26.0 1.2X +UnsafeExternalSorter 5 5 0 33.4 29.9 1.0X 
+ExternalAppendOnlyUnsafeRowArray 4 4 0 39.5 25.3 1.2X diff --git a/sql/core/benchmarks/ExternalAppendOnlyUnsafeRowArrayBenchmark-results.txt b/sql/core/benchmarks/ExternalAppendOnlyUnsafeRowArrayBenchmark-results.txt index ca447f9e97dbc..cd6241caf25b0 100644 --- a/sql/core/benchmarks/ExternalAppendOnlyUnsafeRowArrayBenchmark-results.txt +++ b/sql/core/benchmarks/ExternalAppendOnlyUnsafeRowArrayBenchmark-results.txt @@ -2,44 +2,44 @@ WITHOUT SPILL ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Array with 100000 rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -ArrayBuffer 2496 2499 4 41.0 24.4 1.0X -ExternalAppendOnlyUnsafeRowArray 3495 3513 24 29.3 34.1 0.7X +ArrayBuffer 2453 2458 8 41.7 24.0 1.0X +ExternalAppendOnlyUnsafeRowArray 3401 3413 18 30.1 33.2 0.7X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Array with 1000 rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -ArrayBuffer 5277 5284 10 49.7 20.1 1.0X -ExternalAppendOnlyUnsafeRowArray 12169 12171 3 21.5 46.4 0.4X +ArrayBuffer 5330 5332 3 49.2 20.3 1.0X +ExternalAppendOnlyUnsafeRowArray 12411 12462 72 21.1 47.3 0.4X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Array with 30000 rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
------------------------------------------------------------------------------------------------------------------------ -ArrayBuffer 10107 10110 4 48.6 20.6 1.0X -ExternalAppendOnlyUnsafeRowArray 17021 17035 20 28.9 34.6 0.6X +ArrayBuffer 10236 10250 20 48.0 20.8 1.0X +ExternalAppendOnlyUnsafeRowArray 16811 16821 15 29.2 34.2 0.6X ================================================================================================ WITH SPILL ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Spilling with 1000 rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -UnsafeExternalSorter 8435 8499 89 31.1 32.2 1.0X -ExternalAppendOnlyUnsafeRowArray 7126 7131 6 36.8 27.2 1.2X +UnsafeExternalSorter 8715 8747 45 30.1 33.2 1.0X +ExternalAppendOnlyUnsafeRowArray 6495 6507 16 40.4 24.8 1.3X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Spilling with 10000 rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -UnsafeExternalSorter 5 5 0 34.5 29.0 1.0X -ExternalAppendOnlyUnsafeRowArray 4 4 0 36.6 27.3 1.1X +UnsafeExternalSorter 5 5 0 33.5 29.9 1.0X +ExternalAppendOnlyUnsafeRowArray 4 4 0 40.5 24.7 1.2X diff --git a/sql/core/benchmarks/ExtractBenchmark-jdk21-results.txt b/sql/core/benchmarks/ExtractBenchmark-jdk21-results.txt index 78df1f6557073..9420529bb5166 100644 --- a/sql/core/benchmarks/ExtractBenchmark-jdk21-results.txt +++ 
b/sql/core/benchmarks/ExtractBenchmark-jdk21-results.txt @@ -1,104 +1,104 @@ -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Invoke extract for timestamp: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -cast to timestamp 278 295 28 35.9 27.8 1.0X -YEAR of timestamp 604 616 15 16.6 60.4 0.5X -YEAROFWEEK of timestamp 648 659 10 15.4 64.8 0.4X -QUARTER of timestamp 653 672 30 15.3 65.3 0.4X -MONTH of timestamp 572 581 11 17.5 57.2 0.5X -WEEK of timestamp 865 868 3 11.6 86.5 0.3X -DAY of timestamp 576 583 9 17.4 57.6 0.5X -DAYOFWEEK of timestamp 755 759 7 13.3 75.5 0.4X -DOW of timestamp 751 775 39 13.3 75.1 0.4X -DOW_ISO of timestamp 709 716 6 14.1 70.9 0.4X -DAYOFWEEK_ISO of timestamp 708 709 1 14.1 70.8 0.4X -DOY of timestamp 603 614 18 16.6 60.3 0.5X -HOUR of timestamp 475 479 3 21.1 47.5 0.6X -MINUTE of timestamp 479 479 1 20.9 47.9 0.6X -SECOND of timestamp 533 536 3 18.7 53.3 0.5X +cast to timestamp 260 281 28 38.5 26.0 1.0X +YEAR of timestamp 660 684 27 15.1 66.0 0.4X +YEAROFWEEK of timestamp 621 623 2 16.1 62.1 0.4X +QUARTER of timestamp 635 637 2 15.8 63.5 0.4X +MONTH of timestamp 553 555 2 18.1 55.3 0.5X +WEEK of timestamp 847 882 41 11.8 84.7 0.3X +DAY of timestamp 561 562 1 17.8 56.1 0.5X +DAYOFWEEK of timestamp 739 743 3 13.5 73.9 0.4X +DOW of timestamp 744 744 1 13.4 74.4 0.3X +DOW_ISO of timestamp 670 676 9 14.9 67.0 0.4X +DAYOFWEEK_ISO of timestamp 668 670 2 15.0 66.8 0.4X +DOY of timestamp 596 597 1 16.8 59.6 0.4X +HOUR of timestamp 465 468 3 21.5 46.5 0.6X +MINUTE of timestamp 464 467 2 21.5 46.4 0.6X +SECOND of timestamp 531 537 6 18.8 53.1 0.5X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core 
Processor Invoke date_part for timestamp: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -cast to timestamp 251 251 1 39.9 25.1 1.0X -YEAR of timestamp 537 539 2 18.6 53.7 0.5X -YEAROFWEEK of timestamp 624 633 8 16.0 62.4 0.4X -QUARTER of timestamp 634 635 1 15.8 63.4 0.4X -MONTH of timestamp 556 564 10 18.0 55.6 0.5X -WEEK of timestamp 854 859 4 11.7 85.4 0.3X -DAY of timestamp 572 579 11 17.5 57.2 0.4X -DAYOFWEEK of timestamp 741 747 5 13.5 74.1 0.3X -DOW of timestamp 741 743 2 13.5 74.1 0.3X -DOW_ISO of timestamp 703 704 1 14.2 70.3 0.4X -DAYOFWEEK_ISO of timestamp 701 701 1 14.3 70.1 0.4X -DOY of timestamp 592 595 3 16.9 59.2 0.4X -HOUR of timestamp 474 476 2 21.1 47.4 0.5X -MINUTE of timestamp 476 479 5 21.0 47.6 0.5X -SECOND of timestamp 528 530 2 18.9 52.8 0.5X +cast to timestamp 234 244 9 42.8 23.4 1.0X +YEAR of timestamp 532 538 6 18.8 53.2 0.4X +YEAROFWEEK of timestamp 602 606 3 16.6 60.2 0.4X +QUARTER of timestamp 618 625 7 16.2 61.8 0.4X +MONTH of timestamp 540 549 10 18.5 54.0 0.4X +WEEK of timestamp 835 837 2 12.0 83.5 0.3X +DAY of timestamp 553 558 6 18.1 55.3 0.4X +DAYOFWEEK of timestamp 732 735 3 13.7 73.2 0.3X +DOW of timestamp 733 736 3 13.6 73.3 0.3X +DOW_ISO of timestamp 664 670 8 15.1 66.4 0.4X +DAYOFWEEK_ISO of timestamp 664 668 6 15.1 66.4 0.4X +DOY of timestamp 591 593 1 16.9 59.1 0.4X +HOUR of timestamp 461 468 7 21.7 46.1 0.5X +MINUTE of timestamp 462 464 2 21.6 46.2 0.5X +SECOND of timestamp 530 530 1 18.9 53.0 0.4X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Invoke extract for date: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -cast to 
date 523 526 4 19.1 52.3 1.0X -YEAR of date 533 535 3 18.8 53.3 1.0X -YEAROFWEEK of date 628 639 17 15.9 62.8 0.8X -QUARTER of date 629 631 3 15.9 62.9 0.8X -MONTH of date 566 577 18 17.7 56.6 0.9X -WEEK of date 859 872 21 11.6 85.9 0.6X -DAY of date 572 590 18 17.5 57.2 0.9X -DAYOFWEEK of date 741 746 9 13.5 74.1 0.7X -DOW of date 740 766 45 13.5 74.0 0.7X -DOW_ISO of date 700 707 10 14.3 70.0 0.7X -DAYOFWEEK_ISO of date 698 703 7 14.3 69.8 0.7X -DOY of date 592 596 5 16.9 59.2 0.9X -HOUR of date 993 1014 24 10.1 99.3 0.5X -MINUTE of date 995 1003 10 10.0 99.5 0.5X -SECOND of date 1058 1058 0 9.5 105.8 0.5X +cast to date 511 514 2 19.6 51.1 1.0X +YEAR of date 526 529 3 19.0 52.6 1.0X +YEAROFWEEK of date 601 607 8 16.6 60.1 0.9X +QUARTER of date 617 627 9 16.2 61.7 0.8X +MONTH of date 537 538 1 18.6 53.7 1.0X +WEEK of date 836 847 14 12.0 83.6 0.6X +DAY of date 551 557 9 18.2 55.1 0.9X +DAYOFWEEK of date 734 742 7 13.6 73.4 0.7X +DOW of date 731 734 4 13.7 73.1 0.7X +DOW_ISO of date 664 667 5 15.1 66.4 0.8X +DAYOFWEEK_ISO of date 661 666 4 15.1 66.1 0.8X +DOY of date 588 593 8 17.0 58.8 0.9X +HOUR of date 985 986 1 10.2 98.5 0.5X +MINUTE of date 980 991 14 10.2 98.0 0.5X +SECOND of date 1035 1043 13 9.7 103.5 0.5X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Invoke date_part for date: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -cast to date 525 574 83 19.1 52.5 1.0X -YEAR of date 539 540 1 18.6 53.9 1.0X -YEAROFWEEK of date 628 631 5 15.9 62.8 0.8X -QUARTER of date 629 640 15 15.9 62.9 0.8X -MONTH of date 553 555 2 18.1 55.3 0.9X -WEEK of date 850 852 1 11.8 85.0 0.6X -DAY of date 568 574 10 17.6 56.8 0.9X -DAYOFWEEK of date 740 741 1 13.5 74.0 0.7X -DOW of date 739 746 6 13.5 73.9 0.7X -DOW_ISO of date 
699 703 4 14.3 69.9 0.8X -DAYOFWEEK_ISO of date 699 700 1 14.3 69.9 0.8X -DOY of date 590 592 3 17.0 59.0 0.9X -HOUR of date 991 992 0 10.1 99.1 0.5X -MINUTE of date 989 990 1 10.1 98.9 0.5X -SECOND of date 1058 1062 5 9.4 105.8 0.5X +cast to date 512 515 3 19.5 51.2 1.0X +YEAR of date 526 534 8 19.0 52.6 1.0X +YEAROFWEEK of date 600 602 2 16.7 60.0 0.9X +QUARTER of date 616 623 11 16.2 61.6 0.8X +MONTH of date 538 543 9 18.6 53.8 1.0X +WEEK of date 837 838 1 12.0 83.7 0.6X +DAY of date 550 553 3 18.2 55.0 0.9X +DAYOFWEEK of date 734 739 5 13.6 73.4 0.7X +DOW of date 733 759 43 13.7 73.3 0.7X +DOW_ISO of date 664 668 3 15.1 66.4 0.8X +DAYOFWEEK_ISO of date 665 666 0 15.0 66.5 0.8X +DOY of date 593 594 1 16.9 59.3 0.9X +HOUR of date 983 986 3 10.2 98.3 0.5X +MINUTE of date 979 981 3 10.2 97.9 0.5X +SECOND of date 1038 1039 1 9.6 103.8 0.5X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Invoke extract for interval: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -cast to interval 738 741 2 13.5 73.8 1.0X -YEAR of interval 718 721 4 13.9 71.8 1.0X -MONTH of interval 721 725 3 13.9 72.1 1.0X -DAY of interval 718 722 4 13.9 71.8 1.0X -HOUR of interval 730 733 4 13.7 73.0 1.0X -MINUTE of interval 724 728 3 13.8 72.4 1.0X -SECOND of interval 775 785 13 12.9 77.5 1.0X +cast to interval 723 728 5 13.8 72.3 1.0X +YEAR of interval 717 718 2 13.9 71.7 1.0X +MONTH of interval 720 722 2 13.9 72.0 1.0X +DAY of interval 716 719 2 14.0 71.6 1.0X +HOUR of interval 729 731 2 13.7 72.9 1.0X +MINUTE of interval 725 726 1 13.8 72.5 1.0X +SECOND of interval 769 771 2 13.0 76.9 0.9X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core 
Processor Invoke date_part for interval: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -cast to interval 742 745 4 13.5 74.2 1.0X -YEAR of interval 724 725 0 13.8 72.4 1.0X -MONTH of interval 722 724 2 13.9 72.2 1.0X -DAY of interval 728 730 1 13.7 72.8 1.0X -HOUR of interval 731 739 8 13.7 73.1 1.0X -MINUTE of interval 733 740 11 13.6 73.3 1.0X -SECOND of interval 785 800 16 12.7 78.5 0.9X +cast to interval 728 729 1 13.7 72.8 1.0X +YEAR of interval 722 722 1 13.9 72.2 1.0X +MONTH of interval 718 723 5 13.9 71.8 1.0X +DAY of interval 713 718 7 14.0 71.3 1.0X +HOUR of interval 726 727 2 13.8 72.6 1.0X +MINUTE of interval 734 736 3 13.6 73.4 1.0X +SECOND of interval 770 771 2 13.0 77.0 0.9X diff --git a/sql/core/benchmarks/ExtractBenchmark-results.txt b/sql/core/benchmarks/ExtractBenchmark-results.txt index a60f24142bc60..b472b3fea998b 100644 --- a/sql/core/benchmarks/ExtractBenchmark-results.txt +++ b/sql/core/benchmarks/ExtractBenchmark-results.txt @@ -1,104 +1,104 @@ -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Invoke extract for timestamp: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -cast to timestamp 286 311 23 34.9 28.6 1.0X -YEAR of timestamp 792 798 8 12.6 79.2 0.4X -YEAROFWEEK of timestamp 879 901 19 11.4 87.9 0.3X -QUARTER of timestamp 842 849 9 11.9 84.2 0.3X -MONTH of timestamp 799 804 5 12.5 79.9 0.4X -WEEK of timestamp 1104 1107 3 9.1 110.4 0.3X -DAY of timestamp 780 788 9 12.8 78.0 0.4X -DAYOFWEEK of timestamp 967 973 5 10.3 96.7 0.3X -DOW of timestamp 965 970 5 10.4 96.5 0.3X -DOW_ISO of timestamp 1022 1024 4 9.8 102.2 0.3X -DAYOFWEEK_ISO of 
timestamp 1022 1024 4 9.8 102.2 0.3X -DOY of timestamp 844 855 13 11.8 84.4 0.3X -HOUR of timestamp 558 563 5 17.9 55.8 0.5X -MINUTE of timestamp 564 564 0 17.7 56.4 0.5X -SECOND of timestamp 657 658 1 15.2 65.7 0.4X +cast to timestamp 243 273 33 41.2 24.3 1.0X +YEAR of timestamp 780 785 5 12.8 78.0 0.3X +YEAROFWEEK of timestamp 849 883 36 11.8 84.9 0.3X +QUARTER of timestamp 798 799 0 12.5 79.8 0.3X +MONTH of timestamp 758 762 4 13.2 75.8 0.3X +WEEK of timestamp 1113 1118 6 9.0 111.3 0.2X +DAY of timestamp 752 757 5 13.3 75.2 0.3X +DAYOFWEEK of timestamp 940 945 4 10.6 94.0 0.3X +DOW of timestamp 940 949 14 10.6 94.0 0.3X +DOW_ISO of timestamp 997 1004 11 10.0 99.7 0.2X +DAYOFWEEK_ISO of timestamp 991 995 4 10.1 99.1 0.2X +DOY of timestamp 811 816 6 12.3 81.1 0.3X +HOUR of timestamp 536 543 7 18.7 53.6 0.5X +MINUTE of timestamp 532 541 7 18.8 53.2 0.5X +SECOND of timestamp 636 648 21 15.7 63.6 0.4X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Invoke date_part for timestamp: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -cast to timestamp 245 248 4 40.8 24.5 1.0X -YEAR of timestamp 785 788 3 12.7 78.5 0.3X -YEAROFWEEK of timestamp 859 859 0 11.6 85.9 0.3X -QUARTER of timestamp 818 819 1 12.2 81.8 0.3X -MONTH of timestamp 776 781 4 12.9 77.6 0.3X -WEEK of timestamp 1099 1109 17 9.1 109.9 0.2X -DAY of timestamp 778 780 2 12.9 77.8 0.3X -DAYOFWEEK of timestamp 964 966 2 10.4 96.4 0.3X -DOW of timestamp 964 966 3 10.4 96.4 0.3X -DOW_ISO of timestamp 1015 1020 5 9.9 101.5 0.2X -DAYOFWEEK_ISO of timestamp 1012 1014 3 9.9 101.2 0.2X -DOY of timestamp 847 850 6 11.8 84.7 0.3X -HOUR of timestamp 560 562 4 17.9 56.0 0.4X -MINUTE of timestamp 560 569 11 17.8 56.0 0.4X -SECOND of timestamp 656 660 6 15.2 65.6 0.4X 
+cast to timestamp 216 223 10 46.3 21.6 1.0X +YEAR of timestamp 767 770 5 13.0 76.7 0.3X +YEAROFWEEK of timestamp 830 840 14 12.0 83.0 0.3X +QUARTER of timestamp 786 791 4 12.7 78.6 0.3X +MONTH of timestamp 758 761 3 13.2 75.8 0.3X +WEEK of timestamp 1110 1119 8 9.0 111.0 0.2X +DAY of timestamp 759 760 1 13.2 75.9 0.3X +DAYOFWEEK of timestamp 939 942 5 10.7 93.9 0.2X +DOW of timestamp 937 938 1 10.7 93.7 0.2X +DOW_ISO of timestamp 986 987 1 10.1 98.6 0.2X +DAYOFWEEK_ISO of timestamp 985 990 4 10.1 98.5 0.2X +DOY of timestamp 819 824 4 12.2 81.9 0.3X +HOUR of timestamp 531 541 12 18.8 53.1 0.4X +MINUTE of timestamp 528 532 6 19.0 52.8 0.4X +SECOND of timestamp 635 638 5 15.7 63.5 0.3X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Invoke extract for date: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -cast to date 727 729 4 13.8 72.7 1.0X -YEAR of date 777 787 10 12.9 77.7 0.9X -YEAROFWEEK of date 852 858 8 11.7 85.2 0.9X -QUARTER of date 813 815 3 12.3 81.3 0.9X -MONTH of date 772 775 4 12.9 77.2 0.9X -WEEK of date 1091 1093 2 9.2 109.1 0.7X -DAY of date 777 778 1 12.9 77.7 0.9X -DAYOFWEEK of date 963 965 3 10.4 96.3 0.8X -DOW of date 960 963 3 10.4 96.0 0.8X -DOW_ISO of date 1017 1018 1 9.8 101.7 0.7X -DAYOFWEEK_ISO of date 1010 1013 2 9.9 101.0 0.7X -DOY of date 840 841 1 11.9 84.0 0.9X -HOUR of date 1288 1295 8 7.8 128.8 0.6X -MINUTE of date 1299 1313 20 7.7 129.9 0.6X -SECOND of date 1383 1393 10 7.2 138.3 0.5X +cast to date 701 710 12 14.3 70.1 1.0X +YEAR of date 766 770 4 13.1 76.6 0.9X +YEAROFWEEK of date 824 828 6 12.1 82.4 0.9X +QUARTER of date 787 790 3 12.7 78.7 0.9X +MONTH of date 756 756 1 13.2 75.6 0.9X +WEEK of date 1112 1113 1 9.0 111.2 0.6X +DAY of date 756 758 3 13.2 75.6 0.9X +DAYOFWEEK of 
date 940 941 1 10.6 94.0 0.7X +DOW of date 942 944 2 10.6 94.2 0.7X +DOW_ISO of date 986 1001 21 10.1 98.6 0.7X +DAYOFWEEK_ISO of date 984 991 7 10.2 98.4 0.7X +DOY of date 819 827 7 12.2 81.9 0.9X +HOUR of date 1278 1290 10 7.8 127.8 0.5X +MINUTE of date 1290 1293 2 7.8 129.0 0.5X +SECOND of date 1374 1376 3 7.3 137.4 0.5X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Invoke date_part for date: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -cast to date 731 735 3 13.7 73.1 1.0X -YEAR of date 779 784 7 12.8 77.9 0.9X -YEAROFWEEK of date 851 856 6 11.8 85.1 0.9X -QUARTER of date 816 817 1 12.3 81.6 0.9X -MONTH of date 771 774 4 13.0 77.1 0.9X -WEEK of date 1095 1097 4 9.1 109.5 0.7X -DAY of date 774 777 3 12.9 77.4 0.9X -DAYOFWEEK of date 960 961 2 10.4 96.0 0.8X -DOW of date 959 962 4 10.4 95.9 0.8X -DOW_ISO of date 1009 1011 2 9.9 100.9 0.7X -DAYOFWEEK_ISO of date 1009 1011 2 9.9 100.9 0.7X -DOY of date 843 844 1 11.9 84.3 0.9X -HOUR of date 1289 1290 1 7.8 128.9 0.6X -MINUTE of date 1285 1289 5 7.8 128.5 0.6X -SECOND of date 1390 1395 5 7.2 139.0 0.5X +cast to date 711 722 10 14.1 71.1 1.0X +YEAR of date 758 760 3 13.2 75.8 0.9X +YEAROFWEEK of date 826 830 6 12.1 82.6 0.9X +QUARTER of date 783 785 3 12.8 78.3 0.9X +MONTH of date 755 756 1 13.2 75.5 0.9X +WEEK of date 1102 1115 11 9.1 110.2 0.6X +DAY of date 749 753 3 13.3 74.9 0.9X +DAYOFWEEK of date 940 941 1 10.6 94.0 0.8X +DOW of date 934 936 3 10.7 93.4 0.8X +DOW_ISO of date 988 988 0 10.1 98.8 0.7X +DAYOFWEEK_ISO of date 988 998 18 10.1 98.8 0.7X +DOY of date 812 817 7 12.3 81.2 0.9X +HOUR of date 1274 1281 6 7.8 127.4 0.6X +MINUTE of date 1282 1287 6 7.8 128.2 0.6X +SECOND of date 1382 1384 2 7.2 138.2 0.5X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on 
Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Invoke extract for interval: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -cast to interval 1127 1130 2 8.9 112.7 1.0X -YEAR of interval 1119 1123 5 8.9 111.9 1.0X -MONTH of interval 1117 1118 2 9.0 111.7 1.0X -DAY of interval 1124 1126 2 8.9 112.4 1.0X -HOUR of interval 1119 1120 2 8.9 111.9 1.0X -MINUTE of interval 1119 1122 3 8.9 111.9 1.0X -SECOND of interval 1216 1224 10 8.2 121.6 0.9X +cast to interval 1093 1095 2 9.2 109.3 1.0X +YEAR of interval 1085 1086 2 9.2 108.5 1.0X +MONTH of interval 1075 1075 0 9.3 107.5 1.0X +DAY of interval 1071 1076 5 9.3 107.1 1.0X +HOUR of interval 1075 1082 7 9.3 107.5 1.0X +MINUTE of interval 1113 1122 12 9.0 111.3 1.0X +SECOND of interval 1179 1181 3 8.5 117.9 0.9X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Invoke date_part for interval: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -cast to interval 1130 1131 2 8.8 113.0 1.0X -YEAR of interval 1113 1116 2 9.0 111.3 1.0X -MONTH of interval 1122 1122 1 8.9 112.2 1.0X -DAY of interval 1122 1124 4 8.9 112.2 1.0X -HOUR of interval 1119 1121 2 8.9 111.9 1.0X -MINUTE of interval 1118 1125 9 8.9 111.8 1.0X -SECOND of interval 1208 1211 3 8.3 120.8 0.9X +cast to interval 1080 1082 3 9.3 108.0 1.0X +YEAR of interval 1077 1080 3 9.3 107.7 1.0X +MONTH of interval 1080 1081 1 9.3 108.0 1.0X +DAY of interval 1069 1070 2 9.4 106.9 1.0X +HOUR of interval 1073 1074 2 9.3 107.3 1.0X +MINUTE of interval 1122 1125 5 8.9 112.2 1.0X +SECOND of interval 1180 1184 4 8.5 118.0 
0.9X diff --git a/sql/core/benchmarks/FilterPushdownBenchmark-jdk21-results.txt b/sql/core/benchmarks/FilterPushdownBenchmark-jdk21-results.txt index 17ffe9f3fab41..417979cfb62a7 100644 --- a/sql/core/benchmarks/FilterPushdownBenchmark-jdk21-results.txt +++ b/sql/core/benchmarks/FilterPushdownBenchmark-jdk21-results.txt @@ -2,733 +2,733 @@ Pushdown for many distinct value case ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Select 0 string row (value IS NULL): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Parquet Vectorized 6345 6437 61 2.5 403.4 1.0X -Parquet Vectorized (Pushdown) 341 363 12 46.2 21.7 18.6X -Native ORC Vectorized 5118 5274 131 3.1 325.4 1.2X -Native ORC Vectorized (Pushdown) 318 323 5 49.5 20.2 20.0X +Parquet Vectorized 6457 6500 40 2.4 410.5 1.0X +Parquet Vectorized (Pushdown) 362 383 16 43.4 23.0 17.8X +Native ORC Vectorized 5171 5288 107 3.0 328.8 1.2X +Native ORC Vectorized (Pushdown) 314 323 9 50.1 20.0 20.6X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Select 0 string row ('7864320' < value < '7864320'): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ----------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 6333 6355 22 2.5 402.7 1.0X -Parquet Vectorized (Pushdown) 331 347 9 47.5 21.1 19.1X -Native ORC Vectorized 5259 5281 25 3.0 334.4 1.2X -Native ORC Vectorized (Pushdown) 310 330 19 50.7 19.7 20.4X +Parquet Vectorized 6405 6424 20 2.5 407.2 1.0X +Parquet 
Vectorized (Pushdown) 314 326 9 50.0 20.0 20.4X +Native ORC Vectorized 5221 5259 39 3.0 331.9 1.2X +Native ORC Vectorized (Pushdown) 299 317 13 52.6 19.0 21.4X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Select 1 string row (value = '7864320'): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Parquet Vectorized 6378 6405 21 2.5 405.5 1.0X -Parquet Vectorized (Pushdown) 315 324 10 50.0 20.0 20.3X -Native ORC Vectorized 5359 5364 5 2.9 340.7 1.2X -Native ORC Vectorized (Pushdown) 301 308 5 52.2 19.2 21.2X +Parquet Vectorized 6432 6453 22 2.4 408.9 1.0X +Parquet Vectorized (Pushdown) 298 310 9 52.8 18.9 21.6X +Native ORC Vectorized 5377 5388 8 2.9 341.9 1.2X +Native ORC Vectorized (Pushdown) 303 312 7 51.8 19.3 21.2X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Select 1 string row (value <=> '7864320'): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 6371 6394 16 2.5 405.1 1.0X -Parquet Vectorized (Pushdown) 310 315 7 50.7 19.7 20.5X -Native ORC Vectorized 5354 5384 25 2.9 340.4 1.2X -Native ORC Vectorized (Pushdown) 291 299 6 54.1 18.5 21.9X +Parquet Vectorized 6433 6478 25 2.4 409.0 1.0X +Parquet Vectorized (Pushdown) 295 302 6 53.4 18.7 21.8X +Native ORC Vectorized 5363 5368 5 2.9 341.0 1.2X +Native ORC Vectorized (Pushdown) 286 294 7 55.0 18.2 22.5X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Select 1 string row 
('7864320' <= value <= '7864320'): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 6427 6456 19 2.4 408.6 1.0X -Parquet Vectorized (Pushdown) 310 312 1 50.7 19.7 20.7X -Native ORC Vectorized 5240 5253 10 3.0 333.2 1.2X -Native ORC Vectorized (Pushdown) 288 301 11 54.7 18.3 22.3X +Parquet Vectorized 6457 6470 11 2.4 410.5 1.0X +Parquet Vectorized (Pushdown) 293 300 6 53.6 18.7 22.0X +Native ORC Vectorized 5356 5366 8 2.9 340.5 1.2X +Native ORC Vectorized (Pushdown) 288 295 5 54.6 18.3 22.4X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Select all string rows (value IS NOT NULL): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 13717 13822 93 1.1 872.1 1.0X -Parquet Vectorized (Pushdown) 13817 13833 19 1.1 878.4 1.0X -Native ORC Vectorized 12689 12724 34 1.2 806.7 1.1X -Native ORC Vectorized (Pushdown) 12802 12812 9 1.2 813.9 1.1X +Parquet Vectorized 14274 14374 112 1.1 907.5 1.0X +Parquet Vectorized (Pushdown) 14553 14581 27 1.1 925.2 1.0X +Native ORC Vectorized 13537 13553 20 1.2 860.7 1.1X +Native ORC Vectorized (Pushdown) 13620 13650 40 1.2 865.9 1.0X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Select 0 int row (value IS NULL): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Parquet Vectorized 6152 6231 51 2.6 391.2 1.0X -Parquet Vectorized (Pushdown) 306 315 9 
51.4 19.5 20.1X -Native ORC Vectorized 4694 4761 104 3.4 298.4 1.3X -Native ORC Vectorized (Pushdown) 274 282 9 57.4 17.4 22.5X +Parquet Vectorized 6163 6242 78 2.6 391.9 1.0X +Parquet Vectorized (Pushdown) 277 290 12 56.8 17.6 22.2X +Native ORC Vectorized 4740 4795 56 3.3 301.3 1.3X +Native ORC Vectorized (Pushdown) 281 290 6 56.0 17.8 22.0X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Select 0 int row (7864320 < value < 7864320): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ---------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 5996 6015 14 2.6 381.2 1.0X -Parquet Vectorized (Pushdown) 302 311 7 52.2 19.2 19.9X -Native ORC Vectorized 4684 4691 5 3.4 297.8 1.3X -Native ORC Vectorized (Pushdown) 281 290 9 56.0 17.9 21.3X +Parquet Vectorized 6072 6080 11 2.6 386.1 1.0X +Parquet Vectorized (Pushdown) 283 301 24 55.5 18.0 21.4X +Native ORC Vectorized 4715 4731 22 3.3 299.8 1.3X +Native ORC Vectorized (Pushdown) 281 290 10 56.0 17.9 21.6X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Select 1 int row (value = 7864320): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Parquet Vectorized 6036 6049 10 2.6 383.8 1.0X -Parquet Vectorized (Pushdown) 296 302 4 53.1 18.8 20.4X -Native ORC Vectorized 4725 4753 22 3.3 300.4 1.3X -Native ORC Vectorized (Pushdown) 276 286 6 56.9 17.6 21.8X +Parquet Vectorized 6104 6135 28 2.6 388.1 1.0X +Parquet Vectorized (Pushdown) 279 288 6 56.3 17.8 21.8X +Native ORC Vectorized 4780 4816 31 3.3 303.9 1.3X +Native ORC Vectorized (Pushdown) 279 297 13 56.4 17.7 
21.9X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Select 1 int row (value <=> 7864320): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Parquet Vectorized 6041 6050 6 2.6 384.1 1.0X -Parquet Vectorized (Pushdown) 292 302 7 53.8 18.6 20.7X -Native ORC Vectorized 4711 4747 26 3.3 299.5 1.3X -Native ORC Vectorized (Pushdown) 271 286 8 58.0 17.2 22.3X +Parquet Vectorized 6122 6149 19 2.6 389.2 1.0X +Parquet Vectorized (Pushdown) 283 290 5 55.5 18.0 21.6X +Native ORC Vectorized 4788 4807 15 3.3 304.4 1.3X +Native ORC Vectorized (Pushdown) 274 285 6 57.4 17.4 22.4X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Select 1 int row (7864320 <= value <= 7864320): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------ -Parquet Vectorized 6046 6067 22 2.6 384.4 1.0X -Parquet Vectorized (Pushdown) 296 302 4 53.1 18.8 20.4X -Native ORC Vectorized 4767 4804 28 3.3 303.1 1.3X -Native ORC Vectorized (Pushdown) 274 286 7 57.4 17.4 22.1X +Parquet Vectorized 6128 6134 6 2.6 389.6 1.0X +Parquet Vectorized (Pushdown) 277 282 3 56.8 17.6 22.1X +Native ORC Vectorized 4819 4831 9 3.3 306.4 1.3X +Native ORC Vectorized (Pushdown) 296 303 7 53.1 18.8 20.7X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Select 1 int row (7864319 < value < 7864321): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
---------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 6053 6060 6 2.6 384.9 1.0X -Parquet Vectorized (Pushdown) 296 298 2 53.2 18.8 20.5X -Native ORC Vectorized 4792 4801 8 3.3 304.7 1.3X -Native ORC Vectorized (Pushdown) 273 286 8 57.7 17.3 22.2X +Parquet Vectorized 6143 6158 16 2.6 390.5 1.0X +Parquet Vectorized (Pushdown) 281 289 9 55.9 17.9 21.8X +Native ORC Vectorized 4810 4822 12 3.3 305.8 1.3X +Native ORC Vectorized (Pushdown) 276 280 4 57.1 17.5 22.3X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Select 10% int rows (value < 1572864): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Parquet Vectorized 6751 6789 38 2.3 429.2 1.0X -Parquet Vectorized (Pushdown) 1591 1607 15 9.9 101.1 4.2X -Native ORC Vectorized 5460 5476 19 2.9 347.1 1.2X -Native ORC Vectorized (Pushdown) 1457 1469 11 10.8 92.7 4.6X +Parquet Vectorized 6791 6806 13 2.3 431.7 1.0X +Parquet Vectorized (Pushdown) 1541 1553 8 10.2 98.0 4.4X +Native ORC Vectorized 5445 5461 15 2.9 346.2 1.2X +Native ORC Vectorized (Pushdown) 1389 1399 11 11.3 88.3 4.9X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Select 50% int rows (value < 7864320): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Parquet Vectorized 9467 9480 15 1.7 601.9 1.0X -Parquet Vectorized (Pushdown) 6594 6601 10 2.4 419.2 1.4X -Native ORC Vectorized 8160 8178 19 1.9 518.8 1.2X -Native ORC Vectorized (Pushdown) 5978 5991 14 2.6 380.1 1.6X 
+Parquet Vectorized 9208 9246 24 1.7 585.4 1.0X +Parquet Vectorized (Pushdown) 6355 6366 12 2.5 404.0 1.4X +Native ORC Vectorized 7986 8006 22 2.0 507.7 1.2X +Native ORC Vectorized (Pushdown) 5817 5836 24 2.7 369.8 1.6X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Select 90% int rows (value < 14155776): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Parquet Vectorized 11963 11975 19 1.3 760.6 1.0X -Parquet Vectorized (Pushdown) 11449 11464 16 1.4 727.9 1.0X -Native ORC Vectorized 10773 10783 10 1.5 684.9 1.1X -Native ORC Vectorized (Pushdown) 10394 10409 19 1.5 660.8 1.2X +Parquet Vectorized 11608 11632 22 1.4 738.0 1.0X +Parquet Vectorized (Pushdown) 11058 11081 15 1.4 703.1 1.0X +Native ORC Vectorized 10392 10449 58 1.5 660.7 1.1X +Native ORC Vectorized (Pushdown) 9987 10003 13 1.6 635.0 1.2X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Select all int rows (value IS NOT NULL): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Parquet Vectorized 12713 12733 15 1.2 808.3 1.0X -Parquet Vectorized (Pushdown) 12801 12815 14 1.2 813.9 1.0X -Native ORC Vectorized 11367 11387 16 1.4 722.7 1.1X -Native ORC Vectorized (Pushdown) 11474 11480 10 1.4 729.5 1.1X +Parquet Vectorized 12256 12273 18 1.3 779.2 1.0X +Parquet Vectorized (Pushdown) 12325 12363 28 1.3 783.6 1.0X +Native ORC Vectorized 10919 10943 29 1.4 694.2 1.1X +Native ORC Vectorized (Pushdown) 10980 11026 35 1.4 698.1 1.1X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit 
Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Select all int rows (value > -1): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Parquet Vectorized 12741 12750 9 1.2 810.1 1.0X -Parquet Vectorized (Pushdown) 12807 12836 31 1.2 814.2 1.0X -Native ORC Vectorized 11501 11506 6 1.4 731.2 1.1X -Native ORC Vectorized (Pushdown) 11585 11594 8 1.4 736.6 1.1X +Parquet Vectorized 12196 12240 38 1.3 775.4 1.0X +Parquet Vectorized (Pushdown) 12243 12306 54 1.3 778.4 1.0X +Native ORC Vectorized 10848 10869 22 1.4 689.7 1.1X +Native ORC Vectorized (Pushdown) 10937 10964 29 1.4 695.4 1.1X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Select all int rows (value != -1): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Parquet Vectorized 12572 12595 22 1.3 799.3 1.0X -Parquet Vectorized (Pushdown) 12635 12654 28 1.2 803.3 1.0X -Native ORC Vectorized 11466 11493 19 1.4 729.0 1.1X -Native ORC Vectorized (Pushdown) 11548 11558 10 1.4 734.2 1.1X +Parquet Vectorized 12402 12415 12 1.3 788.5 1.0X +Parquet Vectorized (Pushdown) 12413 12427 14 1.3 789.2 1.0X +Native ORC Vectorized 10821 10859 25 1.5 688.0 1.1X +Native ORC Vectorized (Pushdown) 10916 10932 13 1.4 694.0 1.1X ================================================================================================ Pushdown for few distinct value case (use dictionary encoding) ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 
7763 64-Core Processor Select 0 distinct string row (value IS NULL): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ---------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 5530 5572 29 2.8 351.6 1.0X -Parquet Vectorized (Pushdown) 243 256 15 64.7 15.5 22.7X -Native ORC Vectorized 6173 6214 31 2.5 392.5 0.9X -Native ORC Vectorized (Pushdown) 933 935 4 16.9 59.3 5.9X +Parquet Vectorized 5635 5682 35 2.8 358.3 1.0X +Parquet Vectorized (Pushdown) 246 252 6 63.9 15.7 22.9X +Native ORC Vectorized 6232 6241 6 2.5 396.2 0.9X +Native ORC Vectorized (Pushdown) 924 934 13 17.0 58.7 6.1X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Select 0 distinct string row ('100' < value < '100'): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------------ -Parquet Vectorized 5521 5537 14 2.8 351.0 1.0X -Parquet Vectorized (Pushdown) 245 257 11 64.2 15.6 22.5X -Native ORC Vectorized 6340 6348 5 2.5 403.1 0.9X -Native ORC Vectorized (Pushdown) 931 935 4 16.9 59.2 5.9X +Parquet Vectorized 5646 5669 23 2.8 359.0 1.0X +Parquet Vectorized (Pushdown) 249 259 6 63.3 15.8 22.7X +Native ORC Vectorized 6380 6408 26 2.5 405.7 0.9X +Native ORC Vectorized (Pushdown) 900 909 9 17.5 57.2 6.3X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Select 1 distinct string row (value = '100'): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ---------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 5476 5490 14 2.9 348.2 1.0X -Parquet 
Vectorized (Pushdown) 288 298 12 54.7 18.3 19.0X -Native ORC Vectorized 6322 6341 11 2.5 401.9 0.9X -Native ORC Vectorized (Pushdown) 964 971 7 16.3 61.3 5.7X +Parquet Vectorized 5569 5581 11 2.8 354.0 1.0X +Parquet Vectorized (Pushdown) 297 303 6 52.9 18.9 18.7X +Native ORC Vectorized 6378 6387 6 2.5 405.5 0.9X +Native ORC Vectorized (Pushdown) 940 959 17 16.7 59.7 5.9X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Select 1 distinct string row (value <=> '100'): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------ -Parquet Vectorized 5477 5493 10 2.9 348.2 1.0X -Parquet Vectorized (Pushdown) 286 302 20 55.0 18.2 19.2X -Native ORC Vectorized 6324 6340 17 2.5 402.1 0.9X -Native ORC Vectorized (Pushdown) 966 975 11 16.3 61.4 5.7X +Parquet Vectorized 5560 5569 6 2.8 353.5 1.0X +Parquet Vectorized (Pushdown) 305 309 4 51.6 19.4 18.2X +Native ORC Vectorized 6377 6407 31 2.5 405.4 0.9X +Native ORC Vectorized (Pushdown) 952 959 13 16.5 60.5 5.8X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Select 1 distinct string row ('100' <= value <= '100'): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 5550 5561 9 2.8 352.8 1.0X -Parquet Vectorized (Pushdown) 288 296 8 54.6 18.3 19.3X -Native ORC Vectorized 6438 6452 10 2.4 409.3 0.9X -Native ORC Vectorized (Pushdown) 972 977 5 16.2 61.8 5.7X +Parquet Vectorized 5648 5663 17 2.8 359.1 1.0X +Parquet Vectorized (Pushdown) 288 293 4 54.5 18.3 19.6X +Native ORC Vectorized 6430 6456 20 2.4 408.8 
0.9X +Native ORC Vectorized (Pushdown) 968 971 3 16.2 61.6 5.8X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Select all distinct string rows (value IS NOT NULL): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ----------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 13575 13632 39 1.2 863.1 1.0X -Parquet Vectorized (Pushdown) 13578 13607 21 1.2 863.2 1.0X -Native ORC Vectorized 14550 14590 49 1.1 925.0 0.9X -Native ORC Vectorized (Pushdown) 14664 14775 78 1.1 932.3 0.9X +Parquet Vectorized 14383 14409 24 1.1 914.5 1.0X +Parquet Vectorized (Pushdown) 14425 14443 18 1.1 917.1 1.0X +Native ORC Vectorized 15288 15300 11 1.0 972.0 0.9X +Native ORC Vectorized (Pushdown) 15482 15517 60 1.0 984.3 0.9X ================================================================================================ Pushdown benchmark for StringStartsWith ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor StringStartsWith filter: (value like '10%'): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 6543 6644 84 2.4 416.0 1.0X -Parquet Vectorized (Pushdown) 891 922 41 17.7 56.6 7.3X -Native ORC Vectorized 5543 5553 11 2.8 352.4 1.2X -Native ORC Vectorized (Pushdown) 5605 5619 8 2.8 356.4 1.2X +Parquet Vectorized 6744 6776 24 2.3 428.7 1.0X +Parquet Vectorized (Pushdown) 908 916 6 17.3 57.8 7.4X +Native ORC Vectorized 5592 5608 22 2.8 355.5 1.2X +Native ORC Vectorized (Pushdown) 5664 5687 23 2.8 360.1 
1.2X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor StringStartsWith filter: (value like '1000%'): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ----------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 6367 6383 16 2.5 404.8 1.0X -Parquet Vectorized (Pushdown) 279 286 6 56.3 17.8 22.8X -Native ORC Vectorized 5367 5377 8 2.9 341.2 1.2X -Native ORC Vectorized (Pushdown) 5436 5463 21 2.9 345.6 1.2X +Parquet Vectorized 6447 6462 16 2.4 409.9 1.0X +Parquet Vectorized (Pushdown) 300 302 2 52.4 19.1 21.5X +Native ORC Vectorized 5416 5426 11 2.9 344.3 1.2X +Native ORC Vectorized (Pushdown) 5508 5521 12 2.9 350.2 1.2X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor StringStartsWith filter: (value like '786432%'): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 6369 6386 15 2.5 404.9 1.0X -Parquet Vectorized (Pushdown) 277 284 6 56.9 17.6 23.0X -Native ORC Vectorized 5341 5370 28 2.9 339.6 1.2X -Native ORC Vectorized (Pushdown) 5435 5443 10 2.9 345.5 1.2X +Parquet Vectorized 6447 6462 12 2.4 409.9 1.0X +Parquet Vectorized (Pushdown) 281 288 6 56.0 17.8 23.0X +Native ORC Vectorized 5394 5400 7 2.9 342.9 1.2X +Native ORC Vectorized (Pushdown) 5467 5502 29 2.9 347.6 1.2X ================================================================================================ Pushdown benchmark for StringEndsWith ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit 
Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor StringEndsWith filter: (value like '%10'): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 5594 5712 85 2.8 355.6 1.0X -Parquet Vectorized (Pushdown) 384 401 14 41.0 24.4 14.6X -Native ORC Vectorized 6399 6440 49 2.5 406.8 0.9X -Native ORC Vectorized (Pushdown) 6587 6606 15 2.4 418.8 0.8X +Parquet Vectorized 5688 5817 122 2.8 361.7 1.0X +Parquet Vectorized (Pushdown) 368 379 16 42.7 23.4 15.4X +Native ORC Vectorized 6433 6447 10 2.4 409.0 0.9X +Native ORC Vectorized (Pushdown) 6684 6708 21 2.4 424.9 0.9X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor StringEndsWith filter: (value like '%1000'): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 5518 5545 19 2.9 350.8 1.0X -Parquet Vectorized (Pushdown) 294 324 50 53.5 18.7 18.8X -Native ORC Vectorized 6314 6348 27 2.5 401.5 0.9X -Native ORC Vectorized (Pushdown) 6509 6530 20 2.4 413.8 0.8X +Parquet Vectorized 5563 5576 14 2.8 353.7 1.0X +Parquet Vectorized (Pushdown) 266 272 4 59.1 16.9 20.9X +Native ORC Vectorized 6386 6425 30 2.5 406.0 0.9X +Native ORC Vectorized (Pushdown) 6639 6689 48 2.4 422.1 0.8X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor StringEndsWith filter: (value like '%786432'): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ----------------------------------------------------------------------------------------------------------------------------- 
-Parquet Vectorized 5527 5535 7 2.8 351.4 1.0X -Parquet Vectorized (Pushdown) 284 296 9 55.3 18.1 19.4X -Native ORC Vectorized 6290 6301 10 2.5 399.9 0.9X -Native ORC Vectorized (Pushdown) 6552 6565 13 2.4 416.6 0.8X +Parquet Vectorized 5574 5578 4 2.8 354.4 1.0X +Parquet Vectorized (Pushdown) 272 278 4 57.8 17.3 20.5X +Native ORC Vectorized 6333 6412 60 2.5 402.6 0.9X +Native ORC Vectorized (Pushdown) 6604 6667 88 2.4 419.9 0.8X ================================================================================================ Pushdown benchmark for StringContains ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor StringContains filter: (value like '%10%'): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 5855 6077 170 2.7 372.3 1.0X -Parquet Vectorized (Pushdown) 922 952 40 17.1 58.6 6.4X -Native ORC Vectorized 6452 6541 82 2.4 410.2 0.9X -Native ORC Vectorized (Pushdown) 6639 6651 10 2.4 422.1 0.9X +Parquet Vectorized 5827 5939 73 2.7 370.5 1.0X +Parquet Vectorized (Pushdown) 810 829 17 19.4 51.5 7.2X +Native ORC Vectorized 6466 6550 65 2.4 411.1 0.9X +Native ORC Vectorized (Pushdown) 6691 6714 21 2.4 425.4 0.9X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor StringContains filter: (value like '%1000%'): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ---------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 5542 5550 8 2.8 352.3 1.0X -Parquet Vectorized (Pushdown) 296 310 8 53.2 
18.8 18.7X -Native ORC Vectorized 6214 6226 11 2.5 395.1 0.9X -Native ORC Vectorized (Pushdown) 6419 6431 20 2.5 408.1 0.9X +Parquet Vectorized 5546 5555 10 2.8 352.6 1.0X +Parquet Vectorized (Pushdown) 268 276 4 58.6 17.1 20.7X +Native ORC Vectorized 6251 6258 7 2.5 397.4 0.9X +Native ORC Vectorized (Pushdown) 6454 6471 13 2.4 410.3 0.9X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor StringContains filter: (value like '%786432%'): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------ -Parquet Vectorized 5537 5552 17 2.8 352.0 1.0X -Parquet Vectorized (Pushdown) 297 308 10 53.0 18.9 18.7X -Native ORC Vectorized 6232 6246 15 2.5 396.2 0.9X -Native ORC Vectorized (Pushdown) 6407 6419 14 2.5 407.4 0.9X +Parquet Vectorized 5548 5561 10 2.8 352.7 1.0X +Parquet Vectorized (Pushdown) 268 275 7 58.6 17.1 20.7X +Native ORC Vectorized 6259 6269 8 2.5 397.9 0.9X +Native ORC Vectorized (Pushdown) 6466 6487 20 2.4 411.1 0.9X ================================================================================================ Pushdown benchmark for decimal ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Select 1 decimal(9, 2) row (value = 7864320): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ---------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 2442 2459 21 6.4 155.3 1.0X -Parquet Vectorized (Pushdown) 77 90 14 203.1 4.9 31.5X -Native ORC Vectorized 3128 3145 24 5.0 198.9 0.8X -Native ORC Vectorized (Pushdown) 
57 72 9 273.6 3.7 42.5X +Parquet Vectorized 2436 2443 4 6.5 154.9 1.0X +Parquet Vectorized (Pushdown) 71 74 4 222.9 4.5 34.5X +Native ORC Vectorized 3333 3346 14 4.7 211.9 0.7X +Native ORC Vectorized (Pushdown) 58 61 4 271.6 3.7 42.1X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Select 10% decimal(9, 2) rows (value < 1572864): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 3756 3808 31 4.2 238.8 1.0X -Parquet Vectorized (Pushdown) 1912 1937 42 8.2 121.6 2.0X -Native ORC Vectorized 4593 4618 26 3.4 292.0 0.8X -Native ORC Vectorized (Pushdown) 2069 2105 29 7.6 131.5 1.8X +Parquet Vectorized 3845 3881 42 4.1 244.4 1.0X +Parquet Vectorized (Pushdown) 1961 1989 29 8.0 124.7 2.0X +Native ORC Vectorized 4660 4678 17 3.4 296.3 0.8X +Native ORC Vectorized (Pushdown) 2076 2087 11 7.6 132.0 1.9X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Select 50% decimal(9, 2) rows (value < 7864320): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 8252 8270 22 1.9 524.6 1.0X -Parquet Vectorized (Pushdown) 7939 7987 28 2.0 504.7 1.0X -Native ORC Vectorized 9304 9335 42 1.7 591.5 0.9X -Native ORC Vectorized (Pushdown) 8912 8946 32 1.8 566.6 0.9X +Parquet Vectorized 8473 8500 17 1.9 538.7 1.0X +Parquet Vectorized (Pushdown) 8212 8248 23 1.9 522.1 1.0X +Native ORC Vectorized 9900 9917 15 1.6 629.4 0.9X +Native ORC Vectorized (Pushdown) 9487 9498 9 1.7 603.2 0.9X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure 
+OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Select 90% decimal(9, 2) rows (value < 14155776): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 9337 9380 67 1.7 593.6 1.0X -Parquet Vectorized (Pushdown) 9347 9376 22 1.7 594.3 1.0X -Native ORC Vectorized 10538 10565 29 1.5 670.0 0.9X -Native ORC Vectorized (Pushdown) 10533 10559 28 1.5 669.7 0.9X +Parquet Vectorized 9464 9502 33 1.7 601.7 1.0X +Parquet Vectorized (Pushdown) 9462 9502 39 1.7 601.6 1.0X +Native ORC Vectorized 10726 10775 35 1.5 681.9 0.9X +Native ORC Vectorized (Pushdown) 10755 10784 24 1.5 683.8 0.9X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Select 1 decimal(18, 2) row (value = 7864320): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ----------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 2598 2612 15 6.1 165.2 1.0X -Parquet Vectorized (Pushdown) 72 83 13 217.1 4.6 35.9X -Native ORC Vectorized 3113 3124 14 5.1 197.9 0.8X -Native ORC Vectorized (Pushdown) 55 64 12 285.4 3.5 47.1X +Parquet Vectorized 2638 2651 15 6.0 167.7 1.0X +Parquet Vectorized (Pushdown) 71 85 18 220.9 4.5 37.0X +Native ORC Vectorized 3330 3344 14 4.7 211.7 0.8X +Native ORC Vectorized (Pushdown) 55 60 5 285.6 3.5 47.9X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Select 10% decimal(18, 2) rows (value < 1572864): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
-------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 3358 3379 34 4.7 213.5 1.0X -Parquet Vectorized (Pushdown) 1080 1111 28 14.6 68.7 3.1X -Native ORC Vectorized 3874 3884 13 4.1 246.3 0.9X -Native ORC Vectorized (Pushdown) 1111 1137 34 14.2 70.6 3.0X +Parquet Vectorized 3399 3440 49 4.6 216.1 1.0X +Parquet Vectorized (Pushdown) 1064 1076 10 14.8 67.7 3.2X +Native ORC Vectorized 4064 4078 12 3.9 258.4 0.8X +Native ORC Vectorized (Pushdown) 1103 1109 6 14.3 70.2 3.1X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Select 50% decimal(18, 2) rows (value < 7864320): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 6310 6332 17 2.5 401.2 1.0X -Parquet Vectorized (Pushdown) 5049 5073 21 3.1 321.0 1.2X -Native ORC Vectorized 6975 6984 7 2.3 443.5 0.9X -Native ORC Vectorized (Pushdown) 5396 5411 14 2.9 343.1 1.2X +Parquet Vectorized 6242 6260 21 2.5 396.9 1.0X +Parquet Vectorized (Pushdown) 4988 5018 32 3.2 317.2 1.3X +Native ORC Vectorized 6949 6963 14 2.3 441.8 0.9X +Native ORC Vectorized (Pushdown) 5318 5332 10 3.0 338.1 1.2X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Select 90% decimal(18, 2) rows (value < 14155776): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 9115 9121 4 1.7 579.5 1.0X -Parquet Vectorized (Pushdown) 8907 8924 18 1.8 566.3 1.0X -Native ORC Vectorized 9981 9994 9 1.6 634.6 0.9X -Native ORC 
Vectorized (Pushdown) 9656 9675 13 1.6 613.9 0.9X +Parquet Vectorized 9079 9090 14 1.7 577.2 1.0X +Parquet Vectorized (Pushdown) 8825 8842 19 1.8 561.1 1.0X +Native ORC Vectorized 9902 9928 26 1.6 629.5 0.9X +Native ORC Vectorized (Pushdown) 9611 9616 4 1.6 611.0 0.9X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Select 1 decimal(38, 2) row (value = 7864320): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ----------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 3755 3781 23 4.2 238.7 1.0X -Parquet Vectorized (Pushdown) 78 81 2 201.4 5.0 48.1X -Native ORC Vectorized 3131 3155 36 5.0 199.0 1.2X -Native ORC Vectorized (Pushdown) 54 56 4 292.6 3.4 69.8X +Parquet Vectorized 3823 3841 12 4.1 243.1 1.0X +Parquet Vectorized (Pushdown) 80 83 4 196.7 5.1 47.8X +Native ORC Vectorized 3330 3350 18 4.7 211.7 1.1X +Native ORC Vectorized (Pushdown) 55 60 5 287.2 3.5 69.8X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Select 10% decimal(38, 2) rows (value < 1572864): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 4741 4755 18 3.3 301.4 1.0X -Parquet Vectorized (Pushdown) 1415 1417 2 11.1 90.0 3.3X -Native ORC Vectorized 4049 4065 20 3.9 257.4 1.2X -Native ORC Vectorized (Pushdown) 1220 1231 17 12.9 77.6 3.9X +Parquet Vectorized 4750 4807 89 3.3 302.0 1.0X +Parquet Vectorized (Pushdown) 1400 1407 7 11.2 89.0 3.4X +Native ORC Vectorized 4157 4168 12 3.8 264.3 1.1X +Native ORC Vectorized (Pushdown) 1211 1215 3 13.0 77.0 3.9X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 
6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Select 50% decimal(38, 2) rows (value < 7864320): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 8556 8564 9 1.8 543.9 1.0X -Parquet Vectorized (Pushdown) 6743 6755 7 2.3 428.7 1.3X -Native ORC Vectorized 7513 7524 9 2.1 477.7 1.1X -Native ORC Vectorized (Pushdown) 5906 5914 5 2.7 375.5 1.4X +Parquet Vectorized 8636 8662 19 1.8 549.1 1.0X +Parquet Vectorized (Pushdown) 6754 6787 25 2.3 429.4 1.3X +Native ORC Vectorized 7526 7536 12 2.1 478.5 1.1X +Native ORC Vectorized (Pushdown) 5915 5934 13 2.7 376.0 1.5X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Select 90% decimal(38, 2) rows (value < 14155776): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 12314 12332 16 1.3 782.9 1.0X -Parquet Vectorized (Pushdown) 11976 11983 6 1.3 761.4 1.0X -Native ORC Vectorized 10898 10916 14 1.4 692.9 1.1X -Native ORC Vectorized (Pushdown) 10605 10636 30 1.5 674.3 1.2X +Parquet Vectorized 12415 12446 27 1.3 789.3 1.0X +Parquet Vectorized (Pushdown) 12049 12076 24 1.3 766.1 1.0X +Native ORC Vectorized 10912 10980 93 1.4 693.7 1.1X +Native ORC Vectorized (Pushdown) 10559 10608 43 1.5 671.4 1.2X ================================================================================================ Pushdown benchmark for InSet -> InFilters ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 
21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor InSet -> InFilters (values count: 5, distribution: 10): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 6282 6318 39 2.5 399.4 1.0X -Parquet Vectorized (Pushdown) 329 370 54 47.8 20.9 19.1X -Native ORC Vectorized 4793 4843 35 3.3 304.7 1.3X -Native ORC Vectorized (Pushdown) 307 321 15 51.2 19.5 20.4X +Parquet Vectorized 6312 6343 25 2.5 401.3 1.0X +Parquet Vectorized (Pushdown) 312 328 11 50.4 19.8 20.2X +Native ORC Vectorized 4774 4861 102 3.3 303.5 1.3X +Native ORC Vectorized (Pushdown) 286 300 15 55.0 18.2 22.1X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor InSet -> InFilters (values count: 5, distribution: 50): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 6120 6135 12 2.6 389.1 1.0X -Parquet Vectorized (Pushdown) 294 304 8 53.5 18.7 20.8X -Native ORC Vectorized 4787 4815 38 3.3 304.4 1.3X -Native ORC Vectorized (Pushdown) 285 301 12 55.2 18.1 21.5X +Parquet Vectorized 6158 6190 20 2.6 391.5 1.0X +Parquet Vectorized (Pushdown) 292 302 11 53.9 18.5 21.1X +Native ORC Vectorized 4712 4748 28 3.3 299.6 1.3X +Native ORC Vectorized (Pushdown) 285 299 13 55.2 18.1 21.6X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor InSet -> InFilters (values count: 5, distribution: 90): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
-------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 6136 6151 20 2.6 390.1 1.0X -Parquet Vectorized (Pushdown) 297 304 9 53.0 18.9 20.7X -Native ORC Vectorized 4787 4802 26 3.3 304.3 1.3X -Native ORC Vectorized (Pushdown) 286 296 7 55.0 18.2 21.4X +Parquet Vectorized 6182 6209 20 2.5 393.0 1.0X +Parquet Vectorized (Pushdown) 288 301 9 54.5 18.3 21.4X +Native ORC Vectorized 4730 4794 51 3.3 300.7 1.3X +Native ORC Vectorized (Pushdown) 295 301 5 53.3 18.8 20.9X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor InSet -> InFilters (values count: 10, distribution: 10): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 6143 6155 13 2.6 390.6 1.0X -Parquet Vectorized (Pushdown) 304 307 3 51.7 19.3 20.2X -Native ORC Vectorized 4811 4826 16 3.3 305.9 1.3X -Native ORC Vectorized (Pushdown) 294 301 5 53.5 18.7 20.9X +Parquet Vectorized 6239 6254 20 2.5 396.7 1.0X +Parquet Vectorized (Pushdown) 306 311 4 51.4 19.5 20.4X +Native ORC Vectorized 4747 4811 37 3.3 301.8 1.3X +Native ORC Vectorized (Pushdown) 304 307 3 51.8 19.3 20.5X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor InSet -> InFilters (values count: 10, distribution: 50): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 6127 6133 6 2.6 389.5 1.0X -Parquet Vectorized (Pushdown) 307 312 6 51.3 19.5 20.0X -Native ORC Vectorized 4818 4845 35 3.3 306.3 
1.3X -Native ORC Vectorized (Pushdown) 298 310 9 52.8 18.9 20.6X +Parquet Vectorized 6212 6253 42 2.5 395.0 1.0X +Parquet Vectorized (Pushdown) 306 317 9 51.5 19.4 20.3X +Native ORC Vectorized 4814 4853 40 3.3 306.1 1.3X +Native ORC Vectorized (Pushdown) 306 314 6 51.3 19.5 20.3X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor InSet -> InFilters (values count: 10, distribution: 90): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 6122 6141 22 2.6 389.2 1.0X -Parquet Vectorized (Pushdown) 310 317 7 50.8 19.7 19.8X -Native ORC Vectorized 4813 4835 23 3.3 306.0 1.3X -Native ORC Vectorized (Pushdown) 300 307 6 52.4 19.1 20.4X +Parquet Vectorized 6192 6216 23 2.5 393.7 1.0X +Parquet Vectorized (Pushdown) 303 309 4 51.9 19.3 20.4X +Native ORC Vectorized 4752 4807 46 3.3 302.1 1.3X +Native ORC Vectorized (Pushdown) 307 320 9 51.2 19.5 20.2X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor InSet -> InFilters (values count: 50, distribution: 10): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 6326 6331 5 2.5 402.2 1.0X -Parquet Vectorized (Pushdown) 893 901 9 17.6 56.7 7.1X -Native ORC Vectorized 5039 5049 12 3.1 320.3 1.3X -Native ORC Vectorized (Pushdown) 399 402 3 39.4 25.4 15.8X +Parquet Vectorized 6384 6413 29 2.5 405.9 1.0X +Parquet Vectorized (Pushdown) 885 890 5 17.8 56.2 7.2X +Native ORC Vectorized 4935 4972 41 3.2 313.8 1.3X +Native ORC Vectorized (Pushdown) 421 425 2 37.3 26.8 15.2X -OpenJDK 
64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor InSet -> InFilters (values count: 50, distribution: 50): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 6309 6328 20 2.5 401.1 1.0X -Parquet Vectorized (Pushdown) 3291 3308 16 4.8 209.2 1.9X -Native ORC Vectorized 5019 5032 12 3.1 319.1 1.3X -Native ORC Vectorized (Pushdown) 429 433 3 36.6 27.3 14.7X +Parquet Vectorized 6392 6432 25 2.5 406.4 1.0X +Parquet Vectorized (Pushdown) 3230 3247 12 4.9 205.4 2.0X +Native ORC Vectorized 4940 4974 34 3.2 314.1 1.3X +Native ORC Vectorized (Pushdown) 430 434 4 36.6 27.4 14.9X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor InSet -> InFilters (values count: 50, distribution: 90): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 6311 6335 22 2.5 401.3 1.0X -Parquet Vectorized (Pushdown) 5508 5519 9 2.9 350.2 1.1X -Native ORC Vectorized 5020 5036 13 3.1 319.2 1.3X -Native ORC Vectorized (Pushdown) 442 444 2 35.6 28.1 14.3X +Parquet Vectorized 6399 6407 5 2.5 406.8 1.0X +Parquet Vectorized (Pushdown) 5280 5305 32 3.0 335.7 1.2X +Native ORC Vectorized 4913 4920 9 3.2 312.4 1.3X +Native ORC Vectorized (Pushdown) 422 428 5 37.3 26.8 15.2X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor InSet -> InFilters (values count: 100, distribution: 10): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
---------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 6271 6290 14 2.5 398.7 1.0X -Parquet Vectorized (Pushdown) 872 877 4 18.0 55.4 7.2X -Native ORC Vectorized 4971 4981 9 3.2 316.0 1.3X -Native ORC Vectorized (Pushdown) 497 502 4 31.7 31.6 12.6X +Parquet Vectorized 6387 6415 28 2.5 406.1 1.0X +Parquet Vectorized (Pushdown) 879 884 3 17.9 55.9 7.3X +Native ORC Vectorized 4898 4907 8 3.2 311.4 1.3X +Native ORC Vectorized (Pushdown) 512 514 1 30.7 32.6 12.5X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor InSet -> InFilters (values count: 100, distribution: 50): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ---------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 6284 6295 8 2.5 399.5 1.0X -Parquet Vectorized (Pushdown) 3320 3340 13 4.7 211.1 1.9X -Native ORC Vectorized 4972 4984 8 3.2 316.1 1.3X -Native ORC Vectorized (Pushdown) 564 567 3 27.9 35.9 11.1X +Parquet Vectorized 6409 6424 15 2.5 407.5 1.0X +Parquet Vectorized (Pushdown) 3279 3297 30 4.8 208.5 2.0X +Native ORC Vectorized 4900 4920 24 3.2 311.5 1.3X +Native ORC Vectorized (Pushdown) 584 592 7 26.9 37.2 11.0X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor InSet -> InFilters (values count: 100, distribution: 90): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ---------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 6289 6307 12 2.5 399.8 1.0X -Parquet Vectorized (Pushdown) 5740 5750 11 2.7 365.0 1.1X -Native ORC Vectorized 4972 4982 6 3.2 
316.1 1.3X -Native ORC Vectorized (Pushdown) 559 567 11 28.1 35.6 11.2X +Parquet Vectorized 6420 6445 26 2.4 408.2 1.0X +Parquet Vectorized (Pushdown) 5734 5745 12 2.7 364.6 1.1X +Native ORC Vectorized 4940 5018 59 3.2 314.0 1.3X +Native ORC Vectorized (Pushdown) 575 581 7 27.4 36.5 11.2X ================================================================================================ Pushdown benchmark for tinyint ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Select 1 tinyint row (value = CAST(63 AS tinyint)): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ---------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 2748 2766 16 5.7 174.7 1.0X -Parquet Vectorized (Pushdown) 107 114 8 146.5 6.8 25.6X -Native ORC Vectorized 2194 2203 9 7.2 139.5 1.3X -Native ORC Vectorized (Pushdown) 112 121 9 140.3 7.1 24.5X +Parquet Vectorized 2841 2865 29 5.5 180.6 1.0X +Parquet Vectorized (Pushdown) 112 122 10 140.6 7.1 25.4X +Native ORC Vectorized 2239 2247 8 7.0 142.4 1.3X +Native ORC Vectorized (Pushdown) 115 130 16 136.9 7.3 24.7X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Select 10% tinyint rows (value < CAST(12 AS tinyint)): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 3325 3359 48 4.7 211.4 1.0X -Parquet Vectorized (Pushdown) 960 973 13 16.4 61.0 3.5X -Native ORC Vectorized 2691 2705 17 5.8 171.1 1.2X -Native ORC Vectorized (Pushdown) 840 846 3 18.7 53.4 4.0X 
+Parquet Vectorized 3366 3422 87 4.7 214.0 1.0X +Parquet Vectorized (Pushdown) 987 990 2 15.9 62.7 3.4X +Native ORC Vectorized 2766 2784 10 5.7 175.9 1.2X +Native ORC Vectorized (Pushdown) 876 879 3 18.0 55.7 3.8X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Select 50% tinyint rows (value < CAST(63 AS tinyint)): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 5896 5906 9 2.7 374.9 1.0X -Parquet Vectorized (Pushdown) 4608 4631 20 3.4 293.0 1.3X -Native ORC Vectorized 5059 5084 21 3.1 321.6 1.2X -Native ORC Vectorized (Pushdown) 4014 4027 8 3.9 255.2 1.5X +Parquet Vectorized 6015 6025 7 2.6 382.4 1.0X +Parquet Vectorized (Pushdown) 4690 4699 9 3.4 298.2 1.3X +Native ORC Vectorized 5138 5194 38 3.1 326.7 1.2X +Native ORC Vectorized (Pushdown) 3988 4002 10 3.9 253.5 1.5X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Select 90% tinyint rows (value < CAST(114 AS tinyint)): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 8646 8666 24 1.8 549.7 1.0X -Parquet Vectorized (Pushdown) 8379 8396 11 1.9 532.7 1.0X -Native ORC Vectorized 7526 7539 14 2.1 478.5 1.1X -Native ORC Vectorized (Pushdown) 7319 7342 16 2.1 465.4 1.2X +Parquet Vectorized 8535 8557 21 1.8 542.7 1.0X +Parquet Vectorized (Pushdown) 8308 8326 14 1.9 528.2 1.0X +Native ORC Vectorized 7581 7600 24 2.1 482.0 1.1X +Native ORC Vectorized (Pushdown) 7379 7399 22 2.1 469.1 1.2X 
================================================================================================ Pushdown benchmark for Timestamp ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Select 1 timestamp stored as INT96 row (value = timestamp_seconds(7864320)): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ----------------------------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 3102 3122 23 5.1 197.2 1.0X -Parquet Vectorized (Pushdown) 3096 3104 5 5.1 196.8 1.0X -Native ORC Vectorized 1983 1994 15 7.9 126.1 1.6X -Native ORC Vectorized (Pushdown) 39 44 5 404.7 2.5 79.8X +Parquet Vectorized 3155 3166 9 5.0 200.6 1.0X +Parquet Vectorized (Pushdown) 3169 3174 5 5.0 201.5 1.0X +Native ORC Vectorized 2102 2116 17 7.5 133.7 1.5X +Native ORC Vectorized (Pushdown) 39 44 6 399.9 2.5 80.2X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Select 10% timestamp stored as INT96 rows (value < timestamp_seconds(1572864)): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 3791 3820 27 4.1 241.0 1.0X -Parquet Vectorized (Pushdown) 3755 3774 12 4.2 238.8 1.0X -Native ORC Vectorized 2618 2635 18 6.0 166.5 1.4X -Native ORC Vectorized (Pushdown) 860 865 6 18.3 54.7 4.4X +Parquet Vectorized 3827 3848 30 4.1 243.3 1.0X +Parquet Vectorized (Pushdown) 3803 3831 37 4.1 241.8 1.0X +Native ORC Vectorized 2738 2757 20 5.7 174.1 1.4X +Native ORC Vectorized (Pushdown) 
879 887 9 17.9 55.9 4.4X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Select 50% timestamp stored as INT96 rows (value < timestamp_seconds(7864320)): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 6492 6500 8 2.4 412.8 1.0X -Parquet Vectorized (Pushdown) 6485 6497 12 2.4 412.3 1.0X -Native ORC Vectorized 5272 5286 21 3.0 335.2 1.2X -Native ORC Vectorized (Pushdown) 4245 4253 9 3.7 269.9 1.5X +Parquet Vectorized 6597 6622 19 2.4 419.4 1.0X +Parquet Vectorized (Pushdown) 6618 6639 14 2.4 420.8 1.0X +Native ORC Vectorized 5324 5342 19 3.0 338.5 1.2X +Native ORC Vectorized (Pushdown) 4259 4264 5 3.7 270.8 1.5X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Select 90% timestamp stored as INT96 rows (value < timestamp_seconds(14155776)): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 9353 9362 9 1.7 594.7 1.0X -Parquet Vectorized (Pushdown) 9335 9349 17 1.7 593.5 1.0X -Native ORC Vectorized 7781 7799 17 2.0 494.7 1.2X -Native ORC Vectorized (Pushdown) 7598 7613 9 2.1 483.1 1.2X +Parquet Vectorized 9349 9366 22 1.7 594.4 1.0X +Parquet Vectorized (Pushdown) 9360 9391 21 1.7 595.1 1.0X +Native ORC Vectorized 7882 7909 23 2.0 501.1 1.2X +Native ORC Vectorized (Pushdown) 7666 7676 6 2.1 487.4 1.2X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core 
Processor Select 1 timestamp stored as TIMESTAMP_MICROS row (value = timestamp_seconds(7864320)): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ---------------------------------------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 2578 2590 12 6.1 163.9 1.0X -Parquet Vectorized (Pushdown) 70 77 11 223.4 4.5 36.6X -Native ORC Vectorized 1982 1987 8 7.9 126.0 1.3X -Native ORC Vectorized (Pushdown) 39 43 5 404.6 2.5 66.3X +Parquet Vectorized 2617 2627 7 6.0 166.4 1.0X +Parquet Vectorized (Pushdown) 69 74 9 229.5 4.4 38.2X +Native ORC Vectorized 2092 2097 4 7.5 133.0 1.3X +Native ORC Vectorized (Pushdown) 38 43 5 409.7 2.4 68.2X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Select 10% timestamp stored as TIMESTAMP_MICROS rows (value < timestamp_seconds(1572864)): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 3228 3236 9 4.9 205.2 1.0X -Parquet Vectorized (Pushdown) 983 987 5 16.0 62.5 3.3X -Native ORC Vectorized 2607 2617 6 6.0 165.7 1.2X -Native ORC Vectorized (Pushdown) 859 864 4 18.3 54.6 3.8X +Parquet Vectorized 3278 3287 10 4.8 208.4 1.0X +Parquet Vectorized (Pushdown) 999 1010 9 15.7 63.5 3.3X +Native ORC Vectorized 2724 2732 7 5.8 173.2 1.2X +Native ORC Vectorized (Pushdown) 864 870 5 18.2 54.9 3.8X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Select 50% timestamp stored as TIMESTAMP_MICROS rows (value < timestamp_seconds(7864320)): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
------------------------------------------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 5897 5908 9 2.7 374.9 1.0X -Parquet Vectorized (Pushdown) 4693 4705 13 3.4 298.3 1.3X -Native ORC Vectorized 5145 5195 75 3.1 327.1 1.1X -Native ORC Vectorized (Pushdown) 4134 4139 5 3.8 262.8 1.4X +Parquet Vectorized 6098 6125 40 2.6 387.7 1.0X +Parquet Vectorized (Pushdown) 4842 4859 15 3.2 307.9 1.3X +Native ORC Vectorized 5243 5246 3 3.0 333.4 1.2X +Native ORC Vectorized (Pushdown) 4205 4220 14 3.7 267.4 1.4X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Select 90% timestamp stored as TIMESTAMP_MICROS rows (value < timestamp_seconds(14155776)): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 8540 8549 9 1.8 542.9 1.0X -Parquet Vectorized (Pushdown) 8330 8339 11 1.9 529.6 1.0X -Native ORC Vectorized 7638 7650 11 2.1 485.6 1.1X -Native ORC Vectorized (Pushdown) 7440 7448 11 2.1 473.0 1.1X +Parquet Vectorized 8867 8893 24 1.8 563.7 1.0X +Parquet Vectorized (Pushdown) 8630 8677 38 1.8 548.7 1.0X +Native ORC Vectorized 7897 7900 2 2.0 502.1 1.1X +Native ORC Vectorized (Pushdown) 7700 7716 11 2.0 489.6 1.2X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Select 1 timestamp stored as TIMESTAMP_MILLIS row (value = timestamp_seconds(7864320)): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
---------------------------------------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 2748 2759 18 5.7 174.7 1.0X -Parquet Vectorized (Pushdown) 70 73 3 224.7 4.5 39.3X -Native ORC Vectorized 1986 1999 18 7.9 126.2 1.4X -Native ORC Vectorized (Pushdown) 39 42 5 407.9 2.5 71.3X +Parquet Vectorized 2783 2801 17 5.7 177.0 1.0X +Parquet Vectorized (Pushdown) 72 75 4 218.9 4.6 38.7X +Native ORC Vectorized 2023 2032 6 7.8 128.6 1.4X +Native ORC Vectorized (Pushdown) 40 43 4 393.7 2.5 69.7X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Select 10% timestamp stored as TIMESTAMP_MILLIS rows (value < timestamp_seconds(1572864)): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 3397 3403 7 4.6 216.0 1.0X -Parquet Vectorized (Pushdown) 999 1006 6 15.7 63.5 3.4X -Native ORC Vectorized 2612 2620 12 6.0 166.1 1.3X -Native ORC Vectorized (Pushdown) 876 879 2 18.0 55.7 3.9X +Parquet Vectorized 3474 3479 3 4.5 220.9 1.0X +Parquet Vectorized (Pushdown) 1036 1044 8 15.2 65.9 3.4X +Native ORC Vectorized 2757 2766 7 5.7 175.3 1.3X +Native ORC Vectorized (Pushdown) 910 914 4 17.3 57.9 3.8X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Select 50% timestamp stored as TIMESTAMP_MILLIS rows (value < timestamp_seconds(7864320)): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
------------------------------------------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 6116 6123 9 2.6 388.8 1.0X -Parquet Vectorized (Pushdown) 4802 4813 11 3.3 305.3 1.3X -Native ORC Vectorized 5152 5160 7 3.1 327.6 1.2X -Native ORC Vectorized (Pushdown) 4126 4138 9 3.8 262.3 1.5X +Parquet Vectorized 6243 6270 16 2.5 396.9 1.0X +Parquet Vectorized (Pushdown) 4928 4956 36 3.2 313.3 1.3X +Native ORC Vectorized 5326 5332 5 3.0 338.6 1.2X +Native ORC Vectorized (Pushdown) 4262 4272 8 3.7 271.0 1.5X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Select 90% timestamp stored as TIMESTAMP_MILLIS rows (value < timestamp_seconds(14155776)): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 8692 8746 68 1.8 552.6 1.0X -Parquet Vectorized (Pushdown) 8481 8495 15 1.9 539.2 1.0X -Native ORC Vectorized 7644 7653 10 2.1 486.0 1.1X -Native ORC Vectorized (Pushdown) 7449 7462 9 2.1 473.6 1.2X +Parquet Vectorized 8993 9036 38 1.7 571.7 1.0X +Parquet Vectorized (Pushdown) 8777 8803 19 1.8 558.0 1.0X +Native ORC Vectorized 7774 7790 17 2.0 494.3 1.2X +Native ORC Vectorized (Pushdown) 7573 7587 12 2.1 481.5 1.2X ================================================================================================ Pushdown benchmark with many filters ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Select 1 row with 1 filters: Best Time(ms) Avg Time(ms) Stdev(ms) 
Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Parquet Vectorized 58 72 19 0.0 57741888.0 1.0X -Parquet Vectorized (Pushdown) 58 61 3 0.0 58429929.0 1.0X -Native ORC Vectorized 51 53 2 0.0 51359839.0 1.1X -Native ORC Vectorized (Pushdown) 53 55 3 0.0 53142981.0 1.1X +Parquet Vectorized 49 75 18 0.0 49268544.0 1.0X +Parquet Vectorized (Pushdown) 49 52 3 0.0 48949281.0 1.0X +Native ORC Vectorized 43 46 3 0.0 43343584.0 1.1X +Native ORC Vectorized (Pushdown) 44 48 4 0.0 44392858.0 1.1X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Select 1 row with 250 filters: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Parquet Vectorized 400 420 16 0.0 400224642.0 1.0X -Parquet Vectorized (Pushdown) 399 417 11 0.0 399319343.0 1.0X -Native ORC Vectorized 387 393 7 0.0 387215337.0 1.0X -Native ORC Vectorized (Pushdown) 390 396 7 0.0 389851290.0 1.0X +Parquet Vectorized 193 196 3 0.0 192567723.0 1.0X +Parquet Vectorized (Pushdown) 191 206 18 0.0 191266175.0 1.0X +Native ORC Vectorized 178 182 4 0.0 178471724.0 1.1X +Native ORC Vectorized (Pushdown) 184 190 7 0.0 183580008.0 1.0X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Select 1 row with 500 filters: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Parquet Vectorized 2216 2266 67 0.0 2215862652.0 1.0X -Parquet Vectorized (Pushdown) 2237 2281 47 0.0 2237304947.0 1.0X -Native ORC Vectorized 2202 2257 58 0.0 
2202335420.0 1.0X -Native ORC Vectorized (Pushdown) 2219 2262 65 0.0 2219444511.0 1.0X +Parquet Vectorized 581 587 6 0.0 581443562.0 1.0X +Parquet Vectorized (Pushdown) 591 611 14 0.0 591021175.0 1.0X +Native ORC Vectorized 563 580 11 0.0 563194077.0 1.0X +Native ORC Vectorized (Pushdown) 583 597 12 0.0 582533796.0 1.0X diff --git a/sql/core/benchmarks/FilterPushdownBenchmark-results.txt b/sql/core/benchmarks/FilterPushdownBenchmark-results.txt index f762a7147d31b..e6f878de0a974 100644 --- a/sql/core/benchmarks/FilterPushdownBenchmark-results.txt +++ b/sql/core/benchmarks/FilterPushdownBenchmark-results.txt @@ -2,733 +2,733 @@ Pushdown for many distinct value case ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Select 0 string row (value IS NULL): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Parquet Vectorized 6564 6709 108 2.4 417.3 1.0X -Parquet Vectorized (Pushdown) 315 335 18 50.0 20.0 20.9X -Native ORC Vectorized 5085 5205 71 3.1 323.3 1.3X -Native ORC Vectorized (Pushdown) 296 309 11 53.2 18.8 22.2X +Parquet Vectorized 6867 6919 54 2.3 436.6 1.0X +Parquet Vectorized (Pushdown) 313 342 24 50.3 19.9 22.0X +Native ORC Vectorized 5135 5177 42 3.1 326.5 1.3X +Native ORC Vectorized (Pushdown) 314 327 9 50.1 19.9 21.9X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Select 0 string row ('7864320' < value < '7864320'): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
----------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 6573 6616 51 2.4 417.9 1.0X -Parquet Vectorized (Pushdown) 291 309 16 54.1 18.5 22.6X -Native ORC Vectorized 5027 5047 17 3.1 319.6 1.3X -Native ORC Vectorized (Pushdown) 292 316 19 53.9 18.5 22.5X +Parquet Vectorized 6952 6967 17 2.3 442.0 1.0X +Parquet Vectorized (Pushdown) 313 324 14 50.2 19.9 22.2X +Native ORC Vectorized 5212 5234 18 3.0 331.4 1.3X +Native ORC Vectorized (Pushdown) 318 331 8 49.5 20.2 21.9X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Select 1 string row (value = '7864320'): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Parquet Vectorized 6453 6497 31 2.4 410.3 1.0X -Parquet Vectorized (Pushdown) 289 295 6 54.4 18.4 22.3X -Native ORC Vectorized 4973 5006 25 3.2 316.2 1.3X -Native ORC Vectorized (Pushdown) 276 288 14 57.0 17.5 23.4X +Parquet Vectorized 6921 6940 11 2.3 440.1 1.0X +Parquet Vectorized (Pushdown) 299 310 11 52.7 19.0 23.2X +Native ORC Vectorized 5203 5210 6 3.0 330.8 1.3X +Native ORC Vectorized (Pushdown) 312 319 7 50.4 19.8 22.2X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Select 1 string row (value <=> '7864320'): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 6498 6553 74 2.4 413.1 1.0X -Parquet Vectorized (Pushdown) 284 294 7 55.3 18.1 22.9X -Native ORC Vectorized 5070 5087 11 3.1 322.3 1.3X -Native ORC Vectorized (Pushdown) 272 287 14 57.9 
17.3 23.9X +Parquet Vectorized 6899 6925 15 2.3 438.6 1.0X +Parquet Vectorized (Pushdown) 286 303 13 55.0 18.2 24.1X +Native ORC Vectorized 5194 5210 15 3.0 330.2 1.3X +Native ORC Vectorized (Pushdown) 296 303 6 53.2 18.8 23.3X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Select 1 string row ('7864320' <= value <= '7864320'): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 6558 6601 43 2.4 416.9 1.0X -Parquet Vectorized (Pushdown) 275 288 9 57.1 17.5 23.8X -Native ORC Vectorized 5016 5046 26 3.1 318.9 1.3X -Native ORC Vectorized (Pushdown) 273 289 18 57.5 17.4 24.0X +Parquet Vectorized 6934 6957 27 2.3 440.8 1.0X +Parquet Vectorized (Pushdown) 288 296 8 54.6 18.3 24.1X +Native ORC Vectorized 5212 5229 15 3.0 331.4 1.3X +Native ORC Vectorized (Pushdown) 304 308 3 51.8 19.3 22.8X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Select all string rows (value IS NOT NULL): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 13444 13642 185 1.2 854.8 1.0X -Parquet Vectorized (Pushdown) 13455 13505 51 1.2 855.4 1.0X -Native ORC Vectorized 12196 12247 41 1.3 775.4 1.1X -Native ORC Vectorized (Pushdown) 12230 12264 21 1.3 777.5 1.1X +Parquet Vectorized 13657 13798 103 1.2 868.3 1.0X +Parquet Vectorized (Pushdown) 13709 13730 14 1.1 871.6 1.0X +Native ORC Vectorized 12028 12061 37 1.3 764.7 1.1X +Native ORC Vectorized (Pushdown) 12105 12152 29 1.3 769.6 1.1X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 
6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Select 0 int row (value IS NULL): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Parquet Vectorized 6126 6159 26 2.6 389.5 1.0X -Parquet Vectorized (Pushdown) 260 273 10 60.6 16.5 23.6X -Native ORC Vectorized 4546 4572 18 3.5 289.0 1.3X -Native ORC Vectorized (Pushdown) 260 275 11 60.5 16.5 23.5X +Parquet Vectorized 6544 6575 26 2.4 416.1 1.0X +Parquet Vectorized (Pushdown) 274 283 9 57.4 17.4 23.9X +Native ORC Vectorized 4734 4753 13 3.3 301.0 1.4X +Native ORC Vectorized (Pushdown) 283 293 8 55.5 18.0 23.1X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Select 0 int row (7864320 < value < 7864320): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ---------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 6123 6138 11 2.6 389.3 1.0X -Parquet Vectorized (Pushdown) 273 286 15 57.7 17.3 22.4X -Native ORC Vectorized 4557 4590 42 3.5 289.7 1.3X -Native ORC Vectorized (Pushdown) 265 277 8 59.3 16.9 23.1X +Parquet Vectorized 6522 6549 39 2.4 414.7 1.0X +Parquet Vectorized (Pushdown) 285 296 10 55.3 18.1 22.9X +Native ORC Vectorized 4717 4734 11 3.3 299.9 1.4X +Native ORC Vectorized (Pushdown) 290 296 5 54.3 18.4 22.5X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Select 1 int row (value = 7864320): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
------------------------------------------------------------------------------------------------------------------------ -Parquet Vectorized 6105 6212 72 2.6 388.1 1.0X -Parquet Vectorized (Pushdown) 272 277 7 57.8 17.3 22.4X -Native ORC Vectorized 4581 4651 77 3.4 291.2 1.3X -Native ORC Vectorized (Pushdown) 264 275 9 59.7 16.8 23.2X +Parquet Vectorized 6556 6567 10 2.4 416.8 1.0X +Parquet Vectorized (Pushdown) 279 288 6 56.3 17.8 23.5X +Native ORC Vectorized 4778 4790 8 3.3 303.8 1.4X +Native ORC Vectorized (Pushdown) 285 291 4 55.2 18.1 23.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Select 1 int row (value <=> 7864320): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Parquet Vectorized 6162 6217 59 2.6 391.8 1.0X -Parquet Vectorized (Pushdown) 263 275 9 59.8 16.7 23.4X -Native ORC Vectorized 4611 4630 23 3.4 293.1 1.3X -Native ORC Vectorized (Pushdown) 259 267 5 60.8 16.4 23.8X +Parquet Vectorized 6561 6594 41 2.4 417.1 1.0X +Parquet Vectorized (Pushdown) 279 284 4 56.4 17.7 23.5X +Native ORC Vectorized 4785 4792 8 3.3 304.2 1.4X +Native ORC Vectorized (Pushdown) 284 292 6 55.3 18.1 23.1X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Select 1 int row (7864320 <= value <= 7864320): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------ -Parquet Vectorized 6109 6137 21 2.6 388.4 1.0X -Parquet Vectorized (Pushdown) 260 270 5 60.4 16.5 23.5X -Native ORC Vectorized 4596 4621 34 3.4 292.2 1.3X -Native ORC Vectorized (Pushdown) 263 272 7 59.7 16.8 23.2X 
+Parquet Vectorized 6568 6587 21 2.4 417.6 1.0X +Parquet Vectorized (Pushdown) 277 282 3 56.9 17.6 23.7X +Native ORC Vectorized 4775 4798 19 3.3 303.6 1.4X +Native ORC Vectorized (Pushdown) 284 290 5 55.3 18.1 23.1X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Select 1 int row (7864319 < value < 7864321): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ---------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 6104 6142 41 2.6 388.1 1.0X -Parquet Vectorized (Pushdown) 266 278 13 59.0 16.9 22.9X -Native ORC Vectorized 4601 4668 40 3.4 292.5 1.3X -Native ORC Vectorized (Pushdown) 264 271 7 59.5 16.8 23.1X +Parquet Vectorized 6557 6576 20 2.4 416.9 1.0X +Parquet Vectorized (Pushdown) 275 283 7 57.3 17.5 23.9X +Native ORC Vectorized 4783 4807 23 3.3 304.1 1.4X +Native ORC Vectorized (Pushdown) 284 289 4 55.4 18.0 23.1X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Select 10% int rows (value < 1572864): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Parquet Vectorized 6775 6878 122 2.3 430.8 1.0X -Parquet Vectorized (Pushdown) 1502 1519 13 10.5 95.5 4.5X -Native ORC Vectorized 5241 5259 17 3.0 333.2 1.3X -Native ORC Vectorized (Pushdown) 1346 1359 11 11.7 85.6 5.0X +Parquet Vectorized 7224 7258 23 2.2 459.3 1.0X +Parquet Vectorized (Pushdown) 1586 1589 4 9.9 100.8 4.6X +Native ORC Vectorized 5423 5455 30 2.9 344.8 1.3X +Native ORC Vectorized (Pushdown) 1408 1430 25 11.2 89.5 5.1X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on 
Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Select 50% int rows (value < 7864320): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Parquet Vectorized 9068 9115 48 1.7 576.5 1.0X -Parquet Vectorized (Pushdown) 6144 6157 17 2.6 390.6 1.5X -Native ORC Vectorized 7649 7712 67 2.1 486.3 1.2X -Native ORC Vectorized (Pushdown) 5542 5561 15 2.8 352.4 1.6X +Parquet Vectorized 9684 9692 5 1.6 615.7 1.0X +Parquet Vectorized (Pushdown) 6559 6581 17 2.4 417.0 1.5X +Native ORC Vectorized 7866 7894 24 2.0 500.1 1.2X +Native ORC Vectorized (Pushdown) 5654 5668 11 2.8 359.5 1.7X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Select 90% int rows (value < 14155776): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Parquet Vectorized 11477 11634 171 1.4 729.7 1.0X -Parquet Vectorized (Pushdown) 10963 11008 79 1.4 697.0 1.0X -Native ORC Vectorized 9938 9974 34 1.6 631.9 1.2X -Native ORC Vectorized (Pushdown) 9611 9667 77 1.6 611.1 1.2X +Parquet Vectorized 12234 12243 10 1.3 777.8 1.0X +Parquet Vectorized (Pushdown) 11654 11671 17 1.3 740.9 1.0X +Native ORC Vectorized 10449 10479 23 1.5 664.3 1.2X +Native ORC Vectorized (Pushdown) 10073 10120 40 1.6 640.4 1.2X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Select all int rows (value IS NOT NULL): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Parquet Vectorized 11960 12016 47 
1.3 760.4 1.0X -Parquet Vectorized (Pushdown) 12071 12135 58 1.3 767.5 1.0X -Native ORC Vectorized 10598 10650 53 1.5 673.8 1.1X -Native ORC Vectorized (Pushdown) 10651 10736 70 1.5 677.2 1.1X +Parquet Vectorized 12733 12756 20 1.2 809.5 1.0X +Parquet Vectorized (Pushdown) 12700 12719 20 1.2 807.4 1.0X +Native ORC Vectorized 10963 10996 31 1.4 697.0 1.2X +Native ORC Vectorized (Pushdown) 11063 11088 23 1.4 703.4 1.2X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Select all int rows (value > -1): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Parquet Vectorized 11960 11979 19 1.3 760.4 1.0X -Parquet Vectorized (Pushdown) 12058 12147 68 1.3 766.7 1.0X -Native ORC Vectorized 10563 10620 37 1.5 671.6 1.1X -Native ORC Vectorized (Pushdown) 10708 10947 187 1.5 680.8 1.1X +Parquet Vectorized 12663 12687 27 1.2 805.1 1.0X +Parquet Vectorized (Pushdown) 12760 12792 26 1.2 811.2 1.0X +Native ORC Vectorized 10947 10976 42 1.4 696.0 1.2X +Native ORC Vectorized (Pushdown) 11021 11058 24 1.4 700.7 1.1X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Select all int rows (value != -1): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Parquet Vectorized 12110 12189 60 1.3 769.9 1.0X -Parquet Vectorized (Pushdown) 12337 12422 81 1.3 784.4 1.0X -Native ORC Vectorized 10589 10660 105 1.5 673.2 1.1X -Native ORC Vectorized (Pushdown) 10648 10762 72 1.5 677.0 1.1X +Parquet Vectorized 12632 12656 24 1.2 803.1 1.0X +Parquet Vectorized (Pushdown) 12696 12733 30 1.2 807.2 1.0X 
+Native ORC Vectorized 10943 10969 22 1.4 695.7 1.2X +Native ORC Vectorized (Pushdown) 11050 11103 41 1.4 702.6 1.1X ================================================================================================ Pushdown for few distinct value case (use dictionary encoding) ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Select 0 distinct string row (value IS NULL): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ---------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 5859 5871 12 2.7 372.5 1.0X -Parquet Vectorized (Pushdown) 237 246 10 66.4 15.1 24.8X -Native ORC Vectorized 6491 6523 29 2.4 412.7 0.9X -Native ORC Vectorized (Pushdown) 907 910 2 17.3 57.6 6.5X +Parquet Vectorized 5890 5925 33 2.7 374.5 1.0X +Parquet Vectorized (Pushdown) 239 251 11 65.8 15.2 24.7X +Native ORC Vectorized 6519 6541 16 2.4 414.5 0.9X +Native ORC Vectorized (Pushdown) 959 961 1 16.4 61.0 6.1X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Select 0 distinct string row ('100' < value < '100'): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------------ -Parquet Vectorized 5937 5962 23 2.6 377.5 1.0X -Parquet Vectorized (Pushdown) 239 245 9 65.8 15.2 24.9X -Native ORC Vectorized 6769 6788 34 2.3 430.4 0.9X -Native ORC Vectorized (Pushdown) 914 925 13 17.2 58.1 6.5X +Parquet Vectorized 6021 6046 24 2.6 382.8 1.0X +Parquet Vectorized (Pushdown) 241 257 11 65.2 15.3 25.0X +Native ORC Vectorized 6712 6738 20 2.3 426.7 
0.9X +Native ORC Vectorized (Pushdown) 957 970 10 16.4 60.9 6.3X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Select 1 distinct string row (value = '100'): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ---------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 5868 5878 6 2.7 373.1 1.0X -Parquet Vectorized (Pushdown) 284 289 3 55.3 18.1 20.6X -Native ORC Vectorized 6676 6696 23 2.4 424.5 0.9X -Native ORC Vectorized (Pushdown) 956 963 11 16.5 60.8 6.1X +Parquet Vectorized 5962 5982 17 2.6 379.1 1.0X +Parquet Vectorized (Pushdown) 288 294 8 54.6 18.3 20.7X +Native ORC Vectorized 6667 6694 27 2.4 423.9 0.9X +Native ORC Vectorized (Pushdown) 986 997 6 15.9 62.7 6.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Select 1 distinct string row (value <=> '100'): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------ -Parquet Vectorized 5940 5949 6 2.6 377.7 1.0X -Parquet Vectorized (Pushdown) 290 295 5 54.2 18.5 20.5X -Native ORC Vectorized 6733 6746 10 2.3 428.1 0.9X -Native ORC Vectorized (Pushdown) 953 966 12 16.5 60.6 6.2X +Parquet Vectorized 5961 5968 5 2.6 379.0 1.0X +Parquet Vectorized (Pushdown) 286 301 11 54.9 18.2 20.8X +Native ORC Vectorized 6618 6670 68 2.4 420.7 0.9X +Native ORC Vectorized (Pushdown) 988 996 10 15.9 62.8 6.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Select 1 distinct string row ('100' <= value <= '100'): Best Time(ms) Avg Time(ms) Stdev(ms) 
Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 5946 5966 17 2.6 378.0 1.0X -Parquet Vectorized (Pushdown) 292 296 5 53.9 18.5 20.4X -Native ORC Vectorized 6741 6751 6 2.3 428.6 0.9X -Native ORC Vectorized (Pushdown) 958 964 5 16.4 60.9 6.2X +Parquet Vectorized 6025 6052 15 2.6 383.1 1.0X +Parquet Vectorized (Pushdown) 288 296 8 54.6 18.3 20.9X +Native ORC Vectorized 6727 6756 39 2.3 427.7 0.9X +Native ORC Vectorized (Pushdown) 988 997 10 15.9 62.8 6.1X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Select all distinct string rows (value IS NOT NULL): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ----------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 14342 14388 32 1.1 911.8 1.0X -Parquet Vectorized (Pushdown) 14351 14404 51 1.1 912.4 1.0X -Native ORC Vectorized 14291 14316 30 1.1 908.6 1.0X -Native ORC Vectorized (Pushdown) 14452 14468 10 1.1 918.8 1.0X +Parquet Vectorized 14170 14195 23 1.1 900.9 1.0X +Parquet Vectorized (Pushdown) 14143 14168 22 1.1 899.2 1.0X +Native ORC Vectorized 14438 14488 39 1.1 918.0 1.0X +Native ORC Vectorized (Pushdown) 14638 14703 103 1.1 930.6 1.0X ================================================================================================ Pushdown benchmark for StringStartsWith ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor StringStartsWith filter: (value like '10%'): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
--------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 7092 7104 14 2.2 450.9 1.0X -Parquet Vectorized (Pushdown) 930 935 3 16.9 59.1 7.6X -Native ORC Vectorized 5306 5327 28 3.0 337.3 1.3X -Native ORC Vectorized (Pushdown) 5385 5398 10 2.9 342.4 1.3X +Parquet Vectorized 7156 7212 82 2.2 455.0 1.0X +Parquet Vectorized (Pushdown) 948 952 5 16.6 60.2 7.6X +Native ORC Vectorized 5320 5345 31 3.0 338.2 1.3X +Native ORC Vectorized (Pushdown) 5413 5424 7 2.9 344.1 1.3X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor StringStartsWith filter: (value like '1000%'): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ----------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 6867 6913 26 2.3 436.6 1.0X -Parquet Vectorized (Pushdown) 278 283 4 56.5 17.7 24.7X -Native ORC Vectorized 5146 5154 5 3.1 327.2 1.3X -Native ORC Vectorized (Pushdown) 5225 5236 7 3.0 332.2 1.3X +Parquet Vectorized 7011 7034 30 2.2 445.7 1.0X +Parquet Vectorized (Pushdown) 280 286 8 56.1 17.8 25.0X +Native ORC Vectorized 5194 5204 9 3.0 330.2 1.3X +Native ORC Vectorized (Pushdown) 5264 5284 15 3.0 334.7 1.3X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor StringStartsWith filter: (value like '786432%'): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 6831 6849 11 2.3 434.3 1.0X -Parquet Vectorized (Pushdown) 265 272 6 59.2 16.9 25.7X -Native ORC Vectorized 5114 5140 18 3.1 325.2 1.3X -Native ORC Vectorized (Pushdown) 
5193 5227 27 3.0 330.1 1.3X +Parquet Vectorized 6995 7025 21 2.2 444.7 1.0X +Parquet Vectorized (Pushdown) 277 287 12 56.7 17.6 25.2X +Native ORC Vectorized 5182 5205 23 3.0 329.5 1.3X +Native ORC Vectorized (Pushdown) 5260 5271 11 3.0 334.4 1.3X ================================================================================================ Pushdown benchmark for StringEndsWith ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor StringEndsWith filter: (value like '%10'): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 5890 5903 16 2.7 374.5 1.0X -Parquet Vectorized (Pushdown) 366 369 3 43.0 23.3 16.1X -Native ORC Vectorized 6686 6712 29 2.4 425.1 0.9X -Native ORC Vectorized (Pushdown) 6877 6895 12 2.3 437.2 0.9X +Parquet Vectorized 5961 5985 24 2.6 379.0 1.0X +Parquet Vectorized (Pushdown) 366 375 10 42.9 23.3 16.3X +Native ORC Vectorized 6698 6718 19 2.3 425.9 0.9X +Native ORC Vectorized (Pushdown) 6899 6921 18 2.3 438.6 0.9X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor StringEndsWith filter: (value like '%1000'): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 5860 5875 13 2.7 372.6 1.0X -Parquet Vectorized (Pushdown) 269 275 7 58.5 17.1 21.8X -Native ORC Vectorized 6606 6637 23 2.4 420.0 0.9X -Native ORC Vectorized (Pushdown) 6803 6830 28 2.3 432.5 0.9X +Parquet Vectorized 5940 5950 7 2.6 377.7 1.0X +Parquet 
Vectorized (Pushdown) 269 278 9 58.5 17.1 22.1X +Native ORC Vectorized 6613 6643 24 2.4 420.5 0.9X +Native ORC Vectorized (Pushdown) 6856 6870 14 2.3 435.9 0.9X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor StringEndsWith filter: (value like '%786432'): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ----------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 5806 5835 24 2.7 369.1 1.0X -Parquet Vectorized (Pushdown) 263 271 4 59.9 16.7 22.1X -Native ORC Vectorized 6617 6624 5 2.4 420.7 0.9X -Native ORC Vectorized (Pushdown) 6771 6784 13 2.3 430.5 0.9X +Parquet Vectorized 5939 5955 17 2.6 377.6 1.0X +Parquet Vectorized (Pushdown) 270 292 36 58.2 17.2 22.0X +Native ORC Vectorized 6634 6655 20 2.4 421.8 0.9X +Native ORC Vectorized (Pushdown) 6824 6853 25 2.3 433.9 0.9X ================================================================================================ Pushdown benchmark for StringContains ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor StringContains filter: (value like '%10%'): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 5983 6035 43 2.6 380.4 1.0X -Parquet Vectorized (Pushdown) 790 795 4 19.9 50.2 7.6X -Native ORC Vectorized 6785 6803 11 2.3 431.4 0.9X -Native ORC Vectorized (Pushdown) 6943 6977 24 2.3 441.4 0.9X +Parquet Vectorized 6160 6173 14 2.6 391.7 1.0X +Parquet Vectorized (Pushdown) 801 808 5 19.6 50.9 7.7X +Native ORC Vectorized 6872 6881 11 2.3 
436.9 0.9X +Native ORC Vectorized (Pushdown) 7071 7082 7 2.2 449.6 0.9X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor StringContains filter: (value like '%1000%'): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ---------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 5790 5839 29 2.7 368.1 1.0X -Parquet Vectorized (Pushdown) 267 271 4 58.9 17.0 21.7X -Native ORC Vectorized 6623 6635 14 2.4 421.1 0.9X -Native ORC Vectorized (Pushdown) 6782 6797 10 2.3 431.2 0.9X +Parquet Vectorized 5949 5968 17 2.6 378.2 1.0X +Parquet Vectorized (Pushdown) 272 278 4 57.9 17.3 21.9X +Native ORC Vectorized 6649 6666 23 2.4 422.7 0.9X +Native ORC Vectorized (Pushdown) 6860 6888 23 2.3 436.1 0.9X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor StringContains filter: (value like '%786432%'): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------ -Parquet Vectorized 5811 5831 16 2.7 369.4 1.0X -Parquet Vectorized (Pushdown) 263 273 10 59.8 16.7 22.1X -Native ORC Vectorized 6563 6609 27 2.4 417.3 0.9X -Native ORC Vectorized (Pushdown) 6734 6772 26 2.3 428.1 0.9X +Parquet Vectorized 5936 5959 19 2.6 377.4 1.0X +Parquet Vectorized (Pushdown) 271 276 3 58.1 17.2 21.9X +Native ORC Vectorized 6637 6657 15 2.4 422.0 0.9X +Native ORC Vectorized (Pushdown) 6852 6878 24 2.3 435.6 0.9X ================================================================================================ Pushdown benchmark for decimal ================================================================================================ -OpenJDK 64-Bit 
Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Select 1 decimal(9, 2) row (value = 7864320): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ---------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 2805 2825 20 5.6 178.3 1.0X -Parquet Vectorized (Pushdown) 70 73 5 226.2 4.4 40.3X -Native ORC Vectorized 3503 3543 26 4.5 222.7 0.8X -Native ORC Vectorized (Pushdown) 55 59 3 286.3 3.5 51.1X +Parquet Vectorized 2866 2879 13 5.5 182.2 1.0X +Parquet Vectorized (Pushdown) 71 74 5 222.8 4.5 40.6X +Native ORC Vectorized 3278 3290 12 4.8 208.4 0.9X +Native ORC Vectorized (Pushdown) 59 63 5 268.1 3.7 48.9X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Select 10% decimal(9, 2) rows (value < 1572864): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 4151 4162 9 3.8 263.9 1.0X -Parquet Vectorized (Pushdown) 1966 1981 15 8.0 125.0 2.1X -Native ORC Vectorized 5029 5082 51 3.1 319.7 0.8X -Native ORC Vectorized (Pushdown) 2193 2203 8 7.2 139.4 1.9X +Parquet Vectorized 4201 4213 14 3.7 267.1 1.0X +Parquet Vectorized (Pushdown) 1950 1959 6 8.1 124.0 2.2X +Native ORC Vectorized 4784 4797 15 3.3 304.2 0.9X +Native ORC Vectorized (Pushdown) 2117 2120 3 7.4 134.6 2.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Select 50% decimal(9, 2) rows (value < 7864320): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 8776 8810 40 1.8 558.0 1.0X -Parquet Vectorized (Pushdown) 8460 8484 13 1.9 537.9 1.0X -Native ORC Vectorized 9930 9952 17 1.6 631.4 0.9X -Native ORC Vectorized (Pushdown) 9440 9476 30 1.7 600.2 0.9X +Parquet Vectorized 8854 8870 18 1.8 562.9 1.0X +Parquet Vectorized (Pushdown) 8480 8486 8 1.9 539.1 1.0X +Native ORC Vectorized 9614 9653 56 1.6 611.2 0.9X +Native ORC Vectorized (Pushdown) 9180 9242 85 1.7 583.6 1.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Select 90% decimal(9, 2) rows (value < 14155776): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 10066 10176 84 1.6 640.0 1.0X -Parquet Vectorized (Pushdown) 10147 10173 29 1.6 645.1 1.0X -Native ORC Vectorized 10790 10854 60 1.5 686.0 0.9X -Native ORC Vectorized (Pushdown) 10900 11013 189 1.4 693.0 0.9X +Parquet Vectorized 9942 9968 19 1.6 632.1 1.0X +Parquet Vectorized (Pushdown) 9975 9993 14 1.6 634.2 1.0X +Native ORC Vectorized 10610 10638 19 1.5 674.6 0.9X +Native ORC Vectorized (Pushdown) 10626 10648 16 1.5 675.6 0.9X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Select 1 decimal(18, 2) row (value = 7864320): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ----------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 2978 2995 19 5.3 189.3 1.0X -Parquet Vectorized (Pushdown) 69 72 3 229.4 4.4 43.4X -Native ORC Vectorized 3520 3535 13 4.5 223.8 0.8X -Native 
ORC Vectorized (Pushdown) 53 56 3 296.7 3.4 56.2X +Parquet Vectorized 3028 3063 27 5.2 192.5 1.0X +Parquet Vectorized (Pushdown) 69 71 2 227.4 4.4 43.8X +Native ORC Vectorized 3306 3322 21 4.8 210.2 0.9X +Native ORC Vectorized (Pushdown) 56 59 4 281.4 3.6 54.2X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Select 10% decimal(18, 2) rows (value < 1572864): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 3733 3745 9 4.2 237.4 1.0X -Parquet Vectorized (Pushdown) 1078 1089 6 14.6 68.5 3.5X -Native ORC Vectorized 4305 4316 11 3.7 273.7 0.9X -Native ORC Vectorized (Pushdown) 1110 1113 3 14.2 70.6 3.4X +Parquet Vectorized 3790 3798 7 4.1 241.0 1.0X +Parquet Vectorized (Pushdown) 1082 1086 3 14.5 68.8 3.5X +Native ORC Vectorized 4052 4071 35 3.9 257.6 0.9X +Native ORC Vectorized (Pushdown) 1078 1081 3 14.6 68.5 3.5X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Select 50% decimal(18, 2) rows (value < 7864320): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 6729 6741 14 2.3 427.8 1.0X -Parquet Vectorized (Pushdown) 5185 5240 41 3.0 329.7 1.3X -Native ORC Vectorized 7200 7224 21 2.2 457.8 0.9X -Native ORC Vectorized (Pushdown) 5405 5438 22 2.9 343.7 1.2X +Parquet Vectorized 6691 6712 15 2.4 425.4 1.0X +Parquet Vectorized (Pushdown) 5196 5211 17 3.0 330.3 1.3X +Native ORC Vectorized 6925 6934 8 2.3 440.3 1.0X +Native ORC Vectorized (Pushdown) 5264 5279 19 3.0 334.7 1.3X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS 
on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Select 90% decimal(18, 2) rows (value < 14155776): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 9576 9593 18 1.6 608.8 1.0X -Parquet Vectorized (Pushdown) 9301 9312 16 1.7 591.3 1.0X -Native ORC Vectorized 10115 10143 26 1.6 643.1 0.9X -Native ORC Vectorized (Pushdown) 9809 9814 3 1.6 623.6 1.0X +Parquet Vectorized 9504 9527 19 1.7 604.3 1.0X +Parquet Vectorized (Pushdown) 9218 9233 17 1.7 586.1 1.0X +Native ORC Vectorized 9809 9836 18 1.6 623.6 1.0X +Native ORC Vectorized (Pushdown) 9507 9531 17 1.7 604.4 1.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Select 1 decimal(38, 2) row (value = 7864320): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ----------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 4258 4267 8 3.7 270.7 1.0X -Parquet Vectorized (Pushdown) 75 77 3 210.3 4.8 56.9X -Native ORC Vectorized 3587 3638 85 4.4 228.1 1.2X -Native ORC Vectorized (Pushdown) 52 55 3 302.8 3.3 82.0X +Parquet Vectorized 4264 4282 25 3.7 271.1 1.0X +Parquet Vectorized (Pushdown) 75 79 3 208.7 4.8 56.6X +Native ORC Vectorized 3347 3364 16 4.7 212.8 1.3X +Native ORC Vectorized (Pushdown) 55 59 5 283.6 3.5 76.9X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Select 10% decimal(38, 2) rows (value < 1572864): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
-------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 5156 5167 18 3.1 327.8 1.0X -Parquet Vectorized (Pushdown) 1386 1395 9 11.3 88.1 3.7X -Native ORC Vectorized 4486 4506 27 3.5 285.2 1.1X -Native ORC Vectorized (Pushdown) 1242 1251 8 12.7 79.0 4.2X +Parquet Vectorized 5214 5231 10 3.0 331.5 1.0X +Parquet Vectorized (Pushdown) 1409 1413 2 11.2 89.6 3.7X +Native ORC Vectorized 4207 4222 25 3.7 267.5 1.2X +Native ORC Vectorized (Pushdown) 1209 1211 2 13.0 76.9 4.3X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Select 50% decimal(38, 2) rows (value < 7864320): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 8812 8822 7 1.8 560.2 1.0X -Parquet Vectorized (Pushdown) 6728 6732 4 2.3 427.8 1.3X -Native ORC Vectorized 7787 7836 60 2.0 495.1 1.1X -Native ORC Vectorized (Pushdown) 6007 6023 24 2.6 381.9 1.5X +Parquet Vectorized 8897 8913 12 1.8 565.6 1.0X +Parquet Vectorized (Pushdown) 6816 6830 19 2.3 433.3 1.3X +Native ORC Vectorized 7648 7665 14 2.1 486.3 1.2X +Native ORC Vectorized (Pushdown) 5932 5954 25 2.7 377.1 1.5X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Select 90% decimal(38, 2) rows (value < 14155776): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 12367 12381 9 1.3 786.3 1.0X -Parquet Vectorized (Pushdown) 11977 12018 24 1.3 761.5 1.0X -Native ORC Vectorized 11109 11169 79 1.4 706.3 1.1X -Native 
ORC Vectorized (Pushdown) 10772 10786 15 1.5 684.9 1.1X +Parquet Vectorized 12548 12558 13 1.3 797.8 1.0X +Parquet Vectorized (Pushdown) 12139 12156 22 1.3 771.8 1.0X +Native ORC Vectorized 11055 11089 25 1.4 702.8 1.1X +Native ORC Vectorized (Pushdown) 10746 10789 41 1.5 683.2 1.2X ================================================================================================ Pushdown benchmark for InSet -> InFilters ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor InSet -> InFilters (values count: 5, distribution: 10): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 6442 6482 25 2.4 409.6 1.0X -Parquet Vectorized (Pushdown) 284 293 15 55.4 18.0 22.7X -Native ORC Vectorized 4965 4990 17 3.2 315.7 1.3X -Native ORC Vectorized (Pushdown) 281 288 8 56.1 17.8 23.0X +Parquet Vectorized 6561 6633 92 2.4 417.1 1.0X +Parquet Vectorized (Pushdown) 281 287 6 55.9 17.9 23.3X +Native ORC Vectorized 4666 4681 19 3.4 296.6 1.4X +Native ORC Vectorized (Pushdown) 289 298 8 54.4 18.4 22.7X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor InSet -> InFilters (values count: 5, distribution: 50): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 6451 6461 9 2.4 410.1 1.0X -Parquet Vectorized (Pushdown) 283 286 3 55.6 18.0 22.8X -Native ORC Vectorized 4937 4981 34 3.2 313.9 1.3X -Native ORC Vectorized (Pushdown) 279 288 
11 56.4 17.7 23.1X +Parquet Vectorized 6554 6599 62 2.4 416.7 1.0X +Parquet Vectorized (Pushdown) 284 295 10 55.3 18.1 23.1X +Native ORC Vectorized 4674 4695 23 3.4 297.1 1.4X +Native ORC Vectorized (Pushdown) 293 303 13 53.7 18.6 22.4X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor InSet -> InFilters (values count: 5, distribution: 90): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 6449 6462 8 2.4 410.0 1.0X -Parquet Vectorized (Pushdown) 283 293 10 55.6 18.0 22.8X -Native ORC Vectorized 4955 4964 12 3.2 315.0 1.3X -Native ORC Vectorized (Pushdown) 280 284 3 56.2 17.8 23.0X +Parquet Vectorized 6540 6557 16 2.4 415.8 1.0X +Parquet Vectorized (Pushdown) 284 298 12 55.4 18.1 23.0X +Native ORC Vectorized 4667 4680 8 3.4 296.7 1.4X +Native ORC Vectorized (Pushdown) 290 297 7 54.3 18.4 22.6X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor InSet -> InFilters (values count: 10, distribution: 10): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 6448 6474 23 2.4 409.9 1.0X -Parquet Vectorized (Pushdown) 302 321 43 52.1 19.2 21.4X -Native ORC Vectorized 4977 4994 16 3.2 316.4 1.3X -Native ORC Vectorized (Pushdown) 297 301 3 53.0 18.9 21.7X +Parquet Vectorized 6564 6587 18 2.4 417.3 1.0X +Parquet Vectorized (Pushdown) 299 306 4 52.6 19.0 21.9X +Native ORC Vectorized 4686 4707 22 3.4 297.9 1.4X +Native ORC Vectorized (Pushdown) 305 310 3 51.5 19.4 21.5X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on 
Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor InSet -> InFilters (values count: 10, distribution: 50): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 6478 6503 36 2.4 411.8 1.0X -Parquet Vectorized (Pushdown) 301 307 4 52.2 19.1 21.5X -Native ORC Vectorized 4972 5002 20 3.2 316.1 1.3X -Native ORC Vectorized (Pushdown) 297 305 11 52.9 18.9 21.8X +Parquet Vectorized 6568 6599 31 2.4 417.6 1.0X +Parquet Vectorized (Pushdown) 307 309 1 51.2 19.5 21.4X +Native ORC Vectorized 4684 4700 19 3.4 297.8 1.4X +Native ORC Vectorized (Pushdown) 302 310 11 52.1 19.2 21.7X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor InSet -> InFilters (values count: 10, distribution: 90): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 6506 6522 11 2.4 413.7 1.0X -Parquet Vectorized (Pushdown) 305 309 3 51.6 19.4 21.3X -Native ORC Vectorized 5057 5062 4 3.1 321.5 1.3X -Native ORC Vectorized (Pushdown) 304 309 3 51.7 19.4 21.4X +Parquet Vectorized 6567 6584 12 2.4 417.5 1.0X +Parquet Vectorized (Pushdown) 306 308 3 51.4 19.5 21.5X +Native ORC Vectorized 4684 4694 9 3.4 297.8 1.4X +Native ORC Vectorized (Pushdown) 308 313 3 51.1 19.6 21.3X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor InSet -> InFilters (values count: 50, distribution: 10): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
--------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 6675 6693 24 2.4 424.4 1.0X -Parquet Vectorized (Pushdown) 896 899 3 17.6 57.0 7.4X -Native ORC Vectorized 5264 5272 9 3.0 334.7 1.3X -Native ORC Vectorized (Pushdown) 407 410 4 38.7 25.9 16.4X +Parquet Vectorized 6743 6760 23 2.3 428.7 1.0X +Parquet Vectorized (Pushdown) 902 909 7 17.4 57.3 7.5X +Native ORC Vectorized 4877 4900 14 3.2 310.1 1.4X +Native ORC Vectorized (Pushdown) 414 415 1 38.0 26.3 16.3X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor InSet -> InFilters (values count: 50, distribution: 50): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 6662 6668 8 2.4 423.6 1.0X -Parquet Vectorized (Pushdown) 3357 3364 6 4.7 213.4 2.0X -Native ORC Vectorized 5164 5191 23 3.0 328.3 1.3X -Native ORC Vectorized (Pushdown) 426 429 3 36.9 27.1 15.6X +Parquet Vectorized 6731 6752 22 2.3 427.9 1.0X +Parquet Vectorized (Pushdown) 3328 3339 11 4.7 211.6 2.0X +Native ORC Vectorized 4904 4908 5 3.2 311.8 1.4X +Native ORC Vectorized (Pushdown) 431 433 2 36.5 27.4 15.6X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor InSet -> InFilters (values count: 50, distribution: 90): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 6648 6658 12 2.4 422.7 1.0X -Parquet Vectorized (Pushdown) 5843 5860 15 2.7 371.5 1.1X -Native ORC Vectorized 5182 5189 6 3.0 
329.5 1.3X -Native ORC Vectorized (Pushdown) 432 436 9 36.4 27.5 15.4X +Parquet Vectorized 6720 6732 8 2.3 427.2 1.0X +Parquet Vectorized (Pushdown) 6064 6085 14 2.6 385.6 1.1X +Native ORC Vectorized 4885 4893 11 3.2 310.6 1.4X +Native ORC Vectorized (Pushdown) 439 451 20 35.9 27.9 15.3X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor InSet -> InFilters (values count: 100, distribution: 10): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ---------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 6611 6629 16 2.4 420.3 1.0X -Parquet Vectorized (Pushdown) 892 898 5 17.6 56.7 7.4X -Native ORC Vectorized 5126 5154 32 3.1 325.9 1.3X -Native ORC Vectorized (Pushdown) 500 505 3 31.4 31.8 13.2X +Parquet Vectorized 6698 6710 9 2.3 425.8 1.0X +Parquet Vectorized (Pushdown) 927 932 5 17.0 59.0 7.2X +Native ORC Vectorized 4843 4859 31 3.2 307.9 1.4X +Native ORC Vectorized (Pushdown) 509 515 6 30.9 32.4 13.2X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor InSet -> InFilters (values count: 100, distribution: 50): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ---------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 6556 6583 17 2.4 416.8 1.0X -Parquet Vectorized (Pushdown) 3433 3448 17 4.6 218.2 1.9X -Native ORC Vectorized 5099 5119 12 3.1 324.2 1.3X -Native ORC Vectorized (Pushdown) 570 572 3 27.6 36.2 11.5X +Parquet Vectorized 6698 6717 19 2.3 425.8 1.0X +Parquet Vectorized (Pushdown) 3443 3458 19 4.6 218.9 1.9X +Native ORC Vectorized 4838 4869 27 3.3 307.6 1.4X +Native ORC Vectorized (Pushdown) 571 574 4 27.6 
36.3 11.7X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor InSet -> InFilters (values count: 100, distribution: 90): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ---------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 6631 6642 10 2.4 421.6 1.0X -Parquet Vectorized (Pushdown) 5877 5888 8 2.7 373.6 1.1X -Native ORC Vectorized 5141 5148 8 3.1 326.9 1.3X -Native ORC Vectorized (Pushdown) 585 587 2 26.9 37.2 11.3X +Parquet Vectorized 6694 6714 16 2.3 425.6 1.0X +Parquet Vectorized (Pushdown) 5855 5876 26 2.7 372.2 1.1X +Native ORC Vectorized 4833 4848 14 3.3 307.3 1.4X +Native ORC Vectorized (Pushdown) 559 561 2 28.1 35.5 12.0X ================================================================================================ Pushdown benchmark for tinyint ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Select 1 tinyint row (value = CAST(63 AS tinyint)): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ---------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 3074 3122 77 5.1 195.4 1.0X -Parquet Vectorized (Pushdown) 107 111 6 146.8 6.8 28.7X -Native ORC Vectorized 2473 2482 6 6.4 157.2 1.2X -Native ORC Vectorized (Pushdown) 114 117 5 138.0 7.2 27.0X +Parquet Vectorized 3146 3222 55 5.0 200.0 1.0X +Parquet Vectorized (Pushdown) 108 112 4 145.3 6.9 29.1X +Native ORC Vectorized 2163 2182 29 7.3 137.5 1.5X +Native ORC Vectorized (Pushdown) 115 119 6 137.4 7.3 27.5X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 
6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Select 10% tinyint rows (value < CAST(12 AS tinyint)): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 3739 3758 16 4.2 237.7 1.0X -Parquet Vectorized (Pushdown) 1004 1011 5 15.7 63.8 3.7X -Native ORC Vectorized 3078 3092 17 5.1 195.7 1.2X -Native ORC Vectorized (Pushdown) 918 920 1 17.1 58.4 4.1X +Parquet Vectorized 3769 3787 16 4.2 239.6 1.0X +Parquet Vectorized (Pushdown) 990 998 6 15.9 63.0 3.8X +Native ORC Vectorized 2723 2728 3 5.8 173.1 1.4X +Native ORC Vectorized (Pushdown) 854 857 2 18.4 54.3 4.4X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Select 50% tinyint rows (value < CAST(63 AS tinyint)): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 6391 6394 2 2.5 406.3 1.0X -Parquet Vectorized (Pushdown) 4890 4907 10 3.2 310.9 1.3X -Native ORC Vectorized 5584 5613 20 2.8 355.0 1.1X -Native ORC Vectorized (Pushdown) 4397 4412 10 3.6 279.6 1.5X +Parquet Vectorized 6447 6463 14 2.4 409.9 1.0X +Parquet Vectorized (Pushdown) 4925 4941 13 3.2 313.1 1.3X +Native ORC Vectorized 5158 5166 9 3.0 327.9 1.2X +Native ORC Vectorized (Pushdown) 4108 4130 29 3.8 261.2 1.6X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Select 90% tinyint rows (value < CAST(114 AS tinyint)): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
-------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 9020 9050 32 1.7 573.4 1.0X -Parquet Vectorized (Pushdown) 8757 8792 23 1.8 556.7 1.0X -Native ORC Vectorized 8277 8316 42 1.9 526.2 1.1X -Native ORC Vectorized (Pushdown) 8050 8069 14 2.0 511.8 1.1X +Parquet Vectorized 8983 9016 29 1.8 571.1 1.0X +Parquet Vectorized (Pushdown) 8710 8734 14 1.8 553.8 1.0X +Native ORC Vectorized 7637 7670 33 2.1 485.5 1.2X +Native ORC Vectorized (Pushdown) 7453 7479 22 2.1 473.9 1.2X ================================================================================================ Pushdown benchmark for Timestamp ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Select 1 timestamp stored as INT96 row (value = timestamp_seconds(7864320)): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ----------------------------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 3218 3228 10 4.9 204.6 1.0X -Parquet Vectorized (Pushdown) 3203 3213 10 4.9 203.7 1.0X -Native ORC Vectorized 2387 2391 5 6.6 151.7 1.3X -Native ORC Vectorized (Pushdown) 39 42 4 407.8 2.5 83.4X +Parquet Vectorized 3234 3250 11 4.9 205.6 1.0X +Parquet Vectorized (Pushdown) 3243 3254 10 4.9 206.2 1.0X +Native ORC Vectorized 2077 2092 22 7.6 132.1 1.6X +Native ORC Vectorized (Pushdown) 40 42 4 394.9 2.5 81.2X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Select 10% timestamp stored as INT96 rows (value < timestamp_seconds(1572864)): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per 
Row(ns) Relative -------------------------------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 3900 3912 11 4.0 247.9 1.0X -Parquet Vectorized (Pushdown) 3896 3903 6 4.0 247.7 1.0X -Native ORC Vectorized 2987 2996 11 5.3 189.9 1.3X -Native ORC Vectorized (Pushdown) 889 892 3 17.7 56.5 4.4X +Parquet Vectorized 3945 3975 41 4.0 250.8 1.0X +Parquet Vectorized (Pushdown) 3936 3950 15 4.0 250.3 1.0X +Native ORC Vectorized 2699 2712 16 5.8 171.6 1.5X +Native ORC Vectorized (Pushdown) 864 871 6 18.2 54.9 4.6X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Select 50% timestamp stored as INT96 rows (value < timestamp_seconds(7864320)): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 6698 6711 15 2.3 425.9 1.0X -Parquet Vectorized (Pushdown) 6698 6708 9 2.3 425.8 1.0X -Native ORC Vectorized 5550 5563 22 2.8 352.9 1.2X -Native ORC Vectorized (Pushdown) 4359 4374 18 3.6 277.2 1.5X +Parquet Vectorized 6749 6770 14 2.3 429.1 1.0X +Parquet Vectorized (Pushdown) 6746 6762 24 2.3 428.9 1.0X +Native ORC Vectorized 5192 5218 16 3.0 330.1 1.3X +Native ORC Vectorized (Pushdown) 4140 4152 15 3.8 263.2 1.6X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Select 90% timestamp stored as INT96 rows (value < timestamp_seconds(14155776)): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------------------------------------------- 
-Parquet Vectorized 9385 9400 18 1.7 596.7 1.0X -Parquet Vectorized (Pushdown) 9378 9390 11 1.7 596.3 1.0X -Native ORC Vectorized 8168 8194 28 1.9 519.3 1.1X -Native ORC Vectorized (Pushdown) 7949 7959 9 2.0 505.4 1.2X +Parquet Vectorized 9447 9478 29 1.7 600.6 1.0X +Parquet Vectorized (Pushdown) 9462 9486 25 1.7 601.6 1.0X +Native ORC Vectorized 7861 7994 77 2.0 499.8 1.2X +Native ORC Vectorized (Pushdown) 7811 7838 17 2.0 496.6 1.2X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Select 1 timestamp stored as TIMESTAMP_MICROS row (value = timestamp_seconds(7864320)): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ---------------------------------------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 2972 2983 14 5.3 189.0 1.0X -Parquet Vectorized (Pushdown) 68 71 3 231.7 4.3 43.8X -Native ORC Vectorized 2359 2366 4 6.7 150.0 1.3X -Native ORC Vectorized (Pushdown) 38 40 3 416.7 2.4 78.7X +Parquet Vectorized 3007 3018 9 5.2 191.2 1.0X +Parquet Vectorized (Pushdown) 69 71 3 229.6 4.4 43.9X +Native ORC Vectorized 2066 2069 2 7.6 131.4 1.5X +Native ORC Vectorized (Pushdown) 39 41 3 399.5 2.5 76.4X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Select 10% timestamp stored as TIMESTAMP_MICROS rows (value < timestamp_seconds(1572864)): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 3649 3652 4 4.3 232.0 1.0X -Parquet Vectorized (Pushdown) 1043 1047 3 15.1 66.3 3.5X -Native ORC Vectorized 2989 2989 1 5.3 190.0 1.2X 
-Native ORC Vectorized (Pushdown) 888 892 2 17.7 56.5 4.1X +Parquet Vectorized 3717 3738 27 4.2 236.3 1.0X +Parquet Vectorized (Pushdown) 1052 1055 4 14.9 66.9 3.5X +Native ORC Vectorized 2695 2702 7 5.8 171.3 1.4X +Native ORC Vectorized (Pushdown) 863 867 5 18.2 54.9 4.3X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Select 50% timestamp stored as TIMESTAMP_MICROS rows (value < timestamp_seconds(7864320)): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 6413 6426 8 2.5 407.7 1.0X -Parquet Vectorized (Pushdown) 5009 5049 48 3.1 318.5 1.3X -Native ORC Vectorized 5548 5553 11 2.8 352.7 1.2X -Native ORC Vectorized (Pushdown) 4359 4368 9 3.6 277.1 1.5X +Parquet Vectorized 6536 6551 9 2.4 415.5 1.0X +Parquet Vectorized (Pushdown) 5041 5059 13 3.1 320.5 1.3X +Native ORC Vectorized 5201 5223 21 3.0 330.7 1.3X +Native ORC Vectorized (Pushdown) 4134 4139 5 3.8 262.9 1.6X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Select 90% timestamp stored as TIMESTAMP_MICROS rows (value < timestamp_seconds(14155776)): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 9114 9138 26 1.7 579.5 1.0X -Parquet Vectorized (Pushdown) 8869 8880 9 1.8 563.9 1.0X -Native ORC Vectorized 8175 8193 20 1.9 519.8 1.1X -Native ORC Vectorized (Pushdown) 7947 7956 11 2.0 505.3 1.1X +Parquet Vectorized 9201 9222 27 1.7 585.0 1.0X +Parquet Vectorized (Pushdown) 
8940 8961 23 1.8 568.4 1.0X +Native ORC Vectorized 7987 8023 27 2.0 507.8 1.2X +Native ORC Vectorized (Pushdown) 7792 7808 21 2.0 495.4 1.2X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Select 1 timestamp stored as TIMESTAMP_MILLIS row (value = timestamp_seconds(7864320)): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ---------------------------------------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 3001 3005 6 5.2 190.8 1.0X -Parquet Vectorized (Pushdown) 68 70 3 232.2 4.3 44.3X -Native ORC Vectorized 2359 2362 3 6.7 150.0 1.3X -Native ORC Vectorized (Pushdown) 38 40 4 415.7 2.4 79.3X +Parquet Vectorized 3037 3044 8 5.2 193.1 1.0X +Parquet Vectorized (Pushdown) 68 71 3 230.6 4.3 44.5X +Native ORC Vectorized 2068 2082 23 7.6 131.5 1.5X +Native ORC Vectorized (Pushdown) 39 42 3 400.5 2.5 77.3X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Select 10% timestamp stored as TIMESTAMP_MILLIS rows (value < timestamp_seconds(1572864)): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 3686 3693 5 4.3 234.4 1.0X -Parquet Vectorized (Pushdown) 1044 1048 4 15.1 66.4 3.5X -Native ORC Vectorized 2984 2989 4 5.3 189.7 1.2X -Native ORC Vectorized (Pushdown) 889 891 2 17.7 56.5 4.1X +Parquet Vectorized 3730 3735 5 4.2 237.1 1.0X +Parquet Vectorized (Pushdown) 1047 1052 4 15.0 66.5 3.6X +Native ORC Vectorized 2700 2704 4 5.8 171.7 1.4X +Native ORC Vectorized (Pushdown) 861 877 22 18.3 54.8 4.3X -OpenJDK 64-Bit Server VM 
17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Select 50% timestamp stored as TIMESTAMP_MILLIS rows (value < timestamp_seconds(7864320)): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 6454 6464 8 2.4 410.3 1.0X -Parquet Vectorized (Pushdown) 5018 5033 26 3.1 319.0 1.3X -Native ORC Vectorized 5545 5556 9 2.8 352.5 1.2X -Native ORC Vectorized (Pushdown) 4357 4377 14 3.6 277.0 1.5X +Parquet Vectorized 6517 6528 18 2.4 414.4 1.0X +Parquet Vectorized (Pushdown) 5046 5050 5 3.1 320.8 1.3X +Native ORC Vectorized 5189 5203 15 3.0 329.9 1.3X +Native ORC Vectorized (Pushdown) 4131 4148 22 3.8 262.6 1.6X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Select 90% timestamp stored as TIMESTAMP_MILLIS rows (value < timestamp_seconds(14155776)): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 9143 9162 28 1.7 581.3 1.0X -Parquet Vectorized (Pushdown) 8888 8895 6 1.8 565.1 1.0X -Native ORC Vectorized 8163 8178 25 1.9 519.0 1.1X -Native ORC Vectorized (Pushdown) 7942 7966 35 2.0 504.9 1.2X +Parquet Vectorized 9216 9225 7 1.7 585.9 1.0X +Parquet Vectorized (Pushdown) 8966 8976 15 1.8 570.0 1.0X +Native ORC Vectorized 7990 8006 18 2.0 508.0 1.2X +Native ORC Vectorized (Pushdown) 7788 7804 19 2.0 495.1 1.2X ================================================================================================ Pushdown benchmark with many filters 
================================================================================================ -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Select 1 row with 1 filters: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Parquet Vectorized 55 57 2 0.0 55430933.0 1.0X -Parquet Vectorized (Pushdown) 56 59 5 0.0 56257088.0 1.0X -Native ORC Vectorized 50 52 2 0.0 50120677.0 1.1X -Native ORC Vectorized (Pushdown) 52 55 3 0.0 52126525.0 1.1X +Parquet Vectorized 48 50 2 0.0 47822192.0 1.0X +Parquet Vectorized (Pushdown) 49 52 4 0.0 48715892.0 1.0X +Native ORC Vectorized 43 44 2 0.0 42630483.0 1.1X +Native ORC Vectorized (Pushdown) 44 47 4 0.0 44086388.0 1.1X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Select 1 row with 250 filters: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Parquet Vectorized 405 417 11 0.0 404938298.0 1.0X -Parquet Vectorized (Pushdown) 407 412 6 0.0 406571487.0 1.0X -Native ORC Vectorized 394 397 4 0.0 394366762.0 1.0X -Native ORC Vectorized (Pushdown) 397 406 7 0.0 396723685.0 1.0X +Parquet Vectorized 189 197 6 0.0 189302685.0 1.0X +Parquet Vectorized (Pushdown) 192 196 5 0.0 191858297.0 1.0X +Native ORC Vectorized 182 189 7 0.0 182429398.0 1.0X +Native ORC Vectorized (Pushdown) 186 190 3 0.0 185920182.0 1.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Select 1 row with 500 filters: Best Time(ms) Avg Time(ms) 
Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Parquet Vectorized 2255 2270 9 0.0 2255338602.0 1.0X -Parquet Vectorized (Pushdown) 2258 2279 13 0.0 2258126416.0 1.0X -Native ORC Vectorized 2244 2260 10 0.0 2243733317.0 1.0X -Native ORC Vectorized (Pushdown) 2255 2290 29 0.0 2254729481.0 1.0X +Parquet Vectorized 600 607 9 0.0 599862493.0 1.0X +Parquet Vectorized (Pushdown) 606 622 12 0.0 605756895.0 1.0X +Native ORC Vectorized 591 600 12 0.0 591069360.0 1.0X +Native ORC Vectorized (Pushdown) 595 609 9 0.0 594620092.0 1.0X diff --git a/sql/core/benchmarks/GenerateExecBenchmark-jdk21-results.txt b/sql/core/benchmarks/GenerateExecBenchmark-jdk21-results.txt index bd83ba8858f29..ae1f8694afbf5 100644 --- a/sql/core/benchmarks/GenerateExecBenchmark-jdk21-results.txt +++ b/sql/core/benchmarks/GenerateExecBenchmark-jdk21-results.txt @@ -2,11 +2,11 @@ GenerateExec benchmark ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor GenerateExec Benchmark: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -GenerateExec Benchmark wholestage off 71281 71290 12 1.4 712.8 1.0X -GenerateExec Benchmark wholestage on 21377 22190 461 4.7 213.8 3.3X +GenerateExec Benchmark wholestage off 73608 73642 47 1.4 736.1 1.0X +GenerateExec Benchmark wholestage on 20481 20591 165 4.9 204.8 3.6X diff --git a/sql/core/benchmarks/GenerateExecBenchmark-results.txt b/sql/core/benchmarks/GenerateExecBenchmark-results.txt index 7aaa8fad9e560..6790608ad6b2b 100644 --- a/sql/core/benchmarks/GenerateExecBenchmark-results.txt +++ 
b/sql/core/benchmarks/GenerateExecBenchmark-results.txt @@ -2,11 +2,11 @@ GenerateExec benchmark ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor GenerateExec Benchmark: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -GenerateExec Benchmark wholestage off 73307 73512 290 1.4 733.1 1.0X -GenerateExec Benchmark wholestage on 24438 24523 84 4.1 244.4 3.0X +GenerateExec Benchmark wholestage off 72745 72752 10 1.4 727.4 1.0X +GenerateExec Benchmark wholestage on 23957 24433 339 4.2 239.6 3.0X diff --git a/sql/core/benchmarks/HashedRelationMetricsBenchmark-jdk21-results.txt b/sql/core/benchmarks/HashedRelationMetricsBenchmark-jdk21-results.txt index 8e47f7e27a85b..e326f00783419 100644 --- a/sql/core/benchmarks/HashedRelationMetricsBenchmark-jdk21-results.txt +++ b/sql/core/benchmarks/HashedRelationMetricsBenchmark-jdk21-results.txt @@ -2,10 +2,10 @@ LongToUnsafeRowMap metrics ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor LongToUnsafeRowMap metrics: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -LongToUnsafeRowMap 255 259 3 2.0 510.5 1.0X +LongToUnsafeRowMap 266 269 3 1.9 532.1 1.0X diff --git a/sql/core/benchmarks/HashedRelationMetricsBenchmark-results.txt b/sql/core/benchmarks/HashedRelationMetricsBenchmark-results.txt index f201c27de387e..5364545cd8af7 100644 --- 
a/sql/core/benchmarks/HashedRelationMetricsBenchmark-results.txt +++ b/sql/core/benchmarks/HashedRelationMetricsBenchmark-results.txt @@ -2,10 +2,10 @@ LongToUnsafeRowMap metrics ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor LongToUnsafeRowMap metrics: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -LongToUnsafeRowMap 261 268 6 1.9 521.5 1.0X +LongToUnsafeRowMap 260 264 3 1.9 519.7 1.0X diff --git a/sql/core/benchmarks/InExpressionBenchmark-jdk21-results.txt b/sql/core/benchmarks/InExpressionBenchmark-jdk21-results.txt index e571db07479a0..4ee151d851b96 100644 --- a/sql/core/benchmarks/InExpressionBenchmark-jdk21-results.txt +++ b/sql/core/benchmarks/InExpressionBenchmark-jdk21-results.txt @@ -2,739 +2,739 @@ In Expression Benchmark ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor 5 bytes: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 46 61 9 217.9 4.6 1.0X -InSet expression 68 73 6 146.3 6.8 0.7X +In expression 39 52 10 254.0 3.9 1.0X +InSet expression 61 68 5 162.9 6.1 0.6X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor 10 bytes: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
------------------------------------------------------------------------------------------------------------------------ -In expression 36 42 6 275.1 3.6 1.0X -InSet expression 62 66 4 160.3 6.2 0.6X +In expression 37 42 6 267.8 3.7 1.0X +InSet expression 57 62 4 173.9 5.7 0.6X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor 25 bytes: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 65 70 6 153.5 6.5 1.0X -InSet expression 77 81 4 130.5 7.7 0.9X +In expression 66 72 5 151.0 6.6 1.0X +InSet expression 82 85 4 122.2 8.2 0.8X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor 50 bytes: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 106 111 7 94.6 10.6 1.0X -InSet expression 84 87 3 119.7 8.4 1.3X +In expression 106 111 5 94.1 10.6 1.0X +InSet expression 96 101 6 103.7 9.6 1.1X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor 100 bytes: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 184 189 4 54.3 18.4 1.0X -InSet expression 98 102 4 102.1 9.8 1.9X +In expression 185 189 5 54.1 18.5 1.0X +InSet expression 124 128 4 80.4 12.4 1.5X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 
64-Core Processor 200 bytes: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 393 395 2 25.5 39.3 1.0X -InSet expression 187 192 6 53.5 18.7 2.1X +In expression 397 403 9 25.2 39.7 1.0X +InSet expression 187 190 3 53.4 18.7 2.1X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor 5 shorts: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 29 33 5 345.2 2.9 1.0X -InSet expression 75 79 3 132.5 7.5 0.4X +In expression 32 35 4 315.0 3.2 1.0X +InSet expression 85 88 4 117.4 8.5 0.4X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor 10 shorts: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 36 41 5 274.6 3.6 1.0X -InSet expression 92 95 2 109.1 9.2 0.4X +In expression 41 44 5 244.9 4.1 1.0X +InSet expression 98 101 2 101.8 9.8 0.4X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor 25 shorts: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 62 66 4 161.1 6.2 1.0X -InSet expression 91 93 2 110.1 9.1 0.7X +In expression 63 65 3 159.0 6.3 1.0X +InSet expression 98 100 2 102.4 9.8 0.6X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 
6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor 50 shorts: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 119 122 4 84.3 11.9 1.0X -InSet expression 128 129 1 78.4 12.8 0.9X +In expression 120 123 4 83.6 12.0 1.0X +InSet expression 133 137 4 74.9 13.3 0.9X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor 100 shorts: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 186 189 5 53.7 18.6 1.0X -InSet expression 114 116 3 87.5 11.4 1.6X +In expression 197 201 4 50.7 19.7 1.0X +InSet expression 120 124 4 83.1 12.0 1.6X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor 200 shorts: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 368 372 5 27.2 36.8 1.0X -InSet expression 122 124 1 81.9 12.2 3.0X +In expression 370 372 2 27.1 37.0 1.0X +InSet expression 132 135 3 76.0 13.2 2.8X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor 300 shorts: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 551 559 5 18.1 55.1 1.0X -InSet expression 135 138 2 74.0 13.5 4.1X +In 
expression 556 557 2 18.0 55.6 1.0X +InSet expression 145 148 3 68.9 14.5 3.8X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor 400 shorts: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 734 737 3 13.6 73.4 1.0X -InSet expression 148 151 2 67.5 14.8 4.9X +In expression 731 736 4 13.7 73.1 1.0X +InSet expression 158 160 2 63.4 15.8 4.6X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor 500 shorts: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 941 947 5 10.6 94.1 1.0X -InSet expression 162 165 2 61.7 16.2 5.8X +In expression 944 945 2 10.6 94.4 1.0X +InSet expression 172 174 2 58.2 17.2 5.5X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor 5 shorts (non-compact): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 27 31 6 374.5 2.7 1.0X -InSet expression 73 75 3 137.7 7.3 0.4X +In expression 27 30 4 372.4 2.7 1.0X +InSet expression 81 84 2 122.8 8.1 0.3X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor 10 shorts (non-compact): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
------------------------------------------------------------------------------------------------------------------------ -In expression 36 41 7 276.7 3.6 1.0X -InSet expression 91 93 1 109.7 9.1 0.4X +In expression 37 39 3 270.3 3.7 1.0X +InSet expression 98 100 2 102.4 9.8 0.4X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor 25 shorts (non-compact): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 60 64 4 166.6 6.0 1.0X -InSet expression 120 122 1 83.5 12.0 0.5X +In expression 62 63 3 162.1 6.2 1.0X +InSet expression 123 125 1 81.1 12.3 0.5X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor 50 shorts (non-compact): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 111 116 5 89.8 11.1 1.0X -InSet expression 134 137 4 74.8 13.4 0.8X +In expression 112 116 10 89.1 11.2 1.0X +InSet expression 140 142 2 71.3 14.0 0.8X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor 100 shorts (non-compact): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 195 197 3 51.3 19.5 1.0X -InSet expression 116 119 3 85.8 11.6 1.7X +In expression 194 198 4 51.4 19.4 1.0X +InSet expression 123 126 4 81.3 12.3 1.6X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 
21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor 200 shorts (non-compact): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 348 351 4 28.7 34.8 1.0X -InSet expression 122 125 2 81.7 12.2 2.8X +In expression 344 347 3 29.1 34.4 1.0X +InSet expression 128 130 2 77.8 12.8 2.7X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor 300 shorts (non-compact): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 513 516 5 19.5 51.3 1.0X -InSet expression 133 135 2 75.1 13.3 3.9X +In expression 514 517 3 19.5 51.4 1.0X +InSet expression 143 146 3 70.2 14.3 3.6X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor 400 shorts (non-compact): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 658 663 4 15.2 65.8 1.0X -InSet expression 146 149 3 68.3 14.6 4.5X +In expression 664 670 6 15.1 66.4 1.0X +InSet expression 156 159 2 64.0 15.6 4.2X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor 500 shorts (non-compact): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 847 853 7 11.8 84.7 1.0X -InSet expression 159 162 2 62.7 15.9 
5.3X +In expression 848 851 4 11.8 84.8 1.0X +InSet expression 169 172 2 59.2 16.9 5.0X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor 5 ints: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 27 30 4 368.5 2.7 1.0X -InSet expression 80 83 3 124.6 8.0 0.3X +In expression 28 30 4 359.2 2.8 1.0X +InSet expression 82 84 2 121.6 8.2 0.3X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor 10 ints: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 35 38 5 285.9 3.5 1.0X -InSet expression 97 99 1 103.0 9.7 0.4X +In expression 39 41 3 259.6 3.9 1.0X +InSet expression 99 101 2 101.1 9.9 0.4X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor 25 ints: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 63 65 3 157.7 6.3 1.0X -InSet expression 97 100 4 102.8 9.7 0.7X +In expression 61 63 3 164.4 6.1 1.0X +InSet expression 99 102 2 101.0 9.9 0.6X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor 50 ints: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
------------------------------------------------------------------------------------------------------------------------ -In expression 116 119 4 86.1 11.6 1.0X -InSet expression 135 137 1 74.3 13.5 0.9X +In expression 111 112 3 89.9 11.1 1.0X +InSet expression 136 138 2 73.4 13.6 0.8X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor 100 ints: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 184 197 6 54.5 18.4 1.0X -InSet expression 117 119 2 85.4 11.7 1.6X +In expression 196 199 4 50.9 19.6 1.0X +InSet expression 118 121 2 84.4 11.8 1.7X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor 200 ints: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 342 351 8 29.2 34.2 1.0X -InSet expression 124 126 1 80.6 12.4 2.8X +In expression 360 364 3 27.8 36.0 1.0X +InSet expression 127 131 6 78.8 12.7 2.8X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor 300 ints: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 511 518 9 19.6 51.1 1.0X -InSet expression 136 140 3 73.3 13.6 3.7X +In expression 514 517 3 19.4 51.4 1.0X +InSet expression 139 141 2 72.0 13.9 3.7X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 
6.8.0-1017-azure AMD EPYC 7763 64-Core Processor 400 ints: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 668 675 9 15.0 66.8 1.0X -InSet expression 149 152 4 67.1 14.9 4.5X +In expression 673 687 16 14.8 67.3 1.0X +InSet expression 151 153 2 66.4 15.1 4.5X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor 500 ints: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 826 831 3 12.1 82.6 1.0X -InSet expression 161 163 1 62.2 16.1 5.1X +In expression 833 836 3 12.0 83.3 1.0X +InSet expression 163 166 3 61.5 16.3 5.1X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor 5 ints (non-compact): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 20 24 4 497.1 2.0 1.0X -InSet expression 77 80 1 129.1 7.7 0.3X +In expression 21 25 4 466.0 2.1 1.0X +InSet expression 79 81 1 126.7 7.9 0.3X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor 10 ints (non-compact): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 28 31 4 356.5 2.8 1.0X -InSet expression 94 96 1 105.9 9.4 0.3X +In expression 30 32 5 335.1 3.0 1.0X +InSet expression 96 98 2 104.5 
9.6 0.3X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor 25 ints (non-compact): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 56 60 4 177.3 5.6 1.0X -InSet expression 108 110 1 92.5 10.8 0.5X +In expression 58 60 3 171.6 5.8 1.0X +InSet expression 109 111 2 91.5 10.9 0.5X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor 50 ints (non-compact): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 100 102 3 99.7 10.0 1.0X -InSet expression 133 135 1 75.1 13.3 0.8X +In expression 101 103 3 98.6 10.1 1.0X +InSet expression 134 136 2 74.6 13.4 0.8X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor 100 ints (non-compact): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 179 182 4 55.7 17.9 1.0X -InSet expression 120 123 3 83.2 12.0 1.5X +In expression 180 182 3 55.5 18.0 1.0X +InSet expression 121 124 4 82.4 12.1 1.5X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor 200 ints (non-compact): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
------------------------------------------------------------------------------------------------------------------------ -In expression 337 347 7 29.6 33.7 1.0X -InSet expression 127 131 9 78.9 12.7 2.7X +In expression 339 344 3 29.5 33.9 1.0X +InSet expression 127 130 2 78.4 12.7 2.7X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor 300 ints (non-compact): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 506 517 21 19.8 50.6 1.0X -InSet expression 135 139 4 73.8 13.5 3.7X +In expression 507 507 0 19.7 50.7 1.0X +InSet expression 138 140 2 72.3 13.8 3.7X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor 400 ints (non-compact): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 666 672 4 15.0 66.6 1.0X -InSet expression 148 152 3 67.4 14.8 4.5X +In expression 664 675 10 15.1 66.4 1.0X +InSet expression 151 153 1 66.2 15.1 4.4X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor 500 ints (non-compact): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 831 860 62 12.0 83.1 1.0X -InSet expression 159 162 1 62.7 15.9 5.2X +In expression 833 867 63 12.0 83.3 1.0X +InSet expression 162 165 2 61.8 16.2 5.1X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit 
Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor 5 longs: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 19 20 2 526.2 1.9 1.0X -InSet expression 86 87 1 116.9 8.6 0.2X +In expression 20 22 3 501.8 2.0 1.0X +InSet expression 89 90 1 113.0 8.9 0.2X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor 10 longs: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 28 31 3 352.8 2.8 1.0X -InSet expression 101 103 2 98.9 10.1 0.3X +In expression 30 33 4 334.8 3.0 1.0X +InSet expression 105 107 2 95.6 10.5 0.3X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor 25 longs: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 53 56 3 190.2 5.3 1.0X -InSet expression 101 103 2 98.9 10.1 0.5X +In expression 54 58 5 184.2 5.4 1.0X +InSet expression 104 106 2 96.5 10.4 0.5X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor 50 longs: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 92 94 3 108.2 9.2 1.0X -InSet expression 135 138 2 73.8 13.5 0.7X +In expression 93 95 3 107.1 9.3 1.0X +InSet expression 137 139 3 73.2 
13.7 0.7X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor 100 longs: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 171 173 2 58.4 17.1 1.0X -InSet expression 121 123 2 82.4 12.1 1.4X +In expression 172 173 3 58.1 17.2 1.0X +InSet expression 120 122 2 83.2 12.0 1.4X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor 200 longs: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 328 331 3 30.5 32.8 1.0X -InSet expression 129 131 2 77.7 12.9 2.5X +In expression 332 337 6 30.2 33.2 1.0X +InSet expression 129 131 2 77.8 12.9 2.6X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor 5 floats: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 36 40 4 274.9 3.6 1.0X -InSet expression 77 81 8 130.3 7.7 0.5X +In expression 36 40 4 281.6 3.6 1.0X +InSet expression 78 80 2 127.9 7.8 0.5X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor 10 floats: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 61 63 3 164.9 6.1 1.0X -InSet 
expression 98 99 1 102.5 9.8 0.6X +In expression 61 65 3 164.8 6.1 1.0X +InSet expression 100 102 2 99.6 10.0 0.6X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor 25 floats: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 147 150 3 67.8 14.7 1.0X -InSet expression 99 101 1 101.1 9.9 1.5X +In expression 149 151 3 67.0 14.9 1.0X +InSet expression 100 102 2 100.1 10.0 1.5X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor 50 floats: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 227 232 6 44.0 22.7 1.0X -InSet expression 144 146 2 69.5 14.4 1.6X +In expression 231 234 3 43.3 23.1 1.0X +InSet expression 146 147 2 68.7 14.6 1.6X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor 100 floats: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 384 387 4 26.0 38.4 1.0X -InSet expression 116 118 1 86.0 11.6 3.3X +In expression 387 389 2 25.9 38.7 1.0X +InSet expression 117 119 2 85.5 11.7 3.3X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor 200 floats: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
------------------------------------------------------------------------------------------------------------------------ -In expression 1895 1931 68 5.3 189.5 1.0X -InSet expression 120 122 2 83.6 12.0 15.9X +In expression 1905 1963 66 5.2 190.5 1.0X +InSet expression 122 123 1 81.9 12.2 15.6X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor 5 doubles: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 36 39 3 281.1 3.6 1.0X -InSet expression 77 80 2 129.7 7.7 0.5X +In expression 39 40 3 258.5 3.9 1.0X +InSet expression 79 81 2 126.5 7.9 0.5X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor 10 doubles: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 61 64 4 163.7 6.1 1.0X -InSet expression 98 100 1 101.8 9.8 0.6X +In expression 62 63 3 161.7 6.2 1.0X +InSet expression 100 102 2 99.8 10.0 0.6X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor 25 doubles: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 151 153 3 66.3 15.1 1.0X -InSet expression 100 103 3 99.9 10.0 1.5X +In expression 150 152 3 66.7 15.0 1.0X +InSet expression 99 101 4 101.1 9.9 1.5X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 
6.8.0-1017-azure AMD EPYC 7763 64-Core Processor 50 doubles: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 227 232 10 44.0 22.7 1.0X -InSet expression 143 145 1 70.1 14.3 1.6X +In expression 231 233 2 43.3 23.1 1.0X +InSet expression 147 149 4 68.1 14.7 1.6X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor 100 doubles: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 386 389 3 25.9 38.6 1.0X -InSet expression 116 117 1 86.2 11.6 3.3X +In expression 390 391 2 25.6 39.0 1.0X +InSet expression 119 121 2 84.3 11.9 3.3X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor 200 doubles: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 2266 2298 49 4.4 226.6 1.0X -InSet expression 119 121 1 83.9 11.9 19.0X +In expression 2236 2268 70 4.5 223.6 1.0X +InSet expression 122 124 1 81.8 12.2 18.3X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor 5 small decimals: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 20 22 3 49.4 20.2 1.0X -InSet expression 59 61 2 17.0 58.8 0.3X +In expression 21 22 2 47.2 21.2 1.0X +InSet expression 58 60 2 
17.3 58.0 0.4X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor 10 small decimals: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 26 28 3 37.9 26.4 1.0X -InSet expression 61 63 2 16.3 61.3 0.4X +In expression 27 29 2 37.0 27.0 1.0X +InSet expression 60 62 2 16.6 60.1 0.5X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor 25 small decimals: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 40 43 3 24.7 40.4 1.0X -InSet expression 62 65 4 16.2 61.7 0.7X +In expression 42 43 2 23.9 41.8 1.0X +InSet expression 61 63 1 16.3 61.2 0.7X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor 50 small decimals: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 79 81 3 12.6 79.4 1.0X -InSet expression 67 69 2 14.8 67.4 1.2X +In expression 80 82 2 12.5 80.3 1.0X +InSet expression 65 67 1 15.3 65.4 1.2X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor 100 small decimals: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 240 245 7 
4.2 240.0 1.0X -InSet expression 65 68 4 15.4 65.1 3.7X +In expression 241 243 3 4.1 241.2 1.0X +InSet expression 64 66 3 15.6 64.1 3.8X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor 200 small decimals: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 572 576 5 1.7 571.9 1.0X -InSet expression 66 68 1 15.1 66.4 8.6X +In expression 581 582 1 1.7 580.6 1.0X +InSet expression 66 68 3 15.1 66.2 8.8X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor 5 large decimals: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 5 6 2 199.2 5.0 1.0X -InSet expression 5 6 2 211.3 4.7 1.1X +In expression 5 6 2 201.8 5.0 1.0X +InSet expression 5 6 2 211.8 4.7 1.0X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor 10 large decimals: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 5 6 2 205.8 4.9 1.0X -InSet expression 5 5 2 210.7 4.7 1.0X +In expression 5 6 2 207.5 4.8 1.0X +InSet expression 5 6 2 207.7 4.8 1.0X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor 25 large decimals: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
------------------------------------------------------------------------------------------------------------------------ -In expression 5 6 2 194.7 5.1 1.0X -InSet expression 5 6 2 191.2 5.2 1.0X +In expression 5 7 3 193.2 5.2 1.0X +InSet expression 5 6 2 190.2 5.3 1.0X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor 50 large decimals: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 6 7 2 167.7 6.0 1.0X -InSet expression 6 7 2 167.6 6.0 1.0X +In expression 6 7 2 167.4 6.0 1.0X +InSet expression 6 7 2 168.1 5.9 1.0X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor 100 large decimals: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 7 8 2 134.1 7.5 1.0X -InSet expression 7 8 2 135.4 7.4 1.0X +In expression 8 9 3 132.3 7.6 1.0X +InSet expression 8 9 3 133.1 7.5 1.0X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor 200 large decimals: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 11 11 2 94.8 10.6 1.0X -InSet expression 11 11 1 95.0 10.5 1.0X +In expression 11 13 3 91.4 10.9 1.0X +InSet expression 11 13 3 93.3 10.7 1.0X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core 
Processor 5 strings: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 29 30 1 34.3 29.1 1.0X -InSet expression 43 45 2 23.1 43.3 0.7X +In expression 29 34 5 34.2 29.3 1.0X +InSet expression 43 46 3 23.2 43.1 0.7X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor 10 strings: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 34 36 2 29.0 34.5 1.0X -InSet expression 46 47 1 21.9 45.6 0.8X +In expression 34 38 4 29.4 34.1 1.0X +InSet expression 46 50 3 21.9 45.7 0.7X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor 25 strings: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 44 45 2 23.0 43.5 1.0X -InSet expression 50 51 1 19.9 50.3 0.9X +In expression 43 47 3 23.2 43.1 1.0X +InSet expression 60 62 3 16.5 60.5 0.7X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor 50 strings: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 58 60 2 17.1 58.4 1.0X -InSet expression 54 55 1 18.5 54.2 1.1X +In expression 59 59 1 17.0 58.7 1.0X +InSet expression 54 54 1 18.6 53.7 1.1X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure 
+OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor 100 strings: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 90 92 2 11.1 90.4 1.0X -InSet expression 51 53 1 19.6 51.1 1.8X +In expression 89 90 2 11.2 89.4 1.0X +InSet expression 51 51 1 19.7 50.8 1.8X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor 200 strings: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 408 413 5 2.4 408.2 1.0X -InSet expression 51 53 2 19.6 50.9 8.0X +In expression 408 410 4 2.5 407.6 1.0X +InSet expression 52 52 1 19.4 51.6 7.9X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor 5 timestamps: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 17 18 2 593.9 1.7 1.0X -InSet expression 81 83 2 123.5 8.1 0.2X +In expression 17 19 2 579.0 1.7 1.0X +InSet expression 84 85 2 118.9 8.4 0.2X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor 10 timestamps: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 23 25 4 442.1 2.3 1.0X -InSet expression 95 96 1 105.7 9.5 0.2X +In expression 23 24 2 435.8 2.3 1.0X +InSet 
expression 97 98 1 103.3 9.7 0.2X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor 25 timestamps: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 48 50 2 209.9 4.8 1.0X -InSet expression 128 130 1 78.1 12.8 0.4X +In expression 49 49 2 206.1 4.9 1.0X +InSet expression 129 130 1 77.8 12.9 0.4X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor 50 timestamps: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 81 84 3 123.3 8.1 1.0X -InSet expression 161 163 1 62.0 16.1 0.5X +In expression 82 83 2 122.1 8.2 1.0X +InSet expression 160 162 1 62.5 16.0 0.5X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor 100 timestamps: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 152 154 2 65.6 15.2 1.0X -InSet expression 137 138 1 73.0 13.7 1.1X +In expression 153 154 2 65.3 15.3 1.0X +InSet expression 138 140 1 72.4 13.8 1.1X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor 200 timestamps: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In 
expression 295 306 11 33.9 29.5 1.0X -InSet expression 133 134 1 75.4 13.3 2.2X +In expression 296 308 11 33.8 29.6 1.0X +InSet expression 134 136 1 74.7 13.4 2.2X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor 5 dates: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 304 306 3 32.9 30.4 1.0X -InSet expression 300 303 3 33.3 30.0 1.0X +In expression 288 289 1 34.7 28.8 1.0X +InSet expression 284 288 3 35.2 28.4 1.0X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor 10 dates: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 307 313 4 32.6 30.7 1.0X -InSet expression 300 302 2 33.3 30.0 1.0X +In expression 290 294 2 34.5 29.0 1.0X +InSet expression 284 287 2 35.2 28.4 1.0X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor 25 dates: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 344 345 1 29.1 34.4 1.0X -InSet expression 300 301 1 33.4 30.0 1.1X +In expression 328 329 1 30.5 32.8 1.0X +InSet expression 287 289 2 34.8 28.7 1.1X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor 50 dates: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
------------------------------------------------------------------------------------------------------------------------ -In expression 387 393 7 25.9 38.7 1.0X -InSet expression 300 302 1 33.3 30.0 1.3X +In expression 381 389 17 26.3 38.1 1.0X +InSet expression 285 289 4 35.1 28.5 1.3X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor 100 dates: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 488 489 1 20.5 48.8 1.0X -InSet expression 300 305 3 33.3 30.0 1.6X +In expression 474 477 3 21.1 47.4 1.0X +InSet expression 287 289 1 34.9 28.7 1.7X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor 200 dates: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 643 646 3 15.6 64.3 1.0X -InSet expression 303 305 2 33.0 30.3 2.1X +In expression 618 620 1 16.2 61.8 1.0X +InSet expression 287 290 3 34.8 28.7 2.2X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor 300 dates: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 803 805 2 12.5 80.3 1.0X -InSet expression 305 306 1 32.8 30.5 2.6X +In expression 789 793 3 12.7 78.9 1.0X +InSet expression 291 295 3 34.3 29.1 2.7X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 
6.8.0-1017-azure AMD EPYC 7763 64-Core Processor 400 dates: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 960 972 8 10.4 96.0 1.0X -InSet expression 306 308 2 32.7 30.6 3.1X +In expression 952 973 27 10.5 95.2 1.0X +InSet expression 292 294 2 34.2 29.2 3.3X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor 500 dates: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 1122 1145 26 8.9 112.2 1.0X -InSet expression 371 374 2 27.0 37.1 3.0X +In expression 1110 1118 7 9.0 111.0 1.0X +InSet expression 369 371 2 27.1 36.9 3.0X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor 5 arrays: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 26 27 2 38.8 25.8 1.0X -InSet expression 58 60 2 17.2 58.2 0.4X +In expression 27 28 2 37.7 26.5 1.0X +InSet expression 59 60 2 17.0 58.8 0.5X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor 10 arrays: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 41 43 1 24.3 41.1 1.0X -InSet expression 87 90 4 11.5 87.1 0.5X +In expression 42 43 2 24.1 41.6 1.0X +InSet expression 87 89 2 11.5 86.9 0.5X -OpenJDK 
64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor 25 arrays: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 162 165 3 6.2 162.3 1.0X -InSet expression 102 104 2 9.8 101.6 1.6X +In expression 172 174 3 5.8 171.8 1.0X +InSet expression 102 103 1 9.8 101.6 1.7X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor 50 arrays: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 393 400 12 2.5 393.1 1.0X -InSet expression 130 134 3 7.7 130.4 3.0X +In expression 399 401 2 2.5 398.8 1.0X +InSet expression 131 133 1 7.7 130.7 3.1X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor 100 arrays: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 818 822 4 1.2 818.1 1.0X -InSet expression 146 149 2 6.8 146.5 5.6X +In expression 780 785 5 1.3 779.7 1.0X +InSet expression 146 149 3 6.8 146.3 5.3X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor 200 arrays: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 1772 1922 315 0.6 1772.2 1.0X -InSet 
expression 164 167 2 6.1 164.2 10.8X +In expression 1715 1871 325 0.6 1714.6 1.0X +InSet expression 164 166 2 6.1 163.6 10.5X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor 5 structs: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 17 18 1 58.3 17.1 1.0X -InSet expression 83 87 9 12.1 82.9 0.2X +In expression 17 20 3 57.7 17.3 1.0X +InSet expression 87 90 6 11.5 86.9 0.2X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor 10 structs: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 24 25 1 42.0 23.8 1.0X -InSet expression 126 128 1 7.9 126.2 0.2X +In expression 25 27 2 40.8 24.5 1.0X +InSet expression 134 136 2 7.5 133.9 0.2X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor 25 structs: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 71 73 1 14.1 71.1 1.0X -InSet expression 149 150 1 6.7 148.8 0.5X +In expression 72 73 1 13.8 72.2 1.0X +InSet expression 157 163 14 6.4 156.9 0.5X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor 50 structs: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
------------------------------------------------------------------------------------------------------------------------ -In expression 191 192 1 5.2 191.2 1.0X -InSet expression 190 194 4 5.3 189.9 1.0X +In expression 198 199 2 5.0 198.0 1.0X +InSet expression 202 204 2 5.0 202.0 1.0X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor 100 structs: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 564 582 35 1.8 563.9 1.0X -InSet expression 214 217 2 4.7 214.2 2.6X +In expression 555 573 35 1.8 554.6 1.0X +InSet expression 229 233 3 4.4 229.2 2.4X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor 200 structs: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 1282 1479 291 0.8 1282.4 1.0X -InSet expression 243 252 7 4.1 243.3 5.3X +In expression 1395 1508 239 0.7 1395.2 1.0X +InSet expression 262 264 2 3.8 261.5 5.3X diff --git a/sql/core/benchmarks/InExpressionBenchmark-results.txt b/sql/core/benchmarks/InExpressionBenchmark-results.txt index 5178c51124c4f..539cb9a5060c9 100644 --- a/sql/core/benchmarks/InExpressionBenchmark-results.txt +++ b/sql/core/benchmarks/InExpressionBenchmark-results.txt @@ -2,739 +2,739 @@ In Expression Benchmark ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor 5 bytes: Best Time(ms) Avg Time(ms) Stdev(ms) 
Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 43 61 11 230.8 4.3 1.0X -InSet expression 88 94 7 113.6 8.8 0.5X +In expression 48 60 8 210.3 4.8 1.0X +InSet expression 85 92 6 117.9 8.5 0.6X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor 10 bytes: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 36 42 5 279.2 3.6 1.0X -InSet expression 82 86 4 121.3 8.2 0.4X +In expression 38 43 6 263.4 3.8 1.0X +InSet expression 79 84 4 125.8 7.9 0.5X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor 25 bytes: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 65 69 3 154.3 6.5 1.0X -InSet expression 83 87 4 121.0 8.3 0.8X +In expression 67 70 4 149.6 6.7 1.0X +InSet expression 84 88 3 119.3 8.4 0.8X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor 50 bytes: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 105 107 2 95.4 10.5 1.0X -InSet expression 88 91 3 113.8 8.8 1.2X +In expression 107 110 4 93.9 10.7 1.0X +InSet expression 89 92 3 112.7 8.9 1.2X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 
6.8.0-1017-azure AMD EPYC 7763 64-Core Processor 100 bytes: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 193 196 5 51.8 19.3 1.0X -InSet expression 98 100 3 102.2 9.8 2.0X +In expression 194 198 4 51.5 19.4 1.0X +InSet expression 99 102 3 101.5 9.9 2.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor 200 bytes: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 388 390 2 25.8 38.8 1.0X -InSet expression 178 180 2 56.1 17.8 2.2X +In expression 397 401 3 25.2 39.7 1.0X +InSet expression 191 194 5 52.5 19.1 2.1X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor 5 shorts: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 30 34 4 330.9 3.0 1.0X -InSet expression 91 95 3 109.5 9.1 0.3X +In expression 32 35 4 316.4 3.2 1.0X +InSet expression 96 100 2 104.6 9.6 0.3X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor 10 shorts: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 38 42 6 259.9 3.8 1.0X -InSet expression 106 109 4 94.7 10.6 0.4X +In expression 37 40 4 268.0 3.7 1.0X +InSet expression 109 113 4 91.4 10.9 0.3X 
-OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor 25 shorts: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 61 65 6 164.1 6.1 1.0X -InSet expression 106 108 2 94.6 10.6 0.6X +In expression 63 65 3 159.7 6.3 1.0X +InSet expression 109 116 18 91.6 10.9 0.6X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor 50 shorts: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 116 118 2 86.3 11.6 1.0X -InSet expression 136 138 2 73.4 13.6 0.9X +In expression 118 120 2 84.6 11.8 1.0X +InSet expression 139 142 3 72.0 13.9 0.9X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor 100 shorts: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 197 199 3 50.8 19.7 1.0X -InSet expression 119 121 1 84.0 11.9 1.7X +In expression 186 188 2 53.8 18.6 1.0X +InSet expression 122 126 3 81.7 12.2 1.5X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor 200 shorts: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 364 367 3 27.5 36.4 1.0X 
-InSet expression 126 128 1 79.3 12.6 2.9X +In expression 366 368 3 27.3 36.6 1.0X +InSet expression 133 135 1 75.3 13.3 2.8X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor 300 shorts: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 548 551 3 18.2 54.8 1.0X -InSet expression 139 143 3 71.8 13.9 3.9X +In expression 551 555 7 18.1 55.1 1.0X +InSet expression 145 147 2 69.0 14.5 3.8X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor 400 shorts: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 666 672 4 15.0 66.6 1.0X -InSet expression 153 155 3 65.3 15.3 4.4X +In expression 664 785 87 15.1 66.4 1.0X +InSet expression 157 161 2 63.6 15.7 4.2X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor 500 shorts: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 934 938 5 10.7 93.4 1.0X -InSet expression 165 168 2 60.4 16.5 5.6X +In expression 884 892 5 11.3 88.4 1.0X +InSet expression 170 173 3 58.7 17.0 5.2X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor 5 shorts (non-compact): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
------------------------------------------------------------------------------------------------------------------------ -In expression 28 30 2 352.5 2.8 1.0X -InSet expression 87 89 4 114.7 8.7 0.3X +In expression 30 32 3 334.7 3.0 1.0X +InSet expression 92 94 3 108.7 9.2 0.3X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor 10 shorts (non-compact): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 34 37 4 291.1 3.4 1.0X -InSet expression 106 108 2 94.2 10.6 0.3X +In expression 36 39 4 277.4 3.6 1.0X +InSet expression 109 110 1 91.7 10.9 0.3X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor 25 shorts (non-compact): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 69 71 1 144.2 6.9 1.0X -InSet expression 128 133 8 78.0 12.8 0.5X +In expression 70 72 2 142.4 7.0 1.0X +InSet expression 131 133 1 76.3 13.1 0.5X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor 50 shorts (non-compact): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 105 107 2 95.0 10.5 1.0X -InSet expression 143 145 2 70.1 14.3 0.7X +In expression 106 107 2 94.6 10.6 1.0X +InSet expression 146 148 1 68.5 14.6 0.7X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 
17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor 100 shorts (non-compact): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 195 197 2 51.2 19.5 1.0X -InSet expression 123 127 5 81.4 12.3 1.6X +In expression 196 197 2 51.1 19.6 1.0X +InSet expression 126 130 5 79.6 12.6 1.6X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor 200 shorts (non-compact): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 353 356 4 28.3 35.3 1.0X -InSet expression 127 129 1 78.7 12.7 2.8X +In expression 354 357 4 28.3 35.4 1.0X +InSet expression 130 132 1 76.9 13.0 2.7X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor 300 shorts (non-compact): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 500 502 3 20.0 50.0 1.0X -InSet expression 137 142 7 73.1 13.7 3.7X +In expression 501 504 4 20.0 50.1 1.0X +InSet expression 143 148 5 70.0 14.3 3.5X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor 400 shorts (non-compact): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 666 670 4 15.0 66.6 1.0X -InSet expression 151 154 2 66.1 
15.1 4.4X +In expression 665 667 4 15.0 66.5 1.0X +InSet expression 155 158 1 64.4 15.5 4.3X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor 500 shorts (non-compact): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 854 858 4 11.7 85.4 1.0X -InSet expression 162 164 1 61.9 16.2 5.3X +In expression 858 861 3 11.7 85.8 1.0X +InSet expression 167 171 4 59.9 16.7 5.1X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor 5 ints: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 27 29 3 372.2 2.7 1.0X -InSet expression 89 92 2 111.7 8.9 0.3X +In expression 28 30 2 356.5 2.8 1.0X +InSet expression 91 93 2 109.8 9.1 0.3X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor 10 ints: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 34 39 2 291.3 3.4 1.0X -InSet expression 107 109 1 93.5 10.7 0.3X +In expression 35 37 2 283.2 3.5 1.0X +InSet expression 107 110 2 93.1 10.7 0.3X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor 25 ints: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
------------------------------------------------------------------------------------------------------------------------ -In expression 60 65 9 166.2 6.0 1.0X -InSet expression 109 112 2 91.9 10.9 0.6X +In expression 62 66 11 160.7 6.2 1.0X +InSet expression 110 113 3 90.6 11.0 0.6X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor 50 ints: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 103 104 1 97.2 10.3 1.0X -InSet expression 140 144 8 71.5 14.0 0.7X +In expression 105 106 1 95.4 10.5 1.0X +InSet expression 142 144 1 70.3 14.2 0.7X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor 100 ints: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 183 184 3 54.8 18.3 1.0X -InSet expression 119 121 1 83.9 11.9 1.5X +In expression 195 195 1 51.4 19.5 1.0X +InSet expression 122 124 2 81.7 12.2 1.6X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor 200 ints: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 353 355 2 28.3 35.3 1.0X -InSet expression 122 127 3 81.6 12.2 2.9X +In expression 360 364 3 27.8 36.0 1.0X +InSet expression 130 132 2 76.7 13.0 2.8X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 
6.8.0-1017-azure AMD EPYC 7763 64-Core Processor 300 ints: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 508 510 3 19.7 50.8 1.0X -InSet expression 135 140 8 74.0 13.5 3.8X +In expression 509 514 8 19.6 50.9 1.0X +InSet expression 142 143 1 70.5 14.2 3.6X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor 400 ints: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 668 692 33 15.0 66.8 1.0X -InSet expression 147 149 2 68.2 14.7 4.6X +In expression 668 683 23 15.0 66.8 1.0X +InSet expression 153 155 1 65.4 15.3 4.4X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor 500 ints: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 835 841 8 12.0 83.5 1.0X -InSet expression 160 162 2 62.6 16.0 5.2X +In expression 830 854 45 12.1 83.0 1.0X +InSet expression 165 167 1 60.8 16.5 5.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor 5 ints (non-compact): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 22 24 3 456.0 2.2 1.0X -InSet expression 86 89 4 116.4 8.6 0.3X +In expression 23 24 2 444.2 2.3 1.0X +InSet expression 88 91 3 
113.3 8.8 0.3X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor 10 ints (non-compact): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 29 31 3 350.9 2.9 1.0X -InSet expression 103 105 3 97.1 10.3 0.3X +In expression 29 31 2 345.8 2.9 1.0X +InSet expression 104 106 1 95.9 10.4 0.3X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor 25 ints (non-compact): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 56 57 2 179.0 5.6 1.0X -InSet expression 118 120 1 84.6 11.8 0.5X +In expression 57 58 2 176.4 5.7 1.0X +InSet expression 119 121 1 83.8 11.9 0.5X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor 50 ints (non-compact): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 100 101 1 100.0 10.0 1.0X -InSet expression 138 140 2 72.6 13.8 0.7X +In expression 101 102 1 99.2 10.1 1.0X +InSet expression 139 142 3 71.7 13.9 0.7X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor 100 ints (non-compact): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
------------------------------------------------------------------------------------------------------------------------ -In expression 179 181 2 55.8 17.9 1.0X -InSet expression 122 124 1 82.2 12.2 1.5X +In expression 180 181 2 55.6 18.0 1.0X +InSet expression 125 128 4 80.1 12.5 1.4X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor 200 ints (non-compact): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 344 347 3 29.1 34.4 1.0X -InSet expression 126 128 2 79.7 12.6 2.7X +In expression 346 350 4 28.9 34.6 1.0X +InSet expression 130 131 2 77.1 13.0 2.7X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor 300 ints (non-compact): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 505 506 2 19.8 50.5 1.0X -InSet expression 136 139 2 73.5 13.6 3.7X +In expression 506 508 3 19.8 50.6 1.0X +InSet expression 141 144 2 71.0 14.1 3.6X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor 400 ints (non-compact): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 661 665 3 15.1 66.1 1.0X -InSet expression 147 149 1 68.1 14.7 4.5X +In expression 658 665 4 15.2 65.8 1.0X +InSet expression 153 155 2 65.5 15.3 4.3X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 
64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor 500 ints (non-compact): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 818 856 78 12.2 81.8 1.0X -InSet expression 159 161 3 63.0 15.9 5.2X +In expression 821 858 76 12.2 82.1 1.0X +InSet expression 164 166 1 61.1 16.4 5.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor 5 longs: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 22 24 4 459.0 2.2 1.0X -InSet expression 82 86 5 121.2 8.2 0.3X +In expression 22 24 3 456.1 2.2 1.0X +InSet expression 88 90 2 113.9 8.8 0.2X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor 10 longs: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 28 29 2 355.6 2.8 1.0X -InSet expression 99 101 1 101.1 9.9 0.3X +In expression 30 31 2 337.8 3.0 1.0X +InSet expression 103 106 2 96.8 10.3 0.3X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor 25 longs: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 53 55 2 190.1 5.3 1.0X -InSet expression 102 105 5 98.5 10.2 0.5X +In expression 55 56 2 180.8 5.5 1.0X 
+InSet expression 106 108 2 94.5 10.6 0.5X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor 50 longs: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 91 94 3 109.4 9.1 1.0X -InSet expression 132 134 1 75.7 13.2 0.7X +In expression 95 98 2 105.3 9.5 1.0X +InSet expression 136 139 4 73.5 13.6 0.7X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor 100 longs: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 175 183 10 57.2 17.5 1.0X -InSet expression 112 114 2 89.5 11.2 1.6X +In expression 172 177 5 58.0 17.2 1.0X +InSet expression 116 119 4 86.3 11.6 1.5X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor 200 longs: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 327 332 4 30.6 32.7 1.0X -InSet expression 119 128 18 84.3 11.9 2.8X +In expression 330 347 10 30.3 33.0 1.0X +InSet expression 125 127 2 80.2 12.5 2.6X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor 5 floats: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ 
-In expression 35 37 3 286.9 3.5 1.0X -InSet expression 113 115 1 88.8 11.3 0.3X +In expression 37 38 3 273.6 3.7 1.0X +InSet expression 114 116 2 87.4 11.4 0.3X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor 10 floats: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 64 65 2 157.2 6.4 1.0X -InSet expression 143 148 10 70.2 14.3 0.4X +In expression 65 66 1 154.2 6.5 1.0X +InSet expression 143 145 3 70.1 14.3 0.5X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor 25 floats: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 125 127 2 79.8 12.5 1.0X -InSet expression 143 147 5 70.1 14.3 0.9X +In expression 128 130 3 78.1 12.8 1.0X +InSet expression 144 146 3 69.4 14.4 0.9X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor 50 floats: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 208 210 1 48.0 20.8 1.0X -InSet expression 188 190 2 53.3 18.8 1.1X +In expression 210 211 2 47.7 21.0 1.0X +InSet expression 191 192 1 52.3 19.1 1.1X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor 100 floats: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
------------------------------------------------------------------------------------------------------------------------ -In expression 365 369 6 27.4 36.5 1.0X -InSet expression 148 154 15 67.6 14.8 2.5X +In expression 367 369 4 27.3 36.7 1.0X +InSet expression 149 151 2 67.1 14.9 2.5X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor 200 floats: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 1682 1771 104 5.9 168.2 1.0X -InSet expression 148 150 1 67.5 14.8 11.3X +In expression 1682 1789 90 5.9 168.2 1.0X +InSet expression 151 152 1 66.2 15.1 11.1X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor 5 doubles: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 35 36 2 286.9 3.5 1.0X -InSet expression 95 97 2 105.0 9.5 0.4X +In expression 41 42 3 246.1 4.1 1.0X +InSet expression 116 117 2 86.6 11.6 0.4X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor 10 doubles: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 63 64 2 159.6 6.3 1.0X -InSet expression 116 118 1 85.9 11.6 0.5X +In expression 63 65 6 158.1 6.3 1.0X +InSet expression 144 147 3 69.3 14.4 0.4X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 
6.8.0-1017-azure AMD EPYC 7763 64-Core Processor 25 doubles: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 127 129 1 78.6 12.7 1.0X -InSet expression 118 120 1 84.9 11.8 1.1X +In expression 128 129 2 78.2 12.8 1.0X +InSet expression 144 146 4 69.6 14.4 0.9X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor 50 doubles: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 207 210 3 48.3 20.7 1.0X -InSet expression 155 158 1 64.3 15.5 1.3X +In expression 210 210 2 47.7 21.0 1.0X +InSet expression 196 198 3 51.1 19.6 1.1X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor 100 doubles: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 364 370 12 27.4 36.4 1.0X -InSet expression 126 129 3 79.3 12.6 2.9X +In expression 367 367 1 27.3 36.7 1.0X +InSet expression 152 154 1 65.6 15.2 2.4X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor 200 doubles: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 1920 1928 7 5.2 192.0 1.0X -InSet expression 134 139 5 74.8 13.4 14.4X +In expression 1925 2082 125 5.2 192.5 1.0X +InSet expression 
155 158 2 64.3 15.5 12.4X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor 5 small decimals: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 20 21 3 51.3 19.5 1.0X -InSet expression 56 57 1 17.9 56.0 0.3X +In expression 21 23 2 47.8 20.9 1.0X +InSet expression 58 61 7 17.2 58.1 0.4X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor 10 small decimals: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 25 27 2 40.2 24.9 1.0X -InSet expression 58 60 1 17.3 57.9 0.4X +In expression 26 28 2 38.7 25.9 1.0X +InSet expression 60 63 2 16.5 60.5 0.4X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor 25 small decimals: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 40 41 2 25.3 39.5 1.0X -InSet expression 58 61 4 17.3 58.0 0.7X +In expression 42 43 2 24.1 41.5 1.0X +InSet expression 61 63 2 16.4 60.8 0.7X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor 50 small decimals: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In 
expression 76 77 1 13.2 75.8 1.0X -InSet expression 62 64 1 16.1 62.2 1.2X +In expression 80 80 1 12.6 79.6 1.0X +InSet expression 65 67 1 15.3 65.3 1.2X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor 100 small decimals: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 234 236 4 4.3 234.3 1.0X -InSet expression 60 62 2 16.7 60.0 3.9X +In expression 237 238 2 4.2 236.9 1.0X +InSet expression 63 65 2 15.9 62.8 3.8X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor 200 small decimals: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 548 605 125 1.8 547.7 1.0X -InSet expression 63 65 4 15.9 62.7 8.7X +In expression 555 611 124 1.8 554.9 1.0X +InSet expression 65 68 4 15.4 64.8 8.6X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor 5 large decimals: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 5 6 1 202.4 4.9 1.0X -InSet expression 5 6 1 207.4 4.8 1.0X +In expression 5 6 2 194.4 5.1 1.0X +InSet expression 5 6 2 205.4 4.9 1.1X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor 10 large decimals: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) 
Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 5 6 1 201.8 5.0 1.0X -InSet expression 5 5 1 203.6 4.9 1.0X +In expression 5 6 2 199.6 5.0 1.0X +InSet expression 5 6 1 205.5 4.9 1.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor 25 large decimals: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 5 6 1 187.1 5.3 1.0X -InSet expression 5 6 1 185.3 5.4 1.0X +In expression 5 7 2 182.1 5.5 1.0X +InSet expression 5 6 1 187.6 5.3 1.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor 50 large decimals: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 6 7 1 163.5 6.1 1.0X -InSet expression 6 7 1 161.6 6.2 1.0X +In expression 6 7 2 160.2 6.2 1.0X +InSet expression 6 7 1 160.7 6.2 1.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor 100 large decimals: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 8 8 1 127.7 7.8 1.0X -InSet expression 8 9 2 127.7 7.8 1.0X +In expression 8 9 1 125.2 8.0 1.0X +InSet expression 8 9 1 122.6 8.2 1.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 
64-Core Processor 200 large decimals: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 11 12 1 90.6 11.0 1.0X -InSet expression 11 12 1 90.6 11.0 1.0X +In expression 12 13 1 84.6 11.8 1.0X +InSet expression 12 13 1 84.8 11.8 1.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor 5 strings: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 29 32 4 34.6 28.9 1.0X -InSet expression 45 46 2 22.3 44.8 0.6X +In expression 30 32 2 33.2 30.2 1.0X +InSet expression 49 51 2 20.2 49.4 0.6X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor 10 strings: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 34 36 2 29.3 34.1 1.0X -InSet expression 48 49 1 21.0 47.6 0.7X +In expression 36 38 2 27.8 36.0 1.0X +InSet expression 51 53 2 19.5 51.4 0.7X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor 25 strings: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 43 44 2 23.5 42.5 1.0X -InSet expression 51 52 1 19.6 50.9 0.8X +In expression 44 45 1 22.7 44.0 1.0X +InSet expression 56 57 1 17.9 55.7 0.8X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on 
Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor 50 strings: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 58 59 2 17.4 57.6 1.0X -InSet expression 55 57 1 18.2 54.8 1.1X +In expression 59 60 2 16.9 59.3 1.0X +InSet expression 60 61 1 16.6 60.3 1.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor 100 strings: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 88 90 1 11.3 88.4 1.0X -InSet expression 51 52 1 19.6 51.1 1.7X +In expression 138 139 1 7.3 137.5 1.0X +InSet expression 56 57 1 17.9 56.0 2.5X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor 200 strings: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 382 388 12 2.6 381.7 1.0X -InSet expression 52 53 1 19.4 51.5 7.4X +In expression 392 393 2 2.6 392.0 1.0X +InSet expression 56 58 6 17.8 56.0 7.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor 5 timestamps: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 16 17 1 622.8 1.6 1.0X -InSet expression 89 92 3 112.0 8.9 0.2X +In expression 
17 18 2 602.6 1.7 1.0X +InSet expression 91 93 2 109.3 9.1 0.2X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor 10 timestamps: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 23 25 4 427.6 2.3 1.0X -InSet expression 99 101 1 101.0 9.9 0.2X +In expression 24 26 2 412.8 2.4 1.0X +InSet expression 101 103 2 98.5 10.1 0.2X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor 25 timestamps: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 45 46 2 220.5 4.5 1.0X -InSet expression 125 127 1 80.0 12.5 0.4X +In expression 47 48 1 212.0 4.7 1.0X +InSet expression 127 129 3 78.5 12.7 0.4X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor 50 timestamps: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 80 82 1 124.5 8.0 1.0X -InSet expression 151 153 2 66.2 15.1 0.5X +In expression 82 83 1 121.4 8.2 1.0X +InSet expression 155 157 1 64.5 15.5 0.5X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor 100 timestamps: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
------------------------------------------------------------------------------------------------------------------------ -In expression 152 153 2 65.8 15.2 1.0X -InSet expression 130 132 1 76.7 13.0 1.2X +In expression 154 155 2 65.1 15.4 1.0X +InSet expression 133 137 6 74.9 13.3 1.2X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor 200 timestamps: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 295 304 9 33.9 29.5 1.0X -InSet expression 127 129 1 78.8 12.7 2.3X +In expression 296 307 12 33.8 29.6 1.0X +InSet expression 128 130 1 77.8 12.8 2.3X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor 5 dates: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 441 442 1 22.7 44.1 1.0X -InSet expression 434 440 9 23.1 43.4 1.0X +In expression 441 443 2 22.7 44.1 1.0X +InSet expression 437 440 4 22.9 43.7 1.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor 10 dates: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 445 450 7 22.5 44.5 1.0X -InSet expression 437 438 3 22.9 43.7 1.0X +In expression 447 452 4 22.4 44.7 1.0X +InSet expression 441 443 2 22.7 44.1 1.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 
6.8.0-1017-azure AMD EPYC 7763 64-Core Processor 25 dates: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 485 486 1 20.6 48.5 1.0X -InSet expression 436 438 2 23.0 43.6 1.1X +In expression 470 471 1 21.3 47.0 1.0X +InSet expression 438 440 3 22.9 43.8 1.1X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor 50 dates: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 547 549 1 18.3 54.7 1.0X -InSet expression 441 445 4 22.7 44.1 1.2X +In expression 542 543 2 18.5 54.2 1.0X +InSet expression 440 443 3 22.7 44.0 1.2X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor 100 dates: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 608 610 1 16.4 60.8 1.0X -InSet expression 440 441 1 22.7 44.0 1.4X +In expression 619 620 1 16.1 61.9 1.0X +InSet expression 442 445 3 22.6 44.2 1.4X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor 200 dates: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 776 782 7 12.9 77.6 1.0X -InSet expression 436 440 2 22.9 43.6 1.8X +In expression 785 790 4 12.7 78.5 1.0X +InSet expression 441 448 7 22.7 
44.1 1.8X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor 300 dates: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 939 945 6 10.6 93.9 1.0X -InSet expression 441 443 2 22.7 44.1 2.1X +In expression 933 937 6 10.7 93.3 1.0X +InSet expression 441 444 2 22.7 44.1 2.1X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor 400 dates: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 1098 1105 10 9.1 109.8 1.0X -InSet expression 447 450 2 22.4 44.7 2.5X +In expression 1096 1106 7 9.1 109.6 1.0X +InSet expression 443 444 1 22.6 44.3 2.5X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor 500 dates: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 1297 1305 7 7.7 129.7 1.0X -InSet expression 537 539 1 18.6 53.7 2.4X +In expression 1270 1273 2 7.9 127.0 1.0X +InSet expression 551 554 3 18.1 55.1 2.3X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor 5 arrays: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 24 25 2 
41.2 24.3 1.0X -InSet expression 57 58 1 17.6 56.8 0.4X +In expression 26 28 4 38.9 25.7 1.0X +InSet expression 58 59 2 17.4 57.6 0.4X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor 10 arrays: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 41 42 1 24.4 41.0 1.0X -InSet expression 83 86 4 12.0 83.4 0.5X +In expression 42 43 2 24.1 41.5 1.0X +InSet expression 85 87 1 11.7 85.5 0.5X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor 25 arrays: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 157 158 2 6.4 156.6 1.0X -InSet expression 98 99 1 10.2 97.6 1.6X +In expression 156 158 2 6.4 156.2 1.0X +InSet expression 100 102 2 10.0 99.8 1.6X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor 50 arrays: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 371 374 2 2.7 371.2 1.0X -InSet expression 125 127 1 8.0 125.2 3.0X +In expression 388 389 1 2.6 387.5 1.0X +InSet expression 128 130 3 7.8 128.2 3.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor 100 arrays: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
------------------------------------------------------------------------------------------------------------------------ -In expression 757 759 3 1.3 756.8 1.0X -InSet expression 142 144 1 7.0 142.5 5.3X +In expression 761 767 6 1.3 761.2 1.0X +InSet expression 143 149 13 7.0 143.1 5.3X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor 200 arrays: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 1671 1822 219 0.6 1671.2 1.0X -InSet expression 159 173 37 6.3 159.2 10.5X +In expression 1682 1860 303 0.6 1682.0 1.0X +InSet expression 160 163 2 6.3 160.0 10.5X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor 5 structs: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 20 21 1 49.8 20.1 1.0X -InSet expression 76 78 5 13.2 75.6 0.3X +In expression 22 23 2 46.5 21.5 1.0X +InSet expression 81 85 3 12.3 81.5 0.3X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor 10 structs: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 29 30 2 34.3 29.1 1.0X -InSet expression 116 117 2 8.7 115.6 0.3X +In expression 31 33 2 31.9 31.4 1.0X +InSet expression 122 125 3 8.2 122.4 0.3X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 
6.8.0-1017-azure AMD EPYC 7763 64-Core Processor 25 structs: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 68 70 1 14.6 68.3 1.0X -InSet expression 131 137 4 7.6 131.2 0.5X +In expression 71 73 1 14.0 71.2 1.0X +InSet expression 144 146 2 6.9 144.0 0.5X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor 50 structs: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 187 189 3 5.4 186.5 1.0X -InSet expression 176 179 3 5.7 175.5 1.1X +In expression 209 210 1 4.8 209.0 1.0X +InSet expression 186 189 3 5.4 186.1 1.1X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor 100 structs: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 491 498 8 2.0 491.0 1.0X -InSet expression 199 201 1 5.0 199.0 2.5X +In expression 490 502 7 2.0 489.9 1.0X +InSet expression 209 213 4 4.8 209.2 2.3X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor 200 structs: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 1139 1377 266 0.9 1139.0 1.0X -InSet expression 227 232 8 4.4 227.2 5.0X +In expression 1212 1422 205 0.8 1211.6 1.0X +InSet expression 239 
242 5 4.2 238.6 5.1X diff --git a/sql/core/benchmarks/InMemoryColumnarBenchmark-jdk21-results.txt b/sql/core/benchmarks/InMemoryColumnarBenchmark-jdk21-results.txt index 4d79ea0b65033..94ffd3ca73811 100644 --- a/sql/core/benchmarks/InMemoryColumnarBenchmark-jdk21-results.txt +++ b/sql/core/benchmarks/InMemoryColumnarBenchmark-jdk21-results.txt @@ -2,11 +2,11 @@ Int In-memory with 1000000 rows ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Int In-Memory scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------------------------------------- -columnar deserialization + columnar-to-row 147 200 59 6.8 147.4 1.0X -row-based deserialization 129 158 42 7.8 129.0 1.1X +columnar deserialization + columnar-to-row 171 215 38 5.8 171.5 1.0X +row-based deserialization 136 139 2 7.3 136.2 1.3X diff --git a/sql/core/benchmarks/InMemoryColumnarBenchmark-results.txt b/sql/core/benchmarks/InMemoryColumnarBenchmark-results.txt index 6787b645563b3..3feaaca07c885 100644 --- a/sql/core/benchmarks/InMemoryColumnarBenchmark-results.txt +++ b/sql/core/benchmarks/InMemoryColumnarBenchmark-results.txt @@ -2,11 +2,11 @@ Int In-memory with 1000000 rows ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Int In-Memory scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------------------------------------- -columnar deserialization + columnar-to-row 188 199 12 5.3 
187.6 1.0X -row-based deserialization 142 216 115 7.0 141.9 1.3X +columnar deserialization + columnar-to-row 184 210 23 5.4 184.0 1.0X +row-based deserialization 142 144 2 7.0 142.2 1.3X diff --git a/sql/core/benchmarks/InsertTableWithDynamicPartitionsBenchmark-jdk21-results.txt b/sql/core/benchmarks/InsertTableWithDynamicPartitionsBenchmark-jdk21-results.txt index 85601d9e9757f..f78e7551cefe1 100644 --- a/sql/core/benchmarks/InsertTableWithDynamicPartitionsBenchmark-jdk21-results.txt +++ b/sql/core/benchmarks/InsertTableWithDynamicPartitionsBenchmark-jdk21-results.txt @@ -1,8 +1,8 @@ -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor dynamic insert table benchmark, totalRows = 200000: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ---------------------------------------------------------------------------------------------------------------------------------- -one partition column, 100 partitions 9762 9793 43 0.0 48810.6 1.0X -two partition columns, 500 partitions 25446 25796 495 0.0 127230.3 0.4X -three partition columns, 2000 partitions 68971 69095 176 0.0 344853.7 0.1X +one partition column, 100 partitions 8137 8169 44 0.0 40687.0 1.0X +two partition columns, 500 partitions 20814 20937 174 0.0 104067.7 0.4X +three partition columns, 2000 partitions 56067 56122 78 0.0 280335.3 0.1X diff --git a/sql/core/benchmarks/InsertTableWithDynamicPartitionsBenchmark-results.txt b/sql/core/benchmarks/InsertTableWithDynamicPartitionsBenchmark-results.txt index a8b6b9b48805d..8ca9c389f7348 100644 --- a/sql/core/benchmarks/InsertTableWithDynamicPartitionsBenchmark-results.txt +++ b/sql/core/benchmarks/InsertTableWithDynamicPartitionsBenchmark-results.txt @@ -1,8 +1,8 @@ -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor dynamic insert 
table benchmark, totalRows = 200000: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ---------------------------------------------------------------------------------------------------------------------------------- -one partition column, 100 partitions 9336 9513 250 0.0 46681.2 1.0X -two partition columns, 500 partitions 25266 25745 677 0.0 126332.0 0.4X -three partition columns, 2000 partitions 69778 70117 479 0.0 348891.4 0.1X +one partition column, 100 partitions 7555 7583 41 0.0 37772.8 1.0X +two partition columns, 500 partitions 20496 20667 242 0.0 102480.4 0.4X +three partition columns, 2000 partitions 56071 56093 30 0.0 280357.3 0.1X diff --git a/sql/core/benchmarks/IntervalBenchmark-jdk21-results.txt b/sql/core/benchmarks/IntervalBenchmark-jdk21-results.txt index 260eec63f5118..8e46de244bcc0 100644 --- a/sql/core/benchmarks/IntervalBenchmark-jdk21-results.txt +++ b/sql/core/benchmarks/IntervalBenchmark-jdk21-results.txt @@ -1,40 +1,40 @@ -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor cast strings to intervals: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -prepare string w/ interval 427 498 62 2.3 426.8 1.0X -prepare string w/o interval 385 389 4 2.6 384.8 1.1X -1 units w/ interval 340 343 4 2.9 340.0 1.3X -1 units w/o interval 380 387 6 2.6 380.3 1.1X -2 units w/ interval 549 557 7 1.8 549.2 0.8X -2 units w/o interval 553 555 4 1.8 553.1 0.8X -3 units w/ interval 1125 1126 2 0.9 1124.7 0.4X -3 units w/o interval 1144 1149 7 0.9 1143.7 0.4X -4 units w/ interval 1338 1341 3 0.7 1337.7 0.3X -4 units w/o interval 1351 1367 18 0.7 1351.1 0.3X -5 units w/ interval 1506 1510 5 0.7 1506.4 0.3X -5 units w/o interval 1522 1523 1 0.7 1521.6 0.3X -6 units w/ interval 1644 1651 11 0.6 1643.6 
0.3X -6 units w/o interval 1654 1661 10 0.6 1653.8 0.3X -7 units w/ interval 2058 2066 9 0.5 2058.2 0.2X -7 units w/o interval 2069 2072 5 0.5 2068.7 0.2X -8 units w/ interval 2291 2295 6 0.4 2290.9 0.2X -8 units w/o interval 2348 2358 12 0.4 2347.9 0.2X -9 units w/ interval 2453 2457 5 0.4 2452.8 0.2X -9 units w/o interval 2460 2472 16 0.4 2460.0 0.2X -10 units w/ interval 2709 2716 6 0.4 2709.3 0.2X -10 units w/o interval 2706 2707 1 0.4 2705.6 0.2X -11 units w/ interval 3049 3055 7 0.3 3048.7 0.1X -11 units w/o interval 3043 3050 7 0.3 3042.5 0.1X +prepare string w/ interval 397 422 27 2.5 396.9 1.0X +prepare string w/o interval 365 395 43 2.7 365.0 1.1X +1 units w/ interval 337 347 15 3.0 337.4 1.2X +1 units w/o interval 358 365 10 2.8 357.6 1.1X +2 units w/ interval 526 528 2 1.9 526.3 0.8X +2 units w/o interval 535 539 5 1.9 535.3 0.7X +3 units w/ interval 1126 1133 7 0.9 1125.5 0.4X +3 units w/o interval 1115 1118 3 0.9 1115.0 0.4X +4 units w/ interval 1310 1315 4 0.8 1310.3 0.3X +4 units w/o interval 1327 1333 5 0.8 1327.1 0.3X +5 units w/ interval 1453 1457 7 0.7 1452.9 0.3X +5 units w/o interval 1467 1472 4 0.7 1467.5 0.3X +6 units w/ interval 1615 1618 3 0.6 1614.7 0.2X +6 units w/o interval 1617 1617 1 0.6 1616.6 0.2X +7 units w/ interval 2046 2053 6 0.5 2046.0 0.2X +7 units w/o interval 2067 2072 4 0.5 2067.4 0.2X +8 units w/ interval 2277 2288 11 0.4 2277.3 0.2X +8 units w/o interval 2291 2297 5 0.4 2290.8 0.2X +9 units w/ interval 2612 2618 8 0.4 2611.7 0.2X +9 units w/o interval 2647 2651 4 0.4 2646.5 0.1X +10 units w/ interval 2842 2849 8 0.4 2841.8 0.1X +10 units w/o interval 2838 2841 3 0.4 2838.4 0.1X +11 units w/ interval 3071 3077 9 0.3 3070.8 0.1X +11 units w/o interval 3083 3088 6 0.3 3082.8 0.1X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor make_interval(): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
-------------------------------------------------------------------------------------------------------------------------- -prepare make_interval() 351 355 4 2.9 350.8 1.0X -make_interval(0, 1, 2, 3, 4, 5, 50.123456) 42 44 2 23.9 41.9 8.4X -make_interval(*, *, 2, 3, 4, 5, 50.123456) 52 54 3 19.2 52.0 6.7X -make_interval(0, 1, *, *, 4, 5, 50.123456) 61 64 3 16.4 60.9 5.8X -make_interval(0, 1, 2, 3, *, *, *) 344 348 5 2.9 344.5 1.0X -make_interval(*, *, *, *, *, *, *) 359 363 6 2.8 359.2 1.0X +prepare make_interval() 356 357 2 2.8 355.5 1.0X +make_interval(0, 1, 2, 3, 4, 5, 50.123456) 44 53 8 22.8 43.8 8.1X +make_interval(*, *, 2, 3, 4, 5, 50.123456) 53 57 4 18.8 53.3 6.7X +make_interval(0, 1, *, *, 4, 5, 50.123456) 56 56 0 17.9 55.8 6.4X +make_interval(0, 1, 2, 3, *, *, *) 326 327 1 3.1 326.1 1.1X +make_interval(*, *, *, *, *, *, *) 342 345 3 2.9 341.6 1.0X diff --git a/sql/core/benchmarks/IntervalBenchmark-results.txt b/sql/core/benchmarks/IntervalBenchmark-results.txt index f09ebdc4d121f..5bd12d7b15ef5 100644 --- a/sql/core/benchmarks/IntervalBenchmark-results.txt +++ b/sql/core/benchmarks/IntervalBenchmark-results.txt @@ -1,40 +1,40 @@ -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor cast strings to intervals: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -prepare string w/ interval 409 467 97 2.4 409.1 1.0X -prepare string w/o interval 371 378 11 2.7 371.2 1.1X -1 units w/ interval 358 373 14 2.8 358.5 1.1X -1 units w/o interval 382 395 22 2.6 382.1 1.1X -2 units w/ interval 532 536 4 1.9 531.9 0.8X -2 units w/o interval 538 544 6 1.9 537.9 0.8X -3 units w/ interval 1202 1205 3 0.8 1202.1 0.3X -3 units w/o interval 1222 1227 5 0.8 1222.0 0.3X -4 units w/ interval 1403 1408 5 0.7 1403.3 0.3X -4 units w/o 
interval 1432 1435 4 0.7 1431.6 0.3X -5 units w/ interval 1552 1564 16 0.6 1551.5 0.3X -5 units w/o interval 1559 1562 3 0.6 1558.8 0.3X -6 units w/ interval 1700 1705 5 0.6 1700.2 0.2X -6 units w/o interval 1721 1728 8 0.6 1720.8 0.2X -7 units w/ interval 2241 2244 4 0.4 2241.0 0.2X -7 units w/o interval 2254 2265 10 0.4 2254.3 0.2X -8 units w/ interval 2505 2519 15 0.4 2505.0 0.2X -8 units w/o interval 2505 2508 3 0.4 2505.5 0.2X -9 units w/ interval 2621 2629 7 0.4 2621.2 0.2X -9 units w/o interval 2623 2628 4 0.4 2623.1 0.2X -10 units w/ interval 2844 2849 6 0.4 2843.6 0.1X -10 units w/o interval 2829 2842 20 0.4 2829.2 0.1X -11 units w/ interval 3143 3146 3 0.3 3142.7 0.1X -11 units w/o interval 3147 3156 10 0.3 3146.9 0.1X +prepare string w/ interval 407 418 12 2.5 406.7 1.0X +prepare string w/o interval 375 383 8 2.7 374.6 1.1X +1 units w/ interval 386 387 1 2.6 385.8 1.1X +1 units w/o interval 343 352 11 2.9 343.0 1.2X +2 units w/ interval 511 513 2 2.0 511.1 0.8X +2 units w/o interval 526 529 3 1.9 526.5 0.8X +3 units w/ interval 1196 1199 4 0.8 1196.3 0.3X +3 units w/o interval 1171 1174 3 0.9 1171.0 0.3X +4 units w/ interval 1389 1392 3 0.7 1389.3 0.3X +4 units w/o interval 1401 1403 2 0.7 1400.5 0.3X +5 units w/ interval 1545 1549 4 0.6 1545.2 0.3X +5 units w/o interval 1545 1552 8 0.6 1544.9 0.3X +6 units w/ interval 1689 1692 3 0.6 1689.0 0.2X +6 units w/o interval 1703 1706 5 0.6 1702.5 0.2X +7 units w/ interval 2287 2287 1 0.4 2286.6 0.2X +7 units w/o interval 2267 2272 4 0.4 2267.2 0.2X +8 units w/ interval 2475 2479 5 0.4 2474.8 0.2X +8 units w/o interval 2471 2476 4 0.4 2471.1 0.2X +9 units w/ interval 2625 2629 3 0.4 2625.4 0.2X +9 units w/o interval 2616 2624 12 0.4 2616.0 0.2X +10 units w/ interval 2850 2852 2 0.4 2850.5 0.1X +10 units w/o interval 2842 2845 4 0.4 2842.3 0.1X +11 units w/ interval 3177 3180 4 0.3 3177.3 0.1X +11 units w/o interval 3164 3174 8 0.3 3164.1 0.1X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure 
+OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor make_interval(): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------------------------------------- -prepare make_interval() 368 374 5 2.7 368.5 1.0X -make_interval(0, 1, 2, 3, 4, 5, 50.123456) 47 50 3 21.1 47.3 7.8X -make_interval(*, *, 2, 3, 4, 5, 50.123456) 59 62 5 17.0 58.8 6.3X -make_interval(0, 1, *, *, 4, 5, 50.123456) 62 64 3 16.0 62.3 5.9X -make_interval(0, 1, 2, 3, *, *, *) 342 345 2 2.9 342.0 1.1X -make_interval(*, *, *, *, *, *, *) 351 357 7 2.8 350.9 1.1X +prepare make_interval() 337 340 3 3.0 337.0 1.0X +make_interval(0, 1, 2, 3, 4, 5, 50.123456) 42 43 1 23.6 42.4 7.9X +make_interval(*, *, 2, 3, 4, 5, 50.123456) 53 55 3 19.0 52.7 6.4X +make_interval(0, 1, *, *, 4, 5, 50.123456) 56 60 5 17.9 55.7 6.0X +make_interval(0, 1, 2, 3, *, *, *) 341 345 5 2.9 341.2 1.0X +make_interval(*, *, *, *, *, *, *) 343 344 1 2.9 342.8 1.0X diff --git a/sql/core/benchmarks/JoinBenchmark-jdk21-results.txt b/sql/core/benchmarks/JoinBenchmark-jdk21-results.txt index 473cfdde4d76d..b908a2502d766 100644 --- a/sql/core/benchmarks/JoinBenchmark-jdk21-results.txt +++ b/sql/core/benchmarks/JoinBenchmark-jdk21-results.txt @@ -2,81 +2,81 @@ Join Benchmark ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Join w long: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Join w long wholestage off 2088 2099 15 10.0 99.6 1.0X -Join w long wholestage on 918 947 28 22.8 43.8 2.3X +Join w long wholestage off 2048 2052 5 10.2 97.7 1.0X +Join w 
long wholestage on 884 926 37 23.7 42.1 2.3X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Join w long duplicated: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Join w long duplicated wholestage off 1991 1993 3 10.5 94.9 1.0X -Join w long duplicated wholestage on 911 923 16 23.0 43.4 2.2X +Join w long duplicated wholestage off 2023 2028 7 10.4 96.5 1.0X +Join w long duplicated wholestage on 887 904 18 23.6 42.3 2.3X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Join w 2 ints: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Join w 2 ints wholestage off 106730 106790 85 0.2 5089.3 1.0X -Join w 2 ints wholestage on 105489 105534 40 0.2 5030.1 1.0X +Join w 2 ints wholestage off 107738 107744 9 0.2 5137.3 1.0X +Join w 2 ints wholestage on 105798 105824 18 0.2 5044.8 1.0X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Join w 2 longs: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Join w 2 longs wholestage off 3315 3323 12 6.3 158.1 1.0X -Join w 2 longs wholestage on 1972 1997 25 10.6 94.0 1.7X +Join w 2 longs wholestage off 3236 3370 189 6.5 154.3 1.0X +Join w 2 longs wholestage on 1977 2014 37 10.6 94.3 1.6X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 
21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Join w 2 longs duplicated: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Join w 2 longs duplicated wholestage off 8534 8563 42 2.5 406.9 1.0X -Join w 2 longs duplicated wholestage on 5521 5729 121 3.8 263.3 1.5X +Join w 2 longs duplicated wholestage off 8713 8742 42 2.4 415.5 1.0X +Join w 2 longs duplicated wholestage on 5435 5556 105 3.9 259.2 1.6X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor outer join w long: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -outer join w long wholestage off 1590 1593 5 13.2 75.8 1.0X -outer join w long wholestage on 948 978 46 22.1 45.2 1.7X +outer join w long wholestage off 1586 1675 126 13.2 75.6 1.0X +outer join w long wholestage on 904 935 30 23.2 43.1 1.8X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor semi join w long: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -semi join w long wholestage off 1053 1055 3 19.9 50.2 1.0X -semi join w long wholestage on 568 585 15 37.0 27.1 1.9X +semi join w long wholestage off 1052 1053 1 19.9 50.2 1.0X +semi join w long wholestage on 522 528 4 40.2 24.9 2.0X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor sort merge join: Best Time(ms) Avg Time(ms) 
Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -sort merge join wholestage off 519 527 11 4.0 247.7 1.0X -sort merge join wholestage on 467 493 27 4.5 222.5 1.1X +sort merge join wholestage off 525 536 15 4.0 250.4 1.0X +sort merge join wholestage on 460 465 5 4.6 219.4 1.1X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor sort merge join with duplicates: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------ -sort merge join with duplicates wholestage off 1031 1042 15 2.0 491.7 1.0X -sort merge join with duplicates wholestage on 960 968 8 2.2 457.8 1.1X +sort merge join with duplicates wholestage off 1008 1020 17 2.1 480.5 1.0X +sort merge join with duplicates wholestage on 920 934 14 2.3 438.5 1.1X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor shuffle hash join: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -shuffle hash join wholestage off 530 537 10 7.9 126.4 1.0X -shuffle hash join wholestage on 415 434 12 10.1 99.1 1.3X +shuffle hash join wholestage off 494 508 20 8.5 117.7 1.0X +shuffle hash join wholestage on 412 426 17 10.2 98.1 1.2X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor broadcast nested loop join: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
------------------------------------------------------------------------------------------------------------------------- -broadcast nested loop join wholestage off 25590 25605 22 0.8 1220.2 1.0X -broadcast nested loop join wholestage on 18711 18767 79 1.1 892.2 1.4X +broadcast nested loop join wholestage off 25779 25822 61 0.8 1229.2 1.0X +broadcast nested loop join wholestage on 18110 18272 148 1.2 863.6 1.4X diff --git a/sql/core/benchmarks/JoinBenchmark-results.txt b/sql/core/benchmarks/JoinBenchmark-results.txt index 9c460f39d1ae7..abf8364e533d7 100644 --- a/sql/core/benchmarks/JoinBenchmark-results.txt +++ b/sql/core/benchmarks/JoinBenchmark-results.txt @@ -2,81 +2,81 @@ Join Benchmark ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Join w long: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Join w long wholestage off 2221 2232 15 9.4 105.9 1.0X -Join w long wholestage on 1032 1080 56 20.3 49.2 2.2X +Join w long wholestage off 2021 2057 50 10.4 96.4 1.0X +Join w long wholestage on 922 960 38 22.7 44.0 2.2X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Join w long duplicated: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Join w long duplicated wholestage off 2180 2181 1 9.6 104.0 1.0X -Join w long duplicated wholestage on 917 927 10 22.9 43.7 2.4X +Join w long duplicated wholestage off 2060 2199 197 10.2 98.2 1.0X +Join w long duplicated wholestage on 
927 935 11 22.6 44.2 2.2X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Join w 2 ints: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Join w 2 ints wholestage off 112458 112473 21 0.2 5362.4 1.0X -Join w 2 ints wholestage on 110885 110937 68 0.2 5287.4 1.0X +Join w 2 ints wholestage off 112712 112721 12 0.2 5374.5 1.0X +Join w 2 ints wholestage on 111144 111183 26 0.2 5299.8 1.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Join w 2 longs: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Join w 2 longs wholestage off 3502 3507 7 6.0 167.0 1.0X -Join w 2 longs wholestage on 2071 2085 10 10.1 98.8 1.7X +Join w 2 longs wholestage off 3114 3126 17 6.7 148.5 1.0X +Join w 2 longs wholestage on 1971 1991 17 10.6 94.0 1.6X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Join w 2 longs duplicated: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Join w 2 longs duplicated wholestage off 9384 9385 2 2.2 447.4 1.0X -Join w 2 longs duplicated wholestage on 5493 5515 16 3.8 261.9 1.7X +Join w 2 longs duplicated wholestage off 8230 8239 13 2.5 392.4 1.0X +Join w 2 longs duplicated wholestage on 5478 5494 16 3.8 261.2 1.5X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 
17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor outer join w long: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -outer join w long wholestage off 1871 1884 19 11.2 89.2 1.0X -outer join w long wholestage on 1031 1054 30 20.4 49.1 1.8X +outer join w long wholestage off 1607 1627 28 13.1 76.6 1.0X +outer join w long wholestage on 906 914 6 23.1 43.2 1.8X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor semi join w long: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -semi join w long wholestage off 1189 1195 8 17.6 56.7 1.0X -semi join w long wholestage on 549 569 35 38.2 26.2 2.2X +semi join w long wholestage off 1047 1050 3 20.0 49.9 1.0X +semi join w long wholestage on 522 530 5 40.2 24.9 2.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor sort merge join: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -sort merge join wholestage off 526 535 13 4.0 250.9 1.0X -sort merge join wholestage on 461 470 6 4.5 220.0 1.1X +sort merge join wholestage off 512 516 5 4.1 244.2 1.0X +sort merge join wholestage on 459 477 13 4.6 218.7 1.1X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor sort merge join with duplicates: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
------------------------------------------------------------------------------------------------------------------------------ -sort merge join with duplicates wholestage off 1026 1054 39 2.0 489.2 1.0X -sort merge join with duplicates wholestage on 922 948 28 2.3 439.4 1.1X +sort merge join with duplicates wholestage off 970 984 20 2.2 462.4 1.0X +sort merge join with duplicates wholestage on 868 879 10 2.4 413.9 1.1X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor shuffle hash join: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -shuffle hash join wholestage off 521 533 16 8.0 124.3 1.0X -shuffle hash join wholestage on 383 393 10 11.0 91.3 1.4X +shuffle hash join wholestage off 512 520 13 8.2 122.0 1.0X +shuffle hash join wholestage on 353 369 20 11.9 84.1 1.5X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor broadcast nested loop join: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------- -broadcast nested loop join wholestage off 29667 29788 171 0.7 1414.6 1.0X -broadcast nested loop join wholestage on 18946 19016 66 1.1 903.4 1.6X +broadcast nested loop join wholestage off 25058 25234 249 0.8 1194.9 1.0X +broadcast nested loop join wholestage on 18197 18557 692 1.2 867.7 1.4X diff --git a/sql/core/benchmarks/JsonBenchmark-jdk21-results.txt b/sql/core/benchmarks/JsonBenchmark-jdk21-results.txt index d87eb6530a855..381f30f110867 100644 --- a/sql/core/benchmarks/JsonBenchmark-jdk21-results.txt +++ b/sql/core/benchmarks/JsonBenchmark-jdk21-results.txt @@ -3,128 
+3,128 @@ Benchmark for performance of JSON parsing ================================================================================================ Preparing data for benchmarking ... -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor JSON schema inferring: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -No encoding 2240 2300 98 2.2 448.0 1.0X -UTF-8 is set 3325 3333 8 1.5 665.0 0.7X +No encoding 2632 2713 96 1.9 526.3 1.0X +UTF-8 is set 4814 4824 12 1.0 962.8 0.5X Preparing data for benchmarking ... -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor count a short column: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -No encoding 1890 1917 23 2.6 378.1 1.0X -UTF-8 is set 3155 3158 3 1.6 630.9 0.6X +No encoding 2193 2256 82 2.3 438.6 1.0X +UTF-8 is set 4539 4544 5 1.1 907.8 0.5X Preparing data for benchmarking ... -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor count a wide column: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -No encoding 5079 5266 188 0.2 5078.9 1.0X -UTF-8 is set 4272 4280 6 0.2 4272.5 1.2X +No encoding 4593 4651 87 0.2 4592.9 1.0X +UTF-8 is set 4837 4856 30 0.2 4837.0 0.9X Preparing data for benchmarking ... 
-OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor select wide row: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -No encoding 9614 9866 271 0.0 192271.0 1.0X -UTF-8 is set 10517 10608 80 0.0 210331.2 0.9X +No encoding 9423 9596 234 0.0 188463.6 1.0X +UTF-8 is set 10747 10797 49 0.0 214934.1 0.9X Preparing data for benchmarking ... -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Select a subset of 10 columns: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Select 10 columns 1661 1666 6 0.6 1660.6 1.0X -Select 1 column 1078 1081 2 0.9 1078.3 1.5X +Select 10 columns 1769 1779 11 0.6 1769.0 1.0X +Select 1 column 1217 1220 3 0.8 1217.4 1.5X Preparing data for benchmarking ... 
-OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor creation of JSON parser per line: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Short column without encoding 604 612 10 1.7 604.3 1.0X -Short column with UTF-8 828 839 15 1.2 828.3 0.7X -Wide column without encoding 7212 7255 38 0.1 7212.1 0.1X -Wide column with UTF-8 7446 7462 15 0.1 7445.8 0.1X +Short column without encoding 658 665 6 1.5 658.4 1.0X +Short column with UTF-8 1144 1162 16 0.9 1143.9 0.6X +Wide column without encoding 5152 5164 19 0.2 5151.8 0.1X +Wide column with UTF-8 7246 7274 28 0.1 7246.1 0.1X Preparing data for benchmarking ... -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor JSON functions: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Text read 60 63 2 16.6 60.1 1.0X -from_json 1168 1175 7 0.9 1168.4 0.1X -json_tuple 1158 1170 16 0.9 1158.4 0.1X -get_json_object wholestage off 1075 1081 6 0.9 1074.8 0.1X -get_json_object wholestage on 1018 1029 13 1.0 1018.1 0.1X +Text read 59 62 2 16.9 59.0 1.0X +from_json 1119 1125 6 0.9 1119.4 0.1X +json_tuple 1039 1044 6 1.0 1039.4 0.1X +get_json_object wholestage off 1054 1060 5 0.9 1053.7 0.1X +get_json_object wholestage on 991 998 6 1.0 991.2 0.1X Preparing data for benchmarking ... 
-OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Dataset of json strings: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Text read 232 238 10 21.6 46.4 1.0X -schema inferring 1919 1928 9 2.6 383.7 0.1X -parsing 2717 2724 7 1.8 543.4 0.1X +Text read 235 242 12 21.3 46.9 1.0X +schema inferring 1966 1972 9 2.5 393.1 0.1X +parsing 2961 2978 24 1.7 592.2 0.1X Preparing data for benchmarking ... -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Json files in the per-line mode: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Text read 562 569 7 8.9 112.5 1.0X -Schema inferring 2424 2432 9 2.1 484.8 0.2X -Parsing without charset 2808 2810 3 1.8 561.7 0.2X -Parsing with UTF-8 3993 4001 12 1.3 798.5 0.1X +Text read 563 569 6 8.9 112.5 1.0X +Schema inferring 2535 2538 3 2.0 507.0 0.2X +Parsing without charset 3072 3102 36 1.6 614.4 0.2X +Parsing with UTF-8 5607 5629 33 0.9 1121.5 0.1X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Write dates and timestamps: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Create a dataset of timestamps 101 108 7 9.9 101.4 1.0X -to_json(timestamp) 705 707 2 1.4 704.6 0.1X -write timestamps to files 598 611 20 1.7 598.3 0.2X -Create a dataset of dates 112 118 10 8.9 111.9 0.9X 
-to_json(date) 546 548 2 1.8 546.3 0.2X -write dates to files 393 399 9 2.5 393.1 0.3X +Create a dataset of timestamps 103 105 2 9.7 102.7 1.0X +to_json(timestamp) 555 557 3 1.8 554.8 0.2X +write timestamps to files 591 597 7 1.7 591.0 0.2X +Create a dataset of dates 121 125 4 8.3 120.8 0.8X +to_json(date) 420 422 3 2.4 419.6 0.2X +write dates to files 393 394 1 2.5 392.6 0.3X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Read dates and timestamps: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ----------------------------------------------------------------------------------------------------------------------------------------------------- -read timestamp text from files 149 153 4 6.7 149.4 1.0X -read timestamps from files 1044 1049 5 1.0 1043.6 0.1X -infer timestamps from files 1973 1983 12 0.5 1972.7 0.1X -read date text from files 140 143 5 7.1 140.0 1.1X -read date from files 690 698 9 1.4 690.3 0.2X -timestamp strings 141 149 7 7.1 140.8 1.1X -parse timestamps from Dataset[String] 1265 1266 2 0.8 1264.5 0.1X -infer timestamps from Dataset[String] 2160 2169 12 0.5 2160.5 0.1X -date strings 248 250 2 4.0 248.3 0.6X -parse dates from Dataset[String] 1010 1015 6 1.0 1009.6 0.1X -from_json(timestamp) 1781 1810 27 0.6 1781.1 0.1X -from_json(date) 1510 1514 4 0.7 1510.0 0.1X -infer error timestamps from Dataset[String] with default format 1412 1420 8 0.7 1412.2 0.1X -infer error timestamps from Dataset[String] with user-provided format 1372 1378 6 0.7 1371.6 0.1X -infer error timestamps from Dataset[String] with legacy format 1427 1439 18 0.7 1426.6 0.1X +read timestamp text from files 143 149 9 7.0 143.4 1.0X +read timestamps from files 1102 1110 13 0.9 1101.8 0.1X +infer timestamps from files 2042 2051 14 0.5 2041.7 0.1X +read date text from files 140 143 4 7.2 139.6 1.0X +read date from files 739 764 33 1.4 739.1 0.2X 
+timestamp strings 135 136 1 7.4 134.6 1.1X +parse timestamps from Dataset[String] 1321 1328 7 0.8 1320.8 0.1X +infer timestamps from Dataset[String] 2235 2239 5 0.4 2235.3 0.1X +date strings 194 196 3 5.2 193.6 0.7X +parse dates from Dataset[String] 1054 1058 4 0.9 1054.1 0.1X +from_json(timestamp) 1750 1753 4 0.6 1750.0 0.1X +from_json(date) 1476 1480 6 0.7 1476.1 0.1X +infer error timestamps from Dataset[String] with default format 1499 1502 4 0.7 1499.4 0.1X +infer error timestamps from Dataset[String] with user-provided format 1491 1496 7 0.7 1491.1 0.1X +infer error timestamps from Dataset[String] with legacy format 1528 1538 9 0.7 1527.8 0.1X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Filters pushdown: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -w/o filters 6100 6124 33 0.0 61003.7 1.0X -pushdown disabled 5957 5981 31 0.0 59569.9 1.0X -w/ filters 729 737 8 0.1 7291.0 8.4X +w/o filters 6122 6143 24 0.0 61217.0 1.0X +pushdown disabled 5947 5957 10 0.0 59474.6 1.0X +w/ filters 700 703 4 0.1 7004.2 8.7X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Partial JSON results: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -parse invalid JSON 2476 2480 5 0.0 247550.8 1.0X +parse invalid JSON 2496 2508 16 0.0 249615.2 1.0X diff --git a/sql/core/benchmarks/JsonBenchmark-results.txt b/sql/core/benchmarks/JsonBenchmark-results.txt index bf7662a428dfb..106ad732614c0 100644 --- a/sql/core/benchmarks/JsonBenchmark-results.txt +++ 
b/sql/core/benchmarks/JsonBenchmark-results.txt @@ -3,128 +3,128 @@ Benchmark for performance of JSON parsing ================================================================================================ Preparing data for benchmarking ... -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor JSON schema inferring: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -No encoding 2335 2405 102 2.1 467.1 1.0X -UTF-8 is set 3188 3205 17 1.6 637.5 0.7X +No encoding 2318 2390 99 2.2 463.5 1.0X +UTF-8 is set 4814 4832 16 1.0 962.8 0.5X Preparing data for benchmarking ... -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor count a short column: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -No encoding 2017 2130 130 2.5 403.4 1.0X -UTF-8 is set 3090 3104 22 1.6 618.0 0.7X +No encoding 2303 2369 93 2.2 460.6 1.0X +UTF-8 is set 4841 4855 12 1.0 968.2 0.5X Preparing data for benchmarking ... -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor count a wide column: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -No encoding 3264 3390 134 0.3 3264.0 1.0X -UTF-8 is set 4385 4419 40 0.2 4384.9 0.7X +No encoding 4324 4400 95 0.2 4324.1 1.0X +UTF-8 is set 4825 4836 10 0.2 4824.6 0.9X Preparing data for benchmarking ... 
-OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor select wide row: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -No encoding 8549 8761 245 0.0 170970.8 1.0X -UTF-8 is set 9833 9868 31 0.0 196661.2 0.9X +No encoding 9577 9734 206 0.0 191531.0 1.0X +UTF-8 is set 10240 10256 14 0.0 204805.9 0.9X Preparing data for benchmarking ... -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Select a subset of 10 columns: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Select 10 columns 1485 1498 22 0.7 1484.5 1.0X -Select 1 column 1056 1063 6 0.9 1055.6 1.4X +Select 10 columns 1602 1605 3 0.6 1601.6 1.0X +Select 1 column 1160 1169 12 0.9 1160.0 1.4X Preparing data for benchmarking ... 
-OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor creation of JSON parser per line: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Short column without encoding 622 630 9 1.6 622.2 1.0X -Short column with UTF-8 792 802 13 1.3 792.1 0.8X -Wide column without encoding 7214 7321 111 0.1 7214.3 0.1X -Wide column with UTF-8 6455 6493 54 0.2 6454.8 0.1X +Short column without encoding 661 665 4 1.5 660.7 1.0X +Short column with UTF-8 1188 1188 1 0.8 1187.9 0.6X +Wide column without encoding 5314 5336 21 0.2 5313.6 0.1X +Wide column with UTF-8 7265 7267 2 0.1 7265.1 0.1X Preparing data for benchmarking ... -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor JSON functions: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Text read 59 61 3 16.9 59.1 1.0X -from_json 1083 1088 5 0.9 1082.5 0.1X -json_tuple 1125 1133 7 0.9 1125.5 0.1X -get_json_object wholestage off 1049 1062 12 1.0 1048.6 0.1X -get_json_object wholestage on 968 975 7 1.0 968.1 0.1X +Text read 58 64 6 17.4 57.5 1.0X +from_json 1092 1106 21 0.9 1091.9 0.1X +json_tuple 1075 1077 3 0.9 1074.7 0.1X +get_json_object wholestage off 1071 1081 10 0.9 1071.1 0.1X +get_json_object wholestage on 1020 1033 20 1.0 1019.7 0.1X Preparing data for benchmarking ... 
-OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Dataset of json strings: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Text read 234 238 5 21.4 46.8 1.0X -schema inferring 1774 1776 2 2.8 354.8 0.1X -parsing 2648 2686 33 1.9 529.6 0.1X +Text read 233 270 61 21.5 46.6 1.0X +schema inferring 1786 1790 8 2.8 357.2 0.1X +parsing 2837 2844 9 1.8 567.4 0.1X Preparing data for benchmarking ... -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Json files in the per-line mode: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Text read 615 634 23 8.1 123.0 1.0X -Schema inferring 2319 2330 10 2.2 463.8 0.3X -Parsing without charset 2834 2844 9 1.8 566.8 0.2X -Parsing with UTF-8 3741 3758 17 1.3 748.1 0.2X +Text read 622 625 4 8.0 124.4 1.0X +Schema inferring 2378 2386 7 2.1 475.7 0.3X +Parsing without charset 3016 3019 3 1.7 603.2 0.2X +Parsing with UTF-8 5651 5660 11 0.9 1130.2 0.1X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Write dates and timestamps: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Create a dataset of timestamps 117 125 10 8.6 116.8 1.0X -to_json(timestamp) 803 809 5 1.2 803.0 0.1X -write timestamps to files 698 701 4 1.4 697.6 0.2X -Create a dataset of dates 123 128 6 8.1 123.2 0.9X 
-to_json(date) 594 602 7 1.7 594.2 0.2X -write dates to files 471 479 7 2.1 471.4 0.2X +Create a dataset of timestamps 100 101 2 10.0 99.6 1.0X +to_json(timestamp) 648 653 5 1.5 648.1 0.2X +write timestamps to files 677 683 6 1.5 677.2 0.1X +Create a dataset of dates 128 130 3 7.8 128.1 0.8X +to_json(date) 455 466 10 2.2 455.1 0.2X +write dates to files 444 448 5 2.3 443.8 0.2X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Read dates and timestamps: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ----------------------------------------------------------------------------------------------------------------------------------------------------- -read timestamp text from files 157 161 3 6.4 156.8 1.0X -read timestamps from files 1010 1019 10 1.0 1009.8 0.2X -infer timestamps from files 1924 1930 10 0.5 1923.9 0.1X -read date text from files 147 150 4 6.8 146.6 1.1X -read date from files 705 710 6 1.4 705.5 0.2X -timestamp strings 151 159 7 6.6 150.9 1.0X -parse timestamps from Dataset[String] 1191 1193 1 0.8 1191.3 0.1X -infer timestamps from Dataset[String] 2049 2055 7 0.5 2049.2 0.1X -date strings 228 235 6 4.4 228.3 0.7X -parse dates from Dataset[String] 955 967 14 1.0 954.8 0.2X -from_json(timestamp) 1669 1681 12 0.6 1669.4 0.1X -from_json(date) 1444 1447 5 0.7 1443.9 0.1X -infer error timestamps from Dataset[String] with default format 1398 1401 5 0.7 1397.9 0.1X -infer error timestamps from Dataset[String] with user-provided format 1420 1423 2 0.7 1420.2 0.1X -infer error timestamps from Dataset[String] with legacy format 1419 1437 21 0.7 1418.7 0.1X +read timestamp text from files 155 157 3 6.5 154.7 1.0X +read timestamps from files 1091 1093 2 0.9 1091.1 0.1X +infer timestamps from files 2016 2017 1 0.5 2016.2 0.1X +read date text from files 141 142 1 7.1 141.2 1.1X +read date from files 744 751 6 1.3 744.2 0.2X +timestamp 
strings 132 135 3 7.6 131.5 1.2X +parse timestamps from Dataset[String] 1247 1249 2 0.8 1246.8 0.1X +infer timestamps from Dataset[String] 2129 2134 7 0.5 2128.7 0.1X +date strings 203 204 1 4.9 202.6 0.8X +parse dates from Dataset[String] 1005 1006 1 1.0 1004.7 0.2X +from_json(timestamp) 1659 1672 11 0.6 1659.4 0.1X +from_json(date) 1413 1416 4 0.7 1413.2 0.1X +infer error timestamps from Dataset[String] with default format 1400 1407 6 0.7 1400.4 0.1X +infer error timestamps from Dataset[String] with user-provided format 1411 1420 13 0.7 1410.7 0.1X +infer error timestamps from Dataset[String] with legacy format 1441 1461 20 0.7 1441.2 0.1X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Filters pushdown: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -w/o filters 6592 6601 7 0.0 65920.4 1.0X -pushdown disabled 5825 5829 4 0.0 58246.5 1.1X -w/ filters 664 802 200 0.2 6643.7 9.9X +w/o filters 5832 5837 9 0.0 58320.8 1.0X +pushdown disabled 5810 5821 10 0.0 58100.1 1.0X +w/ filters 679 767 147 0.1 6792.3 8.6X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Partial JSON results: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -parse invalid JSON 2714 2828 195 0.0 271356.0 1.0X +parse invalid JSON 2325 2417 159 0.0 232496.5 1.0X diff --git a/sql/core/benchmarks/LargeRowBenchmark-jdk21-results.txt b/sql/core/benchmarks/LargeRowBenchmark-jdk21-results.txt new file mode 100644 index 0000000000000..dbcf544b492d9 --- /dev/null +++ 
b/sql/core/benchmarks/LargeRowBenchmark-jdk21-results.txt @@ -0,0 +1,26 @@ +================================================================================================ +Large Row Benchmark +================================================================================================ + +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure +AMD EPYC 7763 64-Core Processor +#rows: 100, #cols: 10, cell: 1.3 MB: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +built-in UPPER 5909 6154 347 0.0 59088236.5 1.0X +udf UPPER 4106 4364 364 0.0 41062501.9 1.4X + +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure +AMD EPYC 7763 64-Core Processor +#rows: 1, #cols: 1, cell: 300.0 MB: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +built-in UPPER 1317 1319 3 0.0 1317449498.0 1.0X +udf UPPER 954 975 25 0.0 953744994.0 1.4X + +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure +AMD EPYC 7763 64-Core Processor +#rows: 1, #cols: 200, cell: 1.0 MB: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +built-in UPPER 1118 1138 28 0.0 1117901962.0 1.0X +udf UPPER 1145 1210 91 0.0 1145234313.0 1.0X + + diff --git a/sql/core/benchmarks/LargeRowBenchmark-results.txt b/sql/core/benchmarks/LargeRowBenchmark-results.txt new file mode 100644 index 0000000000000..9fafe282238b6 --- /dev/null +++ b/sql/core/benchmarks/LargeRowBenchmark-results.txt @@ -0,0 +1,26 @@ +================================================================================================ +Large Row Benchmark 
+================================================================================================ + +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure +AMD EPYC 7763 64-Core Processor +#rows: 100, #cols: 10, cell: 1.3 MB: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +built-in UPPER 6610 6651 58 0.0 66101681.9 1.0X +udf UPPER 4289 4291 3 0.0 42892607.0 1.5X + +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure +AMD EPYC 7763 64-Core Processor +#rows: 1, #cols: 1, cell: 300.0 MB: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +built-in UPPER 1492 1510 26 0.0 1492292577.0 1.0X +udf UPPER 1033 1034 1 0.0 1032584220.0 1.4X + +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure +AMD EPYC 7763 64-Core Processor +#rows: 1, #cols: 200, cell: 1.0 MB: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +built-in UPPER 1271 1290 28 0.0 1270654457.0 1.0X +udf UPPER 1397 1558 228 0.0 1396607518.0 0.9X + + diff --git a/sql/core/benchmarks/MakeDateTimeBenchmark-jdk21-results.txt b/sql/core/benchmarks/MakeDateTimeBenchmark-jdk21-results.txt index 3f95fc73de078..ba1261bd77389 100644 --- a/sql/core/benchmarks/MakeDateTimeBenchmark-jdk21-results.txt +++ b/sql/core/benchmarks/MakeDateTimeBenchmark-jdk21-results.txt @@ -1,22 +1,22 @@ -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor make_date(): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
------------------------------------------------------------------------------------------------------------------------ -prepare make_date() 2328 2360 31 43.0 23.3 1.0X -make_date(2019, 9, 16) 1883 1936 46 53.1 18.8 1.2X -make_date(*, *, *) 4034 4050 20 24.8 40.3 0.6X +prepare make_date() 2319 2381 55 43.1 23.2 1.0X +make_date(2019, 9, 16) 2021 2048 28 49.5 20.2 1.1X +make_date(*, *, *) 3857 3872 14 25.9 38.6 0.6X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor make_timestamp(): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------- -prepare make_timestamp() 358 367 11 2.8 358.0 1.0X -make_timestamp(2019, 1, 2, 3, 4, 50.123456) 43 46 4 23.3 42.9 8.3X -make_timestamp(2019, 1, 2, 3, 4, 60.000000) 37 40 3 26.8 37.3 9.6X -make_timestamp(2019, 12, 31, 23, 59, 60.00) 37 48 10 27.3 36.6 9.8X -make_timestamp(*, *, *, 3, 4, 50.123456) 170 171 2 5.9 169.9 2.1X -make_timestamp(*, *, *, *, *, 0) 113 116 3 8.9 112.5 3.2X -make_timestamp(*, *, *, *, *, 60.0) 158 162 4 6.3 158.2 2.3X -make_timestamp(2019, 1, 2, *, *, *) 478 479 1 2.1 477.9 0.7X -make_timestamp(*, *, *, *, *, *) 491 495 6 2.0 491.5 0.7X +prepare make_timestamp() 346 351 4 2.9 346.3 1.0X +make_timestamp(2019, 1, 2, 3, 4, 50.123456) 39 43 4 25.8 38.8 8.9X +make_timestamp(2019, 1, 2, 3, 4, 60.000000) 42 48 8 23.8 42.0 8.2X +make_timestamp(2019, 12, 31, 23, 59, 60.00) 33 37 6 29.9 33.4 10.4X +make_timestamp(*, *, *, 3, 4, 50.123456) 160 162 1 6.2 160.2 2.2X +make_timestamp(*, *, *, *, *, 0) 103 109 6 9.7 102.6 3.4X +make_timestamp(*, *, *, *, *, 60.0) 144 148 5 6.9 144.2 2.4X +make_timestamp(2019, 1, 2, *, *, *) 422 424 3 2.4 422.0 0.8X +make_timestamp(*, *, *, *, *, *) 459 460 1 2.2 459.3 0.8X diff --git a/sql/core/benchmarks/MakeDateTimeBenchmark-results.txt 
b/sql/core/benchmarks/MakeDateTimeBenchmark-results.txt index 34855593dd93f..acfb4fa3ddbb1 100644 --- a/sql/core/benchmarks/MakeDateTimeBenchmark-results.txt +++ b/sql/core/benchmarks/MakeDateTimeBenchmark-results.txt @@ -1,22 +1,22 @@ -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor make_date(): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -prepare make_date() 2195 2443 263 45.5 22.0 1.0X -make_date(2019, 9, 16) 1806 1860 81 55.4 18.1 1.2X -make_date(*, *, *) 4107 4186 74 24.4 41.1 0.5X +prepare make_date() 2164 2170 6 46.2 21.6 1.0X +make_date(2019, 9, 16) 1823 1836 11 54.9 18.2 1.2X +make_date(*, *, *) 4074 4085 16 24.5 40.7 0.5X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor make_timestamp(): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------- -prepare make_timestamp() 354 364 9 2.8 354.3 1.0X -make_timestamp(2019, 1, 2, 3, 4, 50.123456) 44 48 3 22.5 44.5 8.0X -make_timestamp(2019, 1, 2, 3, 4, 60.000000) 48 53 4 20.8 48.1 7.4X -make_timestamp(2019, 12, 31, 23, 59, 60.00) 34 37 4 29.3 34.1 10.4X -make_timestamp(*, *, *, 3, 4, 50.123456) 146 158 16 6.9 146.0 2.4X -make_timestamp(*, *, *, *, *, 0) 107 114 9 9.4 106.5 3.3X -make_timestamp(*, *, *, *, *, 60.0) 145 150 8 6.9 144.7 2.4X -make_timestamp(2019, 1, 2, *, *, *) 453 454 2 2.2 452.7 0.8X -make_timestamp(*, *, *, *, *, *) 475 480 6 2.1 475.2 0.7X +prepare make_timestamp() 336 350 13 3.0 336.0 1.0X +make_timestamp(2019, 1, 2, 3, 4, 50.123456) 44 50 8 22.6 44.2 7.6X +make_timestamp(2019, 1, 2, 3, 4, 
60.000000) 37 43 8 26.8 37.4 9.0X +make_timestamp(2019, 12, 31, 23, 59, 60.00) 37 38 0 26.8 37.3 9.0X +make_timestamp(*, *, *, 3, 4, 50.123456) 155 159 5 6.4 155.2 2.2X +make_timestamp(*, *, *, *, *, 0) 109 111 4 9.2 109.1 3.1X +make_timestamp(*, *, *, *, *, 60.0) 141 144 5 7.1 140.9 2.4X +make_timestamp(2019, 1, 2, *, *, *) 451 452 1 2.2 451.1 0.7X +make_timestamp(*, *, *, *, *, *) 480 483 2 2.1 479.8 0.7X diff --git a/sql/core/benchmarks/MetadataStructBenchmark-jdk21-results.txt b/sql/core/benchmarks/MetadataStructBenchmark-jdk21-results.txt index 8f1696638d097..1b9b8408c42ed 100644 --- a/sql/core/benchmarks/MetadataStructBenchmark-jdk21-results.txt +++ b/sql/core/benchmarks/MetadataStructBenchmark-jdk21-results.txt @@ -2,45 +2,45 @@ Metadata Struct Benchmark ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Vectorized Parquet: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -no metadata columns 614 644 14 8.1 122.7 1.0X -_metadata.file_path 737 752 10 6.8 147.5 0.8X -_metadata.file_name 737 751 17 6.8 147.3 0.8X -_metadata.file_size 638 678 14 7.8 127.5 1.0X -_metadata.file_block_start 632 654 11 7.9 126.5 1.0X -_metadata.file_block_length 665 676 7 7.5 133.0 0.9X -_metadata.file_modification_time 636 655 13 7.9 127.1 1.0X -_metadata.row_index 714 728 8 7.0 142.9 0.9X -_metadata 966 993 15 5.2 193.2 0.6X +no metadata columns 615 646 20 8.1 122.9 1.0X +_metadata.file_path 731 748 9 6.8 146.2 0.8X +_metadata.file_name 720 749 12 6.9 144.0 0.9X +_metadata.file_size 668 682 10 7.5 133.7 0.9X +_metadata.file_block_start 670 679 11 7.5 134.0 0.9X +_metadata.file_block_length 668 679 8 7.5 133.6 0.9X 
+_metadata.file_modification_time 647 674 8 7.7 129.5 0.9X +_metadata.row_index 702 721 11 7.1 140.4 0.9X +_metadata 965 991 17 5.2 192.9 0.6X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Parquet-mr: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -no metadata columns 2804 2862 34 1.8 560.9 1.0X -_metadata.file_path 3567 3624 33 1.4 713.3 0.8X -_metadata.file_name 3614 3648 27 1.4 722.8 0.8X -_metadata.file_size 3459 3485 21 1.4 691.8 0.8X -_metadata.file_block_start 3460 3498 25 1.4 692.0 0.8X -_metadata.file_block_length 3396 3432 32 1.5 679.3 0.8X -_metadata.file_modification_time 3385 3416 19 1.5 677.1 0.8X -_metadata.row_index 3734 3762 18 1.3 746.8 0.8X -_metadata 4804 4837 20 1.0 960.8 0.6X +no metadata columns 2555 2601 25 2.0 511.1 1.0X +_metadata.file_path 3338 3382 19 1.5 667.6 0.8X +_metadata.file_name 3325 3365 20 1.5 665.0 0.8X +_metadata.file_size 3141 3164 16 1.6 628.2 0.8X +_metadata.file_block_start 3123 3172 22 1.6 624.7 0.8X +_metadata.file_block_length 3138 3158 11 1.6 627.6 0.8X +_metadata.file_modification_time 3120 3152 31 1.6 624.1 0.8X +_metadata.row_index 3664 3700 23 1.4 732.9 0.7X +_metadata 4819 4856 26 1.0 963.8 0.5X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor JSON: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -no metadata columns 6626 6678 89 0.8 1325.2 1.0X -_metadata.file_path 7476 7498 14 0.7 1495.3 0.9X -_metadata.file_name 7468 7485 15 0.7 1493.7 0.9X -_metadata.file_size 7302 7326 18 0.7 1460.3 0.9X 
-_metadata.file_block_start 7303 7327 14 0.7 1460.5 0.9X -_metadata.file_block_length 7312 7337 14 0.7 1462.4 0.9X -_metadata.file_modification_time 7322 7340 11 0.7 1464.3 0.9X -_metadata 8135 8155 14 0.6 1627.0 0.8X +no metadata columns 7218 7289 102 0.7 1443.5 1.0X +_metadata.file_path 8149 8178 19 0.6 1629.8 0.9X +_metadata.file_name 8137 8164 22 0.6 1627.5 0.9X +_metadata.file_size 7942 7964 14 0.6 1588.5 0.9X +_metadata.file_block_start 7916 7959 20 0.6 1583.3 0.9X +_metadata.file_block_length 7931 7958 17 0.6 1586.2 0.9X +_metadata.file_modification_time 7934 7956 16 0.6 1586.7 0.9X +_metadata 8829 8857 27 0.6 1765.8 0.8X diff --git a/sql/core/benchmarks/MetadataStructBenchmark-results.txt b/sql/core/benchmarks/MetadataStructBenchmark-results.txt index 82429601dab29..bd14214994cd1 100644 --- a/sql/core/benchmarks/MetadataStructBenchmark-results.txt +++ b/sql/core/benchmarks/MetadataStructBenchmark-results.txt @@ -2,45 +2,45 @@ Metadata Struct Benchmark ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Vectorized Parquet: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -no metadata columns 650 670 20 7.7 129.9 1.0X -_metadata.file_path 743 762 12 6.7 148.7 0.9X -_metadata.file_name 742 752 7 6.7 148.4 0.9X -_metadata.file_size 677 685 6 7.4 135.4 1.0X -_metadata.file_block_start 675 686 12 7.4 134.9 1.0X -_metadata.file_block_length 677 683 5 7.4 135.5 1.0X -_metadata.file_modification_time 673 682 7 7.4 134.7 1.0X -_metadata.row_index 718 728 8 7.0 143.6 0.9X -_metadata 1023 1033 6 4.9 204.6 0.6X +no metadata columns 639 660 20 7.8 127.8 1.0X +_metadata.file_path 745 760 13 6.7 148.9 0.9X +_metadata.file_name 
738 747 8 6.8 147.6 0.9X +_metadata.file_size 672 678 4 7.4 134.3 1.0X +_metadata.file_block_start 671 678 4 7.4 134.3 1.0X +_metadata.file_block_length 670 677 5 7.5 134.0 1.0X +_metadata.file_modification_time 669 678 9 7.5 133.8 1.0X +_metadata.row_index 726 731 4 6.9 145.1 0.9X +_metadata 1022 1031 6 4.9 204.5 0.6X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Parquet-mr: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -no metadata columns 2582 2621 41 1.9 516.4 1.0X -_metadata.file_path 3488 3505 12 1.4 697.5 0.7X -_metadata.file_name 3481 3502 10 1.4 696.1 0.7X -_metadata.file_size 3193 3223 18 1.6 638.6 0.8X -_metadata.file_block_start 3198 3217 15 1.6 639.7 0.8X -_metadata.file_block_length 3191 3216 19 1.6 638.2 0.8X -_metadata.file_modification_time 3188 3204 13 1.6 637.7 0.8X -_metadata.row_index 3714 3736 18 1.3 742.8 0.7X -_metadata 4935 4958 24 1.0 986.9 0.5X +no metadata columns 2658 2694 18 1.9 531.7 1.0X +_metadata.file_path 3480 3504 16 1.4 696.1 0.8X +_metadata.file_name 3465 3486 17 1.4 693.0 0.8X +_metadata.file_size 3244 3268 18 1.5 648.8 0.8X +_metadata.file_block_start 3268 3291 15 1.5 653.6 0.8X +_metadata.file_block_length 3269 3296 33 1.5 653.8 0.8X +_metadata.file_modification_time 3275 3301 19 1.5 655.1 0.8X +_metadata.row_index 3727 3742 13 1.3 745.3 0.7X +_metadata 4986 5019 24 1.0 997.2 0.5X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor JSON: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -no metadata columns 7127 7171 21 
0.7 1425.3 1.0X -_metadata.file_path 8122 8147 13 0.6 1624.4 0.9X -_metadata.file_name 8143 8166 24 0.6 1628.6 0.9X -_metadata.file_size 7914 7943 14 0.6 1582.8 0.9X -_metadata.file_block_start 7947 7978 16 0.6 1589.4 0.9X -_metadata.file_block_length 7964 7991 20 0.6 1592.8 0.9X -_metadata.file_modification_time 7950 7977 20 0.6 1590.1 0.9X -_metadata 8869 8888 15 0.6 1773.7 0.8X +no metadata columns 7065 7085 12 0.7 1413.0 1.0X +_metadata.file_path 8095 8116 11 0.6 1619.0 0.9X +_metadata.file_name 8133 8148 10 0.6 1626.6 0.9X +_metadata.file_size 7787 7810 10 0.6 1557.3 0.9X +_metadata.file_block_start 7787 7803 9 0.6 1557.4 0.9X +_metadata.file_block_length 7774 7802 13 0.6 1554.8 0.9X +_metadata.file_modification_time 7788 7803 11 0.6 1557.6 0.9X +_metadata 8705 8724 12 0.6 1741.0 0.8X diff --git a/sql/core/benchmarks/MetricsAggregationBenchmark-jdk21-results.txt b/sql/core/benchmarks/MetricsAggregationBenchmark-jdk21-results.txt index 220e9da2e088c..edc395387882b 100644 --- a/sql/core/benchmarks/MetricsAggregationBenchmark-jdk21-results.txt +++ b/sql/core/benchmarks/MetricsAggregationBenchmark-jdk21-results.txt @@ -1,12 +1,12 @@ -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor metrics aggregation (50 metrics, 100000 tasks per stage): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ---------------------------------------------------------------------------------------------------------------------------------------- -1 stage(s) 565 623 52 0.0 564588687.0 1.0X -2 stage(s) 1318 1347 41 0.0 1318133868.0 0.4X -3 stage(s) 2044 2136 130 0.0 2043877303.0 0.3X +1 stage(s) 602 678 78 0.0 601969935.0 1.0X +2 stage(s) 1150 1277 180 0.0 1149781938.0 0.5X +3 stage(s) 1992 2072 113 0.0 1992188122.0 0.3X Stage Count Stage Proc. Time Aggreg. 
Time - 1 339 58 - 2 333 213 - 3 376 256 + 1 339 53 + 2 344 156 + 3 387 299 diff --git a/sql/core/benchmarks/MetricsAggregationBenchmark-results.txt b/sql/core/benchmarks/MetricsAggregationBenchmark-results.txt index 5ca7125aa3bc0..d3e2611541552 100644 --- a/sql/core/benchmarks/MetricsAggregationBenchmark-results.txt +++ b/sql/core/benchmarks/MetricsAggregationBenchmark-results.txt @@ -1,12 +1,12 @@ -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor metrics aggregation (50 metrics, 100000 tasks per stage): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ---------------------------------------------------------------------------------------------------------------------------------------- -1 stage(s) 684 710 28 0.0 683720517.0 1.0X -2 stage(s) 1368 1407 55 0.0 1367925138.0 0.5X -3 stage(s) 1606 1850 346 0.0 1605768734.0 0.4X +1 stage(s) 667 689 20 0.0 666605773.0 1.0X +2 stage(s) 1366 1403 52 0.0 1366457850.0 0.5X +3 stage(s) 2087 2127 57 0.0 2086587364.0 0.3X Stage Count Stage Proc. Time Aggreg. 
Time - 1 306 92 - 2 437 150 - 3 368 219 + 1 388 98 + 2 346 232 + 3 384 244 diff --git a/sql/core/benchmarks/MiscBenchmark-jdk21-results.txt b/sql/core/benchmarks/MiscBenchmark-jdk21-results.txt index 7228d0a184011..0bccc882d5d0d 100644 --- a/sql/core/benchmarks/MiscBenchmark-jdk21-results.txt +++ b/sql/core/benchmarks/MiscBenchmark-jdk21-results.txt @@ -2,126 +2,126 @@ filter & aggregate without group ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor range/filter/sum: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -range/filter/sum wholestage off 36694 36725 44 57.2 17.5 1.0X -range/filter/sum wholestage on 3483 3597 69 602.2 1.7 10.5X +range/filter/sum wholestage off 35356 36325 1371 59.3 16.9 1.0X +range/filter/sum wholestage on 2827 3470 362 741.9 1.3 12.5X ================================================================================================ range/limit/sum ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor range/limit/sum: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -range/limit/sum wholestage off 70 95 35 7458.5 0.1 1.0X -range/limit/sum wholestage on 66 82 13 7909.4 0.1 1.1X +range/limit/sum wholestage off 59 60 1 8883.8 0.1 1.0X +range/limit/sum wholestage on 66 70 7 7984.2 0.1 0.9X 
================================================================================================ sample ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor sample with replacement: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -sample with replacement wholestage off 8132 8156 35 16.1 62.0 1.0X -sample with replacement wholestage on 5075 5185 154 25.8 38.7 1.6X +sample with replacement wholestage off 7811 7961 212 16.8 59.6 1.0X +sample with replacement wholestage on 5125 5152 40 25.6 39.1 1.5X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor sample without replacement: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------- -sample without replacement wholestage off 1885 1893 12 69.5 14.4 1.0X -sample without replacement wholestage on 651 668 20 201.2 5.0 2.9X +sample without replacement wholestage off 1837 1839 3 71.4 14.0 1.0X +sample without replacement wholestage on 660 672 10 198.5 5.0 2.8X ================================================================================================ collect ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor collect: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
------------------------------------------------------------------------------------------------------------------------ -collect 1 million 159 232 53 6.6 151.6 1.0X -collect 2 millions 295 441 85 3.6 281.2 0.5X -collect 4 millions 818 832 12 1.3 780.2 0.2X +collect 1 million 159 228 64 6.6 151.8 1.0X +collect 2 millions 331 404 65 3.2 316.1 0.5X +collect 4 millions 743 912 148 1.4 708.4 0.2X ================================================================================================ collect limit ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor collect limit: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -collect limit 1 million 147 224 53 7.1 140.6 1.0X -collect limit 2 millions 301 404 86 3.5 287.3 0.5X +collect limit 1 million 161 240 54 6.5 153.6 1.0X +collect limit 2 millions 302 421 79 3.5 287.8 0.5X ================================================================================================ generate explode ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor generate explode array: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -generate explode array wholestage off 12316 12347 45 1.4 734.1 1.0X -generate explode array wholestage on 2800 2856 69 6.0 166.9 4.4X +generate explode array wholestage off 12439 12453 20 1.3 741.4 1.0X +generate explode array 
wholestage on 2842 3047 135 5.9 169.4 4.4X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor generate explode map: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -generate explode map wholestage off 23670 23767 138 0.7 1410.8 1.0X -generate explode map wholestage on 9745 9872 100 1.7 580.8 2.4X +generate explode map wholestage off 23498 23624 178 0.7 1400.6 1.0X +generate explode map wholestage on 9976 10151 128 1.7 594.6 2.4X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor generate posexplode array: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -generate posexplode array wholestage off 12583 12709 178 1.3 750.0 1.0X -generate posexplode array wholestage on 2992 3053 67 5.6 178.3 4.2X +generate posexplode array wholestage off 12883 13108 318 1.3 767.9 1.0X +generate posexplode array wholestage on 2971 3070 66 5.6 177.1 4.3X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor generate inline array: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -generate inline array wholestage off 6914 6972 81 2.4 412.1 1.0X -generate inline array wholestage on 2418 2524 90 6.9 144.1 2.9X +generate inline array wholestage off 7289 7311 31 2.3 434.5 1.0X +generate inline array wholestage on 2378 2456 100 7.1 141.7 3.1X -OpenJDK 
64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor generate big struct array: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -generate big struct array wholestage off 200 216 22 0.3 3334.6 1.0X -generate big struct array wholestage on 164 180 13 0.4 2733.2 1.2X +generate big struct array wholestage off 191 208 25 0.3 3181.0 1.0X +generate big struct array wholestage on 155 179 24 0.4 2575.5 1.2X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor generate big nested struct array: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------- -generate big nested struct array wholestage off 18684 18738 77 0.0 311401.3 1.0X -generate big nested struct array wholestage on 152 161 10 0.4 2533.3 122.9X +generate big nested struct array wholestage off 17003 17476 669 0.0 283383.8 1.0X +generate big nested struct array wholestage on 146 149 3 0.4 2436.9 116.3X ================================================================================================ generate regular generator ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor generate stack: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -generate stack wholestage off 14179 14202 33 1.2 845.1 
1.0X -generate stack wholestage on 3091 3114 26 5.4 184.2 4.6X +generate stack wholestage off 14560 14589 42 1.2 867.8 1.0X +generate stack wholestage on 3365 3428 53 5.0 200.6 4.3X diff --git a/sql/core/benchmarks/MiscBenchmark-results.txt b/sql/core/benchmarks/MiscBenchmark-results.txt index 8a3e9921dbe4b..8df4b3a8c4d0a 100644 --- a/sql/core/benchmarks/MiscBenchmark-results.txt +++ b/sql/core/benchmarks/MiscBenchmark-results.txt @@ -2,126 +2,126 @@ filter & aggregate without group ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor range/filter/sum: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -range/filter/sum wholestage off 33428 35668 3169 62.7 15.9 1.0X -range/filter/sum wholestage on 2842 3756 511 737.8 1.4 11.8X +range/filter/sum wholestage off 36850 37568 1016 56.9 17.6 1.0X +range/filter/sum wholestage on 3456 3738 158 606.8 1.6 10.7X ================================================================================================ range/limit/sum ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor range/limit/sum: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -range/limit/sum wholestage off 98 107 13 5332.3 0.2 1.0X -range/limit/sum wholestage on 67 77 11 7806.1 0.1 1.5X +range/limit/sum wholestage off 111 116 7 4736.2 0.2 1.0X +range/limit/sum wholestage on 91 
94 3 5739.7 0.2 1.2X ================================================================================================ sample ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor sample with replacement: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -sample with replacement wholestage off 8058 8215 221 16.3 61.5 1.0X -sample with replacement wholestage on 4994 5005 8 26.2 38.1 1.6X +sample with replacement wholestage off 7793 7904 158 16.8 59.5 1.0X +sample with replacement wholestage on 4935 5027 54 26.6 37.7 1.6X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor sample without replacement: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------- -sample without replacement wholestage off 1974 1983 13 66.4 15.1 1.0X -sample without replacement wholestage on 701 713 12 186.9 5.4 2.8X +sample without replacement wholestage off 1857 1874 24 70.6 14.2 1.0X +sample without replacement wholestage on 707 721 14 185.3 5.4 2.6X ================================================================================================ collect ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor collect: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
------------------------------------------------------------------------------------------------------------------------ -collect 1 million 170 227 81 6.2 161.8 1.0X -collect 2 millions 361 470 71 2.9 344.1 0.5X -collect 4 millions 727 753 33 1.4 693.1 0.2X +collect 1 million 161 231 79 6.5 153.9 1.0X +collect 2 millions 307 439 78 3.4 292.6 0.5X +collect 4 millions 692 771 82 1.5 660.2 0.2X ================================================================================================ collect limit ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor collect limit: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -collect limit 1 million 153 229 120 6.9 145.6 1.0X -collect limit 2 millions 283 420 139 3.7 269.6 0.5X +collect limit 1 million 156 223 80 6.7 148.9 1.0X +collect limit 2 millions 290 412 108 3.6 276.8 0.5X ================================================================================================ generate explode ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor generate explode array: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -generate explode array wholestage off 13769 13835 94 1.2 820.7 1.0X -generate explode array wholestage on 2901 2973 67 5.8 172.9 4.7X +generate explode array wholestage off 12014 12148 189 1.4 716.1 1.0X +generate explode array 
wholestage on 2952 3034 65 5.7 175.9 4.1X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor generate explode map: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -generate explode map wholestage off 25526 25775 352 0.7 1521.4 1.0X -generate explode map wholestage on 9201 9259 68 1.8 548.4 2.8X +generate explode map wholestage off 24447 24465 25 0.7 1457.1 1.0X +generate explode map wholestage on 9883 9992 65 1.7 589.1 2.5X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor generate posexplode array: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -generate posexplode array wholestage off 14288 14334 65 1.2 851.6 1.0X -generate posexplode array wholestage on 2959 3006 49 5.7 176.3 4.8X +generate posexplode array wholestage off 12403 12597 274 1.4 739.3 1.0X +generate posexplode array wholestage on 2935 3002 62 5.7 174.9 4.2X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor generate inline array: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -generate inline array wholestage off 7176 7181 8 2.3 427.7 1.0X -generate inline array wholestage on 2383 2471 61 7.0 142.0 3.0X +generate inline array wholestage off 6683 6740 81 2.5 398.4 1.0X +generate inline array wholestage on 2455 2517 45 6.8 146.4 2.7X -OpenJDK 
64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor generate big struct array: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -generate big struct array wholestage off 209 296 123 0.3 3483.4 1.0X -generate big struct array wholestage on 178 188 11 0.3 2965.6 1.2X +generate big struct array wholestage off 236 250 20 0.3 3938.8 1.0X +generate big struct array wholestage on 185 217 27 0.3 3089.8 1.3X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor generate big nested struct array: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------- -generate big nested struct array wholestage off 18690 20677 2809 0.0 311503.1 1.0X -generate big nested struct array wholestage on 172 186 15 0.3 2860.7 108.9X +generate big nested struct array wholestage off 18566 21321 3897 0.0 309426.4 1.0X +generate big nested struct array wholestage on 188 206 33 0.3 3132.8 98.8X ================================================================================================ generate regular generator ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor generate stack: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -generate stack wholestage off 15372 15414 60 
1.1 916.2 1.0X -generate stack wholestage on 3053 3069 15 5.5 182.0 5.0X +generate stack wholestage off 13373 13401 39 1.3 797.1 1.0X +generate stack wholestage on 3053 3068 13 5.5 182.0 4.4X diff --git a/sql/core/benchmarks/OrcNestedSchemaPruningBenchmark-jdk21-results.txt b/sql/core/benchmarks/OrcNestedSchemaPruningBenchmark-jdk21-results.txt index d54a37baa5770..922ec22d5e0af 100644 --- a/sql/core/benchmarks/OrcNestedSchemaPruningBenchmark-jdk21-results.txt +++ b/sql/core/benchmarks/OrcNestedSchemaPruningBenchmark-jdk21-results.txt @@ -2,52 +2,52 @@ Nested Schema Pruning Benchmark For ORC v1 ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Selection: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Top-level column 64 85 12 15.7 63.6 1.0X -Nested column 58 65 7 17.3 57.9 1.1X -Nested column in array 165 170 5 6.1 164.6 0.4X +Top-level column 57 69 8 17.6 56.7 1.0X +Nested column 55 64 6 18.1 55.3 1.0X +Nested column in array 165 174 6 6.1 165.3 0.3X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Limiting: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Top-level column 236 264 18 4.2 235.7 1.0X -Nested column 241 259 11 4.2 240.7 1.0X -Nested column in array 518 537 11 1.9 518.3 0.5X +Top-level column 237 250 15 4.2 236.6 1.0X +Nested column 241 255 12 4.1 241.1 1.0X +Nested column in array 527 544 16 1.9 527.0 0.4X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 
6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Repartitioning: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Top-level column 219 230 7 4.6 218.5 1.0X -Nested column 224 235 10 4.5 223.6 1.0X -Nested column in array 483 487 5 2.1 482.6 0.5X +Top-level column 207 219 8 4.8 206.6 1.0X +Nested column 218 229 7 4.6 218.1 0.9X +Nested column in array 476 480 4 2.1 476.0 0.4X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Repartitioning by exprs: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Top-level column 218 226 5 4.6 218.5 1.0X -Nested column 240 251 7 4.2 240.2 0.9X -Nested column in array 511 515 4 2.0 510.7 0.4X +Top-level column 207 219 8 4.8 206.6 1.0X +Nested column 228 244 15 4.4 227.7 0.9X +Nested column in array 504 508 4 2.0 503.9 0.4X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Sample: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Top-level column 65 78 13 15.5 64.6 1.0X -Nested column 74 89 15 13.5 74.0 0.9X -Nested column in array 200 219 16 5.0 199.9 0.3X +Top-level column 65 91 21 15.3 65.2 1.0X +Nested column 75 86 6 13.4 74.6 0.9X +Nested column in array 207 234 16 4.8 206.6 0.3X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 
64-Core Processor Sorting: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Top-level column 302 318 22 3.3 301.5 1.0X -Nested column 353 368 17 2.8 352.9 0.9X -Nested column in array 720 755 35 1.4 720.5 0.4X +Top-level column 292 305 14 3.4 291.9 1.0X +Nested column 344 357 18 2.9 344.2 0.8X +Nested column in array 713 760 41 1.4 712.9 0.4X diff --git a/sql/core/benchmarks/OrcNestedSchemaPruningBenchmark-results.txt b/sql/core/benchmarks/OrcNestedSchemaPruningBenchmark-results.txt index 77a9e92525691..262ce4adc26e0 100644 --- a/sql/core/benchmarks/OrcNestedSchemaPruningBenchmark-results.txt +++ b/sql/core/benchmarks/OrcNestedSchemaPruningBenchmark-results.txt @@ -2,52 +2,52 @@ Nested Schema Pruning Benchmark For ORC v1 ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Selection: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Top-level column 67 83 11 14.8 67.4 1.0X -Nested column 58 65 7 17.3 57.6 1.2X -Nested column in array 158 163 5 6.3 158.2 0.4X +Top-level column 64 82 12 15.7 63.7 1.0X +Nested column 58 67 6 17.2 58.1 1.1X +Nested column in array 162 169 4 6.2 161.6 0.4X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Limiting: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Top-level column 238 263 18 4.2 237.5 1.0X 
-Nested column 237 255 12 4.2 237.2 1.0X -Nested column in array 523 534 6 1.9 523.3 0.5X +Top-level column 231 253 18 4.3 230.7 1.0X +Nested column 233 252 13 4.3 233.0 1.0X +Nested column in array 501 520 19 2.0 500.6 0.5X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Repartitioning: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Top-level column 212 221 8 4.7 212.3 1.0X -Nested column 219 230 9 4.6 219.4 1.0X -Nested column in array 470 477 5 2.1 470.4 0.5X +Top-level column 207 213 4 4.8 207.1 1.0X +Nested column 210 219 6 4.8 209.8 1.0X +Nested column in array 448 452 2 2.2 448.2 0.5X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Repartitioning by exprs: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Top-level column 213 218 6 4.7 213.2 1.0X -Nested column 237 245 7 4.2 236.6 0.9X -Nested column in array 504 510 6 2.0 503.8 0.4X +Top-level column 202 211 8 4.9 202.3 1.0X +Nested column 227 233 5 4.4 227.4 0.9X +Nested column in array 480 487 7 2.1 480.2 0.4X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Sample: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Top-level column 66 78 8 15.1 66.4 1.0X -Nested column 73 84 12 13.7 73.0 0.9X -Nested column in array 202 234 31 4.9 202.3 0.3X 
+Top-level column 61 77 10 16.5 60.7 1.0X +Nested column 74 90 15 13.4 74.5 0.8X +Nested column in array 218 239 12 4.6 218.5 0.3X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Sorting: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Top-level column 304 317 12 3.3 303.9 1.0X -Nested column 350 358 7 2.9 350.3 0.9X -Nested column in array 722 730 7 1.4 721.8 0.4X +Top-level column 289 300 9 3.5 289.2 1.0X +Nested column 340 348 4 2.9 340.4 0.8X +Nested column in array 685 699 11 1.5 685.2 0.4X diff --git a/sql/core/benchmarks/OrcV2NestedSchemaPruningBenchmark-jdk21-results.txt b/sql/core/benchmarks/OrcV2NestedSchemaPruningBenchmark-jdk21-results.txt index 5585eabfe717b..4676ae5a6bb3c 100644 --- a/sql/core/benchmarks/OrcV2NestedSchemaPruningBenchmark-jdk21-results.txt +++ b/sql/core/benchmarks/OrcV2NestedSchemaPruningBenchmark-jdk21-results.txt @@ -2,52 +2,52 @@ Nested Schema Pruning Benchmark For ORC v2 ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Selection: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Top-level column 66 84 14 15.1 66.0 1.0X -Nested column 57 68 7 17.6 56.7 1.2X -Nested column in array 167 173 5 6.0 166.9 0.4X +Top-level column 59 73 9 17.1 58.5 1.0X +Nested column 58 67 7 17.4 57.5 1.0X +Nested column in array 171 177 4 5.9 170.9 0.3X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS 
on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Limiting: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Top-level column 243 260 12 4.1 242.6 1.0X -Nested column 245 260 12 4.1 244.9 1.0X -Nested column in array 515 532 12 1.9 514.8 0.5X +Top-level column 233 256 18 4.3 232.9 1.0X +Nested column 229 241 8 4.4 229.5 1.0X +Nested column in array 520 551 30 1.9 520.3 0.4X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Repartitioning: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Top-level column 218 230 7 4.6 218.1 1.0X -Nested column 223 234 14 4.5 223.4 1.0X -Nested column in array 472 484 7 2.1 472.1 0.5X +Top-level column 203 213 6 4.9 203.0 1.0X +Nested column 208 219 6 4.8 207.8 1.0X +Nested column in array 467 476 4 2.1 467.3 0.4X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Repartitioning by exprs: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Top-level column 213 231 14 4.7 213.3 1.0X -Nested column 240 252 8 4.2 240.1 0.9X -Nested column in array 510 514 3 2.0 509.7 0.4X +Top-level column 201 206 5 5.0 201.2 1.0X +Nested column 238 244 4 4.2 238.0 0.8X +Nested column in array 504 522 30 2.0 503.6 0.4X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Sample: Best Time(ms) Avg Time(ms) 
Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Top-level column 65 77 10 15.3 65.3 1.0X -Nested column 74 89 15 13.6 73.7 0.9X -Nested column in array 206 242 28 4.8 206.2 0.3X +Top-level column 53 76 10 18.9 53.0 1.0X +Nested column 73 87 9 13.6 73.5 0.7X +Nested column in array 208 247 24 4.8 208.4 0.3X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Sorting: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Top-level column 301 316 21 3.3 300.7 1.0X -Nested column 361 378 21 2.8 360.9 0.8X -Nested column in array 723 765 42 1.4 722.8 0.4X +Top-level column 289 297 12 3.5 289.3 1.0X +Nested column 334 344 14 3.0 333.8 0.9X +Nested column in array 702 767 41 1.4 701.8 0.4X diff --git a/sql/core/benchmarks/OrcV2NestedSchemaPruningBenchmark-results.txt b/sql/core/benchmarks/OrcV2NestedSchemaPruningBenchmark-results.txt index e2eba2b51fb49..c7825eb9f4a8f 100644 --- a/sql/core/benchmarks/OrcV2NestedSchemaPruningBenchmark-results.txt +++ b/sql/core/benchmarks/OrcV2NestedSchemaPruningBenchmark-results.txt @@ -2,52 +2,52 @@ Nested Schema Pruning Benchmark For ORC v2 ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Selection: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Top-level column 66 83 11 15.2 66.0 1.0X -Nested column 60 68 7 16.6 60.4 1.1X -Nested 
column in array 165 170 3 6.1 164.6 0.4X +Top-level column 62 78 12 16.2 61.8 1.0X +Nested column 58 65 5 17.3 57.8 1.1X +Nested column in array 164 173 6 6.1 164.1 0.4X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Limiting: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Top-level column 252 265 16 4.0 252.0 1.0X -Nested column 243 258 14 4.1 242.8 1.0X -Nested column in array 503 532 14 2.0 503.4 0.5X +Top-level column 232 252 15 4.3 231.9 1.0X +Nested column 230 243 8 4.3 229.9 1.0X +Nested column in array 498 525 22 2.0 497.6 0.5X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Repartitioning: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Top-level column 216 221 5 4.6 216.1 1.0X -Nested column 221 228 5 4.5 220.8 1.0X -Nested column in array 471 476 3 2.1 470.9 0.5X +Top-level column 210 215 6 4.8 209.9 1.0X +Nested column 210 220 9 4.8 210.3 1.0X +Nested column in array 449 455 5 2.2 449.2 0.5X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Repartitioning by exprs: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Top-level column 213 219 5 4.7 213.2 1.0X -Nested column 238 247 7 4.2 237.9 0.9X -Nested column in array 504 510 5 2.0 503.9 0.4X +Top-level column 204 215 23 4.9 203.7 1.0X +Nested 
column 227 232 4 4.4 226.9 0.9X +Nested column in array 483 491 5 2.1 483.4 0.4X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Sample: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Top-level column 60 76 7 16.5 60.4 1.0X -Nested column 71 89 15 14.2 70.5 0.9X -Nested column in array 202 242 47 4.9 202.5 0.3X +Top-level column 69 88 23 14.5 68.8 1.0X +Nested column 73 92 16 13.8 72.7 0.9X +Nested column in array 223 242 15 4.5 223.2 0.3X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Sorting: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Top-level column 309 317 12 3.2 309.4 1.0X -Nested column 355 362 5 2.8 354.8 0.9X -Nested column in array 710 732 19 1.4 710.5 0.4X +Top-level column 290 300 7 3.4 290.0 1.0X +Nested column 342 348 5 2.9 341.9 0.8X +Nested column in array 707 714 7 1.4 706.7 0.4X diff --git a/sql/core/benchmarks/ParquetNestedPredicatePushDownBenchmark-jdk21-results.txt b/sql/core/benchmarks/ParquetNestedPredicatePushDownBenchmark-jdk21-results.txt index 743331fb4dae2..11fbaf8abd6cd 100644 --- a/sql/core/benchmarks/ParquetNestedPredicatePushDownBenchmark-jdk21-results.txt +++ b/sql/core/benchmarks/ParquetNestedPredicatePushDownBenchmark-jdk21-results.txt @@ -1,21 +1,21 @@ -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Can skip all row groups: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
------------------------------------------------------------------------------------------------------------------------ -Without nested predicate Pushdown 6375 6430 47 16.4 60.8 1.0X -With nested predicate Pushdown 50 65 14 2093.7 0.5 127.3X +Without nested predicate Pushdown 6543 6575 34 16.0 62.4 1.0X +With nested predicate Pushdown 53 69 9 1979.9 0.5 123.6X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Can skip some row groups: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Without nested predicate Pushdown 6877 6916 20 15.2 65.6 1.0X -With nested predicate Pushdown 45 60 10 2345.3 0.4 153.8X +Without nested predicate Pushdown 7146 7174 30 14.7 68.1 1.0X +With nested predicate Pushdown 48 61 11 2176.9 0.5 148.3X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Can skip no row groups: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Without nested predicate Pushdown 13281 13345 48 7.9 126.7 1.0X -With nested predicate Pushdown 13310 13352 34 7.9 126.9 1.0X +Without nested predicate Pushdown 13410 13505 61 7.8 127.9 1.0X +With nested predicate Pushdown 13459 13550 71 7.8 128.4 1.0X diff --git a/sql/core/benchmarks/ParquetNestedPredicatePushDownBenchmark-results.txt b/sql/core/benchmarks/ParquetNestedPredicatePushDownBenchmark-results.txt index f6a914114a017..aecea8e69fe1d 100644 --- a/sql/core/benchmarks/ParquetNestedPredicatePushDownBenchmark-results.txt +++ b/sql/core/benchmarks/ParquetNestedPredicatePushDownBenchmark-results.txt @@ -1,21 +1,21 @@ -OpenJDK 
64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Can skip all row groups: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Without nested predicate Pushdown 7157 7297 99 14.7 68.3 1.0X -With nested predicate Pushdown 82 99 13 1279.1 0.8 87.3X +Without nested predicate Pushdown 7218 7274 50 14.5 68.8 1.0X +With nested predicate Pushdown 57 78 12 1844.1 0.5 126.9X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Can skip some row groups: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Without nested predicate Pushdown 7499 7833 119 14.0 71.5 1.0X -With nested predicate Pushdown 61 74 6 1714.3 0.6 122.6X +Without nested predicate Pushdown 7511 7878 130 14.0 71.6 1.0X +With nested predicate Pushdown 68 79 10 1535.8 0.7 110.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Can skip no row groups: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Without nested predicate Pushdown 14150 14216 75 7.4 134.9 1.0X -With nested predicate Pushdown 14150 14221 50 7.4 134.9 1.0X +Without nested predicate Pushdown 14244 14289 41 7.4 135.8 1.0X +With nested predicate Pushdown 14288 14318 24 7.3 136.3 1.0X diff --git a/sql/core/benchmarks/ParquetNestedSchemaPruningBenchmark-jdk21-results.txt 
b/sql/core/benchmarks/ParquetNestedSchemaPruningBenchmark-jdk21-results.txt index 9f48b560d615a..e2dde58903157 100644 --- a/sql/core/benchmarks/ParquetNestedSchemaPruningBenchmark-jdk21-results.txt +++ b/sql/core/benchmarks/ParquetNestedSchemaPruningBenchmark-jdk21-results.txt @@ -2,52 +2,52 @@ Nested Schema Pruning Benchmark For Parquet ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Selection: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Top-level column 66 82 12 15.2 66.0 1.0X -Nested column 67 77 6 15.0 66.6 1.0X -Nested column in array 230 239 8 4.4 229.7 0.3X +Top-level column 65 77 12 15.5 64.7 1.0X +Nested column 66 74 12 15.1 66.3 1.0X +Nested column in array 221 255 19 4.5 220.9 0.3X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Limiting: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Top-level column 248 266 13 4.0 247.7 1.0X -Nested column 242 259 12 4.1 242.3 1.0X -Nested column in array 564 594 22 1.8 563.9 0.4X +Top-level column 242 257 11 4.1 241.8 1.0X +Nested column 240 258 17 4.2 239.7 1.0X +Nested column in array 560 588 25 1.8 560.5 0.4X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Repartitioning: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
------------------------------------------------------------------------------------------------------------------------ -Top-level column 218 229 7 4.6 218.4 1.0X -Nested column 222 234 10 4.5 221.8 1.0X -Nested column in array 521 537 8 1.9 521.2 0.4X +Top-level column 218 227 14 4.6 218.1 1.0X +Nested column 220 230 14 4.5 219.9 1.0X +Nested column in array 509 525 14 2.0 509.5 0.4X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Repartitioning by exprs: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Top-level column 216 224 7 4.6 216.4 1.0X -Nested column 237 251 10 4.2 236.9 0.9X -Nested column in array 559 568 7 1.8 558.7 0.4X +Top-level column 216 228 13 4.6 215.9 1.0X +Nested column 242 254 12 4.1 241.8 0.9X +Nested column in array 542 559 16 1.8 542.4 0.4X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Sample: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Top-level column 74 91 10 13.4 74.4 1.0X -Nested column 81 98 12 12.3 81.2 0.9X -Nested column in array 264 287 18 3.8 264.1 0.3X +Top-level column 73 85 13 13.8 72.6 1.0X +Nested column 75 96 17 13.3 75.2 1.0X +Nested column in array 274 310 31 3.6 274.3 0.3X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Sorting: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
------------------------------------------------------------------------------------------------------------------------ -Top-level column 303 370 49 3.3 302.9 1.0X -Nested column 432 447 11 2.3 432.0 0.7X -Nested column in array 779 833 37 1.3 779.2 0.4X +Top-level column 298 314 22 3.4 298.2 1.0X +Nested column 339 353 17 2.9 339.3 0.9X +Nested column in array 757 781 30 1.3 757.3 0.4X diff --git a/sql/core/benchmarks/ParquetNestedSchemaPruningBenchmark-results.txt b/sql/core/benchmarks/ParquetNestedSchemaPruningBenchmark-results.txt index 094a254580f30..87e3441303e76 100644 --- a/sql/core/benchmarks/ParquetNestedSchemaPruningBenchmark-results.txt +++ b/sql/core/benchmarks/ParquetNestedSchemaPruningBenchmark-results.txt @@ -2,52 +2,52 @@ Nested Schema Pruning Benchmark For Parquet ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Selection: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Top-level column 68 84 11 14.6 68.4 1.0X -Nested column 69 76 5 14.5 69.2 1.0X -Nested column in array 224 229 5 4.5 224.0 0.3X +Top-level column 75 98 14 13.3 75.0 1.0X +Nested column 67 77 8 15.0 66.8 1.1X +Nested column in array 226 233 6 4.4 225.7 0.3X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Limiting: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Top-level column 241 267 25 4.1 241.1 1.0X -Nested column 244 258 10 4.1 243.8 1.0X -Nested column in array 562 583 18 
1.8 562.2 0.4X +Top-level column 250 274 20 4.0 250.3 1.0X +Nested column 244 261 14 4.1 244.3 1.0X +Nested column in array 552 569 8 1.8 551.9 0.5X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Repartitioning: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Top-level column 220 226 3 4.5 220.2 1.0X -Nested column 223 230 8 4.5 223.0 1.0X -Nested column in array 525 530 4 1.9 525.2 0.4X +Top-level column 217 223 5 4.6 217.3 1.0X +Nested column 223 231 6 4.5 222.8 1.0X +Nested column in array 513 521 10 1.9 513.1 0.4X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Repartitioning by exprs: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Top-level column 216 221 5 4.6 216.0 1.0X -Nested column 242 252 6 4.1 242.0 0.9X -Nested column in array 561 565 4 1.8 561.1 0.4X +Top-level column 215 220 3 4.7 215.0 1.0X +Nested column 240 244 3 4.2 239.7 0.9X +Nested column in array 551 559 10 1.8 551.1 0.4X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Sample: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Top-level column 74 84 8 13.5 73.8 1.0X -Nested column 79 94 12 12.6 79.4 0.9X -Nested column in array 263 283 13 3.8 263.3 0.3X +Top-level column 78 91 12 12.8 78.3 1.0X +Nested column 84 96 9 11.9 84.0 0.9X 
+Nested column in array 285 301 11 3.5 284.6 0.3X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Sorting: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Top-level column 309 323 11 3.2 309.0 1.0X -Nested column 358 367 7 2.8 358.3 0.9X -Nested column in array 771 793 23 1.3 770.7 0.4X +Top-level column 303 315 10 3.3 302.6 1.0X +Nested column 356 360 3 2.8 356.5 0.8X +Nested column in array 759 778 14 1.3 758.8 0.4X diff --git a/sql/core/benchmarks/PrimitiveArrayBenchmark-jdk21-results.txt b/sql/core/benchmarks/PrimitiveArrayBenchmark-jdk21-results.txt index 3c57cee485c54..fec73347a3711 100644 --- a/sql/core/benchmarks/PrimitiveArrayBenchmark-jdk21-results.txt +++ b/sql/core/benchmarks/PrimitiveArrayBenchmark-jdk21-results.txt @@ -2,11 +2,11 @@ Write primitive arrays in dataset ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Write an array in Dataset: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Int 211 241 25 39.8 25.2 1.0X -Double 287 303 12 29.3 34.2 0.7X +Int 165 203 26 50.9 19.7 1.0X +Double 226 252 21 37.1 27.0 0.7X diff --git a/sql/core/benchmarks/PrimitiveArrayBenchmark-results.txt b/sql/core/benchmarks/PrimitiveArrayBenchmark-results.txt index 1f8ea79f262be..50acc6af2e6f9 100644 --- a/sql/core/benchmarks/PrimitiveArrayBenchmark-results.txt +++ b/sql/core/benchmarks/PrimitiveArrayBenchmark-results.txt @@ -2,11 +2,11 @@ Write primitive arrays in 
dataset ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Write an array in Dataset: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Int 256 291 24 32.8 30.5 1.0X -Double 305 327 15 27.5 36.4 0.8X +Int 199 230 18 42.2 23.7 1.0X +Double 282 290 8 29.7 33.6 0.7X diff --git a/sql/core/benchmarks/RangeBenchmark-jdk21-results.txt b/sql/core/benchmarks/RangeBenchmark-jdk21-results.txt index 33fa0ff972d15..fe289c99daef8 100644 --- a/sql/core/benchmarks/RangeBenchmark-jdk21-results.txt +++ b/sql/core/benchmarks/RangeBenchmark-jdk21-results.txt @@ -2,14 +2,14 @@ range ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor range: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -full scan 10051 10241 242 52.2 19.2 1.0X -limit after range 36 48 10 14566.1 0.1 279.3X -filter after range 1003 1012 7 522.9 1.9 10.0X -count after range 344 371 42 1522.3 0.7 29.2X -count after limit after range 40 50 10 13166.5 0.1 252.4X +full scan 10142 10196 93 51.7 19.3 1.0X +limit after range 44 47 3 12012.9 0.1 232.4X +filter after range 1005 1019 15 521.5 1.9 10.1X +count after range 345 354 9 1519.4 0.7 29.4X +count after limit after range 48 52 4 10846.5 0.1 209.8X diff --git a/sql/core/benchmarks/RangeBenchmark-results.txt b/sql/core/benchmarks/RangeBenchmark-results.txt index 
faca550c9e2d5..fdae5cc68e8ce 100644 --- a/sql/core/benchmarks/RangeBenchmark-results.txt +++ b/sql/core/benchmarks/RangeBenchmark-results.txt @@ -2,14 +2,14 @@ range ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor range: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -full scan 9920 10204 190 52.9 18.9 1.0X -limit after range 44 51 7 11786.7 0.1 223.0X -filter after range 1011 1031 16 518.5 1.9 9.8X -count after range 370 372 2 1417.0 0.7 26.8X -count after limit after range 47 49 1 11082.6 0.1 209.7X +full scan 9433 9964 355 55.6 18.0 1.0X +limit after range 53 56 2 9946.9 0.1 179.0X +filter after range 1004 1039 48 522.3 1.9 9.4X +count after range 348 353 6 1508.6 0.7 27.1X +count after limit after range 65 72 10 8068.8 0.1 145.2X diff --git a/sql/core/benchmarks/SetOperationsBenchmark-jdk21-results.txt b/sql/core/benchmarks/SetOperationsBenchmark-jdk21-results.txt new file mode 100644 index 0000000000000..37a2d749eb195 --- /dev/null +++ b/sql/core/benchmarks/SetOperationsBenchmark-jdk21-results.txt @@ -0,0 +1,13 @@ +================================================================================================ +Set Operations Benchmark +================================================================================================ + +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure +AMD EPYC 7763 64-Core Processor +Parsing + Analysis: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +UNION ALL 319 415 124 0.0 21283.9 1.0X +EXCEPT ALL 259 266 7 0.1 17287.8 
1.2X +INTERSECT ALL 257 263 4 0.1 17101.2 1.2X + + diff --git a/sql/core/benchmarks/SetOperationsBenchmark-results.txt b/sql/core/benchmarks/SetOperationsBenchmark-results.txt new file mode 100644 index 0000000000000..cb944d1d61fd0 --- /dev/null +++ b/sql/core/benchmarks/SetOperationsBenchmark-results.txt @@ -0,0 +1,13 @@ +================================================================================================ +Set Operations Benchmark +================================================================================================ + +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure +AMD EPYC 7763 64-Core Processor +Parsing + Analysis: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +UNION ALL 325 445 92 0.0 21641.7 1.0X +EXCEPT ALL 272 277 5 0.1 18110.7 1.2X +INTERSECT ALL 269 276 6 0.1 17938.3 1.2X + + diff --git a/sql/core/benchmarks/SortBenchmark-jdk21-results.txt b/sql/core/benchmarks/SortBenchmark-jdk21-results.txt index 4125c72bce4ab..683aa1f400f5c 100644 --- a/sql/core/benchmarks/SortBenchmark-jdk21-results.txt +++ b/sql/core/benchmarks/SortBenchmark-jdk21-results.txt @@ -2,15 +2,15 @@ radix sort ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor radix sort 25000000: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -reference TimSort key prefix array 8456 8460 5 3.0 338.3 1.0X -reference Arrays.sort 2041 2067 37 12.2 81.6 4.1X -radix sort one byte 68 76 8 368.5 2.7 124.6X -radix sort two bytes 125 133 7 200.4 5.0 67.8X -radix sort eight bytes 479 494 
17 52.2 19.2 17.7X -radix sort key prefix array 564 584 33 44.3 22.6 15.0X +reference TimSort key prefix array 8456 9400 1334 3.0 338.3 1.0X +reference Arrays.sort 2181 2207 37 11.5 87.2 3.9X +radix sort one byte 67 72 4 372.6 2.7 126.0X +radix sort two bytes 123 129 6 202.7 4.9 68.6X +radix sort eight bytes 473 478 6 52.9 18.9 17.9X +radix sort key prefix array 575 585 10 43.4 23.0 14.7X diff --git a/sql/core/benchmarks/SortBenchmark-results.txt b/sql/core/benchmarks/SortBenchmark-results.txt index 533049edd2237..49205fbe84798 100644 --- a/sql/core/benchmarks/SortBenchmark-results.txt +++ b/sql/core/benchmarks/SortBenchmark-results.txt @@ -2,15 +2,15 @@ radix sort ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor radix sort 25000000: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -reference TimSort key prefix array 8140 8157 23 3.1 325.6 1.0X -reference Arrays.sort 2063 2087 35 12.1 82.5 3.9X -radix sort one byte 64 73 6 393.0 2.5 128.0X -radix sort two bytes 116 129 8 216.1 4.6 70.4X -radix sort eight bytes 454 475 16 55.1 18.2 17.9X -radix sort key prefix array 885 896 11 28.3 35.4 9.2X +reference TimSort key prefix array 8207 8241 48 3.0 328.3 1.0X +reference Arrays.sort 2069 2095 37 12.1 82.8 4.0X +radix sort one byte 62 71 5 400.4 2.5 131.4X +radix sort two bytes 127 137 6 196.3 5.1 64.4X +radix sort eight bytes 488 500 8 51.2 19.5 16.8X +radix sort key prefix array 929 930 2 26.9 37.1 8.8X diff --git a/sql/core/benchmarks/StateStoreBasicOperationsBenchmark-jdk21-results.txt b/sql/core/benchmarks/StateStoreBasicOperationsBenchmark-jdk21-results.txt index 4ab5f6d0061cc..9ec0af2d17a71 100644 --- 
a/sql/core/benchmarks/StateStoreBasicOperationsBenchmark-jdk21-results.txt +++ b/sql/core/benchmarks/StateStoreBasicOperationsBenchmark-jdk21-results.txt @@ -2,143 +2,143 @@ put rows ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor putting 10000 rows (10000 rows to overwrite - rate 100): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------------------- -In-memory 10 14 1 1.0 1006.5 1.0X -RocksDB (trackTotalNumberOfRows: true) 43 45 2 0.2 4345.4 0.2X -RocksDB (trackTotalNumberOfRows: false) 15 17 1 0.6 1547.6 0.7X +In-memory 10 12 1 1.0 1034.4 1.0X +RocksDB (trackTotalNumberOfRows: true) 45 47 2 0.2 4504.4 0.2X +RocksDB (trackTotalNumberOfRows: false) 16 17 1 0.6 1617.3 0.6X -OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor putting 10000 rows (5000 rows to overwrite - rate 50): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------------- -In-memory 10 12 1 1.0 1011.1 1.0X -RocksDB (trackTotalNumberOfRows: true) 44 46 1 0.2 4441.2 0.2X -RocksDB (trackTotalNumberOfRows: false) 15 16 1 0.7 1521.7 0.7X +In-memory 9 11 1 1.1 903.2 1.0X +RocksDB (trackTotalNumberOfRows: true) 46 47 1 0.2 4592.3 0.2X +RocksDB (trackTotalNumberOfRows: false) 16 17 1 0.6 1614.6 0.6X -OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor putting 10000 rows (1000 rows to overwrite - rate 10): 
Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------------- -In-memory 9 10 1 1.1 940.8 1.0X -RocksDB (trackTotalNumberOfRows: true) 44 46 1 0.2 4425.1 0.2X -RocksDB (trackTotalNumberOfRows: false) 15 16 1 0.7 1515.2 0.6X +In-memory 9 11 1 1.1 941.9 1.0X +RocksDB (trackTotalNumberOfRows: true) 46 47 1 0.2 4572.3 0.2X +RocksDB (trackTotalNumberOfRows: false) 16 17 1 0.6 1589.0 0.6X -OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor putting 10000 rows (0 rows to overwrite - rate 0): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------------- -In-memory 9 11 2 1.1 932.2 1.0X -RocksDB (trackTotalNumberOfRows: true) 44 46 1 0.2 4400.3 0.2X -RocksDB (trackTotalNumberOfRows: false) 15 17 1 0.7 1506.0 0.6X +In-memory 8 9 1 1.3 797.2 1.0X +RocksDB (trackTotalNumberOfRows: true) 45 46 1 0.2 4468.9 0.2X +RocksDB (trackTotalNumberOfRows: false) 16 17 1 0.6 1575.2 0.5X ================================================================================================ merge rows ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor merging 10000 rows with 10 values per key (10000 rows to overwrite - rate 100): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------------------------------------------------------------------------- -RocksDB (trackTotalNumberOfRows: true) 532 547 8 0.0 53154.1 
1.0X -RocksDB (trackTotalNumberOfRows: false) 174 180 3 0.1 17410.5 3.1X +RocksDB (trackTotalNumberOfRows: true) 566 584 6 0.0 56623.9 1.0X +RocksDB (trackTotalNumberOfRows: false) 175 185 3 0.1 17469.9 3.2X -OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor merging 10000 rows with 10 values per key (5000 rows to overwrite - rate 50): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------------------------------------ -RocksDB (trackTotalNumberOfRows: true) 472 484 5 0.0 47228.8 1.0X -RocksDB (trackTotalNumberOfRows: false) 174 179 3 0.1 17433.5 2.7X +RocksDB (trackTotalNumberOfRows: true) 488 502 5 0.0 48798.8 1.0X +RocksDB (trackTotalNumberOfRows: false) 177 184 3 0.1 17675.0 2.8X -OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor merging 10000 rows with 10 values per key (1000 rows to overwrite - rate 10): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------------------------------------ -RocksDB (trackTotalNumberOfRows: true) 422 434 5 0.0 42226.0 1.0X -RocksDB (trackTotalNumberOfRows: false) 172 179 3 0.1 17235.9 2.4X +RocksDB (trackTotalNumberOfRows: true) 424 437 6 0.0 42429.3 1.0X +RocksDB (trackTotalNumberOfRows: false) 176 182 3 0.1 17608.0 2.4X -OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor merging 10000 rows with 10 values per key (0 rows to overwrite - rate 0): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
-------------------------------------------------------------------------------------------------------------------------------------------------------- -RocksDB (trackTotalNumberOfRows: true) 406 419 7 0.0 40646.7 1.0X -RocksDB (trackTotalNumberOfRows: false) 173 179 3 0.1 17265.8 2.4X +RocksDB (trackTotalNumberOfRows: true) 406 420 6 0.0 40630.6 1.0X +RocksDB (trackTotalNumberOfRows: false) 171 178 3 0.1 17137.0 2.4X ================================================================================================ delete rows ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor trying to delete 10000 rows from 10000 rows(10000 rows are non-existing - rate 100): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------------------------------------------- -In-memory 0 1 0 27.0 37.0 1.0X -RocksDB (trackTotalNumberOfRows: true) 44 46 1 0.2 4447.0 0.0X -RocksDB (trackTotalNumberOfRows: false) 15 16 1 0.7 1453.0 0.0X +In-memory 0 1 0 26.6 37.6 1.0X +RocksDB (trackTotalNumberOfRows: true) 43 45 1 0.2 4303.7 0.0X +RocksDB (trackTotalNumberOfRows: false) 15 16 0 0.6 1543.9 0.0X -OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor trying to delete 10000 rows from 10000 rows(5000 rows are non-existing - rate 50): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ----------------------------------------------------------------------------------------------------------------------------------------------------------------- -In-memory 8 9 1 1.3 796.5 1.0X -RocksDB (trackTotalNumberOfRows: true) 44 45 1 0.2 4384.0 
0.2X -RocksDB (trackTotalNumberOfRows: false) 15 15 0 0.7 1463.5 0.5X +In-memory 7 7 1 1.5 650.9 1.0X +RocksDB (trackTotalNumberOfRows: true) 45 46 1 0.2 4469.5 0.1X +RocksDB (trackTotalNumberOfRows: false) 15 16 1 0.7 1496.4 0.4X -OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor trying to delete 10000 rows from 10000 rows(1000 rows are non-existing - rate 10): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ----------------------------------------------------------------------------------------------------------------------------------------------------------------- -In-memory 9 9 1 1.2 853.3 1.0X -RocksDB (trackTotalNumberOfRows: true) 43 44 1 0.2 4278.0 0.2X -RocksDB (trackTotalNumberOfRows: false) 15 15 1 0.7 1460.7 0.6X +In-memory 7 7 0 1.5 687.7 1.0X +RocksDB (trackTotalNumberOfRows: true) 44 45 1 0.2 4357.2 0.2X +RocksDB (trackTotalNumberOfRows: false) 15 16 1 0.7 1479.4 0.5X -OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor trying to delete 10000 rows from 10000 rows(0 rows are non-existing - rate 0): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------------------------------------- -In-memory 9 10 2 1.2 854.0 1.0X -RocksDB (trackTotalNumberOfRows: true) 42 44 1 0.2 4183.1 0.2X -RocksDB (trackTotalNumberOfRows: false) 15 16 1 0.7 1457.0 0.6X +In-memory 7 7 0 1.4 697.0 1.0X +RocksDB (trackTotalNumberOfRows: true) 43 45 1 0.2 4332.4 0.2X +RocksDB (trackTotalNumberOfRows: false) 14 15 0 0.7 1448.6 0.5X ================================================================================================ evict rows 
================================================================================================ -OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor evicting 10000 rows (maxTimestampToEvictInMillis: 9999) from 10000 rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------------------------------- -In-memory 8 9 0 1.2 837.4 1.0X -RocksDB (trackTotalNumberOfRows: true) 41 42 1 0.2 4146.8 0.2X -RocksDB (trackTotalNumberOfRows: false) 16 17 1 0.6 1623.1 0.5X +In-memory 7 7 0 1.5 683.8 1.0X +RocksDB (trackTotalNumberOfRows: true) 43 44 1 0.2 4257.5 0.2X +RocksDB (trackTotalNumberOfRows: false) 17 17 1 0.6 1669.2 0.4X -OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor evicting 5000 rows (maxTimestampToEvictInMillis: 4999) from 10000 rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------------------------------ -In-memory 8 9 1 1.3 798.1 1.0X -RocksDB (trackTotalNumberOfRows: true) 22 23 1 0.5 2201.4 0.4X -RocksDB (trackTotalNumberOfRows: false) 10 10 1 1.0 956.5 0.8X +In-memory 6 7 0 1.6 643.6 1.0X +RocksDB (trackTotalNumberOfRows: true) 23 24 1 0.4 2349.6 0.3X +RocksDB (trackTotalNumberOfRows: false) 10 10 0 1.0 999.9 0.6X -OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor evicting 1000 rows (maxTimestampToEvictInMillis: 999) from 10000 rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
----------------------------------------------------------------------------------------------------------------------------------------------------- -In-memory 7 8 1 1.4 724.1 1.0X -RocksDB (trackTotalNumberOfRows: true) 7 7 0 1.4 698.4 1.0X -RocksDB (trackTotalNumberOfRows: false) 5 5 0 2.2 450.9 1.6X +In-memory 6 6 0 1.7 590.6 1.0X +RocksDB (trackTotalNumberOfRows: true) 8 8 0 1.3 758.5 0.8X +RocksDB (trackTotalNumberOfRows: false) 5 5 0 2.0 491.5 1.2X -OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor evicting 0 rows (maxTimestampToEvictInMillis: -1) from 10000 rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------------------------- -In-memory 0 0 0 24.0 41.6 1.0X -RocksDB (trackTotalNumberOfRows: true) 3 3 1 3.2 317.3 0.1X -RocksDB (trackTotalNumberOfRows: false) 3 3 0 3.2 317.2 0.1X +In-memory 0 0 0 23.7 42.1 1.0X +RocksDB (trackTotalNumberOfRows: true) 4 4 0 2.8 354.3 0.1X +RocksDB (trackTotalNumberOfRows: false) 4 4 0 2.8 354.8 0.1X diff --git a/sql/core/benchmarks/StateStoreBasicOperationsBenchmark-results.txt b/sql/core/benchmarks/StateStoreBasicOperationsBenchmark-results.txt index 856985b5d071f..a8e4c83be80e1 100644 --- a/sql/core/benchmarks/StateStoreBasicOperationsBenchmark-results.txt +++ b/sql/core/benchmarks/StateStoreBasicOperationsBenchmark-results.txt @@ -2,143 +2,143 @@ put rows ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor putting 10000 rows (10000 rows to overwrite - rate 100): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
--------------------------------------------------------------------------------------------------------------------------------------- -In-memory 10 10 1 1.0 953.1 1.0X -RocksDB (trackTotalNumberOfRows: true) 43 44 2 0.2 4269.8 0.2X -RocksDB (trackTotalNumberOfRows: false) 16 16 1 0.6 1550.5 0.6X +In-memory 8 10 1 1.2 843.4 1.0X +RocksDB (trackTotalNumberOfRows: true) 44 46 1 0.2 4423.7 0.2X +RocksDB (trackTotalNumberOfRows: false) 16 17 1 0.6 1616.4 0.5X -OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor putting 10000 rows (5000 rows to overwrite - rate 50): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------------- -In-memory 9 10 0 1.1 930.0 1.0X -RocksDB (trackTotalNumberOfRows: true) 44 45 1 0.2 4387.9 0.2X -RocksDB (trackTotalNumberOfRows: false) 15 16 1 0.7 1521.4 0.6X +In-memory 8 9 1 1.2 830.8 1.0X +RocksDB (trackTotalNumberOfRows: true) 45 47 1 0.2 4506.3 0.2X +RocksDB (trackTotalNumberOfRows: false) 16 17 1 0.6 1576.7 0.5X -OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor putting 10000 rows (1000 rows to overwrite - rate 10): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------------- -In-memory 9 10 0 1.1 918.1 1.0X -RocksDB (trackTotalNumberOfRows: true) 44 45 1 0.2 4441.6 0.2X -RocksDB (trackTotalNumberOfRows: false) 15 16 1 0.7 1521.7 0.6X +In-memory 8 8 0 1.2 808.8 1.0X +RocksDB (trackTotalNumberOfRows: true) 45 46 1 0.2 4489.0 0.2X +RocksDB (trackTotalNumberOfRows: false) 16 17 1 0.6 1588.0 0.5X -OpenJDK 64-Bit Server VM 17.0.13+11-LTS on 
Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor putting 10000 rows (0 rows to overwrite - rate 0): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------------- -In-memory 9 10 0 1.1 916.9 1.0X -RocksDB (trackTotalNumberOfRows: true) 44 45 1 0.2 4413.7 0.2X -RocksDB (trackTotalNumberOfRows: false) 15 16 0 0.7 1522.0 0.6X +In-memory 8 8 0 1.3 796.8 1.0X +RocksDB (trackTotalNumberOfRows: true) 44 46 1 0.2 4437.2 0.2X +RocksDB (trackTotalNumberOfRows: false) 16 17 1 0.6 1573.0 0.5X ================================================================================================ merge rows ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor merging 10000 rows with 10 values per key (10000 rows to overwrite - rate 100): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------------------------------------------------------------------------- -RocksDB (trackTotalNumberOfRows: true) 542 553 6 0.0 54222.4 1.0X -RocksDB (trackTotalNumberOfRows: false) 174 179 3 0.1 17391.9 3.1X +RocksDB (trackTotalNumberOfRows: true) 549 562 6 0.0 54902.6 1.0X +RocksDB (trackTotalNumberOfRows: false) 179 184 2 0.1 17887.1 3.1X -OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor merging 10000 rows with 10 values per key (5000 rows to overwrite - rate 50): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
------------------------------------------------------------------------------------------------------------------------------------------------------------ -RocksDB (trackTotalNumberOfRows: true) 479 490 5 0.0 47921.1 1.0X -RocksDB (trackTotalNumberOfRows: false) 174 179 3 0.1 17446.2 2.7X +RocksDB (trackTotalNumberOfRows: true) 486 496 6 0.0 48554.8 1.0X +RocksDB (trackTotalNumberOfRows: false) 180 185 3 0.1 17973.1 2.7X -OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor merging 10000 rows with 10 values per key (1000 rows to overwrite - rate 10): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------------------------------------ -RocksDB (trackTotalNumberOfRows: true) 423 433 5 0.0 42311.4 1.0X -RocksDB (trackTotalNumberOfRows: false) 173 178 3 0.1 17309.1 2.4X +RocksDB (trackTotalNumberOfRows: true) 429 440 4 0.0 42859.9 1.0X +RocksDB (trackTotalNumberOfRows: false) 178 184 2 0.1 17776.5 2.4X -OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor merging 10000 rows with 10 values per key (0 rows to overwrite - rate 0): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------------------------------------------------------------------- -RocksDB (trackTotalNumberOfRows: true) 408 419 5 0.0 40762.3 1.0X -RocksDB (trackTotalNumberOfRows: false) 174 183 3 0.1 17377.7 2.3X +RocksDB (trackTotalNumberOfRows: true) 409 423 4 0.0 40946.3 1.0X +RocksDB (trackTotalNumberOfRows: false) 178 183 2 0.1 17820.2 2.3X ================================================================================================ 
delete rows ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor trying to delete 10000 rows from 10000 rows(10000 rows are non-existing - rate 100): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------------------------------------------- -In-memory 0 0 0 26.1 38.3 1.0X -RocksDB (trackTotalNumberOfRows: true) 44 46 1 0.2 4444.2 0.0X -RocksDB (trackTotalNumberOfRows: false) 15 15 0 0.7 1489.6 0.0X +In-memory 1 1 0 19.6 51.0 1.0X +RocksDB (trackTotalNumberOfRows: true) 44 45 1 0.2 4403.8 0.0X +RocksDB (trackTotalNumberOfRows: false) 15 16 0 0.7 1527.7 0.0X -OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor trying to delete 10000 rows from 10000 rows(5000 rows are non-existing - rate 50): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ----------------------------------------------------------------------------------------------------------------------------------------------------------------- -In-memory 8 8 0 1.3 788.8 1.0X -RocksDB (trackTotalNumberOfRows: true) 44 45 1 0.2 4425.4 0.2X -RocksDB (trackTotalNumberOfRows: false) 15 16 1 0.7 1499.2 0.5X +In-memory 7 7 0 1.5 671.8 1.0X +RocksDB (trackTotalNumberOfRows: true) 45 46 1 0.2 4484.7 0.1X +RocksDB (trackTotalNumberOfRows: false) 15 16 0 0.7 1516.5 0.4X -OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor trying to delete 10000 rows from 10000 rows(1000 rows are non-existing - rate 10): Best Time(ms) Avg Time(ms) 
Stdev(ms) Rate(M/s) Per Row(ns) Relative ----------------------------------------------------------------------------------------------------------------------------------------------------------------- -In-memory 8 9 0 1.2 841.3 1.0X -RocksDB (trackTotalNumberOfRows: true) 43 44 1 0.2 4336.9 0.2X -RocksDB (trackTotalNumberOfRows: false) 15 16 1 0.7 1493.6 0.6X +In-memory 7 8 0 1.4 727.7 1.0X +RocksDB (trackTotalNumberOfRows: true) 44 45 1 0.2 4433.9 0.2X +RocksDB (trackTotalNumberOfRows: false) 15 16 0 0.7 1512.5 0.5X -OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor trying to delete 10000 rows from 10000 rows(0 rows are non-existing - rate 0): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------------------------------------- -In-memory 8 9 0 1.2 848.9 1.0X -RocksDB (trackTotalNumberOfRows: true) 42 43 1 0.2 4216.8 0.2X -RocksDB (trackTotalNumberOfRows: false) 15 15 0 0.7 1467.4 0.6X +In-memory 7 8 1 1.4 740.3 1.0X +RocksDB (trackTotalNumberOfRows: true) 44 45 1 0.2 4390.9 0.2X +RocksDB (trackTotalNumberOfRows: false) 15 16 0 0.7 1500.6 0.5X ================================================================================================ evict rows ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor evicting 10000 rows (maxTimestampToEvictInMillis: 9999) from 10000 rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------------------------------- -In-memory 8 9 0 
1.2 836.6 1.0X -RocksDB (trackTotalNumberOfRows: true) 42 43 2 0.2 4182.0 0.2X -RocksDB (trackTotalNumberOfRows: false) 16 17 0 0.6 1645.0 0.5X +In-memory 7 7 0 1.5 688.0 1.0X +RocksDB (trackTotalNumberOfRows: true) 43 44 1 0.2 4337.8 0.2X +RocksDB (trackTotalNumberOfRows: false) 17 17 0 0.6 1678.8 0.4X -OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor evicting 5000 rows (maxTimestampToEvictInMillis: 4999) from 10000 rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------------------------------ -In-memory 8 8 0 1.3 785.1 1.0X -RocksDB (trackTotalNumberOfRows: true) 23 23 1 0.4 2258.3 0.3X -RocksDB (trackTotalNumberOfRows: false) 10 10 0 1.0 999.7 0.8X +In-memory 6 7 0 1.5 645.3 1.0X +RocksDB (trackTotalNumberOfRows: true) 24 24 1 0.4 2370.3 0.3X +RocksDB (trackTotalNumberOfRows: false) 11 11 0 0.9 1082.4 0.6X -OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor evicting 1000 rows (maxTimestampToEvictInMillis: 999) from 10000 rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ----------------------------------------------------------------------------------------------------------------------------------------------------- -In-memory 7 8 0 1.4 726.0 1.0X -RocksDB (trackTotalNumberOfRows: true) 7 8 0 1.4 736.8 1.0X -RocksDB (trackTotalNumberOfRows: false) 5 5 0 2.1 487.0 1.5X +In-memory 6 6 0 1.7 587.3 1.0X +RocksDB (trackTotalNumberOfRows: true) 8 8 0 1.3 788.1 0.7X +RocksDB (trackTotalNumberOfRows: false) 6 6 0 1.8 554.1 1.1X -OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 
64-Core Processor evicting 0 rows (maxTimestampToEvictInMillis: -1) from 10000 rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------------------------- -In-memory 0 0 0 22.8 43.9 1.0X -RocksDB (trackTotalNumberOfRows: true) 4 4 0 2.8 354.8 0.1X -RocksDB (trackTotalNumberOfRows: false) 4 4 0 2.8 353.1 0.1X +In-memory 0 0 0 23.9 41.8 1.0X +RocksDB (trackTotalNumberOfRows: true) 4 4 0 2.6 387.4 0.1X +RocksDB (trackTotalNumberOfRows: false) 4 4 0 2.6 389.4 0.1X diff --git a/sql/core/benchmarks/StringFunctionsBenchmark-jdk21-results.txt b/sql/core/benchmarks/StringFunctionsBenchmark-jdk21-results.txt index 04720fb50b41c..ac6a3ea26717f 100644 --- a/sql/core/benchmarks/StringFunctionsBenchmark-jdk21-results.txt +++ b/sql/core/benchmarks/StringFunctionsBenchmark-jdk21-results.txt @@ -2,10 +2,10 @@ SQL string functions ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor regexp_replace: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -regexp_replace('*-*', '(\\d+)', 'num') 503 527 32 2.0 503.0 1.0X +regexp_replace('*-*', '(\\d+)', 'num') 505 521 11 2.0 505.0 1.0X diff --git a/sql/core/benchmarks/StringFunctionsBenchmark-results.txt b/sql/core/benchmarks/StringFunctionsBenchmark-results.txt index c1b9bdb4ea3da..7ecccc7a826f4 100644 --- a/sql/core/benchmarks/StringFunctionsBenchmark-results.txt +++ b/sql/core/benchmarks/StringFunctionsBenchmark-results.txt @@ -2,10 +2,10 @@ SQL string functions ================================================================================================ 
-OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor regexp_replace: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -regexp_replace('*-*', '(\\d+)', 'num') 512 529 24 2.0 512.0 1.0X +regexp_replace('*-*', '(\\d+)', 'num') 509 533 15 2.0 509.1 1.0X diff --git a/sql/core/benchmarks/TPCDSQueryBenchmark-jdk21-results.txt b/sql/core/benchmarks/TPCDSQueryBenchmark-jdk21-results.txt index 8a79199cd92ed..8e75b2b3d4697 100644 --- a/sql/core/benchmarks/TPCDSQueryBenchmark-jdk21-results.txt +++ b/sql/core/benchmarks/TPCDSQueryBenchmark-jdk21-results.txt @@ -1,810 +1,810 @@ -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q1 756 854 165 0.6 1639.1 1.0X +q1 672 845 214 0.7 1456.4 1.0X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q2 841 929 113 2.7 376.8 1.0X +q2 890 937 57 2.5 398.9 1.0X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
------------------------------------------------------------------------------------------------------------------------ -q3 239 288 37 12.4 80.6 1.0X +q3 244 297 49 12.2 82.3 1.0X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q4 5033 5148 163 1.0 965.7 1.0X +q4 4653 5006 500 1.1 892.7 1.0X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q5 1125 1352 321 5.0 199.9 1.0X +q5 1406 1469 89 4.0 249.8 1.0X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q6 1095 1121 37 2.8 350.9 1.0X +q6 1122 1134 17 2.8 359.7 1.0X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q7 601 631 48 8.1 122.9 1.0X +q7 567 603 39 8.6 116.0 1.0X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 
6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q8 453 501 54 6.8 146.1 1.0X +q8 455 510 51 6.8 146.7 1.0X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q9 895 929 36 0.0 25559860.1 1.0X +q9 874 935 90 0.0 24957685.2 1.0X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q10 1917 1986 98 1.1 925.6 1.0X +q10 1916 2021 148 1.1 925.4 1.0X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q11 1897 2025 181 2.0 502.9 1.0X +q11 1781 2094 443 2.1 472.1 1.0X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q12 164 205 47 4.9 203.0 1.0X +q12 165 209 61 4.9 
204.0 1.0X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q13 778 820 49 6.3 157.8 1.0X +q13 767 808 44 6.4 155.6 1.0X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q14a 4952 5091 196 1.0 965.3 1.0X +q14a 4824 5200 532 1.1 940.4 1.0X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q14b 3747 3786 56 1.4 730.4 1.0X +q14b 4151 4159 11 1.2 809.3 1.0X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q15 443 505 48 3.8 266.7 1.0X +q15 452 485 38 3.7 271.8 1.0X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
------------------------------------------------------------------------------------------------------------------------ -q16 618 652 44 2.5 395.3 1.0X +q16 589 628 35 2.7 377.0 1.0X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q17 1531 1632 144 3.1 325.8 1.0X +q17 1392 1400 12 3.4 296.2 1.0X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q18 1107 1243 193 3.3 307.3 1.0X +q18 1055 1107 73 3.4 293.0 1.0X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q19 299 335 42 10.4 95.9 1.0X +q19 302 326 28 10.3 96.7 1.0X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q20 180 205 33 8.5 117.3 1.0X +q20 176 213 52 8.7 115.0 1.0X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 
6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q21 595 639 32 19.9 50.3 1.0X +q21 576 621 51 20.5 48.7 1.0X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q22 3548 3579 43 3.3 299.8 1.0X +q22 3218 3259 58 3.7 271.9 1.0X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q23a 6699 6712 20 0.8 1280.9 1.0X +q23a 5971 6291 453 0.9 1141.7 1.0X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q23b 6338 6683 488 0.8 1211.9 1.0X +q23b 6194 6443 353 0.8 1184.3 1.0X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q24a 118 248 53 28.3 35.4 1.0X +q24a 214 254 
46 15.6 64.2 1.0X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q24b 214 264 41 15.6 64.1 1.0X +q24b 155 229 49 21.6 46.3 1.0X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q25 1381 1456 106 3.4 293.9 1.0X +q25 1379 1525 207 3.4 293.5 1.0X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q26 344 378 37 10.0 99.6 1.0X +q26 318 362 48 10.8 92.2 1.0X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q27 531 580 41 9.2 108.5 1.0X +q27 538 555 14 9.1 110.0 1.0X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
------------------------------------------------------------------------------------------------------------------------ -q28 1202 1337 190 2.4 417.6 1.0X +q28 1187 1291 147 2.4 412.0 1.0X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q29 1540 1703 230 3.1 327.7 1.0X +q29 1566 1636 99 3.0 333.2 1.0X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q30 402 447 43 0.7 1364.7 1.0X +q30 381 444 53 0.8 1292.6 1.0X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q31 839 851 14 4.4 225.5 1.0X +q31 770 871 89 4.8 207.0 1.0X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q32 198 246 53 7.7 129.0 1.0X +q32 192 213 28 8.0 125.3 1.0X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 
6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q33 405 447 56 12.8 78.2 1.0X +q33 393 423 33 13.2 75.9 1.0X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q34 353 384 32 8.7 115.3 1.0X +q34 350 393 35 8.7 114.4 1.0X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q35 1296 1367 101 1.6 625.7 1.0X +q35 1281 1303 32 1.6 618.4 1.0X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q36 544 583 43 5.5 183.2 1.0X +q36 516 544 35 5.8 173.6 1.0X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q37 765 807 45 17.4 57.6 1.0X +q37 780 797 21 17.0 58.7 1.0X 
-OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q38 696 740 39 7.5 133.5 1.0X +q38 639 690 49 8.2 122.6 1.0X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q39a 1244 1409 233 9.5 105.1 1.0X +q39a 1442 1443 1 8.2 121.8 1.0X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q39b 1269 1285 23 9.3 107.2 1.0X +q39b 1298 1305 9 9.1 109.7 1.0X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q40 398 446 51 4.2 237.8 1.0X +q40 319 386 46 5.2 190.5 1.0X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
------------------------------------------------------------------------------------------------------------------------ -q41 140 170 37 0.1 7757.0 1.0X +q41 143 156 20 0.1 7952.1 1.0X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q42 155 169 24 19.2 52.2 1.0X +q42 147 165 34 20.2 49.4 1.0X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q43 308 335 30 9.6 104.4 1.0X +q43 304 337 39 9.7 103.1 1.0X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q44 338 403 45 8.6 116.6 1.0X +q44 325 371 39 8.9 112.2 1.0X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q45 196 226 31 4.9 204.6 1.0X +q45 197 227 27 4.9 204.9 1.0X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure 
AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q46 462 505 67 6.7 148.4 1.0X +q46 452 485 35 6.9 145.3 1.0X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q47 1578 1797 310 1.9 531.1 1.0X +q47 1500 1649 211 2.0 504.8 1.0X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q48 924 945 25 5.3 187.7 1.0X +q48 825 848 38 6.0 167.5 1.0X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q49 670 761 131 8.4 119.3 1.0X +q49 544 603 56 10.3 96.8 1.0X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q50 784 841 51 4.1 241.8 1.0X +q50 564 636 69 5.7 174.1 1.0X -OpenJDK 64-Bit 
Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q51 2769 2809 56 1.3 754.1 1.0X +q51 2540 2716 250 1.4 691.7 1.0X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q52 153 186 33 19.5 51.3 1.0X +q52 140 157 21 21.2 47.3 1.0X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q53 277 340 52 10.7 93.1 1.0X +q53 265 290 42 11.2 89.2 1.0X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q54 1250 1262 17 4.2 236.7 1.0X +q54 1253 1262 13 4.2 237.3 1.0X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
------------------------------------------------------------------------------------------------------------------------ -q55 154 168 19 19.3 51.7 1.0X +q55 143 155 17 20.8 48.1 1.0X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q56 408 473 67 12.7 78.8 1.0X +q56 415 460 29 12.5 80.1 1.0X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q57 778 837 65 2.0 508.0 1.0X +q57 758 807 45 2.0 494.8 1.0X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q58 412 543 195 12.4 80.4 1.0X +q58 438 561 203 11.7 85.3 1.0X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q59 669 747 118 4.4 226.6 1.0X +q59 635 700 58 4.7 215.0 1.0X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 
6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q60 427 473 27 12.1 82.4 1.0X +q60 416 452 46 12.5 80.2 1.0X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q61 556 586 30 5.6 178.2 1.0X +q61 537 574 54 5.8 172.0 1.0X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q62 183 204 25 4.3 230.9 1.0X +q62 177 201 32 4.5 223.9 1.0X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q63 281 301 18 10.6 94.7 1.0X +q63 276 306 31 10.8 93.0 1.0X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q64 2377 2586 296 2.9 343.5 1.0X +q64 2141 2374 330 3.2 309.3 1.0X 
-OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q65 707 782 71 4.2 237.8 1.0X +q65 607 670 64 4.9 204.3 1.0X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q66 510 565 54 4.5 219.8 1.0X +q66 530 565 37 4.4 228.7 1.0X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q67 5734 5829 134 0.5 1930.2 1.0X +q67 5561 5583 31 0.5 1871.9 1.0X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q68 539 559 20 5.8 173.2 1.0X +q68 454 485 22 6.8 146.1 1.0X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
------------------------------------------------------------------------------------------------------------------------ -q69 1756 1826 99 1.2 848.0 1.0X +q69 1669 1751 115 1.2 806.2 1.0X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q70 587 620 35 5.0 198.7 1.0X +q70 547 581 36 5.4 185.1 1.0X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q71 352 394 43 14.8 67.5 1.0X +q71 344 368 38 15.2 65.9 1.0X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q72 136543 138257 2425 0.1 8896.5 1.0X +q72 119142 119748 857 0.1 7762.7 1.0X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q73 349 380 32 8.8 114.1 1.0X +q73 328 355 27 9.3 107.3 1.0X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on 
Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q74 1353 1607 359 2.8 358.8 1.0X +q74 1250 1720 665 3.0 331.4 1.0X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q75 1482 1683 285 3.8 263.0 1.0X +q75 1347 1539 272 4.2 239.1 1.0X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q76 298 335 28 17.2 58.1 1.0X +q76 291 319 33 17.6 56.8 1.0X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q77 610 791 184 9.2 108.7 1.0X +q77 544 719 162 10.3 96.8 1.0X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q78 2709 2767 82 2.1 482.4 1.0X +q78 2167 2456 
408 2.6 386.0 1.0X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q79 422 449 29 7.3 137.8 1.0X +q79 414 442 26 7.4 135.1 1.0X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q80 1318 1555 335 4.3 233.5 1.0X +q80 1401 1542 198 4.0 248.2 1.0X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q81 371 421 42 1.0 1012.0 1.0X +q81 336 416 66 1.1 916.1 1.0X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q82 1019 1064 64 14.4 69.2 1.0X +q82 1003 1036 46 14.7 68.2 1.0X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
------------------------------------------------------------------------------------------------------------------------ -q83 230 286 27 2.6 387.4 1.0X +q83 235 267 31 2.5 394.3 1.0X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q84 685 705 25 3.5 289.6 1.0X +q84 659 707 82 3.6 278.7 1.0X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q85 1656 1798 200 1.7 584.3 1.0X +q85 1993 2062 97 1.4 703.1 1.0X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q86 184 204 30 4.4 227.5 1.0X +q86 189 209 23 4.3 233.4 1.0X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q87 660 727 65 7.9 126.7 1.0X +q87 693 729 31 7.5 133.1 1.0X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 
6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q88 1241 1402 229 2.4 417.3 1.0X +q88 1156 1373 307 2.6 388.8 1.0X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q89 309 357 53 9.6 103.9 1.0X +q89 293 334 36 10.1 98.6 1.0X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q90 132 159 19 6.1 162.8 1.0X +q90 113 136 20 7.2 139.1 1.0X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q91 327 365 38 7.0 142.4 1.0X +q91 330 355 20 7.0 143.7 1.0X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q92 135 160 19 6.0 166.3 1.0X +q92 133 173 70 6.1 164.0 1.0X 
-OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q93 529 549 38 6.0 166.9 1.0X +q93 423 452 41 7.5 133.6 1.0X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q94 307 364 42 2.7 364.6 1.0X +q94 303 329 24 2.8 359.4 1.0X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q95 5173 5213 57 0.2 6143.9 1.0X +q95 5193 5248 78 0.2 6167.4 1.0X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q96 163 179 17 18.3 54.7 1.0X +q96 160 174 17 18.6 53.8 1.0X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
------------------------------------------------------------------------------------------------------------------------ -q97 1257 1323 93 3.5 286.2 1.0X +q97 1153 1170 24 3.8 262.5 1.0X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q98 268 304 31 11.1 90.4 1.0X +q98 267 305 45 11.1 89.8 1.0X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q99 264 295 38 5.7 174.4 1.0X +q99 262 285 22 5.8 172.9 1.0X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q5a-v2.7 1294 1378 120 4.3 229.9 1.0X +q5a-v2.7 1134 1159 35 5.0 201.5 1.0X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q6-v2.7 952 977 22 3.3 305.1 1.0X +q6-v2.7 916 932 26 3.4 293.4 1.0X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS 
on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q10a-v2.7 1806 1867 88 1.1 871.9 1.0X +q10a-v2.7 1757 1846 126 1.2 848.7 1.0X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q11-v2.7 1867 2114 350 2.0 494.9 1.0X +q11-v2.7 1725 2043 449 2.2 457.3 1.0X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q12-v2.7 128 143 18 6.4 157.4 1.0X +q12-v2.7 125 139 19 6.5 154.0 1.0X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q14-v2.7 3604 3857 358 1.4 702.5 1.0X +q14-v2.7 3771 3829 82 1.4 735.1 1.0X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ 
-q14a-v2.7 6933 7124 270 0.7 1351.6 1.0X +q14a-v2.7 6402 6605 287 0.8 1248.0 1.0X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q18a-v2.7 1805 2075 382 2.0 501.2 1.0X +q18a-v2.7 1831 2129 421 2.0 508.4 1.0X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q20-v2.7 157 175 20 9.7 102.6 1.0X +q20-v2.7 153 174 18 10.0 100.0 1.0X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q22-v2.7 13585 13655 98 0.9 1147.8 1.0X +q22-v2.7 12999 13185 264 0.9 1098.2 1.0X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q22a-v2.7 2043 2046 4 5.8 172.6 1.0X +q22a-v2.7 1909 1970 87 6.2 161.2 1.0X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor 
TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q24-v2.7 207 238 34 16.1 61.9 1.0X +q24-v2.7 190 235 32 17.5 57.0 1.0X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q27a-v2.7 1452 1553 143 3.4 296.8 1.0X +q27a-v2.7 1710 1727 23 2.9 349.6 1.0X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q34-v2.7 363 390 39 8.4 118.7 1.0X +q34-v2.7 336 369 35 9.1 109.7 1.0X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q35-v2.7 1310 1320 14 1.6 632.5 1.0X +q35-v2.7 1195 1215 28 1.7 577.0 1.0X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q35a-v2.7 1283 1290 10 1.6 619.7 1.0X +q35a-v2.7 1174 1214 56 1.8 
567.0 1.0X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q36a-v2.7 492 538 44 6.0 165.5 1.0X +q36a-v2.7 481 510 50 6.2 161.9 1.0X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q47-v2.7 1573 1759 262 1.9 529.6 1.0X +q47-v2.7 1567 1672 148 1.9 527.5 1.0X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q49-v2.7 560 618 39 10.0 99.8 1.0X +q49-v2.7 534 636 127 10.5 95.1 1.0X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q51a-v2.7 15223 15578 502 0.2 4146.0 1.0X +q51a-v2.7 14944 15519 814 0.2 4069.9 1.0X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
------------------------------------------------------------------------------------------------------------------------ -q57-v2.7 756 801 50 2.0 494.0 1.0X +q57-v2.7 686 789 120 2.2 448.1 1.0X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q64-v2.7 2553 2715 230 2.7 368.8 1.0X +q64-v2.7 1878 2187 436 3.7 271.4 1.0X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q67a-v2.7 7363 7679 446 0.4 2478.4 1.0X +q67a-v2.7 6928 7238 439 0.4 2331.9 1.0X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q70a-v2.7 630 674 35 4.7 213.5 1.0X +q70a-v2.7 618 674 52 4.8 209.3 1.0X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q72-v2.7 137936 138063 180 0.1 8987.2 1.0X +q72-v2.7 125912 126950 1468 0.1 8203.8 1.0X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on 
Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q74-v2.7 1287 1678 553 2.9 341.3 1.0X +q74-v2.7 1078 1567 691 3.5 285.8 1.0X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q75-v2.7 1379 1615 333 4.1 244.9 1.0X +q75-v2.7 1297 1566 380 4.3 230.2 1.0X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q77a-v2.7 898 1007 155 6.3 159.8 1.0X +q77a-v2.7 885 1077 187 6.3 157.6 1.0X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q78-v2.7 2389 2652 372 2.4 425.5 1.0X +q78-v2.7 2429 2558 183 2.3 432.5 1.0X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
------------------------------------------------------------------------------------------------------------------------ -q80a-v2.7 1616 2053 618 3.5 286.2 1.0X +q80a-v2.7 1613 1917 430 3.5 285.7 1.0X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q86a-v2.7 237 278 36 3.4 292.6 1.0X +q86a-v2.7 240 265 36 3.4 295.8 1.0X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q98-v2.7 259 296 65 11.5 87.0 1.0X +q98-v2.7 260 287 22 11.4 87.7 1.0X diff --git a/sql/core/benchmarks/TPCDSQueryBenchmark-results.txt b/sql/core/benchmarks/TPCDSQueryBenchmark-results.txt index 4831dffceecd1..7458fd93a4f38 100644 --- a/sql/core/benchmarks/TPCDSQueryBenchmark-results.txt +++ b/sql/core/benchmarks/TPCDSQueryBenchmark-results.txt @@ -1,810 +1,810 @@ -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q1 600 728 155 0.8 1300.2 1.0X +q1 661 864 259 0.7 1432.0 1.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) 
Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q2 823 845 23 2.7 368.5 1.0X +q2 817 864 59 2.7 366.1 1.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q3 225 259 25 13.2 75.6 1.0X +q3 234 277 31 12.7 78.7 1.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q4 4365 4759 557 1.2 837.4 1.0X +q4 4856 5073 308 1.1 931.7 1.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q5 1027 1178 214 5.5 182.5 1.0X +q5 1118 1254 191 5.0 198.7 1.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q6 1062 1102 56 2.9 340.4 1.0X +q6 1228 1362 191 2.5 393.4 1.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit 
Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q7 583 611 30 8.4 119.1 1.0X +q7 756 776 17 6.5 154.5 1.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q8 462 483 23 6.7 149.0 1.0X +q8 567 619 61 5.5 182.8 1.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q9 878 890 11 0.0 25071759.3 1.0X +q9 907 945 55 0.0 25911119.5 1.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q10 1901 2002 143 1.1 917.8 1.0X +q10 2016 2062 65 1.0 973.5 1.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q11 1901 2269 521 
2.0 504.1 1.0X +q11 1845 2147 427 2.0 489.3 1.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q12 228 263 28 3.5 281.8 1.0X +q12 202 240 29 4.0 250.0 1.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q13 856 885 27 5.8 173.6 1.0X +q13 852 889 33 5.8 172.7 1.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q14a 4584 5058 670 1.1 893.5 1.0X +q14a 4637 4894 363 1.1 904.0 1.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q14b 3771 3852 115 1.4 735.1 1.0X +q14b 3528 3702 246 1.5 687.7 1.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
------------------------------------------------------------------------------------------------------------------------ -q15 383 418 39 4.3 230.4 1.0X +q15 402 413 12 4.1 241.7 1.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q16 645 781 127 2.4 412.7 1.0X +q16 750 791 37 2.1 480.0 1.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q17 1451 1466 21 3.2 308.8 1.0X +q17 1691 1730 55 2.8 359.8 1.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q18 1305 1446 200 2.8 362.3 1.0X +q18 1414 1569 219 2.5 392.5 1.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q19 359 415 50 8.7 114.9 1.0X +q19 376 403 43 8.3 120.5 1.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on 
Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q20 185 203 26 8.3 121.1 1.0X +q20 190 204 14 8.1 123.9 1.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q21 699 727 27 16.9 59.0 1.0X +q21 672 712 54 17.6 56.7 1.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q22 3295 3404 154 3.6 278.4 1.0X +q22 3351 3407 80 3.5 283.1 1.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q23a 5808 5881 103 0.9 1110.6 1.0X +q23a 5597 5928 467 0.9 1070.3 1.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q23b 5902 5980 111 0.9 1128.5 1.0X +q23b 
5925 6131 291 0.9 1133.0 1.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q24a 146 308 88 22.8 43.8 1.0X +q24a 231 263 30 14.4 69.2 1.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q24b 210 253 43 15.9 62.9 1.0X +q24b 217 257 49 15.4 65.1 1.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q25 1249 1294 63 3.8 265.8 1.0X +q25 1468 1480 17 3.2 312.5 1.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q26 391 436 39 8.8 113.3 1.0X +q26 426 443 14 8.1 123.4 1.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
------------------------------------------------------------------------------------------------------------------------ -q27 515 575 51 9.5 105.2 1.0X +q27 603 618 16 8.1 123.2 1.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q28 1188 1378 268 2.4 412.6 1.0X +q28 1402 1654 356 2.1 486.9 1.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q29 1239 1246 9 3.8 263.8 1.0X +q29 1496 1569 104 3.1 318.3 1.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q30 473 511 36 0.6 1606.0 1.0X +q30 453 511 72 0.7 1537.6 1.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q31 1043 1236 273 3.6 280.2 1.0X +q31 992 1280 407 3.8 266.5 1.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS 
on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q32 254 310 63 6.0 165.6 1.0X +q32 247 291 37 6.2 161.2 1.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q33 484 537 38 10.7 93.5 1.0X +q33 407 441 28 12.7 78.7 1.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q34 362 393 55 8.4 118.3 1.0X +q34 409 425 20 7.5 133.6 1.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q35 1405 1427 31 1.5 678.3 1.0X +q35 1442 1460 26 1.4 696.3 1.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q36 559 580 18 5.3 188.1 1.0X +q36 570 582 20 
5.2 191.8 1.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q37 931 945 23 14.3 70.1 1.0X +q37 896 901 6 14.8 67.5 1.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q38 759 830 85 6.9 145.7 1.0X +q38 888 1151 371 5.9 170.5 1.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q39a 1611 1833 314 7.3 136.1 1.0X +q39a 1533 1756 315 7.7 129.6 1.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q39b 1526 1610 118 7.8 129.0 1.0X +q39b 1514 1748 331 7.8 127.9 1.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
------------------------------------------------------------------------------------------------------------------------ -q40 331 358 19 5.1 197.7 1.0X +q40 355 377 20 4.7 212.2 1.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q41 164 180 13 0.1 9114.0 1.0X +q41 166 184 10 0.1 9202.5 1.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q42 169 196 20 17.6 57.0 1.0X +q42 151 163 13 19.6 51.0 1.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q43 330 344 15 9.0 111.7 1.0X +q43 306 328 16 9.6 103.7 1.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q44 401 431 38 7.2 138.4 1.0X +q44 338 347 9 8.6 116.5 1.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 
6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q45 207 249 24 4.6 215.6 1.0X +q45 187 214 28 5.1 194.3 1.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q46 523 534 14 5.9 168.2 1.0X +q46 471 491 21 6.6 151.3 1.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q47 1714 1776 88 1.7 576.8 1.0X +q47 1780 1899 168 1.7 599.2 1.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q48 887 897 13 5.6 180.1 1.0X +q48 937 945 7 5.3 190.2 1.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q49 681 732 49 8.2 121.3 1.0X +q49 728 753 41 7.7 129.6 
1.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q50 672 694 37 4.8 207.3 1.0X +q50 754 786 28 4.3 232.5 1.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q51 2761 2791 41 1.3 752.1 1.0X +q51 2470 2795 459 1.5 672.7 1.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q52 159 174 17 18.6 53.7 1.0X +q52 151 158 6 19.7 50.8 1.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q53 279 320 31 10.7 93.9 1.0X +q53 299 311 23 9.9 100.5 1.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
------------------------------------------------------------------------------------------------------------------------ -q54 1304 1308 5 4.0 247.0 1.0X +q54 1292 1341 70 4.1 244.7 1.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q55 161 186 13 18.5 54.1 1.0X +q55 169 186 14 17.6 56.8 1.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q56 525 550 22 9.9 101.4 1.0X +q56 491 507 11 10.5 94.9 1.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q57 761 854 105 2.0 496.8 1.0X +q57 1032 1203 241 1.5 674.1 1.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q58 483 529 39 10.6 94.1 1.0X +q58 509 580 99 10.1 99.3 1.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 
6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q59 680 692 14 4.3 230.3 1.0X +q59 753 754 2 3.9 255.0 1.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q60 500 565 49 10.4 96.5 1.0X +q60 581 692 167 8.9 112.2 1.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q61 579 630 56 5.4 185.4 1.0X +q61 578 605 52 5.4 185.1 1.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q62 190 215 18 4.2 239.6 1.0X +q62 176 186 4 4.5 221.7 1.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q63 268 282 14 11.1 90.3 1.0X +q63 320 351 28 9.3 107.7 1.0X 
-OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q64 2422 2684 371 2.9 350.0 1.0X +q64 2602 2865 372 2.7 376.0 1.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q65 767 815 45 3.9 258.3 1.0X +q65 603 668 53 4.9 202.9 1.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q66 681 707 26 3.4 293.7 1.0X +q66 538 600 82 4.3 232.1 1.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q67 5411 5483 101 0.5 1821.5 1.0X +q67 5377 5515 195 0.6 1809.8 1.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
------------------------------------------------------------------------------------------------------------------------ -q68 511 535 24 6.1 164.3 1.0X +q68 495 550 38 6.3 159.2 1.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q69 1532 1706 247 1.4 739.6 1.0X +q69 1742 1842 140 1.2 841.5 1.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q70 548 560 9 5.4 185.5 1.0X +q70 651 667 14 4.5 220.3 1.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q71 408 420 15 12.8 78.3 1.0X +q71 435 455 16 12.0 83.4 1.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q72 93843 95348 2129 0.2 6114.3 1.0X +q72 123701 123925 317 0.1 8059.8 1.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 
17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q73 389 404 11 7.9 127.1 1.0X +q73 336 373 31 9.1 109.7 1.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q74 1330 1616 405 2.8 352.6 1.0X +q74 1419 1584 233 2.7 376.3 1.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q75 1481 1759 394 3.8 262.9 1.0X +q75 1665 1951 404 3.4 295.6 1.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q76 325 364 30 15.8 63.4 1.0X +q76 358 369 11 14.3 69.8 1.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q77 573 780 185 9.8 102.0 
1.0X +q77 711 859 128 7.9 126.6 1.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q78 2164 2460 420 2.6 385.3 1.0X +q78 2705 2843 195 2.1 481.7 1.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q79 450 464 11 6.8 147.0 1.0X +q79 484 492 8 6.3 158.1 1.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q80 1596 1722 178 3.5 282.8 1.0X +q80 1533 1855 455 3.7 271.6 1.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q81 408 458 53 0.9 1113.9 1.0X +q81 364 447 82 1.0 991.9 1.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
------------------------------------------------------------------------------------------------------------------------ -q82 1177 1192 22 12.5 80.0 1.0X +q82 1176 1196 27 12.5 79.9 1.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q83 284 324 30 2.1 477.7 1.0X +q83 323 348 20 1.8 542.4 1.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q84 776 778 2 3.0 328.0 1.0X +q84 742 767 28 3.2 313.6 1.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q85 1721 2185 656 1.6 607.0 1.0X +q85 1817 2120 428 1.6 641.1 1.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q86 207 227 14 3.9 255.4 1.0X +q86 210 226 9 3.8 259.8 1.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 
6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q87 731 791 74 7.1 140.3 1.0X +q87 919 1068 211 5.7 176.3 1.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q88 1414 1665 355 2.1 475.6 1.0X +q88 1471 1642 243 2.0 494.6 1.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q89 346 391 43 8.6 116.6 1.0X +q89 313 362 62 9.5 105.5 1.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q90 146 171 21 5.5 180.3 1.0X +q90 145 165 20 5.6 178.3 1.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q91 365 393 26 6.3 159.1 1.0X +q91 388 429 29 5.9 
169.3 1.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q92 153 183 22 5.3 189.5 1.0X +q92 152 173 17 5.3 188.1 1.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q93 430 445 9 7.4 135.8 1.0X +q93 501 515 10 6.3 158.3 1.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q94 356 377 23 2.4 422.7 1.0X +q94 367 387 12 2.3 436.2 1.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q95 5268 5437 240 0.2 6256.5 1.0X +q95 5374 5470 137 0.2 6382.2 1.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
------------------------------------------------------------------------------------------------------------------------ -q96 188 207 20 15.8 63.3 1.0X +q96 186 208 19 16.0 62.6 1.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q97 1214 1299 120 3.6 276.4 1.0X +q97 1318 1354 51 3.3 300.1 1.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q98 314 351 41 9.5 105.8 1.0X +q98 304 334 33 9.8 102.3 1.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q99 312 321 11 4.8 206.2 1.0X +q99 310 330 19 4.9 205.2 1.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q5a-v2.7 1495 1520 36 3.8 265.7 1.0X +q5a-v2.7 1498 1674 250 3.8 266.2 1.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 
17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q6-v2.7 1002 1015 18 3.1 321.0 1.0X +q6-v2.7 980 1001 18 3.2 314.0 1.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q10a-v2.7 1792 1914 172 1.2 865.6 1.0X +q10a-v2.7 1863 2003 199 1.1 899.7 1.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q11-v2.7 1809 2158 493 2.1 479.7 1.0X +q11-v2.7 1811 2162 497 2.1 480.2 1.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q12-v2.7 146 168 18 5.5 180.8 1.0X +q12-v2.7 140 162 24 5.8 173.4 1.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
------------------------------------------------------------------------------------------------------------------------ -q14-v2.7 4035 4204 239 1.3 786.6 1.0X +q14-v2.7 3748 3965 307 1.4 730.7 1.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q14a-v2.7 7068 7371 429 0.7 1377.8 1.0X +q14a-v2.7 7129 7256 179 0.7 1389.8 1.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q18a-v2.7 2182 2292 156 1.7 605.7 1.0X +q18a-v2.7 2101 2292 270 1.7 583.3 1.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q20-v2.7 179 193 11 8.5 117.1 1.0X +q20-v2.7 162 182 16 9.5 105.7 1.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q22-v2.7 13689 13818 183 0.9 1156.5 1.0X +q22-v2.7 14202 14286 118 0.8 1199.9 1.0X -OpenJDK 64-Bit Server VM 
17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q22a-v2.7 1981 2114 189 6.0 167.3 1.0X +q22a-v2.7 2164 2288 176 5.5 182.9 1.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q24-v2.7 228 270 21 14.6 68.3 1.0X +q24-v2.7 254 278 26 13.1 76.2 1.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q27a-v2.7 1361 1495 189 3.6 278.3 1.0X +q27a-v2.7 1449 1664 304 3.4 296.2 1.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q34-v2.7 403 409 8 7.6 131.6 1.0X +q34-v2.7 404 415 11 7.6 131.9 1.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
------------------------------------------------------------------------------------------------------------------------ -q35-v2.7 1326 1387 87 1.6 640.1 1.0X +q35-v2.7 1433 1462 41 1.4 691.8 1.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q35a-v2.7 1296 1311 20 1.6 626.0 1.0X +q35a-v2.7 1346 1394 68 1.5 650.0 1.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q36a-v2.7 533 556 16 5.6 179.5 1.0X +q36a-v2.7 575 607 44 5.2 193.6 1.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q47-v2.7 1618 1744 178 1.8 544.6 1.0X +q47-v2.7 1841 2000 226 1.6 619.6 1.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q49-v2.7 617 669 35 9.1 109.8 1.0X +q49-v2.7 620 682 63 9.1 110.4 1.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 
6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q51a-v2.7 13989 14478 692 0.3 3809.9 1.0X +q51a-v2.7 14407 14835 605 0.3 3923.7 1.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q57-v2.7 799 867 82 1.9 521.6 1.0X +q57-v2.7 966 1157 271 1.6 630.8 1.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q64-v2.7 2391 2749 506 2.9 345.5 1.0X +q64-v2.7 2494 2897 570 2.8 360.3 1.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q67a-v2.7 7040 7300 367 0.4 2369.8 1.0X +q67a-v2.7 7449 7556 152 0.4 2507.3 1.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
------------------------------------------------------------------------------------------------------------------------ -q70a-v2.7 702 723 33 4.2 237.8 1.0X +q70a-v2.7 713 751 47 4.1 241.6 1.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q72-v2.7 92914 94378 2071 0.2 6053.8 1.0X +q72-v2.7 121071 121592 736 0.1 7888.4 1.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q74-v2.7 1301 1522 314 2.9 344.9 1.0X +q74-v2.7 1213 1347 190 3.1 321.5 1.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q75-v2.7 1729 1814 121 3.3 306.9 1.0X +q75-v2.7 1379 1739 509 4.1 244.7 1.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q77a-v2.7 809 874 92 6.9 144.1 1.0X +q77a-v2.7 1231 1282 72 4.6 219.2 1.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS 
on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q78-v2.7 2053 2458 573 2.7 365.5 1.0X +q78-v2.7 2218 2579 511 2.5 395.0 1.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q80a-v2.7 1612 1662 72 3.5 285.5 1.0X +q80a-v2.7 1873 1912 56 3.0 331.7 1.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q86a-v2.7 244 278 29 3.3 300.6 1.0X +q86a-v2.7 232 262 24 3.5 286.7 1.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q98-v2.7 289 308 19 10.3 97.2 1.0X +q98-v2.7 285 300 12 10.4 95.8 1.0X diff --git a/sql/core/benchmarks/TakeOrderedAndProjectBenchmark-jdk21-results.txt b/sql/core/benchmarks/TakeOrderedAndProjectBenchmark-jdk21-results.txt index c746cde05060a..4888365b839e0 100644 --- a/sql/core/benchmarks/TakeOrderedAndProjectBenchmark-jdk21-results.txt +++ 
b/sql/core/benchmarks/TakeOrderedAndProjectBenchmark-jdk21-results.txt @@ -2,11 +2,11 @@ TakeOrderedAndProject ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TakeOrderedAndProject with SMJ: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------------- -TakeOrderedAndProject with SMJ for doExecute 214 243 27 0.0 21428.5 1.0X -TakeOrderedAndProject with SMJ for executeCollect 97 102 4 0.1 9748.1 2.2X +TakeOrderedAndProject with SMJ for doExecute 160 200 39 0.1 15968.9 1.0X +TakeOrderedAndProject with SMJ for executeCollect 96 101 5 0.1 9562.9 1.7X diff --git a/sql/core/benchmarks/TakeOrderedAndProjectBenchmark-results.txt b/sql/core/benchmarks/TakeOrderedAndProjectBenchmark-results.txt index 1fa4496d6aea0..0cbc1823e7d29 100644 --- a/sql/core/benchmarks/TakeOrderedAndProjectBenchmark-results.txt +++ b/sql/core/benchmarks/TakeOrderedAndProjectBenchmark-results.txt @@ -2,11 +2,11 @@ TakeOrderedAndProject ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor TakeOrderedAndProject with SMJ: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------------- -TakeOrderedAndProject with SMJ for doExecute 262 286 31 0.0 26165.4 1.0X -TakeOrderedAndProject with SMJ for executeCollect 107 113 7 0.1 10681.8 2.4X +TakeOrderedAndProject with SMJ for doExecute 194 275 70 0.1 19414.1 1.0X 
+TakeOrderedAndProject with SMJ for executeCollect 118 119 2 0.1 11785.9 1.6X diff --git a/sql/core/benchmarks/TopKBenchmark-jdk21-results.txt b/sql/core/benchmarks/TopKBenchmark-jdk21-results.txt index 269fdd7c815a2..7c2ab96ac4ec2 100644 --- a/sql/core/benchmarks/TopKBenchmark-jdk21-results.txt +++ b/sql/core/benchmarks/TopKBenchmark-jdk21-results.txt @@ -2,21 +2,21 @@ Top-K Computation ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Benchmark Top-K: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ----------------------------------------------------------------------------------------------------------------------------------------------- -ROW_NUMBER (PARTITION: , WindowGroupLimit: false) 9148 9493 278 2.3 436.2 1.0X -ROW_NUMBER (PARTITION: , WindowGroupLimit: true) 1698 1731 48 12.4 81.0 5.4X -ROW_NUMBER (PARTITION: PARTITION BY b, WindowGroupLimit: false) 12103 12157 56 1.7 577.1 0.8X -ROW_NUMBER (PARTITION: PARTITION BY b, WindowGroupLimit: true) 5049 5211 98 4.2 240.8 1.8X -RANK (PARTITION: , WindowGroupLimit: false) 9596 9842 149 2.2 457.6 1.0X -RANK (PARTITION: , WindowGroupLimit: true) 1896 2059 112 11.1 90.4 4.8X -RANK (PARTITION: PARTITION BY b, WindowGroupLimit: false) 12338 12642 150 1.7 588.3 0.7X -RANK (PARTITION: PARTITION BY b, WindowGroupLimit: true) 4985 5179 95 4.2 237.7 1.8X -DENSE_RANK (PARTITION: , WindowGroupLimit: false) 9389 9628 171 2.2 447.7 1.0X -DENSE_RANK (PARTITION: , WindowGroupLimit: true) 1849 1900 71 11.3 88.2 4.9X -DENSE_RANK (PARTITION: PARTITION BY b, WindowGroupLimit: false) 12027 12393 186 1.7 573.5 0.8X -DENSE_RANK (PARTITION: PARTITION BY b, WindowGroupLimit: true) 5018 5083 47 4.2 239.3 1.8X +ROW_NUMBER (PARTITION: , WindowGroupLimit: false) 8651 8928 175 2.4 412.5 1.0X +ROW_NUMBER (PARTITION: , 
WindowGroupLimit: true) 1629 1647 13 12.9 77.7 5.3X +ROW_NUMBER (PARTITION: PARTITION BY b, WindowGroupLimit: false) 11038 11214 143 1.9 526.3 0.8X +ROW_NUMBER (PARTITION: PARTITION BY b, WindowGroupLimit: true) 5061 5109 33 4.1 241.3 1.7X +RANK (PARTITION: , WindowGroupLimit: false) 9458 9640 159 2.2 451.0 0.9X +RANK (PARTITION: , WindowGroupLimit: true) 1728 1749 16 12.1 82.4 5.0X +RANK (PARTITION: PARTITION BY b, WindowGroupLimit: false) 11732 11986 175 1.8 559.4 0.7X +RANK (PARTITION: PARTITION BY b, WindowGroupLimit: true) 4823 4980 69 4.3 230.0 1.8X +DENSE_RANK (PARTITION: , WindowGroupLimit: false) 9262 9491 104 2.3 441.6 0.9X +DENSE_RANK (PARTITION: , WindowGroupLimit: true) 1961 1978 19 10.7 93.5 4.4X +DENSE_RANK (PARTITION: PARTITION BY b, WindowGroupLimit: false) 11727 11982 139 1.8 559.2 0.7X +DENSE_RANK (PARTITION: PARTITION BY b, WindowGroupLimit: true) 4876 4978 120 4.3 232.5 1.8X diff --git a/sql/core/benchmarks/TopKBenchmark-results.txt b/sql/core/benchmarks/TopKBenchmark-results.txt index 76efbf1397b08..4b335ce3e2d98 100644 --- a/sql/core/benchmarks/TopKBenchmark-results.txt +++ b/sql/core/benchmarks/TopKBenchmark-results.txt @@ -2,21 +2,21 @@ Top-K Computation ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Benchmark Top-K: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ----------------------------------------------------------------------------------------------------------------------------------------------- -ROW_NUMBER (PARTITION: , WindowGroupLimit: false) 9462 9625 131 2.2 451.2 1.0X -ROW_NUMBER (PARTITION: , WindowGroupLimit: true) 1653 1694 28 12.7 78.8 5.7X -ROW_NUMBER (PARTITION: PARTITION BY b, WindowGroupLimit: false) 11977 12058 82 1.8 571.1 0.8X -ROW_NUMBER (PARTITION: PARTITION BY b, WindowGroupLimit: 
true) 5021 5081 35 4.2 239.4 1.9X -RANK (PARTITION: , WindowGroupLimit: false) 10017 10132 79 2.1 477.6 0.9X -RANK (PARTITION: , WindowGroupLimit: true) 1948 1984 22 10.8 92.9 4.9X -RANK (PARTITION: PARTITION BY b, WindowGroupLimit: false) 12477 12533 73 1.7 594.9 0.8X -RANK (PARTITION: PARTITION BY b, WindowGroupLimit: true) 5033 5090 36 4.2 240.0 1.9X -DENSE_RANK (PARTITION: , WindowGroupLimit: false) 9757 9841 63 2.1 465.3 1.0X -DENSE_RANK (PARTITION: , WindowGroupLimit: true) 1968 1996 30 10.7 93.8 4.8X -DENSE_RANK (PARTITION: PARTITION BY b, WindowGroupLimit: false) 12419 12483 47 1.7 592.2 0.8X -DENSE_RANK (PARTITION: PARTITION BY b, WindowGroupLimit: true) 5060 5128 53 4.1 241.3 1.9X +ROW_NUMBER (PARTITION: , WindowGroupLimit: false) 9179 9279 81 2.3 437.7 1.0X +ROW_NUMBER (PARTITION: , WindowGroupLimit: true) 1609 1637 16 13.0 76.7 5.7X +ROW_NUMBER (PARTITION: PARTITION BY b, WindowGroupLimit: false) 11629 11673 34 1.8 554.5 0.8X +ROW_NUMBER (PARTITION: PARTITION BY b, WindowGroupLimit: true) 5008 5038 23 4.2 238.8 1.8X +RANK (PARTITION: , WindowGroupLimit: false) 9720 9859 339 2.2 463.5 0.9X +RANK (PARTITION: , WindowGroupLimit: true) 1955 1990 15 10.7 93.2 4.7X +RANK (PARTITION: PARTITION BY b, WindowGroupLimit: false) 12101 12208 135 1.7 577.0 0.8X +RANK (PARTITION: PARTITION BY b, WindowGroupLimit: true) 5019 5127 167 4.2 239.3 1.8X +DENSE_RANK (PARTITION: , WindowGroupLimit: false) 9673 9748 62 2.2 461.3 0.9X +DENSE_RANK (PARTITION: , WindowGroupLimit: true) 1972 2007 31 10.6 94.0 4.7X +DENSE_RANK (PARTITION: PARTITION BY b, WindowGroupLimit: false) 12099 12148 42 1.7 576.9 0.8X +DENSE_RANK (PARTITION: PARTITION BY b, WindowGroupLimit: true) 4982 5028 28 4.2 237.5 1.8X diff --git a/sql/core/benchmarks/UDFBenchmark-jdk21-results.txt b/sql/core/benchmarks/UDFBenchmark-jdk21-results.txt index 81efa0b9b3a72..7c2f56761d1cd 100644 --- a/sql/core/benchmarks/UDFBenchmark-jdk21-results.txt +++ b/sql/core/benchmarks/UDFBenchmark-jdk21-results.txt @@ -2,58 +2,58 
@@ UDF with mixed input types ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor long/nullable int/string to string: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------------- -long/nullable int/string to string wholestage off 129 165 50 0.8 1291.3 1.0X -long/nullable int/string to string wholestage on 64 74 6 1.6 638.6 2.0X +long/nullable int/string to string wholestage off 137 144 9 0.7 1370.6 1.0X +long/nullable int/string to string wholestage on 72 82 11 1.4 719.9 1.9X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor long/nullable int/string to option: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------------- -long/nullable int/string to option wholestage off 47 66 28 2.2 465.1 1.0X -long/nullable int/string to option wholestage on 34 39 6 2.9 343.2 1.4X +long/nullable int/string to option wholestage off 43 49 9 2.3 427.1 1.0X +long/nullable int/string to option wholestage on 37 42 6 2.7 374.6 1.1X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor long/nullable int/string to primitive: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------------ -long/nullable int/string to primitive wholestage off 30 31 1 3.3 299.6 1.0X 
-long/nullable int/string to primitive wholestage on 28 29 2 3.6 280.4 1.1X +long/nullable int/string to primitive wholestage off 34 39 7 2.9 340.1 1.0X +long/nullable int/string to primitive wholestage on 32 35 4 3.2 315.5 1.1X ================================================================================================ UDF with primitive types ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor long/nullable int to string: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------------------------------------- -long/nullable int to string wholestage off 30 31 2 3.4 297.4 1.0X -long/nullable int to string wholestage on 28 35 5 3.5 283.9 1.0X +long/nullable int to string wholestage off 30 30 0 3.3 301.5 1.0X +long/nullable int to string wholestage on 31 33 1 3.3 306.2 1.0X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor long/nullable int to option: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------------------------------------- -long/nullable int to option wholestage off 22 26 6 4.6 219.3 1.0X -long/nullable int to option wholestage on 21 23 1 4.7 214.0 1.0X +long/nullable int to option wholestage off 22 25 4 4.5 224.4 1.0X +long/nullable int to option wholestage on 23 28 6 4.4 228.9 1.0X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor long/nullable int to primitive: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) 
Relative ----------------------------------------------------------------------------------------------------------------------------- -long/nullable int to primitive wholestage off 18 19 1 5.6 179.0 1.0X -long/nullable int to primitive wholestage on 18 19 1 5.6 179.7 1.0X +long/nullable int to primitive wholestage off 24 25 2 4.2 235.5 1.0X +long/nullable int to primitive wholestage on 19 20 1 5.3 189.2 1.2X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor UDF identity overhead: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Baseline 13 19 8 7.9 125.8 1.0X -With identity UDF 16 18 2 6.2 160.4 0.8X +Baseline 13 15 2 7.9 126.9 1.0X +With identity UDF 18 20 3 5.7 176.4 0.7X diff --git a/sql/core/benchmarks/UDFBenchmark-results.txt b/sql/core/benchmarks/UDFBenchmark-results.txt index 818b51532da74..4e6c9dfdc74e6 100644 --- a/sql/core/benchmarks/UDFBenchmark-results.txt +++ b/sql/core/benchmarks/UDFBenchmark-results.txt @@ -2,58 +2,58 @@ UDF with mixed input types ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor long/nullable int/string to string: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------------- -long/nullable int/string to string wholestage off 165 166 1 0.6 1648.2 1.0X -long/nullable int/string to string wholestage on 87 114 18 1.1 869.9 1.9X +long/nullable int/string to string wholestage off 130 141 15 0.8 1299.8 1.0X +long/nullable int/string to string 
wholestage on 92 97 6 1.1 922.8 1.4X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor long/nullable int/string to option: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------------- -long/nullable int/string to option wholestage off 66 68 3 1.5 659.8 1.0X -long/nullable int/string to option wholestage on 53 63 6 1.9 525.8 1.3X +long/nullable int/string to option wholestage off 52 57 7 1.9 523.5 1.0X +long/nullable int/string to option wholestage on 42 49 5 2.4 420.5 1.2X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor long/nullable int/string to primitive: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------------ -long/nullable int/string to primitive wholestage off 34 40 8 3.0 338.5 1.0X -long/nullable int/string to primitive wholestage on 28 30 1 3.6 280.4 1.2X +long/nullable int/string to primitive wholestage off 30 34 5 3.3 301.7 1.0X +long/nullable int/string to primitive wholestage on 31 33 2 3.2 312.3 1.0X ================================================================================================ UDF with primitive types ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor long/nullable int to string: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
-------------------------------------------------------------------------------------------------------------------------- -long/nullable int to string wholestage off 28 30 2 3.5 284.0 1.0X -long/nullable int to string wholestage on 29 33 5 3.4 293.0 1.0X +long/nullable int to string wholestage off 31 32 1 3.2 312.6 1.0X +long/nullable int to string wholestage on 31 32 1 3.2 309.5 1.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor long/nullable int to option: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------------------------------------- -long/nullable int to option wholestage off 22 23 1 4.6 218.1 1.0X -long/nullable int to option wholestage on 22 23 1 4.5 224.7 1.0X +long/nullable int to option wholestage off 23 23 0 4.4 226.9 1.0X +long/nullable int to option wholestage on 24 25 2 4.2 240.8 0.9X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor long/nullable int to primitive: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ----------------------------------------------------------------------------------------------------------------------------- -long/nullable int to primitive wholestage off 16 16 0 6.4 157.3 1.0X -long/nullable int to primitive wholestage on 18 21 4 5.7 175.0 0.9X +long/nullable int to primitive wholestage off 18 20 3 5.5 180.6 1.0X +long/nullable int to primitive wholestage on 19 21 3 5.2 193.0 0.9X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor UDF identity overhead: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
------------------------------------------------------------------------------------------------------------------------ -Baseline 14 16 1 7.1 141.4 1.0X -With identity UDF 14 16 3 6.9 144.3 1.0X +Baseline 13 16 2 7.5 133.4 1.0X +With identity UDF 17 18 1 6.1 165.2 0.8X diff --git a/sql/core/benchmarks/UnsafeArrayDataBenchmark-jdk21-results.txt b/sql/core/benchmarks/UnsafeArrayDataBenchmark-jdk21-results.txt index d11fd0406e1b4..8148196e6b688 100644 --- a/sql/core/benchmarks/UnsafeArrayDataBenchmark-jdk21-results.txt +++ b/sql/core/benchmarks/UnsafeArrayDataBenchmark-jdk21-results.txt @@ -2,32 +2,32 @@ Benchmark UnsafeArrayData ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Read UnsafeArrayData: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Int 74 74 1 2281.5 0.4 1.0X -Double 158 158 0 1064.2 0.9 0.5X +Int 76 77 1 2202.3 0.5 1.0X +Double 159 159 0 1055.9 0.9 0.5X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Write UnsafeArrayData: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Int 12 13 1 1709.6 0.6 1.0X -Double 28 33 4 737.4 1.4 0.4X +Int 12 13 1 1797.9 0.6 1.0X +Double 28 33 4 748.6 1.3 0.4X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Get primitive array from UnsafeArrayData: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) 
Relative ------------------------------------------------------------------------------------------------------------------------ -Int 19 21 1 3322.4 0.3 1.0X -Double 39 42 2 1600.5 0.6 0.5X +Int 20 21 1 3164.3 0.3 1.0X +Double 40 42 1 1561.8 0.6 0.5X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Create UnsafeArrayData from primitive array: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------- -Int 20 24 2 3069.8 0.3 1.0X -Double 44 49 3 1444.2 0.7 0.5X +Int 20 22 1 3198.7 0.3 1.0X +Double 42 45 2 1508.3 0.7 0.5X diff --git a/sql/core/benchmarks/UnsafeArrayDataBenchmark-results.txt b/sql/core/benchmarks/UnsafeArrayDataBenchmark-results.txt index 79032e13c0de3..1c9c5f4f5b4c3 100644 --- a/sql/core/benchmarks/UnsafeArrayDataBenchmark-results.txt +++ b/sql/core/benchmarks/UnsafeArrayDataBenchmark-results.txt @@ -2,32 +2,32 @@ Benchmark UnsafeArrayData ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Read UnsafeArrayData: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Int 76 76 0 2215.1 0.5 1.0X -Double 158 158 0 1062.9 0.9 0.5X +Int 76 77 3 2206.0 0.5 1.0X +Double 158 159 1 1060.4 0.9 0.5X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Write UnsafeArrayData: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
------------------------------------------------------------------------------------------------------------------------ -Int 12 14 1 1690.2 0.6 1.0X -Double 31 33 1 687.1 1.5 0.4X +Int 13 15 1 1671.6 0.6 1.0X +Double 28 34 3 738.8 1.4 0.4X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Get primitive array from UnsafeArrayData: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Int 21 23 2 3025.8 0.3 1.0X -Double 45 48 1 1410.8 0.7 0.5X +Int 19 22 2 3250.9 0.3 1.0X +Double 40 43 2 1572.4 0.6 0.5X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Create UnsafeArrayData from primitive array: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------- -Int 22 24 1 2902.4 0.3 1.0X -Double 46 49 1 1374.3 0.7 0.5X +Int 20 23 2 3201.2 0.3 1.0X +Double 43 46 1 1460.5 0.7 0.5X diff --git a/sql/core/benchmarks/UpdateFieldsBenchmark-jdk21-results.txt b/sql/core/benchmarks/UpdateFieldsBenchmark-jdk21-results.txt index c5756342d99d5..eac137fad9594 100644 --- a/sql/core/benchmarks/UpdateFieldsBenchmark-jdk21-results.txt +++ b/sql/core/benchmarks/UpdateFieldsBenchmark-jdk21-results.txt @@ -2,25 +2,25 @@ Add 2 columns and drop 2 columns at 3 different depths of nesting ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Add 2 columns and drop 2 columns at 3 different depths of 
nesting: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------------------------- -To non-nullable StructTypes using performant method 3 4 1 0.0 Infinity 1.0X -To nullable StructTypes using performant method 1 2 0 0.0 Infinity 1.9X -To non-nullable StructTypes using non-performant method 18 21 2 0.0 Infinity 0.1X -To nullable StructTypes using non-performant method 792 836 39 0.0 Infinity 0.0X +To non-nullable StructTypes using performant method 1 2 1 0.0 Infinity 1.0X +To nullable StructTypes using performant method 1 1 0 0.0 Infinity 1.4X +To non-nullable StructTypes using non-performant method 15 16 1 0.0 Infinity 0.1X +To nullable StructTypes using non-performant method 542 561 13 0.0 Infinity 0.0X ================================================================================================ Add 50 columns and drop 50 columns at 100 different depths of nesting ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Add 50 columns and drop 50 columns at 100 different depths of nesting: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ----------------------------------------------------------------------------------------------------------------------------------------------------- -To non-nullable StructTypes using performant method 1088 1112 34 0.0 Infinity 1.0X -To nullable StructTypes using performant method 1150 1151 1 0.0 Infinity 0.9X +To non-nullable StructTypes using performant method 168 172 4 0.0 Infinity 1.0X +To nullable StructTypes using performant method 196 200 5 0.0 Infinity 0.9X diff --git a/sql/core/benchmarks/UpdateFieldsBenchmark-results.txt 
b/sql/core/benchmarks/UpdateFieldsBenchmark-results.txt index 7bc440e192516..6e137bec68e30 100644 --- a/sql/core/benchmarks/UpdateFieldsBenchmark-results.txt +++ b/sql/core/benchmarks/UpdateFieldsBenchmark-results.txt @@ -2,25 +2,25 @@ Add 2 columns and drop 2 columns at 3 different depths of nesting ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Add 2 columns and drop 2 columns at 3 different depths of nesting: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------------------------- -To non-nullable StructTypes using performant method 2 3 1 0.0 Infinity 1.0X +To non-nullable StructTypes using performant method 2 2 1 0.0 Infinity 1.0X To nullable StructTypes using performant method 1 1 0 0.0 Infinity 1.4X -To non-nullable StructTypes using non-performant method 18 19 2 0.0 Infinity 0.1X -To nullable StructTypes using non-performant method 846 885 45 0.0 Infinity 0.0X +To non-nullable StructTypes using non-performant method 16 17 1 0.0 Infinity 0.1X +To nullable StructTypes using non-performant method 565 597 25 0.0 Infinity 0.0X ================================================================================================ Add 50 columns and drop 50 columns at 100 different depths of nesting ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Add 50 columns and drop 50 columns at 100 different depths of nesting: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
----------------------------------------------------------------------------------------------------------------------------------------------------- -To non-nullable StructTypes using performant method 1087 1109 31 0.0 Infinity 1.0X -To nullable StructTypes using performant method 1123 1190 95 0.0 Infinity 1.0X +To non-nullable StructTypes using performant method 178 188 8 0.0 Infinity 1.0X +To nullable StructTypes using performant method 207 212 5 0.0 Infinity 0.9X diff --git a/sql/core/benchmarks/V2FunctionBenchmark-jdk21-results.txt b/sql/core/benchmarks/V2FunctionBenchmark-jdk21-results.txt index 49cf58086a51c..143c6c57232e1 100644 --- a/sql/core/benchmarks/V2FunctionBenchmark-jdk21-results.txt +++ b/sql/core/benchmarks/V2FunctionBenchmark-jdk21-results.txt @@ -1,44 +1,44 @@ -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor scalar function (long + long) -> long, result_nullable = true codegen = true: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------------------------------------ -native_long_add 9542 9554 11 52.4 19.1 1.0X -java_long_add_default 22433 22756 485 22.3 44.9 0.4X -java_long_add_magic 11747 11782 44 42.6 23.5 0.8X -java_long_add_static_magic 11539 11594 48 43.3 23.1 0.8X -scala_long_add_default 23789 25196 2336 21.0 47.6 0.4X -scala_long_add_magic 11714 11758 38 42.7 23.4 0.8X +native_long_add 9807 10151 549 51.0 19.6 1.0X +java_long_add_default 22932 22997 56 21.8 45.9 0.4X +java_long_add_magic 11408 11651 246 43.8 22.8 0.9X +java_long_add_static_magic 11451 11487 52 43.7 22.9 0.9X +scala_long_add_default 23554 23574 22 21.2 47.1 0.4X +scala_long_add_magic 11686 11710 33 42.8 23.4 0.8X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on 
Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor scalar function (long + long) -> long, result_nullable = false codegen = true: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------------------------------------- -native_long_add 10296 10347 45 48.6 20.6 1.0X -java_long_add_default 22464 23279 1403 22.3 44.9 0.5X -java_long_add_magic 11775 11807 33 42.5 23.5 0.9X -java_long_add_static_magic 10049 10065 16 49.8 20.1 1.0X -scala_long_add_default 22436 24439 3455 22.3 44.9 0.5X -scala_long_add_magic 11815 11895 108 42.3 23.6 0.9X +native_long_add 10397 10439 41 48.1 20.8 1.0X +java_long_add_default 22679 22712 33 22.0 45.4 0.5X +java_long_add_magic 11595 11695 100 43.1 23.2 0.9X +java_long_add_static_magic 10111 10146 38 49.5 20.2 1.0X +scala_long_add_default 22592 22624 27 22.1 45.2 0.5X +scala_long_add_magic 11593 11648 52 43.1 23.2 0.9X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor scalar function (long + long) -> long, result_nullable = true codegen = false: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------------------------------------- -native_long_add 22445 22924 448 22.3 44.9 1.0X -java_long_add_default 26468 26478 10 18.9 52.9 0.8X -java_long_add_magic 32917 32937 26 15.2 65.8 0.7X -java_long_add_static_magic 31424 31496 108 15.9 62.8 0.7X -scala_long_add_default 26265 26358 100 19.0 52.5 0.9X -scala_long_add_magic 33764 34033 423 14.8 67.5 0.7X +native_long_add 22769 22882 160 22.0 45.5 1.0X +java_long_add_default 27959 28164 261 17.9 55.9 0.8X +java_long_add_magic 32664 32703 36 15.3 65.3 0.7X +java_long_add_static_magic 31044 31293 407 16.1 62.1 0.7X 
+scala_long_add_default 26593 26614 18 18.8 53.2 0.9X +scala_long_add_magic 32909 33049 170 15.2 65.8 0.7X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor scalar function (long + long) -> long, result_nullable = false codegen = false: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------------------------------------------------------------------------- -native_long_add 21582 22336 1304 23.2 43.2 1.0X -java_long_add_default 25194 25472 475 19.8 50.4 0.9X -java_long_add_magic 32678 32727 43 15.3 65.4 0.7X -java_long_add_static_magic 30357 30481 214 16.5 60.7 0.7X -scala_long_add_default 25166 25413 392 19.9 50.3 0.9X -scala_long_add_magic 32759 32773 12 15.3 65.5 0.7X +native_long_add 22966 22981 26 21.8 45.9 1.0X +java_long_add_default 26581 26697 133 18.8 53.2 0.9X +java_long_add_magic 32925 33042 131 15.2 65.9 0.7X +java_long_add_static_magic 31046 32306 2072 16.1 62.1 0.7X +scala_long_add_default 26648 26670 19 18.8 53.3 0.9X +scala_long_add_magic 32969 33052 129 15.2 65.9 0.7X diff --git a/sql/core/benchmarks/V2FunctionBenchmark-results.txt b/sql/core/benchmarks/V2FunctionBenchmark-results.txt index dca57e380c1a2..8dcacf05fa0eb 100644 --- a/sql/core/benchmarks/V2FunctionBenchmark-results.txt +++ b/sql/core/benchmarks/V2FunctionBenchmark-results.txt @@ -1,44 +1,44 @@ -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor scalar function (long + long) -> long, result_nullable = true codegen = true: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------------------------------------ -native_long_add 
9323 9391 60 53.6 18.6 1.0X -java_long_add_default 22346 22797 425 22.4 44.7 0.4X -java_long_add_magic 10786 10800 13 46.4 21.6 0.9X -java_long_add_static_magic 10625 10748 169 47.1 21.2 0.9X -scala_long_add_default 22788 22840 47 21.9 45.6 0.4X -scala_long_add_magic 10709 10767 51 46.7 21.4 0.9X +native_long_add 9192 9271 105 54.4 18.4 1.0X +java_long_add_default 22377 22680 265 22.3 44.8 0.4X +java_long_add_magic 10753 10776 28 46.5 21.5 0.9X +java_long_add_static_magic 10564 11517 825 47.3 21.1 0.9X +scala_long_add_default 23011 23250 273 21.7 46.0 0.4X +scala_long_add_magic 10654 10734 97 46.9 21.3 0.9X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor scalar function (long + long) -> long, result_nullable = false codegen = true: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------------------------------------- -native_long_add 9743 9901 137 51.3 19.5 1.0X -java_long_add_default 22268 22278 10 22.5 44.5 0.4X -java_long_add_magic 10735 10785 44 46.6 21.5 0.9X -java_long_add_static_magic 9964 10028 94 50.2 19.9 1.0X -scala_long_add_default 21995 22058 63 22.7 44.0 0.4X -scala_long_add_magic 10726 10757 42 46.6 21.5 0.9X +native_long_add 10044 10057 12 49.8 20.1 1.0X +java_long_add_default 22261 22343 81 22.5 44.5 0.5X +java_long_add_magic 10632 10644 17 47.0 21.3 0.9X +java_long_add_static_magic 9940 9974 59 50.3 19.9 1.0X +scala_long_add_default 22279 22349 68 22.4 44.6 0.5X +scala_long_add_magic 10616 10639 21 47.1 21.2 0.9X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor scalar function (long + long) -> long, result_nullable = true codegen = false: Best Time(ms) Avg Time(ms) Stdev(ms) 
Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------------------------------------- -native_long_add 22837 22861 31 21.9 45.7 1.0X -java_long_add_default 28062 28099 41 17.8 56.1 0.8X -java_long_add_magic 32026 33081 1131 15.6 64.1 0.7X -java_long_add_static_magic 32031 32038 8 15.6 64.1 0.7X -scala_long_add_default 26219 26263 63 19.1 52.4 0.9X -scala_long_add_magic 32113 32182 65 15.6 64.2 0.7X +native_long_add 22642 23179 680 22.1 45.3 1.0X +java_long_add_default 27400 27497 102 18.2 54.8 0.8X +java_long_add_magic 31896 31958 66 15.7 63.8 0.7X +java_long_add_static_magic 30630 31059 389 16.3 61.3 0.7X +scala_long_add_default 26240 26339 156 19.1 52.5 0.9X +scala_long_add_magic 32268 32325 90 15.5 64.5 0.7X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor scalar function (long + long) -> long, result_nullable = false codegen = false: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------------------------------------------------------------------------- -native_long_add 22056 22271 294 22.7 44.1 1.0X -java_long_add_default 25840 25884 40 19.3 51.7 0.9X -java_long_add_magic 31928 31992 55 15.7 63.9 0.7X -java_long_add_static_magic 31464 31507 46 15.9 62.9 0.7X -scala_long_add_default 25851 25932 107 19.3 51.7 0.9X -scala_long_add_magic 32315 32881 629 15.5 64.6 0.7X +native_long_add 21853 22201 538 22.9 43.7 1.0X +java_long_add_default 25860 25886 34 19.3 51.7 0.8X +java_long_add_magic 32191 32350 218 15.5 64.4 0.7X +java_long_add_static_magic 30755 30812 52 16.3 61.5 0.7X +scala_long_add_default 25872 25923 69 19.3 51.7 0.8X +scala_long_add_magic 31910 31922 14 15.7 63.8 0.7X diff --git 
a/sql/core/benchmarks/WideSchemaBenchmark-jdk21-results.txt b/sql/core/benchmarks/WideSchemaBenchmark-jdk21-results.txt index c4b6ef29d7074..91e51c6833aa5 100644 --- a/sql/core/benchmarks/WideSchemaBenchmark-jdk21-results.txt +++ b/sql/core/benchmarks/WideSchemaBenchmark-jdk21-results.txt @@ -2,157 +2,157 @@ parsing large select expressions ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor parsing large select: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -1 select expressions 1 2 1 0.0 1196151.0 1.0X -100 select expressions 2 3 1 0.0 2095800.0 0.6X -2500 select expressions 36 39 4 0.0 35701821.0 0.0X +1 select expressions 1 1 0 0.0 669297.0 1.0X +100 select expressions 3 3 1 0.0 2920356.0 0.2X +2500 select expressions 63 65 1 0.0 63383411.0 0.0X ================================================================================================ optimize large select expressions ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor optimize large select: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -100 columns 4 5 1 0.0 4384067.0 1.0X -1000 columns 28 29 1 0.0 27845199.0 0.2X -10000 columns 287 294 7 0.0 286788665.0 0.0X +100 columns 6 7 1 0.0 6257029.0 1.0X +1000 columns 48 49 1 0.0 47583298.0 0.1X +10000 columns 488 504 11 0.0 487843016.0 0.0X 
================================================================================================ many column field read and write ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor many column field r/w: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -1 cols x 100000 rows (read in-mem) 16 22 5 6.4 157.3 1.0X -1 cols x 100000 rows (exec in-mem) 18 23 5 5.6 179.9 0.9X -1 cols x 100000 rows (read parquet) 30 37 7 3.3 302.7 0.5X -1 cols x 100000 rows (write parquet) 98 106 8 1.0 978.0 0.2X -100 cols x 1000 rows (read in-mem) 12 17 4 8.1 123.9 1.3X -100 cols x 1000 rows (exec in-mem) 15 19 5 6.5 153.2 1.0X -100 cols x 1000 rows (read parquet) 24 30 7 4.1 244.2 0.6X -100 cols x 1000 rows (write parquet) 93 103 9 1.1 932.3 0.2X -2500 cols x 40 rows (read in-mem) 55 58 4 1.8 545.9 0.3X -2500 cols x 40 rows (exec in-mem) 100 107 6 1.0 995.7 0.2X -2500 cols x 40 rows (read parquet) 306 308 3 0.3 3060.5 0.1X -2500 cols x 40 rows (write parquet) 135 144 10 0.7 1349.9 0.1X +1 cols x 100000 rows (read in-mem) 15 22 5 6.8 147.1 1.0X +1 cols x 100000 rows (exec in-mem) 16 23 5 6.3 158.4 0.9X +1 cols x 100000 rows (read parquet) 28 35 7 3.6 281.5 0.5X +1 cols x 100000 rows (write parquet) 90 102 8 1.1 897.3 0.2X +100 cols x 1000 rows (read in-mem) 12 16 4 8.3 120.4 1.2X +100 cols x 1000 rows (exec in-mem) 15 18 4 6.8 146.1 1.0X +100 cols x 1000 rows (read parquet) 24 28 7 4.2 237.6 0.6X +100 cols x 1000 rows (write parquet) 85 92 6 1.2 847.2 0.2X +2500 cols x 40 rows (read in-mem) 43 45 5 2.3 426.0 0.3X +2500 cols x 40 rows (exec in-mem) 71 74 4 1.4 708.9 0.2X +2500 cols x 40 rows (read parquet) 295 301 4 0.3 2945.7 0.0X +2500 cols x 40 
rows (write parquet) 115 119 5 0.9 1153.7 0.1X ================================================================================================ wide shallowly nested struct field read and write ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor wide shallowly nested struct field r/w: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -1 wide x 100000 rows (read in-mem) 20 26 6 5.0 201.6 1.0X -1 wide x 100000 rows (exec in-mem) 22 28 7 4.5 223.4 0.9X -1 wide x 100000 rows (read parquet) 25 31 8 4.0 249.3 0.8X -1 wide x 100000 rows (write parquet) 99 109 9 1.0 992.7 0.2X -100 wide x 1000 rows (read in-mem) 15 19 5 6.6 151.9 1.3X -100 wide x 1000 rows (exec in-mem) 23 28 6 4.4 229.1 0.9X -100 wide x 1000 rows (read parquet) 22 29 7 4.5 223.1 0.9X -100 wide x 1000 rows (write parquet) 95 103 7 1.1 947.8 0.2X -2500 wide x 40 rows (read in-mem) 23 27 5 4.3 231.0 0.9X -2500 wide x 40 rows (exec in-mem) 192 201 7 0.5 1920.9 0.1X -2500 wide x 40 rows (read parquet) 68 73 6 1.5 681.4 0.3X -2500 wide x 40 rows (write parquet) 102 107 7 1.0 1019.1 0.2X +1 wide x 100000 rows (read in-mem) 20 25 6 4.9 203.7 1.0X +1 wide x 100000 rows (exec in-mem) 21 26 7 4.7 211.7 1.0X +1 wide x 100000 rows (read parquet) 22 30 8 4.5 221.6 0.9X +1 wide x 100000 rows (write parquet) 94 103 9 1.1 939.9 0.2X +100 wide x 1000 rows (read in-mem) 15 18 5 6.8 147.8 1.4X +100 wide x 1000 rows (exec in-mem) 16 19 6 6.3 159.1 1.3X +100 wide x 1000 rows (read parquet) 22 27 7 4.6 216.4 0.9X +100 wide x 1000 rows (write parquet) 88 97 10 1.1 880.7 0.2X +2500 wide x 40 rows (read in-mem) 20 26 7 4.9 204.7 1.0X +2500 wide x 40 rows (exec in-mem) 22 27 7 4.6 216.8 0.9X 
+2500 wide x 40 rows (read parquet) 66 68 5 1.5 658.4 0.3X +2500 wide x 40 rows (write parquet) 94 103 10 1.1 941.6 0.2X ================================================================================================ deeply nested struct field read and write ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor deeply nested struct field r/w: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -1 deep x 100000 rows (read in-mem) 16 20 6 6.3 158.7 1.0X -1 deep x 100000 rows (exec in-mem) 18 20 4 5.6 177.3 0.9X -1 deep x 100000 rows (read parquet) 19 23 6 5.4 185.3 0.9X -1 deep x 100000 rows (write parquet) 95 104 8 1.1 951.5 0.2X -100 deep x 1000 rows (read in-mem) 44 47 3 2.3 444.0 0.4X -100 deep x 1000 rows (exec in-mem) 452 466 13 0.2 4520.5 0.0X -100 deep x 1000 rows (read parquet) 433 442 13 0.2 4329.1 0.0X -100 deep x 1000 rows (write parquet) 122 129 6 0.8 1224.3 0.1X -250 deep x 400 rows (read in-mem) 191 195 3 0.5 1909.7 0.1X -250 deep x 400 rows (exec in-mem) 2893 2909 23 0.0 28927.4 0.0X -250 deep x 400 rows (read parquet) 2595 2598 4 0.0 25951.8 0.0X -250 deep x 400 rows (write parquet) 268 273 4 0.4 2675.1 0.1X +1 deep x 100000 rows (read in-mem) 15 19 6 6.5 154.5 1.0X +1 deep x 100000 rows (exec in-mem) 18 21 6 5.7 175.5 0.9X +1 deep x 100000 rows (read parquet) 18 23 7 5.6 178.4 0.9X +1 deep x 100000 rows (write parquet) 89 96 8 1.1 887.6 0.2X +100 deep x 1000 rows (read in-mem) 43 46 5 2.3 432.1 0.4X +100 deep x 1000 rows (exec in-mem) 445 453 8 0.2 4448.2 0.0X +100 deep x 1000 rows (read parquet) 445 453 6 0.2 4449.9 0.0X +100 deep x 1000 rows (write parquet) 117 129 9 0.9 1168.8 0.1X +250 deep x 400 rows (read 
in-mem) 192 195 3 0.5 1915.7 0.1X +250 deep x 400 rows (exec in-mem) 2694 2700 9 0.0 26937.2 0.0X +250 deep x 400 rows (read parquet) 2683 2688 6 0.0 26833.1 0.0X +250 deep x 400 rows (write parquet) 267 273 4 0.4 2667.1 0.1X ================================================================================================ bushy struct field read and write ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor bushy struct field r/w: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------- -1 x 1 deep x 100000 rows (read in-mem) 13 16 4 7.4 134.4 1.0X -1 x 1 deep x 100000 rows (exec in-mem) 15 19 5 6.6 150.9 0.9X -1 x 1 deep x 100000 rows (read parquet) 18 22 5 5.6 177.6 0.8X -1 x 1 deep x 100000 rows (write parquet) 92 98 7 1.1 921.7 0.1X -128 x 8 deep x 1000 rows (read in-mem) 13 16 4 7.4 134.3 1.0X -128 x 8 deep x 1000 rows (exec in-mem) 25 28 5 4.0 248.9 0.5X -128 x 8 deep x 1000 rows (read parquet) 21 26 6 4.7 213.3 0.6X -128 x 8 deep x 1000 rows (write parquet) 91 98 8 1.1 911.6 0.1X -1024 x 11 deep x 100 rows (read in-mem) 19 22 4 5.3 187.6 0.7X -1024 x 11 deep x 100 rows (exec in-mem) 129 133 6 0.8 1286.3 0.1X -1024 x 11 deep x 100 rows (read parquet) 36 40 4 2.8 363.4 0.4X -1024 x 11 deep x 100 rows (write parquet) 96 102 10 1.0 962.5 0.1X +1 x 1 deep x 100000 rows (read in-mem) 13 15 4 7.7 130.2 1.0X +1 x 1 deep x 100000 rows (exec in-mem) 15 18 4 6.6 150.8 0.9X +1 x 1 deep x 100000 rows (read parquet) 18 21 5 5.6 177.4 0.7X +1 x 1 deep x 100000 rows (write parquet) 88 94 6 1.1 875.7 0.1X +128 x 8 deep x 1000 rows (read in-mem) 13 16 5 7.6 131.7 1.0X +128 x 8 deep x 1000 rows (exec in-mem) 15 18 5 6.7 148.7 0.9X +128 x 8 deep 
x 1000 rows (read parquet) 22 26 6 4.6 215.6 0.6X +128 x 8 deep x 1000 rows (write parquet) 86 93 6 1.2 864.9 0.2X +1024 x 11 deep x 100 rows (read in-mem) 18 21 6 5.7 176.2 0.7X +1024 x 11 deep x 100 rows (exec in-mem) 19 23 6 5.4 185.8 0.7X +1024 x 11 deep x 100 rows (read parquet) 35 37 4 2.9 349.8 0.4X +1024 x 11 deep x 100 rows (write parquet) 91 94 5 1.1 912.5 0.1X ================================================================================================ wide array field read and write ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor wide array field r/w: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -1 wide x 100000 rows (read in-mem) 15 18 4 6.6 151.1 1.0X -1 wide x 100000 rows (exec in-mem) 17 19 4 5.8 171.5 0.9X -1 wide x 100000 rows (read parquet) 17 21 5 5.8 172.7 0.9X -1 wide x 100000 rows (write parquet) 93 103 8 1.1 931.5 0.2X -100 wide x 1000 rows (read in-mem) 11 13 4 8.9 112.1 1.3X -100 wide x 1000 rows (exec in-mem) 13 15 4 7.8 128.9 1.2X -100 wide x 1000 rows (read parquet) 17 21 5 5.9 170.7 0.9X -100 wide x 1000 rows (write parquet) 90 98 9 1.1 900.2 0.2X -2500 wide x 40 rows (read in-mem) 11 13 3 9.0 111.5 1.4X -2500 wide x 40 rows (exec in-mem) 13 16 4 7.7 129.6 1.2X -2500 wide x 40 rows (read parquet) 17 19 4 5.9 168.5 0.9X -2500 wide x 40 rows (write parquet) 91 98 7 1.1 906.0 0.2X +1 wide x 100000 rows (read in-mem) 15 17 5 6.7 148.8 1.0X +1 wide x 100000 rows (exec in-mem) 17 19 4 5.9 170.7 0.9X +1 wide x 100000 rows (read parquet) 17 21 6 5.8 172.8 0.9X +1 wide x 100000 rows (write parquet) 88 93 9 1.1 878.8 0.2X +100 wide x 1000 rows (read in-mem) 11 14 5 9.1 110.1 1.4X +100 wide x 
1000 rows (exec in-mem) 13 16 5 7.9 127.1 1.2X +100 wide x 1000 rows (read parquet) 17 21 6 5.9 168.2 0.9X +100 wide x 1000 rows (write parquet) 83 90 8 1.2 832.1 0.2X +2500 wide x 40 rows (read in-mem) 11 15 6 9.1 109.8 1.4X +2500 wide x 40 rows (exec in-mem) 12 17 6 8.0 125.0 1.2X +2500 wide x 40 rows (read parquet) 17 23 7 6.0 167.5 0.9X +2500 wide x 40 rows (write parquet) 84 94 8 1.2 841.2 0.2X ================================================================================================ wide map field read and write ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor wide map field r/w: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -1 wide x 100000 rows (read in-mem) 12 14 3 8.3 121.0 1.0X -1 wide x 100000 rows (exec in-mem) 16 18 3 6.3 159.0 0.8X -1 wide x 100000 rows (read parquet) 21 24 5 4.7 213.2 0.6X -1 wide x 100000 rows (write parquet) 91 96 6 1.1 905.2 0.1X -100 wide x 1000 rows (read in-mem) 8 9 3 13.3 75.2 1.6X -100 wide x 1000 rows (exec in-mem) 10 12 3 10.4 96.1 1.3X -100 wide x 1000 rows (read parquet) 19 21 4 5.3 187.5 0.6X -100 wide x 1000 rows (write parquet) 86 90 5 1.2 858.4 0.1X -2500 wide x 40 rows (read in-mem) 9 11 2 10.8 92.8 1.3X -2500 wide x 40 rows (exec in-mem) 11 13 3 9.0 111.5 1.1X -2500 wide x 40 rows (read parquet) 19 22 4 5.2 191.8 0.6X -2500 wide x 40 rows (write parquet) 90 94 5 1.1 899.2 0.1X +1 wide x 100000 rows (read in-mem) 12 15 4 8.3 121.2 1.0X +1 wide x 100000 rows (exec in-mem) 16 21 5 6.3 159.3 0.8X +1 wide x 100000 rows (read parquet) 21 24 5 4.8 208.1 0.6X +1 wide x 100000 rows (write parquet) 84 89 5 1.2 842.0 0.1X +100 wide x 1000 rows (read in-mem) 8 9 2 13.1 76.5 
1.6X +100 wide x 1000 rows (exec in-mem) 10 11 3 10.5 95.3 1.3X +100 wide x 1000 rows (read parquet) 18 20 6 5.6 178.2 0.7X +100 wide x 1000 rows (write parquet) 80 85 6 1.2 801.0 0.2X +2500 wide x 40 rows (read in-mem) 9 10 3 10.7 93.4 1.3X +2500 wide x 40 rows (exec in-mem) 11 12 2 8.9 111.7 1.1X +2500 wide x 40 rows (read parquet) 18 21 6 5.5 183.2 0.7X +2500 wide x 40 rows (write parquet) 82 89 8 1.2 822.1 0.1X diff --git a/sql/core/benchmarks/WideSchemaBenchmark-results.txt b/sql/core/benchmarks/WideSchemaBenchmark-results.txt index e61b27a7c727f..4931872223cfe 100644 --- a/sql/core/benchmarks/WideSchemaBenchmark-results.txt +++ b/sql/core/benchmarks/WideSchemaBenchmark-results.txt @@ -2,157 +2,157 @@ parsing large select expressions ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor parsing large select: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -1 select expressions 1 2 1 0.0 1177503.0 1.0X -100 select expressions 2 3 1 0.0 2179549.0 0.5X -2500 select expressions 40 43 4 0.0 39575214.0 0.0X +1 select expressions 1 1 0 0.0 671442.0 1.0X +100 select expressions 3 3 0 0.0 3181250.0 0.2X +2500 select expressions 69 72 2 0.0 69457245.0 0.0X ================================================================================================ optimize large select expressions ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor optimize large select: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) 
Relative ------------------------------------------------------------------------------------------------------------------------ -100 columns 5 5 1 0.0 4710103.0 1.0X -1000 columns 31 32 1 0.0 30879997.0 0.2X -10000 columns 309 327 10 0.0 309351929.0 0.0X +100 columns 7 8 1 0.0 6678458.0 1.0X +1000 columns 53 55 2 0.0 53079798.0 0.1X +10000 columns 530 538 7 0.0 529951923.0 0.0X ================================================================================================ many column field read and write ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor many column field r/w: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -1 cols x 100000 rows (read in-mem) 16 22 3 6.1 163.3 1.0X -1 cols x 100000 rows (exec in-mem) 16 22 3 6.1 162.8 1.0X -1 cols x 100000 rows (read parquet) 29 36 6 3.4 290.4 0.6X -1 cols x 100000 rows (write parquet) 96 105 8 1.0 961.4 0.2X -100 cols x 1000 rows (read in-mem) 13 16 3 7.8 128.8 1.3X -100 cols x 1000 rows (exec in-mem) 16 19 3 6.3 159.4 1.0X -100 cols x 1000 rows (read parquet) 24 28 4 4.2 240.6 0.7X -100 cols x 1000 rows (write parquet) 93 98 5 1.1 931.7 0.2X -2500 cols x 40 rows (read in-mem) 57 61 4 1.8 566.2 0.3X -2500 cols x 40 rows (exec in-mem) 105 108 5 0.9 1054.4 0.2X -2500 cols x 40 rows (read parquet) 285 287 2 0.4 2852.5 0.1X -2500 cols x 40 rows (write parquet) 136 143 7 0.7 1358.0 0.1X +1 cols x 100000 rows (read in-mem) 18 23 4 5.6 177.9 1.0X +1 cols x 100000 rows (exec in-mem) 17 23 4 5.8 171.6 1.0X +1 cols x 100000 rows (read parquet) 30 36 6 3.3 302.9 0.6X +1 cols x 100000 rows (write parquet) 91 101 11 1.1 909.2 0.2X +100 cols x 1000 rows (read in-mem) 12 16 3 8.0 
124.3 1.4X +100 cols x 1000 rows (exec in-mem) 15 19 3 6.5 154.0 1.2X +100 cols x 1000 rows (read parquet) 25 29 4 4.0 248.6 0.7X +100 cols x 1000 rows (write parquet) 87 96 8 1.1 871.9 0.2X +2500 cols x 40 rows (read in-mem) 46 50 4 2.2 461.8 0.4X +2500 cols x 40 rows (exec in-mem) 77 81 4 1.3 766.0 0.2X +2500 cols x 40 rows (read parquet) 285 290 3 0.4 2849.8 0.1X +2500 cols x 40 rows (write parquet) 127 134 6 0.8 1265.8 0.1X ================================================================================================ wide shallowly nested struct field read and write ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor wide shallowly nested struct field r/w: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -1 wide x 100000 rows (read in-mem) 21 25 3 4.8 208.5 1.0X -1 wide x 100000 rows (exec in-mem) 23 26 4 4.4 228.5 0.9X -1 wide x 100000 rows (read parquet) 23 28 4 4.3 231.2 0.9X -1 wide x 100000 rows (write parquet) 100 109 6 1.0 1002.6 0.2X -100 wide x 1000 rows (read in-mem) 15 18 4 6.7 148.9 1.4X -100 wide x 1000 rows (exec in-mem) 21 25 4 4.7 214.8 1.0X -100 wide x 1000 rows (read parquet) 22 26 4 4.6 218.0 1.0X -100 wide x 1000 rows (write parquet) 98 102 5 1.0 975.5 0.2X -2500 wide x 40 rows (read in-mem) 23 27 3 4.4 227.3 0.9X -2500 wide x 40 rows (exec in-mem) 195 199 4 0.5 1951.3 0.1X -2500 wide x 40 rows (read parquet) 71 75 5 1.4 707.3 0.3X -2500 wide x 40 rows (write parquet) 107 110 4 0.9 1065.6 0.2X +1 wide x 100000 rows (read in-mem) 24 30 6 4.3 235.0 1.0X +1 wide x 100000 rows (exec in-mem) 25 29 4 4.0 252.3 0.9X +1 wide x 100000 rows (read parquet) 25 29 4 3.9 254.4 0.9X +1 wide x 100000 rows (write 
parquet) 101 110 7 1.0 1010.2 0.2X +100 wide x 1000 rows (read in-mem) 20 23 4 5.1 195.0 1.2X +100 wide x 1000 rows (exec in-mem) 19 22 3 5.2 192.1 1.2X +100 wide x 1000 rows (read parquet) 25 28 4 4.0 249.4 0.9X +100 wide x 1000 rows (write parquet) 96 102 7 1.0 957.9 0.2X +2500 wide x 40 rows (read in-mem) 24 27 4 4.2 240.1 1.0X +2500 wide x 40 rows (exec in-mem) 25 29 5 3.9 253.4 0.9X +2500 wide x 40 rows (read parquet) 73 77 4 1.4 727.0 0.3X +2500 wide x 40 rows (write parquet) 106 111 4 0.9 1055.1 0.2X ================================================================================================ deeply nested struct field read and write ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor deeply nested struct field r/w: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -1 deep x 100000 rows (read in-mem) 17 20 3 5.8 171.8 1.0X -1 deep x 100000 rows (exec in-mem) 18 21 3 5.5 183.1 0.9X -1 deep x 100000 rows (read parquet) 19 22 4 5.4 186.1 0.9X -1 deep x 100000 rows (write parquet) 96 102 6 1.0 962.2 0.2X -100 deep x 1000 rows (read in-mem) 31 33 3 3.2 314.7 0.5X -100 deep x 1000 rows (exec in-mem) 462 469 4 0.2 4622.7 0.0X -100 deep x 1000 rows (read parquet) 458 465 8 0.2 4576.2 0.0X -100 deep x 1000 rows (write parquet) 110 116 4 0.9 1100.9 0.2X -250 deep x 400 rows (read in-mem) 123 127 4 0.8 1230.2 0.1X -250 deep x 400 rows (exec in-mem) 2940 2943 4 0.0 29395.9 0.0X -250 deep x 400 rows (read parquet) 2723 2741 25 0.0 27229.1 0.0X -250 deep x 400 rows (write parquet) 206 219 11 0.5 2055.2 0.1X +1 deep x 100000 rows (read in-mem) 17 20 4 5.8 172.3 1.0X +1 deep x 100000 rows (exec in-mem) 20 23 4 5.1 195.7 0.9X +1 
deep x 100000 rows (read parquet) 21 24 4 4.7 211.9 0.8X +1 deep x 100000 rows (write parquet) 93 100 7 1.1 931.1 0.2X +100 deep x 1000 rows (read in-mem) 39 41 3 2.6 389.5 0.4X +100 deep x 1000 rows (exec in-mem) 430 434 6 0.2 4300.2 0.0X +100 deep x 1000 rows (read parquet) 439 450 6 0.2 4388.3 0.0X +100 deep x 1000 rows (write parquet) 114 118 4 0.9 1141.3 0.2X +250 deep x 400 rows (read in-mem) 155 160 5 0.6 1552.9 0.1X +250 deep x 400 rows (exec in-mem) 2583 2589 9 0.0 25828.0 0.0X +250 deep x 400 rows (read parquet) 2598 2615 24 0.0 25976.7 0.0X +250 deep x 400 rows (write parquet) 233 249 13 0.4 2334.4 0.1X ================================================================================================ bushy struct field read and write ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor bushy struct field r/w: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------- -1 x 1 deep x 100000 rows (read in-mem) 13 15 3 7.6 132.0 1.0X -1 x 1 deep x 100000 rows (exec in-mem) 15 18 4 6.8 147.3 0.9X -1 x 1 deep x 100000 rows (read parquet) 17 19 3 5.8 172.4 0.8X -1 x 1 deep x 100000 rows (write parquet) 97 100 4 1.0 965.0 0.1X -128 x 8 deep x 1000 rows (read in-mem) 15 17 3 6.9 145.3 0.9X -128 x 8 deep x 1000 rows (exec in-mem) 26 28 3 3.9 257.4 0.5X -128 x 8 deep x 1000 rows (read parquet) 22 24 3 4.5 221.1 0.6X -128 x 8 deep x 1000 rows (write parquet) 92 95 5 1.1 916.0 0.1X -1024 x 11 deep x 100 rows (read in-mem) 19 22 3 5.3 188.5 0.7X -1024 x 11 deep x 100 rows (exec in-mem) 126 128 2 0.8 1257.4 0.1X -1024 x 11 deep x 100 rows (read parquet) 37 39 3 2.7 368.9 0.4X -1024 x 11 deep x 100 rows (write parquet) 97 102 5 1.0 971.1 
0.1X +1 x 1 deep x 100000 rows (read in-mem) 15 18 3 6.5 153.9 1.0X +1 x 1 deep x 100000 rows (exec in-mem) 17 19 3 5.9 168.3 0.9X +1 x 1 deep x 100000 rows (read parquet) 20 23 4 5.0 200.6 0.8X +1 x 1 deep x 100000 rows (write parquet) 92 96 4 1.1 919.2 0.2X +128 x 8 deep x 1000 rows (read in-mem) 16 19 4 6.1 164.8 0.9X +128 x 8 deep x 1000 rows (exec in-mem) 16 19 4 6.2 161.9 1.0X +128 x 8 deep x 1000 rows (read parquet) 22 26 4 4.5 223.7 0.7X +128 x 8 deep x 1000 rows (write parquet) 90 95 7 1.1 900.9 0.2X +1024 x 11 deep x 100 rows (read in-mem) 19 21 3 5.4 186.8 0.8X +1024 x 11 deep x 100 rows (exec in-mem) 21 23 3 4.8 206.9 0.7X +1024 x 11 deep x 100 rows (read parquet) 37 40 4 2.7 373.4 0.4X +1024 x 11 deep x 100 rows (write parquet) 96 105 11 1.0 965.0 0.2X ================================================================================================ wide array field read and write ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor wide array field r/w: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -1 wide x 100000 rows (read in-mem) 15 17 3 6.8 147.6 1.0X -1 wide x 100000 rows (exec in-mem) 17 19 3 6.0 167.8 0.9X -1 wide x 100000 rows (read parquet) 17 20 3 5.9 170.6 0.9X -1 wide x 100000 rows (write parquet) 93 96 3 1.1 926.3 0.2X -100 wide x 1000 rows (read in-mem) 11 12 3 9.1 109.4 1.3X -100 wide x 1000 rows (exec in-mem) 12 14 3 8.0 125.0 1.2X -100 wide x 1000 rows (read parquet) 17 19 3 6.0 165.8 0.9X -100 wide x 1000 rows (write parquet) 89 94 4 1.1 885.3 0.2X -2500 wide x 40 rows (read in-mem) 11 12 3 9.4 106.5 1.4X -2500 wide x 40 rows (exec in-mem) 12 14 3 8.2 121.9 1.2X -2500 wide x 40 rows (read 
parquet) 16 18 3 6.2 162.4 0.9X -2500 wide x 40 rows (write parquet) 89 94 5 1.1 885.5 0.2X +1 wide x 100000 rows (read in-mem) 17 20 4 5.7 174.7 1.0X +1 wide x 100000 rows (exec in-mem) 19 23 4 5.1 194.6 0.9X +1 wide x 100000 rows (read parquet) 20 26 10 5.1 196.4 0.9X +1 wide x 100000 rows (write parquet) 92 98 5 1.1 920.3 0.2X +100 wide x 1000 rows (read in-mem) 12 15 4 8.1 123.2 1.4X +100 wide x 1000 rows (exec in-mem) 15 19 4 6.9 145.0 1.2X +100 wide x 1000 rows (read parquet) 19 23 4 5.3 190.3 0.9X +100 wide x 1000 rows (write parquet) 89 95 4 1.1 894.3 0.2X +2500 wide x 40 rows (read in-mem) 13 16 4 7.8 128.9 1.4X +2500 wide x 40 rows (exec in-mem) 15 17 3 6.7 149.5 1.2X +2500 wide x 40 rows (read parquet) 19 21 4 5.4 185.9 0.9X +2500 wide x 40 rows (write parquet) 88 93 7 1.1 877.3 0.2X ================================================================================================ wide map field read and write ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor wide map field r/w: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -1 wide x 100000 rows (read in-mem) 12 13 2 8.3 120.7 1.0X -1 wide x 100000 rows (exec in-mem) 15 16 2 6.8 147.6 0.8X -1 wide x 100000 rows (read parquet) 20 26 4 5.0 201.5 0.6X -1 wide x 100000 rows (write parquet) 89 93 4 1.1 894.7 0.1X -100 wide x 1000 rows (read in-mem) 7 8 1 13.5 74.1 1.6X -100 wide x 1000 rows (exec in-mem) 9 10 2 10.9 91.6 1.3X -100 wide x 1000 rows (read parquet) 18 20 3 5.6 177.1 0.7X -100 wide x 1000 rows (write parquet) 84 87 4 1.2 843.3 0.1X -2500 wide x 40 rows (read in-mem) 9 10 1 11.0 91.3 1.3X -2500 wide x 40 rows (exec in-mem) 11 12 2 9.2 108.4 1.1X -2500 
wide x 40 rows (read parquet) 18 20 3 5.6 180.1 0.7X -2500 wide x 40 rows (write parquet) 88 92 4 1.1 881.3 0.1X +1 wide x 100000 rows (read in-mem) 14 15 2 7.3 136.4 1.0X +1 wide x 100000 rows (exec in-mem) 17 18 2 6.1 165.2 0.8X +1 wide x 100000 rows (read parquet) 22 25 5 4.6 217.3 0.6X +1 wide x 100000 rows (write parquet) 87 91 6 1.2 866.7 0.2X +100 wide x 1000 rows (read in-mem) 8 10 3 12.5 80.3 1.7X +100 wide x 1000 rows (exec in-mem) 10 12 2 9.7 103.2 1.3X +100 wide x 1000 rows (read parquet) 21 24 4 4.9 205.3 0.7X +100 wide x 1000 rows (write parquet) 82 87 6 1.2 821.1 0.2X +2500 wide x 40 rows (read in-mem) 10 12 3 9.7 103.1 1.3X +2500 wide x 40 rows (exec in-mem) 12 14 3 8.2 121.4 1.1X +2500 wide x 40 rows (read parquet) 20 22 4 5.0 199.0 0.7X +2500 wide x 40 rows (write parquet) 84 89 7 1.2 842.7 0.2X diff --git a/sql/core/benchmarks/WideTableBenchmark-jdk21-results.txt b/sql/core/benchmarks/WideTableBenchmark-jdk21-results.txt index 04f1737afb586..62aea5f496f92 100644 --- a/sql/core/benchmarks/WideTableBenchmark-jdk21-results.txt +++ b/sql/core/benchmarks/WideTableBenchmark-jdk21-results.txt @@ -2,16 +2,16 @@ projection on wide table ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor projection on wide table: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -split threshold 10 2606 2701 94 0.4 2485.4 1.0X -split threshold 100 2174 2193 24 0.5 2073.8 1.2X -split threshold 1024 1652 1662 9 0.6 1575.6 1.6X -split threshold 2048 1618 1625 6 0.6 1543.3 1.6X -split threshold 4096 1713 1734 12 0.6 1633.6 1.5X -split threshold 8192 2321 2336 25 0.5 2213.4 1.1X -split threshold 65536 20726 20950 265 0.1 19765.7 0.1X 
+split threshold 10 2606 2655 71 0.4 2485.4 1.0X +split threshold 100 2142 2160 14 0.5 2043.1 1.2X +split threshold 1024 1632 1711 124 0.6 1556.0 1.6X +split threshold 2048 1608 1623 14 0.7 1533.1 1.6X +split threshold 4096 1725 1741 21 0.6 1644.7 1.5X +split threshold 8192 2456 2464 7 0.4 2342.5 1.1X +split threshold 65536 21150 21518 353 0.0 20170.3 0.1X diff --git a/sql/core/benchmarks/WideTableBenchmark-results.txt b/sql/core/benchmarks/WideTableBenchmark-results.txt index 1dda0fdd03fb9..e3f5c9bebeee6 100644 --- a/sql/core/benchmarks/WideTableBenchmark-results.txt +++ b/sql/core/benchmarks/WideTableBenchmark-results.txt @@ -2,16 +2,16 @@ projection on wide table ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor projection on wide table: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -split threshold 10 2543 2625 73 0.4 2425.1 1.0X -split threshold 100 2035 2074 30 0.5 1940.5 1.2X -split threshold 1024 1641 1658 12 0.6 1565.4 1.5X -split threshold 2048 1609 1625 12 0.7 1534.9 1.6X -split threshold 4096 1668 1681 15 0.6 1590.4 1.5X -split threshold 8192 2119 2153 50 0.5 2021.2 1.2X -split threshold 65536 21512 21816 366 0.0 20515.1 0.1X +split threshold 10 2549 2628 72 0.4 2431.4 1.0X +split threshold 100 2035 2068 32 0.5 1940.7 1.3X +split threshold 1024 1674 1703 27 0.6 1596.6 1.5X +split threshold 2048 1612 1618 6 0.7 1537.6 1.6X +split threshold 4096 1663 1686 17 0.6 1585.9 1.5X +split threshold 8192 2151 2162 13 0.5 2051.2 1.2X +split threshold 65536 21995 22268 233 0.0 20976.1 0.1X diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/ParquetColumnVector.java 
b/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/ParquetColumnVector.java index 7f5b69a09e90c..7fb8be7caf286 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/ParquetColumnVector.java +++ b/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/ParquetColumnVector.java @@ -34,6 +34,7 @@ import org.apache.spark.sql.types.MapType; import org.apache.spark.sql.types.StructType; import org.apache.spark.sql.types.VariantType; +import org.apache.spark.types.variant.VariantSchema; /** * Contains necessary information representing a Parquet column, either of primitive or nested type. @@ -43,6 +44,14 @@ final class ParquetColumnVector { private final List children; private final WritableColumnVector vector; + // Describes the file schema of the Parquet variant column. When it is not null, `children` + // contains only one child that reads the underlying file content. This `ParquetColumnVector` + // should assemble Spark variant values from the file content. + private VariantSchema variantSchema; + // Only meaningful if `variantSchema` is not null. See `SparkShreddingUtils.getFieldsToExtract` + // for its meaning. + private FieldToExtract[] fieldsToExtract; + /** * Repetition & Definition levels * These are allocated only for leaf columns; for non-leaf columns, they simply maintain @@ -101,7 +110,19 @@ final class ParquetColumnVector { } } - if (isPrimitive) { + if (column.variantFileType().isDefined()) { + ParquetColumn fileContentCol = column.variantFileType().get(); + WritableColumnVector fileContent = memoryMode == MemoryMode.OFF_HEAP + ? 
new OffHeapColumnVector(capacity, fileContentCol.sparkType()) + : new OnHeapColumnVector(capacity, fileContentCol.sparkType()); + ParquetColumnVector contentVector = new ParquetColumnVector(fileContentCol, + fileContent, capacity, memoryMode, missingColumns, false, null); + children.add(contentVector); + variantSchema = SparkShreddingUtils.buildVariantSchema(fileContentCol.sparkType()); + fieldsToExtract = SparkShreddingUtils.getFieldsToExtract(column.sparkType(), variantSchema); + repetitionLevels = contentVector.repetitionLevels; + definitionLevels = contentVector.definitionLevels; + } else if (isPrimitive) { if (column.repetitionLevel() > 0) { repetitionLevels = allocateLevelsVector(capacity, memoryMode); } @@ -167,6 +188,17 @@ private static void getLeavesHelper(ParquetColumnVector vector, List Encoder[T]): Dataset[T] = { + val dataset = new Dataset(sparkSession, logicalPlan, encoderGenerator) + // Eagerly bind the encoder so we verify that the encoder matches the underlying + // schema. The user will get an error if this is not the case. + // optimization: it is guaranteed that [[InternalRow]] can be converted to [[Row]] so + // do not do this check in that case. this check can be expensive since it requires running + // the whole [[Analyzer]] to resolve the deserializer + if (!dataset.queryExecution.isLazyAnalysis + && dataset.encoder.clsTag.runtimeClass != classOf[Row]) { + dataset.resolvedEnc + } + dataset + } + def ofRows(sparkSession: SparkSession, logicalPlan: LogicalPlan): DataFrame = sparkSession.withActive { val qe = sparkSession.sessionState.executePlan(logicalPlan) @@ -225,7 +242,7 @@ class Dataset[T] private[sql]( queryExecution.sparkSession } - import sparkSession.RichColumn + import sparkSession.toRichColumn // A globally unique id of this Dataset. 
private[sql] val id = Dataset.curId.getAndIncrement() @@ -241,8 +258,13 @@ class Dataset[T] private[sql]( this(queryExecution, () => encoder) } + def this( + sparkSession: SparkSession, logicalPlan: LogicalPlan, encoderGenerator: () => Encoder[T]) = { + this(sparkSession.sessionState.executePlan(logicalPlan), encoderGenerator) + } + def this(sparkSession: SparkSession, logicalPlan: LogicalPlan, encoder: Encoder[T]) = { - this(sparkSession.sessionState.executePlan(logicalPlan), encoder) + this(sparkSession, logicalPlan, () => encoder) } def this(sqlContext: SQLContext, logicalPlan: LogicalPlan, encoder: Encoder[T]) = { @@ -280,9 +302,9 @@ class Dataset[T] private[sql]( // The resolved `ExpressionEncoder` which can be used to turn rows to objects of type T, after // collecting rows to the driver side. - private lazy val resolvedEnc = { - exprEnc.resolveAndBind(logicalPlan.output, sparkSession.sessionState.analyzer) - } + private lazy val resolvedEnc = exprEnc.resolveAndBind( + queryExecution.commandExecuted.output, sparkSession.sessionState.analyzer) + private implicit def classTag: ClassTag[T] = encoder.clsTag @@ -508,16 +530,8 @@ class Dataset[T] private[sql]( /** @inheritdoc */ @scala.annotation.varargs - def toDF(colNames: String*): DataFrame = { - require(schema.size == colNames.size, - "The number of columns doesn't match.\n" + - s"Old column names (${schema.size}): " + schema.fields.map(_.name).mkString(", ") + "\n" + - s"New column names (${colNames.size}): " + colNames.mkString(", ")) - - val newCols = logicalPlan.output.zip(colNames).map { case (oldAttribute, newName) => - Column(oldAttribute).as(newName) - } - select(newCols : _*) + def toDF(colNames: String*): DataFrame = withPlan { + UnresolvedSubqueryColumnAliases(colNames, logicalPlan) } /** @inheritdoc */ @@ -709,6 +723,38 @@ class Dataset[T] private[sql]( new Dataset(sparkSession, joinWith, joinEncoder) } + private[sql] def lateralJoin( + right: DS[_], joinExprs: Option[Column], joinType: JoinType): 
DataFrame = { + withPlan { + LateralJoin( + logicalPlan, + LateralSubquery(right.logicalPlan), + joinType, + joinExprs.map(_.expr) + ) + } + } + + /** @inheritdoc */ + def lateralJoin(right: DS[_]): DataFrame = { + lateralJoin(right, None, Inner) + } + + /** @inheritdoc */ + def lateralJoin(right: DS[_], joinExprs: Column): DataFrame = { + lateralJoin(right, Some(joinExprs), Inner) + } + + /** @inheritdoc */ + def lateralJoin(right: DS[_], joinType: String): DataFrame = { + lateralJoin(right, None, LateralJoinType(joinType)) + } + + /** @inheritdoc */ + def lateralJoin(right: DS[_], joinExprs: Column, joinType: String): DataFrame = { + lateralJoin(right, Some(joinExprs), LateralJoinType(joinType)) + } + // TODO(SPARK-22947): Fix the DataFrame API. private[sql] def joinAsOf( other: Dataset[_], @@ -822,7 +868,7 @@ class Dataset[T] private[sql]( } /** @inheritdoc */ - def as(alias: String): Dataset[T] = withTypedPlan { + def as(alias: String): Dataset[T] = withSameTypedPlan { SubqueryAlias(alias, logicalPlan) } @@ -877,7 +923,7 @@ class Dataset[T] private[sql]( } /** @inheritdoc */ - def filter(condition: Column): Dataset[T] = withTypedPlan { + def filter(condition: Column): Dataset[T] = withSameTypedPlan { Filter(condition.expr, logicalPlan) } @@ -981,7 +1027,7 @@ class Dataset[T] private[sql]( /** @inheritdoc */ def transpose(indexColumn: Column): DataFrame = withPlan { UnresolvedTranspose( - Seq(indexColumn.named), + Seq(indexColumn.expr), logicalPlan ) } @@ -994,23 +1040,42 @@ class Dataset[T] private[sql]( ) } + /** + * Converts the DataFrame into a `TableArg` object, which can be used as a table argument + * in a user-defined table function (UDTF). + * + * After obtaining a `TableArg` from a DataFrame using this method, you can specify + * partitioning and ordering for the table argument by calling methods such as `partitionBy`, + * `orderBy`, and `withSinglePartition` on the `TableArg` instance. 
+ * - partitionBy(*cols): Partitions the data based on the specified columns. + * This method cannot be called after withSinglePartition() has been called. + * - orderBy(*cols): Orders the data within partitions based on the specified columns. + * - withSinglePartition(): Indicates that the data should be treated as a single partition. + * This method cannot be called after partitionBy() has been called. + * + * @group untypedrel + * @since 4.0.0 + */ + def asTable(): TableArg = { + new TableArg( + FunctionTableSubqueryArgumentExpression(plan = logicalPlan), + sparkSession + ) + } + /** @inheritdoc */ def scalar(): Column = { - Column(ExpressionColumnNode( - ScalarSubqueryExpr(SubExprUtils.removeLazyOuterReferences(logicalPlan), - hasExplicitOuterRefs = true))) + Column(ExpressionColumnNode(ScalarSubqueryExpr(logicalPlan))) } /** @inheritdoc */ def exists(): Column = { - Column(ExpressionColumnNode( - Exists(SubExprUtils.removeLazyOuterReferences(logicalPlan), - hasExplicitOuterRefs = true))) + Column(ExpressionColumnNode(Exists(logicalPlan))) } /** @inheritdoc */ @scala.annotation.varargs - def observe(name: String, expr: Column, exprs: Column*): Dataset[T] = withTypedPlan { + def observe(name: String, expr: Column, exprs: Column*): Dataset[T] = withSameTypedPlan { CollectMetrics(name, (expr +: exprs).map(_.named), logicalPlan, id) } @@ -1022,12 +1087,12 @@ class Dataset[T] private[sql]( } /** @inheritdoc */ - def limit(n: Int): Dataset[T] = withTypedPlan { + def limit(n: Int): Dataset[T] = withSameTypedPlan { Limit(Literal(n), logicalPlan) } /** @inheritdoc */ - def offset(n: Int): Dataset[T] = withTypedPlan { + def offset(n: Int): Dataset[T] = withSameTypedPlan { Offset(Literal(n), logicalPlan) } @@ -1114,7 +1179,7 @@ class Dataset[T] private[sql]( /** @inheritdoc */ def sample(withReplacement: Boolean, fraction: Double, seed: Long): Dataset[T] = { - withTypedPlan { + withSameTypedPlan { Sample(0.0, fraction, withReplacement, seed, logicalPlan) } } @@ -1210,29 
+1275,14 @@ class Dataset[T] private[sql]( require(colNames.size == cols.size, s"The size of column names: ${colNames.size} isn't equal to " + s"the size of columns: ${cols.size}") - SchemaUtils.checkColumnNameDuplication( - colNames, - sparkSession.sessionState.conf.caseSensitiveAnalysis) - - val resolver = sparkSession.sessionState.analyzer.resolver - val output = queryExecution.analyzed.output - - val columnSeq = colNames.zip(cols) - - val replacedAndExistingColumns = output.map { field => - columnSeq.find { case (colName, _) => - resolver(field.name, colName) - } match { - case Some((colName: String, col: Column)) => col.as(colName) - case _ => Column(field) - } + withPlan { + Project( + Seq( + UnresolvedStarWithColumns( + colNames = colNames, + exprs = cols.map(_.expr))), + logicalPlan) } - - val newColumns = columnSeq.filter { case (colName, col) => - !output.exists(f => resolver(f.name, colName)) - }.map { case (colName, col) => col.as(colName) } - - select(replacedAndExistingColumns ++ newColumns : _*) } /** @inheritdoc */ @@ -1259,26 +1309,13 @@ class Dataset[T] private[sql]( require(colNames.size == newColNames.size, s"The size of existing column names: ${colNames.size} isn't equal to " + s"the size of new column names: ${newColNames.size}") - - val resolver = sparkSession.sessionState.analyzer.resolver - val output: Seq[NamedExpression] = queryExecution.analyzed.output - var shouldRename = false - - val projectList = colNames.zip(newColNames).foldLeft(output) { - case (attrs, (existingName, newName)) => - attrs.map(attr => - if (resolver(attr.name, existingName)) { - shouldRename = true - Alias(attr, newName)() - } else { - attr - } - ) - } - if (shouldRename) { - withPlan(Project(projectList, logicalPlan)) - } else { - toDF() + withPlan { + Project( + Seq( + UnresolvedStarWithColumnsRenames( + existingNames = colNames, + newNames = newColNames)), + logicalPlan) } } @@ -1312,7 +1349,7 @@ class Dataset[T] private[sql]( def dropDuplicates(): Dataset[T] = 
dropDuplicates(this.columns) /** @inheritdoc */ - def dropDuplicates(colNames: Seq[String]): Dataset[T] = withTypedPlan { + def dropDuplicates(colNames: Seq[String]): Dataset[T] = withSameTypedPlan { val groupCols = groupColsFromDropDuplicates(colNames) Deduplicate(groupCols, logicalPlan) } @@ -1323,7 +1360,7 @@ class Dataset[T] private[sql]( } /** @inheritdoc */ - def dropDuplicatesWithinWatermark(colNames: Seq[String]): Dataset[T] = withTypedPlan { + def dropDuplicatesWithinWatermark(colNames: Seq[String]): Dataset[T] = withSameTypedPlan { val groupCols = groupColsFromDropDuplicates(colNames) // UnsupportedOperationChecker will fail the query if this is called with batch Dataset. DeduplicateWithinWatermark(groupCols, logicalPlan) @@ -1483,7 +1520,7 @@ class Dataset[T] private[sql]( } /** @inheritdoc */ - def repartition(numPartitions: Int): Dataset[T] = withTypedPlan { + def repartition(numPartitions: Int): Dataset[T] = withSameTypedPlan { Repartition(numPartitions, shuffle = true, logicalPlan) } @@ -1498,7 +1535,7 @@ class Dataset[T] private[sql]( s"""Invalid partitionExprs specified: $sortOrders |For range partitioning use repartitionByRange(...) instead. 
""".stripMargin) - withTypedPlan { + withSameTypedPlan { RepartitionByExpression(partitionExprs.map(_.expr), logicalPlan, numPartitions) } } @@ -1511,13 +1548,13 @@ class Dataset[T] private[sql]( case expr: SortOrder => expr case expr: Expression => SortOrder(expr, Ascending) }) - withTypedPlan { + withSameTypedPlan { RepartitionByExpression(sortOrder, logicalPlan, numPartitions) } } /** @inheritdoc */ - def coalesce(numPartitions: Int): Dataset[T] = withTypedPlan { + def coalesce(numPartitions: Int): Dataset[T] = withSameTypedPlan { Repartition(numPartitions, shuffle = false, logicalPlan) } @@ -1590,6 +1627,7 @@ class Dataset[T] private[sql]( name = TableIdentifier(identifier.last), userSpecifiedColumns = Nil, comment = None, + collation = None, properties = Map.empty, originalText = None, plan = logicalPlan, @@ -2211,7 +2249,7 @@ class Dataset[T] private[sql]( SortOrder(expr, Ascending) } } - withTypedPlan { + withSameTypedPlan { Sort(sortOrder, global = global, logicalPlan) } } @@ -2226,6 +2264,11 @@ class Dataset[T] private[sql]( Dataset(sparkSession, logicalPlan) } + /** A convenient function to wrap a logical plan and produce a Dataset. */ + @inline private def withSameTypedPlan(logicalPlan: LogicalPlan): Dataset[T] = { + Dataset(sparkSession, logicalPlan, encoderGenerator) + } + /** A convenient function to wrap a set based logical plan and produce a Dataset. 
*/ @inline private def withSetOperator[U : Encoder](logicalPlan: LogicalPlan): Dataset[U] = { if (isUnTyped) { diff --git a/sql/core/src/main/scala/org/apache/spark/sql/KeyValueGroupedDataset.scala b/sql/core/src/main/scala/org/apache/spark/sql/KeyValueGroupedDataset.scala index 392c3edab9895..6dcf01d3a9db2 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/KeyValueGroupedDataset.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/KeyValueGroupedDataset.scala @@ -18,7 +18,7 @@ package org.apache.spark.sql import org.apache.spark.api.java.function._ -import org.apache.spark.sql.catalyst.analysis.{EliminateEventTimeWatermark, UnresolvedAttribute} +import org.apache.spark.sql.catalyst.analysis.UnresolvedAttribute import org.apache.spark.sql.catalyst.encoders.AgnosticEncoders.{agnosticEncoderFor, ProductEncoder} import org.apache.spark.sql.catalyst.encoders.encoderFor import org.apache.spark.sql.catalyst.expressions.Attribute @@ -289,11 +289,11 @@ class KeyValueGroupedDataset[K, V] private[sql]( transformWithState ) - Dataset[U](sparkSession, EliminateEventTimeWatermark( + Dataset[U](sparkSession, UpdateEventTimeWatermarkColumn( UnresolvedAttribute(eventTimeColumnName), None, - transformWithStateDataset.logicalPlan))) + transformWithStateDataset.logicalPlan)) } /** @inheritdoc */ diff --git a/sql/core/src/main/scala/org/apache/spark/sql/RelationalGroupedDataset.scala b/sql/core/src/main/scala/org/apache/spark/sql/RelationalGroupedDataset.scala index 0974df55a6d84..b8c4b03fc13d2 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/RelationalGroupedDataset.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/RelationalGroupedDataset.scala @@ -21,7 +21,7 @@ import org.apache.spark.SparkRuntimeException import org.apache.spark.annotation.Stable import org.apache.spark.api.python.PythonEvalType import org.apache.spark.broadcast.Broadcast -import org.apache.spark.sql.catalyst.analysis.UnresolvedAlias +import 
org.apache.spark.sql.catalyst.analysis.{UnresolvedAlias, UnresolvedAttribute} import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.expressions.aggregate._ import org.apache.spark.sql.catalyst.plans.logical._ @@ -31,7 +31,7 @@ import org.apache.spark.sql.catalyst.util.toPrettySQL import org.apache.spark.sql.classic.ClassicConversions._ import org.apache.spark.sql.errors.{QueryCompilationErrors, QueryExecutionErrors} import org.apache.spark.sql.execution.QueryExecution -import org.apache.spark.sql.internal.ExpressionUtils.{column, generateAlias} +import org.apache.spark.sql.internal.ExpressionUtils.generateAlias import org.apache.spark.sql.internal.TypedAggUtils.withInputType import org.apache.spark.sql.streaming.OutputMode import org.apache.spark.sql.types.{NumericType, StructType} @@ -114,7 +114,7 @@ class RelationalGroupedDataset protected[sql]( namedExpr } } - columnExprs.map(column) + columnExprs.map(Column(_)) } /** @inheritdoc */ @@ -238,7 +238,7 @@ class RelationalGroupedDataset protected[sql]( broadcastVars: Array[Broadcast[Object]], outputSchema: StructType): DataFrame = { val groupingNamedExpressions = groupingExprs.map(alias) - val groupingCols = groupingNamedExpressions.map(column) + val groupingCols = groupingNamedExpressions.map(Column(_)) val groupingDataFrame = df.select(groupingCols : _*) val groupingAttributes = groupingNamedExpressions.map(_.toAttribute) Dataset.ofRows( @@ -475,7 +475,8 @@ class RelationalGroupedDataset protected[sql]( outputStructType: StructType, outputModeStr: String, timeModeStr: String, - initialState: RelationalGroupedDataset): DataFrame = { + initialState: RelationalGroupedDataset, + eventTimeColumnName: String): DataFrame = { def exprToAttr(expr: Seq[Expression]): Seq[Attribute] = { expr.map { case ne: NamedExpression => ne @@ -529,7 +530,30 @@ class RelationalGroupedDataset protected[sql]( initialStateSchema = initialState.df.schema ) } - Dataset.ofRows(df.sparkSession, plan) + if 
(eventTimeColumnName.isEmpty) { + Dataset.ofRows(df.sparkSession, plan) + } else { + updateEventTimeColumnAfterTransformWithState(plan, eventTimeColumnName) + } + } + + /** + * Creates a new dataset with updated eventTimeColumn after the transformWithState + * logical node. + */ + private def updateEventTimeColumnAfterTransformWithState( + transformWithStateInPandas: LogicalPlan, + eventTimeColumnName: String): DataFrame = { + val transformWithStateDataset = Dataset.ofRows( + df.sparkSession, + transformWithStateInPandas + ) + + Dataset.ofRows(df.sparkSession, + UpdateEventTimeWatermarkColumn( + UnresolvedAttribute(eventTimeColumnName), + None, + transformWithStateDataset.logicalPlan)) } override def toString: String = { diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala index 636899a7acb06..1318563f8c93b 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala @@ -17,21 +17,18 @@ package org.apache.spark.sql -import java.util.Properties +import java.util.{List => JList, Map => JMap, Properties} -import scala.collection.immutable import scala.reflect.runtime.universe.TypeTag import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.annotation.{DeveloperApi, Experimental, Stable, Unstable} import org.apache.spark.api.java.{JavaRDD, JavaSparkContext} -import org.apache.spark.internal.Logging import org.apache.spark.internal.config.ConfigEntry import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst._ -import org.apache.spark.sql.catalyst.analysis.{CurrentNamespace, UnresolvedNamespace} import org.apache.spark.sql.catalyst.expressions._ -import org.apache.spark.sql.catalyst.plans.logical.ShowTables +import org.apache.spark.sql.classic.ClassicConversions._ import org.apache.spark.sql.internal.{SessionState, SharedState, SQLConf} import 
org.apache.spark.sql.sources.BaseRelation import org.apache.spark.sql.streaming.{DataStreamReader, StreamingQueryManager} @@ -41,8 +38,8 @@ import org.apache.spark.sql.util.ExecutionListenerManager /** * The entry point for working with structured data (rows and columns) in Spark 1.x. * - * As of Spark 2.0, this is replaced by [[SparkSession]]. However, we are keeping the class - * here for backward compatibility. + * As of Spark 2.0, this is replaced by [[SparkSession]]. However, we are keeping the class here + * for backward compatibility. * * @groupname basic Basic Operations * @groupname ddl_ops Persistent Catalog DDL @@ -56,8 +53,8 @@ import org.apache.spark.sql.util.ExecutionListenerManager * @since 1.0.0 */ @Stable -class SQLContext private[sql](val sparkSession: SparkSession) - extends Logging with Serializable { +class SQLContext private[sql] (override val sparkSession: SparkSession) + extends api.SQLContext(sparkSession) { self => @@ -77,980 +74,325 @@ class SQLContext private[sql](val sparkSession: SparkSession) // TODO: move this logic into SparkSession private[sql] def sessionState: SessionState = sparkSession.sessionState + private[sql] def sharedState: SharedState = sparkSession.sharedState + @deprecated("Use SparkSession.sessionState.conf instead", "4.0.0") private[sql] def conf: SQLConf = sessionState.conf - def sparkContext: SparkContext = sparkSession.sparkContext - - /** - * Returns a [[SQLContext]] as new session, with separated SQL configurations, temporary - * tables, registered functions, but sharing the same `SparkContext`, cached data and - * other things. - * - * @since 1.6.0 - */ - def newSession(): SQLContext = sparkSession.newSession().sqlContext - - /** - * An interface to register custom [[org.apache.spark.sql.util.QueryExecutionListener]]s - * that listen for execution metrics. - */ + /** @inheritdoc */ def listenerManager: ExecutionListenerManager = sparkSession.listenerManager - /** - * Set Spark SQL configuration properties. 
- * - * @group config - * @since 1.0.0 - */ + /** @inheritdoc */ def setConf(props: Properties): Unit = { sessionState.conf.setConf(props) } - /** - * Set the given Spark SQL configuration property. - */ private[sql] def setConf[T](entry: ConfigEntry[T], value: T): Unit = { sessionState.conf.setConf(entry, value) } - /** - * Set the given Spark SQL configuration property. - * - * @group config - * @since 1.0.0 - */ - def setConf(key: String, value: String): Unit = { - sparkSession.conf.set(key, value) - } - - /** - * Return the value of Spark SQL configuration property for the given key. - * - * @group config - * @since 1.0.0 - */ - def getConf(key: String): String = { - sparkSession.conf.get(key) - } - - /** - * Return the value of Spark SQL configuration property for the given key. If the key is not set - * yet, return `defaultValue`. - * - * @group config - * @since 1.0.0 - */ - def getConf(key: String, defaultValue: String): String = { - sparkSession.conf.get(key, defaultValue) - } - - /** - * Return all the configuration properties that have been set (i.e. not the default). - * This creates a new copy of the config properties in the form of a Map. - * - * @group config - * @since 1.0.0 - */ - def getAllConfs: immutable.Map[String, String] = { - sparkSession.conf.getAll - } - - /** - * :: Experimental :: - * A collection of methods that are considered experimental, but can be used to hook into - * the query planner for advanced functionality. - * - * @group basic - * @since 1.3.0 - */ + /** @inheritdoc */ @Experimental @transient @Unstable def experimental: ExperimentalMethods = sparkSession.experimental - /** - * Returns a `DataFrame` with no rows or columns. - * - * @group basic - * @since 1.3.0 - */ - def emptyDataFrame: DataFrame = sparkSession.emptyDataFrame - - /** - * A collection of methods for registering user-defined functions (UDF). 
- * - * The following example registers a Scala closure as UDF: - * {{{ - * sqlContext.udf.register("myUDF", (arg1: Int, arg2: String) => arg2 + arg1) - * }}} - * - * The following example registers a UDF in Java: - * {{{ - * sqlContext.udf().register("myUDF", - * (Integer arg1, String arg2) -> arg2 + arg1, - * DataTypes.StringType); - * }}} - * - * @note The user-defined functions must be deterministic. Due to optimization, - * duplicate invocations may be eliminated or the function may even be invoked more times than - * it is present in the query. - * - * @group basic - * @since 1.3.0 - */ + /** @inheritdoc */ def udf: UDFRegistration = sparkSession.udf - /** - * Returns true if the table is currently cached in-memory. - * @group cachemgmt - * @since 1.3.0 - */ - def isCached(tableName: String): Boolean = { - sparkSession.catalog.isCached(tableName) - } - - /** - * Caches the specified table in-memory. - * @group cachemgmt - * @since 1.3.0 - */ - def cacheTable(tableName: String): Unit = { - sparkSession.catalog.cacheTable(tableName) - } - - /** - * Removes the specified table from the in-memory cache. - * @group cachemgmt - * @since 1.3.0 - */ - def uncacheTable(tableName: String): Unit = { - sparkSession.catalog.uncacheTable(tableName) - } - - /** - * Removes all cached tables from the in-memory cache. - * @since 1.3.0 - */ - def clearCache(): Unit = { - sparkSession.catalog.clearCache() - } - // scalastyle:off // Disable style checker so "implicits" object can start with lowercase i - /** - * (Scala-specific) Implicit methods available in Scala for converting - * common Scala objects into `DataFrame`s. - * - * {{{ - * val sqlContext = new SQLContext(sc) - * import sqlContext.implicits._ - * }}} - * - * @group basic - * @since 1.3.0 - */ + + /** @inheritdoc */ object implicits extends SQLImplicits { + /** @inheritdoc */ override protected def session: SparkSession = sparkSession } + // scalastyle:on /** - * Creates a DataFrame from an RDD of Product (e.g. 
case classes, tuples). - * - * @group dataframes - * @since 1.3.0 + * Creates a DataFrame from an RDD[Row]. User can specify whether the input rows should be + * converted to Catalyst rows. */ - def createDataFrame[A <: Product : TypeTag](rdd: RDD[A]): DataFrame = { - sparkSession.createDataFrame(rdd) + private[sql] def internalCreateDataFrame( + catalystRows: RDD[InternalRow], + schema: StructType, + isStreaming: Boolean = false): DataFrame = { + sparkSession.internalCreateDataFrame(catalystRows, schema, isStreaming) } - /** - * Creates a DataFrame from a local Seq of Product. - * - * @group dataframes - * @since 1.3.0 - */ - def createDataFrame[A <: Product : TypeTag](data: Seq[A]): DataFrame = { - sparkSession.createDataFrame(data) - } + /** @inheritdoc */ + def read: DataFrameReader = sparkSession.read - /** - * Convert a `BaseRelation` created for external data sources into a `DataFrame`. - * - * @group dataframes - * @since 1.3.0 - */ - def baseRelationToDataFrame(baseRelation: BaseRelation): DataFrame = { - sparkSession.baseRelationToDataFrame(baseRelation) - } + /** @inheritdoc */ + def readStream: DataStreamReader = sparkSession.readStream /** - * :: DeveloperApi :: - * Creates a `DataFrame` from an `RDD` containing [[Row]]s using the given schema. - * It is important to make sure that the structure of every [[Row]] of the provided RDD matches - * the provided schema. Otherwise, there will be runtime exception. 
- * Example: - * {{{ - * import org.apache.spark.sql._ - * import org.apache.spark.sql.types._ - * val sqlContext = new org.apache.spark.sql.SQLContext(sc) - * - * val schema = - * StructType( - * StructField("name", StringType, false) :: - * StructField("age", IntegerType, true) :: Nil) - * - * val people = - * sc.textFile("examples/src/main/resources/people.txt").map( - * _.split(",")).map(p => Row(p(0), p(1).trim.toInt)) - * val dataFrame = sqlContext.createDataFrame(people, schema) - * dataFrame.printSchema - * // root - * // |-- name: string (nullable = false) - * // |-- age: integer (nullable = true) - * - * dataFrame.createOrReplaceTempView("people") - * sqlContext.sql("select name from people").collect.foreach(println) - * }}} - * - * @group dataframes - * @since 1.3.0 + * Registers the given `DataFrame` as a temporary table in the catalog. Temporary tables exist + * only during the lifetime of this instance of SQLContext. */ - @DeveloperApi - def createDataFrame(rowRDD: RDD[Row], schema: StructType): DataFrame = { - sparkSession.createDataFrame(rowRDD, schema) + private[sql] def registerDataFrameAsTable(df: DataFrame, tableName: String): Unit = { + df.createOrReplaceTempView(tableName) } /** - * Creates a [[Dataset]] from a local Seq of data of a given type. This method requires an - * encoder (to convert a JVM object of type `T` to and from the internal Spark SQL representation) - * that is generally created automatically through implicits from a `SparkSession`, or can be - * created explicitly by calling static methods on [[Encoders]]. 
- * - * == Example == - * - * {{{ - * - * import spark.implicits._ - * case class Person(name: String, age: Long) - * val data = Seq(Person("Michael", 29), Person("Andy", 30), Person("Justin", 19)) - * val ds = spark.createDataset(data) - * - * ds.show() - * // +-------+---+ - * // | name|age| - * // +-------+---+ - * // |Michael| 29| - * // | Andy| 30| - * // | Justin| 19| - * // +-------+---+ - * }}} + * Returns a `StreamingQueryManager` that allows managing all the + * [[org.apache.spark.sql.streaming.StreamingQuery StreamingQueries]] active on `this` context. * * @since 2.0.0 - * @group dataset */ - def createDataset[T : Encoder](data: Seq[T]): Dataset[T] = { - sparkSession.createDataset(data) - } + def streams: StreamingQueryManager = sparkSession.streams - /** - * Creates a [[Dataset]] from an RDD of a given type. This method requires an - * encoder (to convert a JVM object of type `T` to and from the internal Spark SQL representation) - * that is generally created automatically through implicits from a `SparkSession`, or can be - * created explicitly by calling static methods on [[Encoders]]. - * - * @since 2.0.0 - * @group dataset - */ - def createDataset[T : Encoder](data: RDD[T]): Dataset[T] = { - sparkSession.createDataset(data) - } + /** @inheritdoc */ + override def sparkContext: SparkContext = super.sparkContext - /** - * Creates a [[Dataset]] from a `java.util.List` of a given type. This method requires an - * encoder (to convert a JVM object of type `T` to and from the internal Spark SQL representation) - * that is generally created automatically through implicits from a `SparkSession`, or can be - * created explicitly by calling static methods on [[Encoders]]. 
- * - * == Java Example == - * - * {{{ - * List data = Arrays.asList("hello", "world"); - * Dataset ds = spark.createDataset(data, Encoders.STRING()); - * }}} - * - * @since 2.0.0 - * @group dataset - */ - def createDataset[T : Encoder](data: java.util.List[T]): Dataset[T] = { - sparkSession.createDataset(data) - } + /** @inheritdoc */ + override def newSession(): SQLContext = sparkSession.newSession().sqlContext - /** - * Creates a DataFrame from an RDD[Row]. User can specify whether the input rows should be - * converted to Catalyst rows. - */ - private[sql] - def internalCreateDataFrame( - catalystRows: RDD[InternalRow], - schema: StructType, - isStreaming: Boolean = false) = { - sparkSession.internalCreateDataFrame(catalystRows, schema, isStreaming) - } + /** @inheritdoc */ + override def emptyDataFrame: Dataset[Row] = super.emptyDataFrame - /** - * :: DeveloperApi :: - * Creates a `DataFrame` from a `JavaRDD` containing [[Row]]s using the given schema. - * It is important to make sure that the structure of every [[Row]] of the provided RDD matches - * the provided schema. Otherwise, there will be runtime exception. - * - * @group dataframes - * @since 1.3.0 - */ - @DeveloperApi - def createDataFrame(rowRDD: JavaRDD[Row], schema: StructType): DataFrame = { - sparkSession.createDataFrame(rowRDD, schema) - } + /** @inheritdoc */ + override def createDataFrame[A <: Product: TypeTag](rdd: RDD[A]): Dataset[Row] = + super.createDataFrame(rdd) - /** - * :: DeveloperApi :: - * Creates a `DataFrame` from a `java.util.List` containing [[Row]]s using the given schema. - * It is important to make sure that the structure of every [[Row]] of the provided List matches - * the provided schema. Otherwise, there will be runtime exception. 
- * - * @group dataframes - * @since 1.6.0 - */ + /** @inheritdoc */ + override def createDataFrame[A <: Product: TypeTag](data: Seq[A]): Dataset[Row] = + super.createDataFrame(data) + + /** @inheritdoc */ + override def baseRelationToDataFrame(baseRelation: BaseRelation): Dataset[Row] = + super.baseRelationToDataFrame(baseRelation) + + /** @inheritdoc */ @DeveloperApi - def createDataFrame(rows: java.util.List[Row], schema: StructType): DataFrame = { - sparkSession.createDataFrame(rows, schema) - } + override def createDataFrame(rowRDD: RDD[Row], schema: StructType): Dataset[Row] = + super.createDataFrame(rowRDD, schema) - /** - * Applies a schema to an RDD of Java Beans. - * - * WARNING: Since there is no guaranteed ordering for fields in a Java Bean, - * SELECT * queries will return the columns in an undefined order. - * @group dataframes - * @since 1.3.0 - */ - def createDataFrame(rdd: RDD[_], beanClass: Class[_]): DataFrame = { - sparkSession.createDataFrame(rdd, beanClass) - } + /** @inheritdoc */ + override def createDataset[T: Encoder](data: Seq[T]): Dataset[T] = super.createDataset(data) - /** - * Applies a schema to an RDD of Java Beans. - * - * WARNING: Since there is no guaranteed ordering for fields in a Java Bean, - * SELECT * queries will return the columns in an undefined order. - * @group dataframes - * @since 1.3.0 - */ - def createDataFrame(rdd: JavaRDD[_], beanClass: Class[_]): DataFrame = { - sparkSession.createDataFrame(rdd, beanClass) - } + /** @inheritdoc */ + override def createDataset[T: Encoder](data: RDD[T]): Dataset[T] = super.createDataset(data) - /** - * Applies a schema to a List of Java Beans. - * - * WARNING: Since there is no guaranteed ordering for fields in a Java Bean, - * SELECT * queries will return the columns in an undefined order. 
- * @group dataframes - * @since 1.6.0 - */ - def createDataFrame(data: java.util.List[_], beanClass: Class[_]): DataFrame = { - sparkSession.createDataFrame(data, beanClass) - } + /** @inheritdoc */ + override def createDataset[T: Encoder](data: JList[T]): Dataset[T] = + super.createDataset(data) - /** - * Returns a [[DataFrameReader]] that can be used to read non-streaming data in as a - * `DataFrame`. - * {{{ - * sqlContext.read.parquet("/path/to/file.parquet") - * sqlContext.read.schema(schema).json("/path/to/file.json") - * }}} - * - * @group genericdata - * @since 1.4.0 - */ - def read: DataFrameReader = sparkSession.read + /** @inheritdoc */ + @DeveloperApi + override def createDataFrame(rowRDD: JavaRDD[Row], schema: StructType): Dataset[Row] = + super.createDataFrame(rowRDD, schema) + /** @inheritdoc */ + @DeveloperApi + override def createDataFrame(rows: JList[Row], schema: StructType): Dataset[Row] = + super.createDataFrame(rows, schema) - /** - * Returns a `DataStreamReader` that can be used to read streaming data in as a `DataFrame`. - * {{{ - * sparkSession.readStream.parquet("/path/to/directory/of/parquet/files") - * sparkSession.readStream.schema(schema).json("/path/to/directory/of/json/files") - * }}} - * - * @since 2.0.0 - */ - def readStream: DataStreamReader = sparkSession.readStream + /** @inheritdoc */ + override def createDataFrame(rdd: RDD[_], beanClass: Class[_]): Dataset[Row] = + super.createDataFrame(rdd, beanClass) + /** @inheritdoc */ + override def createDataFrame(rdd: JavaRDD[_], beanClass: Class[_]): Dataset[Row] = + super.createDataFrame(rdd, beanClass) - /** - * Creates an external table from the given path and returns the corresponding DataFrame. - * It will use the default data source configured by spark.sql.sources.default. 
- * - * @group ddl_ops - * @since 1.3.0 - */ - @deprecated("use sparkSession.catalog.createTable instead.", "2.2.0") - def createExternalTable(tableName: String, path: String): DataFrame = { - sparkSession.catalog.createTable(tableName, path) - } + /** @inheritdoc */ + override def createDataFrame(data: JList[_], beanClass: Class[_]): Dataset[Row] = + super.createDataFrame(data, beanClass) - /** - * Creates an external table from the given path based on a data source - * and returns the corresponding DataFrame. - * - * @group ddl_ops - * @since 1.3.0 - */ - @deprecated("use sparkSession.catalog.createTable instead.", "2.2.0") - def createExternalTable( + /** @inheritdoc */ + override def createExternalTable(tableName: String, path: String): Dataset[Row] = + super.createExternalTable(tableName, path) + + /** @inheritdoc */ + override def createExternalTable( tableName: String, path: String, - source: String): DataFrame = { - sparkSession.catalog.createTable(tableName, path, source) + source: String): Dataset[Row] = { + super.createExternalTable(tableName, path, source) } - /** - * Creates an external table from the given path based on a data source and a set of options. - * Then, returns the corresponding DataFrame. - * - * @group ddl_ops - * @since 1.3.0 - */ - @deprecated("use sparkSession.catalog.createTable instead.", "2.2.0") - def createExternalTable( + /** @inheritdoc */ + override def createExternalTable( tableName: String, source: String, - options: java.util.Map[String, String]): DataFrame = { - sparkSession.catalog.createTable(tableName, source, options) + options: JMap[String, String]): Dataset[Row] = { + super.createExternalTable(tableName, source, options) } - /** - * (Scala-specific) - * Creates an external table from the given path based on a data source and a set of options. - * Then, returns the corresponding DataFrame. 
- * - * @group ddl_ops - * @since 1.3.0 - */ - @deprecated("use sparkSession.catalog.createTable instead.", "2.2.0") - def createExternalTable( + /** @inheritdoc */ + override def createExternalTable( tableName: String, source: String, - options: Map[String, String]): DataFrame = { - sparkSession.catalog.createTable(tableName, source, options) + options: Map[String, String]): Dataset[Row] = { + super.createExternalTable(tableName, source, options) } - /** - * Create an external table from the given path based on a data source, a schema and - * a set of options. Then, returns the corresponding DataFrame. - * - * @group ddl_ops - * @since 1.3.0 - */ - @deprecated("use sparkSession.catalog.createTable instead.", "2.2.0") - def createExternalTable( + /** @inheritdoc */ + override def createExternalTable( tableName: String, source: String, schema: StructType, - options: java.util.Map[String, String]): DataFrame = { - sparkSession.catalog.createTable(tableName, source, schema, options) + options: JMap[String, String]): Dataset[Row] = { + super.createExternalTable(tableName, source, schema, options) } - /** - * (Scala-specific) - * Create an external table from the given path based on a data source, a schema and - * a set of options. Then, returns the corresponding DataFrame. - * - * @group ddl_ops - * @since 1.3.0 - */ - @deprecated("use sparkSession.catalog.createTable instead.", "2.2.0") - def createExternalTable( + /** @inheritdoc */ + override def createExternalTable( tableName: String, source: String, schema: StructType, - options: Map[String, String]): DataFrame = { - sparkSession.catalog.createTable(tableName, source, schema, options) - } - - /** - * Registers the given `DataFrame` as a temporary table in the catalog. Temporary tables exist - * only during the lifetime of this instance of SQLContext. 
- */ - private[sql] def registerDataFrameAsTable(df: DataFrame, tableName: String): Unit = { - df.createOrReplaceTempView(tableName) + options: Map[String, String]): Dataset[Row] = { + super.createExternalTable(tableName, source, schema, options) } - /** - * Drops the temporary table with the given table name in the catalog. If the table has been - * cached/persisted before, it's also unpersisted. - * - * @param tableName the name of the table to be unregistered. - * @group basic - * @since 1.3.0 - */ - def dropTempTable(tableName: String): Unit = { - sparkSession.catalog.dropTempView(tableName) - } + /** @inheritdoc */ + override def range(end: Long): Dataset[Row] = super.range(end) - /** - * Creates a `DataFrame` with a single `LongType` column named `id`, containing elements - * in a range from 0 to `end` (exclusive) with step value 1. - * - * @since 1.4.1 - * @group dataframe - */ - def range(end: Long): DataFrame = sparkSession.range(end).toDF() - - /** - * Creates a `DataFrame` with a single `LongType` column named `id`, containing elements - * in a range from `start` to `end` (exclusive) with step value 1. - * - * @since 1.4.0 - * @group dataframe - */ - def range(start: Long, end: Long): DataFrame = sparkSession.range(start, end).toDF() - - /** - * Creates a `DataFrame` with a single `LongType` column named `id`, containing elements - * in a range from `start` to `end` (exclusive) with a step value. - * - * @since 2.0.0 - * @group dataframe - */ - def range(start: Long, end: Long, step: Long): DataFrame = { - sparkSession.range(start, end, step).toDF() - } - - /** - * Creates a `DataFrame` with a single `LongType` column named `id`, containing elements - * in an range from `start` to `end` (exclusive) with an step value, with partition number - * specified. 
- * - * @since 1.4.0 - * @group dataframe - */ - def range(start: Long, end: Long, step: Long, numPartitions: Int): DataFrame = { - sparkSession.range(start, end, step, numPartitions).toDF() - } - - /** - * Executes a SQL query using Spark, returning the result as a `DataFrame`. - * This API eagerly runs DDL/DML commands, but not for SELECT queries. - * - * @group basic - * @since 1.3.0 - */ - def sql(sqlText: String): DataFrame = sparkSession.sql(sqlText) - - /** - * Returns the specified table as a `DataFrame`. - * - * @group ddl_ops - * @since 1.3.0 - */ - def table(tableName: String): DataFrame = { - sparkSession.table(tableName) - } + /** @inheritdoc */ + override def range(start: Long, end: Long): Dataset[Row] = super.range(start, end) - /** - * Returns a `DataFrame` containing names of existing tables in the current database. - * The returned DataFrame has three columns, database, tableName and isTemporary (a Boolean - * indicating if a table is a temporary one or not). - * - * @group ddl_ops - * @since 1.3.0 - */ - def tables(): DataFrame = { - Dataset.ofRows(sparkSession, ShowTables(CurrentNamespace, None)) - } + /** @inheritdoc */ + override def range(start: Long, end: Long, step: Long): Dataset[Row] = + super.range(start, end, step) - /** - * Returns a `DataFrame` containing names of existing tables in the given database. - * The returned DataFrame has three columns, database, tableName and isTemporary (a Boolean - * indicating if a table is a temporary one or not). 
- * - * @group ddl_ops - * @since 1.3.0 - */ - def tables(databaseName: String): DataFrame = { - Dataset.ofRows(sparkSession, ShowTables(UnresolvedNamespace(Seq(databaseName)), None)) - } + /** @inheritdoc */ + override def range(start: Long, end: Long, step: Long, numPartitions: Int): Dataset[Row] = + super.range(start, end, step, numPartitions) - /** - * Returns a `StreamingQueryManager` that allows managing all the - * [[org.apache.spark.sql.streaming.StreamingQuery StreamingQueries]] active on `this` context. - * - * @since 2.0.0 - */ - def streams: StreamingQueryManager = sparkSession.streams + /** @inheritdoc */ + override def sql(sqlText: String): Dataset[Row] = super.sql(sqlText) - /** - * Returns the names of tables in the current database as an array. - * - * @group ddl_ops - * @since 1.3.0 - */ - def tableNames(): Array[String] = { - tableNames(sparkSession.catalog.currentDatabase) - } + /** @inheritdoc */ + override def table(tableName: String): Dataset[Row] = super.table(tableName) - /** - * Returns the names of tables in the given database as an array. - * - * @group ddl_ops - * @since 1.3.0 - */ - def tableNames(databaseName: String): Array[String] = { - sessionState.catalog.listTables(databaseName).map(_.table).toArray - } + /** @inheritdoc */ + override def tables(): DataFrame = super.tables() - //////////////////////////////////////////////////////////////////////////// - //////////////////////////////////////////////////////////////////////////// - // Deprecated methods - //////////////////////////////////////////////////////////////////////////// - //////////////////////////////////////////////////////////////////////////// + /** @inheritdoc */ + override def tables(databaseName: String): DataFrame = super.tables(databaseName) - /** - * @deprecated As of 1.3.0, replaced by `createDataFrame()`. 
- */ - @deprecated("Use createDataFrame instead.", "1.3.0") - def applySchema(rowRDD: RDD[Row], schema: StructType): DataFrame = { - createDataFrame(rowRDD, schema) - } + /** @inheritdoc */ + override def applySchema(rowRDD: RDD[Row], schema: StructType): Dataset[Row] = + super.applySchema(rowRDD, schema) - /** - * @deprecated As of 1.3.0, replaced by `createDataFrame()`. - */ - @deprecated("Use createDataFrame instead.", "1.3.0") - def applySchema(rowRDD: JavaRDD[Row], schema: StructType): DataFrame = { - createDataFrame(rowRDD, schema) - } + /** @inheritdoc */ + override def applySchema(rowRDD: JavaRDD[Row], schema: StructType): Dataset[Row] = + super.applySchema(rowRDD, schema) - /** - * @deprecated As of 1.3.0, replaced by `createDataFrame()`. - */ - @deprecated("Use createDataFrame instead.", "1.3.0") - def applySchema(rdd: RDD[_], beanClass: Class[_]): DataFrame = { - createDataFrame(rdd, beanClass) - } + /** @inheritdoc */ + override def applySchema(rdd: RDD[_], beanClass: Class[_]): Dataset[Row] = + super.applySchema(rdd, beanClass) - /** - * @deprecated As of 1.3.0, replaced by `createDataFrame()`. - */ - @deprecated("Use createDataFrame instead.", "1.3.0") - def applySchema(rdd: JavaRDD[_], beanClass: Class[_]): DataFrame = { - createDataFrame(rdd, beanClass) - } + /** @inheritdoc */ + override def applySchema(rdd: JavaRDD[_], beanClass: Class[_]): Dataset[Row] = + super.applySchema(rdd, beanClass) - /** - * Loads a Parquet file, returning the result as a `DataFrame`. This function returns an empty - * `DataFrame` if no paths are passed in. - * - * @group specificdata - * @deprecated As of 1.4.0, replaced by `read().parquet()`. 
- */ - @deprecated("Use read.parquet() instead.", "1.4.0") + /** @inheritdoc */ @scala.annotation.varargs - def parquetFile(paths: String*): DataFrame = { - if (paths.isEmpty) { - emptyDataFrame - } else { - read.parquet(paths : _*) - } - } + override def parquetFile(paths: String*): Dataset[Row] = super.parquetFile(paths: _*) - /** - * Loads a JSON file (one object per line), returning the result as a `DataFrame`. - * It goes through the entire dataset once to determine the schema. - * - * @group specificdata - * @deprecated As of 1.4.0, replaced by `read().json()`. - */ - @deprecated("Use read.json() instead.", "1.4.0") - def jsonFile(path: String): DataFrame = { - read.json(path) - } + /** @inheritdoc */ + override def jsonFile(path: String): Dataset[Row] = super.jsonFile(path) - /** - * Loads a JSON file (one object per line) and applies the given schema, - * returning the result as a `DataFrame`. - * - * @group specificdata - * @deprecated As of 1.4.0, replaced by `read().json()`. - */ - @deprecated("Use read.json() instead.", "1.4.0") - def jsonFile(path: String, schema: StructType): DataFrame = { - read.schema(schema).json(path) - } + /** @inheritdoc */ + override def jsonFile(path: String, schema: StructType): Dataset[Row] = + super.jsonFile(path, schema) - /** - * @group specificdata - * @deprecated As of 1.4.0, replaced by `read().json()`. - */ - @deprecated("Use read.json() instead.", "1.4.0") - def jsonFile(path: String, samplingRatio: Double): DataFrame = { - read.option("samplingRatio", samplingRatio.toString).json(path) - } + /** @inheritdoc */ + override def jsonFile(path: String, samplingRatio: Double): Dataset[Row] = + super.jsonFile(path, samplingRatio) - /** - * Loads an RDD[String] storing JSON objects (one object per record), returning the result as a - * `DataFrame`. - * It goes through the entire dataset once to determine the schema. - * - * @group specificdata - * @deprecated As of 1.4.0, replaced by `read().json()`. 
- */ - @deprecated("Use read.json() instead.", "1.4.0") - def jsonRDD(json: RDD[String]): DataFrame = read.json(json) + /** @inheritdoc */ + override def jsonRDD(json: RDD[String]): Dataset[Row] = read.json(json) - /** - * Loads an RDD[String] storing JSON objects (one object per record), returning the result as a - * `DataFrame`. - * It goes through the entire dataset once to determine the schema. - * - * @group specificdata - * @deprecated As of 1.4.0, replaced by `read().json()`. - */ - @deprecated("Use read.json() instead.", "1.4.0") - def jsonRDD(json: JavaRDD[String]): DataFrame = read.json(json) + /** @inheritdoc */ + override def jsonRDD(json: JavaRDD[String]): Dataset[Row] = read.json(json) - /** - * Loads an RDD[String] storing JSON objects (one object per record) and applies the given schema, - * returning the result as a `DataFrame`. - * - * @group specificdata - * @deprecated As of 1.4.0, replaced by `read().json()`. - */ - @deprecated("Use read.json() instead.", "1.4.0") - def jsonRDD(json: RDD[String], schema: StructType): DataFrame = { - read.schema(schema).json(json) - } + /** @inheritdoc */ + override def jsonRDD(json: RDD[String], schema: StructType): Dataset[Row] = + super.jsonRDD(json, schema) - /** - * Loads an JavaRDD[String] storing JSON objects (one object per record) and applies the given - * schema, returning the result as a `DataFrame`. - * - * @group specificdata - * @deprecated As of 1.4.0, replaced by `read().json()`. - */ - @deprecated("Use read.json() instead.", "1.4.0") - def jsonRDD(json: JavaRDD[String], schema: StructType): DataFrame = { - read.schema(schema).json(json) - } + /** @inheritdoc */ + override def jsonRDD(json: JavaRDD[String], schema: StructType): Dataset[Row] = + super.jsonRDD(json, schema) - /** - * Loads an RDD[String] storing JSON objects (one object per record) inferring the - * schema, returning the result as a `DataFrame`. - * - * @group specificdata - * @deprecated As of 1.4.0, replaced by `read().json()`. 
- */ - @deprecated("Use read.json() instead.", "1.4.0") - def jsonRDD(json: RDD[String], samplingRatio: Double): DataFrame = { - read.option("samplingRatio", samplingRatio.toString).json(json) - } + /** @inheritdoc */ + override def jsonRDD(json: RDD[String], samplingRatio: Double): Dataset[Row] = + super.jsonRDD(json, samplingRatio) - /** - * Loads a JavaRDD[String] storing JSON objects (one object per record) inferring the - * schema, returning the result as a `DataFrame`. - * - * @group specificdata - * @deprecated As of 1.4.0, replaced by `read().json()`. - */ - @deprecated("Use read.json() instead.", "1.4.0") - def jsonRDD(json: JavaRDD[String], samplingRatio: Double): DataFrame = { - read.option("samplingRatio", samplingRatio.toString).json(json) - } + /** @inheritdoc */ + override def jsonRDD(json: JavaRDD[String], samplingRatio: Double): Dataset[Row] = + super.jsonRDD(json, samplingRatio) - /** - * Returns the dataset stored at path as a DataFrame, - * using the default data source configured by spark.sql.sources.default. - * - * @group genericdata - * @deprecated As of 1.4.0, replaced by `read().load(path)`. - */ - @deprecated("Use read.load(path) instead.", "1.4.0") - def load(path: String): DataFrame = { - read.load(path) - } + /** @inheritdoc */ + override def load(path: String): Dataset[Row] = super.load(path) - /** - * Returns the dataset stored at path as a DataFrame, using the given data source. - * - * @group genericdata - * @deprecated As of 1.4.0, replaced by `read().format(source).load(path)`. - */ - @deprecated("Use read.format(source).load(path) instead.", "1.4.0") - def load(path: String, source: String): DataFrame = { - read.format(source).load(path) - } + /** @inheritdoc */ + override def load(path: String, source: String): Dataset[Row] = super.load(path, source) - /** - * (Java-specific) Returns the dataset specified by the given data source and - * a set of options as a DataFrame. 
- * - * @group genericdata - * @deprecated As of 1.4.0, replaced by `read().format(source).options(options).load()`. - */ - @deprecated("Use read.format(source).options(options).load() instead.", "1.4.0") - def load(source: String, options: java.util.Map[String, String]): DataFrame = { - read.options(options).format(source).load() - } + /** @inheritdoc */ + override def load(source: String, options: JMap[String, String]): Dataset[Row] = + super.load(source, options) - /** - * (Scala-specific) Returns the dataset specified by the given data source and - * a set of options as a DataFrame. - * - * @group genericdata - * @deprecated As of 1.4.0, replaced by `read().format(source).options(options).load()`. - */ - @deprecated("Use read.format(source).options(options).load() instead.", "1.4.0") - def load(source: String, options: Map[String, String]): DataFrame = { - read.options(options).format(source).load() - } + /** @inheritdoc */ + override def load(source: String, options: Map[String, String]): Dataset[Row] = + super.load(source, options) - /** - * (Java-specific) Returns the dataset specified by the given data source and - * a set of options as a DataFrame, using the given schema as the schema of the DataFrame. - * - * @group genericdata - * @deprecated As of 1.4.0, replaced by - * `read().format(source).schema(schema).options(options).load()`. - */ - @deprecated("Use read.format(source).schema(schema).options(options).load() instead.", "1.4.0") - def load( + /** @inheritdoc */ + override def load( source: String, schema: StructType, - options: java.util.Map[String, String]): DataFrame = { - read.format(source).schema(schema).options(options).load() + options: JMap[String, String]): Dataset[Row] = { + super.load(source, schema, options) } - /** - * (Scala-specific) Returns the dataset specified by the given data source and - * a set of options as a DataFrame, using the given schema as the schema of the DataFrame. 
- * - * @group genericdata - * @deprecated As of 1.4.0, replaced by - * `read().format(source).schema(schema).options(options).load()`. - */ - @deprecated("Use read.format(source).schema(schema).options(options).load() instead.", "1.4.0") - def load(source: String, schema: StructType, options: Map[String, String]): DataFrame = { - read.format(source).schema(schema).options(options).load() + /** @inheritdoc */ + override def load( + source: String, + schema: StructType, + options: Map[String, String]): Dataset[Row] = { + super.load(source, schema, options) } - /** - * Construct a `DataFrame` representing the database table accessible via JDBC URL - * url named table. - * - * @group specificdata - * @deprecated As of 1.4.0, replaced by `read().jdbc()`. - */ - @deprecated("Use read.jdbc() instead.", "1.4.0") - def jdbc(url: String, table: String): DataFrame = { - read.jdbc(url, table, new Properties) - } + /** @inheritdoc */ + override def jdbc(url: String, table: String): Dataset[Row] = super.jdbc(url, table) - /** - * Construct a `DataFrame` representing the database table accessible via JDBC URL - * url named table. Partitions of the table will be retrieved in parallel based on the parameters - * passed to this function. - * - * @param columnName the name of a column of integral type that will be used for partitioning. - * @param lowerBound the minimum value of `columnName` used to decide partition stride - * @param upperBound the maximum value of `columnName` used to decide partition stride - * @param numPartitions the number of partitions. the range `minValue`-`maxValue` will be split - * evenly into this many partitions - * @group specificdata - * @deprecated As of 1.4.0, replaced by `read().jdbc()`. 
- */ - @deprecated("Use read.jdbc() instead.", "1.4.0") - def jdbc( + /** @inheritdoc */ + override def jdbc( url: String, table: String, columnName: String, lowerBound: Long, upperBound: Long, - numPartitions: Int): DataFrame = { - read.jdbc(url, table, columnName, lowerBound, upperBound, numPartitions, new Properties) + numPartitions: Int): Dataset[Row] = { + super.jdbc(url, table, columnName, lowerBound, upperBound, numPartitions) } - /** - * Construct a `DataFrame` representing the database table accessible via JDBC URL - * url named table. The theParts parameter gives a list expressions - * suitable for inclusion in WHERE clauses; each one defines one partition - * of the `DataFrame`. - * - * @group specificdata - * @deprecated As of 1.4.0, replaced by `read().jdbc()`. - */ - @deprecated("Use read.jdbc() instead.", "1.4.0") - def jdbc(url: String, table: String, theParts: Array[String]): DataFrame = { - read.jdbc(url, table, theParts, new Properties) - } + /** @inheritdoc */ + override def jdbc(url: String, table: String, theParts: Array[String]): Dataset[Row] = + super.jdbc(url, table, theParts) } -/** - * This SQLContext object contains utility functions to create a singleton SQLContext instance, - * or to get the created SQLContext instance. - * - * It also provides utility functions to support preference for threads in multiple sessions - * scenario, setActive could set a SQLContext for current thread, which will be returned by - * getOrCreate instead of the global one. - */ -object SQLContext { +object SQLContext extends api.SQLContextCompanion { - /** - * Get the singleton SQLContext if it exists or create a new one using the given SparkContext. - * - * This function can be used to create a singleton SQLContext object that can be shared across - * the JVM. - * - * If there is an active SQLContext for current thread, it will be returned instead of the global - * one. 
- * - * @since 1.5.0 - */ - @deprecated("Use SparkSession.builder instead", "2.0.0") + override private[sql] type SQLContextImpl = SQLContext + override private[sql] type SparkContextImpl = SparkContext + + /** @inheritdoc */ def getOrCreate(sparkContext: SparkContext): SQLContext = { SparkSession.builder().sparkContext(sparkContext).getOrCreate().sqlContext } - /** - * Changes the SQLContext that will be returned in this thread and its children when - * SQLContext.getOrCreate() is called. This can be used to ensure that a given thread receives - * a SQLContext with an isolated session, instead of the global (first created) context. - * - * @since 1.6.0 - */ - @deprecated("Use SparkSession.setActiveSession instead", "2.0.0") - def setActive(sqlContext: SQLContext): Unit = { - SparkSession.setActiveSession(sqlContext.sparkSession) - } - - /** - * Clears the active SQLContext for current thread. Subsequent calls to getOrCreate will - * return the first created context instead of a thread-local override. - * - * @since 1.6.0 - */ - @deprecated("Use SparkSession.clearActiveSession instead", "2.0.0") - def clearActive(): Unit = { - SparkSession.clearActiveSession() - } + /** @inheritdoc */ + override def setActive(sqlContext: SQLContext): Unit = super.setActive(sqlContext) /** - * Converts an iterator of Java Beans to InternalRow using the provided - * bean info & schema. This is not related to the singleton, but is a static - * method for internal use. + * Converts an iterator of Java Beans to InternalRow using the provided bean info & schema. This + * is not related to the singleton, but is a static method for internal use. 
*/ private[sql] def beansToRows( data: Iterator[_], @@ -1058,7 +400,9 @@ object SQLContext { attrs: Seq[AttributeReference]): Iterator[InternalRow] = { def createStructConverter(cls: Class[_], fieldTypes: Seq[DataType]): Any => InternalRow = { val methodConverters = - JavaTypeInference.getJavaBeanReadableProperties(cls).zip(fieldTypes) + JavaTypeInference + .getJavaBeanReadableProperties(cls) + .zip(fieldTypes) .map { case (property, fieldType) => val method = property.getReadMethod method -> createConverter(method.getReturnType, fieldType) @@ -1067,16 +411,17 @@ object SQLContext { if (value == null) { null } else { - new GenericInternalRow( - methodConverters.map { case (method, converter) => - converter(method.invoke(value)) - }) + new GenericInternalRow(methodConverters.map { case (method, converter) => + converter(method.invoke(value)) + }) } } + def createConverter(cls: Class[_], dataType: DataType): Any => Any = dataType match { case struct: StructType => createStructConverter(cls, struct.map(_.dataType)) case _ => CatalystTypeConverters.createToCatalystConverter(dataType) } + val dataConverter = createStructConverter(beanClass, attrs.map(_.dataType)) data.map(dataConverter) } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala b/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala index a7f85db12b214..3b36f6b59cb38 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala @@ -42,19 +42,21 @@ import org.apache.spark.sql.catalog.Catalog import org.apache.spark.sql.catalyst._ import org.apache.spark.sql.catalyst.analysis.{NameParameterizedQuery, PosParameterizedQuery, UnresolvedRelation} import org.apache.spark.sql.catalyst.encoders._ -import org.apache.spark.sql.catalyst.expressions.{AttributeReference, Expression, NamedExpression} +import org.apache.spark.sql.catalyst.expressions.{AttributeReference, Expression} import 
org.apache.spark.sql.catalyst.parser.ParserInterface -import org.apache.spark.sql.catalyst.plans.logical.{LocalRelation, Range} +import org.apache.spark.sql.catalyst.plans.logical.{CompoundBody, LocalRelation, LogicalPlan, Range} +import org.apache.spark.sql.catalyst.types.DataTypeUtils import org.apache.spark.sql.catalyst.types.DataTypeUtils.toAttributes import org.apache.spark.sql.catalyst.util.CharVarcharUtils import org.apache.spark.sql.connector.ExternalCommandRunner -import org.apache.spark.sql.errors.QueryCompilationErrors +import org.apache.spark.sql.errors.{QueryCompilationErrors, SqlScriptingErrors} import org.apache.spark.sql.execution._ import org.apache.spark.sql.execution.command.ExternalCommandExecutor import org.apache.spark.sql.execution.datasources.{DataSource, LogicalRelation} import org.apache.spark.sql.functions.lit import org.apache.spark.sql.internal._ import org.apache.spark.sql.internal.StaticSQLConf.CATALOG_IMPLEMENTATION +import org.apache.spark.sql.scripting.SqlScriptingExecution import org.apache.spark.sql.sources.BaseRelation import org.apache.spark.sql.streaming._ import org.apache.spark.sql.types.{DataType, StructType} @@ -96,7 +98,7 @@ class SparkSession private( @transient private[sql] val extensions: SparkSessionExtensions, @transient private[sql] val initialSessionOptions: Map[String, String], @transient private val parentManagedJobTags: Map[String, String]) - extends api.SparkSession with Logging { self => + extends api.SparkSession with Logging with classic.ColumnConversions { self => // The call site where this SparkSession was constructed. private val creationSite: CallSite = Utils.getCallSite() @@ -431,6 +433,43 @@ class SparkSession private( | Everything else | * ----------------- */ + /** + * Executes given script and return the result of the last statement. + * If script contains no queries, an empty `DataFrame` is returned. + * + * @param script A SQL script to execute. 
+ * @param args A map of parameter names to SQL literal expressions. + * + * @return The result as a `DataFrame`. + */ + private def executeSqlScript( + script: CompoundBody, + args: Map[String, Expression] = Map.empty): DataFrame = { + val sse = new SqlScriptingExecution(script, this, args) + var result: Option[Seq[Row]] = None + + // We must execute returned df before calling sse.getNextResult again because sse.hasNext + // advances the script execution and executes all statements until the next result. We must + // collect results immediately to maintain execution order. + // This ensures we respect the contract of SqlScriptingExecution API. + var df: Option[DataFrame] = sse.getNextResult + while (df.isDefined) { + sse.withErrorHandling { + // Collect results from the current DataFrame. + result = Some(df.get.collect().toSeq) + } + df = sse.getNextResult + } + + if (result.isEmpty) { + emptyDataFrame + } else { + val attributes = DataTypeUtils.toAttributes(result.get.head.schema) + Dataset.ofRows( + self, LocalRelation.fromExternalRows(attributes, result.get)) + } + } + /** * Executes a SQL query substituting positional parameters by the given arguments, * returning the result as a `DataFrame`. @@ -450,17 +489,33 @@ class SparkSession private( withActive { val plan = tracker.measurePhase(QueryPlanningTracker.PARSING) { val parsedPlan = sessionState.sqlParser.parsePlan(sqlText) - if (args.nonEmpty) { - PosParameterizedQuery(parsedPlan, args.map(lit(_).expr).toImmutableArraySeq) - } else { - parsedPlan + parsedPlan match { + case compoundBody: CompoundBody => + if (args.nonEmpty) { + // Positional parameters are not supported for SQL scripting. 
+ throw SqlScriptingErrors.positionalParametersAreNotSupportedWithSqlScripting() + } + compoundBody + case logicalPlan: LogicalPlan => + if (args.nonEmpty) { + PosParameterizedQuery(logicalPlan, args.map(lit(_).expr).toImmutableArraySeq) + } else { + logicalPlan + } } } - Dataset.ofRows(self, plan, tracker) + + plan match { + case compoundBody: CompoundBody => + // Execute the SQL script. + executeSqlScript(compoundBody) + case logicalPlan: LogicalPlan => + // Execute the standalone SQL statement. + Dataset.ofRows(self, plan, tracker) + } } /** @inheritdoc */ - @Experimental def sql(sqlText: String, args: Array[_]): DataFrame = { sql(sqlText, args, new QueryPlanningTracker) } @@ -488,23 +543,34 @@ class SparkSession private( withActive { val plan = tracker.measurePhase(QueryPlanningTracker.PARSING) { val parsedPlan = sessionState.sqlParser.parsePlan(sqlText) - if (args.nonEmpty) { - NameParameterizedQuery(parsedPlan, args.transform((_, v) => lit(v).expr)) - } else { - parsedPlan + parsedPlan match { + case compoundBody: CompoundBody => + compoundBody + case logicalPlan: LogicalPlan => + if (args.nonEmpty) { + NameParameterizedQuery(logicalPlan, args.transform((_, v) => lit(v).expr)) + } else { + logicalPlan + } } } - Dataset.ofRows(self, plan, tracker) + + plan match { + case compoundBody: CompoundBody => + // Execute the SQL script. + executeSqlScript(compoundBody, args.transform((_, v) => lit(v).expr)) + case logicalPlan: LogicalPlan => + // Execute the standalone SQL statement. 
+ Dataset.ofRows(self, plan, tracker) + } } /** @inheritdoc */ - @Experimental def sql(sqlText: String, args: Map[String, Any]): DataFrame = { sql(sqlText, args, new QueryPlanningTracker) } /** @inheritdoc */ - @Experimental override def sql(sqlText: String, args: java.util.Map[String, Any]): DataFrame = { sql(sqlText, args.asScala.toMap) } @@ -732,23 +798,11 @@ class SparkSession private( .getOrElse(sparkContext.defaultParallelism) } - private[sql] object Converter extends ColumnNodeToExpressionConverter with Serializable { - override protected def parser: ParserInterface = sessionState.sqlParser - override protected def conf: SQLConf = sessionState.conf - } - - private[sql] def expression(e: Column): Expression = Converter(e.node) - - private[sql] implicit class RichColumn(val column: Column) { - /** - * Returns the expression for this column. - */ - def expr: Expression = Converter(column.node) - /** - * Returns the expression for this column either with an existing or auto assigned name. - */ - def named: NamedExpression = ExpressionUtils.toNamed(expr) - } + override protected[sql] val converter: ColumnNodeToExpressionConverter = + new ColumnNodeToExpressionConverter with Serializable { + override protected def parser: ParserInterface = sessionState.sqlParser + override protected def conf: SQLConf = sessionState.conf + } private[sql] lazy val observationManager = new ObservationManager(this) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/TableArg.scala b/sql/core/src/main/scala/org/apache/spark/sql/TableArg.scala new file mode 100644 index 0000000000000..133775c0b666c --- /dev/null +++ b/sql/core/src/main/scala/org/apache/spark/sql/TableArg.scala @@ -0,0 +1,74 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql + +import org.apache.spark.sql.catalyst.expressions.{Ascending, Expression, FunctionTableSubqueryArgumentExpression, SortOrder} + +class TableArg( + private[sql] val expression: FunctionTableSubqueryArgumentExpression, + sparkSession: SparkSession) + extends TableValuedFunctionArgument { + import sparkSession.toRichColumn + + private def isPartitioned: Boolean = + expression.partitionByExpressions.nonEmpty || expression.withSinglePartition + + @scala.annotation.varargs + def partitionBy(cols: Column*): TableArg = { + if (isPartitioned) { + throw new IllegalArgumentException( + "Cannot call partitionBy() after partitionBy() or withSinglePartition() has been called." + ) + } + val partitionByExpressions = cols.map(_.expr) + new TableArg( + expression.copy( + partitionByExpressions = partitionByExpressions), + sparkSession) + } + + @scala.annotation.varargs + def orderBy(cols: Column*): TableArg = { + if (!isPartitioned) { + throw new IllegalArgumentException( + "Please call partitionBy() or withSinglePartition() before orderBy()." 
+ ) + } + val orderByExpressions = cols.map { col => + col.expr match { + case sortOrder: SortOrder => sortOrder + case expr: Expression => SortOrder(expr, Ascending) + } + } + new TableArg( + expression.copy(orderByExpressions = orderByExpressions), + sparkSession) + } + + def withSinglePartition(): TableArg = { + if (isPartitioned) { + throw new IllegalArgumentException( + "Cannot call withSinglePartition() after partitionBy() or " + + "withSinglePartition() has been called." + ) + } + new TableArg( + expression.copy(withSinglePartition = true), + sparkSession) + } +} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/api/python/PythonSQLUtils.scala b/sql/core/src/main/scala/org/apache/spark/sql/api/python/PythonSQLUtils.scala index a66a6e54a7c8a..49fe494903cdc 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/api/python/PythonSQLUtils.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/api/python/PythonSQLUtils.scala @@ -33,10 +33,11 @@ import org.apache.spark.sql.catalyst.analysis.{FunctionRegistry, TableFunctionRe import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.parser.CatalystSqlParser +import org.apache.spark.sql.classic.ClassicConversions._ import org.apache.spark.sql.execution.{ExplainMode, QueryExecution} import org.apache.spark.sql.execution.arrow.ArrowConverters import org.apache.spark.sql.execution.python.EvaluatePython -import org.apache.spark.sql.internal.ExpressionUtils.{column, expression} +import org.apache.spark.sql.internal.ExpressionUtils.expression import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types.{DataType, StructType} import org.apache.spark.util.{MutableURLClassLoader, Utils} @@ -143,6 +144,33 @@ private[sql] object PythonSQLUtils extends Logging { } } + def jsonToDDL(json: String): String = { + DataType.fromJson(json).asInstanceOf[StructType].toDDL + } + + def ddlToJson(ddl: String): String 
= { + val dataType = try { + // DDL format, "fieldname datatype, fieldname datatype". + StructType.fromDDL(ddl) + } catch { + case e: Throwable => + try { + // For backwards compatibility, "integer", "struct" and etc. + parseDataType(ddl) + } catch { + case _: Throwable => + try { + // For backwards compatibility, "fieldname: datatype, fieldname: datatype" case. + parseDataType(s"struct<${ddl.trim}>") + } catch { + case _: Throwable => + throw e + } + } + } + dataType.json + } + def unresolvedNamedLambdaVariable(name: String): Column = Column(internal.UnresolvedNamedLambdaVariable.apply(name)) @@ -152,7 +180,8 @@ private[sql] object PythonSQLUtils extends Logging { Column(internal.LambdaFunction(function.node, arguments)) } - def namedArgumentExpression(name: String, e: Column): Column = NamedArgumentExpression(name, e) + def namedArgumentExpression(name: String, e: Column): Column = + Column(NamedArgumentExpression(name, expression(e))) @scala.annotation.varargs def fn(name: String, arguments: Column*): Column = Column.fn(name, arguments: _*) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/artifact/ArtifactManager.scala b/sql/core/src/main/scala/org/apache/spark/sql/artifact/ArtifactManager.scala index d362c5bef878e..6394cef9fc760 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/artifact/ArtifactManager.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/artifact/ArtifactManager.scala @@ -17,7 +17,8 @@ package org.apache.spark.sql.artifact -import java.io.File +import java.io.{File, IOException} +import java.lang.ref.Cleaner import java.net.{URI, URL, URLClassLoader} import java.nio.ByteBuffer import java.nio.file.{CopyOption, Files, Path, Paths, StandardCopyOption} @@ -30,8 +31,8 @@ import scala.reflect.ClassTag import org.apache.commons.io.{FilenameUtils, FileUtils} import org.apache.hadoop.fs.{LocalFileSystem, Path => FSPath} -import org.apache.spark.{JobArtifactSet, JobArtifactState, SparkEnv, SparkException, 
SparkUnsupportedOperationException} -import org.apache.spark.internal.Logging +import org.apache.spark.{JobArtifactSet, JobArtifactState, SparkContext, SparkEnv, SparkException, SparkUnsupportedOperationException} +import org.apache.spark.internal.{Logging, LogKeys, MDC} import org.apache.spark.internal.config.{CONNECT_SCALA_UDF_STUB_PREFIXES, EXECUTOR_USER_CLASS_PATH_FIRST} import org.apache.spark.sql.{Artifact, SparkSession} import org.apache.spark.sql.internal.SQLConf @@ -51,7 +52,7 @@ import org.apache.spark.util.{ChildFirstURLClassLoader, StubClassLoader, Utils} * * @param session The object used to hold the Spark Connect session state. */ -class ArtifactManager(session: SparkSession) extends Logging { +class ArtifactManager(session: SparkSession) extends AutoCloseable with Logging { import ArtifactManager._ // The base directory where all artifacts are stored. @@ -66,12 +67,11 @@ class ArtifactManager(session: SparkSession) extends Logging { // The base directory/URI where all artifacts are stored for this `sessionUUID`. protected[artifact] val (artifactPath, artifactURI): (Path, String) = (ArtifactUtils.concatenatePaths(artifactRootPath, session.sessionUUID), - s"$artifactRootURI${File.separator}${session.sessionUUID}") + s"$artifactRootURI/${session.sessionUUID}") // The base directory/URI where all class file artifacts are stored for this `sessionUUID`. 
protected[artifact] val (classDir, replClassURI): (Path, String) = - (ArtifactUtils.concatenatePaths(artifactPath, "classes"), - s"$artifactURI${File.separator}classes${File.separator}") + (ArtifactUtils.concatenatePaths(artifactPath, "classes"), s"$artifactURI/classes/") private lazy val alwaysApplyClassLoader = session.conf.get(SQLConf.ARTIFACTS_SESSION_ISOLATION_ALWAYS_APPLY_CLASSLOADER.key).toBoolean @@ -88,6 +88,9 @@ class ArtifactManager(session: SparkSession) extends Logging { */ protected val sessionArtifactAdded = new AtomicBoolean(false) + @volatile + protected var cachedClassLoader: Option[ClassLoader] = None + private def withClassLoaderIfNeeded[T](f: => T): T = { val log = s" classloader for session ${session.sessionUUID} because " + s"alwaysApplyClassLoader=$alwaysApplyClassLoader, " + @@ -203,6 +206,7 @@ class ArtifactManager(session: SparkSession) extends Logging { allowOverwrite = true, deleteSource = deleteStagedFile) sessionArtifactAdded.set(true) + cachedClassLoader = None } else { val target = ArtifactUtils.concatenatePaths(artifactPath, normalizedRemoteRelativePath) // Disallow overwriting with modified version @@ -227,6 +231,7 @@ class ArtifactManager(session: SparkSession) extends Logging { (SparkContextResourceType.JAR, normalizedRemoteRelativePath, fragment)) jarsList.add(normalizedRemoteRelativePath) sessionArtifactAdded.set(true) + cachedClassLoader = None } else if (normalizedRemoteRelativePath.startsWith(s"pyfiles${File.separator}")) { session.sparkContext.addFile(uri) sparkContextRelativePaths.add( @@ -282,10 +287,18 @@ class ArtifactManager(session: SparkSession) extends Logging { } } + def classloader: ClassLoader = synchronized { + cachedClassLoader.getOrElse { + val loader = buildClassLoader + cachedClassLoader = Some(loader) + loader + } + } + /** * Returns a [[ClassLoader]] for session-specific jar/class file resources. 
*/ - def classloader: ClassLoader = { + private def buildClassLoader: ClassLoader = { val urls = (getAddedJars :+ classDir.toUri.toURL).toArray val prefixes = SparkEnv.get.conf.get(CONNECT_SCALA_UDF_STUB_PREFIXES) val userClasspathFirst = SparkEnv.get.conf.get(EXECUTOR_USER_CLASS_PATH_FIRST) @@ -361,40 +374,48 @@ class ArtifactManager(session: SparkSession) extends Logging { newArtifactManager } + private val cleanUpStateForGlobalResources = ArtifactStateForCleanup( + session.sessionUUID, + session.sparkContext, + state, + artifactPath) + // Ensure that no reference to `this` is captured/help by the cleanup lambda + private def getCleanable: Cleaner.Cleanable = cleaner.register( + this, + () => ArtifactManager.cleanUpGlobalResources(cleanUpStateForGlobalResources) + ) + private var cleanable = getCleanable + /** * Cleans up all resources specific to this `session`. */ - private[sql] def cleanUpResources(): Unit = { + private def cleanUpResources(): Unit = { logDebug( s"Cleaning up resources for session with sessionUUID ${session.sessionUUID}") - // Clean up added files - val fileserver = SparkEnv.get.rpcEnv.fileServer - val sparkContext = session.sparkContext - if (state != null) { - val shouldUpdateEnv = sparkContext.addedFiles.contains(state.uuid) || - sparkContext.addedArchives.contains(state.uuid) || - sparkContext.addedJars.contains(state.uuid) - if (shouldUpdateEnv) { - sparkContext.addedFiles.remove(state.uuid).foreach(_.keys.foreach(fileserver.removeFile)) - sparkContext.addedArchives.remove(state.uuid).foreach(_.keys.foreach(fileserver.removeFile)) - sparkContext.addedJars.remove(state.uuid).foreach(_.keys.foreach(fileserver.removeJar)) - sparkContext.postEnvironmentUpdate() - } - } - - // Clean up cached relations - val blockManager = sparkContext.env.blockManager - blockManager.removeCache(session.sessionUUID) - - // Clean up artifacts folder - FileUtils.deleteDirectory(artifactPath.toFile) + // Clean up global resources via the Cleaner process. 
+ // Note that this will only be run once per instance. + cleanable.clean() // Clean up internal trackers jarsList.clear() pythonIncludeList.clear() cachedBlockIdList.clear() sparkContextRelativePaths.clear() + + // Removed cached classloader + cachedClassLoader = None + } + + override def close(): Unit = { + cleanUpResources() + } + + private[sql] def cleanUpResourcesForTesting(): Unit = { + cleanUpResources() + // Tests reuse the same instance so we need to re-register the cleanable otherwise, it is run + // only once per instance. + cleanable = getCleanable } def uploadArtifactToFs( @@ -466,4 +487,51 @@ object ArtifactManager extends Logging { throw SparkException.internalError(s"Block $fromId not found in the block manager.") } } + + // Shared cleaner instance + private val cleaner: Cleaner = Cleaner.create() + + /** + * Helper method to clean up global resources (i.e. resources associated with the calling + * instance but held externally in sparkContext, blockManager, disk etc.) + */ + private def cleanUpGlobalResources(cleanupState: ArtifactStateForCleanup): Unit = { + // Clean up added files + val (sparkSessionUUID, sparkContext, state, artifactPath) = ( + cleanupState.sparkSessionUUID, + cleanupState.sparkContext, + cleanupState.jobArtifactState, + cleanupState.artifactPath) + val fileServer = SparkEnv.get.rpcEnv.fileServer + if (state != null) { + val shouldUpdateEnv = sparkContext.addedFiles.contains(state.uuid) || + sparkContext.addedArchives.contains(state.uuid) || + sparkContext.addedJars.contains(state.uuid) + if (shouldUpdateEnv) { + sparkContext.addedFiles.remove(state.uuid).foreach(_.keys.foreach(fileServer.removeFile)) + sparkContext.addedArchives.remove(state.uuid).foreach(_.keys.foreach(fileServer.removeFile)) + sparkContext.addedJars.remove(state.uuid).foreach(_.keys.foreach(fileServer.removeJar)) + sparkContext.postEnvironmentUpdate() + } + } + + // Clean up cached relations + val blockManager = sparkContext.env.blockManager + 
blockManager.removeCache(sparkSessionUUID) + + // Clean up artifacts folder + try { + FileUtils.deleteDirectory(artifactPath.toFile) + } catch { + case e: IOException => + logWarning(log"Failed to delete directory ${MDC(LogKeys.PATH, artifactPath.toFile)}: " + + log"${MDC(LogKeys.EXCEPTION, e.getMessage)}", e) + } + } } + +private[artifact] case class ArtifactStateForCleanup( + sparkSessionUUID: String, + sparkContext: SparkContext, + jobArtifactState: JobArtifactState, + artifactPath: Path) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveSessionCatalog.scala b/sql/core/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveSessionCatalog.scala index 92c74f7bede18..b73ea2f80452b 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveSessionCatalog.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveSessionCatalog.scala @@ -54,6 +54,11 @@ class ResolveSessionCatalog(val catalogManager: CatalogManager) import org.apache.spark.sql.execution.datasources.v2.DataSourceV2Implicits._ override def apply(plan: LogicalPlan): LogicalPlan = plan.resolveOperatorsUp { + case _ if ResolveDefaultStringTypes.needsResolution(plan) => + // if there are still unresolved string types in the plan + // we should not try to resolve it + plan + case AddColumns(ResolvedV1TableIdentifier(ident), cols) => cols.foreach { c => if (c.name.length > 1) { @@ -149,11 +154,11 @@ class ResolveSessionCatalog(val catalogManager: CatalogManager) // Use v1 command to describe (temp) view, as v2 catalog doesn't support view yet. 
case DescribeRelation( - ResolvedV1TableOrViewIdentifier(ident), partitionSpec, isExtended, output) => + ResolvedV1TableOrViewIdentifier(ident), partitionSpec, isExtended, output) => DescribeTableCommand(ident, partitionSpec, isExtended, output) case DescribeColumn( - ResolvedViewIdentifier(ident), column: UnresolvedAttribute, isExtended, output) => + ResolvedViewIdentifier(ident), column: UnresolvedAttribute, isExtended, output) => // For views, the column will not be resolved by `ResolveReferences` because // `ResolvedView` stores only the identifier. DescribeColumnCommand(ident, column.nameParts, isExtended, output) @@ -416,11 +421,12 @@ class ResolveSessionCatalog(val catalogManager: CatalogManager) AlterViewSchemaBindingCommand(ident, viewSchemaMode) case CreateView(ResolvedIdentifierInSessionCatalog(ident), userSpecifiedColumns, comment, - properties, originalText, child, allowExisting, replace, viewSchemaMode) => + collation, properties, originalText, child, allowExisting, replace, viewSchemaMode) => CreateViewCommand( name = ident, userSpecifiedColumns = userSpecifiedColumns, comment = comment, + collation = collation, properties = properties, originalText = originalText, plan = child, @@ -429,7 +435,7 @@ class ResolveSessionCatalog(val catalogManager: CatalogManager) viewType = PersistedView, viewSchemaMode = viewSchemaMode) - case CreateView(ResolvedIdentifier(catalog, _), _, _, _, _, _, _, _, _) => + case CreateView(ResolvedIdentifier(catalog, _), _, _, _, _, _, _, _, _, _) => throw QueryCompilationErrors.missingCatalogAbilityError(catalog, "views") case ShowViews(ns: ResolvedNamespace, pattern, output) => @@ -491,6 +497,27 @@ class ResolveSessionCatalog(val catalogManager: CatalogManager) case CreateFunction(ResolvedIdentifier(catalog, _), _, _, _, _) => throw QueryCompilationErrors.missingCatalogAbilityError(catalog, "CREATE FUNCTION") + + case c @ CreateUserDefinedFunction( + ResolvedIdentifierInSessionCatalog(ident), _, _, _, _, _, _, _, _, _, _, _) 
=> + CreateUserDefinedFunctionCommand( + FunctionIdentifier(ident.table, ident.database, ident.catalog), + c.inputParamText, + c.returnTypeText, + c.exprText, + c.queryText, + c.comment, + c.isDeterministic, + c.containsSQL, + c.language, + c.isTableFunc, + isTemp = false, + c.ignoreIfExists, + c.replace) + + case CreateUserDefinedFunction( + ResolvedIdentifier(catalog, _), _, _, _, _, _, _, _, _, _, _, _) => + throw QueryCompilationErrors.missingCatalogAbilityError(catalog, "CREATE FUNCTION") } private def constructV1TableCmd( @@ -503,8 +530,8 @@ class ResolveSessionCatalog(val catalogManager: CatalogManager) storageFormat: CatalogStorageFormat, provider: String): CreateTableV1 = { val tableDesc = buildCatalogTable( - ident, tableSchema, partitioning, tableSpec.properties, provider, - tableSpec.location, tableSpec.comment, storageFormat, tableSpec.external) + ident, tableSchema, partitioning, tableSpec.properties, provider, tableSpec.location, + tableSpec.comment, tableSpec.collation, storageFormat, tableSpec.external) val mode = if (ignoreIfExists) SaveMode.Ignore else SaveMode.ErrorIfExists CreateTableV1(tableDesc, mode, query) } @@ -580,6 +607,7 @@ class ResolveSessionCatalog(val catalogManager: CatalogManager) provider: String, location: Option[String], comment: Option[String], + collation: Option[String], storageFormat: CatalogStorageFormat, external: Boolean): CatalogTable = { val tableType = if (external || location.isDefined) { @@ -600,7 +628,9 @@ class ResolveSessionCatalog(val catalogManager: CatalogManager) properties = properties ++ maybeClusterBySpec.map( clusterBySpec => ClusterBySpec.toProperty(schema, clusterBySpec, conf.resolver)), - comment = comment) + comment = comment, + collation = collation + ) } object ResolvedViewIdentifier { @@ -717,7 +747,7 @@ class ResolveSessionCatalog(val catalogManager: CatalogManager) private def supportsV1Command(catalog: CatalogPlugin): Boolean = { isSessionCatalog(catalog) && ( - 
SQLConf.get.getConf(SQLConf.V2_SESSION_CATALOG_IMPLEMENTATION).isEmpty || + SQLConf.get.getConf(SQLConf.V2_SESSION_CATALOG_IMPLEMENTATION) == "builtin" || catalog.isInstanceOf[CatalogExtension]) } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/catalyst/catalog/SQLFunction.scala b/sql/core/src/main/scala/org/apache/spark/sql/catalyst/catalog/SQLFunction.scala deleted file mode 100644 index 8ae0341e5646c..0000000000000 --- a/sql/core/src/main/scala/org/apache/spark/sql/catalyst/catalog/SQLFunction.scala +++ /dev/null @@ -1,95 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.sql.catalyst.catalog - -import org.apache.spark.sql.catalyst.FunctionIdentifier -import org.apache.spark.sql.catalyst.catalog.UserDefinedFunction._ -import org.apache.spark.sql.catalyst.parser.ParserInterface -import org.apache.spark.sql.types.{DataType, StructType} - -/** - * Represent a SQL function. 
- * - * @param name qualified name of the SQL function - * @param inputParam function input parameters - * @param returnType function return type - * @param exprText function body as an expression - * @param queryText function body as a query - * @param comment function comment - * @param deterministic whether the function is deterministic - * @param containsSQL whether the function has data access routine to be CONTAINS SQL - * @param isTableFunc whether the function is a table function - * @param properties additional properties to be serialized for the SQL function - * @param owner owner of the function - * @param createTimeMs function creation time in milliseconds - */ -case class SQLFunction( - name: FunctionIdentifier, - inputParam: Option[StructType], - returnType: Either[DataType, StructType], - exprText: Option[String], - queryText: Option[String], - comment: Option[String], - deterministic: Option[Boolean], - containsSQL: Option[Boolean], - isTableFunc: Boolean, - properties: Map[String, String], - owner: Option[String] = None, - createTimeMs: Long = System.currentTimeMillis) extends UserDefinedFunction { - - assert(exprText.nonEmpty || queryText.nonEmpty) - assert((isTableFunc && returnType.isRight) || (!isTableFunc && returnType.isLeft)) - - override val language: RoutineLanguage = LanguageSQL -} - -object SQLFunction { - - /** - * This method returns an optional DataType indicating, when present, either the return type for - * scalar user-defined functions, or a StructType indicating the names and types of the columns in - * the output schema for table functions. If the optional value is empty, this indicates that the - * CREATE FUNCTION statement did not have any RETURNS clause at all (for scalar functions), or - * that it included a RETURNS TABLE clause but without any specified output schema (for table - * functions), prompting the analyzer to infer these metadata instead. 
- */ - def parseReturnTypeText( - text: String, - isTableFunc: Boolean, - parser: ParserInterface): Option[Either[DataType, StructType]] = { - if (!isTableFunc) { - // This is a scalar user-defined function. - if (text.isEmpty) { - // The CREATE FUNCTION statement did not have any RETURNS clause. - Option.empty[Either[DataType, StructType]] - } else { - // The CREATE FUNCTION statement included a RETURNS clause with an explicit return type. - Some(Left(parseDataType(text, parser))) - } - } else { - // This is a table function. - if (text.equalsIgnoreCase("table")) { - // The CREATE FUNCTION statement had a RETURNS TABLE clause but without any explicit schema. - Option.empty[Either[DataType, StructType]] - } else { - // The CREATE FUNCTION statement included a RETURNS TABLE clause with an explicit schema. - Some(Right(parseTableSchema(text, parser))) - } - } - } -} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/catalyst/catalog/UserDefinedFunction.scala b/sql/core/src/main/scala/org/apache/spark/sql/catalyst/catalog/UserDefinedFunction.scala deleted file mode 100644 index 1473f19cb71bd..0000000000000 --- a/sql/core/src/main/scala/org/apache/spark/sql/catalyst/catalog/UserDefinedFunction.scala +++ /dev/null @@ -1,68 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.sql.catalyst.catalog - -import org.apache.spark.sql.catalyst.FunctionIdentifier -import org.apache.spark.sql.catalyst.parser.ParserInterface -import org.apache.spark.sql.catalyst.util.CharVarcharUtils -import org.apache.spark.sql.types.{DataType, StructType} - -/** - * The base class for all user defined functions registered via SQL queries. - */ -trait UserDefinedFunction { - - /** - * Qualified name of the function - */ - def name: FunctionIdentifier - - /** - * Additional properties to be serialized for the function. - * Use this to preserve the runtime configuration that should be used during the function - * execution, such as SQL configs etc. See [[SQLConf]] for more info. - */ - def properties: Map[String, String] - - /** - * Owner of the function - */ - def owner: Option[String] - - /** - * Function creation time in milliseconds since the linux epoch - */ - def createTimeMs: Long - - /** - * The language of the user defined function. 
- */ - def language: RoutineLanguage -} - -object UserDefinedFunction { - def parseTableSchema(text: String, parser: ParserInterface): StructType = { - val parsed = parser.parseTableSchema(text) - CharVarcharUtils.failIfHasCharVarchar(parsed).asInstanceOf[StructType] - } - - def parseDataType(text: String, parser: ParserInterface): DataType = { - val dataType = parser.parseDataType(text) - CharVarcharUtils.failIfHasCharVarchar(dataType) - } -} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/SQLFunctionNode.scala b/sql/core/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/SQLFunctionNode.scala new file mode 100644 index 0000000000000..0a3274af33b5b --- /dev/null +++ b/sql/core/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/SQLFunctionNode.scala @@ -0,0 +1,45 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.catalyst.plans.logical + +import org.apache.spark.sql.catalyst.catalog.SQLFunction +import org.apache.spark.sql.catalyst.expressions.Attribute +import org.apache.spark.sql.catalyst.trees.TreePattern.FUNCTION_TABLE_RELATION_ARGUMENT_EXPRESSION +import org.apache.spark.sql.errors.DataTypeErrors.toSQLId +import org.apache.spark.sql.errors.QueryCompilationErrors + +/** + * A container for holding a SQL function query plan and its function identifier. + * + * @param function: the SQL function that this node represents. + * @param child: the SQL function body. + */ +case class SQLFunctionNode( + function: SQLFunction, + child: LogicalPlan) extends UnaryNode { + override def output: Seq[Attribute] = child.output + override def stringArgs: Iterator[Any] = Iterator(function.name, child) + override protected def withNewChildInternal(newChild: LogicalPlan): SQLFunctionNode = + copy(child = newChild) + + // Throw a reasonable error message when trying to call a SQL UDF with TABLE argument(s). 
+ if (child.containsPattern(FUNCTION_TABLE_RELATION_ARGUMENT_EXPRESSION)) { + throw QueryCompilationErrors + .tableValuedArgumentsNotYetImplementedForSqlFunctions("call", toSQLId(function.name.funcName)) + } +} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/catalyst/util/V2ExpressionBuilder.scala b/sql/core/src/main/scala/org/apache/spark/sql/catalyst/util/V2ExpressionBuilder.scala index b0ce2bb4293e1..23ae5ee7b9be4 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/catalyst/util/V2ExpressionBuilder.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/catalyst/util/V2ExpressionBuilder.scala @@ -305,6 +305,8 @@ class V2ExpressionBuilder(e: Expression, isPredicate: Boolean = false) extends L case _: Md5 => generateExpressionWithName("MD5", expr, isPredicate) case _: Sha1 => generateExpressionWithName("SHA1", expr, isPredicate) case _: Sha2 => generateExpressionWithName("SHA2", expr, isPredicate) + case _: StringLPad => generateExpressionWithName("LPAD", expr, isPredicate) + case _: StringRPad => generateExpressionWithName("RPAD", expr, isPredicate) // TODO supports other expressions case ApplyFunctionExpression(function, children) => val childrenExpressions = children.flatMap(generateExpression(_)) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/classic/ClassicConversions.scala b/sql/core/src/main/scala/org/apache/spark/sql/classic/conversions.scala similarity index 56% rename from sql/core/src/main/scala/org/apache/spark/sql/classic/ClassicConversions.scala rename to sql/core/src/main/scala/org/apache/spark/sql/classic/conversions.scala index 8c3223fa72f55..e90fd4b6a6032 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/classic/ClassicConversions.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/classic/conversions.scala @@ -20,8 +20,8 @@ import scala.language.implicitConversions import org.apache.spark.annotation.DeveloperApi import org.apache.spark.sql._ -import org.apache.spark.sql.catalyst.expressions.Expression 
-import org.apache.spark.sql.internal.ExpressionUtils +import org.apache.spark.sql.catalyst.expressions.{Expression, NamedExpression} +import org.apache.spark.sql.internal.{ColumnNodeToExpressionConverter, ExpressionUtils} /** * Conversions from sql interfaces to the Classic specific implementation. @@ -56,4 +56,54 @@ trait ClassicConversions { } } +@DeveloperApi object ClassicConversions extends ClassicConversions + +/** + * Conversions from a [[Column]] to an [[Expression]]. + */ +@DeveloperApi +trait ColumnConversions { + protected def converter: ColumnNodeToExpressionConverter + + /** + * Convert a [[Column]] into an [[Expression]]. + */ + @DeveloperApi + def expression(column: Column): Expression = converter(column.node) + + /** + * Wrap a [[Column]] with a [[RichColumn]] to provide the `expr` and `named` methods. + */ + @DeveloperApi + implicit def toRichColumn(column: Column): RichColumn = new RichColumn(column, converter) +} + +/** + * Automatic conversions from a Column to an Expression. This uses the active SparkSession for + * parsing, and the active SQLConf for fetching configurations. + * + * This functionality is not part of the ClassicConversions because it is generally better to use + * `SparkSession.toRichColumn(...)` or `SparkSession.expression(...)` directly. + */ +@DeveloperApi +object ColumnConversions extends ColumnConversions { + override protected def converter: ColumnNodeToExpressionConverter = + ColumnNodeToExpressionConverter +} + +/** + * Helper class that adds the `expr` and `named` methods to a Column. This can be used to reinstate + * the pre-Spark 4 Column functionality. + */ +@DeveloperApi +class RichColumn(column: Column, converter: ColumnNodeToExpressionConverter) { + /** + * Returns the expression for this column. + */ + def expr: Expression = converter(column.node) + /** + * Returns the expression for this column either with an existing or auto assigned name. 
+ */ + def named: NamedExpression = ExpressionUtils.toNamed(expr) +} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/BaseScriptTransformationExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/BaseScriptTransformationExec.scala index 64d2633c31079..60156bff1fb71 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/BaseScriptTransformationExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/BaseScriptTransformationExec.scala @@ -239,7 +239,7 @@ trait BaseScriptTransformationExec extends UnaryExecNode { val complexTypeFactory = JsonToStructs(attr.dataType, ioschema.outputSerdeProps.toMap, Literal(null), Some(conf.sessionLocalTimeZone)) wrapperConvertException(data => - complexTypeFactory.evaluator.evaluate(UTF8String.fromString(data)), any => any) + complexTypeFactory.nullSafeEval(UTF8String.fromString(data)), any => any) case udt: UserDefinedType[_] => wrapperConvertException(data => udt.deserialize(data), converter) case dt => diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/Columnar.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/Columnar.scala index 64163da50e13a..a67648f24b4c2 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/Columnar.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/Columnar.scala @@ -194,7 +194,7 @@ case class ColumnarToRowExec(child: SparkPlan) extends ColumnarToRowTransition w | $shouldStop | } | $idx = $numRows; - | $batch.closeIfNotWritable(); + | $batch.closeIfFreeable(); | $batch = null; | $nextBatchFuncName(); |} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/InsertSortForLimitAndOffset.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/InsertSortForLimitAndOffset.scala index 6c7a9206a8e39..aa29128cda7e0 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/InsertSortForLimitAndOffset.scala +++ 
b/sql/core/src/main/scala/org/apache/spark/sql/execution/InsertSortForLimitAndOffset.scala @@ -18,10 +18,11 @@ package org.apache.spark.sql.execution import org.apache.spark.sql.catalyst.expressions.SortOrder +import org.apache.spark.sql.catalyst.plans.logical.{Project, Sort} import org.apache.spark.sql.catalyst.plans.physical.SinglePartition import org.apache.spark.sql.catalyst.rules.Rule -import org.apache.spark.sql.execution.adaptive.{AQEShuffleReadExec, ShuffleQueryStageExec} import org.apache.spark.sql.execution.exchange.ShuffleExchangeExec +import org.apache.spark.sql.execution.python.EvalPythonExec import org.apache.spark.sql.internal.SQLConf /** @@ -41,31 +42,61 @@ object InsertSortForLimitAndOffset extends Rule[SparkPlan] { plan transform { case l @ GlobalLimitExec( _, - SinglePartitionShuffleWithGlobalOrdering(ordering), - _) => - val newChild = SortExec(ordering, global = false, child = l.child) - l.withNewChildren(Seq(newChild)) - } - } - - object SinglePartitionShuffleWithGlobalOrdering { - def unapply(plan: SparkPlan): Option[Seq[SortOrder]] = plan match { - case ShuffleExchangeExec(SinglePartition, SparkPlanWithGlobalOrdering(ordering), _, _) => - Some(ordering) - case p: AQEShuffleReadExec => unapply(p.child) - case p: ShuffleQueryStageExec => unapply(p.plan) - case _ => None + // Should not match AQE shuffle stage because we only target un-submitted stages which + // we can still rewrite the query plan. + s @ ShuffleExchangeExec(SinglePartition, child, _, _), + _) if child.logicalLink.isDefined => + extractOrderingAndPropagateOrderingColumns(child) match { + case Some((ordering, newChild)) => + val newShuffle = s.withNewChildren(Seq(newChild)) + val sorted = SortExec(ordering, global = false, child = newShuffle) + // We must set the logical plan link to avoid losing the added SortExec and ProjectExec + // during AQE re-optimization, where we turn physical plan back to logical plan. 
+ val logicalSort = Sort(ordering, global = false, child = s.child.logicalLink.get) + sorted.setLogicalLink(logicalSort) + val projected = if (sorted.output == s.output) { + sorted + } else { + val p = ProjectExec(s.output, sorted) + p.setLogicalLink(Project(s.output, logicalSort)) + p + } + l.withNewChildren(Seq(projected)) + case _ => l + } } } // Note: this is not implementing a generalized notion of "global order preservation", but just - // tackles the regular ORDER BY semantics with optional LIMIT (top-K). - object SparkPlanWithGlobalOrdering { - def unapply(plan: SparkPlan): Option[Seq[SortOrder]] = plan match { - case p: SortExec if p.global => Some(p.sortOrder) - case p: LocalLimitExec => unapply(p.child) - case p: WholeStageCodegenExec => unapply(p.child) - case _ => None - } + // a best effort to catch the common query patterns that the data ordering should be preserved. + private def extractOrderingAndPropagateOrderingColumns( + plan: SparkPlan): Option[(Seq[SortOrder], SparkPlan)] = plan match { + case p: SortExec if p.global => Some(p.sortOrder, p) + case p: UnaryExecNode if + p.isInstanceOf[LocalLimitExec] || + p.isInstanceOf[WholeStageCodegenExec] || + p.isInstanceOf[FilterExec] || + p.isInstanceOf[EvalPythonExec] => + extractOrderingAndPropagateOrderingColumns(p.child) match { + case Some((ordering, newChild)) => Some((ordering, p.withNewChildren(Seq(newChild)))) + case _ => None + } + case p: ProjectExec => + extractOrderingAndPropagateOrderingColumns(p.child) match { + case Some((ordering, newChild)) => + val orderingCols = ordering.flatMap(_.references) + if (orderingCols.forall(p.outputSet.contains)) { + Some((ordering, p.withNewChildren(Seq(newChild)))) + } else { + // In order to do the sort after shuffle, we must propagate the ordering columns in the + // pre-shuffle ProjectExec. 
+ val missingCols = orderingCols.filterNot(p.outputSet.contains) + val newProj = p.copy(projectList = p.projectList ++ missingCols, child = newChild) + newProj.copyTagsFrom(p) + Some((ordering, newProj)) + } + case _ => None + } + case _ => None } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/QueryExecution.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/QueryExecution.scala index 490184c93620a..d9b1a2136a5d3 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/QueryExecution.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/QueryExecution.scala @@ -31,12 +31,11 @@ import org.apache.spark.internal.LogKeys.EXTENDED_EXPLAIN_GENERATOR import org.apache.spark.rdd.RDD import org.apache.spark.sql.{AnalysisException, ExtendedExplainGenerator, Row, SparkSession} import org.apache.spark.sql.catalyst.{InternalRow, QueryPlanningTracker} -import org.apache.spark.sql.catalyst.analysis.UnsupportedOperationChecker +import org.apache.spark.sql.catalyst.analysis.{LazyExpression, UnsupportedOperationChecker} import org.apache.spark.sql.catalyst.expressions.codegen.ByteCodeStats import org.apache.spark.sql.catalyst.plans.QueryPlan import org.apache.spark.sql.catalyst.plans.logical.{AppendData, Command, CommandResult, CreateTableAsSelect, LogicalPlan, OverwriteByExpression, OverwritePartitionsDynamic, ReplaceTableAsSelect, ReturnAnswer, Union} import org.apache.spark.sql.catalyst.rules.{PlanChangeLogger, Rule} -import org.apache.spark.sql.catalyst.trees.TreePattern.LAZY_ANALYSIS_EXPRESSION import org.apache.spark.sql.catalyst.util.StringUtils.PlanStringConcat import org.apache.spark.sql.catalyst.util.truncatedString import org.apache.spark.sql.execution.adaptive.{AdaptiveExecutionContext, InsertAdaptiveSparkPlan} @@ -69,7 +68,10 @@ class QueryExecution( // TODO: Move the planner an optimizer into here from SessionState. 
protected def planner = sparkSession.sessionState.planner - lazy val isLazyAnalysis: Boolean = logical.containsAnyPattern(LAZY_ANALYSIS_EXPRESSION) + lazy val isLazyAnalysis: Boolean = { + // Only check the main query as subquery expression can be resolved now with the main query. + logical.exists(_.expressions.exists(_.exists(_.isInstanceOf[LazyExpression]))) + } def assertAnalyzed(): Unit = { try { @@ -90,12 +92,18 @@ class QueryExecution( } private val lazyAnalyzed = LazyTry { - val plan = executePhase(QueryPlanningTracker.ANALYSIS) { - // We can't clone `logical` here, which will reset the `_analyzed` flag. - sparkSession.sessionState.analyzer.executeAndCheck(logical, tracker) + try { + val plan = executePhase(QueryPlanningTracker.ANALYSIS) { + // We can't clone `logical` here, which will reset the `_analyzed` flag. + sparkSession.sessionState.analyzer.executeAndCheck(logical, tracker) + } + tracker.setAnalyzed(plan) + plan + } catch { + case NonFatal(e) => + tracker.setAnalysisFailed(logical) + throw e } - tracker.setAnalyzed(plan) - plan } def analyzed: LogicalPlan = lazyAnalyzed.get diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkOptimizer.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkOptimizer.scala index 6173703ef3cd9..a51870cfd7fdd 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkOptimizer.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkOptimizer.scala @@ -23,7 +23,7 @@ import org.apache.spark.sql.catalyst.optimizer._ import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.catalyst.rules.Rule import org.apache.spark.sql.connector.catalog.CatalogManager -import org.apache.spark.sql.execution.datasources.{PruneFileSourcePartitions, SchemaPruning, V1Writes} +import org.apache.spark.sql.execution.datasources.{PruneFileSourcePartitions, PushVariantIntoScan, SchemaPruning, V1Writes} import 
org.apache.spark.sql.execution.datasources.v2.{GroupBasedRowLevelOperationScanPlanning, OptimizeMetadataOnlyDeleteFromTable, V2ScanPartitioningAndOrdering, V2ScanRelationPushDown, V2Writes} import org.apache.spark.sql.execution.dynamicpruning.{CleanupDynamicPruningFilters, PartitionPruning, RowLevelOperationRuntimeGroupFiltering} import org.apache.spark.sql.execution.python.{ExtractGroupingPythonUDFFromAggregate, ExtractPythonUDFFromAggregate, ExtractPythonUDFs, ExtractPythonUDTFs} @@ -36,38 +36,42 @@ class SparkOptimizer( override def earlyScanPushDownRules: Seq[Rule[LogicalPlan]] = // TODO: move SchemaPruning into catalyst - Seq(SchemaPruning) :+ - GroupBasedRowLevelOperationScanPlanning :+ - V1Writes :+ - V2ScanRelationPushDown :+ - V2ScanPartitioningAndOrdering :+ - V2Writes :+ - PruneFileSourcePartitions + Seq( + SchemaPruning, + GroupBasedRowLevelOperationScanPlanning, + V1Writes, + V2ScanRelationPushDown, + V2ScanPartitioningAndOrdering, + V2Writes, + PruneFileSourcePartitions, + PushVariantIntoScan) override def preCBORules: Seq[Rule[LogicalPlan]] = - OptimizeMetadataOnlyDeleteFromTable :: Nil + Seq(OptimizeMetadataOnlyDeleteFromTable) - override def defaultBatches: Seq[Batch] = (preOptimizationBatches ++ super.defaultBatches :+ - Batch("Optimize Metadata Only Query", Once, OptimizeMetadataOnlyQuery(catalog)) :+ + override def defaultBatches: Seq[Batch] = flattenBatches(Seq( + preOptimizationBatches, + super.defaultBatches, + Batch("Optimize Metadata Only Query", Once, OptimizeMetadataOnlyQuery(catalog)), Batch("PartitionPruning", Once, PartitionPruning, // We can't run `OptimizeSubqueries` in this batch, as it will optimize the subqueries // twice which may break some optimizer rules that can only be applied once. The rule below // only invokes `OptimizeSubqueries` to optimize newly added subqueries. 
- new RowLevelOperationRuntimeGroupFiltering(OptimizeSubqueries)) :+ + new RowLevelOperationRuntimeGroupFiltering(OptimizeSubqueries)), Batch("InjectRuntimeFilter", FixedPoint(1), - InjectRuntimeFilter) :+ + InjectRuntimeFilter), Batch("MergeScalarSubqueries", Once, MergeScalarSubqueries, - RewriteDistinctAggregates) :+ + RewriteDistinctAggregates), Batch("Pushdown Filters from PartitionPruning", fixedPoint, - PushDownPredicates) :+ + PushDownPredicates), Batch("Cleanup filters that cannot be pushed down", Once, CleanupDynamicPruningFilters, // cleanup the unnecessary TrueLiteral predicates BooleanSimplification, - PruneFilters)) ++ - postHocOptimizationBatches :+ + PruneFilters), + postHocOptimizationBatches, Batch("Extract Python UDFs", Once, ExtractPythonUDFFromJoinCondition, // `ExtractPythonUDFFromJoinCondition` can convert a join to a cartesian product. @@ -84,25 +88,27 @@ class SparkOptimizer( LimitPushDown, PushPredicateThroughNonJoin, PushProjectionThroughLimit, - RemoveNoopOperators) :+ + RemoveNoopOperators), Batch("Infer window group limit", Once, InferWindowGroupLimit, LimitPushDown, LimitPushDownThroughWindow, EliminateLimits, - ConstantFolding) :+ - Batch("User Provided Optimizers", fixedPoint, experimentalMethods.extraOptimizations: _*) :+ - Batch("Replace CTE with Repartition", Once, ReplaceCTERefWithRepartition) + ConstantFolding), + Batch("User Provided Optimizers", fixedPoint, experimentalMethods.extraOptimizations: _*), + Batch("Replace CTE with Repartition", Once, ReplaceCTERefWithRepartition))) - override def nonExcludableRules: Seq[String] = super.nonExcludableRules :+ - ExtractPythonUDFFromJoinCondition.ruleName :+ - ExtractPythonUDFFromAggregate.ruleName :+ ExtractGroupingPythonUDFFromAggregate.ruleName :+ - ExtractPythonUDFs.ruleName :+ - GroupBasedRowLevelOperationScanPlanning.ruleName :+ - V2ScanRelationPushDown.ruleName :+ - V2ScanPartitioningAndOrdering.ruleName :+ - V2Writes.ruleName :+ - ReplaceCTERefWithRepartition.ruleName + 
override def nonExcludableRules: Seq[String] = super.nonExcludableRules ++ + Seq( + ExtractPythonUDFFromJoinCondition.ruleName, + ExtractPythonUDFFromAggregate.ruleName, + ExtractGroupingPythonUDFFromAggregate.ruleName, + ExtractPythonUDFs.ruleName, + GroupBasedRowLevelOperationScanPlanning.ruleName, + V2ScanRelationPushDown.ruleName, + V2ScanPartitioningAndOrdering.ruleName, + V2Writes.ruleName, + ReplaceCTERefWithRepartition.ruleName) /** * Optimization batches that are executed before the regular optimization batches (also before diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkPlanner.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkPlanner.scala index da3159319f98e..5dfe85548349c 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkPlanner.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkPlanner.scala @@ -25,10 +25,13 @@ import org.apache.spark.sql.execution.adaptive.LogicalQueryStageStrategy import org.apache.spark.sql.execution.command.v2.V2CommandStrategy import org.apache.spark.sql.execution.datasources.{DataSourceStrategy, FileSourceStrategy} import org.apache.spark.sql.execution.datasources.v2.DataSourceV2Strategy +import org.apache.spark.sql.internal.SQLConf class SparkPlanner(val session: SparkSession, val experimentalMethods: ExperimentalMethods) extends SparkStrategies with SQLConfHelper { + override def conf: SQLConf = session.sessionState.conf + def numPartitions: Int = conf.numShufflePartitions override def strategies: Seq[Strategy] = diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala index 9fbe400a555fc..2b7be9b34b9aa 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala @@ -27,7 +27,7 @@ import org.antlr.v4.runtime.tree.TerminalNode 
import org.apache.spark.SparkException import org.apache.spark.sql.catalyst.{FunctionIdentifier, TableIdentifier} -import org.apache.spark.sql.catalyst.analysis.{GlobalTempView, LocalTempView, PersistedView, PlanWithUnresolvedIdentifier, SchemaEvolution, SchemaTypeEvolution, UnresolvedFunctionName, UnresolvedIdentifier, UnresolvedNamespace} +import org.apache.spark.sql.catalyst.analysis.{GlobalTempView, LocalTempView, PersistedView, PlanWithUnresolvedIdentifier, SchemaEvolution, SchemaTypeEvolution, UnresolvedAttribute, UnresolvedFunctionName, UnresolvedIdentifier, UnresolvedNamespace} import org.apache.spark.sql.catalyst.catalog._ import org.apache.spark.sql.catalyst.expressions.{Expression, Literal} import org.apache.spark.sql.catalyst.parser._ @@ -63,7 +63,7 @@ class SparkSqlAstBuilder extends AstBuilder { import org.apache.spark.sql.connector.catalog.CatalogV2Implicits._ private val configKeyValueDef = """([a-zA-Z_\d\\.:]+)\s*=([^;]*);*""".r - private val configKeyDef = """([a-zA-Z_\d\\.:]+)$""".r + private val configKeyDef = """([a-zA-Z_\d\\.:]+)\s*$""".r private val configValueDef = """([^;]*);*""".r private val strLiteralDef = """(".*?[^\\]"|'.*?[^\\]'|[^ \n\r\t"']+)""".r @@ -106,14 +106,14 @@ class SparkSqlAstBuilder extends AstBuilder { SetCommand(Some(keyStr -> None)) } } else { - remainder(ctx.SET.getSymbol).trim match { + remainder(ctx.SET.getSymbol).trim.replaceAll(";+$", "") match { case configKeyValueDef(key, value) => SetCommand(Some(key -> Option(value.trim))) case configKeyDef(key) => SetCommand(Some(key -> None)) - case s if s == "-v" => + case s if s.trim == "-v" => SetCommand(Some("-v" -> None)) - case s if s.isEmpty => + case s if s.trim.isEmpty => SetCommand(None) case _ => throw QueryParsingErrors.unexpectedFormatForSetConfigurationError(ctx) } @@ -146,7 +146,7 @@ class SparkSqlAstBuilder extends AstBuilder { */ override def visitResetConfiguration( ctx: ResetConfigurationContext): LogicalPlan = withOrigin(ctx) { - 
remainder(ctx.RESET.getSymbol).trim match { + remainder(ctx.RESET.getSymbol).trim.replaceAll(";+$", "") match { case configKeyDef(key) => ResetCommand(Some(key)) case s if s.trim.isEmpty => @@ -377,7 +377,7 @@ class SparkSqlAstBuilder extends AstBuilder { invalidStatement("CREATE TEMPORARY TABLE IF NOT EXISTS", ctx) } - val (_, _, _, _, options, location, _, _, _) = + val (_, _, _, _, options, location, _, _, _, _) = visitCreateTableClauses(ctx.createTableClauses()) val provider = Option(ctx.tableProvider).map(_.multipartIdentifier.getText).getOrElse( throw QueryParsingErrors.createTempTableNotSpecifyProviderError(ctx)) @@ -520,6 +520,7 @@ class SparkSqlAstBuilder extends AstBuilder { * * create_view_clauses (order insensitive): * [COMMENT view_comment] + * [DEFAULT COLLATION collation_name] * [TBLPROPERTIES (property_name = property_value, ...)] * }}} */ @@ -529,6 +530,7 @@ class SparkSqlAstBuilder extends AstBuilder { } checkDuplicateClauses(ctx.commentSpec(), "COMMENT", ctx) + checkDuplicateClauses(ctx.collationSpec(), "DEFAULT COLLATION", ctx) checkDuplicateClauses(ctx.schemaBinding(), "WITH SCHEMA", ctx) checkDuplicateClauses(ctx.PARTITIONED, "PARTITIONED ON", ctx) checkDuplicateClauses(ctx.TBLPROPERTIES, "TBLPROPERTIES", ctx) @@ -584,6 +586,7 @@ class SparkSqlAstBuilder extends AstBuilder { withIdentClause(ctx.identifierReference(), UnresolvedIdentifier(_)), userSpecifiedColumns, visitCommentSpecList(ctx.commentSpec()), + visitCollationSpecList(ctx.collationSpec()), properties, Some(originalText), qPlan, @@ -609,6 +612,7 @@ class SparkSqlAstBuilder extends AstBuilder { tableIdentifier, userSpecifiedColumns, visitCommentSpecList(ctx.commentSpec()), + visitCollationSpecList(ctx.collationSpec()), properties, Option(source(ctx.query)), otherPlans.head, @@ -719,8 +723,19 @@ class SparkSqlAstBuilder extends AstBuilder { withIdentClause(ctx.identifierReference(), functionIdentifier => { if (ctx.TEMPORARY == null) { - // TODO: support creating persistent UDFs. 
- operationNotAllowed(s"creating persistent SQL functions is not supported", ctx) + CreateUserDefinedFunction( + UnresolvedIdentifier(functionIdentifier), + inputParamText, + returnTypeText, + exprText, + queryText, + comment, + deterministic, + containsSQL, + language, + isTableFunc, + ctx.EXISTS != null, + ctx.REPLACE != null) } else { // Disallow to define a temporary function with `IF NOT EXISTS` if (ctx.EXISTS != null) { @@ -1138,4 +1153,46 @@ class SparkSqlAstBuilder extends AstBuilder { withIdentClause(ctx.identifierReference(), UnresolvedNamespace(_)), cleanedProperties) } + + /** + * Create a [[DescribeColumn]] or [[DescribeRelation]] or [[DescribeRelationAsJsonCommand]] + * command. + */ + override def visitDescribeRelation(ctx: DescribeRelationContext): LogicalPlan = withOrigin(ctx) { + val isExtended = ctx.EXTENDED != null || ctx.FORMATTED != null + val asJson = ctx.JSON != null + if (asJson && !isExtended) { + val tableName = ctx.identifierReference.getText.split("\\.").lastOption.getOrElse("table") + throw QueryCompilationErrors.describeJsonNotExtendedError(tableName) + } + val relation = createUnresolvedTableOrView(ctx.identifierReference, "DESCRIBE TABLE") + if (ctx.describeColName != null) { + if (ctx.partitionSpec != null) { + throw QueryParsingErrors.descColumnForPartitionUnsupportedError(ctx) + } else if (asJson) { + throw QueryCompilationErrors.describeColJsonUnsupportedError() + } else { + DescribeColumn( + relation, + UnresolvedAttribute(ctx.describeColName.nameParts.asScala.map(_.getText).toSeq), + isExtended) + } + } else { + val partitionSpec = if (ctx.partitionSpec != null) { + // According to the syntax, visitPartitionSpec returns `Map[String, Option[String]]`. 
+ visitPartitionSpec(ctx.partitionSpec).map { + case (key, Some(value)) => key -> value + case (key, _) => + throw QueryParsingErrors.emptyPartitionKeyError(key, ctx.partitionSpec) + } + } else { + Map.empty[String, String] + } + if (asJson) { + DescribeRelationJsonCommand(relation, partitionSpec, isExtended) + } else { + DescribeRelation(relation, partitionSpec, isExtended) + } + } + } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala index 22082aca81a22..36e25773f8342 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala @@ -607,7 +607,12 @@ abstract class SparkStrategies extends QueryPlanner[SparkPlan] { // [COUNT(DISTINCT bar), COUNT(DISTINCT foo)] is disallowed because those two distinct // aggregates have different column expressions. val distinctExpressions = - functionsWithDistinct.head.aggregateFunction.children.filterNot(_.foldable) + functionsWithDistinct.head.aggregateFunction.children + .filterNot(_.foldable) + .map { + case s: SortOrder => s.child + case e => e + } val normalizedNamedDistinctExpressions = distinctExpressions.map { e => // Ideally this should be done in `NormalizeFloatingNumbers`, but we do it here // because `distinctExpressions` is not extracted during logical phase. 
@@ -789,8 +794,8 @@ abstract class SparkStrategies extends QueryPlanner[SparkPlan] { object TransformWithStateInPandasStrategy extends Strategy { override def apply(plan: LogicalPlan): Seq[SparkPlan] = plan match { case t @ TransformWithStateInPandas( - func, _, outputAttrs, outputMode, timeMode, child, - hasInitialState, initialState, _, initialStateSchema) => + func, _, outputAttrs, outputMode, timeMode, child, + hasInitialState, initialState, _, initialStateSchema) => val execPlan = TransformWithStateInPandasExec( func, t.leftAttributes, outputAttrs, outputMode, timeMode, stateInfo = None, @@ -798,6 +803,7 @@ abstract class SparkStrategies extends QueryPlanner[SparkPlan] { eventTimeWatermarkForLateEvents = None, eventTimeWatermarkForEviction = None, planLater(child), + isStreaming = true, hasInitialState, planLater(initialState), t.rightAttributes, @@ -962,6 +968,12 @@ abstract class SparkStrategies extends QueryPlanner[SparkPlan] { keyEncoder, outputObjAttr, planLater(child), hasInitialState, initialStateGroupingAttrs, initialStateDataAttrs, initialStateDeserializer, planLater(initialState)) :: Nil + case t @ TransformWithStateInPandas( + func, _, outputAttrs, outputMode, timeMode, child, + hasInitialState, initialState, _, initialStateSchema) => + TransformWithStateInPandasExec.generateSparkPlanForBatchQueries(func, + t.leftAttributes, outputAttrs, outputMode, timeMode, planLater(child), hasInitialState, + planLater(initialState), t.rightAttributes, initialStateSchema) :: Nil case _: FlatMapGroupsInPandasWithState => // TODO(SPARK-40443): support applyInPandasWithState in batch query diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/CoalesceShufflePartitions.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/CoalesceShufflePartitions.scala index 1bbc26f3e52ed..3fdcb17bdeae6 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/CoalesceShufflePartitions.scala +++ 
b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/CoalesceShufflePartitions.scala @@ -33,6 +33,8 @@ import org.apache.spark.util.Utils */ case class CoalesceShufflePartitions(session: SparkSession) extends AQEShuffleReadRule { + override def conf: SQLConf = session.sessionState.conf + override val supportedShuffleOrigins: Seq[ShuffleOrigin] = Seq(ENSURE_REQUIREMENTS, REPARTITION_BY_COL, REBALANCE_PARTITIONS_BY_NONE, REBALANCE_PARTITIONS_BY_COL) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/InsertAdaptiveSparkPlan.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/InsertAdaptiveSparkPlan.scala index 8517911d70262..73fc9b1fe4e2c 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/InsertAdaptiveSparkPlan.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/InsertAdaptiveSparkPlan.scala @@ -44,6 +44,8 @@ import org.apache.spark.sql.internal.SQLConf case class InsertAdaptiveSparkPlan( adaptiveExecutionContext: AdaptiveExecutionContext) extends Rule[SparkPlan] { + override def conf: SQLConf = adaptiveExecutionContext.session.sessionState.conf + override def apply(plan: SparkPlan): SparkPlan = applyInternal(plan, false) private def applyInternal(plan: SparkPlan, isSubquery: Boolean): SparkPlan = plan match { diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/PlanAdaptiveDynamicPruningFilters.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/PlanAdaptiveDynamicPruningFilters.scala index 3d35abff3c538..77c180b18aee0 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/PlanAdaptiveDynamicPruningFilters.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/PlanAdaptiveDynamicPruningFilters.scala @@ -25,12 +25,16 @@ import org.apache.spark.sql.catalyst.trees.TreePattern._ import org.apache.spark.sql.execution._ import 
org.apache.spark.sql.execution.exchange.BroadcastExchangeExec import org.apache.spark.sql.execution.joins.{BroadcastHashJoinExec, HashedRelationBroadcastMode, HashJoin} +import org.apache.spark.sql.internal.SQLConf /** * A rule to insert dynamic pruning predicates in order to reuse the results of broadcast. */ case class PlanAdaptiveDynamicPruningFilters( rootPlan: AdaptiveSparkPlanExec) extends Rule[SparkPlan] with AdaptiveSparkPlanHelper { + + override def conf: SQLConf = rootPlan.context.session.sessionState.conf + def apply(plan: SparkPlan): SparkPlan = { if (!conf.dynamicPartitionPruningEnabled) { return plan diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/PlanAdaptiveSubqueries.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/PlanAdaptiveSubqueries.scala index 35a815d83922d..5f2638655c37c 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/PlanAdaptiveSubqueries.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/PlanAdaptiveSubqueries.scala @@ -30,7 +30,7 @@ case class PlanAdaptiveSubqueries( def apply(plan: SparkPlan): SparkPlan = { plan.transformAllExpressionsWithPruning( _.containsAnyPattern(SCALAR_SUBQUERY, IN_SUBQUERY, DYNAMIC_PRUNING_SUBQUERY)) { - case expressions.ScalarSubquery(_, _, exprId, _, _, _, _, _) => + case expressions.ScalarSubquery(_, _, exprId, _, _, _, _) => val subquery = SubqueryExec.createForScalarSubquery( s"subquery#${exprId.id}", subqueryMap(exprId.id)) execution.ScalarSubquery(subquery, exprId) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/ShufflePartitionsUtil.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/ShufflePartitionsUtil.scala index bb7d904402ded..1ea4df0254673 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/ShufflePartitionsUtil.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/ShufflePartitionsUtil.scala @@ -61,7 
+61,7 @@ object ShufflePartitionsUtil extends Logging { val targetSize = maxTargetSize.min(advisoryTargetSize).max(minPartitionSize) val shuffleIds = mapOutputStatistics.flatMap(_.map(_.shuffleId)).mkString(", ") - logInfo(log"For shuffle(${MDC(LogKeys.SHUFFLE_ID, shuffleIds)}, advisory target size: " + + logInfo(log"For shuffle(${MDC(LogKeys.SHUFFLE_IDS, shuffleIds)}, advisory target size: " + log"${MDC(LogKeys.ADVISORY_TARGET_SIZE, advisoryTargetSize)}, actual target size " + log"${MDC(LogKeys.TARGET_SIZE, targetSize)}, minimum partition size: " + log"${MDC(LogKeys.PARTITION_SIZE, minPartitionSize)}") diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/udaf.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/udaf.scala index 09d9915022a65..1197a16a35e9b 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/udaf.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/udaf.scala @@ -501,17 +501,17 @@ case class ScalaAggregator[IN, BUF, OUT]( with Logging { // input and buffer encoders are resolved by ResolveEncodersInScalaAgg - private[this] lazy val inputDeserializer = inputEncoder.createDeserializer() - private[this] lazy val bufferSerializer = bufferEncoder.createSerializer() - private[this] lazy val bufferDeserializer = bufferEncoder.createDeserializer() - private[this] lazy val outputEncoder = encoderFor(agg.outputEncoder) - private[this] lazy val outputSerializer = outputEncoder.createSerializer() + @transient private[this] lazy val inputDeserializer = inputEncoder.createDeserializer() + @transient private[this] lazy val bufferSerializer = bufferEncoder.createSerializer() + @transient private[this] lazy val bufferDeserializer = bufferEncoder.createDeserializer() + @transient private[this] lazy val outputEncoder = encoderFor(agg.outputEncoder) + @transient private[this] lazy val outputSerializer = outputEncoder.createSerializer() def dataType: DataType = 
outputEncoder.objSerializer.dataType def inputTypes: Seq[DataType] = inputEncoder.schema.map(_.dataType) - override lazy val deterministic: Boolean = isDeterministic + @transient override lazy val deterministic: Boolean = isDeterministic def withNewMutableAggBufferOffset(newMutableAggBufferOffset: Int): ScalaAggregator[IN, BUF, OUT] = copy(mutableAggBufferOffset = newMutableAggBufferOffset) @@ -519,7 +519,7 @@ case class ScalaAggregator[IN, BUF, OUT]( def withNewInputAggBufferOffset(newInputAggBufferOffset: Int): ScalaAggregator[IN, BUF, OUT] = copy(inputAggBufferOffset = newInputAggBufferOffset) - private[this] lazy val inputProjection = UnsafeProjection.create(children) + @transient private[this] lazy val inputProjection = UnsafeProjection.create(children) def createAggregationBuffer(): BUF = agg.zero @@ -533,7 +533,7 @@ case class ScalaAggregator[IN, BUF, OUT]( if (outputEncoder.isSerializedAsStruct) row else row.get(0, dataType) } - private[this] lazy val bufferRow = new UnsafeRow(bufferEncoder.namedExpressions.length) + @transient private[this] lazy val bufferRow = new UnsafeRow(bufferEncoder.namedExpressions.length) def serialize(agg: BUF): Array[Byte] = bufferSerializer(agg).asInstanceOf[UnsafeRow].getBytes() diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/AnalyzeColumnCommand.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/AnalyzeColumnCommand.scala index 23555c98135f6..1268b14a32fb5 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/AnalyzeColumnCommand.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/AnalyzeColumnCommand.scala @@ -140,6 +140,7 @@ case class AnalyzeColumnCommand( case DoubleType | FloatType => true case BooleanType => true case _: DatetimeType => true + case CharType(_) | VarcharType(_) => false case BinaryType | _: StringType => true case _ => false } diff --git 
a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/CreateSQLFunctionCommand.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/CreateSQLFunctionCommand.scala index d2aaa93fcca06..fe4e6f121f57b 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/CreateSQLFunctionCommand.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/CreateSQLFunctionCommand.scala @@ -17,9 +17,19 @@ package org.apache.spark.sql.execution.command -import org.apache.spark.sql.{Row, SparkSession} +import org.apache.spark.SparkException +import org.apache.spark.sql.{AnalysisException, Row, SparkSession} import org.apache.spark.sql.catalyst.FunctionIdentifier -import org.apache.spark.sql.catalyst.catalog.SQLFunction +import org.apache.spark.sql.catalyst.analysis.{Analyzer, SQLFunctionNode, UnresolvedAlias, UnresolvedAttribute, UnresolvedFunction, UnresolvedRelation} +import org.apache.spark.sql.catalyst.catalog.{SessionCatalog, SQLFunction, UserDefinedFunctionErrors} +import org.apache.spark.sql.catalyst.expressions.{Alias, Cast, Generator, LateralSubquery, Literal, ScalarSubquery, SubqueryExpression, WindowExpression} +import org.apache.spark.sql.catalyst.expressions.aggregate.AggregateExpression +import org.apache.spark.sql.catalyst.plans.Inner +import org.apache.spark.sql.catalyst.plans.logical.{LateralJoin, LogicalPlan, OneRowRelation, Project, UnresolvedWith} +import org.apache.spark.sql.catalyst.trees.TreePattern.UNRESOLVED_ATTRIBUTE +import org.apache.spark.sql.errors.QueryCompilationErrors +import org.apache.spark.sql.execution.command.CreateUserDefinedFunctionCommand._ +import org.apache.spark.sql.types.{DataType, StructField, StructType} /** * The DDL command that creates a SQL function. 
@@ -52,10 +62,13 @@ case class CreateSQLFunctionCommand( replace: Boolean) extends CreateUserDefinedFunctionCommand { - override def run(sparkSession: SparkSession): Seq[Row] = { - import SQLFunction._ + import SQLFunction._ + override def run(sparkSession: SparkSession): Seq[Row] = { val parser = sparkSession.sessionState.sqlParser + val analyzer = sparkSession.sessionState.analyzer + val catalog = sparkSession.sessionState.catalog + val conf = sparkSession.sessionState.conf val inputParam = inputParamText.map(parser.parseTableSchema) val returnType = parseReturnTypeText(returnTypeText, isTableFunc, parser) @@ -72,8 +85,332 @@ case class CreateSQLFunctionCommand( isTableFunc, Map.empty) - // TODO: Implement the rest of the method. + val newFunction = { + val (expression, query) = function.getExpressionAndQuery(parser, isTableFunc) + assert(query.nonEmpty || expression.nonEmpty) + + // Check if the function can be replaced. + if (replace && catalog.functionExists(name)) { + checkFunctionSignatures(catalog, name) + } + + // Build function input. + val inputPlan = if (inputParam.isDefined) { + val param = inputParam.get + checkParameterNotNull(param, inputParamText.get) + checkParameterNameDuplication(param, conf, name) + checkDefaultsTrailing(param, name) + + // Qualify the input parameters with the function name so that attributes referencing + // the function input parameters can be resolved correctly. 
+ val qualifier = Seq(name.funcName) + val input = param.map(p => Alias( + { + val defaultExpr = p.getDefault() + if (defaultExpr.isEmpty) { + Literal.create(null, p.dataType) + } else { + val defaultPlan = parseDefault(defaultExpr.get, parser) + if (SubqueryExpression.hasSubquery(defaultPlan)) { + throw new AnalysisException( + errorClass = "USER_DEFINED_FUNCTIONS.NOT_A_VALID_DEFAULT_EXPRESSION", + messageParameters = + Map("functionName" -> name.funcName, "parameterName" -> p.name)) + } else if (defaultPlan.containsPattern(UNRESOLVED_ATTRIBUTE)) { + // TODO(SPARK-50698): use parsed expression instead of expression string. + defaultPlan.collect { + case a: UnresolvedAttribute => + throw QueryCompilationErrors.unresolvedAttributeError( + "UNRESOLVED_COLUMN", a.sql, Seq.empty, a.origin) + } + } + Cast(defaultPlan, p.dataType) + } + }, p.name)(qualifier = qualifier)) + Project(input, OneRowRelation()) + } else { + OneRowRelation() + } + + // Build the function body and check if the function body can be analyzed successfully. + val (unresolvedPlan, analyzedPlan, inferredReturnType) = if (!isTableFunc) { + // Build SQL scalar function plan. + val outputExpr = if (query.isDefined) ScalarSubquery(query.get) else expression.get + val plan: LogicalPlan = returnType.map { t => + val retType: DataType = t match { + case Left(t) => t + case _ => throw SparkException.internalError( + "Unexpected return type for a scalar SQL UDF.") + } + val outputCast = Seq(Alias(Cast(outputExpr, retType), name.funcName)()) + Project(outputCast, inputPlan) + }.getOrElse { + // If no explicit RETURNS clause is present, infer the result type from the function body. + val outputAlias = Seq(Alias(outputExpr, name.funcName)()) + Project(outputAlias, inputPlan) + } + + // Check the function body can be analyzed correctly. 
+ val analyzed = analyzer.execute(plan) + val (resolved, resolvedReturnType) = analyzed match { + case p @ Project(expr :: Nil, _) if expr.resolved => + (p, Left(expr.dataType)) + case other => + (other, function.returnType) + } + + // Check if the SQL function body contains aggregate/window functions. + // This check needs to be performed before checkAnalysis to provide better error messages. + checkAggOrWindowOrGeneratorExpr(resolved) + + // Check if the SQL function body can be analyzed. + checkFunctionBodyAnalysis(analyzer, function, resolved) + + (plan, resolved, resolvedReturnType) + } else { + // Build SQL table function plan. + if (query.isEmpty) { + throw UserDefinedFunctionErrors.bodyIsNotAQueryForSqlTableUdf(name.funcName) + } + + // Construct a lateral join to analyze the function body. + val plan = LateralJoin(inputPlan, LateralSubquery(query.get), Inner, None) + val analyzed = analyzer.execute(plan) + val newPlan = analyzed match { + case Project(_, j: LateralJoin) => j + case j: LateralJoin => j + case _ => throw SparkException.internalError("Unexpected plan returned when " + + s"creating a SQL TVF: ${analyzed.getClass.getSimpleName}.") + } + val maybeResolved = newPlan.asInstanceOf[LateralJoin].right.plan + + // Check if the function body can be analyzed. + checkFunctionBodyAnalysis(analyzer, function, maybeResolved) + + // Get the function's return schema. + val returnParam: StructType = returnType.map { + case Right(t) => t + case Left(_) => throw SparkException.internalError( + "Unexpected return schema for a SQL table function.") + }.getOrElse { + // If no explicit RETURNS clause is present, infer the result type from the function body. + // To detect this, we search for instances of the UnresolvedAlias expression. 
Examples: + // CREATE TABLE t USING PARQUET AS VALUES (0, 1), (1, 2) AS tab(c1, c2); + // SELECT c1 FROM t --> UnresolvedAttribute: 'c1 + // SELECT c1 + 1 FROM t --> UnresolvedAlias: unresolvedalias(('c1 + 1), None) + // SELECT c1 + 1 AS a FROM t --> Alias: ('c1 + 1) AS a#2 + query.get match { + case Project(projectList, _) if projectList.exists(_.isInstanceOf[UnresolvedAlias]) => + throw UserDefinedFunctionErrors.missingColumnNamesForSqlTableUdf(name.funcName) + case _ => + StructType(analyzed.asInstanceOf[LateralJoin].right.plan.output.map { col => + StructField(col.name, col.dataType) + }) + } + } + + // Check the return columns cannot have NOT NULL specified. + checkParameterNotNull(returnParam, returnTypeText) + + // Check duplicated return column names. + checkReturnsColumnDuplication(returnParam, conf, name) + + // Check if the actual output size equals to the number of return parameters. + val outputSize = maybeResolved.output.size + if (outputSize != returnParam.size) { + throw new AnalysisException( + errorClass = "USER_DEFINED_FUNCTIONS.RETURN_COLUMN_COUNT_MISMATCH", + messageParameters = Map( + "outputSize" -> s"$outputSize", + "returnParamSize" -> s"${returnParam.size}", + "name" -> s"$name" + ) + ) + } + + (plan, analyzed, Right(returnParam)) + } + + // A permanent function is not allowed to reference temporary objects. + // This should be called after `qe.assertAnalyzed()` (i.e., `plan` can be resolved) + verifyTemporaryObjectsNotExists(catalog, isTemp, name, unresolvedPlan, analyzedPlan) + + // Generate function properties. + val properties = generateFunctionProperties(sparkSession, unresolvedPlan, analyzedPlan) + + // Derive determinism of the SQL function. + val deterministic = analyzedPlan.deterministic + + function.copy( + // Assign the return type, inferring from the function body if needed. 
+ returnType = inferredReturnType, + deterministic = Some(function.deterministic.getOrElse(deterministic)), + properties = properties + ) + } + + if (isTemp) { + if (isTableFunc) { + catalog.registerSQLTableFunction(newFunction, overrideIfExists = replace) + } else { + catalog.registerSQLScalarFunction(newFunction, overrideIfExists = replace) + } + } else { + if (replace && catalog.functionExists(name)) { + // Hive metastore alter function method does not alter function resources + // so the existing function must be dropped first when replacing a SQL function. + assert(!ignoreIfExists) + catalog.dropFunction(name, ignoreIfExists) + } + // For a persistent function, we will store the metadata into underlying external catalog. + // This function will be loaded into the FunctionRegistry when a query uses it. + // We do not load it into FunctionRegistry right now, to avoid loading the resource + // immediately, as the Spark application to create the function may not have + // access to the function. + catalog.createUserDefinedFunction(newFunction, ignoreIfExists) + } Seq.empty } + + /** + * Check if the function body can be analyzed. + */ + private def checkFunctionBodyAnalysis( + analyzer: Analyzer, + function: SQLFunction, + body: LogicalPlan): Unit = { + analyzer.checkAnalysis(SQLFunctionNode(function, body)) + } + + /** Check whether the new function is replacing an existing SQL function. */ + private def checkFunctionSignatures(catalog: SessionCatalog, name: FunctionIdentifier): Unit = { + val info = catalog.lookupFunctionInfo(name) + if (!isSQLFunction(info.getClassName)) { + throw new AnalysisException( + errorClass = "USER_DEFINED_FUNCTIONS.CANNOT_REPLACE_NON_SQL_UDF_WITH_SQL_UDF", + messageParameters = Map("name" -> s"$name") + ) + } + } + + /** + * Collect all temporary views and functions and return the identifiers separately + * This func traverses the unresolved plan `child`. 
Below are the reasons: + * 1) Analyzer replaces unresolved temporary views by a SubqueryAlias with the corresponding + * logical plan. After replacement, it is impossible to detect whether the SubqueryAlias is + * added/generated from a temporary view. + * 2) The temp functions are represented by multiple classes. Most are inaccessible from this + * package (e.g., HiveGenericUDF). + * 3) Temporary SQL functions, once resolved, cannot be identified as temp functions. + */ + private def collectTemporaryObjectsInUnresolvedPlan( + catalog: SessionCatalog, + child: LogicalPlan): (Seq[Seq[String]], Seq[String]) = { + import org.apache.spark.sql.connector.catalog.CatalogV2Implicits._ + def collectTempViews(child: LogicalPlan): Seq[Seq[String]] = { + child.flatMap { + case UnresolvedRelation(nameParts, _, _) if catalog.isTempView(nameParts) => + Seq(nameParts) + case w: UnresolvedWith if !w.resolved => w.innerChildren.flatMap(collectTempViews) + case plan if !plan.resolved => plan.expressions.flatMap(_.flatMap { + case e: SubqueryExpression => collectTempViews(e.plan) + case _ => Seq.empty + }) + case _ => Seq.empty + }.distinct + } + + def collectTempFunctions(child: LogicalPlan): Seq[String] = { + child.flatMap { + case w: UnresolvedWith if !w.resolved => w.innerChildren.flatMap(collectTempFunctions) + case plan if !plan.resolved => + plan.expressions.flatMap(_.flatMap { + case e: SubqueryExpression => collectTempFunctions(e.plan) + case e: UnresolvedFunction + if catalog.isTemporaryFunction(e.nameParts.asFunctionIdentifier) => + Seq(e.nameParts.asFunctionIdentifier.funcName) + case _ => Seq.empty + }) + case _ => Seq.empty + }.distinct + } + (collectTempViews(child), collectTempFunctions(child)) + } + + /** + * Permanent functions are not allowed to reference temp objects, including temp functions + * and temp views. 
+ */ + private def verifyTemporaryObjectsNotExists( + catalog: SessionCatalog, + isTemporary: Boolean, + name: FunctionIdentifier, + child: LogicalPlan, + analyzed: LogicalPlan): Unit = { + import org.apache.spark.sql.connector.catalog.CatalogV2Implicits._ + if (!isTemporary) { + val (tempViews, tempFunctions) = collectTemporaryObjectsInUnresolvedPlan(catalog, child) + tempViews.foreach { nameParts => + throw UserDefinedFunctionErrors.invalidTempViewReference( + routineName = name.asMultipart, tempViewName = nameParts) + } + tempFunctions.foreach { funcName => + throw UserDefinedFunctionErrors.invalidTempFuncReference( + routineName = name.asMultipart, tempFuncName = funcName) + } + val tempVars = ViewHelper.collectTemporaryVariables(analyzed) + tempVars.foreach { varName => + throw UserDefinedFunctionErrors.invalidTempVarReference( + routineName = name.asMultipart, varName = varName) + } + } + } + + /** + * Check if the SQL function body contains aggregate/window/generate functions. + * Note subqueries inside the SQL function body can contain aggregate/window/generate functions. + */ + private def checkAggOrWindowOrGeneratorExpr(plan: LogicalPlan): Unit = { + if (plan.resolved) { + plan.transformAllExpressions { + case e if e.isInstanceOf[WindowExpression] || e.isInstanceOf[Generator] || + e.isInstanceOf[AggregateExpression] => + throw new AnalysisException( + errorClass = "USER_DEFINED_FUNCTIONS.CANNOT_CONTAIN_COMPLEX_FUNCTIONS", + messageParameters = Map("queryText" -> s"${exprText.orElse(queryText).get}") + ) + } + } + } + + /** + * Generate the function properties, including: + * 1. the SQL configs when creating the function. + * 2. the catalog and database name when creating the function. This will be used to provide + * context during nested function resolution. + * 3. referred temporary object names if the function is a temp function. 
+ */ + private def generateFunctionProperties( + session: SparkSession, + plan: LogicalPlan, + analyzed: LogicalPlan): Map[String, String] = { + val catalog = session.sessionState.catalog + val conf = session.sessionState.conf + val manager = session.sessionState.catalogManager + + // Only collect temporary object names when the function is a temp function. + val (tempViews, tempFunctions) = if (isTemp) { + collectTemporaryObjectsInUnresolvedPlan(catalog, plan) + } else { + (Nil, Nil) + } + val tempVars = ViewHelper.collectTemporaryVariables(analyzed) + + sqlConfigsToProps(conf) ++ + catalogAndNamespaceToProps( + manager.currentCatalog.name, + manager.currentNamespace.toIndexedSeq) ++ + referredTempNamesToProps(tempViews, tempFunctions, tempVars) + } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/CreateUserDefinedFunctionCommand.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/CreateUserDefinedFunctionCommand.scala index bebb0f5cf6c38..1ee3c8a4c388f 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/CreateUserDefinedFunctionCommand.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/CreateUserDefinedFunctionCommand.scala @@ -17,9 +17,15 @@ package org.apache.spark.sql.execution.command +import java.util.Locale + +import org.apache.spark.sql.AnalysisException import org.apache.spark.sql.catalyst.FunctionIdentifier import org.apache.spark.sql.catalyst.catalog.{LanguageSQL, RoutineLanguage, UserDefinedFunctionErrors} +import org.apache.spark.sql.catalyst.catalog.UserDefinedFunction._ import org.apache.spark.sql.catalyst.plans.logical.IgnoreCachedData +import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.types.StructType /** * The base class for CreateUserDefinedFunctionCommand @@ -74,4 +80,108 @@ object CreateUserDefinedFunctionCommand { throw UserDefinedFunctionErrors.unsupportedUserDefinedFunction(other) } } + + /** + * Convert SQL configs to 
properties by prefixing all configs with a key. + * When converting a function to [[org.apache.spark.sql.catalyst.catalog.CatalogFunction]] or + * [[org.apache.spark.sql.catalyst.expressions.ExpressionInfo]], all SQL configs and other + * function properties (such as the function parameters and the function return type) + * are saved together in a property map. + */ + def sqlConfigsToProps(conf: SQLConf): Map[String, String] = { + val modifiedConfs = ViewHelper.getModifiedConf(conf) + modifiedConfs.map { case (key, value) => s"$SQL_CONFIG_PREFIX$key" -> value } + } + + /** + * Check whether the function parameters contain duplicated column names. + * It takes the function input parameter struct as input and verifies that there is no duplicates + * in the parameter column names. + * If any duplicates are found, it throws an exception with helpful information for users to + * fix the wrong function parameters. + * + * Perform this check while registering the function to fail early. + * This check does not need to run the function itself. + */ + def checkParameterNameDuplication( + param: StructType, + conf: SQLConf, + name: FunctionIdentifier): Unit = { + val names = if (conf.caseSensitiveAnalysis) { + param.fields.map(_.name) + } else { + param.fields.map(_.name.toLowerCase(Locale.ROOT)) + } + if (names.distinct.length != names.length) { + val duplicateColumns = names.groupBy(identity).collect { + case (x, ys) if ys.length > 1 => s"`$x`" + } + throw UserDefinedFunctionErrors.duplicateParameterNames( + routineName = name.funcName, + names = duplicateColumns.toSeq.sorted.mkString(", ")) + } + } + + /** + * Check whether the function has duplicate column names in the RETURNS clause. 
+ */ + def checkReturnsColumnDuplication( + columns: StructType, + conf: SQLConf, + name: FunctionIdentifier): Unit = { + val names = if (conf.caseSensitiveAnalysis) { + columns.fields.map(_.name) + } else { + columns.fields.map(_.name.toLowerCase(Locale.ROOT)) + } + if (names.distinct.length != names.length) { + val duplicateColumns = names.groupBy(identity).collect { + case (x, ys) if ys.length > 1 => s"`$x`" + } + throw UserDefinedFunctionErrors.duplicateReturnsColumns( + routineName = name.funcName, + columns = duplicateColumns.toSeq.sorted.mkString(", ")) + } + } + + /** + * Check whether the function parameters contain non trailing defaults. + * For languages that support default values for input parameters, + * this check ensures once a default value is given to a parameter, + * all subsequent parameters must also have a default value. It throws error if otherwise. + * + * Perform this check on function input parameters while registering the function to fail early. + * This check does not need to run the function itself. + */ + def checkDefaultsTrailing(param: StructType, name: FunctionIdentifier): Unit = { + var defaultFound = false + var previousParamName = ""; + param.fields.foreach { field => + if (field.getDefault().isEmpty && defaultFound) { + throw new AnalysisException( + errorClass = "USER_DEFINED_FUNCTIONS.NOT_A_VALID_DEFAULT_PARAMETER_POSITION", + messageParameters = Map( + "functionName" -> name.funcName, + "parameterName" -> previousParamName, + "nextParameterName" -> field.name)) + } + defaultFound |= field.getDefault().isDefined + previousParamName = field.name + } + } + + /** + * Check whether the function input or return columns (for TABLE Return type) have NOT NULL + * specified. Throw exception if NOT NULL is found. + * + * Perform this check on function input and return parameters while registering the function + * to fail early. This check does not need to run the function itself. 
+ */ + def checkParameterNotNull(param: StructType, input: String): Unit = { + param.fields.foreach { field => + if (!field.nullable) { + throw UserDefinedFunctionErrors.cannotSpecifyNotNullOnFunctionParameters(input) + } + } + } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/DescribeRelationJsonCommand.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/DescribeRelationJsonCommand.scala new file mode 100644 index 0000000000000..6abe34f0ea156 --- /dev/null +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/DescribeRelationJsonCommand.scala @@ -0,0 +1,313 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.execution.command + +import scala.collection.mutable + +import org.json4s._ +import org.json4s.JsonAST.JObject +import org.json4s.jackson.JsonMethods._ + +import org.apache.spark.sql.{Row, SparkSession} +import org.apache.spark.sql.catalyst.analysis.{ResolvedPersistentView, ResolvedTable, ResolvedTempView} +import org.apache.spark.sql.catalyst.catalog.{CatalogTable, CatalogTableType, SessionCatalog} +import org.apache.spark.sql.catalyst.catalog.CatalogTypes.TablePartitionSpec +import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference} +import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan +import org.apache.spark.sql.catalyst.util.quoteIfNeeded +import org.apache.spark.sql.connector.catalog.CatalogV2Implicits._ +import org.apache.spark.sql.connector.catalog.V1Table +import org.apache.spark.sql.errors.QueryCompilationErrors +import org.apache.spark.sql.types._ +import org.apache.spark.sql.util.PartitioningUtils + +/** + * The command for `DESCRIBE ... AS JSON`. 
+ */ +case class DescribeRelationJsonCommand( + child: LogicalPlan, + partitionSpec: TablePartitionSpec, + isExtended: Boolean, + override val output: Seq[Attribute] = Seq( + AttributeReference( + "json_metadata", + StringType, + nullable = false, + new MetadataBuilder().putString("comment", "JSON metadata of the table").build())() + )) extends UnaryRunnableCommand { + + override def run(sparkSession: SparkSession): Seq[Row] = { + val jsonMap = mutable.LinkedHashMap[String, JValue]() + child match { + case v: ResolvedTempView => + if (partitionSpec.nonEmpty) { + throw QueryCompilationErrors.descPartitionNotAllowedOnTempView(v.identifier.name()) + } + describeIdentifier(Seq("system", "session", v.identifier.name()), jsonMap) + describeColsJson(v.metadata.schema, jsonMap) + describeFormattedTableInfoJson(v.metadata, jsonMap) + + case v: ResolvedPersistentView => + if (partitionSpec.nonEmpty) { + throw QueryCompilationErrors.descPartitionNotAllowedOnView(v.identifier.name()) + } + describeIdentifier(v.identifier.toQualifiedNameParts(v.catalog), jsonMap) + describeColsJson(v.metadata.schema, jsonMap) + describeFormattedTableInfoJson(v.metadata, jsonMap) + + case ResolvedTable(catalog, identifier, V1Table(metadata), _) => + describeIdentifier(identifier.toQualifiedNameParts(catalog), jsonMap) + val schema = if (metadata.schema.isEmpty) { + // In older versions of Spark, + // the table schema can be empty and should be inferred at runtime. 
+ sparkSession.table(metadata.identifier).schema + } else { + metadata.schema + } + describeColsJson(schema, jsonMap) + describeClusteringInfoJson(metadata, jsonMap) + if (partitionSpec.nonEmpty) { + // Outputs the partition-specific info for the DDL command: + // "DESCRIBE [EXTENDED|FORMATTED] table_name PARTITION (partitionVal*)" + describePartitionInfoJson( + sparkSession, sparkSession.sessionState.catalog, metadata, jsonMap) + } else { + describeFormattedTableInfoJson(metadata, jsonMap) + } + + case _ => throw QueryCompilationErrors.describeAsJsonNotSupportedForV2TablesError() + } + + Seq(Row(compact(render(JObject(jsonMap.toList))))) + } + + private def addKeyValueToMap( + key: String, + value: JValue, + jsonMap: mutable.LinkedHashMap[String, JValue]): Unit = { + // Rename some JSON keys that are pre-named in describe table implementation + val renames = Map( + "inputformat" -> "input_format", + "outputformat" -> "output_format" + ) + + val normalizedKey = key.toLowerCase().replace(" ", "_") + val renamedKey = renames.getOrElse(normalizedKey, normalizedKey) + + if (!jsonMap.contains(renamedKey) && !excludedKeys.contains(renamedKey)) { + jsonMap += renamedKey -> value + } + } + + private def describeIdentifier( + ident: Seq[String], + jsonMap: mutable.LinkedHashMap[String, JValue]): Unit = { + addKeyValueToMap("table_name", JString(ident.last), jsonMap) + addKeyValueToMap("catalog_name", JString(ident.head), jsonMap) + val namespace = ident.init.tail + addKeyValueToMap("namespace", JArray(namespace.map(JString).toList), jsonMap) + if (namespace.nonEmpty) { + addKeyValueToMap("schema_name", JString(namespace.last), jsonMap) + } + } + + /** + * Util to recursively form JSON string representation of data type, used for DESCRIBE AS JSON. + * Differs from `json` in DataType.scala by providing additional fields for some types. 
+ */ + private def jsonType(dataType: DataType): JValue = { + dataType match { + case arrayType: ArrayType => + JObject( + "name" -> JString("array"), + "element_type" -> jsonType(arrayType.elementType), + "element_nullable" -> JBool(arrayType.containsNull) + ) + + case mapType: MapType => + JObject( + "name" -> JString("map"), + "key_type" -> jsonType(mapType.keyType), + "value_type" -> jsonType(mapType.valueType), + "value_nullable" -> JBool(mapType.valueContainsNull) + ) + + case structType: StructType => + val fieldsJson = structType.fields.map { field => + val baseJson = List( + "name" -> JString(field.name), + "type" -> jsonType(field.dataType), + "nullable" -> JBool(field.nullable) + ) + val commentJson = field.getComment().map(comment => "comment" -> JString(comment)).toList + val defaultJson = + field.getCurrentDefaultValue().map(default => "default" -> JString(default)).toList + + JObject(baseJson ++ commentJson ++ defaultJson: _*) + }.toList + + JObject( + "name" -> JString("struct"), + "fields" -> JArray(fieldsJson) + ) + + case decimalType: DecimalType => + JObject( + "name" -> JString("decimal"), + "precision" -> JInt(decimalType.precision), + "scale" -> JInt(decimalType.scale) + ) + + case varcharType: VarcharType => + JObject( + "name" -> JString("varchar"), + "length" -> JInt(varcharType.length) + ) + + case charType: CharType => + JObject( + "name" -> JString("char"), + "length" -> JInt(charType.length) + ) + + // Only override TimestampType; TimestampType_NTZ type is already timestamp_ntz + case _: TimestampType => + JObject("name" -> JString("timestamp_ltz")) + + case yearMonthIntervalType: YearMonthIntervalType => + def getFieldName(field: Byte): String = YearMonthIntervalType.fieldToString(field) + + JObject( + "name" -> JString("interval"), + "start_unit" -> JString(getFieldName(yearMonthIntervalType.startField)), + "end_unit" -> JString(getFieldName(yearMonthIntervalType.endField)) + ) + + case dayTimeIntervalType: DayTimeIntervalType => + 
def getFieldName(field: Byte): String = DayTimeIntervalType.fieldToString(field) + + JObject( + "name" -> JString("interval"), + "start_unit" -> JString(getFieldName(dayTimeIntervalType.startField)), + "end_unit" -> JString(getFieldName(dayTimeIntervalType.endField)) + ) + + case _ => + JObject("name" -> JString(dataType.simpleString)) + } + } + + private def describeColsJson( + schema: StructType, + jsonMap: mutable.LinkedHashMap[String, JValue]): Unit = { + val columnsJson = jsonType(StructType(schema.fields)) + .asInstanceOf[JObject].find(_.isInstanceOf[JArray]).get + addKeyValueToMap("columns", columnsJson, jsonMap) + } + + private def describeClusteringInfoJson( + table: CatalogTable, jsonMap: mutable.LinkedHashMap[String, JValue]): Unit = { + table.clusterBySpec.foreach { clusterBySpec => + val clusteringColumnsJson: JValue = JArray( + clusterBySpec.columnNames.map { fieldNames => + val nestedFieldOpt = table.schema.findNestedField(fieldNames.fieldNames.toIndexedSeq) + assert(nestedFieldOpt.isDefined, + "The clustering column " + + s"${fieldNames.fieldNames.map(quoteIfNeeded).mkString(".")} " + + s"was not found in the table schema ${table.schema.catalogString}." 
+ ) + val (path, field) = nestedFieldOpt.get + JObject( + "name" -> JString((path :+ field.name).map(quoteIfNeeded).mkString(".")), + "type" -> jsonType(field.dataType), + "comment" -> field.getComment().map(JString).getOrElse(JNull) + ) + }.toList + ) + addKeyValueToMap("clustering_information", clusteringColumnsJson, jsonMap) + } + } + + private def describeFormattedTableInfoJson( + table: CatalogTable, jsonMap: mutable.LinkedHashMap[String, JValue]): Unit = { + table.bucketSpec match { + case Some(spec) => + spec.toJsonLinkedHashMap.foreach { case (key, value) => + addKeyValueToMap(key, value, jsonMap) + } + case _ => + } + table.storage.toJsonLinkedHashMap.foreach { case (key, value) => + addKeyValueToMap(key, value, jsonMap) + } + + val filteredTableInfo = table.toJsonLinkedHashMap + + filteredTableInfo.map { case (key, value) => + addKeyValueToMap(key, value, jsonMap) + } + } + + private def describePartitionInfoJson( + spark: SparkSession, + catalog: SessionCatalog, + metadata: CatalogTable, + jsonMap: mutable.LinkedHashMap[String, JValue]): Unit = { + if (metadata.tableType == CatalogTableType.VIEW) { + throw QueryCompilationErrors.descPartitionNotAllowedOnView(metadata.identifier.identifier) + } + + DDLUtils.verifyPartitionProviderIsHive(spark, metadata, "DESC PARTITION") + val normalizedPartSpec = PartitioningUtils.normalizePartitionSpec( + partitionSpec, + metadata.partitionSchema, + metadata.identifier.quotedString, + spark.sessionState.conf.resolver) + val partition = catalog.getPartition(metadata.identifier, normalizedPartSpec) + + // First add partition details to jsonMap. + // `addKeyValueToMap` only adds unique keys, so this ensures the + // more detailed partition information is added + // in the case of duplicated key names (e.g. storage_information). 
+ partition.toJsonLinkedHashMap.foreach { case (key, value) => + addKeyValueToMap(key, value, jsonMap) + } + + metadata.toJsonLinkedHashMap.foreach { case (key, value) => + addKeyValueToMap(key, value, jsonMap) + } + + metadata.bucketSpec match { + case Some(spec) => + spec.toJsonLinkedHashMap.foreach { case (key, value) => + addKeyValueToMap(key, value, jsonMap) + } + case _ => + } + metadata.storage.toJsonLinkedHashMap.foreach { case (key, value) => + addKeyValueToMap(key, value, jsonMap) + } + } + + // Already added to jsonMap in DescribeTableJsonCommand + private val excludedKeys = Set("catalog", "schema", "database", "table") + + override protected def withNewChildInternal(newChild: LogicalPlan): LogicalPlan = { + copy(child = newChild) + } +} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/ddl.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/ddl.scala index a8a91af1bdbc4..9dfe5c3e4c301 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/ddl.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/ddl.scala @@ -196,7 +196,8 @@ case class DescribeDatabaseCommand( if (properties.isEmpty) { "" } else { - conf.redactOptions(properties).toSeq.sortBy(_._1).mkString("(", ", ", ")") + sparkSession.sessionState.conf.redactOptions(properties).toSeq + .sortBy(_._1).mkString("(", ", ", ")") } result :+ Row("Properties", propertiesStr) } else { @@ -548,7 +549,7 @@ case class AlterTableAddPartitionCommand( // Hive metastore may not have enough memory to handle millions of partitions in single RPC. // Also the request to metastore times out when adding lot of partitions in one shot. 
// we should split them into smaller batches - val batchSize = conf.getConf(SQLConf.ADD_PARTITION_BATCH_SIZE) + val batchSize = sparkSession.sessionState.conf.getConf(SQLConf.ADD_PARTITION_BATCH_SIZE) parts.iterator.grouped(batchSize).foreach { batch => catalog.createPartitions(table.identifier, batch, ignoreIfExists = ifNotExists) } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala index 9ecd3fd19aa64..a58e8fac6e36d 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala @@ -240,7 +240,10 @@ case class AlterTableAddColumnsCommand( SchemaUtils.checkColumnNameDuplication( (colsWithProcessedDefaults ++ catalogTable.schema).map(_.name), - conf.caseSensitiveAnalysis) + sparkSession.sessionState.conf.caseSensitiveAnalysis) + if (!conf.allowCollationsInMapKeys) { + colsToAdd.foreach(col => SchemaUtils.checkNoCollationsInMapKeys(col.dataType)) + } DDLUtils.checkTableColumns(catalogTable, StructType(colsWithProcessedDefaults)) val existingSchema = CharVarcharUtils.getRawSchema(catalogTable.dataSchema) @@ -498,7 +501,7 @@ case class TruncateTableCommand( partLocations } val hadoopConf = spark.sessionState.newHadoopConf() - val ignorePermissionAcl = conf.truncateTableIgnorePermissionAcl + val ignorePermissionAcl = spark.sessionState.conf.truncateTableIgnorePermissionAcl locations.foreach { location => if (location.isDefined) { val path = new Path(location.get) @@ -816,7 +819,8 @@ case class DescribeColumnCommand( val catalogTable = catalog.getTempViewOrPermanentTableMetadata(table) val colStatsMap = catalogTable.stats.map(_.colStats).getOrElse(Map.empty) - val colStats = if (conf.caseSensitiveAnalysis) colStatsMap else CaseInsensitiveMap(colStatsMap) + val colStats = if (sparkSession.sessionState.conf.caseSensitiveAnalysis) colStatsMap + else 
CaseInsensitiveMap(colStatsMap) val cs = colStats.get(field.name) val comment = if (field.metadata.contains("comment")) { @@ -972,7 +976,7 @@ case class ShowTablePropertiesCommand( Seq.empty[Row] } else { val catalogTable = catalog.getTableMetadata(table) - val properties = conf.redactOptions(catalogTable.properties) + val properties = sparkSession.sessionState.conf.redactOptions(catalogTable.properties) propertyKey match { case Some(p) => val propValue = properties diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/views.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/views.scala index 071e3826b20a0..6428583c9e1ea 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/views.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/views.scala @@ -47,6 +47,7 @@ import org.apache.spark.util.ArrayImplicits._ * @param userSpecifiedColumns the output column names and optional comments specified by users, * can be Nil if not specified. * @param comment the comment of this view. + * @param collation the collation of this view. * @param properties the properties of this view. * @param originalText the original SQL text of this view, can be None if this view is created via * Dataset API. @@ -64,6 +65,7 @@ case class CreateViewCommand( name: TableIdentifier, userSpecifiedColumns: Seq[(String, Option[String])], comment: Option[String], + collation: Option[String], properties: Map[String, String], originalText: Option[String], plan: LogicalPlan, @@ -220,7 +222,8 @@ case class CreateViewCommand( properties = newProperties, viewOriginalText = originalText, viewText = originalText, - comment = comment + comment = comment, + collation = collation ) } @@ -461,12 +464,19 @@ object ViewHelper extends SQLConfHelper with Logging { } /** - * Convert the view SQL configs to `properties`. + * Get all configurations that are modifiable and should be captured. 
*/ - private def sqlConfigsToProps(conf: SQLConf): Map[String, String] = { - val modifiedConfs = conf.getAllConfs.filter { case (k, _) => + def getModifiedConf(conf: SQLConf): Map[String, String] = { + conf.getAllConfs.filter { case (k, _) => conf.isModifiable(k) && shouldCaptureConfig(k) } + } + + /** + * Convert the view SQL configs to `properties`. + */ + private def sqlConfigsToProps(conf: SQLConf): Map[String, String] = { + val modifiedConfs = getModifiedConf(conf) // Some configs have dynamic default values, such as SESSION_LOCAL_TIMEZONE whose // default value relies on the JVM system timezone. We need to always capture them to // to make sure we apply the same configs when reading the view. @@ -687,7 +697,7 @@ object ViewHelper extends SQLConfHelper with Logging { /** * Collect all temporary SQL variables and return the identifiers separately. */ - private def collectTemporaryVariables(child: LogicalPlan): Seq[Seq[String]] = { + def collectTemporaryVariables(child: LogicalPlan): Seq[Seq[String]] = { def collectTempVars(child: LogicalPlan): Seq[Seq[String]] = { child.flatMap { plan => plan.expressions.flatMap(_.flatMap { @@ -729,7 +739,8 @@ object ViewHelper extends SQLConfHelper with Logging { val uncache = getRawTempView(name.table).map { r => needsToUncache(r, aliasedPlan) }.getOrElse(false) - val storeAnalyzedPlanForView = conf.storeAnalyzedPlanForView || originalText.isEmpty + val storeAnalyzedPlanForView = session.sessionState.conf.storeAnalyzedPlanForView || + originalText.isEmpty if (replace && uncache) { logDebug(s"Try to uncache ${name.quotedString} before replacing.") if (!storeAnalyzedPlanForView) { @@ -782,7 +793,6 @@ object ViewHelper extends SQLConfHelper with Logging { originalText: String, tempFunctions: Seq[String]): CatalogTable = { - val catalog = session.sessionState.catalog val tempViews = collectTemporaryViews(analyzedPlan) val tempVariables = collectTemporaryVariables(analyzedPlan) // TBLPROPERTIES is not allowed for temporary view, 
so we don't use it for @@ -797,6 +807,7 @@ object ViewHelper extends SQLConfHelper with Logging { storage = CatalogStorageFormat.empty, schema = viewSchema, viewText = Some(originalText), + createVersion = org.apache.spark.SPARK_VERSION, properties = newProperties) } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala index bc156cd82ed6a..58bbd91a8cc77 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala @@ -682,11 +682,10 @@ object DataSource extends Logging { throw e } } - case _ :: Nil if isUserDefinedDataSource => - // There was DSv1 or DSv2 loaded, but the same name source was found - // in user defined data source. - throw QueryCompilationErrors.foundMultipleDataSources(provider) case head :: Nil => + // We do not check whether the provider is a Python data source + // (isUserDefinedDataSource) to avoid the lookup cost. Java data sources + // always take precedence over Python user-defined data sources. head.getClass case sources => // There are multiple registered aliases for the input. 
If there is single datasource diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceManager.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceManager.scala index 93fc6cf367cfc..711e096ebd1f8 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceManager.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceManager.scala @@ -101,6 +101,7 @@ object DataSourceManager extends Logging { private def initialStaticDataSourceBuilders: Map[String, UserDefinedPythonDataSource] = { if (shouldLoadPythonDataSources) this.synchronized { + logInfo("Loading static Python Data Sources.") if (dataSourceBuilders.isEmpty) { val maybeResult = try { Some(UserDefinedPythonDataSource.lookupAllDataSourcesInPython()) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceResolver.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceResolver.scala new file mode 100644 index 0000000000000..3a2a3207a01f9 --- /dev/null +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceResolver.scala @@ -0,0 +1,67 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.execution.datasources + +import org.apache.spark.sql.SparkSession +import org.apache.spark.sql.catalyst.analysis.resolver.{ + ExplicitlyUnsupportedResolverFeature, + ResolverExtension +} +import org.apache.spark.sql.catalyst.catalog.UnresolvedCatalogRelation +import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan +import org.apache.spark.sql.catalyst.streaming.StreamingRelationV2 +import org.apache.spark.sql.execution.streaming.StreamingRelation + +/** + * The [[DataSourceResolver]] is a [[Resolver]] extension that resolves nodes defined in the + * [[datasources]] package. We have it as an extension to avoid cyclic dependencies between + * [[resolver]] and [[datasources]] packages. + */ +class DataSourceResolver(sparkSession: SparkSession) extends ResolverExtension { + private val findDataSourceTable = new FindDataSourceTable(sparkSession) + + /** + * Resolve [[UnresolvedCatalogRelation]]: + * - Reuse [[FindDataSourceTable]] code to resolve [[UnresolvedCatalogRelation]] + * - Create a new instance of [[LogicalRelation]] to regenerate the expression IDs + * - Explicitly disallow [[StreamingRelation]] and [[StreamingRelationV2]] for now + * - [[FileResolver]], which is a [[ResolverExtension]], introduces a new [[LogicalPlan]] node + * which resolution has to be handled here (further resolution of it doesn't need any specific + * resolution except adding it's attributes to the scope). 
+ */ + override def resolveOperator: PartialFunction[LogicalPlan, LogicalPlan] = { + case unresolvedCatalogRelation: UnresolvedCatalogRelation => + val result = findDataSourceTable.resolveUnresolvedCatalogRelation(unresolvedCatalogRelation) + result match { + case logicalRelation: LogicalRelation => + logicalRelation.newInstance() + case streamingRelation: StreamingRelation => + throw new ExplicitlyUnsupportedResolverFeature( + s"unsupported operator: ${streamingRelation.getClass.getName}" + ) + case streamingRelationV2: StreamingRelationV2 => + throw new ExplicitlyUnsupportedResolverFeature( + s"unsupported operator: ${streamingRelationV2.getClass.getName}" + ) + case other => + other + } + case logicalRelation: LogicalRelation => + logicalRelation.newInstance() + } +} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileResolver.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileResolver.scala new file mode 100644 index 0000000000000..44102da752c2e --- /dev/null +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileResolver.scala @@ -0,0 +1,55 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.execution.datasources + +import org.apache.spark.sql.SparkSession +import org.apache.spark.sql.catalyst.analysis.resolver.ResolverExtension +import org.apache.spark.sql.catalyst.plans.logical.{AnalysisHelper, LogicalPlan} + +/** + * The [[FileResolver]] is a [[MetadataResolver]] extension that resolves [[UnresolvedRelation]] + * which is created out of file. It reuses the code from [[ResolveSQLOnFile]] to resolve it + * properly. + * + * We have it as an extension to avoid cyclic dependencies between [[resolver]] and [[datasources]] + * packages. + */ +class FileResolver(sparkSession: SparkSession) extends ResolverExtension { + private val resolveSQLOnFile = new ResolveSQLOnFile(sparkSession) + + /** + * [[ResolveSQLOnFile]] code that is reused to resolve [[UnresolvedRelation]] has + * [[ExpressionEncoder.resolveAndBind]] on its path which introduces another call to + * the analyzer which is acceptable as it is called on the leaf node of the plan. That's why we + * have to allow invoking transforms in the single-pass analyzer. + */ + object UnresolvedRelationResolution { + def unapply(operator: LogicalPlan): Option[LogicalPlan] = + AnalysisHelper.allowInvokingTransformsInAnalyzer { + resolveSQLOnFile.UnresolvedRelationResolution.unapply(operator) + } + } + + /** + * Reuse [[ResolveSQLOnFile]] code to resolve [[UnresolvedRelation]] made out of file. 
+ */ + override def resolveOperator: PartialFunction[LogicalPlan, LogicalPlan] = { + case UnresolvedRelationResolution(resolvedRelation) => + resolvedRelation + } +} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileScanRDD.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileScanRDD.scala index 9bcdbadf7c5c0..e468807f4ffd1 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileScanRDD.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileScanRDD.scala @@ -21,6 +21,8 @@ import java.io.{Closeable, FileNotFoundException, IOException} import java.net.URI import org.apache.hadoop.fs.Path +import org.apache.hadoop.hdfs.BlockMissingException +import org.apache.hadoop.security.AccessControlException import org.apache.spark.{Partition => RDDPartition, TaskContext} import org.apache.spark.deploy.SparkHadoopUtil @@ -266,6 +268,7 @@ class FileScanRDD( null // Throw FileNotFoundException even if `ignoreCorruptFiles` is true case e: FileNotFoundException if !ignoreMissingFiles => throw e + case e @ (_ : AccessControlException | _ : BlockMissingException) => throw e case e @ (_: RuntimeException | _: IOException) if ignoreCorruptFiles => logWarning(log"Skipped the rest of the content in the corrupted file: " + log"${MDC(PATH, currentFile)}", e) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala index aed129c7dccc4..8a795f0748811 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala @@ -84,6 +84,9 @@ case class InsertIntoHadoopFsRelationCommand( outputColumnNames, 
sparkSession.sessionState.conf.caseSensitiveAnalysis) } + if (!conf.allowCollationsInMapKeys) { + SchemaUtils.checkNoCollationsInMapKeys(query.schema) + } val hadoopConf = sparkSession.sessionState.newHadoopConfWithOptions(options) val fs = outputPath.getFileSystem(hadoopConf) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PushVariantIntoScan.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PushVariantIntoScan.scala new file mode 100644 index 0000000000000..33ba4f772a13a --- /dev/null +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PushVariantIntoScan.scala @@ -0,0 +1,340 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.execution.datasources + +import scala.collection.mutable.HashMap + +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.catalyst.expressions._ +import org.apache.spark.sql.catalyst.expressions.variant._ +import org.apache.spark.sql.catalyst.planning.PhysicalOperation +import org.apache.spark.sql.catalyst.plans.logical.{Filter, LogicalPlan, Project, Subquery} +import org.apache.spark.sql.catalyst.rules.Rule +import org.apache.spark.sql.catalyst.util.ResolveDefaultColumns +import org.apache.spark.sql.errors.QueryExecutionErrors +import org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat +import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.types._ + +// A metadata class of a struct field. All struct fields in a struct must either all have this +// metadata, or all don't have it. +// We define a "variant struct" as: a special struct with its fields annotated with this metadata. +// It indicates that the struct should produce all requested fields of a variant type, and should be +// treated specially by the scan. +case class VariantMetadata( + // The `path` parameter of VariantGet. It has the same format as a JSON path, except that + // `[*]` is not supported. + path: String, + failOnError: Boolean, + timeZoneId: String) { + // Produce a metadata contain one key-value pair. The key is the special `METADATA_KEY`. + // The value contains three key-value pairs for `path`, `failOnError`, and `timeZoneId`. 
+ def toMetadata: Metadata = + new MetadataBuilder().putMetadata( + VariantMetadata.METADATA_KEY, + new MetadataBuilder() + .putString(VariantMetadata.PATH_KEY, path) + .putBoolean(VariantMetadata.FAIL_ON_ERROR_KEY, failOnError) + .putString(VariantMetadata.TIME_ZONE_ID_KEY, timeZoneId) + .build() + ).build() + + def parsedPath(): Array[VariantPathSegment] = { + VariantPathParser.parse(path).getOrElse { + val name = if (failOnError) "variant_get" else "try_variant_get" + throw QueryExecutionErrors.invalidVariantGetPath(path, name) + } + } +} + +object VariantMetadata { + val METADATA_KEY = "__VARIANT_METADATA_KEY" + val PATH_KEY = "path" + val FAIL_ON_ERROR_KEY = "failOnError" + val TIME_ZONE_ID_KEY = "timeZoneId" + + def isVariantStruct(s: StructType): Boolean = + s.fields.length > 0 && s.fields.forall(_.metadata.contains(METADATA_KEY)) + + def isVariantStruct(t: DataType): Boolean = t match { + case s: StructType => isVariantStruct(s) + case _ => false + } + + // Parse the `VariantMetadata` from a metadata produced by `toMetadata`. + def fromMetadata(metadata: Metadata): VariantMetadata = { + val value = metadata.getMetadata(METADATA_KEY) + VariantMetadata( + value.getString(PATH_KEY), + value.getBoolean(FAIL_ON_ERROR_KEY), + value.getString(TIME_ZONE_ID_KEY) + ) + } +} + +// Represent a requested field of a variant that the scan should produce. +// Each `RequestedVariantField` is corresponded to a variant path extraction in the plan. 
+case class RequestedVariantField(path: VariantMetadata, targetType: DataType) + +object RequestedVariantField { + def fullVariant: RequestedVariantField = + RequestedVariantField(VariantMetadata("$", failOnError = true, "UTC"), VariantType) + + def apply(v: VariantGet): RequestedVariantField = + RequestedVariantField( + VariantMetadata(v.path.eval().toString, v.failOnError, v.timeZoneId.get), v.dataType) + + def apply(c: Cast): RequestedVariantField = + RequestedVariantField( + VariantMetadata("$", c.evalMode != EvalMode.TRY, c.timeZoneId.get), c.dataType) +} + +// Extract a nested struct access path. Return the (root attribute id, a sequence of ordinals to +// access the field). For non-nested attribute access, the sequence is empty. +object StructPath { + def unapply(expr: Expression): Option[(ExprId, Seq[Int])] = expr match { + case GetStructField(StructPath(root, path), ordinal, _) => Some((root, path :+ ordinal)) + case a: Attribute => Some(a.exprId, Nil) + case _ => None + } +} + +// A collection of all eligible variants in a relation, which are in the root of the relation output +// schema, or only nested in struct types. +// The user should: +// 1. Call `addVariantFields` to add all eligible variants in a relation. +// 2. Call `collectRequestedFields` on all expressions depending on the relation. This process will +// add the requested fields of each variant and potentially remove non-eligible variants. See +// `collectRequestedFields` for details. +// 3. Call `rewriteType` to produce a new output schema for the relation. +// 4. Call `rewriteExpr` to rewrite the previously visited expressions by replacing variant +// extractions with struct accessed. +class VariantInRelation { + // First level key: root attribute id. + // Second level key: struct access paths to the variant type. + // Third level key: requested fields of a variant type. + // Final value: the ordinal of a requested field in the final struct of requested fields. 
+ val mapping = new HashMap[ExprId, HashMap[Seq[Int], HashMap[RequestedVariantField, Int]]] + + // Extract the SQL-struct path where the leaf is a variant. + object StructPathToVariant { + def unapply(expr: Expression): Option[HashMap[RequestedVariantField, Int]] = expr match { + case StructPath(attrId, path) => + mapping.get(attrId).flatMap(_.get(path)) + case _ => None + } + } + + // Find eligible variants recursively. `attrId` is the root attribute id. + // `path` is the current struct access path. `dataType` is the child data type after extracting + // `path` from the root attribute struct. + def addVariantFields( + attrId: ExprId, + dataType: DataType, + defaultValue: Any, + path: Seq[Int]): Unit = { + dataType match { + // TODO(SHREDDING): non-null default value is not yet supported. + case _: VariantType if defaultValue == null => + mapping.getOrElseUpdate(attrId, new HashMap).put(path, new HashMap) + case s: StructType if !VariantMetadata.isVariantStruct(s) => + val row = defaultValue.asInstanceOf[InternalRow] + for ((field, idx) <- s.fields.zipWithIndex) { + val fieldDefault = if (row == null || row.isNullAt(idx)) { + null + } else { + row.get(idx, field.dataType) + } + addVariantFields(attrId, field.dataType, fieldDefault, path :+ idx) + } + case _ => + } + } + + def rewriteType(attrId: ExprId, dataType: DataType, path: Seq[Int]): DataType = { + dataType match { + case _: VariantType => + mapping.get(attrId).flatMap(_.get(path)) match { + case Some(fields) => + var requestedFields = fields.toArray.sortBy(_._2).map { case (field, ordinal) => + StructField(ordinal.toString, field.targetType, metadata = field.path.toMetadata) + } + // Avoid producing an empty struct of requested fields. This is intended to simplify the + // scan implementation, which may not be able to handle empty struct type. This happens + // if the variant is not used, or only used in `IsNotNull/IsNull` expressions. 
The value + // of the placeholder field doesn't matter, even if the scan source accidentally + // contains such a field. + if (requestedFields.isEmpty) { + val placeholder = VariantMetadata("$.__placeholder_field__", + failOnError = false, timeZoneId = "UTC") + requestedFields = Array(StructField("0", BooleanType, + metadata = placeholder.toMetadata)) + } + StructType(requestedFields) + case _ => dataType + } + case s: StructType if !VariantMetadata.isVariantStruct(s) => + val newFields = s.fields.zipWithIndex.map { case (field, idx) => + field.copy(dataType = rewriteType(attrId, field.dataType, path :+ idx)) + } + StructType(newFields) + case _ => dataType + } + } + + // Add a requested field to a variant column. + private def addField( + map: HashMap[RequestedVariantField, Int], + field: RequestedVariantField): Unit = { + val idx = map.size + map.getOrElseUpdate(field, idx) + } + + // Update `mapping` with any access to a variant. Add the requested fields of each variant and + // potentially remove non-eligible variants. + // If a struct containing a variant is directly used, this variant is not eligible for push down. + // This is because we need to replace the variant type with a struct producing all requested + // fields, which also changes the struct type containing it, and it is difficult to reconstruct + // the original struct value. This is not a big loss, because we need the full variant anyway. 
+ def collectRequestedFields(expr: Expression): Unit = expr match { + case v@VariantGet(StructPathToVariant(fields), _, _, _, _) => + addField(fields, RequestedVariantField(v)) + case c@Cast(StructPathToVariant(fields), _, _, _) => addField(fields, RequestedVariantField(c)) + case IsNotNull(StructPath(_, _)) | IsNull(StructPath(_, _)) => + case StructPath(attrId, path) => + mapping.get(attrId) match { + case Some(variants) => + variants.get(path) match { + case Some(fields) => + addField(fields, RequestedVariantField.fullVariant) + case _ => + // Remove non-eligible variants. + variants.filterInPlace { case (key, _) => !key.startsWith(path) } + } + case _ => + } + case _ => expr.children.foreach(collectRequestedFields) + } + + def rewriteExpr( + expr: Expression, + attributeMap: Map[ExprId, AttributeReference]): Expression = { + def rewriteAttribute(expr: Expression): Expression = expr.transformDown { + case a: Attribute => attributeMap.getOrElse(a.exprId, a) + } + + // Rewrite patterns should be consistent with visit patterns in `collectRequestedFields`. + expr.transformDown { + case g@VariantGet(v@StructPathToVariant(fields), _, _, _, _) => + // Rewrite the attribute in advance, rather than depending on the last branch to rewrite it. + // Ww need to avoid the `v@StructPathToVariant(fields)` branch to rewrite the child again. 
+ GetStructField(rewriteAttribute(v), fields(RequestedVariantField(g))) + case c@Cast(v@StructPathToVariant(fields), _, _, _) => + GetStructField(rewriteAttribute(v), fields(RequestedVariantField(c))) + case i@IsNotNull(StructPath(_, _)) => rewriteAttribute(i) + case i@IsNull(StructPath(_, _)) => rewriteAttribute(i) + case v@StructPathToVariant(fields) => + GetStructField(rewriteAttribute(v), fields(RequestedVariantField.fullVariant)) + case a: Attribute => attributeMap.getOrElse(a.exprId, a) + } + } +} + +// Push variant into scan by rewriting the variant type with a struct type producing all requested +// fields and rewriting the variant extraction expressions by struct accesses. +// For example, for an input plan: +// - Project [v:a::int, v:b::string, v] +// - Filter [v:a::int = 1] +// - Relation [v: variant] +// Rewrite it as: +// - Project [v.0, v.1, v.2] +// - Filter [v.0 = 1] +// - Relation [v: struct<0: int, 1: string, 2: variant>] +// The struct fields are annotated with `VariantMetadata` to indicate the extraction path. +object PushVariantIntoScan extends Rule[LogicalPlan] { + override def apply(plan: LogicalPlan): LogicalPlan = plan match { + // A correlated subquery will be rewritten into join later, and will go through this rule + // eventually. 
+ case s: Subquery if s.correlated => plan + case _ if !SQLConf.get.getConf(SQLConf.PUSH_VARIANT_INTO_SCAN) => plan + case _ => plan.transformDown { + case p@PhysicalOperation(projectList, filters, + relation @ LogicalRelationWithTable( + hadoopFsRelation@HadoopFsRelation(_, _, _, _, _: ParquetFileFormat, _), _)) => + rewritePlan(p, projectList, filters, relation, hadoopFsRelation) + } + } + + private def rewritePlan( + originalPlan: LogicalPlan, + projectList: Seq[NamedExpression], + filters: Seq[Expression], + relation: LogicalRelation, + hadoopFsRelation: HadoopFsRelation): LogicalPlan = { + val variants = new VariantInRelation + val defaultValues = ResolveDefaultColumns.existenceDefaultValues(hadoopFsRelation.schema) + // I'm not aware of any case that an attribute `relation.output` can have a different data type + // than the corresponding field in `hadoopFsRelation.schema`. Other code seems to prefer using + // the data type in `hadoopFsRelation.schema`, let's also stick to it. + val schemaWithAttributes = hadoopFsRelation.schema.fields.zip(relation.output) + for (((f, attr), defaultValue) <- schemaWithAttributes.zip(defaultValues)) { + variants.addVariantFields(attr.exprId, f.dataType, defaultValue, Nil) + } + if (variants.mapping.isEmpty) return originalPlan + + projectList.foreach(variants.collectRequestedFields) + filters.foreach(variants.collectRequestedFields) + // `collectRequestedFields` may have removed all variant columns. 
+ if (variants.mapping.forall(_._2.isEmpty)) return originalPlan + + val (newFields, newOutput) = schemaWithAttributes.map { + case (f, attr) => + if (variants.mapping.get(attr.exprId).exists(_.nonEmpty)) { + val newType = variants.rewriteType(attr.exprId, f.dataType, Nil) + val newAttr = AttributeReference(f.name, newType, f.nullable, f.metadata)() + (f.copy(dataType = newType), newAttr) + } else { + (f, attr) + } + }.unzip + + val newHadoopFsRelation = hadoopFsRelation.copy(dataSchema = StructType(newFields))( + hadoopFsRelation.sparkSession) + val newRelation = relation.copy(relation = newHadoopFsRelation, output = newOutput.toIndexedSeq) + + val attributeMap = relation.output.zip(newOutput).map { + case (oldAttr, newAttr) => oldAttr.exprId -> newAttr + }.toMap + val withFilter = if (filters.nonEmpty) { + Filter(filters.map(variants.rewriteExpr(_, attributeMap)).reduce(And), newRelation) + } else { + newRelation + } + val newProjectList = projectList.map { e => + val rewritten = variants.rewriteExpr(e, attributeMap) + rewritten match { + case n: NamedExpression => n + // This is when the variant column is directly selected. We replace the attribute reference + // with a struct access, which is not a `NamedExpression` that `Project` requires. We wrap + // it with an `Alias`. 
+ case _ => Alias(rewritten, e.name)(e.exprId, e.qualifier) + } + } + Project(newProjectList, withFilter) + } +} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVFileFormat.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVFileFormat.scala index 8ef85ee91aa8f..b2b99e2d0f4ea 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVFileFormat.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVFileFormat.scala @@ -86,7 +86,7 @@ class CSVFileFormat extends TextBasedFileFormat with DataSourceRegister { } override def getFileExtension(context: TaskAttemptContext): String = { - ".csv" + CodecStreams.getCompressionExtension(context) + "." + csvOptions.extension + CodecStreams.getCompressionExtension(context) } } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/ddl.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/ddl.scala index d9367d92d462e..eb9d5813cff7b 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/ddl.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/ddl.scala @@ -23,7 +23,7 @@ import org.apache.spark.sql._ import org.apache.spark.sql.catalyst.TableIdentifier import org.apache.spark.sql.catalyst.catalog.CatalogTable import org.apache.spark.sql.catalyst.expressions.Attribute -import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan +import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, V1CreateTablePlan} import org.apache.spark.sql.catalyst.util.CaseInsensitiveMap import org.apache.spark.sql.errors.QueryCompilationErrors import org.apache.spark.sql.execution.command.{DDLUtils, LeafRunnableCommand} @@ -43,7 +43,7 @@ import org.apache.spark.sql.types._ case class CreateTable( tableDesc: CatalogTable, mode: SaveMode, - query: Option[LogicalPlan]) extends LogicalPlan { + query: Option[LogicalPlan]) 
extends LogicalPlan with V1CreateTablePlan { assert(tableDesc.provider.isDefined, "The table to be created must have a provider.") if (query.isEmpty) { diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetColumn.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetColumn.scala index 6ac96300ccd65..4bc1194d9370d 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetColumn.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetColumn.scala @@ -35,7 +35,10 @@ case class ParquetColumn( definitionLevel: Int, required: Boolean, path: Seq[String], - children: Seq[ParquetColumn]) { + children: Seq[ParquetColumn], + // When `variantFileType` has value, the parquet column should produce a Spark variant type, and + // `variantFileType` describes the file schema of the Parquet variant column. + variantFileType: Option[ParquetColumn] = None) { def isPrimitive: Boolean = descriptor.nonEmpty } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetReadSupport.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetReadSupport.scala index 8dde02a4673f0..af0bf0d51f077 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetReadSupport.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetReadSupport.scala @@ -35,6 +35,7 @@ import org.apache.spark.internal.Logging import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.util.RebaseDateTime.RebaseSpec import org.apache.spark.sql.errors.QueryExecutionErrors +import org.apache.spark.sql.execution.datasources.VariantMetadata import org.apache.spark.sql.internal.{LegacyBehaviorPolicy, SQLConf} import org.apache.spark.sql.types._ @@ -221,6 +222,9 @@ object ParquetReadSupport extends Logging { 
clipParquetMapType( parquetType.asGroupType(), t.keyType, t.valueType, caseSensitive, useFieldId) + case t: StructType if VariantMetadata.isVariantStruct(t) => + clipVariantSchema(parquetType.asGroupType(), t) + case t: StructType => clipParquetGroup(parquetType.asGroupType(), t, caseSensitive, useFieldId) @@ -390,6 +394,11 @@ object ParquetReadSupport extends Logging { .named(parquetRecord.getName) } + private def clipVariantSchema(parquetType: GroupType, variantStruct: StructType): GroupType = { + // TODO(SHREDDING): clip `parquetType` to retain the necessary columns. + parquetType + } + /** * Clips a Parquet [[GroupType]] which corresponds to a Catalyst [[StructType]]. * diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetRowConverter.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetRowConverter.scala index 838eb30c38fb1..550c2af43a706 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetRowConverter.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetRowConverter.scala @@ -40,7 +40,7 @@ import org.apache.spark.sql.catalyst.util.RebaseDateTime.RebaseSpec import org.apache.spark.sql.catalyst.util.ResolveDefaultColumns._ import org.apache.spark.sql.errors.QueryCompilationErrors import org.apache.spark.sql.errors.QueryExecutionErrors -import org.apache.spark.sql.execution.datasources.DataSourceUtils +import org.apache.spark.sql.execution.datasources.{DataSourceUtils, VariantMetadata} import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types._ import org.apache.spark.unsafe.types.{UTF8String, VariantVal} @@ -498,6 +498,9 @@ private[parquet] class ParquetRowConverter( case t: MapType => new ParquetMapConverter(parquetType.asGroupType(), t, updater) + case t: StructType if VariantMetadata.isVariantStruct(t) => + new ParquetVariantConverter(t, parquetType.asGroupType(), updater) + 
case t: StructType => val wrappedUpdater = { // SPARK-30338: avoid unnecessary InternalRow copying for nested structs: @@ -535,7 +538,11 @@ private[parquet] class ParquetRowConverter( wrappedUpdater) case t: VariantType => - new ParquetVariantConverter(parquetType.asGroupType(), updater) + if (SQLConf.get.getConf(SQLConf.VARIANT_ALLOW_READING_SHREDDED)) { + new ParquetVariantConverter(t, parquetType.asGroupType(), updater) + } else { + new ParquetUnshreddedVariantConverter(parquetType.asGroupType(), updater) + } case t => throw QueryExecutionErrors.cannotCreateParquetConverterForDataTypeError( @@ -845,8 +852,8 @@ private[parquet] class ParquetRowConverter( } } - /** Parquet converter for Variant */ - private final class ParquetVariantConverter( + /** Parquet converter for unshredded Variant */ + private final class ParquetUnshreddedVariantConverter( parquetType: GroupType, updater: ParentContainerUpdater) extends ParquetGroupConverter(updater) { @@ -898,6 +905,47 @@ private[parquet] class ParquetRowConverter( } } + /** Parquet converter for Variant (shredded or unshredded) */ + private final class ParquetVariantConverter( + targetType: DataType, parquetType: GroupType, updater: ParentContainerUpdater) + extends ParquetGroupConverter(updater) { + + private[this] var currentRow: Any = _ + private[this] val parquetSparkType = SparkShreddingUtils.parquetTypeToSparkType(parquetType) + private[this] val variantSchema = SparkShreddingUtils.buildVariantSchema(parquetSparkType) + private[this] val fieldsToExtract = + SparkShreddingUtils.getFieldsToExtract(targetType, variantSchema) + // A struct converter that reads the underlying file data. 
+ private[this] val fileConverter = new ParquetRowConverter( + schemaConverter, + parquetType, + parquetSparkType.asInstanceOf[StructType], + convertTz, + datetimeRebaseSpec, + int96RebaseSpec, + new ParentContainerUpdater { + override def set(value: Any): Unit = currentRow = value + }) + + override def getConverter(fieldIndex: Int): Converter = fileConverter.getConverter(fieldIndex) + + override def end(): Unit = { + fileConverter.end() + val row = currentRow.asInstanceOf[InternalRow] + val v = if (fieldsToExtract == null) { + SparkShreddingUtils.assembleVariant(row, variantSchema) + } else { + SparkShreddingUtils.assembleVariantStruct(row, variantSchema, fieldsToExtract) + } + updater.set(v) + } + + override def start(): Unit = { + fileConverter.start() + currentRow = null + } + } + private trait RepeatedConverter { private[this] val currentArray = ArrayBuffer.empty[Any] diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetSchemaConverter.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetSchemaConverter.scala index 350d42c8efd76..daeb8e88a924b 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetSchemaConverter.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetSchemaConverter.scala @@ -28,6 +28,7 @@ import org.apache.parquet.schema.Type.Repetition._ import org.apache.spark.sql.AnalysisException import org.apache.spark.sql.errors.QueryCompilationErrors +import org.apache.spark.sql.execution.datasources.VariantMetadata import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types._ @@ -179,7 +180,15 @@ class ParquetToSparkSchemaConverter( field match { case primitiveColumn: PrimitiveColumnIO => convertPrimitiveField(primitiveColumn, targetType) case groupColumn: GroupColumnIO if targetType.contains(VariantType) => - convertVariantField(groupColumn) + if 
(SQLConf.get.getConf(SQLConf.VARIANT_ALLOW_READING_SHREDDED)) { + val col = convertGroupField(groupColumn) + col.copy(sparkType = VariantType, variantFileType = Some(col)) + } else { + convertVariantField(groupColumn) + } + case groupColumn: GroupColumnIO if targetType.exists(VariantMetadata.isVariantStruct) => + val col = convertGroupField(groupColumn) + col.copy(sparkType = targetType.get, variantFileType = Some(col)) case groupColumn: GroupColumnIO => convertGroupField(groupColumn, targetType) } } @@ -747,6 +756,14 @@ class SparkToParquetSchemaConverter( .addField(convertField(StructField("metadata", BinaryType, nullable = false))) .named(field.name) + case s: StructType if SparkShreddingUtils.isVariantShreddingStruct(s) => + // Variant struct takes a Variant and writes to Parquet as a shredded schema. + val group = Types.buildGroup(repetition) + s.fields.foreach { f => + group.addField(convertField(f)) + } + group.named(field.name) + case StructType(fields) => fields.foldLeft(Types.buildGroup(repetition)) { (builder, field) => builder.addField(convertField(field)) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetUtils.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetUtils.scala index 3e111252bc6fe..663182d8d1820 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetUtils.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetUtils.scala @@ -45,7 +45,7 @@ import org.apache.spark.sql.execution.datasources.{AggregatePushDownUtils, Outpu import org.apache.spark.sql.execution.datasources.v2.V2ColumnUtils import org.apache.spark.sql.internal.{LegacyBehaviorPolicy, SQLConf} import org.apache.spark.sql.internal.SQLConf.PARQUET_AGGREGATE_PUSHDOWN_ENABLED -import org.apache.spark.sql.types.{ArrayType, AtomicType, DataType, MapType, StructField, StructType, UserDefinedType} +import 
org.apache.spark.sql.types.{ArrayType, AtomicType, DataType, MapType, StructField, StructType, UserDefinedType, VariantType} import org.apache.spark.util.ArrayImplicits._ object ParquetUtils extends Logging { @@ -420,6 +420,22 @@ object ParquetUtils extends Logging { statistics.getNumNulls; } + // Replaces each VariantType in the schema with the corresponding type in the shredding schema. + // Used for testing, where we force a single shredding schema for all Variant fields. + // Does not touch Variant fields nested in arrays, maps, or UDTs. + private def replaceVariantTypes(schema: StructType, shreddingSchema: StructType): StructType = { + val newFields = schema.fields.zip(shreddingSchema.fields).map { + case (field, shreddingField) => + field.dataType match { + case s: StructType => + field.copy(dataType = replaceVariantTypes(s, shreddingSchema)) + case VariantType => field.copy(dataType = shreddingSchema) + case _ => field + } + } + StructType(newFields) + } + def prepareWrite( sqlConf: SQLConf, job: Job, @@ -454,8 +470,23 @@ object ParquetUtils extends Logging { ParquetOutputFormat.setWriteSupportClass(job, classOf[ParquetWriteSupport]) + val shreddingSchema = if (sqlConf.getConf(SQLConf.VARIANT_WRITE_SHREDDING_ENABLED) && + !sqlConf.getConf(SQLConf.VARIANT_FORCE_SHREDDING_SCHEMA_FOR_TEST).isEmpty) { + // Convert the schema to a shredding schema, and replace it anywhere that there is a + // VariantType in the original schema. + val simpleShreddingSchema = DataType.fromDDL( + sqlConf.getConf(SQLConf.VARIANT_FORCE_SHREDDING_SCHEMA_FOR_TEST) + ) + val oneShreddingSchema = SparkShreddingUtils.variantShreddingSchema(simpleShreddingSchema) + val schemaWithMetadata = SparkShreddingUtils.addWriteShreddingMetadata(oneShreddingSchema) + Some(replaceVariantTypes(dataSchema, schemaWithMetadata)) + } else { + None + } + // This metadata is useful for keeping UDTs like Vector/Matrix. 
ParquetWriteSupport.setSchema(dataSchema, conf) + shreddingSchema.foreach(ParquetWriteSupport.setShreddingSchema(_, conf)) // Sets flags for `ParquetWriteSupport`, which converts Catalyst schema to Parquet // schema and writes actual rows to Parquet files. diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetWriteSupport.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetWriteSupport.scala index 89a1cd5d4375a..35eb57a2e4fb2 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetWriteSupport.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetWriteSupport.scala @@ -38,6 +38,7 @@ import org.apache.spark.sql.catalyst.util.DateTimeUtils import org.apache.spark.sql.execution.datasources.DataSourceUtils import org.apache.spark.sql.internal.{LegacyBehaviorPolicy, SQLConf} import org.apache.spark.sql.types._ +import org.apache.spark.types.variant.Variant /** * A Parquet [[WriteSupport]] implementation that writes Catalyst [[InternalRow]]s as Parquet @@ -59,6 +60,10 @@ class ParquetWriteSupport extends WriteSupport[InternalRow] with Logging { // Schema of the `InternalRow`s to be written private var schema: StructType = _ + // Schema of the `InternalRow`s to be written, with VariantType replaced with its shredding + // schema, if appropriate. 
+ private var shreddedSchema: StructType = _ + // `ValueWriter`s for all fields of the schema private var rootFieldWriters: Array[ValueWriter] = _ @@ -95,7 +100,16 @@ class ParquetWriteSupport extends WriteSupport[InternalRow] with Logging { override def init(configuration: Configuration): WriteContext = { val schemaString = configuration.get(ParquetWriteSupport.SPARK_ROW_SCHEMA) + val shreddedSchemaString = configuration.get(ParquetWriteSupport.SPARK_VARIANT_SHREDDING_SCHEMA) this.schema = StructType.fromString(schemaString) + // If shreddingSchemaString is provided, we use that everywhere in the writer, except for + // setting the spark schema in the Parquet metadata. If it isn't provided, it means that there + // are no shredded Variant columns, so it is identical to this.schema. + this.shreddedSchema = if (shreddedSchemaString == null) { + this.schema + } else { + StructType.fromString(shreddedSchemaString) + } this.writeLegacyParquetFormat = { // `SQLConf.PARQUET_WRITE_LEGACY_FORMAT` should always be explicitly set in ParquetRelation assert(configuration.get(SQLConf.PARQUET_WRITE_LEGACY_FORMAT.key) != null) @@ -108,9 +122,9 @@ class ParquetWriteSupport extends WriteSupport[InternalRow] with Logging { SQLConf.ParquetOutputTimestampType.withName(configuration.get(key)) } - this.rootFieldWriters = schema.map(_.dataType).map(makeWriter).toArray[ValueWriter] + this.rootFieldWriters = shreddedSchema.map(_.dataType).map(makeWriter).toArray[ValueWriter] - val messageType = new SparkToParquetSchemaConverter(configuration).convert(schema) + val messageType = new SparkToParquetSchemaConverter(configuration).convert(shreddedSchema) val metadata = Map( SPARK_VERSION_METADATA_KEY -> SPARK_VERSION_SHORT, ParquetReadSupport.SPARK_METADATA_KEY -> schemaString @@ -132,13 +146,23 @@ class ParquetWriteSupport extends WriteSupport[InternalRow] with Logging { } } - logDebug( - s"""Initialized Parquet WriteSupport with Catalyst schema: - |${schema.prettyJson} - |and corresponding 
Parquet message type: - |$messageType - """.stripMargin) - + if (shreddedSchemaString == null) { + logDebug( + s"""Initialized Parquet WriteSupport with Catalyst schema: + |${schema.prettyJson} + |and corresponding Parquet message type: + |$messageType + """.stripMargin) + } else { + logDebug( + s"""Initialized Parquet WriteSupport with Catalyst schema: + |${schema.prettyJson} + |and shredding schema: + |${shreddedSchema.prettyJson} + |and corresponding Parquet message type: + |$messageType + """.stripMargin) + } new WriteContext(messageType, metadata.asJava) } @@ -148,7 +172,7 @@ class ParquetWriteSupport extends WriteSupport[InternalRow] with Logging { override def write(row: InternalRow): Unit = { consumeMessage { - writeFields(row, schema, rootFieldWriters) + writeFields(row, shreddedSchema, rootFieldWriters) } } @@ -250,6 +274,17 @@ class ParquetWriteSupport extends WriteSupport[InternalRow] with Logging { } } + case s: StructType if SparkShreddingUtils.isVariantShreddingStruct(s) => + val fieldWriters = s.map(_.dataType).map(makeWriter).toArray[ValueWriter] + val variantShreddingSchema = SparkShreddingUtils.buildVariantSchema(s) + (row: SpecializedGetters, ordinal: Int) => + val v = row.getVariant(ordinal) + val variant = new Variant(v.getValue, v.getMetadata) + val shreddedValues = SparkShreddingUtils.castShredded(variant, variantShreddingSchema) + consumeGroup { + writeFields(shreddedValues, s, fieldWriters) + } + case t: StructType => val fieldWriters = t.map(_.dataType).map(makeWriter).toArray[ValueWriter] (row: SpecializedGetters, ordinal: Int) => @@ -499,6 +534,10 @@ class ParquetWriteSupport extends WriteSupport[InternalRow] with Logging { object ParquetWriteSupport { val SPARK_ROW_SCHEMA: String = "org.apache.spark.sql.parquet.row.attributes" + // A version of `SPARK_ROW_SCHEMA`, where one or more Variant attributes have been replace with a + // shredded struct schema. 
+ val SPARK_VARIANT_SHREDDING_SCHEMA: String = + "org.apache.spark.sql.parquet.variant.shredding.attributes" def setSchema(schema: StructType, configuration: Configuration): Unit = { configuration.set(SPARK_ROW_SCHEMA, schema.json) @@ -506,4 +545,8 @@ object ParquetWriteSupport { ParquetOutputFormat.WRITER_VERSION, ParquetProperties.WriterVersion.PARQUET_1_0.toString) } + + def setShreddingSchema(shreddingSchema: StructType, configuration: Configuration): Unit = { + configuration.set(SPARK_VARIANT_SHREDDING_SCHEMA, shreddingSchema.json) + } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/SparkShreddingUtils.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/SparkShreddingUtils.scala index 2b81668b88b87..ffb6704061e66 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/SparkShreddingUtils.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/SparkShreddingUtils.scala @@ -17,19 +17,438 @@ package org.apache.spark.sql.execution.datasources.parquet +import org.apache.parquet.io.ColumnIOFactory +import org.apache.parquet.schema.{Type => ParquetType, Types => ParquetTypes} + import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions._ -import org.apache.spark.sql.catalyst.util.GenericArrayData -import org.apache.spark.sql.errors.QueryCompilationErrors +import org.apache.spark.sql.catalyst.expressions.codegen._ +import org.apache.spark.sql.catalyst.expressions.codegen.Block._ +import org.apache.spark.sql.catalyst.expressions.variant._ +import org.apache.spark.sql.catalyst.util.{ArrayBasedMapData, ArrayData, DateTimeUtils, GenericArrayData} +import org.apache.spark.sql.errors.{QueryCompilationErrors, QueryExecutionErrors} +import org.apache.spark.sql.execution.RowToColumnConverter +import org.apache.spark.sql.execution.datasources.VariantMetadata +import 
org.apache.spark.sql.execution.vectorized.WritableColumnVector import org.apache.spark.sql.types._ import org.apache.spark.types.variant._ +import org.apache.spark.types.variant.VariantUtil.Type import org.apache.spark.unsafe.types._ +case class SparkShreddedRow(row: SpecializedGetters) extends ShreddingUtils.ShreddedRow { + override def isNullAt(ordinal: Int): Boolean = row.isNullAt(ordinal) + override def getBoolean(ordinal: Int): Boolean = row.getBoolean(ordinal) + override def getByte(ordinal: Int): Byte = row.getByte(ordinal) + override def getShort(ordinal: Int): Short = row.getShort(ordinal) + override def getInt(ordinal: Int): Int = row.getInt(ordinal) + override def getLong(ordinal: Int): Long = row.getLong(ordinal) + override def getFloat(ordinal: Int): Float = row.getFloat(ordinal) + override def getDouble(ordinal: Int): Double = row.getDouble(ordinal) + override def getDecimal(ordinal: Int, precision: Int, scale: Int): java.math.BigDecimal = + row.getDecimal(ordinal, precision, scale).toJavaBigDecimal + override def getString(ordinal: Int): String = row.getUTF8String(ordinal).toString + override def getBinary(ordinal: Int): Array[Byte] = row.getBinary(ordinal) + override def getStruct(ordinal: Int, numFields: Int): SparkShreddedRow = + SparkShreddedRow(row.getStruct(ordinal, numFields)) + override def getArray(ordinal: Int): SparkShreddedRow = + SparkShreddedRow(row.getArray(ordinal)) + override def numElements(): Int = row.asInstanceOf[ArrayData].numElements() +} + +// The search result of a `VariantPathSegment` in a `VariantSchema`. +case class SchemaPathSegment( + rawPath: VariantPathSegment, + // Whether this path segment is an object or array extraction. + isObject: Boolean, + // `schema.typedIdx`, if the path exists in the schema (for object extraction, the schema + // should contain an object `typed_value` containing the requested field; similar for array + // extraction). Negative otherwise. 
+ typedIdx: Int, + // For object extraction, it is the index of the desired field in `schema.objectSchema`. If the + // requested field doesn't exist, both `extractionIdx/typedIdx` are set to negative. + // For array extraction, it is the array index. The information is already stored in `rawPath`, + // but accessing a raw int should be more efficient than `rawPath`, which is an `Either`. + extractionIdx: Int) + +// Represent a single field in a variant struct (see `VariantMetadata` for definition), that is, a +// single requested field that the scan should produce by extracting from the variant column. +case class FieldToExtract(path: Array[SchemaPathSegment], reader: ParquetVariantReader) + +// A helper class to cast from scalar `typed_value` into a scalar `dataType`. Need a custom +// expression because it has different error reporting code than `Cast`. +case class ScalarCastHelper( + child: Expression, + dataType: DataType, + castArgs: VariantCastArgs) extends UnaryExpression { + // The expression is only for the internal use of `ScalarReader`, which can guarantee the child + // is not nullable. + assert(!child.nullable) + + // If `cast` is null, it means the cast always fails because the type combination is not allowed. + private val cast = if (Cast.canAnsiCast(child.dataType, dataType)) { + Cast(child, dataType, castArgs.zoneStr, EvalMode.TRY) + } else { + null + } + // Cast the input to string. Only used for reporting an invalid cast. + private val castToString = Cast(child, StringType, castArgs.zoneStr, EvalMode.ANSI) + + override def nullable: Boolean = !castArgs.failOnError + override def withNewChildInternal(newChild: Expression): UnaryExpression = copy(child = newChild) + + // No need to define the interpreted version of `eval`: the codegen must succeed. + override protected def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { + // Throw an error or do nothing, depending on `castArgs.failOnError`. 
+ val invalidCastCode = if (castArgs.failOnError) { + val castToStringCode = castToString.genCode(ctx) + val typeObj = ctx.addReferenceObj("dataType", dataType) + val cls = classOf[ScalarCastHelper].getName + s""" + ${castToStringCode.code} + $cls.throwInvalidVariantCast(${castToStringCode.value}, $typeObj); + """ + } else { + "" + } + val customCast = (child.dataType, dataType) match { + case (_: LongType, _: TimestampType) => "castLongToTimestamp" + case (_: DecimalType, _: TimestampType) => "castDecimalToTimestamp" + case (_: DecimalType, _: StringType) => "castDecimalToString" + case _ => null + } + if (customCast != null) { + val childCode = child.genCode(ctx) + // We can avoid the try-catch block for decimal -> string, but the performance benefit is + // little. We can also be more specific in the exception type, like catching + // `ArithmeticException` instead of `Exception`, but it is unnecessary. The `try_cast` codegen + // also catches `Exception` instead of specific exceptions. + val code = code""" + ${childCode.code} + boolean ${ev.isNull} = false; + ${CodeGenerator.javaType(dataType)} ${ev.value} = ${CodeGenerator.defaultValue(dataType)}; + try { + ${ev.value} = ${classOf[VariantGet].getName}.$customCast(${childCode.value}); + } catch (Exception e) { + ${ev.isNull} = true; + $invalidCastCode + } + """ + ev.copy(code = code) + } else if (cast != null) { + val castCode = cast.genCode(ctx) + val code = code""" + ${castCode.code} + boolean ${ev.isNull} = ${castCode.isNull}; + ${CodeGenerator.javaType(dataType)} ${ev.value} = ${castCode.value}; + if (${ev.isNull}) { $invalidCastCode } + """ + ev.copy(code = code) + } else { + val code = code""" + boolean ${ev.isNull} = true; + ${CodeGenerator.javaType(dataType)} ${ev.value} = ${CodeGenerator.defaultValue(dataType)}; + if (${ev.isNull}) { $invalidCastCode } + """ + ev.copy(code = code) + } + } +} + +object ScalarCastHelper { + // A helper function for codegen. 
The java compiler doesn't allow throwing a `Throwable` in a + // method without `throws` annotation. + def throwInvalidVariantCast(value: UTF8String, dataType: DataType): Any = + throw QueryExecutionErrors.invalidVariantCast(value.toString, dataType) +} + +// The base class to read Parquet variant values into a Spark type. +// For convenience, we also allow creating an instance of the base class itself. None of its +// functions can be used, but it can serve as a container of `targetType` and `castArgs`. +class ParquetVariantReader( + val schema: VariantSchema, val targetType: DataType, val castArgs: VariantCastArgs) { + // Read from a row containing a Parquet variant value (shredded or unshredded) and return a value + // of `targetType`. The row schema is described by `schema`. + // This function throws MALFORMED_VARIANT if the variant is missing. If the variant can be + // legally missing (the only possible situation is struct fields in object `typed_value`), the + // caller should check for it and avoid calling this function if the variant is missing. + def read(row: InternalRow, topLevelMetadata: Array[Byte]): Any = { + if (schema.typedIdx < 0 || row.isNullAt(schema.typedIdx)) { + if (schema.variantIdx < 0 || row.isNullAt(schema.variantIdx)) { + // Both `typed_value` and `value` are null, meaning the variant is missing. + throw QueryExecutionErrors.malformedVariant() + } + val v = new Variant(row.getBinary(schema.variantIdx), topLevelMetadata) + VariantGet.cast(v, targetType, castArgs) + } else { + readFromTyped(row, topLevelMetadata) + } + } + + // Subclasses should override it to produce the read result when `typed_value` is not null. + protected def readFromTyped(row: InternalRow, topLevelMetadata: Array[Byte]): Any = + throw QueryExecutionErrors.unreachableError() + + // A util function to rebuild the variant in binary format from a Parquet variant value. 
+ protected final def rebuildVariant(row: InternalRow, topLevelMetadata: Array[Byte]): Variant = { + val builder = new VariantBuilder(false) + ShreddingUtils.rebuild(SparkShreddedRow(row), topLevelMetadata, schema, builder) + builder.result() + } + + // A util function to throw error or return null when an invalid cast happens. + protected final def invalidCast(row: InternalRow, topLevelMetadata: Array[Byte]): Any = { + if (castArgs.failOnError) { + throw QueryExecutionErrors.invalidVariantCast( + rebuildVariant(row, topLevelMetadata).toJson(castArgs.zoneId), targetType) + } else { + null + } + } +} + +object ParquetVariantReader { + // Create a reader for `targetType`. If `schema` is null, meaning that the extraction path doesn't + // exist in `typed_value`, it returns an instance of `ParquetVariantReader`. As described in the + // class comment, the reader is only a container of `targetType` and `castArgs` in this case. + def apply(schema: VariantSchema, targetType: DataType, castArgs: VariantCastArgs, + isTopLevelUnshredded: Boolean = false): ParquetVariantReader = targetType match { + case _ if schema == null => new ParquetVariantReader(schema, targetType, castArgs) + case s: StructType => new StructReader(schema, s, castArgs) + case a: ArrayType => new ArrayReader(schema, a, castArgs) + case m@MapType(_: StringType, _, _) => new MapReader(schema, m, castArgs) + case v: VariantType => new VariantReader(schema, v, castArgs, isTopLevelUnshredded) + case s: AtomicType => new ScalarReader(schema, s, castArgs) + case _ => + // Type check should have rejected map with non-string type. + throw QueryExecutionErrors.unreachableError(s"Invalid target type: `${targetType.sql}`") + } +} + +// Read Parquet variant values into a Spark struct type. It reads unshredded fields (fields that are +// not in the typed object) from the `value`, and reads the shredded fields from the object +// `typed_value`. 
+// `value` must not contain any shredded field according to the shredding spec, but this requirement +// is not enforced. If `value` does contain a shredded field, no error will occur, and the field in +// object `typed_value` will be the final result. +private[this] final class StructReader( + schema: VariantSchema, targetType: StructType, castArgs: VariantCastArgs) + extends ParquetVariantReader(schema, targetType, castArgs) { + // For each field in `targetType`, store the index of the field with the same name in object + // `typed_value`, or -1 if it doesn't exist in object `typed_value`. + private[this] val fieldInputIndices: Array[Int] = targetType.fields.map { f => + val inputIdx = if (schema.objectSchemaMap != null) schema.objectSchemaMap.get(f.name) else null + if (inputIdx != null) inputIdx.intValue() else -1 + } + // For each field in `targetType`, store the reader from the corresponding field in object + // `typed_value`, or null if it doesn't exist in object `typed_value`. + private[this] val fieldReaders: Array[ParquetVariantReader] = + targetType.fields.zip(fieldInputIndices).map { case (f, inputIdx) => + if (inputIdx >= 0) { + val fieldSchema = schema.objectSchema(inputIdx).schema + ParquetVariantReader(fieldSchema, f.dataType, castArgs) + } else { + null + } + } + // If all fields in `targetType` can be found in object `typed_value`, then the reader doesn't + // need to read from `value`. 
+ private[this] val needUnshreddedObject: Boolean = fieldInputIndices.exists(_ < 0) + + override def readFromTyped(row: InternalRow, topLevelMetadata: Array[Byte]): Any = { + if (schema.objectSchema == null) return invalidCast(row, topLevelMetadata) + val obj = row.getStruct(schema.typedIdx, schema.objectSchema.length) + val result = new GenericInternalRow(fieldInputIndices.length) + var unshreddedObject: Variant = null + if (needUnshreddedObject && schema.variantIdx >= 0 && !row.isNullAt(schema.variantIdx)) { + unshreddedObject = new Variant(row.getBinary(schema.variantIdx), topLevelMetadata) + if (unshreddedObject.getType != Type.OBJECT) throw QueryExecutionErrors.malformedVariant() + } + val numFields = fieldInputIndices.length + var i = 0 + while (i < numFields) { + val inputIdx = fieldInputIndices(i) + if (inputIdx >= 0) { + // Shredded field must not be null. + if (obj.isNullAt(inputIdx)) throw QueryExecutionErrors.malformedVariant() + val fieldSchema = schema.objectSchema(inputIdx).schema + val fieldInput = obj.getStruct(inputIdx, fieldSchema.numFields) + // Only read from the shredded field if it is not missing. + if ((fieldSchema.typedIdx >= 0 && !fieldInput.isNullAt(fieldSchema.typedIdx)) || + (fieldSchema.variantIdx >= 0 && !fieldInput.isNullAt(fieldSchema.variantIdx))) { + result.update(i, fieldReaders(i).read(fieldInput, topLevelMetadata)) + } + } else if (unshreddedObject != null) { + val fieldName = targetType.fields(i).name + val fieldType = targetType.fields(i).dataType + val unshreddedField = unshreddedObject.getFieldByKey(fieldName) + if (unshreddedField != null) { + result.update(i, VariantGet.cast(unshreddedField, fieldType, castArgs)) + } + } + i += 1 + } + result + } +} + +// Read Parquet variant values into a Spark array type. 
+private[this] final class ArrayReader( + schema: VariantSchema, targetType: ArrayType, castArgs: VariantCastArgs) + extends ParquetVariantReader(schema, targetType, castArgs) { + private[this] val elementReader = if (schema.arraySchema != null) { + ParquetVariantReader(schema.arraySchema, targetType.elementType, castArgs) + } else { + null + } + + override def readFromTyped(row: InternalRow, topLevelMetadata: Array[Byte]): Any = { + if (schema.arraySchema == null) return invalidCast(row, topLevelMetadata) + val elementNumFields = schema.arraySchema.numFields + val arr = row.getArray(schema.typedIdx) + val size = arr.numElements() + val result = new Array[Any](size) + var i = 0 + while (i < size) { + // Shredded array element must not be null. + if (arr.isNullAt(i)) throw QueryExecutionErrors.malformedVariant() + result(i) = elementReader.read(arr.getStruct(i, elementNumFields), topLevelMetadata) + i += 1 + } + new GenericArrayData(result) + } +} + +// Read Parquet variant values into a Spark map type with string key type. The input must be object +// for a valid cast. The resulting map contains shredded fields from object `typed_value` and +// unshredded fields from object `value`. +// `value` must not contain any shredded field according to the shredding spec. Unlike +// `StructReader`, this requirement is enforced in `MapReader`. If `value` does contain a shredded +// field, throw a MALFORMED_VARIANT error. The purpose is to avoid duplicate map keys. +private[this] final class MapReader( + schema: VariantSchema, targetType: MapType, castArgs: VariantCastArgs) + extends ParquetVariantReader(schema, targetType, castArgs) { + // Readers that convert each shredded field into the map value type. + private[this] val valueReaders = if (schema.objectSchema != null) { + schema.objectSchema.map { f => + ParquetVariantReader(f.schema, targetType.valueType, castArgs) + } + } else { + null + } + // `UTF8String` representation of shredded field names. 
Do the `String -> UTF8String` once, so + // that `readFromTyped` doesn't need to do it repeatedly. + private[this] val shreddedFieldNames = if (schema.objectSchema != null) { + schema.objectSchema.map { f => UTF8String.fromString(f.fieldName) } + } else { + null + } + + override def readFromTyped(row: InternalRow, topLevelMetadata: Array[Byte]): Any = { + if (schema.objectSchema == null) return invalidCast(row, topLevelMetadata) + val obj = row.getStruct(schema.typedIdx, schema.objectSchema.length) + val numShreddedFields = valueReaders.length + var unshreddedObject: Variant = null + if (schema.variantIdx >= 0 && !row.isNullAt(schema.variantIdx)) { + unshreddedObject = new Variant(row.getBinary(schema.variantIdx), topLevelMetadata) + if (unshreddedObject.getType != Type.OBJECT) throw QueryExecutionErrors.malformedVariant() + } + val numUnshreddedFields = if (unshreddedObject != null) unshreddedObject.objectSize() else 0 + var keyArray = new Array[UTF8String](numShreddedFields + numUnshreddedFields) + var valueArray = new Array[Any](numShreddedFields + numUnshreddedFields) + var mapLength = 0 + var i = 0 + while (i < numShreddedFields) { + // Shredded field must not be null. + if (obj.isNullAt(i)) throw QueryExecutionErrors.malformedVariant() + val fieldSchema = schema.objectSchema(i).schema + val fieldInput = obj.getStruct(i, fieldSchema.numFields) + // Only add the shredded field to map if it is not missing. 
+ if ((fieldSchema.typedIdx >= 0 && !fieldInput.isNullAt(fieldSchema.typedIdx)) || + (fieldSchema.variantIdx >= 0 && !fieldInput.isNullAt(fieldSchema.variantIdx))) { + keyArray(mapLength) = shreddedFieldNames(i) + valueArray(mapLength) = valueReaders(i).read(fieldInput, topLevelMetadata) + mapLength += 1 + } + i += 1 + } + i = 0 + while (i < numUnshreddedFields) { + val field = unshreddedObject.getFieldAtIndex(i) + if (schema.objectSchemaMap.containsKey(field.key)) { + throw QueryExecutionErrors.malformedVariant() + } + keyArray(mapLength) = UTF8String.fromString(field.key) + valueArray(mapLength) = VariantGet.cast(field.value, targetType.valueType, castArgs) + mapLength += 1 + i += 1 + } + // Need to shrink the arrays if there are missing shredded fields. + if (mapLength < keyArray.length) { + keyArray = keyArray.slice(0, mapLength) + valueArray = valueArray.slice(0, mapLength) + } + ArrayBasedMapData(keyArray, valueArray) + } +} + +// Read Parquet variant values into a Spark variant type (the binary format). +private[this] final class VariantReader( + schema: VariantSchema, targetType: DataType, castArgs: VariantCastArgs, + // An optional optimization: the user can set it to true if the Parquet variant column is + // unshredded and the extraction path is empty. We are not required to do anything special, but + // we can avoid rebuilding variant for optimization purposes. + private[this] val isTopLevelUnshredded: Boolean) + extends ParquetVariantReader(schema, targetType, castArgs) { + override def read(row: InternalRow, topLevelMetadata: Array[Byte]): Any = { + if (isTopLevelUnshredded) { + if (row.isNullAt(schema.variantIdx)) throw QueryExecutionErrors.malformedVariant() + return new VariantVal(row.getBinary(schema.variantIdx), topLevelMetadata) + } + val v = rebuildVariant(row, topLevelMetadata) + new VariantVal(v.getValue, v.getMetadata) + } +} + +// Read Parquet variant values into a Spark scalar type. 
When `typed_value` is not null but not a +// scalar, all other target types should return an invalid cast, but only the string target type can +// still build a string from array/object `typed_value`. For scalar `typed_value`, it depends on +// `ScalarCastHelper` to perform the cast. +// According to the shredding spec, scalar `typed_value` and `value` must not be non-null at the +// same time. The requirement is not enforced in this reader. If they are both non-null, no error +// will occur, and the reader will read from `typed_value`. +private[this] final class ScalarReader( + schema: VariantSchema, targetType: DataType, castArgs: VariantCastArgs) + extends ParquetVariantReader(schema, targetType, castArgs) { + private[this] val castProject = if (schema.scalarSchema != null) { + val scalarType = SparkShreddingUtils.scalarSchemaToSparkType(schema.scalarSchema) + // Read the cast input from ordinal `schema.typedIdx` in the input row. The cast input is never + // null, because `readFromTyped` is only called when `typed_value` is not null. 
+ val input = BoundReference(schema.typedIdx, scalarType, nullable = false) + MutableProjection.create(Seq(ScalarCastHelper(input, targetType, castArgs))) + } else { + null + } + + override def readFromTyped(row: InternalRow, topLevelMetadata: Array[Byte]): Any = { + if (castProject == null) { + return if (targetType.isInstanceOf[StringType]) { + UTF8String.fromString(rebuildVariant(row, topLevelMetadata).toJson(castArgs.zoneId)) + } else { + invalidCast(row, topLevelMetadata) + } + } + val result = castProject(row) + if (result.isNullAt(0)) null else result.get(0, targetType) + } +} + case object SparkShreddingUtils { val VariantValueFieldName = "value"; val TypedValueFieldName = "typed_value"; val MetadataFieldName = "metadata"; + val VARIANT_WRITE_SHREDDING_KEY: String = "__VARIANT_WRITE_SHREDDING_KEY" + def buildVariantSchema(schema: DataType): VariantSchema = { schema match { case s: StructType => buildVariantSchema(s, topLevel = true) @@ -53,16 +472,21 @@ case object SparkShreddingUtils { */ def variantShreddingSchema(dataType: DataType, isTopLevel: Boolean = true): StructType = { val fields = dataType match { - case ArrayType(elementType, containsNull) => + case ArrayType(elementType, _) => + // Always set containsNull to false. One of value or typed_value must always be set for + // array elements. val arrayShreddingSchema = - ArrayType(variantShreddingSchema(elementType, false), containsNull) + ArrayType(variantShreddingSchema(elementType, false), containsNull = false) Seq( StructField(VariantValueFieldName, BinaryType, nullable = true), StructField(TypedValueFieldName, arrayShreddingSchema, nullable = true) ) case StructType(fields) => + // The field name level is always non-nullable: Variant null values are represented in the + // "value" column as "00", and missing values are represented by setting both "value" and + // "typed_value" to null. 
val objectShreddingSchema = StructType(fields.map(f => - f.copy(dataType = variantShreddingSchema(f.dataType, false)))) + f.copy(dataType = variantShreddingSchema(f.dataType, false), nullable = false))) Seq( StructField(VariantValueFieldName, BinaryType, nullable = true), StructField(TypedValueFieldName, objectShreddingSchema, nullable = true) @@ -89,6 +513,27 @@ case object SparkShreddingUtils { } } + /** + * Given a schema that represents a valid shredding schema (e.g. constructed by + * SparkShreddingUtils.variantShreddingSchema), add metadata to the top-level fields to mark it + * as a shredding schema for writers. + */ + def addWriteShreddingMetadata(schema: StructType): StructType = { + val newFields = schema.fields.map { f => + f.copy(metadata = new + MetadataBuilder() + .withMetadata(f.metadata) + .putNull(VARIANT_WRITE_SHREDDING_KEY).build()) + } + StructType(newFields) + } + + // Check if the struct is marked with metadata set by addWriteShreddingMetadata - i.e. it + // represents a Variant converted to a shredding schema for writing. + def isVariantShreddingStruct(s: StructType): Boolean = { + s.fields.length > 0 && s.fields.forall(_.metadata.contains(VARIANT_WRITE_SHREDDING_KEY)) + } + /* * Given a Spark schema that represents a valid shredding schema (e.g. constructed by * SparkShreddingUtils.variantShreddingSchema), return the corresponding VariantSchema. @@ -101,6 +546,11 @@ case object SparkShreddingUtils { var objectSchema: Array[VariantSchema.ObjectField] = null var arraySchema: VariantSchema = null + // The struct must not be empty or contain duplicate field names. The latter is enforced in the + // loop below (`if (typedIdx != -1)` and other similar checks). 
+ if (schema.fields.isEmpty) { + throw QueryCompilationErrors.invalidVariantShreddingSchema(schema) + } schema.fields.zipWithIndex.foreach { case (f, i) => f.name match { case TypedValueFieldName => @@ -110,8 +560,11 @@ case object SparkShreddingUtils { typedIdx = i f.dataType match { case StructType(fields) => - objectSchema = - new Array[VariantSchema.ObjectField](fields.length) + // The struct must not be empty or contain duplicate field names. + if (fields.isEmpty || fields.map(_.name).distinct.length != fields.length) { + throw QueryCompilationErrors.invalidVariantShreddingSchema(schema) + } + objectSchema = new Array[VariantSchema.ObjectField](fields.length) fields.zipWithIndex.foreach { case (field, fieldIdx) => field.dataType match { case s: StructType => @@ -163,6 +616,32 @@ case object SparkShreddingUtils { scalarSchema, objectSchema, arraySchema) } + // Convert a scalar variant schema into a Spark scalar type. + def scalarSchemaToSparkType(scalar: VariantSchema.ScalarType): DataType = scalar match { + case _: VariantSchema.StringType => StringType + case it: VariantSchema.IntegralType => it.size match { + case VariantSchema.IntegralSize.BYTE => ByteType + case VariantSchema.IntegralSize.SHORT => ShortType + case VariantSchema.IntegralSize.INT => IntegerType + case VariantSchema.IntegralSize.LONG => LongType + } + case _: VariantSchema.FloatType => FloatType + case _: VariantSchema.DoubleType => DoubleType + case _: VariantSchema.BooleanType => BooleanType + case _: VariantSchema.BinaryType => BinaryType + case dt: VariantSchema.DecimalType => DecimalType(dt.precision, dt.scale) + case _: VariantSchema.DateType => DateType + case _: VariantSchema.TimestampType => TimestampType + case _: VariantSchema.TimestampNTZType => TimestampNTZType + } + + // Convert a Parquet type into a Spark data type. 
+ def parquetTypeToSparkType(parquetType: ParquetType): DataType = { + val messageType = ParquetTypes.buildMessage().addField(parquetType).named("foo") + val column = new ColumnIOFactory().getColumnIO(messageType) + new ParquetToSparkSchemaConverter().convertField(column.getChild(0)).sparkType + } + class SparkShreddedResult(schema: VariantSchema) extends VariantShreddingWriter.ShreddedResult { // Result is stored as an InternalRow. val row = new GenericInternalRow(schema.numFields) @@ -217,4 +696,188 @@ case object SparkShreddingUtils { .asInstanceOf[SparkShreddedResult] .row } + + // Return a list of fields to extract. `targetType` must be either variant or variant struct. + // If it is variant, return null because the target is the full variant and there is no field to + // extract. If it is variant struct, return a list of fields matching the variant struct fields. + def getFieldsToExtract(targetType: DataType, inputSchema: VariantSchema): Array[FieldToExtract] = + targetType match { + case _: VariantType => null + case s: StructType if VariantMetadata.isVariantStruct(s) => + s.fields.map { f => + val metadata = VariantMetadata.fromMetadata(f.metadata) + val rawPath = metadata.parsedPath() + val schemaPath = new Array[SchemaPathSegment](rawPath.length) + var schema = inputSchema + // Search `rawPath` in `schema` to produce `schemaPath`. If a raw path segment cannot be + // found at a certain level of the file type, then `typedIdx` will be -1 starting from + // this position, and the final `schema` will be null. 
+ for (i <- rawPath.indices) { + val isObject = rawPath(i).isInstanceOf[ObjectExtraction] + var typedIdx = -1 + var extractionIdx = -1 + rawPath(i) match { + case ObjectExtraction(key) if schema != null && schema.objectSchema != null => + val fieldIdx = schema.objectSchemaMap.get(key) + if (fieldIdx != null) { + typedIdx = schema.typedIdx + extractionIdx = fieldIdx + schema = schema.objectSchema(fieldIdx).schema + } else { + schema = null + } + case ArrayExtraction(index) if schema != null && schema.arraySchema != null => + typedIdx = schema.typedIdx + extractionIdx = index + schema = schema.arraySchema + case _ => + schema = null + } + schemaPath(i) = SchemaPathSegment(rawPath(i), isObject, typedIdx, extractionIdx) + } + val reader = ParquetVariantReader(schema, f.dataType, VariantCastArgs( + metadata.failOnError, + Some(metadata.timeZoneId), + DateTimeUtils.getZoneId(metadata.timeZoneId)), + isTopLevelUnshredded = schemaPath.isEmpty && inputSchema.isUnshredded) + FieldToExtract(schemaPath, reader) + } + case _ => + throw QueryExecutionErrors.unreachableError(s"Invalid target type: `${targetType.sql}`") + } + + // Extract a single variant struct field from a Parquet variant value. It steps into `inputRow` + // according to the variant extraction path, and read the extracted value as the target type. + private def extractField( + inputRow: InternalRow, + topLevelMetadata: Array[Byte], + inputSchema: VariantSchema, + pathList: Array[SchemaPathSegment], + reader: ParquetVariantReader): Any = { + var pathIdx = 0 + val pathLen = pathList.length + var row = inputRow + var schema = inputSchema + while (pathIdx < pathLen) { + val path = pathList(pathIdx) + + if (path.typedIdx < 0) { + // The extraction doesn't exist in `typed_value`. Try to extract the remaining part of the + // path in `value`. 
+ val variantIdx = schema.variantIdx + if (variantIdx < 0 || row.isNullAt(variantIdx)) return null + var v = new Variant(row.getBinary(variantIdx), topLevelMetadata) + while (pathIdx < pathLen) { + v = pathList(pathIdx).rawPath match { + case ObjectExtraction(key) if v.getType == Type.OBJECT => v.getFieldByKey(key) + case ArrayExtraction(index) if v.getType == Type.ARRAY => v.getElementAtIndex(index) + case _ => null + } + if (v == null) return null + pathIdx += 1 + } + return VariantGet.cast(v, reader.targetType, reader.castArgs) + } + + if (row.isNullAt(path.typedIdx)) return null + if (path.isObject) { + val obj = row.getStruct(path.typedIdx, schema.objectSchema.length) + // Object field must not be null. + if (obj.isNullAt(path.extractionIdx)) throw QueryExecutionErrors.malformedVariant() + schema = schema.objectSchema(path.extractionIdx).schema + row = obj.getStruct(path.extractionIdx, schema.numFields) + // Return null if the field is missing. + if ((schema.typedIdx < 0 || row.isNullAt(schema.typedIdx)) && + (schema.variantIdx < 0 || row.isNullAt(schema.variantIdx))) { + return null + } + } else { + val arr = row.getArray(path.typedIdx) + // Return null if the extraction index is out of bound. + if (path.extractionIdx >= arr.numElements()) return null + // Array element must not be null. + if (arr.isNullAt(path.extractionIdx)) throw QueryExecutionErrors.malformedVariant() + schema = schema.arraySchema + row = arr.getStruct(path.extractionIdx, schema.numFields) + } + pathIdx += 1 + } + reader.read(row, topLevelMetadata) + } + + // Assemble a variant (binary format) from a Parquet variant value. + def assembleVariant(row: InternalRow, schema: VariantSchema): VariantVal = { + val v = ShreddingUtils.rebuild(SparkShreddedRow(row), schema) + new VariantVal(v.getValue, v.getMetadata) + } + + // Assemble a variant struct, in which each field is extracted from the Parquet variant value. 
+ def assembleVariantStruct( + inputRow: InternalRow, + schema: VariantSchema, + fields: Array[FieldToExtract]): InternalRow = { + if (inputRow.isNullAt(schema.topLevelMetadataIdx)) { + throw QueryExecutionErrors.malformedVariant() + } + val topLevelMetadata = inputRow.getBinary(schema.topLevelMetadataIdx) + val numFields = fields.length + val resultRow = new GenericInternalRow(numFields) + var fieldIdx = 0 + while (fieldIdx < numFields) { + resultRow.update(fieldIdx, extractField(inputRow, topLevelMetadata, schema, + fields(fieldIdx).path, fields(fieldIdx).reader)) + fieldIdx += 1 + } + resultRow + } + + // Assemble a batch of variant (binary format) from a batch of Parquet variant values. + def assembleVariantBatch( + input: WritableColumnVector, + output: WritableColumnVector, + schema: VariantSchema): Unit = { + val numRows = input.getElementsAppended + output.reset() + output.reserve(numRows) + val valueChild = output.getChild(0) + val metadataChild = output.getChild(1) + var i = 0 + while (i < numRows) { + if (input.isNullAt(i)) { + output.appendStruct(true) + } else { + output.appendStruct(false) + val v = SparkShreddingUtils.assembleVariant(input.getStruct(i), schema) + valueChild.appendByteArray(v.getValue, 0, v.getValue.length) + metadataChild.appendByteArray(v.getMetadata, 0, v.getMetadata.length) + } + i += 1 + } + } + + // Assemble a batch of variant struct from a batch of Parquet variant values. 
+ def assembleVariantStructBatch( + input: WritableColumnVector, + output: WritableColumnVector, + schema: VariantSchema, + fields: Array[FieldToExtract]): Unit = { + val numRows = input.getElementsAppended + output.reset() + output.reserve(numRows) + val converter = new RowToColumnConverter(StructType(Array(StructField("", output.dataType())))) + val converterVectors = Array(output) + val converterRow = new GenericInternalRow(1) + output.reset() + output.reserve(input.getElementsAppended) + var i = 0 + while (i < numRows) { + if (input.isNullAt(i)) { + converterRow.update(0, null) + } else { + converterRow.update(0, assembleVariantStruct(input.getStruct(i), schema, fields)) + } + converter.convert(converterRow, converterVectors) + i += 1 + } + } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/rules.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/rules.scala index fcc3a257cd2dd..c78f9702557cf 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/rules.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/rules.scala @@ -22,6 +22,7 @@ import java.util.Locale import scala.collection.mutable.{HashMap, HashSet} import scala.jdk.CollectionConverters._ +import org.apache.spark.SparkUnsupportedOperationException import org.apache.spark.sql.{AnalysisException, SaveMode, SparkSession} import org.apache.spark.sql.catalyst.analysis._ import org.apache.spark.sql.catalyst.catalog._ @@ -36,6 +37,7 @@ import org.apache.spark.sql.execution.command.DDLUtils import org.apache.spark.sql.execution.command.ViewHelper.generateViewProperties import org.apache.spark.sql.execution.datasources.{CreateTable => CreateTableV1} import org.apache.spark.sql.execution.datasources.v2.FileDataSourceV2 +import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.sources.InsertableRelation import org.apache.spark.sql.types.{StructField, StructType} import 
org.apache.spark.sql.util.PartitioningUtils.normalizePartitionSpec @@ -46,14 +48,21 @@ import org.apache.spark.util.ArrayImplicits._ * Replaces [[UnresolvedRelation]]s if the plan is for direct query on files. */ class ResolveSQLOnFile(sparkSession: SparkSession) extends Rule[LogicalPlan] { + + override def conf: SQLConf = sparkSession.sessionState.conf + object UnresolvedRelationResolution { def unapply(plan: LogicalPlan): Option[LogicalPlan] = { - plan match { + val result = plan match { case u: UnresolvedRelation if maybeSQLFile(u) => try { val ds = resolveDataSource(u) Some(LogicalRelation(ds.resolveRelation())) } catch { + case e: SparkUnsupportedOperationException => + u.failAnalysis( + errorClass = e.getCondition, + messageParameters = e.getMessageParameters.asScala.toMap) case _: ClassNotFoundException => None case e: Exception if !e.isInstanceOf[AnalysisException] => // the provider is valid, but failed to create a logical plan @@ -66,6 +75,17 @@ class ResolveSQLOnFile(sparkSession: SparkSession) extends Rule[LogicalPlan] { case _ => None } + result.foreach(resolvedRelation => plan match { + case unresolvedRelation: UnresolvedRelation => + // We put the resolved relation into the [[AnalyzerBridgeState]] for + // it to be later reused by the single-pass [[Resolver]] to avoid resolving the + // relation metadata twice. 
+ AnalysisContext.get.getSinglePassResolverBridgeState.map { bridgeState => + bridgeState.relationsWithResolvedMetadata.put(unresolvedRelation, resolvedRelation) + } + case _ => + }) + result } } @@ -338,6 +358,9 @@ case class PreprocessTableCreation(catalog: SessionCatalog) extends Rule[Logical SchemaUtils.checkSchemaColumnNameDuplication( schema, conf.caseSensitiveAnalysis) + if (!conf.allowCollationsInMapKeys) { + SchemaUtils.checkNoCollationsInMapKeys(schema) + } val normalizedPartCols = normalizePartitionColumns(schema, table) val normalizedBucketSpec = normalizeBucketSpec(schema, table) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/CacheTableExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/CacheTableExec.scala index 56c44a1256815..86fa0c8523f1e 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/CacheTableExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/CacheTableExec.scala @@ -89,6 +89,7 @@ case class CacheTableAsSelectExec( name = TableIdentifier(tempViewName), userSpecifiedColumns = Nil, comment = None, + collation = None, properties = Map.empty, originalText = Some(originalText), plan = query, diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2Utils.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2Utils.scala index 9ffa0d728ca28..9c19609dce79a 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2Utils.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2Utils.scala @@ -23,16 +23,19 @@ import scala.jdk.CollectionConverters._ import com.fasterxml.jackson.databind.ObjectMapper +import org.apache.spark.SparkContext import org.apache.spark.internal.Logging import org.apache.spark.sql.{DataFrame, Dataset, SparkSession} import 
org.apache.spark.sql.catalyst.analysis.TimeTravelSpec import org.apache.spark.sql.catalyst.expressions.Literal import org.apache.spark.sql.catalyst.util.CaseInsensitiveMap -import org.apache.spark.sql.connector.catalog.{CatalogV2Util, SessionConfigSupport, SupportsCatalogOptions, SupportsRead, Table, TableProvider} +import org.apache.spark.sql.connector.catalog.{CatalogV2Util, SessionConfigSupport, StagedTable, StagingTableCatalog, SupportsCatalogOptions, SupportsRead, Table, TableProvider} import org.apache.spark.sql.connector.catalog.TableCapability.BATCH_READ import org.apache.spark.sql.errors.QueryExecutionErrors +import org.apache.spark.sql.execution.SQLExecution import org.apache.spark.sql.execution.command.DDLUtils import org.apache.spark.sql.execution.datasources.DataSource +import org.apache.spark.sql.execution.metric.{SQLMetric, SQLMetrics} import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types.{LongType, StructType} import org.apache.spark.sql.util.CaseInsensitiveStringMap @@ -179,4 +182,34 @@ private[sql] object DataSourceV2Utils extends Logging { extraOptions + ("paths" -> objectMapper.writeValueAsString(paths.toArray)) } } + + /** + * If `table` is a StagedTable, commit the staged changes and report the commit metrics. + * Do nothing if the table is not a StagedTable. 
+ */ + def commitStagedChanges( + sparkContext: SparkContext, table: Table, metrics: Map[String, SQLMetric]): Unit = { + table match { + case stagedTable: StagedTable => + stagedTable.commitStagedChanges() + + val driverMetrics = stagedTable.reportDriverMetrics() + if (driverMetrics.nonEmpty) { + for (taskMetric <- driverMetrics) { + metrics.get(taskMetric.name()).foreach(_.set(taskMetric.value())) + } + + val executionId = sparkContext.getLocalProperty(SQLExecution.EXECUTION_ID_KEY) + SQLMetrics.postDriverMetricUpdates(sparkContext, executionId, metrics.values.toSeq) + } + case _ => + } + } + + def commitMetrics( + sparkContext: SparkContext, tableCatalog: StagingTableCatalog): Map[String, SQLMetric] = { + tableCatalog.supportedCustomMetrics().map { + metric => metric.name() -> SQLMetrics.createV2CustomMetric(sparkContext, metric) + }.toMap + } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/FilePartitionReader.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/FilePartitionReader.scala index 2679f14144569..be4f5dcb65aa1 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/FilePartitionReader.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/FilePartitionReader.scala @@ -18,6 +18,9 @@ package org.apache.spark.sql.execution.datasources.v2 import java.io.{FileNotFoundException, IOException} +import org.apache.hadoop.hdfs.BlockMissingException +import org.apache.hadoop.security.AccessControlException + import org.apache.spark.internal.{Logging, MDC} import org.apache.spark.internal.LogKeys.{CURRENT_FILE, PARTITIONED_FILE_READER} import org.apache.spark.rdd.InputFileBlockHolder @@ -48,6 +51,8 @@ class FilePartitionReader[T]( case e: FileNotFoundException if ignoreMissingFiles => logWarning(s"Skipped missing file.", e) currentReader = null + case e @ (_ : AccessControlException | _ : BlockMissingException) => + throw 
FileDataSourceV2.attachFilePath(file.urlEncodedPath, e) case e @ (_: RuntimeException | _: IOException) if ignoreCorruptFiles => logWarning( s"Skipped the rest of the content in the corrupted file.", e) @@ -64,6 +69,8 @@ class FilePartitionReader[T]( val hasNext = try { currentReader != null && currentReader.next() } catch { + case e @ (_ : AccessControlException | _ : BlockMissingException) => + throw FileDataSourceV2.attachFilePath(currentReader.file.urlEncodedPath, e) case e @ (_: RuntimeException | _: IOException) if ignoreCorruptFiles => logWarning(log"Skipped the rest of the content in the corrupted file: " + log"${MDC(PARTITIONED_FILE_READER, currentReader)}", e) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/FileTable.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/FileTable.scala index 4eee731e0b2d6..863104da80c2e 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/FileTable.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/FileTable.scala @@ -26,6 +26,7 @@ import org.apache.spark.sql.SparkSession import org.apache.spark.sql.connector.catalog.{SupportsRead, SupportsWrite, Table, TableCapability} import org.apache.spark.sql.connector.catalog.TableCapability._ import org.apache.spark.sql.connector.expressions.Transform +import org.apache.spark.sql.connector.write.{LogicalWriteInfo, LogicalWriteInfoImpl} import org.apache.spark.sql.errors.QueryCompilationErrors import org.apache.spark.sql.execution.datasources._ import org.apache.spark.sql.execution.streaming.{FileStreamSink, MetadataLogFileIndex} @@ -159,6 +160,19 @@ abstract class FileTable( options.asCaseSensitiveMap().asScala new CaseInsensitiveStringMap(finalOptions.asJava) } + + /** + * Merge the options of FileTable and the LogicalWriteInfo while respecting the + * keys of the options carried by LogicalWriteInfo. 
+ */ + protected def mergedWriteInfo(writeInfo: LogicalWriteInfo): LogicalWriteInfo = { + LogicalWriteInfoImpl( + writeInfo.queryId(), + writeInfo.schema(), + mergedOptions(writeInfo.options()), + writeInfo.rowIdSchema(), + writeInfo.metadataSchema()) + } } object FileTable { diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/FileWrite.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/FileWrite.scala index f4cabcb69d08c..77e1ade44780f 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/FileWrite.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/FileWrite.scala @@ -49,7 +49,7 @@ trait FileWrite extends Write { private val schema = info.schema() private val queryId = info.queryId() - private val options = info.options() + val options = info.options() override def description(): String = formatName diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/ReplaceTableExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/ReplaceTableExec.scala index 104d8a706efb7..894a3a10d4193 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/ReplaceTableExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/ReplaceTableExec.scala @@ -26,6 +26,7 @@ import org.apache.spark.sql.catalyst.plans.logical.TableSpec import org.apache.spark.sql.connector.catalog.{CatalogV2Util, Column, Identifier, StagedTable, StagingTableCatalog, Table, TableCatalog} import org.apache.spark.sql.connector.expressions.Transform import org.apache.spark.sql.errors.QueryCompilationErrors +import org.apache.spark.sql.execution.metric.SQLMetric import org.apache.spark.util.Utils case class ReplaceTableExec( @@ -65,6 +66,9 @@ case class AtomicReplaceTableExec( val tableProperties = CatalogV2Util.convertTableProperties(tableSpec) + override val metrics: Map[String, SQLMetric] = + 
DataSourceV2Utils.commitMetrics(sparkContext, catalog) + override protected def run(): Seq[InternalRow] = { if (catalog.tableExists(identifier)) { val table = catalog.loadTable(identifier) @@ -92,7 +96,7 @@ case class AtomicReplaceTableExec( private def commitOrAbortStagedChanges(staged: StagedTable): Unit = { Utils.tryWithSafeFinallyAndFailureCallbacks({ - staged.commitStagedChanges() + DataSourceV2Utils.commitStagedChanges(sparkContext, staged, metrics) })(catchBlock = { staged.abortStagedChanges() }) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/ShowCreateTableExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/ShowCreateTableExec.scala index 37339a34af3db..4195560c5cc1c 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/ShowCreateTableExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/ShowCreateTableExec.scala @@ -57,6 +57,7 @@ case class ShowCreateTableExec( showTableOptions(builder, tableOptions) showTablePartitioning(table, builder) showTableComment(table, builder) + showTableCollation(table, builder) showTableLocation(table, builder) showTableProperties(table, builder, tableOptions) } @@ -155,6 +156,12 @@ case class ShowCreateTableExec( .foreach(builder.append) } + private def showTableCollation(table: Table, builder: StringBuilder): Unit = { + Option(table.properties.get(TableCatalog.PROP_COLLATION)) + .map("COLLATION '" + escapeSingleQuotedString(_) + "'\n") + .foreach(builder.append) + } + private def concatByMultiLines(iter: Iterable[String]): String = { iter.mkString("(\n ", ",\n ", ")\n") } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/V2SessionCatalog.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/V2SessionCatalog.scala index 22c13fd98ced1..0a533645648e6 100644 --- 
a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/V2SessionCatalog.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/V2SessionCatalog.scala @@ -83,7 +83,7 @@ class V2SessionCatalog(catalog: SessionCatalog) } private def hasCustomSessionCatalog: Boolean = { - catalog.conf.contains(SQLConf.V2_SESSION_CATALOG_IMPLEMENTATION.key) + catalog.conf.getConf(SQLConf.V2_SESSION_CATALOG_IMPLEMENTATION) != "builtin" } override def loadTable(ident: Identifier): Table = { @@ -239,7 +239,8 @@ class V2SessionCatalog(catalog: SessionCatalog) maybeClusterBySpec.map( clusterBySpec => ClusterBySpec.toProperty(newSchema, clusterBySpec, conf.resolver)), tracksPartitionsInCatalog = conf.manageFilesourcePartitions, - comment = Option(properties.get(TableCatalog.PROP_COMMENT))) + comment = Option(properties.get(TableCatalog.PROP_COMMENT)), + collation = Option(properties.get(TableCatalog.PROP_COLLATION))) try { catalog.createTable(tableDesc, ignoreIfExists = false) @@ -290,6 +291,7 @@ class V2SessionCatalog(catalog: SessionCatalog) val schema = CatalogV2Util.applySchemaChanges( catalogTable.schema, changes, catalogTable.provider, "ALTER TABLE") val comment = properties.get(TableCatalog.PROP_COMMENT) + val collation = properties.get(TableCatalog.PROP_COLLATION) val owner = properties.getOrElse(TableCatalog.PROP_OWNER, catalogTable.owner) val location = properties.get(TableCatalog.PROP_LOCATION).map(CatalogUtils.stringToURI) val storage = if (location.isDefined) { @@ -303,7 +305,7 @@ class V2SessionCatalog(catalog: SessionCatalog) catalog.alterTable( catalogTable.copy( properties = finalProperties, schema = schema, owner = owner, comment = comment, - storage = storage)) + collation = collation, storage = storage)) } catch { case _: NoSuchTableException => throw QueryCompilationErrors.noSuchTableError(ident) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/V2Writes.scala 
b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/V2Writes.scala index 319cc1c731577..17b2579ca873a 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/V2Writes.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/V2Writes.scala @@ -19,6 +19,8 @@ package org.apache.spark.sql.execution.datasources.v2 import java.util.{Optional, UUID} +import scala.jdk.CollectionConverters._ + import org.apache.spark.sql.catalyst.expressions.PredicateHelper import org.apache.spark.sql.catalyst.plans.logical.{AppendData, LogicalPlan, OverwriteByExpression, OverwritePartitionsDynamic, Project, ReplaceData, WriteDelta} import org.apache.spark.sql.catalyst.rules.Rule @@ -44,7 +46,8 @@ object V2Writes extends Rule[LogicalPlan] with PredicateHelper { override def apply(plan: LogicalPlan): LogicalPlan = plan transformDown { case a @ AppendData(r: DataSourceV2Relation, query, options, _, None, _) => - val writeBuilder = newWriteBuilder(r.table, options, query.schema) + val writeOptions = mergeOptions(options, r.options.asScala.toMap) + val writeBuilder = newWriteBuilder(r.table, writeOptions, query.schema) val write = writeBuilder.build() val newQuery = DistributionAndOrderingUtils.prepareQuery(write, query, r.funCatalog) a.copy(write = Some(write), query = newQuery) @@ -61,7 +64,8 @@ object V2Writes extends Rule[LogicalPlan] with PredicateHelper { }.toArray val table = r.table - val writeBuilder = newWriteBuilder(table, options, query.schema) + val writeOptions = mergeOptions(options, r.options.asScala.toMap) + val writeBuilder = newWriteBuilder(table, writeOptions, query.schema) val write = writeBuilder match { case builder: SupportsTruncate if isTruncate(predicates) => builder.truncate().build() @@ -76,7 +80,8 @@ object V2Writes extends Rule[LogicalPlan] with PredicateHelper { case o @ OverwritePartitionsDynamic(r: DataSourceV2Relation, query, options, _, None) => val table = r.table - val writeBuilder = 
newWriteBuilder(table, options, query.schema) + val writeOptions = mergeOptions(options, r.options.asScala.toMap) + val writeBuilder = newWriteBuilder(table, writeOptions, query.schema) val write = writeBuilder match { case builder: SupportsDynamicOverwrite => builder.overwriteDynamicPartitions().build() @@ -87,31 +92,44 @@ object V2Writes extends Rule[LogicalPlan] with PredicateHelper { o.copy(write = Some(write), query = newQuery) case WriteToMicroBatchDataSource( - relation, table, query, queryId, writeOptions, outputMode, Some(batchId)) => - + relationOpt, table, query, queryId, options, outputMode, Some(batchId)) => + val writeOptions = mergeOptions( + options, relationOpt.map(r => r.options.asScala.toMap).getOrElse(Map.empty)) val writeBuilder = newWriteBuilder(table, writeOptions, query.schema, queryId) val write = buildWriteForMicroBatch(table, writeBuilder, outputMode) val microBatchWrite = new MicroBatchWrite(batchId, write.toStreaming) val customMetrics = write.supportedCustomMetrics.toImmutableArraySeq - val funCatalogOpt = relation.flatMap(_.funCatalog) + val funCatalogOpt = relationOpt.flatMap(_.funCatalog) val newQuery = DistributionAndOrderingUtils.prepareQuery(write, query, funCatalogOpt) - WriteToDataSourceV2(relation, microBatchWrite, newQuery, customMetrics) + WriteToDataSourceV2(relationOpt, microBatchWrite, newQuery, customMetrics) case rd @ ReplaceData(r: DataSourceV2Relation, _, query, _, _, None) => val rowSchema = DataTypeUtils.fromAttributes(rd.dataInput) - val writeBuilder = newWriteBuilder(r.table, Map.empty, rowSchema) + val writeOptions = mergeOptions(Map.empty, r.options.asScala.toMap) + val writeBuilder = newWriteBuilder(r.table, writeOptions, rowSchema) val write = writeBuilder.build() val newQuery = DistributionAndOrderingUtils.prepareQuery(write, query, r.funCatalog) // project away any metadata columns that could be used for distribution and ordering rd.copy(write = Some(write), query = Project(rd.dataInput, newQuery)) case wd @ 
WriteDelta(r: DataSourceV2Relation, _, query, _, projections, None) => - val deltaWriteBuilder = newDeltaWriteBuilder(r.table, Map.empty, projections) + val writeOptions = mergeOptions(Map.empty, r.options.asScala.toMap) + val deltaWriteBuilder = newDeltaWriteBuilder(r.table, writeOptions, projections) val deltaWrite = deltaWriteBuilder.build() val newQuery = DistributionAndOrderingUtils.prepareQuery(deltaWrite, query, r.funCatalog) wd.copy(write = Some(deltaWrite), query = newQuery) } + private def mergeOptions( + commandOptions: Map[String, String], + dsOptions: Map[String, String]): Map[String, String] = { + // for DataFrame API cases, same options are carried by both Command and DataSourceV2Relation + // for DataFrameV2 API cases, options are only carried by Command + // for SQL cases, options are only carried by DataSourceV2Relation + assert(commandOptions == dsOptions || commandOptions.isEmpty || dsOptions.isEmpty) + commandOptions ++ dsOptions + } + private def buildWriteForMicroBatch( table: SupportsWrite, writeBuilder: WriteBuilder, diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/WriteToDataSourceV2Exec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/WriteToDataSourceV2Exec.scala index b238b0ce9760c..308b1bceca12a 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/WriteToDataSourceV2Exec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/WriteToDataSourceV2Exec.scala @@ -110,6 +110,9 @@ case class AtomicCreateTableAsSelectExec( val properties = CatalogV2Util.convertTableProperties(tableSpec) + override val metrics: Map[String, SQLMetric] = + DataSourceV2Utils.commitMetrics(sparkContext, catalog) + override protected def run(): Seq[InternalRow] = { if (catalog.tableExists(ident)) { if (ifNotExists) { @@ -197,6 +200,9 @@ case class AtomicReplaceTableAsSelectExec( val properties = CatalogV2Util.convertTableProperties(tableSpec) + 
override val metrics: Map[String, SQLMetric] = + DataSourceV2Utils.commitMetrics(sparkContext, catalog) + override protected def run(): Seq[InternalRow] = { val columns = getV2Columns(query.schema, catalog.useNullableQuerySchema) if (catalog.tableExists(ident)) { @@ -474,7 +480,7 @@ trait WritingSparkTask[W <: DataWriter[InternalRow]] extends Logging with Serial log"(task ${MDC(LogKeys.TASK_ID, taskId)}, " + log"attempt ${MDC(LogKeys.TASK_ATTEMPT_ID, attemptId)}, " + log"stage ${MDC(LogKeys.STAGE_ID, stageId)}." + - log"${MDC(LogKeys.STAGE_ATTEMPT, stageAttempt)})") + log"${MDC(LogKeys.STAGE_ATTEMPT_ID, stageAttempt)})") dataWriter.commit() } else { @@ -494,7 +500,8 @@ trait WritingSparkTask[W <: DataWriter[InternalRow]] extends Logging with Serial logInfo(log"Committed partition ${MDC(LogKeys.PARTITION_ID, partId)} " + log"(task ${MDC(LogKeys.TASK_ID, taskId)}, " + log"attempt ${MDC(LogKeys.TASK_ATTEMPT_ID, attemptId)}, " + - log"stage ${MDC(LogKeys.STAGE_ID, stageId)}.${MDC(LogKeys.STAGE_ATTEMPT, stageAttempt)})") + log"stage ${MDC(LogKeys.STAGE_ID, stageId)}." + + log"${MDC(LogKeys.STAGE_ATTEMPT_ID, stageAttempt)})") DataWritingSparkTaskResult(iterWithMetrics.count, msg) @@ -503,12 +510,14 @@ trait WritingSparkTask[W <: DataWriter[InternalRow]] extends Logging with Serial logError(log"Aborting commit for partition ${MDC(LogKeys.PARTITION_ID, partId)} " + log"(task ${MDC(LogKeys.TASK_ID, taskId)}, " + log"attempt ${MDC(LogKeys.TASK_ATTEMPT_ID, attemptId)}, " + - log"stage ${MDC(LogKeys.STAGE_ID, stageId)}.${MDC(LogKeys.STAGE_ATTEMPT, stageAttempt)})") + log"stage ${MDC(LogKeys.STAGE_ID, stageId)}." 
+ + log"${MDC(LogKeys.STAGE_ATTEMPT_ID, stageAttempt)})") dataWriter.abort() logError(log"Aborted commit for partition ${MDC(LogKeys.PARTITION_ID, partId)} " + log"(task ${MDC(LogKeys.TASK_ID, taskId)}, " + log"attempt ${MDC(LogKeys.TASK_ATTEMPT_ID, attemptId)}, " + - log"stage ${MDC(LogKeys.STAGE_ID, stageId)}.${MDC(LogKeys.STAGE_ATTEMPT, stageAttempt)})") + log"stage ${MDC(LogKeys.STAGE_ID, stageId)}." + + log"${MDC(LogKeys.STAGE_ATTEMPT_ID, stageAttempt)})") }, finallyBlock = { dataWriter.close() }) @@ -630,10 +639,7 @@ private[v2] trait V2CreateTableAsSelectBaseExec extends LeafV2CommandExec { val qe = session.sessionState.executePlan(append) qe.assertCommandExecuted() - table match { - case st: StagedTable => st.commitStagedChanges() - case _ => - } + DataSourceV2Utils.commitStagedChanges(sparkContext, table, metrics) Nil })(catchBlock = { diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/csv/CSVTable.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/csv/CSVTable.scala index 4c201ca66cf6c..df8df37b711fb 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/csv/CSVTable.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/csv/CSVTable.scala @@ -49,10 +49,12 @@ case class CSVTable( CSVDataSource(parsedOptions).inferSchema(sparkSession, files, parsedOptions) } - override def newWriteBuilder(info: LogicalWriteInfo): WriteBuilder = + override def newWriteBuilder(info: LogicalWriteInfo): WriteBuilder = { new WriteBuilder { - override def build(): Write = CSVWrite(paths, formatName, supportsDataType, info) + override def build(): Write = + CSVWrite(paths, formatName, supportsDataType, mergedWriteInfo(info)) } + } override def supportsDataType(dataType: DataType): Boolean = dataType match { case _: AtomicType => true diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/csv/CSVWrite.scala 
b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/csv/CSVWrite.scala index f38a1d385a39c..7011fea77d888 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/csv/CSVWrite.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/csv/CSVWrite.scala @@ -58,7 +58,7 @@ case class CSVWrite( } override def getFileExtension(context: TaskAttemptContext): String = { - ".csv" + CodecStreams.getCompressionExtension(context) + "." + csvOptions.extension + CodecStreams.getCompressionExtension(context) } } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/json/JsonTable.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/json/JsonTable.scala index 54244c4d95e77..1c1d3393b95a4 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/json/JsonTable.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/json/JsonTable.scala @@ -49,10 +49,12 @@ case class JsonTable( sparkSession, files, parsedOptions) } - override def newWriteBuilder(info: LogicalWriteInfo): WriteBuilder = + override def newWriteBuilder(info: LogicalWriteInfo): WriteBuilder = { new WriteBuilder { - override def build(): Write = JsonWrite(paths, formatName, supportsDataType, info) + override def build(): Write = + JsonWrite(paths, formatName, supportsDataType, mergedWriteInfo(info)) } + } override def supportsDataType(dataType: DataType): Boolean = dataType match { case _: AtomicType => true diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/orc/OrcTable.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/orc/OrcTable.scala index 1037370967c87..81c347ae9c59c 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/orc/OrcTable.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/orc/OrcTable.scala @@ -43,10 +43,12 @@ 
case class OrcTable( override def inferSchema(files: Seq[FileStatus]): Option[StructType] = OrcUtils.inferSchema(sparkSession, files, options.asScala.toMap) - override def newWriteBuilder(info: LogicalWriteInfo): WriteBuilder = + override def newWriteBuilder(info: LogicalWriteInfo): WriteBuilder = { new WriteBuilder { - override def build(): Write = OrcWrite(paths, formatName, supportsDataType, info) + override def build(): Write = + OrcWrite(paths, formatName, supportsDataType, mergedWriteInfo(info)) } + } override def supportsDataType(dataType: DataType): Boolean = dataType match { case _: AtomicType => true diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/parquet/ParquetTable.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/parquet/ParquetTable.scala index 8463a05569c05..28c5a62f91ecb 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/parquet/ParquetTable.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/parquet/ParquetTable.scala @@ -43,10 +43,12 @@ case class ParquetTable( override def inferSchema(files: Seq[FileStatus]): Option[StructType] = ParquetUtils.inferSchema(sparkSession, options.asScala.toMap, files) - override def newWriteBuilder(info: LogicalWriteInfo): WriteBuilder = + override def newWriteBuilder(info: LogicalWriteInfo): WriteBuilder = { new WriteBuilder { - override def build(): Write = ParquetWrite(paths, formatName, supportsDataType, info) + override def build(): Write = + ParquetWrite(paths, formatName, supportsDataType, mergedWriteInfo(info)) } + } override def supportsDataType(dataType: DataType): Boolean = dataType match { case _: AtomicType => true diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/python/PythonCustomMetric.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/python/PythonCustomMetric.scala index bca1cbed7e70b..7551cd04f20f6 100644 --- 
a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/python/PythonCustomMetric.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/python/PythonCustomMetric.scala @@ -45,6 +45,8 @@ object PythonCustomMetric { // See also `UserDefinedPythonDataSource.createPythonMetrics`. PythonSQLMetrics.pythonSizeMetricsDesc.keys .map(_ -> new SQLMetric("size", -1)).toMap ++ + PythonSQLMetrics.pythonTimingMetricsDesc.keys + .map(_ -> new SQLMetric("timing", -1)).toMap ++ PythonSQLMetrics.pythonOtherMetricsDesc.keys .map(_ -> new SQLMetric("sum", -1)).toMap } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/python/UserDefinedPythonDataSource.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/python/UserDefinedPythonDataSource.scala index 241d8087fc3c2..b3fd8479bda0d 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/python/UserDefinedPythonDataSource.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/python/UserDefinedPythonDataSource.scala @@ -307,7 +307,7 @@ case class PythonDataSourceReadInfo( /** * Send information to a Python process to plan a Python data source read. * - * @param func an Python data source instance + * @param func a Python data source instance * @param inputSchema input schema to the data source read from its child plan * @param outputSchema output schema of the Python data source */ @@ -342,7 +342,7 @@ private class UserDefinedPythonDataSourceReadRunner( if (length == SpecialLengths.PYTHON_EXCEPTION_THROWN) { val msg = PythonWorkerUtils.readUTF(dataIn) throw QueryCompilationErrors.pythonDataSourceError( - action = "plan", tpe = "read", msg = msg) + action = "initialize", tpe = "reader", msg = msg) } // Receive the pickled 'read' function. 
@@ -354,7 +354,7 @@ private class UserDefinedPythonDataSourceReadRunner( if (numPartitions == SpecialLengths.PYTHON_EXCEPTION_THROWN) { val msg = PythonWorkerUtils.readUTF(dataIn) throw QueryCompilationErrors.pythonDataSourceError( - action = "plan", tpe = "read", msg = msg) + action = "generate", tpe = "read partitions", msg = msg) } for (_ <- 0 until numPartitions) { val pickledPartition: Array[Byte] = PythonWorkerUtils.readBytes(dataIn) @@ -420,7 +420,7 @@ private class UserDefinedPythonDataSourceWriteRunner( if (length == SpecialLengths.PYTHON_EXCEPTION_THROWN) { val msg = PythonWorkerUtils.readUTF(dataIn) throw QueryCompilationErrors.pythonDataSourceError( - action = "plan", tpe = "write", msg = msg) + action = "initialize", tpe = "writer", msg = msg) } // Receive the pickled data source write function. diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/state/StateDataSource.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/state/StateDataSource.scala index 2a9abfa5d6a50..4eebc19acee89 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/state/StateDataSource.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/state/StateDataSource.scala @@ -53,6 +53,10 @@ class StateDataSource extends TableProvider with DataSourceRegister with Logging private lazy val serializedHadoopConf = new SerializableConfiguration(hadoopConf) + // Seq of operator names that use state schema v3 and TWS related options. + // This Seq is used in checks before reading state schema files.
+ private val twsShortNameSeq = Seq("transformWithStateExec", "transformWithStateInPandasExec") + override def shortName(): String = "statestore" override def getTable( @@ -132,12 +136,11 @@ class StateDataSource extends TableProvider with DataSourceRegister with Logging private def runStateVarChecks( sourceOptions: StateSourceOptions, stateStoreMetadata: Array[StateMetadataTableEntry]): Unit = { - val twsShortName = "transformWithStateExec" if (sourceOptions.stateVarName.isDefined || sourceOptions.readRegisteredTimers) { // Perform checks for transformWithState operator in case state variable name is provided require(stateStoreMetadata.size == 1) val opMetadata = stateStoreMetadata.head - if (opMetadata.operatorName != twsShortName) { + if (!twsShortNameSeq.contains(opMetadata.operatorName)) { // if we are trying to query state source with state variable name, then the operator // should be transformWithState val errorMsg = "Providing state variable names is only supported with the " + @@ -178,7 +181,7 @@ class StateDataSource extends TableProvider with DataSourceRegister with Logging } else { // if the operator is transformWithState, then a state variable argument is mandatory if (stateStoreMetadata.size == 1 && - stateStoreMetadata.head.operatorName == twsShortName) { + twsShortNameSeq.contains(stateStoreMetadata.head.operatorName)) { throw StateDataSourceErrors.requiredOptionUnspecified("stateVarName") } } @@ -212,7 +215,7 @@ class StateDataSource extends TableProvider with DataSourceRegister with Logging // Read the schema file path from operator metadata version v2 onwards // for the transformWithState operator val oldSchemaFilePath = if (storeMetadata.length > 0 && storeMetadata.head.version == 2 - && storeMetadata.head.operatorName.contains("transformWithStateExec")) { + && twsShortNameSeq.exists(storeMetadata.head.operatorName.contains)) { val storeMetadataEntry = storeMetadata.head val operatorProperties = TransformWithStateOperatorProperties.fromJson( 
storeMetadataEntry.operatorPropertiesJson) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/text/TextTable.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/text/TextTable.scala index 87ae34532f88a..d8880b84c6211 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/text/TextTable.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/text/TextTable.scala @@ -39,10 +39,12 @@ case class TextTable( override def inferSchema(files: Seq[FileStatus]): Option[StructType] = Some(StructType(Array(StructField("value", StringType)))) - override def newWriteBuilder(info: LogicalWriteInfo): WriteBuilder = + override def newWriteBuilder(info: LogicalWriteInfo): WriteBuilder = { new WriteBuilder { - override def build(): Write = TextWrite(paths, formatName, supportsDataType, info) + override def build(): Write = + TextWrite(paths, formatName, supportsDataType, mergedWriteInfo(info)) } + } override def supportsDataType(dataType: DataType): Boolean = dataType == StringType diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/xml/XmlDataSource.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/xml/XmlDataSource.scala index 71f285e381745..8a179afb0f357 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/xml/XmlDataSource.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/xml/XmlDataSource.scala @@ -25,8 +25,10 @@ import scala.util.control.NonFatal import org.apache.commons.lang3.exception.ExceptionUtils import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileStatus, Path} +import org.apache.hadoop.hdfs.BlockMissingException import org.apache.hadoop.mapreduce.Job import org.apache.hadoop.mapreduce.lib.input.FileInputFormat +import org.apache.hadoop.security.AccessControlException import org.apache.spark.TaskContext import 
org.apache.spark.input.{PortableDataStream, StreamInputFormat} @@ -190,6 +192,7 @@ object MultiLineXmlDataSource extends XmlDataSource { Iterator.empty[String] case NonFatal(e) => ExceptionUtils.getRootCause(e) match { + case e @ (_ : AccessControlException | _ : BlockMissingException) => throw e case _: RuntimeException | _: IOException if parsedOptions.ignoreCorruptFiles => logWarning("Skipped the rest of the content in the corrupted file", e) Iterator.empty[String] diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/dynamicpruning/PlanDynamicPruningFilters.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/dynamicpruning/PlanDynamicPruningFilters.scala index 3a08b13be0134..6907061d67703 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/dynamicpruning/PlanDynamicPruningFilters.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/dynamicpruning/PlanDynamicPruningFilters.scala @@ -28,6 +28,7 @@ import org.apache.spark.sql.catalyst.trees.TreePattern.DYNAMIC_PRUNING_SUBQUERY import org.apache.spark.sql.execution.{InSubqueryExec, QueryExecution, SparkPlan, SubqueryBroadcastExec} import org.apache.spark.sql.execution.exchange.BroadcastExchangeExec import org.apache.spark.sql.execution.joins._ +import org.apache.spark.sql.internal.SQLConf /** * This planner rule aims at rewriting dynamic pruning predicates in order to reuse the @@ -36,6 +37,8 @@ import org.apache.spark.sql.execution.joins._ */ case class PlanDynamicPruningFilters(sparkSession: SparkSession) extends Rule[SparkPlan] { + override def conf: SQLConf = sparkSession.sessionState.conf + /** * Identify the shape in which keys of a given plan are broadcasted. 
*/ diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/exchange/EnsureRequirements.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/exchange/EnsureRequirements.scala index 8ec903f8e61da..de5c3aaa4fe4d 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/exchange/EnsureRequirements.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/exchange/EnsureRequirements.scala @@ -64,7 +64,7 @@ case class EnsureRequirements( // Ensure that the operator's children satisfy their output distribution requirements. var children = originalChildren.zip(requiredChildDistributions).map { case (child, distribution) if child.outputPartitioning.satisfies(distribution) => - child + ensureOrdering(child, distribution) case (child, BroadcastDistribution(mode)) => BroadcastExchangeExec(mode, child) case (child, distribution) => @@ -290,6 +290,23 @@ case class EnsureRequirements( } } + private def ensureOrdering(plan: SparkPlan, distribution: Distribution) = { + (plan.outputPartitioning, distribution) match { + case (p @ KeyGroupedPartitioning(expressions, _, partitionValues, _), + d @ OrderedDistribution(ordering)) if p.satisfies(d) => + val attrs = expressions.flatMap(_.collectLeaves()).map(_.asInstanceOf[Attribute]) + val partitionOrdering: Ordering[InternalRow] = { + RowOrdering.create(ordering, attrs) + } + // Sort 'commonPartitionValues' and use this mechanism to ensure BatchScan's + // output partitions are ordered + val sorted = partitionValues.sorted(partitionOrdering) + populateCommonPartitionInfo(plan, sorted.map((_, 1)), + None, None, applyPartialClustering = false, replicatePartitions = false) + case _ => plan + } + } + /** * Recursively reorders the join keys based on partitioning. It starts reordering the * join keys to match HashPartitioning on either side, followed by PartitioningCollection. 
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/python/ApplyInPandasWithStatePythonRunner.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/python/ApplyInPandasWithStatePythonRunner.scala index ae982f2f87f2e..d704638b85e8a 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/python/ApplyInPandasWithStatePythonRunner.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/python/ApplyInPandasWithStatePythonRunner.scala @@ -65,7 +65,8 @@ class ApplyInPandasWithStatePythonRunner( stateValueSchema: StructType, override val pythonMetrics: Map[String, SQLMetric], jobArtifactUUID: Option[String]) - extends BasePythonRunner[InType, OutType](funcs.map(_._1), evalType, argOffsets, jobArtifactUUID) + extends BasePythonRunner[InType, OutType]( + funcs.map(_._1), evalType, argOffsets, jobArtifactUUID, pythonMetrics) with PythonArrowInput[InType] with PythonArrowOutput[OutType] { diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/python/ArrowPythonRunner.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/python/ArrowPythonRunner.scala index a555d660ea1ac..579b496046852 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/python/ArrowPythonRunner.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/python/ArrowPythonRunner.scala @@ -38,7 +38,7 @@ abstract class BaseArrowPythonRunner( override val pythonMetrics: Map[String, SQLMetric], jobArtifactUUID: Option[String]) extends BasePythonRunner[Iterator[InternalRow], ColumnarBatch]( - funcs.map(_._1), evalType, argOffsets, jobArtifactUUID) + funcs.map(_._1), evalType, argOffsets, jobArtifactUUID, pythonMetrics) with BasicPythonArrowInput with BasicPythonArrowOutput { @@ -116,6 +116,9 @@ object ArrowPythonRunner { conf.pandasGroupedMapAssignColumnsByName.toString) val arrowSafeTypeCheck = Seq(SQLConf.PANDAS_ARROW_SAFE_TYPE_CONVERSION.key -> conf.arrowSafeTypeConversion.toString) - Map(timeZoneConf ++ pandasColsByName 
++ arrowSafeTypeCheck: _*) + val arrowAsyncParallelism = conf.pythonUDFArrowConcurrencyLevel.map(v => + Seq(SQLConf.PYTHON_UDF_ARROW_CONCURRENCY_LEVEL.key -> v.toString) + ).getOrElse(Seq.empty) + Map(timeZoneConf ++ pandasColsByName ++ arrowSafeTypeCheck ++ arrowAsyncParallelism: _*) } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/python/ArrowPythonUDTFRunner.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/python/ArrowPythonUDTFRunner.scala index f52b01b6646ac..99a9e706c6620 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/python/ArrowPythonUDTFRunner.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/python/ArrowPythonUDTFRunner.scala @@ -43,7 +43,7 @@ class ArrowPythonUDTFRunner( jobArtifactUUID: Option[String]) extends BasePythonRunner[Iterator[InternalRow], ColumnarBatch]( Seq(ChainedPythonFunctions(Seq(udtf.func))), evalType, Array(argMetas.map(_.offset)), - jobArtifactUUID) + jobArtifactUUID, pythonMetrics) with BasicPythonArrowInput with BasicPythonArrowOutput { diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/python/BatchEvalPythonExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/python/BatchEvalPythonExec.scala index e6958392cad48..28318a319b088 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/python/BatchEvalPythonExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/python/BatchEvalPythonExec.scala @@ -28,6 +28,7 @@ import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.execution.SparkPlan import org.apache.spark.sql.execution.metric.SQLMetric import org.apache.spark.sql.execution.python.EvalPythonExec.ArgumentMetadata +import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types.{StructField, StructType} /** @@ -39,10 +40,12 @@ case class BatchEvalPythonExec(udfs: Seq[PythonUDF], resultAttrs: Seq[Attribute] private[this] val jobArtifactUUID =
JobArtifactSet.getCurrentJobArtifactState.map(_.uuid) override protected def evaluatorFactory: EvalPythonEvaluatorFactory = { + val batchSize = conf.getConf(SQLConf.PYTHON_UDF_MAX_RECORDS_PER_BATCH) new BatchEvalPythonEvaluatorFactory( child.output, udfs, output, + batchSize, pythonMetrics, jobArtifactUUID, conf.pythonUDFProfiler) @@ -56,6 +59,7 @@ class BatchEvalPythonEvaluatorFactory( childOutput: Seq[Attribute], udfs: Seq[PythonUDF], output: Seq[Attribute], + batchSize: Int, pythonMetrics: Map[String, SQLMetric], jobArtifactUUID: Option[String], profiler: Option[String]) @@ -70,7 +74,7 @@ class BatchEvalPythonEvaluatorFactory( EvaluatePython.registerPicklers() // register pickler for Row // Input iterator to Python. - val inputIterator = BatchEvalPythonExec.getInputIterator(iter, schema) + val inputIterator = BatchEvalPythonExec.getInputIterator(iter, schema, batchSize) // Output iterator for results from Python. val outputIterator = @@ -107,7 +111,8 @@ class BatchEvalPythonEvaluatorFactory( object BatchEvalPythonExec { def getInputIterator( iter: Iterator[InternalRow], - schema: StructType): Iterator[Array[Byte]] = { + schema: StructType, + batchSize: Int): Iterator[Array[Byte]] = { val dataTypes = schema.map(_.dataType) val needConversion = dataTypes.exists(EvaluatePython.needConversionInPython) @@ -140,6 +145,6 @@ object BatchEvalPythonExec { } fields } - }.grouped(100).map(x => pickle.dumps(x.toArray)) + }.grouped(batchSize).map(x => pickle.dumps(x.toArray)) } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/python/BatchEvalPythonUDTFExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/python/BatchEvalPythonUDTFExec.scala index 9eebd4ea7e79c..c0dcb77817420 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/python/BatchEvalPythonUDTFExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/python/BatchEvalPythonUDTFExec.scala @@ -23,8 +23,9 @@ import scala.jdk.CollectionConverters._ import 
net.razorvine.pickle.Unpickler -import org.apache.spark.{JobArtifactSet, TaskContext} +import org.apache.spark.{JobArtifactSet, SparkEnv, TaskContext} import org.apache.spark.api.python.{ChainedPythonFunctions, PythonEvalType, PythonWorkerUtils} +import org.apache.spark.internal.config.BUFFER_SIZE import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.util.GenericArrayData @@ -63,7 +64,8 @@ case class BatchEvalPythonUDTFExec( EvaluatePython.registerPicklers() // register pickler for Row // Input iterator to Python. - val inputIterator = BatchEvalPythonExec.getInputIterator(iter, schema) + // For Python UDTF, we don't have a separate configuration for the batch size yet. + val inputIterator = BatchEvalPythonExec.getInputIterator(iter, schema, 100) // Output iterator for results from Python. val outputIterator = @@ -101,6 +103,9 @@ class PythonUDTFRunner( Seq((ChainedPythonFunctions(Seq(udtf.func)), udtf.resultId.id)), PythonEvalType.SQL_TABLE_UDF, Array(argMetas.map(_.offset)), pythonMetrics, jobArtifactUUID) { + // Overriding here to NOT use the same value of UDF config in UDTF. 
+ override val bufferSize: Int = SparkEnv.get.conf.get(BUFFER_SIZE) + override protected def writeUDF(dataOut: DataOutputStream): Unit = { PythonUDTFRunner.writeUDTF(dataOut, udtf, argMetas) } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/python/CoGroupedArrowPythonRunner.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/python/CoGroupedArrowPythonRunner.scala index 5670cad67e7b0..c5e86d010938d 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/python/CoGroupedArrowPythonRunner.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/python/CoGroupedArrowPythonRunner.scala @@ -51,7 +51,7 @@ class CoGroupedArrowPythonRunner( profiler: Option[String]) extends BasePythonRunner[ (Iterator[InternalRow], Iterator[InternalRow]), ColumnarBatch]( - funcs.map(_._1), evalType, argOffsets, jobArtifactUUID) + funcs.map(_._1), evalType, argOffsets, jobArtifactUUID, pythonMetrics) with BasicPythonArrowOutput { override val pythonExec: String = diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/python/PythonSQLMetrics.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/python/PythonSQLMetrics.scala index 4df6d821c014f..bd22739613eef 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/python/PythonSQLMetrics.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/python/PythonSQLMetrics.scala @@ -24,6 +24,8 @@ trait PythonSQLMetrics { self: SparkPlan => protected val pythonMetrics: Map[String, SQLMetric] = { PythonSQLMetrics.pythonSizeMetricsDesc.map { case (k, v) => k -> SQLMetrics.createSizeMetric(sparkContext, v) + } ++ PythonSQLMetrics.pythonTimingMetricsDesc.map { case (k, v) => + k -> SQLMetrics.createTimingMetric(sparkContext, v) } ++ PythonSQLMetrics.pythonOtherMetricsDesc.map { case (k, v) => k -> SQLMetrics.createMetric(sparkContext, v) } @@ -40,6 +42,14 @@ object PythonSQLMetrics { ) } + val pythonTimingMetricsDesc: Map[String, String] = { + Map( + 
"pythonBootTime" -> "total time to start Python workers", + "pythonInitTime" -> "total time to initialize Python workers", + "pythonTotalTime" -> "total time to run Python workers" + ) + } + val pythonOtherMetricsDesc: Map[String, String] = { Map("pythonNumRowsReceived" -> "number of output rows") } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/python/PythonUDFRunner.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/python/PythonUDFRunner.scala index 87ff5a0ec4333..167e1fd8b0f01 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/python/PythonUDFRunner.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/python/PythonUDFRunner.scala @@ -36,7 +36,7 @@ abstract class BasePythonUDFRunner( pythonMetrics: Map[String, SQLMetric], jobArtifactUUID: Option[String]) extends BasePythonRunner[Array[Byte], Array[Byte]]( - funcs.map(_._1), evalType, argOffsets, jobArtifactUUID) { + funcs.map(_._1), evalType, argOffsets, jobArtifactUUID, pythonMetrics) { override val pythonExec: String = SQLConf.get.pysparkWorkerPythonExecutable.getOrElse( @@ -46,6 +46,8 @@ abstract class BasePythonUDFRunner( override val faultHandlerEnabled: Boolean = SQLConf.get.pythonUDFWorkerFaulthandlerEnabled + override val bufferSize: Int = SQLConf.get.getConf(SQLConf.PYTHON_UDF_BUFFER_SIZE) + protected def writeUDF(dataOut: DataOutputStream): Unit protected override def newWriter( diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/python/TransformWithStateInPandasExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/python/TransformWithStateInPandasExec.scala index 7dd4d4647eeba..9b51822679a91 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/python/TransformWithStateInPandasExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/python/TransformWithStateInPandasExec.scala @@ -16,27 +16,32 @@ */ package org.apache.spark.sql.execution.python +import java.util.UUID + import 
scala.concurrent.duration.NANOSECONDS import org.apache.hadoop.conf.Configuration -import org.apache.spark.JobArtifactSet +import org.apache.spark.{JobArtifactSet, SparkException} import org.apache.spark.api.python.{ChainedPythonFunctions, PythonEvalType} +import org.apache.spark.broadcast.Broadcast import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder import org.apache.spark.sql.catalyst.expressions.{Ascending, Attribute, Expression, PythonUDF, SortOrder} +import org.apache.spark.sql.catalyst.plans.logical.ProcessingTime import org.apache.spark.sql.catalyst.plans.physical.Distribution import org.apache.spark.sql.catalyst.types.DataTypeUtils import org.apache.spark.sql.execution.{BinaryExecNode, CoGroupedIterator, SparkPlan} import org.apache.spark.sql.execution.metric.SQLMetric import org.apache.spark.sql.execution.python.PandasGroupUtils.{executePython, groupAndProject, resolveArgOffsets} -import org.apache.spark.sql.execution.streaming.{StatefulOperatorCustomMetric, StatefulOperatorCustomSumMetric, StatefulOperatorPartitioning, StatefulOperatorStateInfo, StatefulProcessorHandleImpl, StateStoreWriter, WatermarkSupport} +import org.apache.spark.sql.execution.streaming.{DriverStatefulProcessorHandleImpl, StatefulOperatorCustomMetric, StatefulOperatorCustomSumMetric, StatefulOperatorPartitioning, StatefulOperatorStateInfo, StatefulProcessorHandleImpl, StateStoreWriter, TransformWithStateMetadataUtils, TransformWithStateVariableInfo, WatermarkSupport} import org.apache.spark.sql.execution.streaming.StreamingSymmetricHashJoinHelper.StateStoreAwareZipPartitionsHelper -import org.apache.spark.sql.execution.streaming.state.{NoPrefixKeyStateEncoderSpec, StateSchemaValidationResult, StateStore, StateStoreConf, StateStoreId, StateStoreOps, StateStoreProviderId} +import org.apache.spark.sql.execution.streaming.state.{NoPrefixKeyStateEncoderSpec, OperatorStateMetadata, 
RocksDBStateStoreProvider, StateSchemaValidationResult, StateStore, StateStoreColFamilySchema, StateStoreConf, StateStoreId, StateStoreOps, StateStoreProvider, StateStoreProviderId} +import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.streaming.{OutputMode, TimeMode} import org.apache.spark.sql.types.{BinaryType, StructField, StructType} -import org.apache.spark.util.{CompletionIterator, SerializableConfiguration} +import org.apache.spark.util.{CompletionIterator, SerializableConfiguration, Utils} /** * Physical operator for executing @@ -52,8 +57,11 @@ import org.apache.spark.util.{CompletionIterator, SerializableConfiguration} * @param eventTimeWatermarkForLateEvents event time watermark for filtering late events * @param eventTimeWatermarkForEviction event time watermark for state eviction * @param child the physical plan for the underlying data + * @param isStreaming defines whether the query is streaming or batch + * @param hasInitialState defines whether the query has initial state * @param initialState the physical plan for the input initial state * @param initialStateGroupingAttrs grouping attributes for initial state + * @param initialStateSchema schema for initial state */ case class TransformWithStateInPandasExec( functionExpr: Expression, @@ -66,12 +74,17 @@ case class TransformWithStateInPandasExec( eventTimeWatermarkForLateEvents: Option[Long], eventTimeWatermarkForEviction: Option[Long], child: SparkPlan, + isStreaming: Boolean = true, hasInitialState: Boolean, initialState: SparkPlan, initialStateGroupingAttrs: Seq[Attribute], initialStateSchema: StructType) - extends BinaryExecNode with StateStoreWriter with WatermarkSupport { + extends BinaryExecNode + with StateStoreWriter + with WatermarkSupport + with TransformWithStateMetadataUtils { + override def shortName: String = "transformWithStateInPandasExec" private val pythonUDF = functionExpr.asInstanceOf[PythonUDF] private val pythonFunction = pythonUDF.func private val 
chainedFunc = @@ -80,6 +93,7 @@ case class TransformWithStateInPandasExec( private val sessionLocalTimeZone = conf.sessionLocalTimeZone private val pythonRunnerConf = ArrowPythonRunner.getPythonRunnerConfMap(conf) private[this] val jobArtifactUUID = JobArtifactSet.getCurrentJobArtifactState.map(_.uuid) + private val (dedupAttributes, argOffsets) = resolveArgOffsets(child.output, groupingAttributes) private val groupingKeyStructFields = groupingAttributes .map(a => StructField(a.name, a.dataType, a.nullable)) @@ -98,6 +112,22 @@ case class TransformWithStateInPandasExec( // Each state variable has its own schema, this is a dummy one. protected val schemaForValueRow: StructType = new StructType().add("value", BinaryType) + override def operatorStateMetadataVersion: Int = 2 + + override def getColFamilySchemas(): Map[String, StateStoreColFamilySchema] = { + driverProcessorHandle.getColumnFamilySchemas + } + + override def getStateVariableInfos(): Map[String, TransformWithStateVariableInfo] = { + driverProcessorHandle.getStateVariableInfos + } + + /** Metadata of this stateful operator and its states stores. + * Written during IncrementalExecution. `validateAndMaybeEvolveStateSchema` will initialize + * `columnFamilySchemas` and `stateVariableInfos` during `init()` call on driver. */ + private val driverProcessorHandle: DriverStatefulProcessorHandleImpl = + new DriverStatefulProcessorHandleImpl(timeMode, groupingKeyExprEncoder) + /** * Distribute by grouping attributes - We need the underlying data and the initial state data * to have the same grouping so that the data are co-located on the same task. 
@@ -118,12 +148,74 @@ case class TransformWithStateInPandasExec( groupingAttributes.map(SortOrder(_, Ascending)), initialStateGroupingAttrs.map(SortOrder(_, Ascending))) + override def operatorStateMetadata( + stateSchemaPaths: List[String]): OperatorStateMetadata = { + getOperatorStateMetadata(stateSchemaPaths, getStateInfo, shortName, timeMode, outputMode) + } + + override def validateNewMetadata( + oldOperatorMetadata: OperatorStateMetadata, + newOperatorMetadata: OperatorStateMetadata): Unit = { + validateNewMetadataForTWS(oldOperatorMetadata, newOperatorMetadata) + } + override def validateAndMaybeEvolveStateSchema( hadoopConf: Configuration, batchId: Long, stateSchemaVersion: Int): List[StateSchemaValidationResult] = { - // TODO(SPARK-49212): Implement schema evolution support - List.empty + // Start a python runner on driver, and execute pre-init UDF on the runner + val runner = new TransformWithStateInPandasPythonPreInitRunner( + pythonFunction, + "pyspark.sql.streaming.transform_with_state_driver_worker", + sessionLocalTimeZone, + groupingKeySchema, + driverProcessorHandle + ) + // runner initialization + runner.init() + try { + // execute UDF on the python runner + runner.process() + } catch { + case e: Throwable => + throw new SparkException("TransformWithStateInPandas driver worker " + + "exited unexpectedly (crashed)", e) + } + runner.stop() + + validateAndWriteStateSchema(hadoopConf, batchId, stateSchemaVersion, getStateInfo, + session, operatorStateMetadataVersion) + } + + override def shouldRunAnotherBatch(newInputWatermark: Long): Boolean = { + if (timeMode == ProcessingTime) { + // TODO SPARK-50180: check if we can return true only if actual timers are registered, + // or there is expired state + true + } else if (outputMode == OutputMode.Append || outputMode == OutputMode.Update) { + eventTimeWatermarkForEviction.isDefined && + newInputWatermark > eventTimeWatermarkForEviction.get + } else { + false + } + } + + /** + * Controls watermark 
propagation to downstream modes. If timeMode is + * ProcessingTime, the output rows cannot be interpreted in eventTime, hence + * this node will not propagate watermark in this timeMode. + * + * For timeMode EventTime, output watermark is same as input Watermark because + * transformWithState does not allow users to set the event time column to be + * earlier than the watermark. + */ + override def produceOutputWatermark(inputWatermarkMs: Long): Option[Long] = { + timeMode match { + case ProcessingTime => + None + case _ => + Some(inputWatermarkMs) + } } override def customStatefulOperatorMetrics: Seq[StatefulOperatorCustomMetric] = { @@ -156,18 +248,32 @@ case class TransformWithStateInPandasExec( metrics if (!hasInitialState) { - child.execute().mapPartitionsWithStateStore[InternalRow]( - getStateInfo, - schemaForKeyRow, - schemaForValueRow, - NoPrefixKeyStateEncoderSpec(schemaForKeyRow), - session.sqlContext.sessionState, - Some(session.sqlContext.streams.stateStoreCoordinator), - useColumnFamilies = true, - useMultipleValuesPerKey = true - ) { - case (store: StateStore, dataIterator: Iterator[InternalRow]) => - processDataWithPartition(store, dataIterator) + if (isStreaming) { + child.execute().mapPartitionsWithStateStore[InternalRow]( + getStateInfo, + schemaForKeyRow, + schemaForValueRow, + NoPrefixKeyStateEncoderSpec(schemaForKeyRow), + session.sqlContext.sessionState, + Some(session.sqlContext.streams.stateStoreCoordinator), + useColumnFamilies = true, + useMultipleValuesPerKey = true + ) { + case (store: StateStore, dataIterator: Iterator[InternalRow]) => + processDataWithPartition(store, dataIterator) + } + } else { + // If the query is running in batch mode, we need to create a new StateStore and instantiate + // a temp directory on the executors in mapPartitionsWithIndex. 
+ val hadoopConfBroadcast = sparkContext.broadcast( + new SerializableConfiguration(session.sessionState.newHadoopConf())) + child.execute().mapPartitionsWithIndex[InternalRow]( + (partitionId: Int, dataIterator: Iterator[InternalRow]) => { + initNewStateStoreAndProcessData(partitionId, hadoopConfBroadcast) { store => + processDataWithPartition(store, dataIterator) + } + } + ) } } else { val storeConf = new StateStoreConf(session.sqlContext.sessionState.conf) @@ -182,25 +288,71 @@ case class TransformWithStateInPandasExec( // The state store aware zip partitions will provide us with two iterators, // child data iterator and the initial state iterator per partition. case (partitionId, childDataIterator, initStateIterator) => - val stateStoreId = StateStoreId(stateInfo.get.checkpointLocation, - stateInfo.get.operatorId, partitionId) - val storeProviderId = StateStoreProviderId(stateStoreId, stateInfo.get.queryRunId) - val store = StateStore.get( - storeProviderId = storeProviderId, - keySchema = schemaForKeyRow, - valueSchema = schemaForValueRow, - NoPrefixKeyStateEncoderSpec(schemaForKeyRow), - version = stateInfo.get.storeVersion, - stateStoreCkptId = stateInfo.get.getStateStoreCkptId(partitionId).map(_.head), - useColumnFamilies = true, - storeConf = storeConf, - hadoopConf = hadoopConfBroadcast.value.value - ) - processDataWithPartition(store, childDataIterator, initStateIterator) + if (isStreaming) { + val stateStoreId = StateStoreId(stateInfo.get.checkpointLocation, + stateInfo.get.operatorId, partitionId) + val storeProviderId = StateStoreProviderId(stateStoreId, stateInfo.get.queryRunId) + val store = StateStore.get( + storeProviderId = storeProviderId, + keySchema = schemaForKeyRow, + valueSchema = schemaForValueRow, + NoPrefixKeyStateEncoderSpec(schemaForKeyRow), + version = stateInfo.get.storeVersion, + stateStoreCkptId = stateInfo.get.getStateStoreCkptId(partitionId).map(_.head), + useColumnFamilies = true, + storeConf = storeConf, + hadoopConf = 
hadoopConfBroadcast.value.value + ) + processDataWithPartition(store, childDataIterator, initStateIterator) + } else { + initNewStateStoreAndProcessData(partitionId, hadoopConfBroadcast) { store => + processDataWithPartition(store, childDataIterator, initStateIterator) + } + } } } } + /** + * Create a new StateStore for given partitionId and instantiate a temp directory + * on the executors. Process data and close the stateStore provider afterwards. + */ + private def initNewStateStoreAndProcessData( + partitionId: Int, + hadoopConfBroadcast: Broadcast[SerializableConfiguration]) + (f: StateStore => Iterator[InternalRow]): Iterator[InternalRow] = { + + val providerId = { + val tempDirPath = Utils.createTempDir().getAbsolutePath + new StateStoreProviderId( + StateStoreId(tempDirPath, 0, partitionId), getStateInfo.queryRunId) + } + + val sqlConf = new SQLConf() + sqlConf.setConfString(SQLConf.STATE_STORE_PROVIDER_CLASS.key, + classOf[RocksDBStateStoreProvider].getName) + val storeConf = new StateStoreConf(sqlConf) + + // Create StateStoreProvider for this partition + val stateStoreProvider = StateStoreProvider.createAndInit( + providerId, + schemaForKeyRow, + schemaForValueRow, + NoPrefixKeyStateEncoderSpec(schemaForKeyRow), + useColumnFamilies = true, + storeConf = storeConf, + hadoopConf = hadoopConfBroadcast.value.value, + useMultipleValuesPerKey = true) + + val store = stateStoreProvider.getStore(0, None) + val outputIterator = f(store) + CompletionIterator[InternalRow, Iterator[InternalRow]](outputIterator.iterator, { + stateStoreProvider.close() + }).map { row => + row + } + } + private def processDataWithPartition( store: StateStore, dataIterator: Iterator[InternalRow], @@ -213,12 +365,18 @@ case class TransformWithStateInPandasExec( val currentTimeNs = System.nanoTime val updatesStartTimeNs = currentTimeNs - val (dedupAttributes, argOffsets) = resolveArgOffsets(child.output, groupingAttributes) - val data = - groupAndProject(dataIterator, groupingAttributes, 
child.output, dedupAttributes) + // If timeout is based on event time, then filter late data based on watermark + val filteredIter = watermarkPredicateForDataForLateEvents match { + case Some(predicate) => + applyRemovingRowsOlderThanWatermark(dataIterator, predicate) + case _ => + dataIterator + } + + val data = groupAndProject(filteredIter, groupingAttributes, child.output, dedupAttributes) val processorHandle = new StatefulProcessorHandleImpl(store, getStateInfo.queryRunId, - groupingKeyExprEncoder, timeMode, isStreaming = true, batchTimestampMs, metrics) + groupingKeyExprEncoder, timeMode, isStreaming, batchTimestampMs, metrics) val outputIterator = if (!hasInitialState) { val runner = new TransformWithStateInPandasPythonRunner( @@ -270,8 +428,12 @@ case class TransformWithStateInPandasExec( // by the upstream (consumer) operators in addition to the processing in this operator. allUpdatesTimeMs += NANOSECONDS.toMillis(System.nanoTime - updatesStartTimeNs) commitTimeMs += timeTakenMs { - processorHandle.doTtlCleanup() - store.commit() + if (isStreaming) { + processorHandle.doTtlCleanup() + store.commit() + } else { + store.abort() + } } setStoreMetrics(store) setOperatorMetrics() @@ -293,3 +455,48 @@ case class TransformWithStateInPandasExec( override def right: SparkPlan = initialState } + +// scalastyle:off argcount +object TransformWithStateInPandasExec { + + // Plan logical transformWithStateInPandas for batch queries + def generateSparkPlanForBatchQueries( + functionExpr: Expression, + groupingAttributes: Seq[Attribute], + output: Seq[Attribute], + outputMode: OutputMode, + timeMode: TimeMode, + child: SparkPlan, + hasInitialState: Boolean = false, + initialState: SparkPlan, + initialStateGroupingAttrs: Seq[Attribute], + initialStateSchema: StructType): SparkPlan = { + val shufflePartitions = child.session.sessionState.conf.numShufflePartitions + val statefulOperatorStateInfo = StatefulOperatorStateInfo( + checkpointLocation = "", // empty 
checkpointLocation will be populated in doExecute + queryRunId = UUID.randomUUID(), + operatorId = 0, + storeVersion = 0, + numPartitions = shufflePartitions, + stateStoreCkptIds = None + ) + + new TransformWithStateInPandasExec( + functionExpr, + groupingAttributes, + output, + outputMode, + timeMode, + Some(statefulOperatorStateInfo), + Some(System.currentTimeMillis), + None, + None, + child, + isStreaming = false, + hasInitialState, + initialState, + initialStateGroupingAttrs, + initialStateSchema) + } +} +// scalastyle:on argcount diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/python/TransformWithStateInPandasPythonRunner.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/python/TransformWithStateInPandasPythonRunner.scala index c5980012124fe..f415ae2543d34 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/python/TransformWithStateInPandasPythonRunner.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/python/TransformWithStateInPandasPythonRunner.scala @@ -17,7 +17,7 @@ package org.apache.spark.sql.execution.python -import java.io.DataOutputStream +import java.io.{DataInputStream, DataOutputStream} import java.net.ServerSocket import scala.concurrent.ExecutionContext @@ -25,13 +25,13 @@ import scala.concurrent.ExecutionContext import org.apache.arrow.vector.VectorSchemaRoot import org.apache.arrow.vector.ipc.ArrowStreamWriter -import org.apache.spark.TaskContext -import org.apache.spark.api.python.{BasePythonRunner, ChainedPythonFunctions, PythonRDD} +import org.apache.spark.{SparkException, TaskContext} +import org.apache.spark.api.python.{BasePythonRunner, ChainedPythonFunctions, PythonFunction, PythonRDD, PythonWorkerUtils, StreamingPythonRunner} import org.apache.spark.internal.Logging import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.execution.metric.SQLMetric import org.apache.spark.sql.execution.python.TransformWithStateInPandasPythonRunner.{GroupedInType, 
InType} -import org.apache.spark.sql.execution.streaming.StatefulProcessorHandleImpl +import org.apache.spark.sql.execution.streaming.{DriverStatefulProcessorHandleImpl, StatefulProcessorHandleImpl} import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types.StructType import org.apache.spark.sql.vectorized.ColumnarBatch @@ -173,16 +173,16 @@ abstract class TransformWithStateInPandasPythonBaseRunner[I]( groupingKeySchema: StructType, batchTimestampMs: Option[Long], eventTimeWatermarkForEviction: Option[Long]) - extends BasePythonRunner[I, ColumnarBatch](funcs.map(_._1), evalType, argOffsets, jobArtifactUUID) + extends BasePythonRunner[I, ColumnarBatch]( + funcs.map(_._1), evalType, argOffsets, jobArtifactUUID, pythonMetrics) with PythonArrowInput[I] with BasicPythonArrowOutput + with TransformWithStateInPandasPythonRunnerUtils with Logging { protected val sqlConf = SQLConf.get protected val arrowMaxRecordsPerBatch = sqlConf.arrowMaxRecordsPerBatch - private var stateServerSocketPort: Int = 0 - override protected val workerConf: Map[String, String] = initialWorkerConf + (SQLConf.ARROW_EXECUTION_MAX_RECORDS_PER_BATCH.key -> arrowMaxRecordsPerBatch.toString) @@ -204,21 +204,7 @@ abstract class TransformWithStateInPandasPythonBaseRunner[I]( inputIterator: Iterator[I], partitionIndex: Int, context: TaskContext): Iterator[ColumnarBatch] = { - var stateServerSocket: ServerSocket = null - var failed = false - try { - stateServerSocket = new ServerSocket( /* port = */ 0, - /* backlog = */ 1) - stateServerSocketPort = stateServerSocket.getLocalPort - } catch { - case e: Throwable => - failed = true - throw e - } finally { - if (failed) { - closeServerSocketChannelSilently(stateServerSocket) - } - } + initStateServer() val executor = ThreadUtils.newDaemonSingleThreadExecutor("stateConnectionListenerThread") val executionContext = ExecutionContext.fromExecutor(executor) @@ -238,7 +224,108 @@ abstract class TransformWithStateInPandasPythonBaseRunner[I]( 
super.compute(inputIterator, partitionIndex, context) } - private def closeServerSocketChannelSilently(stateServerSocket: ServerSocket): Unit = { + override protected def writeUDF(dataOut: DataOutputStream): Unit = { + PythonUDFRunner.writeUDFs(dataOut, funcs, argOffsets, None) + } +} + +/** + * TransformWithStateInPandas driver side Python runner. Similar as executor side runner, + * will start a new daemon thread on the Python runner to run state server. + */ +class TransformWithStateInPandasPythonPreInitRunner( + func: PythonFunction, + workerModule: String, + timeZoneId: String, + groupingKeySchema: StructType, + processorHandleImpl: DriverStatefulProcessorHandleImpl) + extends StreamingPythonRunner(func, "", "", workerModule) + with TransformWithStateInPandasPythonRunnerUtils + with Logging { + protected val sqlConf = SQLConf.get + + private var dataOut: DataOutputStream = _ + private var dataIn: DataInputStream = _ + + private var daemonThread: Thread = _ + + override def init(): (DataOutputStream, DataInputStream) = { + val result = super.init() + dataOut = result._1 + dataIn = result._2 + + // start state server, update socket port + startStateServer() + (dataOut, dataIn) + } + + def process(): Unit = { + // Also write the port number for state server + dataOut.writeInt(stateServerSocketPort) + PythonWorkerUtils.writeUTF(groupingKeySchema.json, dataOut) + dataOut.flush() + + val resFromPython = dataIn.readInt() + if (resFromPython != 0) { + val errMessage = PythonWorkerUtils.readUTF(dataIn) + throw streamingPythonRunnerInitializationFailure(resFromPython, errMessage) + } + } + + override def stop(): Unit = { + super.stop() + closeServerSocketChannelSilently(stateServerSocket) + daemonThread.interrupt() + } + + private def startStateServer(): Unit = { + initStateServer() + + daemonThread = new Thread { + override def run(): Unit = { + try { + new TransformWithStateInPandasStateServer(stateServerSocket, processorHandleImpl, + groupingKeySchema, timeZoneId, 
errorOnDuplicatedFieldNames = true, + largeVarTypes = sqlConf.arrowUseLargeVarTypes, + sqlConf.arrowTransformWithStateInPandasMaxRecordsPerBatch).run() + } catch { + case e: Exception => + throw new SparkException("TransformWithStateInPandas state server " + + "daemon thread exited unexpectedly (crashed)", e) + } + } + } + daemonThread.setDaemon(true) + daemonThread.setName("stateConnectionListenerThread") + daemonThread.start() + } +} + +/** + * TransformWithStateInPandas Python runner utils functions for handling a state server + * in a new daemon thread. + */ +trait TransformWithStateInPandasPythonRunnerUtils extends Logging { + protected var stateServerSocketPort: Int = 0 + protected var stateServerSocket: ServerSocket = null + protected def initStateServer(): Unit = { + var failed = false + try { + stateServerSocket = new ServerSocket(/* port = */ 0, + /* backlog = */ 1) + stateServerSocketPort = stateServerSocket.getLocalPort + } catch { + case e: Throwable => + failed = true + throw e + } finally { + if (failed) { + closeServerSocketChannelSilently(stateServerSocket) + } + } + } + + protected def closeServerSocketChannelSilently(stateServerSocket: ServerSocket): Unit = { try { logInfo(log"closing the state server socket") stateServerSocket.close() @@ -247,10 +334,6 @@ abstract class TransformWithStateInPandasPythonBaseRunner[I]( logError(log"failed to close state server socket", e) } } - - override protected def writeUDF(dataOut: DataOutputStream): Unit = { - PythonUDFRunner.writeUDFs(dataOut, funcs, argOffsets, None) - } } object TransformWithStateInPandasPythonRunner { diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/python/TransformWithStateInPandasStateServer.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/python/TransformWithStateInPandasStateServer.scala index 0373c8607ff2c..e37e4266b46b8 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/python/TransformWithStateInPandasStateServer.scala +++ 
b/sql/core/src/main/scala/org/apache/spark/sql/execution/python/TransformWithStateInPandasStateServer.scala @@ -33,8 +33,9 @@ import org.apache.spark.sql.api.python.PythonSQLUtils import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder import org.apache.spark.sql.catalyst.expressions.GenericInternalRow -import org.apache.spark.sql.execution.streaming.{ImplicitGroupingKeyTracker, StatefulProcessorHandleImpl, StatefulProcessorHandleState, StateVariableType} -import org.apache.spark.sql.execution.streaming.state.StateMessage.{HandleState, ImplicitGroupingKeyRequest, ListStateCall, MapStateCall, StatefulProcessorCall, StateRequest, StateResponse, StateResponseWithLongTypeVal, StateVariableRequest, TimerRequest, TimerStateCallCommand, TimerValueRequest, ValueStateCall} +import org.apache.spark.sql.catalyst.parser.CatalystSqlParser +import org.apache.spark.sql.execution.streaming.{ImplicitGroupingKeyTracker, StatefulProcessorHandleImpl, StatefulProcessorHandleImplBase, StatefulProcessorHandleState, StateVariableType} +import org.apache.spark.sql.execution.streaming.state.StateMessage.{HandleState, ImplicitGroupingKeyRequest, ListStateCall, MapStateCall, StatefulProcessorCall, StateRequest, StateResponse, StateResponseWithLongTypeVal, StateResponseWithStringTypeVal, StateVariableRequest, TimerRequest, TimerStateCallCommand, TimerValueRequest, UtilsRequest, ValueStateCall} import org.apache.spark.sql.streaming.{ListState, MapState, TTLConfig, ValueState} import org.apache.spark.sql.types.{BinaryType, LongType, StructField, StructType} import org.apache.spark.sql.util.ArrowUtils @@ -52,7 +53,7 @@ import org.apache.spark.util.Utils */ class TransformWithStateInPandasStateServer( stateServerSocket: ServerSocket, - statefulProcessorHandle: StatefulProcessorHandleImpl, + statefulProcessorHandle: StatefulProcessorHandleImplBase, groupingKeySchema: StructType, timeZoneId: String, errorOnDuplicatedFieldNames: Boolean, @@ 
-120,6 +121,8 @@ class TransformWithStateInPandasStateServer( } /** Timer related class variables */ + // An iterator to store all expired timer info. This is meant to be consumed only once per + // partition. This should be called after finishing handling all input rows. private var expiryTimestampIter: Option[Iterator[(Any, Long)]] = if (expiryTimerIterForTest != null) { Option(expiryTimerIterForTest) @@ -156,6 +159,11 @@ class TransformWithStateInPandasStateServer( logWarning(log"No more data to read from the socket") statefulProcessorHandle.setHandleState(StatefulProcessorHandleState.CLOSED) return + case _: InterruptedException => + logInfo(log"Thread interrupted, shutting down state server") + Thread.currentThread().interrupt() + statefulProcessorHandle.setHandleState(StatefulProcessorHandleState.CLOSED) + return case e: Exception => logError(log"Error reading message: ${MDC(LogKeys.ERROR, e.getMessage)}", e) sendResponse(1, e.getMessage) @@ -184,6 +192,19 @@ class TransformWithStateInPandasStateServer( handleStateVariableRequest(message.getStateVariableRequest) case StateRequest.MethodCase.TIMERREQUEST => handleTimerRequest(message.getTimerRequest) + case StateRequest.MethodCase.UTILSREQUEST => + handleUtilsRequest(message.getUtilsRequest) + case _ => + throw new IllegalArgumentException("Invalid method call") + } + } + + private[sql] def handleUtilsRequest(message: UtilsRequest): Unit = { + message.getMethodCase match { + case UtilsRequest.MethodCase.PARSESTRINGSCHEMA => + val stringSchema = message.getParseStringSchema.getSchema + val schema = CatalystSqlParser.parseTableSchema(stringSchema) + sendResponseWithStringVal(0, null, schema.json) case _ => throw new IllegalArgumentException("Invalid method call") } @@ -212,11 +233,13 @@ class TransformWithStateInPandasStateServer( // API and it will only be used by `group_ops` once per partition, we won't // need to worry about different function calls will interleaved and hence // this implementation is safe + 
assert(statefulProcessorHandle.isInstanceOf[StatefulProcessorHandleImpl]) val expiryRequest = message.getExpiryTimerRequest() val expiryTimestamp = expiryRequest.getExpiryTimestampMs if (!expiryTimestampIter.isDefined) { expiryTimestampIter = - Option(statefulProcessorHandle.getExpiredTimers(expiryTimestamp)) + Option(statefulProcessorHandle + .asInstanceOf[StatefulProcessorHandleImpl].getExpiredTimers(expiryTimestamp)) } // expiryTimestampIter could be None in the TWSPandasServerSuite if (!expiryTimestampIter.isDefined || !expiryTimestampIter.get.hasNext) { @@ -265,6 +288,9 @@ class TransformWithStateInPandasStateServer( case StatefulProcessorCall.MethodCase.SETHANDLESTATE => val requestedState = message.getSetHandleState.getState requestedState match { + case HandleState.PRE_INIT => + logInfo(log"set handle state to Pre-init") + statefulProcessorHandle.setHandleState(StatefulProcessorHandleState.PRE_INIT) case HandleState.CREATED => logInfo(log"set handle state to Created") statefulProcessorHandle.setHandleState(StatefulProcessorHandleState.CREATED) @@ -688,6 +714,22 @@ class TransformWithStateInPandasStateServer( outputStream.write(responseMessageBytes) } + def sendResponseWithStringVal( + status: Int, + errorMessage: String = null, + stringVal: String): Unit = { + val responseMessageBuilder = StateResponseWithStringTypeVal.newBuilder().setStatusCode(status) + if (status != 0 && errorMessage != null) { + responseMessageBuilder.setErrorMessage(errorMessage) + } + responseMessageBuilder.setValue(stringVal) + val responseMessage = responseMessageBuilder.build() + val responseMessageBytes = responseMessage.toByteArray + val byteLength = responseMessageBytes.length + outputStream.writeInt(byteLength) + outputStream.write(responseMessageBytes) + } + def sendIteratorAsArrowBatches[T]( iter: Iterator[T], outputSchema: StructType, diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/python/UserDefinedPythonFunction.scala 
b/sql/core/src/main/scala/org/apache/spark/sql/execution/python/UserDefinedPythonFunction.scala index ea1f5e6ae1340..575e3d4072b8c 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/python/UserDefinedPythonFunction.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/python/UserDefinedPythonFunction.scala @@ -24,12 +24,13 @@ import scala.collection.mutable.ArrayBuffer import net.razorvine.pickle.Pickler import org.apache.spark.api.python.{PythonEvalType, PythonFunction, PythonWorkerUtils, SpecialLengths} -import org.apache.spark.sql.{Column, DataFrame, Dataset, SparkSession} +import org.apache.spark.sql.{Column, DataFrame, Dataset, SparkSession, TableArg, TableValuedFunctionArgument} import org.apache.spark.sql.catalyst.expressions.{Alias, Ascending, Descending, Expression, FunctionTableSubqueryArgumentExpression, NamedArgumentExpression, NullsFirst, NullsLast, PythonUDAF, PythonUDF, PythonUDTF, PythonUDTFAnalyzeResult, PythonUDTFSelectedExpression, SortOrder, UnresolvedPolymorphicPythonUDTF} import org.apache.spark.sql.catalyst.parser.ParserInterface import org.apache.spark.sql.catalyst.plans.logical.{Generate, LogicalPlan, NamedParametersSupport, OneRowRelation} +import org.apache.spark.sql.classic.ClassicConversions._ import org.apache.spark.sql.errors.QueryCompilationErrors -import org.apache.spark.sql.internal.ExpressionUtils.{column, expression} +import org.apache.spark.sql.internal.ExpressionUtils.expression import org.apache.spark.sql.types.{DataType, StructType} /** @@ -75,10 +76,10 @@ case class UserDefinedPythonFunction( * Returns a [[Column]] that will evaluate the UDF expression with the given input. */ def fromUDFExpr(expr: Expression): Column = { - expr match { + Column(expr match { case udaf: PythonUDAF => udaf.toAggregateExpression() case _ => expr - } + }) } } @@ -159,8 +160,16 @@ case class UserDefinedPythonTableFunction( } /** Returns a [[DataFrame]] that will evaluate to calling this UDTF with the given input. 
*/ - def apply(session: SparkSession, exprs: Column*): DataFrame = { - val udtf = builder(exprs.map(session.expression), session.sessionState.sqlParser) + def apply(session: SparkSession, exprs: TableValuedFunctionArgument*): DataFrame = { + val parser = session.sessionState.sqlParser + val expressions = exprs.map { + case col: Column => session.expression(col) + case tableArg: TableArg => tableArg.expression + case other => throw new IllegalArgumentException( + s"Unsupported argument type: ${other.getClass.getName}" + ) + } + val udtf = builder(expressions, parser) Dataset.ofRows(session, udtf) } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/stat/FrequentItems.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/stat/FrequentItems.scala index 148766f9d0026..221ca17ddf19d 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/stat/FrequentItems.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/stat/FrequentItems.scala @@ -22,13 +22,13 @@ import java.io.{ByteArrayInputStream, ByteArrayOutputStream, DataInputStream, Da import scala.collection.mutable import org.apache.spark.internal.Logging -import org.apache.spark.sql.{functions, DataFrame} +import org.apache.spark.sql.{functions, Column, DataFrame} import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{Expression, UnsafeProjection, UnsafeRow} import org.apache.spark.sql.catalyst.expressions.aggregate.{ImperativeAggregate, TypedImperativeAggregate} import org.apache.spark.sql.catalyst.trees.UnaryLike import org.apache.spark.sql.catalyst.util.GenericArrayData -import org.apache.spark.sql.internal.ExpressionUtils.{column, expression} +import org.apache.spark.sql.classic.ClassicConversions._ import org.apache.spark.sql.types._ import org.apache.spark.util.Utils @@ -52,13 +52,15 @@ object FrequentItems extends Logging { df: DataFrame, cols: Seq[String], support: Double): DataFrame = { + import 
df.sparkSession.expression require(support >= 1e-4 && support <= 1.0, s"Support must be in [1e-4, 1], but got $support.") // number of max items to keep counts for val sizeOfMap = (1 / support).toInt val frequentItemCols = cols.map { col => - column(new CollectFrequentItems(functions.col(col), sizeOfMap)).as(s"${col}_freqItems") + Column(new CollectFrequentItems(expression(functions.col(col)), sizeOfMap)) + .as(s"${col}_freqItems") } df.select(frequentItemCols: _*) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/IncrementalExecution.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/IncrementalExecution.scala index 2a7e9818aedd9..511f4421e16ab 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/IncrementalExecution.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/IncrementalExecution.scala @@ -214,7 +214,7 @@ class IncrementalExecution( // filepath, and write this path out in the OperatorStateMetadata file case statefulOp: StatefulOperator if isFirstBatch => val stateSchemaVersion = statefulOp match { - case _: TransformWithStateExec => + case _: TransformWithStateExec | _: TransformWithStateInPandasExec => sparkSession.sessionState.conf. getConf(SQLConf.STREAMING_TRANSFORM_WITH_STATE_OP_STATE_SCHEMA_VERSION) case _ => STATE_SCHEMA_DEFAULT_VERSION @@ -439,6 +439,23 @@ class IncrementalExecution( eventTimeWatermarkForEviction = iwEviction) )) + // UpdateEventTimeColumnExec is used to tag the eventTime column, and validate + // emitted rows adhere to watermark in the output of transformWithStateInp. + // Hence, this node shares the same watermark value as TransformWithStateInPandasExec. + // This is the same as above in TransformWithStateExec. + // The only difference is TransformWithStateInPandasExec is analysed slightly different + // with no SerializeFromObjectExec wrapper. 
+ case UpdateEventTimeColumnExec(eventTime, delay, None, t: TransformWithStateInPandasExec) + if t.stateInfo.isDefined => + val stateInfo = t.stateInfo.get + val iwLateEvents = inputWatermarkForLateEvents(stateInfo) + val iwEviction = inputWatermarkForEviction(stateInfo) + + UpdateEventTimeColumnExec(eventTime, delay, iwLateEvents, + t.copy( + eventTimeWatermarkForLateEvents = iwLateEvents, + eventTimeWatermarkForEviction = iwEviction) + ) case t: TransformWithStateExec if t.stateInfo.isDefined => t.copy( diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/ListStateImplWithTTL.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/ListStateImplWithTTL.scala index 4c8dd6a193c25..4e32b80578155 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/ListStateImplWithTTL.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/ListStateImplWithTTL.scala @@ -22,7 +22,6 @@ import org.apache.spark.sql.execution.metric.SQLMetric import org.apache.spark.sql.execution.streaming.TransformWithStateKeyValueRowSchemaUtils._ import org.apache.spark.sql.execution.streaming.state.{NoPrefixKeyStateEncoderSpec, StateStore, StateStoreErrors} import org.apache.spark.sql.streaming.{ListState, TTLConfig} -import org.apache.spark.sql.types.StructType import org.apache.spark.util.NextIterator /** @@ -45,21 +44,13 @@ class ListStateImplWithTTL[S]( valEncoder: ExpressionEncoder[Any], ttlConfig: TTLConfig, batchTimestampMs: Long, - metrics: Map[String, SQLMetric] = Map.empty) - extends SingleKeyTTLStateImpl(stateName, store, keyExprEnc, batchTimestampMs) - with ListStateMetricsImpl - with ListState[S] { - - override def stateStore: StateStore = store - override def baseStateName: String = stateName - override def exprEncSchema: StructType = keyExprEnc.schema + metrics: Map[String, SQLMetric]) + extends OneToManyTTLState( + stateName, store, keyExprEnc.schema, ttlConfig, batchTimestampMs, metrics) with 
ListState[S] { private lazy val stateTypesEncoder = StateTypesEncoder(keyExprEnc, valEncoder, stateName, hasTtl = true) - private lazy val ttlExpirationMs = - StateTTL.calculateExpirationTimeForDuration(ttlConfig.ttlDuration, batchTimestampMs) - initialize() private def initialize(): Unit = { @@ -106,35 +97,22 @@ class ListStateImplWithTTL[S]( validateNewState(newState) val encodedKey = stateTypesEncoder.encodeGroupingKey() - var isFirst = true - var entryCount = 0L - TWSMetricsUtils.resetMetric(metrics, "numUpdatedStateRows") - - newState.foreach { v => - val encodedValue = stateTypesEncoder.encodeValue(v, ttlExpirationMs) - if (isFirst) { - store.put(encodedKey, encodedValue, stateName) - isFirst = false - } else { - store.merge(encodedKey, encodedValue, stateName) - } - entryCount += 1 - TWSMetricsUtils.incrementMetric(metrics, "numUpdatedStateRows") + val newStateUnsafeRows = newState.iterator.map { v => + stateTypesEncoder.encodeValue(v, ttlExpirationMs) } - upsertTTLForStateKey(encodedKey) - updateEntryCount(encodedKey, entryCount) + + updatePrimaryAndSecondaryIndices(true, encodedKey, newStateUnsafeRows, ttlExpirationMs) } /** Append an entry to the list. */ override def appendValue(newState: S): Unit = { StateStoreErrors.requireNonNullStateValue(newState, stateName) + val encodedKey = stateTypesEncoder.encodeGroupingKey() - val entryCount = getEntryCount(encodedKey) - store.merge(encodedKey, - stateTypesEncoder.encodeValue(newState, ttlExpirationMs), stateName) - TWSMetricsUtils.incrementMetric(metrics, "numUpdatedStateRows") - upsertTTLForStateKey(encodedKey) - updateEntryCount(encodedKey, entryCount + 1) + val newStateUnsafeRow = stateTypesEncoder.encodeValue(newState, ttlExpirationMs) + + updatePrimaryAndSecondaryIndices(false, encodedKey, + Iterator.single(newStateUnsafeRow), ttlExpirationMs) } /** Append an entire list to the existing value. 
*/ @@ -142,25 +120,21 @@ class ListStateImplWithTTL[S]( validateNewState(newState) val encodedKey = stateTypesEncoder.encodeGroupingKey() - var entryCount = getEntryCount(encodedKey) - newState.foreach { v => - val encodedValue = stateTypesEncoder.encodeValue(v, ttlExpirationMs) - store.merge(encodedKey, encodedValue, stateName) - entryCount += 1 - TWSMetricsUtils.incrementMetric(metrics, "numUpdatedStateRows") + // The UnsafeRows created here are reused: we do NOT copy them. As a result, + // this iterator must only be used lazily, and it should never be materialized, + // unless you call newStateUnsafeRows.map(_.copy()). + val newStateUnsafeRows = newState.iterator.map { v => + stateTypesEncoder.encodeValue(v, ttlExpirationMs) } - upsertTTLForStateKey(encodedKey) - updateEntryCount(encodedKey, entryCount) + + updatePrimaryAndSecondaryIndices(false, encodedKey, + newStateUnsafeRows, ttlExpirationMs) } /** Remove this state. */ override def clear(): Unit = { - val encodedKey = stateTypesEncoder.encodeGroupingKey() - store.remove(encodedKey, stateName) - val entryCount = getEntryCount(encodedKey) - TWSMetricsUtils.incrementMetric(metrics, "numRemovedStateRows", entryCount) - removeEntryCount(encodedKey) - clearTTLState() + val groupingKey = stateTypesEncoder.encodeGroupingKey() + clearAllStateForElementKey(groupingKey) } private def validateNewState(newState: Array[S]): Unit = { @@ -175,36 +149,41 @@ class ListStateImplWithTTL[S]( /** * Loops through all the values associated with the grouping key, and removes * the expired elements from the list. - * @param groupingKey grouping key for which cleanup should be performed. + * @param elementKey grouping key for which cleanup should be performed. 
*/ - override def clearIfExpired(groupingKey: UnsafeRow): Long = { + override def clearExpiredValues(elementKey: UnsafeRow): ValueExpirationResult = { var numValuesExpired = 0L - val unsafeRowValuesIterator = store.valuesIterator(groupingKey, stateName) + val unsafeRowValuesIterator = store.valuesIterator(elementKey, stateName) // We clear the list, and use the iterator to put back all of the non-expired values - store.remove(groupingKey, stateName) - removeEntryCount(groupingKey) + store.remove(elementKey, stateName) + + var newMinExpirationMsOpt: Option[Long] = None var isFirst = true - var entryCount = 0L unsafeRowValuesIterator.foreach { encodedValue => if (!stateTypesEncoder.isExpired(encodedValue, batchTimestampMs)) { if (isFirst) { - store.put(groupingKey, encodedValue, stateName) isFirst = false + store.put(elementKey, encodedValue, stateName) } else { - store.merge(groupingKey, encodedValue, stateName) + store.merge(elementKey, encodedValue, stateName) + } + + // If it is not expired, it needs to be reinserted (either via put or merge), but + // it also has an expiration time that might be the new minimum. 
+ val currentExpirationMs = stateTypesEncoder.decodeTtlExpirationMs(encodedValue) + + newMinExpirationMsOpt = newMinExpirationMsOpt match { + case Some(minExpirationMs) => + Some(math.min(minExpirationMs, currentExpirationMs.get)) + case None => + Some(currentExpirationMs.get) } - entryCount += 1 } else { numValuesExpired += 1 } } - updateEntryCount(groupingKey, entryCount) - TWSMetricsUtils.incrementMetric(metrics, "numRemovedStateRows", numValuesExpired) - numValuesExpired - } - private def upsertTTLForStateKey(encodedGroupingKey: UnsafeRow): Unit = { - upsertTTLForStateKey(ttlExpirationMs, encodedGroupingKey) + ValueExpirationResult(numValuesExpired, newMinExpirationMsOpt) } /* @@ -238,11 +217,23 @@ class ListStateImplWithTTL[S]( } } + private[sql] def getMinValues(): Iterator[Long] = { + val groupingKey = stateTypesEncoder.encodeGroupingKey() + minIndexIterator() + .filter(_._1 == groupingKey) + .map(_._2) + } + /** - * Get all ttl values stored in ttl state for current implicit - * grouping key. + * Get the TTL value stored in TTL state for the current implicit grouping key, + * if it exists. */ - private[sql] def getValuesInTTLState(): Iterator[Long] = { - getValuesInTTLState(stateTypesEncoder.encodeGroupingKey()) + private[sql] def getValueInTTLState(): Option[Long] = { + val groupingKey = stateTypesEncoder.encodeGroupingKey() + val ttlRowsForGroupingKey = getTTLRows().filter(_.elementKey == groupingKey).toSeq + + assert(ttlRowsForGroupingKey.size <= 1, "Multiple TTLRows found for grouping key " + + s"$groupingKey. Expected at most 1. 
Found: ${ttlRowsForGroupingKey.mkString(", ")}.") + ttlRowsForGroupingKey.headOption.map(_.expirationMs) } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/MapStateImplWithTTL.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/MapStateImplWithTTL.scala index 19704b6d1bd59..64581006555e7 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/MapStateImplWithTTL.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/MapStateImplWithTTL.scala @@ -18,7 +18,6 @@ package org.apache.spark.sql.execution.streaming import org.apache.spark.internal.Logging import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder -import org.apache.spark.sql.catalyst.expressions.UnsafeRow import org.apache.spark.sql.execution.metric.SQLMetric import org.apache.spark.sql.execution.streaming.TransformWithStateKeyValueRowSchemaUtils._ import org.apache.spark.sql.execution.streaming.state.{PrefixKeyScanStateEncoderSpec, StateStore, StateStoreErrors} @@ -48,17 +47,14 @@ class MapStateImplWithTTL[K, V]( valEncoder: ExpressionEncoder[Any], ttlConfig: TTLConfig, batchTimestampMs: Long, - metrics: Map[String, SQLMetric] = Map.empty) - extends CompositeKeyTTLStateImpl[K](stateName, store, - keyExprEnc, userKeyEnc, batchTimestampMs) - with MapState[K, V] with Logging { +metrics: Map[String, SQLMetric]) + extends OneToOneTTLState( + stateName, store, getCompositeKeySchema(keyExprEnc.schema, userKeyEnc.schema), ttlConfig, + batchTimestampMs, metrics) with MapState[K, V] with Logging { private val stateTypesEncoder = new CompositeKeyStateEncoder( keyExprEnc, userKeyEnc, valEncoder, stateName, hasTtl = true) - private val ttlExpirationMs = - StateTTL.calculateExpirationTimeForDuration(ttlConfig.ttlDuration, batchTimestampMs) - initialize() private def initialize(): Unit = { @@ -102,15 +98,12 @@ class MapStateImplWithTTL[K, V]( StateStoreErrors.requireNonNullStateValue(key, stateName) 
StateStoreErrors.requireNonNullStateValue(value, stateName) - val encodedGroupingKey = stateTypesEncoder.encodeGroupingKey() - val encodedUserKey = stateTypesEncoder.encodeUserKey(key) - - val encodedValue = stateTypesEncoder.encodeValue(value, ttlExpirationMs) val encodedCompositeKey = stateTypesEncoder.encodeCompositeKey(key) - store.put(encodedCompositeKey, encodedValue, stateName) - TWSMetricsUtils.incrementMetric(metrics, "numUpdatedStateRows") + val ttlExpirationMs = StateTTL + .calculateExpirationTimeForDuration(ttlConfig.ttlDuration, batchTimestampMs) + val encodedValue = stateTypesEncoder.encodeValue(value, ttlExpirationMs) - upsertTTLForStateKey(ttlExpirationMs, encodedGroupingKey, encodedUserKey) + updatePrimaryAndSecondaryIndices(encodedCompositeKey, encodedValue, ttlExpirationMs) } /** Get the map associated with grouping key */ @@ -161,41 +154,12 @@ class MapStateImplWithTTL[K, V]( /** Remove this state. */ override def clear(): Unit = { - keys().foreach { itr => - removeKey(itr) - } - clearTTLState() - } - - /** - * Clears the user state associated with this grouping key - * if it has expired. This function is called by Spark to perform - * cleanup at the end of transformWithState processing. - * - * Spark uses a secondary index to determine if the user state for - * this grouping key has expired. However, its possible that the user - * has updated the TTL and secondary index is out of date. Implementations - * must validate that the user State has actually expired before cleanup based - * on their own State data. - * - * @param groupingKey grouping key for which cleanup should be performed. - * @param userKey user key for which cleanup should be performed. 
- */ - override def clearIfExpired( - groupingKeyRow: UnsafeRow, - userKeyRow: UnsafeRow): Long = { - val compositeKeyRow = stateTypesEncoder.encodeCompositeKey(groupingKeyRow, userKeyRow) + val encodedGroupingKey = stateTypesEncoder.encodeGroupingKey() + val unsafeRowPairIterator = store.prefixScan(encodedGroupingKey, stateName) - val retRow = store.get(compositeKeyRow, stateName) - var numRemovedElements = 0L - if (retRow != null) { - if (stateTypesEncoder.isExpired(retRow, batchTimestampMs)) { - store.remove(compositeKeyRow, stateName) - numRemovedElements += 1 - TWSMetricsUtils.incrementMetric(metrics, "numRemovedStateRows") - } + unsafeRowPairIterator.foreach { rowPair => + clearAllStateForElementKey(rowPair.key) } - numRemovedElements } /* @@ -243,30 +207,18 @@ class MapStateImplWithTTL[K, V]( * grouping key. */ private[sql] def getKeyValuesInTTLState(): Iterator[(K, Long)] = { - val ttlIterator = ttlIndexIterator() val implicitGroupingKey = stateTypesEncoder.encodeGroupingKey() - var nextValue: Option[(K, Long)] = None - - new Iterator[(K, Long)] { - override def hasNext: Boolean = { - while (nextValue.isEmpty && ttlIterator.hasNext) { - val nextTtlValue = ttlIterator.next() - val groupingKey = nextTtlValue.groupingKey - if (groupingKey equals implicitGroupingKey.getStruct( - 0, keyExprEnc.schema.length)) { - val userKey = stateTypesEncoder.decodeUserKey( - nextTtlValue.userKey) - nextValue = Some(userKey.asInstanceOf[K], nextTtlValue.expirationMs) - } - } - nextValue.isDefined - } - - override def next(): (K, Long) = { - val result = nextValue.get - nextValue = None - result - } + .getStruct(0, keyExprEnc.schema.length) + + // We're getting composite rows back + getTTLRows().filter { ttlRow => + val compositeKey = ttlRow.elementKey + val groupingKey = compositeKey.getStruct(0, keyExprEnc.schema.length) + groupingKey == implicitGroupingKey + }.map { ttlRow => + val compositeKey = ttlRow.elementKey + val userKey = 
stateTypesEncoder.decodeCompositeKey(compositeKey) + (userKey.asInstanceOf[K], ttlRow.expirationMs) } } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/MicroBatchExecution.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/MicroBatchExecution.scala index 40d58e5a402a1..23e72fc4e3e2f 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/MicroBatchExecution.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/MicroBatchExecution.scala @@ -513,6 +513,7 @@ class MicroBatchExecution( execCtx.startOffsets ++= execCtx.endOffsets watermarkTracker.setWatermark( math.max(watermarkTracker.currentWatermark, commitMetadata.nextBatchWatermarkMs)) + currentStateStoreCkptId ++= commitMetadata.stateUniqueIds } else if (latestCommittedBatchId == latestBatchId - 1) { execCtx.endOffsets.foreach { case (source: Source, end: Offset) => @@ -965,7 +966,8 @@ class MicroBatchExecution( updateStateStoreCkptId(execCtx, latestExecPlan) } execCtx.reportTimeTaken("commitOffsets") { - if (!commitLog.add(execCtx.batchId, CommitMetadata(watermarkTracker.currentWatermark))) { + if (!commitLog.add(execCtx.batchId, + CommitMetadata(watermarkTracker.currentWatermark, currentStateStoreCkptId.toMap))) { throw QueryExecutionErrors.concurrentStreamLogUpdate(execCtx.batchId) } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/OffsetSeq.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/OffsetSeq.scala index e1e5b3a7ef88e..a599f3bc66118 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/OffsetSeq.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/OffsetSeq.scala @@ -102,7 +102,7 @@ object OffsetSeqMetadata extends Logging { FLATMAPGROUPSWITHSTATE_STATE_FORMAT_VERSION, STREAMING_AGGREGATION_STATE_FORMAT_VERSION, STREAMING_JOIN_STATE_FORMAT_VERSION, STATE_STORE_COMPRESSION_CODEC, 
STATE_STORE_ROCKSDB_FORMAT_VERSION, STATEFUL_OPERATOR_USE_STRICT_DISTRIBUTION, - PRUNE_FILTERS_CAN_PRUNE_STREAMING_SUBPLAN + PRUNE_FILTERS_CAN_PRUNE_STREAMING_SUBPLAN, STREAMING_STATE_STORE_ENCODING_FORMAT ) /** @@ -125,7 +125,8 @@ object OffsetSeqMetadata extends Logging { SymmetricHashJoinStateManager.legacyVersion.toString, STATE_STORE_COMPRESSION_CODEC.key -> CompressionCodec.LZ4, STATEFUL_OPERATOR_USE_STRICT_DISTRIBUTION.key -> "false", - PRUNE_FILTERS_CAN_PRUNE_STREAMING_SUBPLAN.key -> "true" + PRUNE_FILTERS_CAN_PRUNE_STREAMING_SUBPLAN.key -> "true", + STREAMING_STATE_STORE_ENCODING_FORMAT.key -> "unsaferow" ) def apply(json: String): OffsetSeqMetadata = Serialization.read[OffsetSeqMetadata](json) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StateStoreColumnFamilySchemaUtils.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StateStoreColumnFamilySchemaUtils.scala index 7da8408f98b0f..585298fa4c993 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StateStoreColumnFamilySchemaUtils.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StateStoreColumnFamilySchemaUtils.scala @@ -20,10 +20,49 @@ import org.apache.spark.sql.Encoder import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder import org.apache.spark.sql.execution.streaming.TransformWithStateKeyValueRowSchemaUtils._ import org.apache.spark.sql.execution.streaming.state.{NoPrefixKeyStateEncoderSpec, PrefixKeyScanStateEncoderSpec, StateStoreColFamilySchema} -import org.apache.spark.sql.types.StructType +import org.apache.spark.sql.types._ object StateStoreColumnFamilySchemaUtils { + /** + * Avro uses zig-zag encoding for some fixed-length types, like Longs and Ints. For range scans + * we want to use big-endian encoding, so we need to convert the source schema to replace these + * types with BinaryType. 
+ * + * @param schema The schema to convert + * @param ordinals If non-empty, only convert fields at these ordinals. + * If empty, convert all fields. + */ + def convertForRangeScan(schema: StructType, ordinals: Seq[Int] = Seq.empty): StructType = { + val ordinalSet = ordinals.toSet + + StructType(schema.fields.zipWithIndex.flatMap { case (field, idx) => + if ((ordinals.isEmpty || ordinalSet.contains(idx)) && isFixedSize(field.dataType)) { + // For each numeric field, create two fields: + // 1. Byte marker for null, positive, or negative values + // 2. The original numeric value in big-endian format + // Byte type is converted to Int in Avro, which doesn't work for us as Avro + // uses zig-zag encoding as opposed to big-endian for Ints + Seq( + StructField(s"${field.name}_marker", BinaryType, nullable = false), + field.copy(name = s"${field.name}_value", BinaryType) + ) + } else { + Seq(field) + } + }) + } + + private def isFixedSize(dataType: DataType): Boolean = dataType match { + case _: ByteType | _: BooleanType | _: ShortType | _: IntegerType | _: LongType | + _: FloatType | _: DoubleType => true + case _ => false + } + + def getTtlColFamilyName(stateName: String): String = { + "$ttl_" + stateName + } + def getValueStateSchema[T]( stateName: String, keyEncoder: ExpressionEncoder[Any], diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StateTypesEncoderUtils.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StateTypesEncoderUtils.scala index d87de4c69c40a..a2b7ee4ba7916 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StateTypesEncoderUtils.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StateTypesEncoderUtils.scala @@ -30,6 +30,11 @@ import org.apache.spark.sql.types._ * files and to be passed into `RocksDBStateKey(/Value)Encoder`. 
*/ object TransformWithStateKeyValueRowSchemaUtils { + /** + * Creates a schema that is the concatenation of the grouping key and a user-defined + * key. This is used by MapState to create a composite key that is then treated as + * an "elementKey" by OneToOneTTLState. + */ def getCompositeKeySchema( groupingKeySchema: StructType, userKeySchema: StructType): StructType = { @@ -38,24 +43,37 @@ object TransformWithStateKeyValueRowSchemaUtils { .add("userKey", new StructType(userKeySchema.fields)) } - def getSingleKeyTTLRowSchema(keySchema: StructType): StructType = + /** + * Represents the schema of keys in the TTL index, managed by TTLState implementations. + * There is no value associated with entries in the TTL index, so there is no method + * called, for example, getTTLValueSchema. + */ + def getTTLRowKeySchema(keySchema: StructType): StructType = new StructType() .add("expirationMs", LongType) - .add("groupingKey", keySchema) + .add("elementKey", keySchema) - def getCompositeKeyTTLRowSchema( - groupingKeySchema: StructType, - userKeySchema: StructType): StructType = + /** + * Represents the schema of a single long value, which is used to store the expiration + * timestamp of elements in the minimum index, managed by OneToManyTTLState. + */ + def getExpirationMsRowSchema(): StructType = new StructType() .add("expirationMs", LongType) - .add("groupingKey", new StructType(groupingKeySchema.fields)) - .add("userKey", new StructType(userKeySchema.fields)) + /** + * Represents the schema of an element with TTL in the primary index. We store the expiration + * of each value along with the value itself, since each value has its own TTL. It is used as + * the value schema of every value, for every stateful variable. 
+ */ def getValueSchemaWithTTL(schema: StructType, hasTTL: Boolean): StructType = { if (hasTTL) { - new StructType().add("value", schema) + new StructType() + .add("value", schema) .add("ttlExpirationMs", LongType) - } else schema + } else { + schema + } } } @@ -118,7 +136,9 @@ class StateTypesEncoder[V]( def decodeValue(row: UnsafeRow): V = { if (hasTtl) { rowToObjDeserializer.apply(row.getStruct(0, valEncoder.schema.length)) - } else rowToObjDeserializer.apply(row) + } else { + rowToObjDeserializer.apply(row) + } } /** @@ -225,10 +245,6 @@ class CompositeKeyStateEncoder[K, V]( compositeKeyProjection(InternalRow(groupingKey, userKey)) } - def decodeUserKey(row: UnsafeRow): K = { - userKeyRowToObjDeserializer.apply(row) - } - /** * The input row is of composite Key schema. * Only user key is returned though grouping key also exist in the row. @@ -239,37 +255,14 @@ class CompositeKeyStateEncoder[K, V]( } /** Class for TTL with single key serialization */ -class SingleKeyTTLEncoder( - keyExprEnc: ExpressionEncoder[Any]) { - - private val ttlKeyProjection = UnsafeProjection.create( - getSingleKeyTTLRowSchema(keyExprEnc.schema)) - - def encodeTTLRow(expirationMs: Long, groupingKey: UnsafeRow): UnsafeRow = { - ttlKeyProjection.apply( - InternalRow(expirationMs, groupingKey.asInstanceOf[InternalRow])) - } -} - -/** Class for TTL with composite key serialization */ -class CompositeKeyTTLEncoder[K]( - keyExprEnc: ExpressionEncoder[Any], - userKeyEnc: ExpressionEncoder[Any]) { +class TTLEncoder(schema: StructType) { - private val ttlKeyProjection = UnsafeProjection.create( - getCompositeKeyTTLRowSchema(keyExprEnc.schema, userKeyEnc.schema)) + private val ttlKeyProjection = UnsafeProjection.create(getTTLRowKeySchema(schema)) - def encodeTTLRow( - expirationMs: Long, - groupingKey: UnsafeRow, - userKey: UnsafeRow): UnsafeRow = { + // Take a groupingKey UnsafeRow and turn it into a (expirationMs, groupingKey) UnsafeRow. 
+ def encodeTTLRow(expirationMs: Long, elementKey: UnsafeRow): UnsafeRow = { ttlKeyProjection.apply( - InternalRow( - expirationMs, - groupingKey.getStruct(0, keyExprEnc.schema.length) - .asInstanceOf[InternalRow], - userKey.getStruct(0, userKeyEnc.schema.length) - .asInstanceOf[InternalRow])) + InternalRow(expirationMs, elementKey.asInstanceOf[InternalRow])) } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StatefulProcessorHandleImpl.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StatefulProcessorHandleImpl.scala index 0f90fa8d9e490..5d13af0af7c43 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StatefulProcessorHandleImpl.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StatefulProcessorHandleImpl.scala @@ -189,7 +189,7 @@ class StatefulProcessorHandleImpl( def doTtlCleanup(): Unit = { val numValuesRemovedDueToTTLExpiry = metrics.get("numValuesRemovedDueToTTLExpiry").get ttlStates.forEach { s => - numValuesRemovedDueToTTLExpiry += s.clearExpiredState() + numValuesRemovedDueToTTLExpiry += s.clearExpiredStateForAllKeys() } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamExecution.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamExecution.scala index bd501c9357234..44202bb0d2944 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamExecution.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamExecution.scala @@ -715,6 +715,7 @@ abstract class StreamExecution( object StreamExecution { val QUERY_ID_KEY = "sql.streaming.queryId" + val RUN_ID_KEY = "sql.streaming.runId" val IS_CONTINUOUS_PROCESSING = "__is_continuous_processing" val IO_EXCEPTION_NAMES = Seq( classOf[InterruptedException].getName, diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/TTLState.scala 
b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/TTLState.scala index 87d1a15dff1a9..b4449f99d6ba5 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/TTLState.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/TTLState.scala @@ -19,274 +19,529 @@ package org.apache.spark.sql.execution.streaming import java.time.Duration import org.apache.spark.sql.catalyst.InternalRow -import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder -import org.apache.spark.sql.catalyst.expressions.{UnsafeProjection, UnsafeRow} +import org.apache.spark.sql.catalyst.expressions.{GenericInternalRow, UnsafeProjection, UnsafeRow} +import org.apache.spark.sql.execution.metric.SQLMetric import org.apache.spark.sql.execution.streaming.TransformWithStateKeyValueRowSchemaUtils._ -import org.apache.spark.sql.execution.streaming.state.{RangeKeyScanStateEncoderSpec, StateStore} +import org.apache.spark.sql.execution.streaming.state.{NoPrefixKeyStateEncoderSpec, RangeKeyScanStateEncoderSpec, StateStore} +import org.apache.spark.sql.streaming.TTLConfig import org.apache.spark.sql.types._ -object StateTTLSchema { - val TTL_VALUE_ROW_SCHEMA: StructType = - StructType(Array(StructField("__dummy__", NullType))) -} - /** - * Encapsulates the ttl row information stored in [[SingleKeyTTLStateImpl]]. + * Any state variable that wants to support TTL must implement this trait, + * which they can do by extending [[OneToOneTTLState]] or [[OneToManyTTLState]]. * - * @param groupingKey grouping key for which ttl is set - * @param expirationMs expiration time for the grouping key - */ -case class SingleKeyTTLRow( - groupingKey: UnsafeRow, - expirationMs: Long) - -/** - * Encapsulates the ttl row information stored in [[CompositeKeyTTLStateImpl]]. + * The only required methods here are ones relating to evicting expired and all + * state, via clearExpiredStateForAllKeys and clearAllStateForElementKey, + * respectively. 
How classes do this is implementation detail, but the general + * pattern is to use secondary indexes to make sure cleanup scans + * theta(records to evict), not theta(all records). * - * @param groupingKey grouping key for which ttl is set - * @param userKey user key for which ttl is set - * @param expirationMs expiration time for the grouping key - */ -case class CompositeKeyTTLRow( - groupingKey: UnsafeRow, - userKey: UnsafeRow, - expirationMs: Long) - -/** - * Represents the underlying state for secondary TTL Index for a user defined - * state variable. + * There are two broad patterns of implementing stateful variables, and thus + * there are two broad patterns for implementing TTL. The first is when there + * is a one-to-one mapping between an element key [1] and a value; the primary + * and secondary index management for this case is implemented by + * [[OneToOneTTLState]]. When a single element key can have multiple values, + * all of which can expire at their own, unique times, then + * [[OneToManyTTLState]] should be used. + * + * In either case, implementations need to use some sort of secondary index + * that orders element keys by expiration time. This base functionality + * is provided by methods in this trait that read/write/delete to the + * so-called "TTL index". It is a secondary index with the layout of + * (expirationMs, elementKey) -> EMPTY_ROW. The expirationMs is big-endian + * encoded to allow for efficient range scans to find all expired keys. + * + * TTLState (or any abstract sub-classes) should never deal with encoding or + * decoding UnsafeRows to and from their user-facing types. The stateful variable + * themselves should be doing this; all other TTLState sub-classes should be concerned + * only with writing, reading, and deleting UnsafeRows and their associated + * expirations from the primary and secondary indexes. [2] + * + * [1]. You might ask, why call it "element key" instead of "grouping key"? 
+ * This is because a single grouping key might have multiple elements, as in + * the case of a map, which has composite keys of the form (groupingKey, mapKey). + * In the case of ValueState, though, the element key is the grouping key. + * To generalize to both cases, this class should always use the term elementKey.) * - * This state allows Spark to query ttl values based on expiration time - * allowing efficient ttl cleanup. + * [2]. You might also ask, why design it this way? We want the TTLState abstract + * sub-classes to write to both the primary and secondary indexes, since they + * both need to stay in sync; co-locating the logic is cleanest. */ trait TTLState { + // Name of the state variable, e.g. the string the user passes to get{Value/List/Map}State + // in the init() method of a StatefulProcessor. + private[sql] def stateName: String + + // The StateStore instance used to store the state. There is only one instance shared + // among the primary and secondary indexes, since it uses virtual column families + // to keep the indexes separate. + private[sql] def store: StateStore + + // The schema of the primary key for the state variable. For value and list state, this + // is the grouping key. For map state, this is the composite key of the grouping key and + // a map key. + private[sql] def elementKeySchema: StructType + + // The timestamp at which the batch is being processed. All state variables that have + // an expiration at or before this timestamp must be cleaned up. + private[sql] def batchTimestampMs: Long + + // The configuration for this run of the streaming query. It may change between runs + // (e.g. user sets ttlConfig1, stops their query, updates to ttlConfig2, and then + // resumes their query). + private[sql] def ttlConfig: TTLConfig + + // A map from metric name to the underlying SQLMetric. 
This should not be updated + // by the underlying state variable, as the TTL state implementation should be + // handling all reads/writes/deletes to the indexes. + private[sql] def metrics: Map[String, SQLMetric] = Map.empty + + private final val TTL_INDEX = "$ttl_" + stateName + private final val TTL_INDEX_KEY_SCHEMA = getTTLRowKeySchema(elementKeySchema) + private final val TTL_EMPTY_VALUE_ROW_SCHEMA: StructType = + StructType(Array(StructField("__empty__", NullType))) + + private final val TTL_ENCODER = new TTLEncoder(elementKeySchema) + + // Empty row used for values + private final val TTL_EMPTY_VALUE_ROW = + UnsafeProjection.create(Array[DataType](NullType)).apply(InternalRow.apply(null)) - /** - * Perform the user state clean up based on ttl values stored in - * this state. NOTE that its not safe to call this operation concurrently - * when the user can also modify the underlying State. Cleanup should be initiated - * after arbitrary state operations are completed by the user. - * - * @return number of values cleaned up. - */ - def clearExpiredState(): Long -} + private[sql] final def ttlExpirationMs = StateTTL + .calculateExpirationTimeForDuration(ttlConfig.ttlDuration, batchTimestampMs) -/** - * Manages the ttl information for user state keyed with a single key (grouping key). 
- */ -abstract class SingleKeyTTLStateImpl( - stateName: String, - store: StateStore, - keyExprEnc: ExpressionEncoder[Any], - ttlExpirationMs: Long) - extends TTLState { + store.createColFamilyIfAbsent( + TTL_INDEX, + TTL_INDEX_KEY_SCHEMA, + TTL_EMPTY_VALUE_ROW_SCHEMA, + RangeKeyScanStateEncoderSpec(TTL_INDEX_KEY_SCHEMA, Seq(0)), + isInternal = true + ) - import org.apache.spark.sql.execution.streaming.StateTTLSchema._ + private[sql] def insertIntoTTLIndex(expirationMs: Long, elementKey: UnsafeRow): Unit = { + val secondaryIndexKey = TTL_ENCODER.encodeTTLRow(expirationMs, elementKey) + store.put(secondaryIndexKey, TTL_EMPTY_VALUE_ROW, TTL_INDEX) + } - private val ttlColumnFamilyName = "$ttl_" + stateName - private val keySchema = getSingleKeyTTLRowSchema(keyExprEnc.schema) - private val keyTTLRowEncoder = new SingleKeyTTLEncoder(keyExprEnc) + // The deleteFromTTLIndex overload that takes an expiration time and elementKey as an + // argument is used when we need to _construct_ the key to delete from the TTL index. + // + // If we know the timestamp to delete and the elementKey, but don't have a pre-constructed + // UnsafeRow, then you should use this method to delete from the TTL index. + private[sql] def deleteFromTTLIndex(expirationMs: Long, elementKey: UnsafeRow): Unit = { + val secondaryIndexKey = TTL_ENCODER.encodeTTLRow(expirationMs, elementKey) + store.remove(secondaryIndexKey, TTL_INDEX) + } - // empty row used for values - private val EMPTY_ROW = - UnsafeProjection.create(Array[DataType](NullType)).apply(InternalRow.apply(null)) + // The deleteFromTTLIndex overload that takes an UnsafeRow as an argument is used when + // we're deleting elements from the TTL index that we are iterating over. + // + // If we were to use the other deleteFromTTLIndex method, we would have to re-encode the + // components into an UnsafeRow. It is more efficient to just pass the UnsafeRow that we + // read from the iterator. 
+ private[sql] def deleteFromTTLIndex(ttlKey: UnsafeRow): Unit = { + store.remove(ttlKey, TTL_INDEX) + } - store.createColFamilyIfAbsent(ttlColumnFamilyName, keySchema, TTL_VALUE_ROW_SCHEMA, - RangeKeyScanStateEncoderSpec(keySchema, Seq(0)), isInternal = true) + private[sql] def toTTLRow(ttlKey: UnsafeRow): TTLRow = { + val expirationMs = ttlKey.getLong(0) + val elementKey = ttlKey.getStruct(1, TTL_INDEX_KEY_SCHEMA.length) + TTLRow(elementKey, expirationMs) + } - /** - * This function will be called when clear() on State Variables - * with ttl enabled is called. This function should clear any - * associated ttlState, since we are clearing the user state. - */ - def clearTTLState(): Unit = { - val iterator = store.iterator(ttlColumnFamilyName) - iterator.foreach { kv => - store.remove(kv.key, ttlColumnFamilyName) - } + private[sql] def getTTLRows(): Iterator[TTLRow] = { + store.iterator(TTL_INDEX).map(kv => toTTLRow(kv.key)) } - def upsertTTLForStateKey( - expirationMs: Long, - groupingKey: UnsafeRow): Unit = { - val encodedTtlKey = keyTTLRowEncoder.encodeTTLRow( - expirationMs, groupingKey) - store.put(encodedTtlKey, EMPTY_ROW, ttlColumnFamilyName) + // Returns an Iterator over all the keys in the TTL index that have expired. This method + // does not delete the keys from the TTL index; it is the responsibility of the caller + // to do so. + // + // The schema of the UnsafeRow returned by this iterator is (expirationMs, elementKey). + private[sql] def ttlEvictionIterator(): Iterator[UnsafeRow] = { + val ttlIterator = store.iterator(TTL_INDEX) + + // Recall that the format is (expirationMs, elementKey) -> TTL_EMPTY_VALUE_ROW, so + // kv.value doesn't ever need to be used. + ttlIterator.takeWhile { kv => + val expirationMs = kv.key.getLong(0) + StateTTL.isExpired(expirationMs, batchTimestampMs) + }.map(_.key) } + // Encapsulates a row stored in a TTL index. Exposed for testing. 
+ private[sql] case class TTLRow(elementKey: UnsafeRow, expirationMs: Long) + /** - * Clears any state which has ttl older than [[ttlExpirationMs]]. + * Evicts the state associated with this stateful variable that has expired + * due to TTL. The eviction applies to all grouping keys, and to all indexes, + * primary or secondary. + * + * This method can be called at any time in the micro-batch execution, + * as long as it is allowed to complete before subsequent state operations are + * issued. Operations to the state variable should not be issued concurrently while + * this is running, since it may leave the state variable in an inconsistent state + * as it cleans up. + * + * @return number of values cleaned up. */ - override def clearExpiredState(): Long = { - val iterator = store.iterator(ttlColumnFamilyName) - var numValuesExpired = 0L + private[sql] def clearExpiredStateForAllKeys(): Long - iterator.takeWhile { kv => - val expirationMs = kv.key.getLong(0) - StateTTL.isExpired(expirationMs, ttlExpirationMs) - }.foreach { kv => - val groupingKey = kv.key.getStruct(1, keyExprEnc.schema.length) - numValuesExpired += clearIfExpired(groupingKey) - store.remove(kv.key, ttlColumnFamilyName) + /** + * When a user calls clear() on a stateful variable, this method is invoked to + * clear all of the state for the current (implicit) grouping key. It is responsible + * for deleting from the primary index as well as any secondary index(es). + * + * If a given state variable has to clean up multiple elementKeys (in MapState, for + * example, every key in the map is its own elementKey), then this method should + * be invoked for each of those keys. + */ + private[sql] def clearAllStateForElementKey(elementKey: UnsafeRow): Unit +} + +/** + * OneToOneTTLState is an implementation of [[TTLState]] that is used to manage + * TTL for state variables that need a single secondary index to efficiently manage + * records with an expiration. 
+ * + * The primary index for state variables that can use a [[OneToOneTTLState]] have + * the form of: [elementKey -> (value, elementExpiration)]. You'll notice that, given + * a timestamp, it would take linear time to probe the primary index for all of its + * expired values. + * + * As a result, this class uses helper methods from [[TTLState]] to maintain the secondary + * index from [(elementExpiration, elementKey) -> EMPTY_ROW]. + * + * For an explanation of why this structure is not always sufficient (e.g. why the class + * [[OneToManyTTLState]] is needed), please visit its class-doc comment. + */ +abstract class OneToOneTTLState( + stateNameArg: String, + storeArg: StateStore, + elementKeySchemaArg: StructType, + ttlConfigArg: TTLConfig, + batchTimestampMsArg: Long, + metricsArg: Map[String, SQLMetric]) extends TTLState { + override private[sql] def stateName: String = stateNameArg + override private[sql] def store: StateStore = storeArg + override private[sql] def elementKeySchema: StructType = elementKeySchemaArg + override private[sql] def ttlConfig: TTLConfig = ttlConfigArg + override private[sql] def batchTimestampMs: Long = batchTimestampMsArg + override private[sql] def metrics: Map[String, SQLMetric] = metricsArg + + /** + * This method updates the TTL for the given elementKey to be expirationMs, + * updating both the primary and secondary indices if needed. + * + * Note that an elementKey may be the state variable's grouping key, _or_ it + * could be a composite key. MapState is an example of a state variable that + * has composite keys, which has the structure of the groupingKey followed by + * the specific key in the map. This method doesn't need to know what type of + * key is being used, though, since in either case, it's just an UnsafeRow. + * + * @param elementKey the key for which the TTL should be updated, which may + * either be the encoded grouping key, or the grouping key + * and some user-defined key. 
+ * @param elementValue the value to update the primary index with. It is of the + * form (value, expirationMs). + * @param expirationMs the new expiration timestamp to use for elementKey. + */ + private[sql] def updatePrimaryAndSecondaryIndices( + elementKey: UnsafeRow, + elementValue: UnsafeRow, + expirationMs: Long): Unit = { + val existingPrimaryValue = store.get(elementKey, stateName) + + // Doesn't exist. Insert into the primary and TTL indexes. + if (existingPrimaryValue == null) { + store.put(elementKey, elementValue, stateName) + TWSMetricsUtils.incrementMetric(metrics, "numUpdatedStateRows") + insertIntoTTLIndex(expirationMs, elementKey) + } else { + // If the values are equal, then they must be equal in actual value and the expiration + // timestamp. We don't need to update any index in this case. + if (elementValue != existingPrimaryValue) { + store.put(elementKey, elementValue, stateName) + TWSMetricsUtils.incrementMetric(metrics, "numUpdatedStateRows") + + // Small optimization: the value could have changed, but the expirationMs could have + // stayed the same. We only put into the TTL index if the expirationMs has changed. 
+ val existingExpirationMs = existingPrimaryValue.getLong(1) + if (existingExpirationMs != expirationMs) { + deleteFromTTLIndex(existingExpirationMs, elementKey) + insertIntoTTLIndex(expirationMs, elementKey) + } + } } - numValuesExpired } - private[sql] def ttlIndexIterator(): Iterator[SingleKeyTTLRow] = { - val ttlIterator = store.iterator(ttlColumnFamilyName) + override private[sql] def clearExpiredStateForAllKeys(): Long = { + var numValuesExpired = 0L - new Iterator[SingleKeyTTLRow] { - override def hasNext: Boolean = ttlIterator.hasNext + ttlEvictionIterator().foreach { ttlKey => + // Delete from secondary index + deleteFromTTLIndex(ttlKey) + // Delete from primary index + store.remove(toTTLRow(ttlKey).elementKey, stateName) - override def next(): SingleKeyTTLRow = { - val kv = ttlIterator.next() - SingleKeyTTLRow( - expirationMs = kv.key.getLong(0), - groupingKey = kv.key.getStruct(1, keyExprEnc.schema.length) - ) - } + numValuesExpired += 1 } + + TWSMetricsUtils.incrementMetric(metrics, "numRemovedStateRows", numValuesExpired) + numValuesExpired } - private[sql] def getValuesInTTLState(groupingKey: UnsafeRow): Iterator[Long] = { - val ttlIterator = ttlIndexIterator() - var nextValue: Option[Long] = None - - new Iterator[Long] { - override def hasNext: Boolean = { - while (nextValue.isEmpty && ttlIterator.hasNext) { - val nextTtlValue = ttlIterator.next() - val valueGroupingKey = nextTtlValue.groupingKey - if (valueGroupingKey equals groupingKey) { - nextValue = Some(nextTtlValue.expirationMs) - } - } - nextValue.isDefined - } + override private[sql] def clearAllStateForElementKey(elementKey: UnsafeRow): Unit = { + val existingPrimaryValue = store.get(elementKey, stateName) + if (existingPrimaryValue != null) { + val existingExpirationMs = existingPrimaryValue.getLong(1) - override def next(): Long = { - val result = nextValue.get - nextValue = None - result - } + store.remove(elementKey, stateName) + TWSMetricsUtils.incrementMetric(metrics, 
"numRemovedStateRows") + + deleteFromTTLIndex(existingExpirationMs, elementKey) } } - - /** - * Clears the user state associated with this grouping key - * if it has expired. This function is called by Spark to perform - * cleanup at the end of transformWithState processing. - * - * Spark uses a secondary index to determine if the user state for - * this grouping key has expired. However, its possible that the user - * has updated the TTL and secondary index is out of date. Implementations - * must validate that the user State has actually expired before cleanup based - * on their own State data. - * - * @param groupingKey grouping key for which cleanup should be performed. - * - * @return true if the state was cleared, false otherwise. - */ - def clearIfExpired(groupingKey: UnsafeRow): Long } /** - * Manages the ttl information for user state keyed with a single key (grouping key). + * [[OneToManyTTLState]] is an implementation of [[TTLState]] for stateful variables + * that associate a single key with multiple values; every value has its own expiration + * timestamp. + * + * We need an efficient way to find all the values that have expired, but we cannot + * issue point-wise deletes to the elements, since they are merged together using the + * RocksDB StringAppendOperator for merging. As such, we cannot keep a secondary index + * on the key (expirationMs, groupingKey, indexInList), since we have no way to delete a + * specific indexInList from the RocksDB value. (In the future, we could write a custom + * merge operator that can handle tombstones for deleted indexes, but RocksDB doesn't + * support custom merge operators written in Java/Scala.) + * + * Instead, we manage expiration per grouping key instead. Our secondary index will look + * like (expirationMs, groupingKey) -> EMPTY_ROW. This way, we can quickly find all the + * grouping keys that contain at least one element that has expired. 
+ * + * To make sure that we aren't "late" in cleaning up expired values, this secondary index + * maps from the minimum expiration in a list and a grouping key to the EMPTY_VALUE. This + * index is called the "TTL index" in the code (to be consistent with [[OneToOneTTLState]]), + * though it behaves more like a work queue of lists that need to be cleaned up. + * + * Since a grouping key may have a large list and we need to quickly know what the + * minimum expiration is, we need to reverse this work queue index. This reversed index + * maps from key to the minimum expiration in the list, and it is called the "min-expiry" index. + * + * Note: currently, this is only used by ListState with TTL. */ -abstract class CompositeKeyTTLStateImpl[K]( - stateName: String, - store: StateStore, - keyExprEnc: ExpressionEncoder[Any], - userKeyEncoder: ExpressionEncoder[Any], - ttlExpirationMs: Long) - extends TTLState { - - import org.apache.spark.sql.execution.streaming.StateTTLSchema._ - - private val ttlColumnFamilyName = "$ttl_" + stateName - private val keySchema = getCompositeKeyTTLRowSchema( - keyExprEnc.schema, userKeyEncoder.schema +abstract class OneToManyTTLState( + stateNameArg: String, + storeArg: StateStore, + elementKeySchemaArg: StructType, + ttlConfigArg: TTLConfig, + batchTimestampMsArg: Long, + metricsArg: Map[String, SQLMetric]) extends TTLState { + override private[sql] def stateName: String = stateNameArg + override private[sql] def store: StateStore = storeArg + override private[sql] def elementKeySchema: StructType = elementKeySchemaArg + override private[sql] def ttlConfig: TTLConfig = ttlConfigArg + override private[sql] def batchTimestampMs: Long = batchTimestampMsArg + override private[sql] def metrics: Map[String, SQLMetric] = metricsArg + + // Schema of the min-expiry index: elementKey -> minExpirationMs + private val MIN_INDEX = "$min_" + stateName + private val MIN_INDEX_SCHEMA = elementKeySchema + private val MIN_INDEX_VALUE_SCHEMA = 
getExpirationMsRowSchema() + + // Projects a Long into an UnsafeRow + private val minIndexValueProjector = UnsafeProjection.create(MIN_INDEX_VALUE_SCHEMA) + + // Schema of the entry count index: elementKey -> count + private val COUNT_INDEX = "$count_" + stateName + private val COUNT_INDEX_VALUE_SCHEMA: StructType = + StructType(Seq(StructField("count", LongType, nullable = false))) + private val countIndexValueProjector = UnsafeProjection.create(COUNT_INDEX_VALUE_SCHEMA) + + // Reused internal row that we use to create an UnsafeRow with the schema of + // COUNT_INDEX_VALUE_SCHEMA and the desired value. It is not thread safe (although, anyway, + // this class is not thread safe). + private val reusedCountIndexValueRow = new GenericInternalRow(1) + + store.createColFamilyIfAbsent( + MIN_INDEX, + MIN_INDEX_SCHEMA, + MIN_INDEX_VALUE_SCHEMA, + NoPrefixKeyStateEncoderSpec(MIN_INDEX_SCHEMA), + isInternal = true ) - private val keyRowEncoder = new CompositeKeyTTLEncoder[K]( - keyExprEnc, userKeyEncoder) + store.createColFamilyIfAbsent( + COUNT_INDEX, + elementKeySchema, + COUNT_INDEX_VALUE_SCHEMA, + NoPrefixKeyStateEncoderSpec(elementKeySchema), + isInternal = true + ) - // empty row used for values - private val EMPTY_ROW = - UnsafeProjection.create(Array[DataType](NullType)).apply(InternalRow.apply(null)) + // Helper method to get the number of entries in the list state for a given element key + private def getEntryCount(elementKey: UnsafeRow): Long = { + val countRow = store.get(elementKey, COUNT_INDEX) + if (countRow != null) { + countRow.getLong(0) + } else { + 0L + } + } - store.createColFamilyIfAbsent(ttlColumnFamilyName, keySchema, - TTL_VALUE_ROW_SCHEMA, RangeKeyScanStateEncoderSpec(keySchema, - Seq(0)), isInternal = true) + // Helper function to update the number of entries in the list state for a given element key + private def updateEntryCount(elementKey: UnsafeRow, updatedCount: Long): Unit = { + reusedCountIndexValueRow.setLong(0, updatedCount) + 
store.put(elementKey, + countIndexValueProjector(reusedCountIndexValueRow.asInstanceOf[InternalRow]), + COUNT_INDEX + ) + } - def clearTTLState(): Unit = { - val iterator = store.iterator(ttlColumnFamilyName) - iterator.foreach { kv => - store.remove(kv.key, ttlColumnFamilyName) - } + // Helper function to remove the number of entries in the list state for a given element key + private def removeEntryCount(elementKey: UnsafeRow): Unit = { + store.remove(elementKey, COUNT_INDEX) } - def upsertTTLForStateKey( - expirationMs: Long, - groupingKey: UnsafeRow, - userKey: UnsafeRow): Unit = { - val encodedTtlKey = keyRowEncoder.encodeTTLRow( - expirationMs, groupingKey, userKey) - store.put(encodedTtlKey, EMPTY_ROW, ttlColumnFamilyName) + private def writePrimaryIndexEntries( + overwritePrimaryIndex: Boolean, + elementKey: UnsafeRow, + elementValues: Iterator[UnsafeRow]): Unit = { + val initialEntryCount = if (overwritePrimaryIndex) { + removeEntryCount(elementKey) + 0 + } else { + getEntryCount(elementKey) + } + + // Manually keep track of the count so that we can update the count index. We don't + // want to call elementValues.size since that will try to re-read the iterator. + var numNewElements = 0 + + // If we're overwriting the primary index, then we only need to put the first value, + // and then we can merge the rest. + var isFirst = true + elementValues.foreach { value => + numNewElements += 1 + if (isFirst && overwritePrimaryIndex) { + isFirst = false + store.put(elementKey, value, stateName) + } else { + store.merge(elementKey, value, stateName) + } + } + + TWSMetricsUtils.incrementMetric(metrics, "numUpdatedStateRows", numNewElements) + updateEntryCount(elementKey, initialEntryCount + numNewElements) } - /** - * Clears any state which has ttl older than [[ttlExpirationMs]]. 
- */ - override def clearExpiredState(): Long = { - val iterator = store.iterator(ttlColumnFamilyName) - var numRemovedElements = 0L - iterator.takeWhile { kv => - val expirationMs = kv.key.getLong(0) - StateTTL.isExpired(expirationMs, ttlExpirationMs) - }.foreach { kv => - numRemovedElements += clearIfExpired( - kv.key.getStruct(1, keyExprEnc.schema.length), - kv.key.getStruct(2, userKeyEncoder.schema.length)) - store.remove(kv.key, ttlColumnFamilyName) + private[sql] def updatePrimaryAndSecondaryIndices( + overwritePrimaryIndex: Boolean, + elementKey: UnsafeRow, + elementValues: Iterator[UnsafeRow], + expirationMs: Long): Unit = { + val existingMinExpirationUnsafeRow = store.get(elementKey, MIN_INDEX) + + writePrimaryIndexEntries(overwritePrimaryIndex, elementKey, elementValues) + + // If nothing exists in the minimum index, then we need to make sure to write + // the minimum and the TTL indices. There's nothing to clean-up from the + // secondary index, since it's empty. + if (existingMinExpirationUnsafeRow == null) { + // Insert into the min-expiry and TTL index, in no particular order. + store.put(elementKey, minIndexValueProjector(InternalRow(expirationMs)), MIN_INDEX) + insertIntoTTLIndex(expirationMs, elementKey) + } else { + val existingMinExpiration = existingMinExpirationUnsafeRow.getLong(0) + + if (overwritePrimaryIndex || expirationMs < existingMinExpiration) { + // We don't actually have to delete from the min-expiry index, since we're going + // to overwrite it on the next line. However, since the TTL index has the existing + // minimum expiration in it, we need to delete that. + deleteFromTTLIndex(existingMinExpiration, elementKey) + + // Insert into the min-expiry and TTL index, in no particular order. 
+ store.put(elementKey, minIndexValueProjector(InternalRow(expirationMs)), MIN_INDEX) + insertIntoTTLIndex(expirationMs, elementKey) + } } - numRemovedElements } - private[sql] def ttlIndexIterator(): Iterator[CompositeKeyTTLRow] = { - val ttlIterator = store.iterator(ttlColumnFamilyName) + // The return type of clearExpiredValues. For a one-to-many stateful variable, cleanup + // must go through all of the values. numValuesExpired represents the number of entries + // that were removed (for metrics), and newMinExpirationMs is the new minimum expiration + // for the values remaining in the state variable. + case class ValueExpirationResult( + numValuesExpired: Long, + newMinExpirationMs: Option[Long]) - new Iterator[CompositeKeyTTLRow] { - override def hasNext: Boolean = ttlIterator.hasNext + // Clears all the expired values for the given elementKey. + protected def clearExpiredValues(elementKey: UnsafeRow): ValueExpirationResult - override def next(): CompositeKeyTTLRow = { - val kv = ttlIterator.next() - CompositeKeyTTLRow( - expirationMs = kv.key.getLong(0), - groupingKey = kv.key.getStruct(1, keyExprEnc.schema.length), - userKey = kv.key.getStruct(2, userKeyEncoder.schema.length) - ) + override private[sql] def clearExpiredStateForAllKeys(): Long = { + var totalNumValuesExpired = 0L + + ttlEvictionIterator().foreach { ttlKey => + val ttlRow = toTTLRow(ttlKey) + val elementKey = ttlRow.elementKey + + // Delete from TTL index and minimum index + deleteFromTTLIndex(ttlKey) + store.remove(elementKey, MIN_INDEX) + + // Now, we need the specific implementation to remove all the values associated with + // elementKey. + val valueExpirationResult = clearExpiredValues(elementKey) + + valueExpirationResult.newMinExpirationMs.foreach { newExpirationMs => + // Insert into the min-expiry and TTL index, in no particular order. 
+ store.put(elementKey, minIndexValueProjector(InternalRow(newExpirationMs)), MIN_INDEX) + insertIntoTTLIndex(newExpirationMs, elementKey) } + + // If we have records [foo, bar, baz] and bar and baz are expiring, then, the + // entryCountBeforeExpirations would be 3. The numValuesExpired would be 2, and so the + // newEntryCount would be 3 - 2 = 1. + val entryCountBeforeExpirations = getEntryCount(elementKey) + val numValuesExpired = valueExpirationResult.numValuesExpired + val newEntryCount = entryCountBeforeExpirations - numValuesExpired + + TWSMetricsUtils.incrementMetric(metrics, "numRemovedStateRows", numValuesExpired) + + if (newEntryCount == 0) { + removeEntryCount(elementKey) + } else { + updateEntryCount(elementKey, newEntryCount) + } + + totalNumValuesExpired += numValuesExpired } + + totalNumValuesExpired } - /** - * Clears the user state associated with this grouping key - * if it has expired. This function is called by Spark to perform - * cleanup at the end of transformWithState processing. - * - * Spark uses a secondary index to determine if the user state for - * this grouping key has expired. However, its possible that the user - * has updated the TTL and secondary index is out of date. Implementations - * must validate that the user State has actually expired before cleanup based - * on their own State data. - * - * @param groupingKey grouping key for which cleanup should be performed. - * @param userKey user key for which cleanup should be performed. 
- */ - def clearIfExpired(groupingKeyRow: UnsafeRow, - userKeyRow: UnsafeRow): Long + override private[sql] def clearAllStateForElementKey(elementKey: UnsafeRow): Unit = { + val existingMinExpirationUnsafeRow = store.get(elementKey, MIN_INDEX) + if (existingMinExpirationUnsafeRow != null) { + val existingMinExpiration = existingMinExpirationUnsafeRow.getLong(0) + + store.remove(elementKey, stateName) + TWSMetricsUtils.incrementMetric(metrics, "numRemovedStateRows", getEntryCount(elementKey)) + removeEntryCount(elementKey) + + store.remove(elementKey, MIN_INDEX) + deleteFromTTLIndex(existingMinExpiration, elementKey) + } + } + + // Exposed for testing. + private[sql] def minIndexIterator(): Iterator[(UnsafeRow, Long)] = { + store + .iterator(MIN_INDEX) + .map(kv => (kv.key, kv.value.getLong(0))) + } } /** diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/TimerStateImpl.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/TimerStateImpl.scala index d0fbaf6600609..5d20f53449c59 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/TimerStateImpl.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/TimerStateImpl.scala @@ -178,7 +178,7 @@ class TimerStateImpl( val rowPair = iter.next() val keyRow = rowPair.key val result = getTimerRowFromSecIndex(keyRow) - if (result._2 < expiryTimestampMs) { + if (result._2 <= expiryTimestampMs) { result } else { finished = true diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/TransformWithStateExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/TransformWithStateExec.scala index f4705b89d5a87..aabbb5f8cacef 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/TransformWithStateExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/TransformWithStateExec.scala @@ -20,7 +20,6 @@ import java.util.UUID import 
java.util.concurrent.TimeUnit.NANOSECONDS import org.apache.hadoop.conf.Configuration -import org.apache.hadoop.fs.Path import org.apache.spark.broadcast.Broadcast import org.apache.spark.rdd.RDD @@ -76,17 +75,32 @@ case class TransformWithStateExec( initialStateDataAttrs: Seq[Attribute], initialStateDeserializer: Expression, initialState: SparkPlan) - extends BinaryExecNode with StateStoreWriter with WatermarkSupport with ObjectProducerExec { + extends BinaryExecNode + with StateStoreWriter + with WatermarkSupport + with ObjectProducerExec + with TransformWithStateMetadataUtils { override def shortName: String = "transformWithStateExec" // dummy value schema, the real schema will get during state variable init time private val DUMMY_VALUE_ROW_SCHEMA = new StructType().add("value", BinaryType) + // We need to just initialize key and value deserializer once per partition. + // The deserializers need to be lazily created on the executor since they + // are not serializable. + // Ideas for for improvement can be found here: + // https://issues.apache.org/jira/browse/SPARK-50437 + private lazy val getKeyObj = + ObjectOperator.deserializeRowToObject(keyDeserializer, groupingAttributes) + + private lazy val getValueObj = + ObjectOperator.deserializeRowToObject(valueDeserializer, dataAttributes) + override def shouldRunAnotherBatch(newInputWatermark: Long): Boolean = { if (timeMode == ProcessingTime) { - // TODO: check if we can return true only if actual timers are registered, or there is - // expired state + // TODO SPARK-50180: check if we can return true only if actual timers are registered, + // or there is expired state true } else if (outputMode == OutputMode.Append || outputMode == OutputMode.Update) { eventTimeWatermarkForEviction.isDefined && @@ -111,32 +125,32 @@ case class TransformWithStateExec( driverProcessorHandle } + /** + * This method is used for the driver-side stateful processor after we + * have collected all the necessary schemas. 
+ * This instance of the stateful processor won't be used again. + */ + private def closeProcessorHandle(): Unit = { + statefulProcessor.close() + statefulProcessor.setHandle(null) + } + /** * Fetching the columnFamilySchemas from the StatefulProcessorHandle * after init is called. */ - private def getColFamilySchemas(): Map[String, StateStoreColFamilySchema] = { + override def getColFamilySchemas(): Map[String, StateStoreColFamilySchema] = { val columnFamilySchemas = getDriverProcessorHandle().getColumnFamilySchemas closeProcessorHandle() columnFamilySchemas } - private def getStateVariableInfos(): Map[String, TransformWithStateVariableInfo] = { + override def getStateVariableInfos(): Map[String, TransformWithStateVariableInfo] = { val stateVariableInfos = getDriverProcessorHandle().getStateVariableInfos closeProcessorHandle() stateVariableInfos } - /** - * This method is used for the driver-side stateful processor after we - * have collected all the necessary schemas. - * This instance of the stateful processor won't be used again. - */ - private def closeProcessorHandle(): Unit = { - statefulProcessor.close() - statefulProcessor.setHandle(null) - } - /** * Controls watermark propagation to downstream modes. 
If timeMode is * ProcessingTime, the output rows cannot be interpreted in eventTime, hence @@ -230,11 +244,6 @@ case class TransformWithStateExec( private def handleInputRows(keyRow: UnsafeRow, valueRowIter: Iterator[InternalRow]): Iterator[InternalRow] = { - val getKeyObj = - ObjectOperator.deserializeRowToObject(keyDeserializer, groupingAttributes) - - val getValueObj = - ObjectOperator.deserializeRowToObject(valueDeserializer, dataAttributes) val getOutputRow = ObjectOperator.wrapObjectToRow(outputObjectType) @@ -261,8 +270,6 @@ case class TransformWithStateExec( private def processInitialStateRows( keyRow: UnsafeRow, initStateIter: Iterator[InternalRow]): Unit = { - val getKeyObj = - ObjectOperator.deserializeRowToObject(keyDeserializer, groupingAttributes) val getInitStateValueObj = ObjectOperator.deserializeRowToObject(initialStateDeserializer, initialStateDataAttrs) @@ -453,84 +460,22 @@ case class TransformWithStateExec( hadoopConf: Configuration, batchId: Long, stateSchemaVersion: Int): List[StateSchemaValidationResult] = { - assert(stateSchemaVersion >= 3) - val newSchemas = getColFamilySchemas() - val stateSchemaDir = stateSchemaDirPath() - val newStateSchemaFilePath = - new Path(stateSchemaDir, s"${batchId}_${UUID.randomUUID().toString}") - val metadataPath = new Path(getStateInfo.checkpointLocation, s"${getStateInfo.operatorId}") - val metadataReader = OperatorStateMetadataReader.createReader( - metadataPath, hadoopConf, operatorStateMetadataVersion, batchId) - val operatorStateMetadata = try { - metadataReader.read() - } catch { - // If this is the first time we are running the query, there will be no metadata - // and this error is expected. In this case, we return None. 
- case ex: Exception if batchId == 0 => - None - } - - val oldStateSchemaFilePath: Option[Path] = operatorStateMetadata match { - case Some(metadata) => - metadata match { - case v2: OperatorStateMetadataV2 => - Some(new Path(v2.stateStoreInfo.head.stateSchemaFilePath)) - case _ => None - } - case None => None - } - List(StateSchemaCompatibilityChecker. - validateAndMaybeEvolveStateSchema(getStateInfo, hadoopConf, - newSchemas.values.toList, session.sessionState, stateSchemaVersion, - storeName = StateStoreId.DEFAULT_STORE_NAME, - oldSchemaFilePath = oldStateSchemaFilePath, - newSchemaFilePath = Some(newStateSchemaFilePath))) + val info = getStateInfo + validateAndWriteStateSchema(hadoopConf, batchId, stateSchemaVersion, + info, session, operatorStateMetadataVersion) } /** Metadata of this stateful operator and its states stores. */ override def operatorStateMetadata( stateSchemaPaths: List[String]): OperatorStateMetadata = { val info = getStateInfo - val operatorInfo = OperatorInfoV1(info.operatorId, shortName) - // stateSchemaFilePath should be populated at this point - val stateStoreInfo = - Array(StateStoreMetadataV2( - StateStoreId.DEFAULT_STORE_NAME, 0, info.numPartitions, stateSchemaPaths.head)) - - val operatorProperties = TransformWithStateOperatorProperties( - timeMode.toString, - outputMode.toString, - getStateVariableInfos().values.toList - ) - OperatorStateMetadataV2(operatorInfo, stateStoreInfo, operatorProperties.json) - } - - private def stateSchemaDirPath(): Path = { - val storeName = StateStoreId.DEFAULT_STORE_NAME - val stateCheckpointPath = - new Path(getStateInfo.checkpointLocation, - s"${getStateInfo.operatorId.toString}") - - val stateSchemaPath = new Path(stateCheckpointPath, "_stateSchema") - val storeNamePath = new Path(stateSchemaPath, storeName) - storeNamePath + getOperatorStateMetadata(stateSchemaPaths, info, shortName, timeMode, outputMode) } override def validateNewMetadata( oldOperatorMetadata: OperatorStateMetadata, 
newOperatorMetadata: OperatorStateMetadata): Unit = { - (oldOperatorMetadata, newOperatorMetadata) match { - case ( - oldMetadataV2: OperatorStateMetadataV2, - newMetadataV2: OperatorStateMetadataV2) => - val oldOperatorProps = TransformWithStateOperatorProperties.fromJson( - oldMetadataV2.operatorPropertiesJson) - val newOperatorProps = TransformWithStateOperatorProperties.fromJson( - newMetadataV2.operatorPropertiesJson) - TransformWithStateOperatorProperties.validateOperatorProperties( - oldOperatorProps, newOperatorProps) - case (_, _) => - } + validateNewMetadataForTWS(oldOperatorMetadata, newOperatorMetadata) } override protected def doExecute(): RDD[InternalRow] = { diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/TransformWithStateVariableUtils.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/TransformWithStateVariableUtils.scala index bc67cee57fef8..34dddeab59d29 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/TransformWithStateVariableUtils.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/TransformWithStateVariableUtils.scala @@ -16,6 +16,10 @@ */ package org.apache.spark.sql.execution.streaming +import java.util.UUID + +import org.apache.hadoop.conf.Configuration +import org.apache.hadoop.fs.Path import org.json4s.DefaultFormats import org.json4s.JsonAST._ import org.json4s.JsonDSL._ @@ -23,9 +27,10 @@ import org.json4s.jackson.JsonMethods import org.json4s.jackson.JsonMethods.{compact, render} import org.apache.spark.internal.Logging +import org.apache.spark.sql.SparkSession import org.apache.spark.sql.execution.streaming.StateVariableType.StateVariableType -import org.apache.spark.sql.execution.streaming.state.StateStoreErrors -import org.apache.spark.sql.streaming.TimeMode +import org.apache.spark.sql.execution.streaming.state.{OperatorInfoV1, OperatorStateMetadata, OperatorStateMetadataReader, OperatorStateMetadataV2, 
StateSchemaCompatibilityChecker, StateSchemaValidationResult, StateStoreColFamilySchema, StateStoreErrors, StateStoreId, StateStoreMetadataV2} +import org.apache.spark.sql.streaming.{OutputMode, TimeMode} /** * This file contains utility classes and functions for managing state variables in @@ -158,3 +163,104 @@ object TransformWithStateOperatorProperties extends Logging { } } } + +/** + * This trait contains utils functions related to TransformWithState metadata. + * This is used both in Scala and Python side of TransformWithState metadata support when calling + * `init()` with DriverStatefulProcessorHandleImpl, and get the state schema and state metadata + * on driver during physical planning phase. + */ +trait TransformWithStateMetadataUtils extends Logging { + def getColFamilySchemas(): Map[String, StateStoreColFamilySchema] + + def getStateVariableInfos(): Map[String, TransformWithStateVariableInfo] + + def getOperatorStateMetadata( + stateSchemaPaths: List[String], + info: StatefulOperatorStateInfo, + shortName: String, + timeMode: TimeMode, + outputMode: OutputMode): OperatorStateMetadata = { + val operatorInfo = OperatorInfoV1(info.operatorId, shortName) + // stateSchemaFilePath should be populated at this point + val stateStoreInfo = + Array(StateStoreMetadataV2( + StateStoreId.DEFAULT_STORE_NAME, 0, info.numPartitions, stateSchemaPaths.head)) + + val operatorProperties = TransformWithStateOperatorProperties( + timeMode.toString, + outputMode.toString, + getStateVariableInfos().values.toList + ) + OperatorStateMetadataV2(operatorInfo, stateStoreInfo, operatorProperties.json) + } + + def validateAndWriteStateSchema( + hadoopConf: Configuration, + batchId: Long, + stateSchemaVersion: Int, + info: StatefulOperatorStateInfo, + session: SparkSession, + operatorStateMetadataVersion: Int = 2): List[StateSchemaValidationResult] = { + assert(stateSchemaVersion >= 3) + val newSchemas = getColFamilySchemas() + val stateSchemaDir = stateSchemaDirPath(info) + val 
newStateSchemaFilePath = + new Path(stateSchemaDir, s"${batchId}_${UUID.randomUUID().toString}") + val metadataPath = new Path(info.checkpointLocation, s"${info.operatorId}") + val metadataReader = OperatorStateMetadataReader.createReader( + metadataPath, hadoopConf, operatorStateMetadataVersion, batchId) + val operatorStateMetadata = try { + metadataReader.read() + } catch { + // If this is the first time we are running the query, there will be no metadata + // and this error is expected. In this case, we return None. + case _: Exception if batchId == 0 => + None + } + + val oldStateSchemaFilePath: Option[Path] = operatorStateMetadata match { + case Some(metadata) => + metadata match { + case v2: OperatorStateMetadataV2 => + Some(new Path(v2.stateStoreInfo.head.stateSchemaFilePath)) + case _ => None + } + case None => None + } + // state schema file written here, writing the new schema list we passed here + List(StateSchemaCompatibilityChecker. + validateAndMaybeEvolveStateSchema(info, hadoopConf, + newSchemas.values.toList, session.sessionState, stateSchemaVersion, + storeName = StateStoreId.DEFAULT_STORE_NAME, + oldSchemaFilePath = oldStateSchemaFilePath, + newSchemaFilePath = Some(newStateSchemaFilePath))) + } + + def validateNewMetadataForTWS( + oldOperatorMetadata: OperatorStateMetadata, + newOperatorMetadata: OperatorStateMetadata): Unit = { + (oldOperatorMetadata, newOperatorMetadata) match { + case ( + oldMetadataV2: OperatorStateMetadataV2, + newMetadataV2: OperatorStateMetadataV2) => + val oldOperatorProps = TransformWithStateOperatorProperties.fromJson( + oldMetadataV2.operatorPropertiesJson) + val newOperatorProps = TransformWithStateOperatorProperties.fromJson( + newMetadataV2.operatorPropertiesJson) + TransformWithStateOperatorProperties.validateOperatorProperties( + oldOperatorProps, newOperatorProps) + case (_, _) => + } + } + + private def stateSchemaDirPath(info: StatefulOperatorStateInfo): Path = { + val storeName = 
StateStoreId.DEFAULT_STORE_NAME + val stateCheckpointPath = + new Path(info.checkpointLocation, s"${info.operatorId.toString}") + + val stateSchemaPath = new Path(stateCheckpointPath, "_stateSchema") + val storeNamePath = new Path(stateSchemaPath, storeName) + storeNamePath + } +} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/ValueStateImplWithTTL.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/ValueStateImplWithTTL.scala index 60eea5842645e..87e4596f67309 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/ValueStateImplWithTTL.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/ValueStateImplWithTTL.scala @@ -17,7 +17,6 @@ package org.apache.spark.sql.execution.streaming import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder -import org.apache.spark.sql.catalyst.expressions.UnsafeRow import org.apache.spark.sql.execution.metric.SQLMetric import org.apache.spark.sql.execution.streaming.TransformWithStateKeyValueRowSchemaUtils._ import org.apache.spark.sql.execution.streaming.state.{NoPrefixKeyStateEncoderSpec, StateStore} @@ -44,20 +43,20 @@ class ValueStateImplWithTTL[S]( ttlConfig: TTLConfig, batchTimestampMs: Long, metrics: Map[String, SQLMetric] = Map.empty) - extends SingleKeyTTLStateImpl( - stateName, store, keyExprEnc, batchTimestampMs) with ValueState[S] { + extends OneToOneTTLState( + stateName, store, keyExprEnc.schema, ttlConfig, batchTimestampMs, metrics) with ValueState[S] { - private val stateTypesEncoder = StateTypesEncoder(keyExprEnc, valEncoder, - stateName, hasTtl = true) - private val ttlExpirationMs = - StateTTL.calculateExpirationTimeForDuration(ttlConfig.ttlDuration, batchTimestampMs) + private val stateTypesEncoder = + StateTypesEncoder(keyExprEnc, valEncoder, stateName, hasTtl = true) initialize() private def initialize(): Unit = { store.createColFamilyIfAbsent(stateName, - keyExprEnc.schema, 
getValueSchemaWithTTL(valEncoder.schema, true), - NoPrefixKeyStateEncoderSpec(keyExprEnc.schema)) + keyExprEnc.schema, + getValueSchemaWithTTL(valEncoder.schema, true), + NoPrefixKeyStateEncoderSpec(keyExprEnc.schema) + ) } /** Function to check if state exists. Returns true if present and false otherwise */ @@ -76,6 +75,7 @@ class ValueStateImplWithTTL[S]( val retRow = store.get(encodedGroupingKey, stateName) if (retRow != null) { + // Getting the 0th ordinal of the struct using valEncoder val resState = stateTypesEncoder.decodeValue(retRow) if (!stateTypesEncoder.isExpired(retRow, batchTimestampMs)) { @@ -90,33 +90,19 @@ class ValueStateImplWithTTL[S]( /** Function to update and overwrite state associated with given key */ override def update(newState: S): Unit = { + val encodedKey = stateTypesEncoder.encodeGroupingKey() + + val ttlExpirationMs = StateTTL + .calculateExpirationTimeForDuration(ttlConfig.ttlDuration, batchTimestampMs) val encodedValue = stateTypesEncoder.encodeValue(newState, ttlExpirationMs) - val serializedGroupingKey = stateTypesEncoder.encodeGroupingKey() - store.put(serializedGroupingKey, - encodedValue, stateName) - TWSMetricsUtils.incrementMetric(metrics, "numUpdatedStateRows") - upsertTTLForStateKey(ttlExpirationMs, serializedGroupingKey) + + updatePrimaryAndSecondaryIndices(encodedKey, encodedValue, ttlExpirationMs) } /** Function to remove state for given key */ override def clear(): Unit = { - store.remove(stateTypesEncoder.encodeGroupingKey(), stateName) - TWSMetricsUtils.incrementMetric(metrics, "numRemovedStateRows") - clearTTLState() - } - - def clearIfExpired(groupingKey: UnsafeRow): Long = { - val retRow = store.get(groupingKey, stateName) - - var result = 0L - if (retRow != null) { - if (stateTypesEncoder.isExpired(retRow, batchTimestampMs)) { - store.remove(groupingKey, stateName) - TWSMetricsUtils.incrementMetric(metrics, "numRemovedStateRows") - result = 1L - } - } - result + val groupingKey = 
stateTypesEncoder.encodeGroupingKey() + clearAllStateForElementKey(groupingKey) } /* @@ -161,11 +147,16 @@ class ValueStateImplWithTTL[S]( } /** - * Get all ttl values stored in ttl state for current implicit - * grouping key. + * Get the TTL value stored in TTL state for the current implicit grouping key, + * if it exists. */ - private[sql] def getValuesInTTLState(): Iterator[Long] = { - getValuesInTTLState(stateTypesEncoder.encodeGroupingKey()) + private[sql] def getValueInTTLState(): Option[Long] = { + val groupingKey = stateTypesEncoder.encodeGroupingKey() + val ttlRowsForGroupingKey = getTTLRows().filter(_.elementKey == groupingKey).toSeq + + assert(ttlRowsForGroupingKey.size <= 1, "Multiple TTLRows found for grouping key " + + s"$groupingKey. Expected at most 1. Found: ${ttlRowsForGroupingKey.mkString(", ")}.") + ttlRowsForGroupingKey.headOption.map(_.expirationMs) } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/WatermarkPropagator.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/WatermarkPropagator.scala index f0950063b1613..aaf8cbd69ea20 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/WatermarkPropagator.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/WatermarkPropagator.scala @@ -124,12 +124,14 @@ class UseSingleWatermarkPropagator extends WatermarkPropagator { /** * This implementation simulates propagation of watermark among operators. * - * The simulation algorithm traverses the physical plan tree via post-order (children first) to - * calculate (input watermark, output watermark) for all nodes. + * It is considered a "simulation" because watermarks are not being physically sent between + * operators, but rather propagated up the tree via post-order (children first) traversal of + * the query plan. This allows Structured Streaming to determine the new (input watermark, output + * watermark) for all nodes. 
* * For each node, below logic is applied: * - * - Input watermark for specific node is decided by `min(input watermarks from all children)`. + * - Input watermark for specific node is decided by `min(output watermarks from all children)`. * -- Children providing no input watermark (DEFAULT_WATERMARK_MS) are excluded. * -- If there is no valid input watermark from children, input watermark = DEFAULT_WATERMARK_MS. * - Output watermark for specific node is decided as following: diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/HDFSBackedStateStoreProvider.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/HDFSBackedStateStoreProvider.scala index ae06e82335b12..2deccb845fea2 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/HDFSBackedStateStoreProvider.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/HDFSBackedStateStoreProvider.scala @@ -291,7 +291,8 @@ private[sql] class HDFSBackedStateStoreProvider extends StateStoreProvider with newMap } catch { - case e: SparkException if e.getCondition.contains("CANNOT_LOAD_STATE_STORE") => + case e: SparkException + if Option(e.getCondition).exists(_.contains("CANNOT_LOAD_STATE_STORE")) => throw e case e: OutOfMemoryError => throw QueryExecutionErrors.notEnoughMemoryToLoadStore( diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/RocksDB.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/RocksDB.scala index f8e9885cef14e..56f253b523358 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/RocksDB.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/RocksDB.scala @@ -77,19 +77,6 @@ class RocksDB( import RocksDB._ - case class RocksDBSnapshot( - checkpointDir: File, - version: Long, - numKeys: Long, - columnFamilyMapping: Map[String, Short], - maxColumnFamilyId: Short, - 
dfsFileSuffix: String, - fileMapping: Map[String, RocksDBSnapshotFile]) { - def close(): Unit = { - silentDeleteRecursively(checkpointDir, s"Free up local checkpoint of snapshot $version") - } - } - @volatile private var lastSnapshotVersion = 0L RocksDBLoader.loadLibrary() @@ -158,7 +145,7 @@ class RocksDB( @volatile private var db: NativeRocksDB = _ @volatile private var changelogWriter: Option[StateStoreChangelogWriter] = None private val enableChangelogCheckpointing: Boolean = conf.enableChangelogCheckpointing - @volatile private var loadedVersion = -1L // -1 = nothing valid is loaded + @volatile protected var loadedVersion: Long = -1L // -1 = nothing valid is loaded // variables to manage checkpoint ID. Once a checkpointing finishes, it needs to return // `lastCommittedStateStoreCkptId` as the committed checkpointID, as well as @@ -170,10 +157,11 @@ class RocksDB( // we have to use a new one. We have to update `sessionStateStoreCkptId` if we reload a previous // batch version, as we would have to use a new checkpointID for re-committing a version. // The reusing is to help debugging but is not required for the algorithm to work. 
- private var lastCommitBasedStateStoreCkptId: Option[String] = None - private var lastCommittedStateStoreCkptId: Option[String] = None - private var loadedStateStoreCkptId: Option[String] = None - private var sessionStateStoreCkptId: Option[String] = None + protected var lastCommitBasedStateStoreCkptId: Option[String] = None + protected var lastCommittedStateStoreCkptId: Option[String] = None + protected var loadedStateStoreCkptId: Option[String] = None + protected var sessionStateStoreCkptId: Option[String] = None + protected[sql] val lineageManager: RocksDBLineageManager = new RocksDBLineageManager @volatile private var numKeysOnLoadedVersion = 0L @volatile private var numKeysOnWritingVersion = 0L @@ -278,70 +266,129 @@ class RocksDB( // We send snapshots that needs to be uploaded by the maintenance thread to this queue private val snapshotsToUploadQueue = new ConcurrentLinkedQueue[RocksDBSnapshot]() + /** + * Read the lineage from the changelog files. It first gets the changelog reader + * of the correct changelog version and then reads the lineage information from the file. + * The changelog file is named as version_stateStoreCkptId.changelog + * @param version version of the changelog file, used to load changelog file. + * @param stateStoreCkptId uniqueId of the changelog file, used to load changelog file.
+ * @return the lineage stored in the changelog file + */ + private def getLineageFromChangelogFile( + version: Long, + stateStoreCkptId: Option[String]): Array[LineageItem] = { + var changelogReader: StateStoreChangelogReader = null + var currLineage: Array[LineageItem] = Array.empty + try { + changelogReader = fileManager.getChangelogReader(version, stateStoreCkptId) + currLineage = changelogReader.lineage + logInfo(log"Loading lineage: " + + log"${MDC(LogKeys.LINEAGE, lineageManager)} from " + + log"changelog version: ${MDC(LogKeys.VERSION_NUM, version)} " + + log"uniqueId: ${MDC(LogKeys.UUID, stateStoreCkptId.getOrElse(""))}.") + } finally { + if (changelogReader != null) { + changelogReader.closeIfNeeded() + } + } + currLineage + } + + /** * Load the given version of data in a native RocksDB instance. * Note that this will copy all the necessary file from DFS to local disk as needed, * and possibly restart the native RocksDB instance. */ - def load( + private def loadWithCheckpointId( version: Long, - stateStoreCkptId: Option[String] = None, + stateStoreCkptId: Option[String], readOnly: Boolean = false): RocksDB = { - assert(version >= 0) - acquire(LoadStore) - recordedMetrics = None - logInfo(log"Loading ${MDC(LogKeys.VERSION_NUM, version)}") + // An array contains lineage information from [snapShotVersion, version] + // (inclusive in both ends) + var currVersionLineage: Array[LineageItem] = lineageManager.getLineageForCurrVersion() try { - if (loadedVersion != version || - (enableStateStoreCheckpointIds && stateStoreCkptId.isDefined && - (loadedStateStoreCkptId.isEmpty || stateStoreCkptId.get != loadedStateStoreCkptId.get))) { + if (loadedVersion != version || (loadedStateStoreCkptId.isEmpty || + stateStoreCkptId.get != loadedStateStoreCkptId.get)) { closeDB(ignoreException = false) - val latestSnapshotVersion = fileManager.getLatestSnapshotVersion(version) + + val (latestSnapshotVersion, latestSnapshotUniqueId) = { + // Special handling when version is 0. 
+ // When loading the very first version (0), stateStoreCkptId does not need to be defined + // because there won't be 0.changelog / 0.zip file created in RocksDB under v2. + if (version == 0) { + assert(stateStoreCkptId.isEmpty, + "stateStoreCkptId should be empty when version is zero") + (0L, None) + // When there is a snapshot file, it is the ground truth, we can skip + // reconstructing the lineage from changelog file. + } else if (fileManager.existsSnapshotFile(version, stateStoreCkptId)) { + currVersionLineage = Array(LineageItem(version, stateStoreCkptId.get)) + (version, stateStoreCkptId) + } else { + currVersionLineage = getLineageFromChangelogFile(version, stateStoreCkptId) :+ + LineageItem(version, stateStoreCkptId.get) + currVersionLineage = currVersionLineage.sortBy(_.version) + + val latestSnapshotVersionsAndUniqueId = + fileManager.getLatestSnapshotVersionAndUniqueIdFromLineage(currVersionLineage) + latestSnapshotVersionsAndUniqueId match { + case Some(pair) => (pair._1, Option(pair._2)) + case None if currVersionLineage.head.version == 1L => + logDebug(log"Cannot find latest snapshot based on lineage but first version " + + log"is 1, use 0 as default. 
Lineage: ${MDC(LogKeys.LINEAGE, lineageManager)}") + (0L, None) + case _ => + throw QueryExecutionErrors.cannotFindBaseSnapshotCheckpoint( + printLineageItems(currVersionLineage)) + } + } + } + + logInfo(log"Loaded latestSnapshotVersion: ${ + MDC(LogKeys.SNAPSHOT_VERSION, latestSnapshotVersion)}, latestSnapshotUniqueId: ${ + MDC(LogKeys.UUID, latestSnapshotUniqueId)}") + val metadata = fileManager.loadCheckpointFromDfs(latestSnapshotVersion, - workingDir, rocksDBFileMapping) + workingDir, rocksDBFileMapping, latestSnapshotUniqueId) + loadedVersion = latestSnapshotVersion // reset the last snapshot version to the latest available snapshot version lastSnapshotVersion = latestSnapshotVersion + lineageManager.resetLineage(currVersionLineage) // Initialize maxVersion upon successful load from DFS fileManager.setMaxSeenVersion(version) - setInitialCFInfo() - metadata.columnFamilyMapping.foreach { mapping => - colFamilyNameToIdMap.putAll(mapping.asJava) - } + openLocalRocksDB(metadata) - metadata.maxColumnFamilyId.foreach { maxId => - maxColumnFamilyId.set(maxId) + if (loadedVersion != version) { + val versionsAndUniqueIds = currVersionLineage.collect { + case i if i.version > loadedVersion && i.version <= version => + (i.version, Option(i.checkpointUniqueId)) + } + replayChangelog(versionsAndUniqueIds) + loadedVersion = version + lineageManager.resetLineage(currVersionLineage) } - openDB() - numKeysOnWritingVersion = if (!conf.trackTotalNumberOfRows) { - // we don't track the total number of rows - discard the number being track - -1L - } else if (metadata.numKeys < 0) { - // we track the total number of rows, but the snapshot doesn't have tracking number - // need to count keys now - countKeys() - } else { - metadata.numKeys - } - if (loadedVersion != version) replayChangelog(version) // After changelog replay the numKeysOnWritingVersion will be updated to // the correct number of keys in the loaded version. 
numKeysOnLoadedVersion = numKeysOnWritingVersion fileManagerMetrics = fileManager.latestLoadCheckpointMetrics } - if (enableStateStoreCheckpointIds) { - lastCommitBasedStateStoreCkptId = None - loadedStateStoreCkptId = stateStoreCkptId - sessionStateStoreCkptId = Some(java.util.UUID.randomUUID.toString) - } + + lastCommitBasedStateStoreCkptId = None + loadedStateStoreCkptId = stateStoreCkptId + sessionStateStoreCkptId = Some(java.util.UUID.randomUUID.toString) lastCommittedStateStoreCkptId = None + if (conf.resetStatsOnLoad) { nativeStats.reset } - logInfo(log"Loaded ${MDC(LogKeys.VERSION_NUM, version)}") + + logInfo(log"Loaded ${MDC(LogKeys.VERSION_NUM, version)} " + + log"with uniqueId ${MDC(LogKeys.UUID, stateStoreCkptId)}") } catch { case t: Throwable => loadedVersion = -1 // invalidate loaded data @@ -349,6 +396,67 @@ class RocksDB( lastCommittedStateStoreCkptId = None loadedStateStoreCkptId = None sessionStateStoreCkptId = None + lineageManager.clear() + throw t + } + if (enableChangelogCheckpointing && !readOnly) { + // Make sure we don't leak resource. + changelogWriter.foreach(_.abort()) + // Initialize the changelog writer with lineage info + // The lineage stored in changelog files should normally start with + // the version of a snapshot, except for the first few versions. + // Because they are solely loaded from changelog file. + // (e.g. with default minDeltasForSnapshot, there is only 1_uuid1.changelog, no 1_uuid1.zip) + // It should end with exactly one version before the change log's version. 
+ changelogWriter = Some(fileManager.getChangeLogWriter( + version + 1, + useColumnFamilies, + sessionStateStoreCkptId, + Some(currVersionLineage))) + } + this + } + + private def loadWithoutCheckpointId( + version: Long, + readOnly: Boolean = false): RocksDB = { + try { + if (loadedVersion != version) { + closeDB(ignoreException = false) + val latestSnapshotVersion = fileManager.getLatestSnapshotVersion(version) + val metadata = fileManager.loadCheckpointFromDfs( + latestSnapshotVersion, + workingDir, + rocksDBFileMapping) + + loadedVersion = latestSnapshotVersion + + // reset the last snapshot version to the latest available snapshot version + lastSnapshotVersion = latestSnapshotVersion + + // Initialize maxVersion upon successful load from DFS + fileManager.setMaxSeenVersion(version) + + openLocalRocksDB(metadata) + + if (loadedVersion != version) { + val versionsAndUniqueIds: Array[(Long, Option[String])] = + (loadedVersion + 1 to version).map((_, None)).toArray + replayChangelog(versionsAndUniqueIds) + loadedVersion = version + } + // After changelog replay the numKeysOnWritingVersion will be updated to + // the correct number of keys in the loaded version. + numKeysOnLoadedVersion = numKeysOnWritingVersion + fileManagerMetrics = fileManager.latestLoadCheckpointMetrics + } + if (conf.resetStatsOnLoad) { + nativeStats.reset + } + logInfo(log"Loaded ${MDC(LogKeys.VERSION_NUM, version)}") + } catch { + case t: Throwable => + loadedVersion = -1 // invalidate loaded data throw t } if (enableChangelogCheckpointing && !readOnly) { @@ -359,6 +467,48 @@ class RocksDB( this } + /** + * Initialize key metrics based on the metadata loaded from DFS and open local RocksDB. 
+ */ + private def openLocalRocksDB(metadata: RocksDBCheckpointMetadata): Unit = { + setInitialCFInfo() + metadata.columnFamilyMapping.foreach { mapping => + colFamilyNameToIdMap.putAll(mapping.asJava) + } + + metadata.maxColumnFamilyId.foreach { maxId => + maxColumnFamilyId.set(maxId) + } + openDB() + numKeysOnWritingVersion = if (!conf.trackTotalNumberOfRows) { + // we don't track the total number of rows - discard the number being track + -1L + } else if (metadata.numKeys < 0) { + // we track the total number of rows, but the snapshot doesn't have tracking number + // need to count keys now + countKeys() + } else { + metadata.numKeys + } + } + + def load( + version: Long, + stateStoreCkptId: Option[String] = None, + readOnly: Boolean = false): RocksDB = { + assert(version >= 0) + acquire(LoadStore) + recordedMetrics = None + logInfo(log"Loading ${MDC(LogKeys.VERSION_NUM, version)} with stateStoreCkptId: ${ + MDC(LogKeys.UUID, stateStoreCkptId.getOrElse(""))}") + if (stateStoreCkptId.isDefined || enableStateStoreCheckpointIds && version == 0) { + loadWithCheckpointId(version, stateStoreCkptId, readOnly) + } else { + loadWithoutCheckpointId(version, readOnly) + } + this + } + /** * Load from the start snapshot version and apply all the changelog records to reach the * end version. Note that this will copy all the necessary files from DFS to local disk as needed, @@ -417,7 +567,12 @@ class RocksDB( } else { metadata.numKeys } - if (loadedVersion != endVersion) replayChangelog(endVersion) + if (loadedVersion != endVersion) { + val versionsAndUniqueIds: Array[(Long, Option[String])] = + (loadedVersion + 1 to endVersion).map((_, None)).toArray + replayChangelog(versionsAndUniqueIds) + loadedVersion = endVersion + } // After changelog replay the numKeysOnWritingVersion will be updated to // the correct number of keys in the loaded version. 
numKeysOnLoadedVersion = numKeysOnWritingVersion @@ -431,16 +586,23 @@ class RocksDB( /** * Replay change log from the loaded version to the target version. */ - private def replayChangelog(endVersion: Long): Unit = { + private def replayChangelog(versionsAndUniqueIds: Array[(Long, Option[String])]): Unit = { + assert(!versionsAndUniqueIds.isEmpty && versionsAndUniqueIds.head._1 == loadedVersion + 1, + s"Replay changelog should start from one version after loadedVersion: $loadedVersion," + + s" but it is not." + ) + logInfo(log"Replaying changelog from version " + log"${MDC(LogKeys.LOADED_VERSION, loadedVersion)} -> " + - log"${MDC(LogKeys.END_VERSION, endVersion)}") - for (v <- loadedVersion + 1 to endVersion) { - logInfo(log"Replaying changelog on version " + - log"${MDC(LogKeys.VERSION_NUM, v)}") + log"${MDC(LogKeys.END_VERSION, versionsAndUniqueIds.lastOption.map(_._1))}") + + versionsAndUniqueIds.foreach { case (v, uniqueId) => + logInfo(log"replaying changelog from version ${MDC(LogKeys.VERSION_NUM, v)} with " + + log"unique Id: ${MDC(LogKeys.UUID, uniqueId)}") + var changelogReader: StateStoreChangelogReader = null try { - changelogReader = fileManager.getChangelogReader(v, useColumnFamilies) + changelogReader = fileManager.getChangelogReader(v, uniqueId) changelogReader.foreach { case (recordType, key, value) => recordType match { case RecordType.PUT_RECORD => @@ -457,7 +619,6 @@ class RocksDB( if (changelogReader != null) changelogReader.closeIfNeeded() } } - loadedVersion = endVersion } /** @@ -496,7 +657,6 @@ class RocksDB( * @note This update is not committed to disk until commit() is called. */ def merge(key: Array[Byte], value: Array[Byte]): Unit = { - if (conf.trackTotalNumberOfRows) { val oldValue = db.get(readOptions, key) if (oldValue == null) { @@ -527,7 +687,6 @@ class RocksDB( * Get an iterator of all committed and uncommitted key-value pairs. 
*/ def iterator(): Iterator[ByteArrayPair] = { - val iter = db.newIterator() logInfo(log"Getting iterator from version ${MDC(LogKeys.LOADED_VERSION, loadedVersion)}") iter.seekToFirst() @@ -612,46 +771,11 @@ class RocksDB( try { logInfo(log"Flushing updates for ${MDC(LogKeys.VERSION_NUM, newVersion)}") - var compactTimeMs = 0L - var flushTimeMs = 0L - var checkpointTimeMs = 0L var snapshot: Option[RocksDBSnapshot] = None - if (shouldCreateSnapshot() || shouldForceSnapshot.get()) { - // Need to flush the change to disk before creating a checkpoint - // because rocksdb wal is disabled. - logInfo(log"Flushing updates for ${MDC(LogKeys.VERSION_NUM, newVersion)}") - flushTimeMs = timeTakenMs { - db.flush(flushOptions) - } - - if (conf.compactOnCommit) { - logInfo("Compacting") - compactTimeMs = timeTakenMs { - db.compactRange() - } - } - - checkpointTimeMs = timeTakenMs { - val checkpointDir = createTempDir("checkpoint") - logInfo(log"Creating checkpoint for ${MDC(LogKeys.VERSION_NUM, newVersion)} " + - log"in ${MDC(LogKeys.PATH, checkpointDir)}") - // Make sure the directory does not exist. Native RocksDB fails if the directory to - // checkpoint exists. - Utils.deleteRecursively(checkpointDir) - // We no longer pause background operation before creating a RocksDB checkpoint because - // it is unnecessary. The captured snapshot will still be consistent with ongoing - // background operations. - val cp = Checkpoint.create(db) - cp.createCheckpoint(checkpointDir.toString) - // if changelog checkpointing is disabled, the snapshot is uploaded synchronously - // inside the uploadSnapshot() called below. - // If changelog checkpointing is enabled, snapshot will be uploaded asynchronously - // during state store maintenance. 
- snapshot = Some(createSnapshot(checkpointDir, newVersion, - colFamilyNameToIdMap.asScala.toMap, maxColumnFamilyId.get().toShort)) - lastSnapshotVersion = newVersion - } + val (newSnapshot, snapshotLatency) = createSnapshot(newVersion, sessionStateStoreCkptId) + snapshot = newSnapshot + commitLatencyMs ++= snapshotLatency } logInfo(log"Syncing checkpoint for ${MDC(LogKeys.VERSION_NUM, newVersion)} to DFS") @@ -663,12 +787,7 @@ class RocksDB( var isUploaded = false if (shouldForceSnapshot.get()) { assert(snapshot.isDefined) - fileManagerMetrics = uploadSnapshot( - snapshot.get, - fileManager, - rocksDBFileMapping.snapshotsPendingUpload, - loggingId - ) + uploadSnapshot(snapshot.get) isUploaded = true shouldForceSnapshot.set(false) } @@ -686,15 +805,22 @@ class RocksDB( } else { assert(changelogWriter.isEmpty) assert(snapshot.isDefined) - fileManagerMetrics = uploadSnapshot( - snapshot.get, - fileManager, - rocksDBFileMapping.snapshotsPendingUpload, - loggingId - ) + uploadSnapshot(snapshot.get) } } + if (enableStateStoreCheckpointIds) { + lastCommitBasedStateStoreCkptId = loadedStateStoreCkptId + lastCommittedStateStoreCkptId = sessionStateStoreCkptId + loadedStateStoreCkptId = sessionStateStoreCkptId + lineageManager.appendLineageItem(LineageItem(newVersion, sessionStateStoreCkptId.get)) + logInfo(log"Update checkpoint IDs and lineage: ${MDC( + LogKeys.LOADED_CHECKPOINT_ID, loadedStateStoreCkptId)}," + + log" ${MDC(LogKeys.LAST_COMMITTED_CHECKPOINT_ID, lastCommittedStateStoreCkptId)}," + + log" ${MDC(LogKeys.LAST_COMMIT_BASED_CHECKPOINT_ID, lastCommitBasedStateStoreCkptId)}," + + log" ${MDC(LogKeys.LINEAGE, lineageManager)}") + } + // Set maxVersion when checkpoint files are synced to DFS successfully // We need to handle this explicitly in RocksDB as we could use different // changeLogWriter instances in fileManager instance when committing @@ -702,15 +828,7 @@ class RocksDB( numKeysOnLoadedVersion = numKeysOnWritingVersion loadedVersion = newVersion - if 
(enableStateStoreCheckpointIds) { - lastCommitBasedStateStoreCkptId = loadedStateStoreCkptId - lastCommittedStateStoreCkptId = sessionStateStoreCkptId - loadedStateStoreCkptId = sessionStateStoreCkptId - } commitLatencyMs ++= Map( - "flush" -> flushTimeMs, - "compact" -> compactTimeMs, - "checkpoint" -> checkpointTimeMs, "fileSync" -> fileSyncTimeMs ) recordedMetrics = Some(metrics) @@ -736,6 +854,69 @@ class RocksDB( } else true } + private def createSnapshot( + version: Long, + checkpointUniqueId: Option[String]): (Option[RocksDBSnapshot], Map[String, Long]) = { + // Need to flush the change to disk before creating a checkpoint + // because rocksdb wal is disabled. + logInfo(log"Flushing updates for ${MDC(LogKeys.VERSION_NUM, version)}") + val flushTimeMs = timeTakenMs { + db.flush(flushOptions) + } + val compactTimeMs = if (conf.compactOnCommit) { + logInfo(log"Compacting") + timeTakenMs { db.compactRange() } + } else 0L + + var snapshot: Option[RocksDBSnapshot] = None + + val checkpointTimeMs = timeTakenMs { + val checkpointDir = createTempDir("checkpoint") + logInfo(log"Creating checkpoint for ${MDC(LogKeys.VERSION_NUM, version)} in " + + log"${MDC(LogKeys.CHECKPOINT_PATH, checkpointDir)}") + // Make sure the directory does not exist. Native RocksDB fails if the directory to + // checkpoint exists. + Utils.deleteRecursively(checkpointDir) + // We no longer pause background operation before creating a RocksDB checkpoint because + // it is unnecessary. The captured snapshot will still be consistent with ongoing + // background operations. 
+ val cp = Checkpoint.create(db) + cp.createCheckpoint(checkpointDir.toString) + + val (dfsFileSuffix, immutableFileMapping) = rocksDBFileMapping.createSnapshotFileMapping( + fileManager, checkpointDir, version) + val newSnapshot = Some(RocksDBSnapshot( + checkpointDir, + version, + numKeysOnWritingVersion, + colFamilyNameToIdMap.asScala.toMap, + maxColumnFamilyId.get().toShort, + dfsFileSuffix, + immutableFileMapping, + checkpointUniqueId)) + + snapshot = newSnapshot + lastSnapshotVersion = version + } + + (snapshot, + Map( + "flush" -> flushTimeMs, + "compact" -> compactTimeMs, + "checkpoint" -> checkpointTimeMs + ) + ) + } + + private[sql] def uploadSnapshot(snapshot: RocksDBSnapshot): Unit = { + fileManagerMetrics = uploadSnapshot( + snapshot, + fileManager, + rocksDBFileMapping.snapshotsPendingUpload, + loggingId + ) + } + /** * Drop uncommitted changes, and roll back to previous version. */ @@ -748,6 +929,7 @@ class RocksDB( lastCommittedStateStoreCkptId = None loadedStateStoreCkptId = None sessionStateStoreCkptId = None + lineageManager.clear() changelogWriter.foreach(_.abort()) // Make sure changelogWriter gets recreated next time. 
changelogWriter = None @@ -772,16 +954,13 @@ class RocksDB( } if (mostRecentSnapshot.isDefined) { - fileManagerMetrics = uploadSnapshot( - mostRecentSnapshot.get, - fileManager, - rocksDBFileMapping.snapshotsPendingUpload, - loggingId - ) + uploadSnapshot(mostRecentSnapshot.get) } } val cleanupTime = timeTakenMs { - fileManager.deleteOldVersions(conf.minVersionsToRetain, conf.minVersionsToDelete) + fileManager.deleteOldVersions( + numVersionsToRetain = conf.minVersionsToRetain, + minVersionsToDelete = conf.minVersionsToDelete) } logInfo(log"Cleaned old data, time taken: ${MDC(LogKeys.TIME_UNITS, cleanupTime)} ms") } @@ -916,18 +1095,6 @@ class RocksDB( rocksDBMetricsOpt } - private def createSnapshot( - checkpointDir: File, - version: Long, - columnFamilyMapping: Map[String, Short], - maxColumnFamilyId: Short): RocksDBSnapshot = { - val (dfsFileSuffix, immutableFileMapping) = rocksDBFileMapping.createSnapshotFileMapping( - fileManager, checkpointDir, version) - - RocksDBSnapshot(checkpointDir, version, numKeysOnWritingVersion, - columnFamilyMapping, maxColumnFamilyId, dfsFileSuffix, immutableFileMapping) - } - /** * Function to acquire RocksDB instance lock that allows for synchronized access to the state * store instance @@ -1036,6 +1203,49 @@ class RocksDB( Option(acquiredThreadInfo).map(_.copy()) } + /** Upload the snapshot to DFS and remove it from snapshots pending */ + private def uploadSnapshot( + snapshot: RocksDBSnapshot, + fileManager: RocksDBFileManager, + snapshotsPendingUpload: Set[RocksDBVersionSnapshotInfo], + loggingId: String): RocksDBFileManagerMetrics = { + var fileManagerMetrics: RocksDBFileManagerMetrics = null + try { + val uploadTime = timeTakenMs { + fileManager.saveCheckpointToDfs( + snapshot.checkpointDir, + snapshot.version, + snapshot.numKeys, + snapshot.fileMapping, + Some(snapshot.columnFamilyMapping), + Some(snapshot.maxColumnFamilyId), + snapshot.uniqueId + ) + fileManagerMetrics = fileManager.latestSaveCheckpointMetrics + + val 
snapshotInfo = RocksDBVersionSnapshotInfo(snapshot.version, snapshot.dfsFileSuffix) + // We are only removing the uploaded snapshot info from the pending set, + // to let the file mapping (i.e. query threads) know that the snapshot (i.e. and its files) + // have been uploaded to DFS. We don't touch the file mapping here to avoid corrupting it. + snapshotsPendingUpload.remove(snapshotInfo) + } + // This is relatively aggressive because even if the uploading succeeds, + // it is not necessarily the one written to the commit log. But we can always load lineage + // from commit log so it is fine. + lineageManager.resetLineage(lineageManager.getLineageForCurrVersion() + .filter(i => i.version >= snapshot.version)) + logInfo(log"${MDC(LogKeys.LOG_ID, loggingId)}: " + + log"Upload snapshot of version ${MDC(LogKeys.VERSION_NUM, snapshot.version)}, " + + log"with uniqueId: ${MDC(LogKeys.UUID, snapshot.uniqueId)} " + + log"time taken: ${MDC(LogKeys.TIME_UNITS, uploadTime)} ms. " + + log"Current lineage: ${MDC(LogKeys.LINEAGE, lineageManager)}") + } finally { + snapshot.close() + } + + fileManagerMetrics + } + + /** Create a native RocksDB logger that forwards native logs to log4j with correct log levels.
*/ private def createLogger(): Logger = { val dbLogger = new Logger(rocksDbOptions.infoLogLevel()) { @@ -1072,6 +1282,24 @@ class RocksDB( Utils.createDirectory(localRootDir.getAbsolutePath, prefix) } + override protected def logName: String = s"${super.logName} $loggingId" +} + +object RocksDB extends Logging { + case class RocksDBSnapshot( + checkpointDir: File, + version: Long, + numKeys: Long, + columnFamilyMapping: Map[String, Short], + maxColumnFamilyId: Short, + dfsFileSuffix: String, + fileMapping: Map[String, RocksDBSnapshotFile], + uniqueId: Option[String] = None) { + def close(): Unit = { + silentDeleteRecursively(checkpointDir, s"Free up local checkpoint of snapshot $version") + } + } + /** Attempt to delete recursively, and log the error if any */ private def silentDeleteRecursively(file: File, msg: String): Unit = { try { @@ -1083,40 +1311,9 @@ class RocksDB( } } - override protected def logName: String = s"${super.logName} $loggingId" -} - -object RocksDB extends Logging { - - /** Upload the snapshot to DFS and remove it from snapshots pending */ - private def uploadSnapshot( - snapshot: RocksDB#RocksDBSnapshot, - fileManager: RocksDBFileManager, - snapshotsPendingUpload: Set[RocksDBVersionSnapshotInfo], - loggingId: String): RocksDBFileManagerMetrics = { - var fileManagerMetrics: RocksDBFileManagerMetrics = null - try { - val uploadTime = timeTakenMs { - fileManager.saveCheckpointToDfs(snapshot.checkpointDir, - snapshot.version, snapshot.numKeys, snapshot.fileMapping, - Some(snapshot.columnFamilyMapping), Some(snapshot.maxColumnFamilyId)) - fileManagerMetrics = fileManager.latestSaveCheckpointMetrics - - val snapshotInfo = RocksDBVersionSnapshotInfo(snapshot.version, snapshot.dfsFileSuffix) - // We are only removing the uploaded snapshot info from the pending set, - // to let the file mapping (i.e. query threads) know that the snapshot (i.e. and its files) - // have been uploaded to DFS. We don't touch the file mapping here to avoid corrupting it. 
- snapshotsPendingUpload.remove(snapshotInfo) - } - logInfo(log"${MDC(LogKeys.LOG_ID, loggingId)}: Upload snapshot of version " + - log"${MDC(LogKeys.VERSION_NUM, snapshot.version)}," + - log" time taken: ${MDC(LogKeys.TIME_UNITS, uploadTime)} ms") - } finally { - snapshot.close() - } - - fileManagerMetrics - } + private def printLineageItems(lineage: Array[LineageItem]): String = lineage.map { + case LineageItem(l, optStr) => s"$l:$optStr" + }.mkString(" ") /** Records the duration of running `body` for the next query progress update. */ private def timeTakenMs(body: => Unit): Long = Utils.timeTakenMs(body)._2 @@ -1533,3 +1730,40 @@ case class AcquiredThreadInfo( } } +/** + * A helper class to manage the lineage information when checkpoint unique id is enabled. + * "lineage" is an array of LineageItem (version, uniqueId) pair. + * + * The first item of "lineage" should normally be the version of a snapshot, except + * for the first few versions. Because they are solely loaded from changelog file. + * (i.e. with default minDeltasForSnapshot, there is only 1_uuid1.changelog, no 1_uuid1.zip) + * + * The last item of "lineage" corresponds to one version before the to-be-committed version. 
+ */ +private[sql] class RocksDBLineageManager { + @volatile private var lineage: Array[LineageItem] = Array.empty + + override def toString: String = lineage.map { + case LineageItem(version, uuid) => s"$version: $uuid" + }.mkString(" ") + + def appendLineageItem(item: LineageItem): Unit = { + lineage = lineage :+ item + } + + def resetLineage(newLineage: Array[LineageItem]): Unit = { + lineage = newLineage + } + + def getLineageForCurrVersion(): Array[LineageItem] = { + lineage.clone() + } + + def contains(item: LineageItem): Boolean = { + lineage.contains(item) + } + + def clear(): Unit = { + lineage = Array.empty + } +} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/RocksDBFileManager.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/RocksDBFileManager.scala index 6b13ff31c9d50..e42a46dfbe15a 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/RocksDBFileManager.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/RocksDBFileManager.scala @@ -41,6 +41,7 @@ import org.apache.spark.internal.{Logging, LogKeys, MDC, MessageWithContext} import org.apache.spark.io.CompressionCodec import org.apache.spark.sql.errors.QueryExecutionErrors import org.apache.spark.sql.execution.streaming.CheckpointFileManager +import org.apache.spark.sql.internal.SQLConf import org.apache.spark.util.ArrayImplicits._ import org.apache.spark.util.Utils @@ -151,60 +152,79 @@ class RocksDBFileManager( private var minSeenVersion = 1L @volatile private var rootDirChecked: Boolean = false - private val versionToRocksDBFiles = new ConcurrentHashMap[Long, Seq[RocksDBImmutableFile]] - private def getChangelogVersion(useColumnFamilies: Boolean): Short = { - val changelogVersion: Short = if (useColumnFamilies) { - 2 - } else { - 1 + // (version, checkpointUniqueId) -> immutable files + private val versionToRocksDBFiles = + new ConcurrentHashMap[(Long, Option[String]), 
Seq[RocksDBImmutableFile]]() + + /** + * Get the changelog version based on rocksDB features. + * @return the version of changelog + */ + private def getChangelogWriterVersion( + useColumnFamilies: Boolean, + stateStoreCheckpointIdEnabled: Boolean): Short = { + (useColumnFamilies, stateStoreCheckpointIdEnabled) match { + case (false, false) => 1 + case (true, false) => 2 + case (false, true) => 3 + case _ => 4 } - changelogVersion } def getChangeLogWriter( version: Long, - useColumnFamilies: Boolean = false): StateStoreChangelogWriter = { - val changelogFile = dfsChangelogFile(version) + useColumnFamilies: Boolean = false, + checkpointUniqueId: Option[String] = None, + stateStoreCheckpointIdLineage: Option[Array[LineageItem]] = None + ): StateStoreChangelogWriter = { + val changelogFile = dfsChangelogFile(version, checkpointUniqueId) if (!rootDirChecked) { val rootDir = new Path(dfsRootDir) if (!fm.exists(rootDir)) fm.mkdirs(rootDir) rootDirChecked = true } - val changelogVersion = getChangelogVersion(useColumnFamilies) + val enableStateStoreCheckpointIds = checkpointUniqueId.isDefined + val changelogVersion = getChangelogWriterVersion( + useColumnFamilies, enableStateStoreCheckpointIds) + val changelogWriter = changelogVersion match { case 1 => new StateStoreChangelogWriterV1(fm, changelogFile, codec) case 2 => new StateStoreChangelogWriterV2(fm, changelogFile, codec) + case 3 => + assert(enableStateStoreCheckpointIds && stateStoreCheckpointIdLineage.isDefined, + "StateStoreChangelogWriterV3 should only be initialized when " + + "state store checkpoint unique id is enabled") + new StateStoreChangelogWriterV3(fm, changelogFile, codec, stateStoreCheckpointIdLineage.get) + case 4 => + assert(enableStateStoreCheckpointIds && stateStoreCheckpointIdLineage.isDefined, + "StateStoreChangelogWriterV4 should only be initialized when " + + "state store checkpoint unique id is enabled") + new StateStoreChangelogWriterV4(fm, changelogFile, codec, 
stateStoreCheckpointIdLineage.get) case _ => throw QueryExecutionErrors.invalidChangeLogWriterVersion(changelogVersion) } + + logInfo(log"Loaded change log reader version " + + log"${MDC(LogKeys.FILE_VERSION, changelogWriter.version)}") + changelogWriter } // Get the changelog file at version def getChangelogReader( version: Long, - useColumnFamilies: Boolean = false): StateStoreChangelogReader = { - val changelogFile = dfsChangelogFile(version) - - // Note that ideally we should get the version for the reader from the - // changelog itself. However, since we don't record this for v1, we need to - // rely on external arguments to make this call today. Within the reader, we verify - // for the correctness of the decided/expected version. We might revisit this pattern - // as we add more changelog versions in the future. - val changelogVersion = getChangelogVersion(useColumnFamilies) - val changelogReader = changelogVersion match { - case 1 => - new StateStoreChangelogReaderV1(fm, changelogFile, codec) - case 2 => - new StateStoreChangelogReaderV2(fm, changelogFile, codec) - case _ => - throw QueryExecutionErrors.invalidChangeLogReaderVersion(changelogVersion) - } - changelogReader + checkpointUniqueId: Option[String] = None): StateStoreChangelogReader = { + val changelogFile = dfsChangelogFile(version, checkpointUniqueId) + val reader = new StateStoreChangelogReaderFactory(fm, changelogFile, codec) + .constructChangelogReader() + + logInfo(log"Loaded change log reader version ${MDC(LogKeys.FILE_VERSION, reader.version)}") + + reader } /** @@ -230,13 +250,15 @@ class RocksDBFileManager( numKeys: Long, fileMapping: Map[String, RocksDBSnapshotFile], columnFamilyMapping: Option[Map[String, Short]] = None, - maxColumnFamilyId: Option[Short] = None): Unit = { + maxColumnFamilyId: Option[Short] = None, + checkpointUniqueId: Option[String] = None): Unit = { logFilesInDir(checkpointDir, log"Saving checkpoint files " + log"for version ${MDC(LogKeys.VERSION_NUM, version)}") 
val (localImmutableFiles, localOtherFiles) = listRocksDBFiles(checkpointDir) - val rocksDBFiles = saveImmutableFilesToDfs(version, localImmutableFiles, fileMapping) - val metadata = RocksDBCheckpointMetadata( - rocksDBFiles, numKeys, columnFamilyMapping, maxColumnFamilyId) + val rocksDBFiles = saveImmutableFilesToDfs( + version, localImmutableFiles, fileMapping, checkpointUniqueId) + val metadata = RocksDBCheckpointMetadata(rocksDBFiles, numKeys, columnFamilyMapping, + maxColumnFamilyId) val metadataFile = localMetadataFile(checkpointDir) metadata.writeToFile(metadataFile) logInfo(log"Written metadata for version ${MDC(LogKeys.VERSION_NUM, version)}:\n" + @@ -255,8 +277,9 @@ class RocksDBFileManager( rootDirChecked = true } } - zipToDfsFile(localOtherFiles :+ metadataFile, dfsBatchZipFile(version)) - logInfo(log"Saved checkpoint file for version ${MDC(LogKeys.VERSION_NUM, version)}") + zipToDfsFile(localOtherFiles :+ metadataFile, dfsBatchZipFile(version, checkpointUniqueId)) + logInfo(log"Saved checkpoint file for version ${MDC(LogKeys.VERSION_NUM, version)} " + + log"checkpointUniqueId: ${MDC(LogKeys.UUID, checkpointUniqueId.getOrElse(""))}") } /** @@ -268,12 +291,14 @@ class RocksDBFileManager( def loadCheckpointFromDfs( version: Long, localDir: File, - rocksDBFileMapping: RocksDBFileMapping): RocksDBCheckpointMetadata = { - logInfo(log"Loading checkpoint files for version ${MDC(LogKeys.VERSION_NUM, version)}") + rocksDBFileMapping: RocksDBFileMapping, + checkpointUniqueId: Option[String] = None): RocksDBCheckpointMetadata = { + logInfo(log"Loading checkpoint files for version ${MDC(LogKeys.VERSION_NUM, version)} " + + log"checkpointUniqueId: ${MDC(LogKeys.UUID, checkpointUniqueId.getOrElse(""))}") // The unique ids of SST files are checked when opening a rocksdb instance. The SST files // in larger versions can't be reused even if they have the same size and name because // they belong to another rocksdb instance. 
- versionToRocksDBFiles.keySet().removeIf(_ >= version) + versionToRocksDBFiles.keySet().removeIf(_._1 >= version) val metadata = if (version == 0) { if (localDir.exists) Utils.deleteRecursively(localDir) localDir.mkdirs() @@ -281,7 +306,7 @@ class RocksDBFileManager( } else { // Delete all non-immutable files in local dir, and unzip new ones from DFS commit file listRocksDBFiles(localDir)._2.foreach(_.delete()) - Utils.unzipFilesFromFile(fs, dfsBatchZipFile(version), localDir) + Utils.unzipFilesFromFile(fs, dfsBatchZipFile(version, checkpointUniqueId), localDir) // Copy the necessary immutable files val metadataFile = localMetadataFile(localDir) @@ -289,7 +314,7 @@ class RocksDBFileManager( logInfo(log"Read metadata for version ${MDC(LogKeys.VERSION_NUM, version)}:\n" + log"${MDC(LogKeys.METADATA_JSON, metadata.prettyJson)}") loadImmutableFilesFromDfs(metadata.immutableFiles, localDir, rocksDBFileMapping, version) - versionToRocksDBFiles.put(version, metadata.immutableFiles) + versionToRocksDBFiles.put((version, checkpointUniqueId), metadata.immutableFiles) metadataFile.delete() metadata } @@ -298,6 +323,17 @@ class RocksDBFileManager( metadata } + // Return if there is a snapshot file at the corresponding version + // and optionally with checkpointunique id, e.g. 
version.zip or version_uniqueId.zip + def existsSnapshotFile(version: Long, checkpointUniqueId: Option[String] = None): Boolean = { + if (!rootDirChecked) { + val path = new Path(dfsRootDir) + if (!fm.exists(path)) fm.mkdirs(path) + rootDirChecked = true + } + fm.exists(dfsBatchZipFile(version, checkpointUniqueId)) + } + // Get latest snapshot version <= version def getLatestSnapshotVersion(version: Long): Long = { val path = new Path(dfsRootDir) @@ -316,20 +352,52 @@ class RocksDBFileManager( } } + /** + * Based on the ground truth lineage loaded from changelog file (lineage), this function + * does file listing to find all snapshot (version, uniqueId) pairs, and finds + * the ground truth latest snapshot (version, uniqueId) the db instance needs to load. + * + * @param lineage The ground truth lineage loaded from changelog file, sorted by id + * @return The ground truth latest snapshot (version, uniqueId) the db instance needs to load, + * when the return value is None it means ther is no such snapshot found. + */ + def getLatestSnapshotVersionAndUniqueIdFromLineage( + lineage: Array[LineageItem]): Option[(Long, String)] = { + val path = new Path(dfsRootDir) + if (fm.exists(path)) { + fm.list(path, onlyZipFiles) + .map(_.getPath.getName.stripSuffix(".zip").split("_")) + .collect { + case Array(ver, id) if lineage.contains(LineageItem(ver.toLong, id)) => + (ver.toLong, id) + } + .sortBy(_._1) + .reverse + .headOption + } else { + None + } + } /** Get the latest version available in the DFS directory. If no data present, it returns 0. 
*/ def getLatestVersion(): Long = { val path = new Path(dfsRootDir) if (fm.exists(path)) { val files = fm.list(path).map(_.getPath) - val changelogFileVersions = files - .filter(onlyChangelogFiles.accept) - .map(_.getName.stripSuffix(".changelog")) - .map(_.toLong) - val snapshotFileVersions = files - .filter(onlyZipFiles.accept) - .map(_.getName.stripSuffix(".zip")) - .map(_.toLong) + val changelogFileVersions = files.filter(onlyChangelogFiles.accept) + .map { fileName => + fileName.getName.stripSuffix(".changelog").split("_") match { + case Array(version, _) => version.toLong + case Array(version) => version.toLong + } + } + val snapshotFileVersions = files.filter(onlyZipFiles.accept) + .map { fileName => + fileName.getName.stripSuffix(".zip").split("_") match { + case Array(version, _) => version.toLong + case Array(version) => version.toLong + } + } val versions = changelogFileVersions ++ snapshotFileVersions versions.foldLeft(0L)(math.max) } else { @@ -370,15 +438,18 @@ class RocksDBFileManager( } } - private def deleteChangelogFiles(versionsToDelete: Array[Long]): Unit = { - versionsToDelete.foreach { version => + private def deleteChangelogFiles( + versionsAndUniqueIdsToDelete: Array[(Long, Option[String])]): Unit = { + versionsAndUniqueIdsToDelete.foreach { case (version, uniqueId) => try { - fm.delete(dfsChangelogFile(version)) - logInfo(log"Deleted changelog file ${MDC(LogKeys.VERSION_NUM, version)}") + fm.delete(dfsChangelogFile(version, uniqueId)) + logInfo(log"Deleted changelog file ${MDC(LogKeys.VERSION_NUM, version)} uniqueId: " + + log"${MDC(LogKeys.UUID, uniqueId.getOrElse(""))}") } catch { case e: Exception => logWarning( - log"Error deleting changelog file for version ${MDC(LogKeys.FILE_VERSION, version)}", e) + log"Error deleting changelog file for version ${MDC(LogKeys.FILE_VERSION, version)} " + + log"uniqueId: ${MDC(LogKeys.UUID, uniqueId.getOrElse(""))}", e) } } } @@ -468,38 +539,43 @@ class RocksDBFileManager( val snapshotFiles = 
allFiles.filter(file => onlyZipFiles.accept(file)) val changelogFiles = allFiles.filter(file => onlyChangelogFiles.accept(file)) // All versions present in DFS, sorted - val sortedSnapshotVersions = snapshotFiles - .map(_.getName.stripSuffix(".zip")) - .map(_.toLong) - .sorted + val sortedSnapshotVersionsAndUniqueIds = snapshotFiles + .map(_.getName.stripSuffix(".zip").split("_")) + .map { + case Array(version, uniqueId) => (version.toLong, Some(uniqueId)) + case Array(version) => (version.toLong, None) + } + .sortBy(_._1) // Return if no versions generated yet - if (sortedSnapshotVersions.isEmpty) return + if (sortedSnapshotVersionsAndUniqueIds.isEmpty) return // Find the versions to delete - val maxSnapshotVersionPresent = sortedSnapshotVersions.last + val maxSnapshotVersionPresent = sortedSnapshotVersionsAndUniqueIds.last._1 // In order to reconstruct numVersionsToRetain version, retain the latest snapshot // that satisfies (version <= maxSnapshotVersionPresent - numVersionsToRetain + 1). // If none of the snapshots satisfy the condition, minVersionToRetain will be 0 and // no version gets deleted. - val minVersionToRetain = sortedSnapshotVersions + val minVersionToRetain = sortedSnapshotVersionsAndUniqueIds + .map(_._1) .filter(_ <= maxSnapshotVersionPresent - numVersionsToRetain + 1) .foldLeft(0L)(math.max) // When snapshotVersionToDelete is non-empty, there are at least 2 snapshot versions. // We only delete orphan files when there are at least 2 versions, // which avoid deleting files for running tasks. 
- val snapshotVersionsToDelete = sortedSnapshotVersions.filter(_ < minVersionToRetain) - if (snapshotVersionsToDelete.isEmpty) return - + val snapshotVersionsAndUniqueIdsToDelete = sortedSnapshotVersionsAndUniqueIds + .filter(_._1 < minVersionToRetain) + val snapshotVersionsToDelete = snapshotVersionsAndUniqueIdsToDelete.map(_._1) + if (snapshotVersionsAndUniqueIdsToDelete.isEmpty) return // Resolve RocksDB files for all the versions and find the max version each file is used val fileToMaxUsedVersion = new mutable.HashMap[String, Long] - sortedSnapshotVersions.foreach { version => - val files = Option(versionToRocksDBFiles.get(version)).getOrElse { - val newResolvedFiles = getImmutableFilesFromVersionZip(version) - versionToRocksDBFiles.put(version, newResolvedFiles) + sortedSnapshotVersionsAndUniqueIds.foreach { case (version, uniqueId) => + val files = Option(versionToRocksDBFiles.get((version, uniqueId))).getOrElse { + val newResolvedFiles = getImmutableFilesFromVersionZip(version, uniqueId) + versionToRocksDBFiles.put((version, uniqueId), newResolvedFiles) newResolvedFiles } files.foreach(f => fileToMaxUsedVersion(f.dfsFileName) = @@ -542,11 +618,11 @@ class RocksDBFileManager( } // Delete the version files and forget about them - snapshotVersionsToDelete.foreach { version => - val versionFile = dfsBatchZipFile(version) + snapshotVersionsAndUniqueIdsToDelete.foreach { case (version, uniqueId) => + val versionFile = dfsBatchZipFile(version, uniqueId) try { fm.delete(versionFile) - versionToRocksDBFiles.remove(version) + versionToRocksDBFiles.remove((version, uniqueId)) logDebug(s"Deleted version $version") } catch { case e: Exception => @@ -558,10 +634,16 @@ class RocksDBFileManager( log"(failed to delete" + log"${MDC(LogKeys.NUM_FILES_FAILED_TO_DELETE, failedToDelete)} files) " + log"not used in versions >= ${MDC(LogKeys.MIN_VERSION_NUM, minVersionToRetain)}") - val changelogVersionsToDelete = changelogFiles - 
.map(_.getName.stripSuffix(".changelog")).map(_.toLong) - .filter(_ < minVersionToRetain) - deleteChangelogFiles(changelogVersionsToDelete) + + val changelogVersionsAndUniqueIdsToDelete: Array[(Long, Option[String])] = changelogFiles + .map(_.getName.stripSuffix(".changelog").split("_")) + .map { + case Array(version, uniqueId) => (version.toLong, Option(uniqueId)) + case Array(version) => (version.toLong, None) + } + .filter(_._1 < minVersionToRetain) + + deleteChangelogFiles(changelogVersionsAndUniqueIdsToDelete) // Always set minSeenVersion for regular deletion frequency even if deletion fails. // This is safe because subsequent calls retry deleting old version files @@ -572,10 +654,12 @@ class RocksDBFileManager( private def saveImmutableFilesToDfs( version: Long, localFiles: Seq[File], - fileMappings: Map[String, RocksDBSnapshotFile]): Seq[RocksDBImmutableFile] = { + fileMappings: Map[String, RocksDBSnapshotFile], + checkpointUniqueId: Option[String] = None): Seq[RocksDBImmutableFile] = { // Get the immutable files used in previous versions, as some of those uploaded files can be // reused for this version - logInfo(log"Saving RocksDB files to DFS for ${MDC(LogKeys.VERSION_NUM, version)}") + logInfo(log"Saving RocksDB files to DFS for version ${MDC(LogKeys.VERSION_NUM, version)} " + + log"uniqueId: ${MDC(LogKeys.UUID, checkpointUniqueId.getOrElse(""))}") var bytesCopied = 0L var filesCopied = 0L @@ -611,7 +695,7 @@ class RocksDBFileManager( log"(${MDC(LogKeys.NUM_BYTES, bytesCopied)} bytes) from local to" + log" DFS for version ${MDC(LogKeys.VERSION_NUM, version)}. 
" + log"${MDC(LogKeys.NUM_FILES_REUSED, filesReused)} files reused without copying.") - versionToRocksDBFiles.put(version, immutableFiles) + versionToRocksDBFiles.put((version, checkpointUniqueId), immutableFiles) saveCheckpointMetrics = RocksDBFileManagerMetrics( bytesCopied = bytesCopied, filesCopied = filesCopied, @@ -699,10 +783,11 @@ class RocksDBFileManager( } /** Get the SST files required for a version from the version zip file in DFS */ - private def getImmutableFilesFromVersionZip(version: Long): Seq[RocksDBImmutableFile] = { + private def getImmutableFilesFromVersionZip( + version: Long, checkpointUniqueId: Option[String] = None): Seq[RocksDBImmutableFile] = { Utils.deleteRecursively(localTempDir) localTempDir.mkdirs() - Utils.unzipFilesFromFile(fs, dfsBatchZipFile(version), localTempDir) + Utils.unzipFilesFromFile(fs, dfsBatchZipFile(version, checkpointUniqueId), localTempDir) val metadataFile = localMetadataFile(localTempDir) val metadata = RocksDBCheckpointMetadata.readFromFile(metadataFile) metadata.immutableFiles @@ -774,10 +859,14 @@ class RocksDBFileManager( immutableFile.dfsFileName.substring(suffixStart + 1, suffixEnd) } - private def dfsBatchZipFile(version: Long): Path = new Path(s"$dfsRootDir/$version.zip") + private def dfsBatchZipFile(version: Long, checkpointUniqueId: Option[String] = None): Path = + checkpointUniqueId.map(id => new Path(s"$dfsRootDir/${version}_$id.zip")) + .getOrElse(new Path(s"$dfsRootDir/$version.zip")) // We use changelog suffix intentionally so that we can tell the difference from changelog file of // HDFSBackedStateStore which is named version.delta. 
- private def dfsChangelogFile(version: Long): Path = new Path(s"$dfsRootDir/$version.changelog") + private def dfsChangelogFile(version: Long, checkpointUniqueId: Option[String] = None): Path = + checkpointUniqueId.map(id => new Path(s"$dfsRootDir/${version}_$id.changelog")) + .getOrElse(new Path(s"$dfsRootDir/$version.changelog")) private def localMetadataFile(parentDir: File): File = new File(parentDir, "metadata") @@ -873,7 +962,7 @@ case class RocksDBCheckpointMetadata( /** Helper class for [[RocksDBCheckpointMetadata]] */ object RocksDBCheckpointMetadata { - val VERSION = 1 + val VERSION = SQLConf.get.stateStoreCheckpointFormatVersion implicit val format: Formats = Serialization.formats(NoTypeHints) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/RocksDBStateEncoder.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/RocksDBStateEncoder.scala index 4c7a226e0973f..46b4ad205c2fd 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/RocksDBStateEncoder.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/RocksDBStateEncoder.scala @@ -17,14 +17,22 @@ package org.apache.spark.sql.execution.streaming.state +import java.io.ByteArrayOutputStream import java.lang.Double.{doubleToRawLongBits, longBitsToDouble} import java.lang.Float.{floatToRawIntBits, intBitsToFloat} import java.nio.{ByteBuffer, ByteOrder} +import org.apache.avro.Schema +import org.apache.avro.generic.{GenericData, GenericDatumReader, GenericDatumWriter, GenericRecord} +import org.apache.avro.io.{DecoderFactory, EncoderFactory} + import org.apache.spark.internal.Logging +import org.apache.spark.sql.avro.{AvroDeserializer, AvroOptions, AvroSerializer, SchemaConverters} +import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{BoundReference, JoinedRow, UnsafeProjection, UnsafeRow} import 
org.apache.spark.sql.catalyst.expressions.codegen.UnsafeRowWriter -import org.apache.spark.sql.execution.streaming.state.RocksDBStateStoreProvider.{STATE_ENCODING_NUM_VERSION_BYTES, STATE_ENCODING_VERSION, VIRTUAL_COL_FAMILY_PREFIX_BYTES} +import org.apache.spark.sql.execution.streaming.StateStoreColumnFamilySchemaUtils +import org.apache.spark.sql.execution.streaming.state.RocksDBStateStoreProvider.{SCHEMA_ID_PREFIX_BYTES, STATE_ENCODING_NUM_VERSION_BYTES, STATE_ENCODING_VERSION, VIRTUAL_COL_FAMILY_PREFIX_BYTES} import org.apache.spark.sql.types._ import org.apache.spark.unsafe.Platform @@ -43,90 +51,208 @@ sealed trait RocksDBValueStateEncoder { def decodeValues(valueBytes: Array[Byte]): Iterator[UnsafeRow] } -abstract class RocksDBKeyStateEncoderBase( - useColumnFamilies: Boolean, - virtualColFamilyId: Option[Short] = None) extends RocksDBKeyStateEncoder { - def offsetForColFamilyPrefix: Int = - if (useColumnFamilies) VIRTUAL_COL_FAMILY_PREFIX_BYTES else 0 +/** + * Contains schema version information for both key and value schemas in a state store. + * This information is used to support schema evolution, allowing state schemas to be + * modified over time while maintaining compatibility with existing state data. + * + * @param keySchemaId A unique identifier for the version of the key schema. + * Used to track and handle changes to the key schema structure. + * @param valueSchemaId A unique identifier for the version of the value schema. + * Used to track and handle changes to the value schema structure. + */ +case class StateSchemaInfo( + keySchemaId: Short, + valueSchemaId: Short +) + +/** + * Represents a row of state data along with its schema version. + * Used during state storage operations to track which schema version was used + * to encode the data, enabling proper decoding even when schemas have evolved. + * + * @param schemaId The version identifier for the schema that was used to encode this row. 
+ * This could be either a key schema ID or value schema ID depending on context. + * @param bytes The actual encoded data bytes for this row. When using Avro encoding, + * these bytes contain the Avro-serialized data. For UnsafeRow encoding, + * these contain the binary-encoded row data. + */ +case class StateSchemaIdRow( + schemaId: Short, + bytes: Array[Byte] +) +/** + * The DataEncoder can encode UnsafeRows into raw bytes in two ways: + * - Using the direct byte layout of the UnsafeRow + * - Converting the UnsafeRow into an Avro row, and encoding that + * In both of these cases, the raw bytes that are written into RockDB have + * headers, footers and other metadata, but they also have data that is provided + * by the callers. The metadata in each row does not need to be written as Avro or UnsafeRow, + * but the actual data provided by the caller does. + * The classes that use this trait require specialized partial encoding which makes them much + * easier to cache and use, which is why each DataEncoder deals with multiple schemas. + */ +trait DataEncoder { /** - * Get Byte Array for the virtual column family id that is used as prefix for - * key state rows. + * Encodes a complete key row into bytes. Used as the primary key for state lookups. + * + * @param row An UnsafeRow containing all key columns as defined in the keySchema + * @return Serialized byte array representation of the key */ - override def getColumnFamilyIdBytes(): Array[Byte] = { - assert(useColumnFamilies, "Cannot return virtual Column Family Id Bytes" + - " because multiple Column is not supported for this encoder") - val encodedBytes = new Array[Byte](VIRTUAL_COL_FAMILY_PREFIX_BYTES) - Platform.putShort(encodedBytes, Platform.BYTE_ARRAY_OFFSET, virtualColFamilyId.get) - encodedBytes - } + def encodeKey(row: UnsafeRow): Array[Byte] /** - * Encode and put column family Id as a prefix to a pre-allocated byte array. + * Encodes the non-prefix portion of a key row. 
Used with prefix scan and + * range scan state lookups where the key is split into prefix and remaining portions. * - * @param numBytes - size of byte array to be created for storing key row (without - * column family prefix) - * @return Array[Byte] for an array byte to put encoded key bytes - * Int for a starting offset to put the encoded key bytes + * For prefix scans: Encodes columns after the prefix columns + * For range scans: Encodes columns not included in the ordering columns + * + * @param row An UnsafeRow containing only the remaining key columns + * @return Serialized byte array of the remaining key portion + * @throws UnsupportedOperationException if called on an encoder that doesn't support split keys */ - protected def encodeColumnFamilyPrefix(numBytes: Int): (Array[Byte], Int) = { - val encodedBytes = new Array[Byte](numBytes + offsetForColFamilyPrefix) - var offset = Platform.BYTE_ARRAY_OFFSET - if (useColumnFamilies) { - Platform.putShort(encodedBytes, Platform.BYTE_ARRAY_OFFSET, virtualColFamilyId.get) - offset = Platform.BYTE_ARRAY_OFFSET + offsetForColFamilyPrefix - } - (encodedBytes, offset) - } + def encodeRemainingKey(row: UnsafeRow): Array[Byte] /** - * Get starting offset for decoding an encoded key byte array. + * Encodes key columns used for range scanning, ensuring proper sort order in RocksDB. 
+ * + * This method handles special encoding for numeric types to maintain correct sort order: + * - Adds sign byte markers for numeric types + * - Flips bits for negative floating point values + * - Preserves null ordering + * + * @param row An UnsafeRow containing the columns needed for range scan + * (specified by orderingOrdinals) + * @return Serialized bytes that will maintain correct sort order in RocksDB + * @throws UnsupportedOperationException if called on an encoder that doesn't support range scans */ - protected def decodeKeyStartOffset: Int = { - if (useColumnFamilies) { - Platform.BYTE_ARRAY_OFFSET + VIRTUAL_COL_FAMILY_PREFIX_BYTES - } else Platform.BYTE_ARRAY_OFFSET - } + def encodePrefixKeyForRangeScan(row: UnsafeRow): Array[Byte] + + /** + * Encodes a value row into bytes. + * + * @param row An UnsafeRow containing the value columns as defined in the valueSchema + * @return Serialized byte array representation of the value + */ + def encodeValue(row: UnsafeRow): Array[Byte] + + /** + * Decodes a complete key from its serialized byte form. + * + * For NoPrefixKeyStateEncoder: Decodes the entire key + * For PrefixKeyScanStateEncoder: Decodes only the prefix portion + * + * @param bytes Serialized byte array containing the encoded key + * @return UnsafeRow containing the decoded key columns + * @throws UnsupportedOperationException for unsupported encoder types + */ + def decodeKey(bytes: Array[Byte]): UnsafeRow + + /** + * Decodes the remaining portion of a split key from its serialized form. 
+ * + * For PrefixKeyScanStateEncoder: Decodes columns after the prefix + * For RangeKeyScanStateEncoder: Decodes non-ordering columns + * + * @param bytes Serialized byte array containing the encoded remaining key portion + * @return UnsafeRow containing the decoded remaining key columns + * @throws UnsupportedOperationException if called on an encoder that doesn't support split keys + */ + def decodeRemainingKey(bytes: Array[Byte]): UnsafeRow + + /** + * Decodes range scan key bytes back into an UnsafeRow, preserving proper ordering. + * + * This method reverses the special encoding done by encodePrefixKeyForRangeScan: + * - Interprets sign byte markers + * - Reverses bit flipping for negative floating point values + * - Handles null values + * + * @param bytes Serialized byte array containing the encoded range scan key + * @return UnsafeRow containing the decoded range scan columns + * @throws UnsupportedOperationException if called on an encoder that doesn't support range scans + */ + def decodePrefixKeyForRangeScan(bytes: Array[Byte]): UnsafeRow + + /** + * Decodes a value from its serialized byte form. 
+ * + * @param bytes Serialized byte array containing the encoded value + * @return UnsafeRow containing the decoded value columns + */ + def decodeValue(bytes: Array[Byte]): UnsafeRow + + def supportsSchemaEvolution: Boolean } -object RocksDBStateEncoder { - def getKeyEncoder( - keyStateEncoderSpec: KeyStateEncoderSpec, - useColumnFamilies: Boolean, - virtualColFamilyId: Option[Short] = None): RocksDBKeyStateEncoder = { - // Return the key state encoder based on the requested type - keyStateEncoderSpec match { - case NoPrefixKeyStateEncoderSpec(keySchema) => - new NoPrefixKeyStateEncoder(keySchema, useColumnFamilies, virtualColFamilyId) +abstract class RocksDBDataEncoder( + keyStateEncoderSpec: KeyStateEncoderSpec, + valueSchema: StructType) extends DataEncoder { - case PrefixKeyScanStateEncoderSpec(keySchema, numColsPrefixKey) => - new PrefixKeyScanStateEncoder(keySchema, numColsPrefixKey, - useColumnFamilies, virtualColFamilyId) + val keySchema = keyStateEncoderSpec.keySchema + val reusedKeyRow = new UnsafeRow(keyStateEncoderSpec.keySchema.length) + val reusedValueRow = new UnsafeRow(valueSchema.length) - case RangeKeyScanStateEncoderSpec(keySchema, orderingOrdinals) => - new RangeKeyScanStateEncoder(keySchema, orderingOrdinals, - useColumnFamilies, virtualColFamilyId) + // bit masks used for checking sign or flipping all bits for negative float/double values + val floatFlipBitMask = 0xFFFFFFFF + val floatSignBitMask = 0x80000000 - case _ => - throw new IllegalArgumentException(s"Unsupported key state encoder spec: " + - s"$keyStateEncoderSpec") - } + val doubleFlipBitMask = 0xFFFFFFFFFFFFFFFFL + val doubleSignBitMask = 0x8000000000000000L + + // Byte markers used to identify whether the value is null, negative or positive + // To ensure sorted ordering, we use the lowest byte value for negative numbers followed by + // positive numbers and then null values. 
+ val negativeValMarker: Byte = 0x00.toByte + val positiveValMarker: Byte = 0x01.toByte + val nullValMarker: Byte = 0x02.toByte + + def encodeWithStateSchemaId(schemaIdRow: StateSchemaIdRow): Array[Byte] = { + // Create result array big enough for all prefixes plus data + val data = schemaIdRow.bytes + val schemaId = schemaIdRow.schemaId + val result = new Array[Byte](SCHEMA_ID_PREFIX_BYTES + data.length) + var offset = Platform.BYTE_ARRAY_OFFSET + + Platform.putShort(result, offset, schemaId) + offset += SCHEMA_ID_PREFIX_BYTES + + // Write the actual data + Platform.copyMemory( + data, Platform.BYTE_ARRAY_OFFSET, + result, offset, + data.length + ) + result } - def getValueEncoder( - valueSchema: StructType, - useMultipleValuesPerKey: Boolean): RocksDBValueStateEncoder = { - if (useMultipleValuesPerKey) { - new MultiValuedStateEncoder(valueSchema) - } else { - new SingleValueStateEncoder(valueSchema) - } + def decodeStateSchemaIdRow(bytes: Array[Byte]): StateSchemaIdRow = { + var offset = Platform.BYTE_ARRAY_OFFSET + + // Read column family ID if present + val schemaId = Platform.getShort(bytes, offset) + offset += SCHEMA_ID_PREFIX_BYTES + + // Extract the actual data + val dataLength = bytes.length - SCHEMA_ID_PREFIX_BYTES + val data = new Array[Byte](dataLength) + Platform.copyMemory( + bytes, offset, + data, Platform.BYTE_ARRAY_OFFSET, + dataLength + ) + + StateSchemaIdRow(schemaId, data) } - def getColumnFamilyIdBytes(virtualColFamilyId: Short): Array[Byte] = { - val encodedBytes = new Array[Byte](VIRTUAL_COL_FAMILY_PREFIX_BYTES) - Platform.putShort(encodedBytes, Platform.BYTE_ARRAY_OFFSET, virtualColFamilyId) - encodedBytes + def unsupportedOperationForKeyStateEncoder( + operation: String + ): UnsupportedOperationException = { + new UnsupportedOperationException( + s"Method $operation not supported for encoder spec type " + + s"${keyStateEncoderSpec.getClass.getSimpleName}") } /** @@ -150,39 +276,937 @@ object RocksDBStateEncoder { val row = new 
UnsafeRow(numFields) decodeToUnsafeRow(bytes, row) } else { - null + null + } + } + + def decodeToUnsafeRow(bytes: Array[Byte], reusedRow: UnsafeRow): UnsafeRow = { + if (bytes != null) { + // Platform.BYTE_ARRAY_OFFSET is the recommended way refer to the 1st offset. See Platform. + reusedRow.pointTo( + bytes, + Platform.BYTE_ARRAY_OFFSET + STATE_ENCODING_NUM_VERSION_BYTES, + bytes.length - STATE_ENCODING_NUM_VERSION_BYTES) + reusedRow + } else { + null + } + } +} + +class UnsafeRowDataEncoder( + keyStateEncoderSpec: KeyStateEncoderSpec, + valueSchema: StructType, + stateSchemaInfo: Option[StateSchemaInfo] +) extends RocksDBDataEncoder(keyStateEncoderSpec, valueSchema) { + + override def supportsSchemaEvolution: Boolean = false + + override def encodeKey(row: UnsafeRow): Array[Byte] = { + encodeUnsafeRow(row) + } + + override def encodeRemainingKey(row: UnsafeRow): Array[Byte] = { + encodeUnsafeRow(row) + } + + override def encodePrefixKeyForRangeScan(row: UnsafeRow): Array[Byte] = { + assert(keyStateEncoderSpec.isInstanceOf[RangeKeyScanStateEncoderSpec]) + val rsk = keyStateEncoderSpec.asInstanceOf[RangeKeyScanStateEncoderSpec] + val rangeScanKeyFieldsWithOrdinal = rsk.orderingOrdinals.map { ordinal => + val field = rsk.keySchema(ordinal) + (field, ordinal) + } + val writer = new UnsafeRowWriter(rsk.orderingOrdinals.length) + writer.resetRowWriter() + rangeScanKeyFieldsWithOrdinal.zipWithIndex.foreach { case (fieldWithOrdinal, idx) => + val field = fieldWithOrdinal._1 + val value = row.get(idx, field.dataType) + // Note that we cannot allocate a smaller buffer here even if the value is null + // because the effective byte array is considered variable size and needs to have + // the same size across all rows for the ordering to work as expected. 
+ val bbuf = ByteBuffer.allocate(field.dataType.defaultSize + 1) + bbuf.order(ByteOrder.BIG_ENDIAN) + if (value == null) { + bbuf.put(nullValMarker) + writer.write(idx, bbuf.array()) + } else { + field.dataType match { + case BooleanType => + case ByteType => + val byteVal = value.asInstanceOf[Byte] + val signCol = if (byteVal < 0) { + negativeValMarker + } else { + positiveValMarker + } + bbuf.put(signCol) + bbuf.put(byteVal) + writer.write(idx, bbuf.array()) + + case ShortType => + val shortVal = value.asInstanceOf[Short] + val signCol = if (shortVal < 0) { + negativeValMarker + } else { + positiveValMarker + } + bbuf.put(signCol) + bbuf.putShort(shortVal) + writer.write(idx, bbuf.array()) + + case IntegerType => + val intVal = value.asInstanceOf[Int] + val signCol = if (intVal < 0) { + negativeValMarker + } else { + positiveValMarker + } + bbuf.put(signCol) + bbuf.putInt(intVal) + writer.write(idx, bbuf.array()) + + case LongType => + val longVal = value.asInstanceOf[Long] + val signCol = if (longVal < 0) { + negativeValMarker + } else { + positiveValMarker + } + bbuf.put(signCol) + bbuf.putLong(longVal) + writer.write(idx, bbuf.array()) + + case FloatType => + val floatVal = value.asInstanceOf[Float] + val rawBits = floatToRawIntBits(floatVal) + // perform sign comparison using bit manipulation to ensure NaN values are handled + // correctly + if ((rawBits & floatSignBitMask) != 0) { + // for negative values, we need to flip all the bits to ensure correct ordering + val updatedVal = rawBits ^ floatFlipBitMask + bbuf.put(negativeValMarker) + // convert the bits back to float + bbuf.putFloat(intBitsToFloat(updatedVal)) + } else { + bbuf.put(positiveValMarker) + bbuf.putFloat(floatVal) + } + writer.write(idx, bbuf.array()) + + case DoubleType => + val doubleVal = value.asInstanceOf[Double] + val rawBits = doubleToRawLongBits(doubleVal) + // perform sign comparison using bit manipulation to ensure NaN values are handled + // correctly + if ((rawBits & 
doubleSignBitMask) != 0) { + // for negative values, we need to flip all the bits to ensure correct ordering + val updatedVal = rawBits ^ doubleFlipBitMask + bbuf.put(negativeValMarker) + // convert the bits back to double + bbuf.putDouble(longBitsToDouble(updatedVal)) + } else { + bbuf.put(positiveValMarker) + bbuf.putDouble(doubleVal) + } + writer.write(idx, bbuf.array()) + } + } + } + encodeUnsafeRow(writer.getRow()) + } + + override def encodeValue(row: UnsafeRow): Array[Byte] = encodeUnsafeRow(row) + + override def decodeKey(bytes: Array[Byte]): UnsafeRow = { + keyStateEncoderSpec match { + case NoPrefixKeyStateEncoderSpec(_) => + decodeToUnsafeRow(bytes, reusedKeyRow) + case PrefixKeyScanStateEncoderSpec(_, numColsPrefixKey) => + decodeToUnsafeRow(bytes, numFields = numColsPrefixKey) + case _ => throw unsupportedOperationForKeyStateEncoder("decodeKey") + } + } + + override def decodeRemainingKey(bytes: Array[Byte]): UnsafeRow = { + keyStateEncoderSpec match { + case PrefixKeyScanStateEncoderSpec(_, numColsPrefixKey) => + decodeToUnsafeRow(bytes, numFields = numColsPrefixKey) + case RangeKeyScanStateEncoderSpec(_, orderingOrdinals) => + decodeToUnsafeRow(bytes, keySchema.length - orderingOrdinals.length) + case _ => throw unsupportedOperationForKeyStateEncoder("decodeRemainingKey") + } + } + + override def decodePrefixKeyForRangeScan(bytes: Array[Byte]): UnsafeRow = { + assert(keyStateEncoderSpec.isInstanceOf[RangeKeyScanStateEncoderSpec]) + val rsk = keyStateEncoderSpec.asInstanceOf[RangeKeyScanStateEncoderSpec] + val writer = new UnsafeRowWriter(rsk.orderingOrdinals.length) + val rangeScanKeyFieldsWithOrdinal = rsk.orderingOrdinals.map { ordinal => + val field = rsk.keySchema(ordinal) + (field, ordinal) + } + writer.resetRowWriter() + val row = decodeToUnsafeRow(bytes, numFields = rsk.orderingOrdinals.length) + rangeScanKeyFieldsWithOrdinal.zipWithIndex.foreach { case (fieldWithOrdinal, idx) => + val field = fieldWithOrdinal._1 + + val value = 
row.getBinary(idx) + val bbuf = ByteBuffer.wrap(value.asInstanceOf[Array[Byte]]) + bbuf.order(ByteOrder.BIG_ENDIAN) + val isNullOrSignCol = bbuf.get() + if (isNullOrSignCol == nullValMarker) { + // set the column to null and skip reading the next byte(s) + writer.setNullAt(idx) + } else { + field.dataType match { + case BooleanType => + case ByteType => + writer.write(idx, bbuf.get) + + case ShortType => + writer.write(idx, bbuf.getShort) + + case IntegerType => + writer.write(idx, bbuf.getInt) + + case LongType => + writer.write(idx, bbuf.getLong) + + case FloatType => + if (isNullOrSignCol == negativeValMarker) { + // if the number is negative, get the raw binary bits for the float + // and flip the bits back + val updatedVal = floatToRawIntBits(bbuf.getFloat) ^ floatFlipBitMask + writer.write(idx, intBitsToFloat(updatedVal)) + } else { + writer.write(idx, bbuf.getFloat) + } + + case DoubleType => + if (isNullOrSignCol == negativeValMarker) { + // if the number is negative, get the raw binary bits for the double + // and flip the bits back + val updatedVal = doubleToRawLongBits(bbuf.getDouble) ^ doubleFlipBitMask + writer.write(idx, longBitsToDouble(updatedVal)) + } else { + writer.write(idx, bbuf.getDouble) + } + } + } + } + writer.getRow() + } + + override def decodeValue(bytes: Array[Byte]): UnsafeRow = decodeToUnsafeRow(bytes, reusedValueRow) +} + +/** + * Encoder that uses Avro for serializing state store data with schema evolution support. 
+ * The encoded format varies depending on the key type and whether it's a key or value: + * + * For prefix and range scan keys: + * |--prefix---|--schemaId (2 bytes)--|--remainingKeyBytes (avro-encoded)--| + * where: + * - prefix: Variable length prefix for scan operations + * - schemaId: 2 byte short integer identifying the schema version + * - remainingKeyBytes: Avro-encoded remaining key data + * + * For no-prefix keys and values: + * |--schemaId (2 bytes)--|--avroEncodedBytes--| + * where: + * - schemaId: 2 byte short integer identifying the schema version + * - avroEncodedBytes: Variable length Avro-encoded data + * + * The schema ID allows the state store to identify which schema version was used + * to encode the data, enabling proper decoding even when schemas have evolved over time. + * + * @param keyStateEncoderSpec Specification for how to encode keys (prefix/range scan) + * @param valueSchema Schema for the values to be encoded + * @param stateSchemaInfo Schema version information for both keys and values + */ +class AvroStateEncoder( + keyStateEncoderSpec: KeyStateEncoderSpec, + valueSchema: StructType, + stateSchemaInfo: Option[StateSchemaInfo] +) extends RocksDBDataEncoder(keyStateEncoderSpec, valueSchema) with Logging { + + private val avroEncoder = createAvroEnc(keyStateEncoderSpec, valueSchema) + // Avro schema used by the avro encoders + private lazy val keyAvroType: Schema = SchemaConverters.toAvroType(keySchema) + private lazy val keyProj = UnsafeProjection.create(keySchema) + + private lazy val valueAvroType: Schema = SchemaConverters.toAvroType(valueSchema) + private lazy val valueProj = UnsafeProjection.create(valueSchema) + + // Prefix Key schema and projection definitions used by the Avro Serializers + // and Deserializers + private lazy val prefixKeySchema = keyStateEncoderSpec match { + case PrefixKeyScanStateEncoderSpec(keySchema, numColsPrefixKey) => + StructType(keySchema.take (numColsPrefixKey)) + case _ => throw 
unsupportedOperationForKeyStateEncoder("prefixKeySchema") + } + private lazy val prefixKeyAvroType = SchemaConverters.toAvroType(prefixKeySchema) + private lazy val prefixKeyProj = UnsafeProjection.create(prefixKeySchema) + + // Range Key schema nd projection definitions used by the Avro Serializers and + // Deserializers + private lazy val rangeScanKeyFieldsWithOrdinal = keyStateEncoderSpec match { + case RangeKeyScanStateEncoderSpec(keySchema, orderingOrdinals) => + orderingOrdinals.map { ordinal => + val field = keySchema(ordinal) + (field, ordinal) + } + case _ => + throw unsupportedOperationForKeyStateEncoder("rangeScanKey") + } + + private lazy val rangeScanAvroSchema = StateStoreColumnFamilySchemaUtils.convertForRangeScan( + StructType(rangeScanKeyFieldsWithOrdinal.map(_._1).toArray)) + + private lazy val rangeScanAvroType = SchemaConverters.toAvroType(rangeScanAvroSchema) + + private lazy val rangeScanAvroProjection = UnsafeProjection.create(rangeScanAvroSchema) + + // Existing remainder key schema definitions + // Remaining Key schema and projection definitions used by the Avro Serializers + // and Deserializers + private lazy val remainingKeySchema = keyStateEncoderSpec match { + case PrefixKeyScanStateEncoderSpec(keySchema, numColsPrefixKey) => + StructType(keySchema.drop(numColsPrefixKey)) + case RangeKeyScanStateEncoderSpec(keySchema, orderingOrdinals) => + StructType(0.until(keySchema.length).diff(orderingOrdinals).map(keySchema(_))) + case _ => throw unsupportedOperationForKeyStateEncoder("remainingKeySchema") + } + + private lazy val remainingKeyAvroType = SchemaConverters.toAvroType(remainingKeySchema) + + private lazy val remainingKeyAvroProjection = UnsafeProjection.create(remainingKeySchema) + + private def getAvroSerializer(schema: StructType): AvroSerializer = { + val avroType = SchemaConverters.toAvroType(schema) + new AvroSerializer(schema, avroType, nullable = false) + } + + private def getAvroDeserializer(schema: StructType): 
AvroDeserializer = { + val avroType = SchemaConverters.toAvroType(schema) + val avroOptions = AvroOptions(Map.empty) + new AvroDeserializer(avroType, schema, + avroOptions.datetimeRebaseModeInRead, avroOptions.useStableIdForUnionType, + avroOptions.stableIdPrefixForUnionType, avroOptions.recursiveFieldMaxDepth) + } + + /** + * Creates an AvroEncoder that handles both key and value serialization/deserialization. + * This method sets up the complete encoding infrastructure needed for state store operations. + * + * The encoder handles different key encoding specifications: + * - NoPrefixKeyStateEncoderSpec: Simple key encoding without prefix + * - PrefixKeyScanStateEncoderSpec: Keys with prefix for efficient scanning + * - RangeKeyScanStateEncoderSpec: Keys with ordering requirements for range scans + * + * For prefix scan cases, it also creates separate encoders for the suffix portion of keys. + * + * @param keyStateEncoderSpec Specification for how to encode keys + * @param valueSchema Schema for the values to be encoded + * @return An AvroEncoder containing all necessary serializers and deserializers + */ + private def createAvroEnc( + keyStateEncoderSpec: KeyStateEncoderSpec, + valueSchema: StructType): AvroEncoder = { + val valueSerializer = getAvroSerializer(valueSchema) + val valueDeserializer = getAvroDeserializer(valueSchema) + + // Get key schema based on encoder spec type + val keySchema = keyStateEncoderSpec match { + case NoPrefixKeyStateEncoderSpec(schema) => + schema + case PrefixKeyScanStateEncoderSpec(schema, numColsPrefixKey) => + StructType(schema.take(numColsPrefixKey)) + case RangeKeyScanStateEncoderSpec(schema, orderingOrdinals) => + val remainingSchema = { + 0.until(schema.length).diff(orderingOrdinals).map { ordinal => + schema(ordinal) + } + } + StructType(remainingSchema) + } + + // Handle suffix key schema for prefix scan case + val suffixKeySchema = keyStateEncoderSpec match { + case PrefixKeyScanStateEncoderSpec(schema, numColsPrefixKey) 
=> + Some(StructType(schema.drop(numColsPrefixKey))) + case _ => + None + } + + val keySerializer = getAvroSerializer(keySchema) + val keyDeserializer = getAvroDeserializer(keySchema) + + // Create the AvroEncoder with all components + AvroEncoder( + keySerializer, + keyDeserializer, + valueSerializer, + valueDeserializer, + suffixKeySchema.map(getAvroSerializer), + suffixKeySchema.map(getAvroDeserializer) + ) + } + + override def supportsSchemaEvolution: Boolean = true + + /** + * This method takes an UnsafeRow, and serializes to a byte array using Avro encoding. + */ + def encodeUnsafeRowToAvro( + row: UnsafeRow, + avroSerializer: AvroSerializer, + valueAvroType: Schema, + out: ByteArrayOutputStream): Array[Byte] = { + // InternalRow -> Avro.GenericDataRecord + val avroData = + avroSerializer.serialize(row) + out.reset() + val encoder = EncoderFactory.get().directBinaryEncoder(out, null) + val writer = new GenericDatumWriter[Any]( + valueAvroType) // Defining Avro writer for this struct type + writer.write(avroData, encoder) // Avro.GenericDataRecord -> byte array + encoder.flush() + out.toByteArray + } + + /** + * This method takes a byte array written using Avro encoding, and + * deserializes to an UnsafeRow using the Avro deserializer + */ + def decodeFromAvroToUnsafeRow( + valueBytes: Array[Byte], + avroDeserializer: AvroDeserializer, + valueAvroType: Schema, + valueProj: UnsafeProjection): UnsafeRow = { + if (valueBytes != null) { + val reader = new GenericDatumReader[Any](valueAvroType) + val decoder = DecoderFactory.get().binaryDecoder( + valueBytes, 0, valueBytes.length, null) + // bytes -> Avro.GenericDataRecord + val genericData = reader.read(null, decoder) + // Avro.GenericDataRecord -> InternalRow + val internalRow = avroDeserializer.deserialize( + genericData).orNull.asInstanceOf[InternalRow] + // InternalRow -> UnsafeRow + valueProj.apply(internalRow) + } else { + null + } + } + + private val out = new ByteArrayOutputStream + + override def 
encodeKey(row: UnsafeRow): Array[Byte] = { + keyStateEncoderSpec match { + case NoPrefixKeyStateEncoderSpec(_) => + val avroRow = + encodeUnsafeRowToAvro(row, avroEncoder.keySerializer, keyAvroType, out) + // prepend stateSchemaId to the Avro-encoded key portion for NoPrefixKeys + encodeWithStateSchemaId( + StateSchemaIdRow(stateSchemaInfo.get.keySchemaId, avroRow)) + case PrefixKeyScanStateEncoderSpec(_, _) => + encodeUnsafeRowToAvro(row, avroEncoder.keySerializer, prefixKeyAvroType, out) + case _ => throw unsupportedOperationForKeyStateEncoder("encodeKey") + } + } + + override def encodeRemainingKey(row: UnsafeRow): Array[Byte] = { + val avroRow = keyStateEncoderSpec match { + case PrefixKeyScanStateEncoderSpec(_, _) => + encodeUnsafeRowToAvro(row, avroEncoder.suffixKeySerializer.get, remainingKeyAvroType, out) + case RangeKeyScanStateEncoderSpec(_, _) => + encodeUnsafeRowToAvro(row, avroEncoder.keySerializer, remainingKeyAvroType, out) + case _ => throw unsupportedOperationForKeyStateEncoder("encodeRemainingKey") + } + // prepend stateSchemaId to the remaining key portion + encodeWithStateSchemaId( + StateSchemaIdRow(stateSchemaInfo.get.keySchemaId, avroRow)) + } + + /** + * Encodes an UnsafeRow into an Avro-compatible byte array format for range scan operations. + * + * This method transforms row data into a binary format that preserves ordering when + * used in range scans. 
+ * For each field in the row: + * - A marker byte is written to indicate null status or sign (for numeric types) + * - The value is written in big-endian format + * + * Special handling is implemented for: + * - Null values: marked with nullValMarker followed by zero bytes + * - Negative numbers: marked with negativeValMarker + * - Floating point numbers: bit manipulation to handle sign and NaN values correctly + * + * @param row The UnsafeRow to encode + * @param avroType The Avro schema defining the structure for encoding + * @return Array[Byte] containing the Avro-encoded data that preserves ordering for range scans + * @throws UnsupportedOperationException if a field's data type is not supported for range + * scan encoding + */ + override def encodePrefixKeyForRangeScan(row: UnsafeRow): Array[Byte] = { + val record = new GenericData.Record(rangeScanAvroType) + var fieldIdx = 0 + rangeScanKeyFieldsWithOrdinal.zipWithIndex.foreach { case (fieldWithOrdinal, idx) => + val field = fieldWithOrdinal._1 + val value = row.get(idx, field.dataType) + + // Create marker byte buffer + val markerBuffer = ByteBuffer.allocate(1) + markerBuffer.order(ByteOrder.BIG_ENDIAN) + + if (value == null) { + markerBuffer.put(nullValMarker) + record.put(fieldIdx, ByteBuffer.wrap(markerBuffer.array())) + record.put(fieldIdx + 1, ByteBuffer.wrap(new Array[Byte](field.dataType.defaultSize))) + } else { + field.dataType match { + case BooleanType => + markerBuffer.put(positiveValMarker) + record.put(fieldIdx, ByteBuffer.wrap(markerBuffer.array())) + val valueBuffer = ByteBuffer.allocate(1) + valueBuffer.put(if (value.asInstanceOf[Boolean]) 1.toByte else 0.toByte) + record.put(fieldIdx + 1, ByteBuffer.wrap(valueBuffer.array())) + + case ByteType => + val byteVal = value.asInstanceOf[Byte] + markerBuffer.put(if (byteVal < 0) negativeValMarker else positiveValMarker) + record.put(fieldIdx, ByteBuffer.wrap(markerBuffer.array())) + + val valueBuffer = ByteBuffer.allocate(1) + 
valueBuffer.order(ByteOrder.BIG_ENDIAN) + valueBuffer.put(byteVal) + record.put(fieldIdx + 1, ByteBuffer.wrap(valueBuffer.array())) + + case ShortType => + val shortVal = value.asInstanceOf[Short] + markerBuffer.put(if (shortVal < 0) negativeValMarker else positiveValMarker) + record.put(fieldIdx, ByteBuffer.wrap(markerBuffer.array())) + + val valueBuffer = ByteBuffer.allocate(2) + valueBuffer.order(ByteOrder.BIG_ENDIAN) + valueBuffer.putShort(shortVal) + record.put(fieldIdx + 1, ByteBuffer.wrap(valueBuffer.array())) + + case IntegerType => + val intVal = value.asInstanceOf[Int] + markerBuffer.put(if (intVal < 0) negativeValMarker else positiveValMarker) + record.put(fieldIdx, ByteBuffer.wrap(markerBuffer.array())) + + val valueBuffer = ByteBuffer.allocate(4) + valueBuffer.order(ByteOrder.BIG_ENDIAN) + valueBuffer.putInt(intVal) + record.put(fieldIdx + 1, ByteBuffer.wrap(valueBuffer.array())) + + case LongType => + val longVal = value.asInstanceOf[Long] + markerBuffer.put(if (longVal < 0) negativeValMarker else positiveValMarker) + record.put(fieldIdx, ByteBuffer.wrap(markerBuffer.array())) + + val valueBuffer = ByteBuffer.allocate(8) + valueBuffer.order(ByteOrder.BIG_ENDIAN) + valueBuffer.putLong(longVal) + record.put(fieldIdx + 1, ByteBuffer.wrap(valueBuffer.array())) + + case FloatType => + val floatVal = value.asInstanceOf[Float] + val rawBits = floatToRawIntBits(floatVal) + markerBuffer.put(if ((rawBits & floatSignBitMask) != 0) { + negativeValMarker + } else { + positiveValMarker + }) + record.put(fieldIdx, ByteBuffer.wrap(markerBuffer.array())) + + val valueBuffer = ByteBuffer.allocate(4) + valueBuffer.order(ByteOrder.BIG_ENDIAN) + if ((rawBits & floatSignBitMask) != 0) { + val updatedVal = rawBits ^ floatFlipBitMask + valueBuffer.putFloat(intBitsToFloat(updatedVal)) + } else { + valueBuffer.putFloat(floatVal) + } + record.put(fieldIdx + 1, ByteBuffer.wrap(valueBuffer.array())) + + case DoubleType => + val doubleVal = value.asInstanceOf[Double] + val rawBits 
= doubleToRawLongBits(doubleVal) + markerBuffer.put(if ((rawBits & doubleSignBitMask) != 0) { + negativeValMarker + } else { + positiveValMarker + }) + record.put(fieldIdx, ByteBuffer.wrap(markerBuffer.array())) + + val valueBuffer = ByteBuffer.allocate(8) + valueBuffer.order(ByteOrder.BIG_ENDIAN) + if ((rawBits & doubleSignBitMask) != 0) { + val updatedVal = rawBits ^ doubleFlipBitMask + valueBuffer.putDouble(longBitsToDouble(updatedVal)) + } else { + valueBuffer.putDouble(doubleVal) + } + record.put(fieldIdx + 1, ByteBuffer.wrap(valueBuffer.array())) + + case _ => throw new UnsupportedOperationException( + s"Range scan encoding not supported for data type: ${field.dataType}") + } + } + fieldIdx += 2 + } + + out.reset() + val writer = new GenericDatumWriter[GenericRecord](rangeScanAvroType) + val encoder = EncoderFactory.get().binaryEncoder(out, null) + writer.write(record, encoder) + encoder.flush() + out.toByteArray + } + + override def encodeValue(row: UnsafeRow): Array[Byte] = { + val avroRow = encodeUnsafeRowToAvro(row, avroEncoder.valueSerializer, valueAvroType, out) + // prepend stateSchemaId to the Avro-encoded value portion + encodeWithStateSchemaId(StateSchemaIdRow(stateSchemaInfo.get.valueSchemaId, avroRow)) + } + + override def decodeKey(bytes: Array[Byte]): UnsafeRow = { + keyStateEncoderSpec match { + case NoPrefixKeyStateEncoderSpec(_) => + val schemaIdRow = decodeStateSchemaIdRow(bytes) + decodeFromAvroToUnsafeRow( + schemaIdRow.bytes, avroEncoder.keyDeserializer, keyAvroType, keyProj) + case PrefixKeyScanStateEncoderSpec(_, _) => + decodeFromAvroToUnsafeRow( + bytes, avroEncoder.keyDeserializer, prefixKeyAvroType, prefixKeyProj) + case _ => throw unsupportedOperationForKeyStateEncoder("decodeKey") + } + } + + + override def decodeRemainingKey(bytes: Array[Byte]): UnsafeRow = { + val schemaIdRow = decodeStateSchemaIdRow(bytes) + keyStateEncoderSpec match { + case PrefixKeyScanStateEncoderSpec(_, _) => + decodeFromAvroToUnsafeRow(schemaIdRow.bytes, 
+ avroEncoder.suffixKeyDeserializer.get, remainingKeyAvroType, remainingKeyAvroProjection) + case RangeKeyScanStateEncoderSpec(_, _) => + decodeFromAvroToUnsafeRow( + schemaIdRow.bytes, + avroEncoder.keyDeserializer, remainingKeyAvroType, remainingKeyAvroProjection) + case _ => throw unsupportedOperationForKeyStateEncoder("decodeRemainingKey") + } + } + + /** + * Decodes an Avro-encoded byte array back into an UnsafeRow for range scan operations. + * + * This method reverses the encoding process performed by encodePrefixKeyForRangeScan: + * - Reads the marker byte to determine null status or sign + * - Reconstructs the original values from big-endian format + * - Handles special cases for floating point numbers by reversing bit manipulations + * + * The decoding process preserves the original data types and values, including: + * - Null values marked by nullValMarker + * - Sign information for numeric types + * - Proper restoration of negative floating point values + * + * @param bytes The Avro-encoded byte array to decode + * @param avroType The Avro schema defining the structure for decoding + * @return UnsafeRow containing the decoded data + * @throws UnsupportedOperationException if a field's data type is not supported for range + * scan decoding + */ + override def decodePrefixKeyForRangeScan(bytes: Array[Byte]): UnsafeRow = { + val reader = new GenericDatumReader[GenericRecord](rangeScanAvroType) + val decoder = DecoderFactory.get().binaryDecoder(bytes, 0, bytes.length, null) + val record = reader.read(null, decoder) + + val rowWriter = new UnsafeRowWriter(rangeScanKeyFieldsWithOrdinal.length) + rowWriter.resetRowWriter() + + var fieldIdx = 0 + rangeScanKeyFieldsWithOrdinal.zipWithIndex.foreach { case (fieldWithOrdinal, idx) => + val field = fieldWithOrdinal._1 + + val markerBytes = record.get(fieldIdx).asInstanceOf[ByteBuffer].array() + val markerBuf = ByteBuffer.wrap(markerBytes) + markerBuf.order(ByteOrder.BIG_ENDIAN) + val marker = markerBuf.get() + + if 
(marker == nullValMarker) { + rowWriter.setNullAt(idx) + } else { + field.dataType match { + case BooleanType => + val bytes = record.get(fieldIdx + 1).asInstanceOf[ByteBuffer].array() + rowWriter.write(idx, bytes(0) == 1) + + case ByteType => + val bytes = record.get(fieldIdx + 1).asInstanceOf[ByteBuffer].array() + val valueBuf = ByteBuffer.wrap(bytes) + valueBuf.order(ByteOrder.BIG_ENDIAN) + rowWriter.write(idx, valueBuf.get()) + + case ShortType => + val bytes = record.get(fieldIdx + 1).asInstanceOf[ByteBuffer].array() + val valueBuf = ByteBuffer.wrap(bytes) + valueBuf.order(ByteOrder.BIG_ENDIAN) + rowWriter.write(idx, valueBuf.getShort()) + + case IntegerType => + val bytes = record.get(fieldIdx + 1).asInstanceOf[ByteBuffer].array() + val valueBuf = ByteBuffer.wrap(bytes) + valueBuf.order(ByteOrder.BIG_ENDIAN) + rowWriter.write(idx, valueBuf.getInt()) + + case LongType => + val bytes = record.get(fieldIdx + 1).asInstanceOf[ByteBuffer].array() + val valueBuf = ByteBuffer.wrap(bytes) + valueBuf.order(ByteOrder.BIG_ENDIAN) + rowWriter.write(idx, valueBuf.getLong()) + + case FloatType => + val bytes = record.get(fieldIdx + 1).asInstanceOf[ByteBuffer].array() + val valueBuf = ByteBuffer.wrap(bytes) + valueBuf.order(ByteOrder.BIG_ENDIAN) + if (marker == negativeValMarker) { + val floatVal = valueBuf.getFloat + val updatedVal = floatToRawIntBits(floatVal) ^ floatFlipBitMask + rowWriter.write(idx, intBitsToFloat(updatedVal)) + } else { + rowWriter.write(idx, valueBuf.getFloat()) + } + + case DoubleType => + val bytes = record.get(fieldIdx + 1).asInstanceOf[ByteBuffer].array() + val valueBuf = ByteBuffer.wrap(bytes) + valueBuf.order(ByteOrder.BIG_ENDIAN) + if (marker == negativeValMarker) { + val doubleVal = valueBuf.getDouble + val updatedVal = doubleToRawLongBits(doubleVal) ^ doubleFlipBitMask + rowWriter.write(idx, longBitsToDouble(updatedVal)) + } else { + rowWriter.write(idx, valueBuf.getDouble()) + } + + case _ => throw new UnsupportedOperationException( + s"Range 
scan decoding not supported for data type: ${field.dataType}") + } + } + fieldIdx += 2 + } + + rowWriter.getRow() + } + + override def decodeValue(bytes: Array[Byte]): UnsafeRow = { + val schemaIdRow = decodeStateSchemaIdRow(bytes) + decodeFromAvroToUnsafeRow( + schemaIdRow.bytes, avroEncoder.valueDeserializer, valueAvroType, valueProj) + } +} + +/** + * Information about a RocksDB column family used for state storage. + * + * @param colFamilyName The name of the column family in RocksDB + * @param virtualColumnFamilyId A unique identifier for the virtual column family, + * used as a prefix in encoded state rows to distinguish + * between different column families + */ +case class ColumnFamilyInfo( + colFamilyName: String, + virtualColumnFamilyId: Short +) + +/** + * Metadata prefixes stored at the beginning of encoded state rows. + * These prefixes allow for schema evolution and column family organization + * in the state store. + * + * @param columnFamilyId Optional identifier for the virtual column family. + * When present, allows organizing state data into + * different column families in RocksDB. + */ +case class StateRowPrefix( + columnFamilyId: Option[Short] +) + +class StateRowPrefixEncoder( + useColumnFamilies: Boolean, + columnFamilyInfo: Option[ColumnFamilyInfo] +) { + + private val numColFamilyBytes = if (useColumnFamilies) { + VIRTUAL_COL_FAMILY_PREFIX_BYTES + } else { + 0 + } + + def getNumPrefixBytes: Int = numColFamilyBytes + + val out = new ByteArrayOutputStream + + /** + * Get Byte Array for the virtual column family id that is used as prefix for + * key state rows. 
+ */ + def getColumnFamilyIdBytes(): Array[Byte] = { + assert(useColumnFamilies, "Cannot return virtual Column Family Id Bytes" + + " because multiple Column is not supported for this encoder") + val encodedBytes = new Array[Byte](VIRTUAL_COL_FAMILY_PREFIX_BYTES) + val virtualColFamilyId = columnFamilyInfo.get.virtualColumnFamilyId + Platform.putShort(encodedBytes, Platform.BYTE_ARRAY_OFFSET, virtualColFamilyId) + encodedBytes + } + + /** + * Encodes a state row by adding schema and column family ID prefixes if enabled. + * + * @param data The byte array containing the data to be prefixed + * @return A new byte array containing the prefixed data. If no prefixing is needed + * (neither schema evolution nor column families are enabled), returns a copy + * of the input array to maintain consistency with the prefixed case. + */ + def encodeStateRowWithPrefix(data: Array[Byte]): Array[Byte] = { + // Create result array big enough for all prefixes plus data + val result = new Array[Byte](getNumPrefixBytes + data.length) + var offset = Platform.BYTE_ARRAY_OFFSET + + // Write column family ID if enabled + if (useColumnFamilies) { + val colFamilyId = columnFamilyInfo.get.virtualColumnFamilyId + Platform.putShort(result, offset, colFamilyId) + offset += VIRTUAL_COL_FAMILY_PREFIX_BYTES + } + + // Write the actual data + Platform.copyMemory( + data, Platform.BYTE_ARRAY_OFFSET, + result, offset, + data.length + ) + + result + } + + def decodeStateRowPrefix(stateRow: Array[Byte]): StateRowPrefix = { + var offset = Platform.BYTE_ARRAY_OFFSET + + // Read column family ID if present + val colFamilyId = if (useColumnFamilies) { + val id = Platform.getShort(stateRow, offset) + offset += VIRTUAL_COL_FAMILY_PREFIX_BYTES + Some(id) + } else { + None + } + + StateRowPrefix(colFamilyId) + } + + def decodeStateRowData(stateRow: Array[Byte]): Array[Byte] = { + val offset = Platform.BYTE_ARRAY_OFFSET + getNumPrefixBytes + + // Extract the actual data + val dataLength = stateRow.length - 
getNumPrefixBytes + val data = new Array[Byte](dataLength) + Platform.copyMemory( + stateRow, offset, + data, Platform.BYTE_ARRAY_OFFSET, + dataLength + ) + data + } +} + +/** + * Factory object for creating state encoders used by RocksDB state store. + * + * The encoders created by this object handle serialization and deserialization of state data, + * supporting both key and value encoding with various access patterns + * (e.g., prefix scan, range scan). + */ +object RocksDBStateEncoder extends Logging { + + /** + * Creates a key encoder based on the specified encoding strategy and configuration. + * + * @param dataEncoder The underlying encoder that handles the actual data encoding/decoding + * @param keyStateEncoderSpec Specification defining the key encoding strategy + * (no prefix, prefix scan, or range scan) + * @param useColumnFamilies Whether to use RocksDB column families for storage + * @param virtualColFamilyId Optional column family identifier when column families are enabled + * @return A configured RocksDBKeyStateEncoder instance + */ + def getKeyEncoder( + dataEncoder: RocksDBDataEncoder, + keyStateEncoderSpec: KeyStateEncoderSpec, + useColumnFamilies: Boolean, + columnFamilyInfo: Option[ColumnFamilyInfo] = None): RocksDBKeyStateEncoder = { + keyStateEncoderSpec.toEncoder(dataEncoder, useColumnFamilies, columnFamilyInfo) + } + + /** + * Creates a value encoder that supports either single or multiple values per key. 
+ * + * @param dataEncoder The underlying encoder that handles the actual data encoding/decoding + * @param valueSchema Schema defining the structure of values to be encoded + * @param useMultipleValuesPerKey If true, creates an encoder that can handle multiple values + * per key; if false, creates an encoder for single values + * @return A configured RocksDBValueStateEncoder instance + */ + def getValueEncoder( + dataEncoder: RocksDBDataEncoder, + valueSchema: StructType, + useMultipleValuesPerKey: Boolean): RocksDBValueStateEncoder = { + if (useMultipleValuesPerKey) { + new MultiValuedStateEncoder(dataEncoder, valueSchema) + } else { + new SingleValueStateEncoder(dataEncoder, valueSchema) } } - def decodeToUnsafeRow(bytes: Array[Byte], reusedRow: UnsafeRow): UnsafeRow = { - if (bytes != null) { - // Platform.BYTE_ARRAY_OFFSET is the recommended way refer to the 1st offset. See Platform. - reusedRow.pointTo( - bytes, - Platform.BYTE_ARRAY_OFFSET + STATE_ENCODING_NUM_VERSION_BYTES, - bytes.length - STATE_ENCODING_NUM_VERSION_BYTES) - reusedRow - } else { - null - } + /** + * Encodes a virtual column family ID into a byte array suitable for RocksDB. + * + * This method creates a fixed-size byte array prefixed with the virtual column family ID, + * which is used to partition data within RocksDB. 
+ * + * @param virtualColFamilyId The column family identifier to encode + * @return A byte array containing the encoded column family ID + */ + def getColumnFamilyIdBytes(virtualColFamilyId: Short): Array[Byte] = { + val encodedBytes = new Array[Byte](VIRTUAL_COL_FAMILY_PREFIX_BYTES) + Platform.putShort(encodedBytes, Platform.BYTE_ARRAY_OFFSET, virtualColFamilyId) + encodedBytes } } /** * RocksDB Key Encoder for UnsafeRow that supports prefix scan * + * @param dataEncoder - the encoder that handles actual encoding/decoding of data * @param keySchema - schema of the key to be encoded * @param numColsPrefixKey - number of columns to be used for prefix key * @param useColumnFamilies - if column family is enabled for this encoder */ class PrefixKeyScanStateEncoder( + dataEncoder: RocksDBDataEncoder, keySchema: StructType, numColsPrefixKey: Int, useColumnFamilies: Boolean = false, - virtualColFamilyId: Option[Short] = None) - extends RocksDBKeyStateEncoderBase(useColumnFamilies, virtualColFamilyId) { - - import RocksDBStateEncoder._ + columnFamilyInfo: Option[ColumnFamilyInfo] = None) + extends StateRowPrefixEncoder( + useColumnFamilies, + columnFamilyInfo + ) with RocksDBKeyStateEncoder with Logging { private val prefixKeyFieldsWithIdx: Seq[(StructField, Int)] = { keySchema.zipWithIndex.take(numColsPrefixKey) @@ -210,43 +1234,53 @@ class PrefixKeyScanStateEncoder( private val joinedRowOnKey = new JoinedRow() override def encodeKey(row: UnsafeRow): Array[Byte] = { - val prefixKeyEncoded = encodeUnsafeRow(extractPrefixKey(row)) - val remainingEncoded = encodeUnsafeRow(remainingKeyProjection(row)) + // First encode prefix and remaining key parts + val prefixKeyEncoded = dataEncoder.encodeKey(extractPrefixKey(row)) + val remainingEncoded = dataEncoder.encodeRemainingKey(remainingKeyProjection(row)) - val (encodedBytes, startingOffset) = encodeColumnFamilyPrefix( - prefixKeyEncoded.length + remainingEncoded.length + 4 + // Combine prefix key and remaining key into single 
array + val combinedData = new Array[Byte](4 + prefixKeyEncoded.length + remainingEncoded.length) + Platform.putInt(combinedData, Platform.BYTE_ARRAY_OFFSET, prefixKeyEncoded.length) + Platform.copyMemory( + prefixKeyEncoded, Platform.BYTE_ARRAY_OFFSET, + combinedData, Platform.BYTE_ARRAY_OFFSET + 4, + prefixKeyEncoded.length + ) + Platform.copyMemory( + remainingEncoded, Platform.BYTE_ARRAY_OFFSET, + combinedData, Platform.BYTE_ARRAY_OFFSET + 4 + prefixKeyEncoded.length, + remainingEncoded.length ) - Platform.putInt(encodedBytes, startingOffset, prefixKeyEncoded.length) - Platform.copyMemory(prefixKeyEncoded, Platform.BYTE_ARRAY_OFFSET, - encodedBytes, startingOffset + 4, prefixKeyEncoded.length) - // NOTE: We don't put the length of remainingEncoded as we can calculate later - // on deserialization. - Platform.copyMemory(remainingEncoded, Platform.BYTE_ARRAY_OFFSET, - encodedBytes, startingOffset + 4 + prefixKeyEncoded.length, - remainingEncoded.length) - - encodedBytes + // Add state row prefix using encoder + encodeStateRowWithPrefix(combinedData) } override def decodeKey(keyBytes: Array[Byte]): UnsafeRow = { - val prefixKeyEncodedLen = Platform.getInt(keyBytes, decodeKeyStartOffset) - val prefixKeyEncoded = new Array[Byte](prefixKeyEncodedLen) - Platform.copyMemory(keyBytes, decodeKeyStartOffset + 4, - prefixKeyEncoded, Platform.BYTE_ARRAY_OFFSET, prefixKeyEncodedLen) + // First decode the metadata prefixes and get the actual key data + val keyData = decodeStateRowData(keyBytes) - // Here we calculate the remainingKeyEncodedLen leveraging the length of keyBytes - val remainingKeyEncodedLen = keyBytes.length - 4 - prefixKeyEncodedLen - - offsetForColFamilyPrefix + // Get prefix key length from the start of the actual key data + val prefixKeyEncodedLen = Platform.getInt(keyData, Platform.BYTE_ARRAY_OFFSET) + val prefixKeyEncoded = new Array[Byte](prefixKeyEncodedLen) + Platform.copyMemory( + keyData, Platform.BYTE_ARRAY_OFFSET + 4, + prefixKeyEncoded, 
Platform.BYTE_ARRAY_OFFSET, + prefixKeyEncodedLen + ) + // Calculate remaining key length and extract it + val remainingKeyEncodedLen = keyData.length - 4 - prefixKeyEncodedLen val remainingKeyEncoded = new Array[Byte](remainingKeyEncodedLen) - Platform.copyMemory(keyBytes, decodeKeyStartOffset + 4 + prefixKeyEncodedLen, - remainingKeyEncoded, Platform.BYTE_ARRAY_OFFSET, remainingKeyEncodedLen) - - val prefixKeyDecoded = decodeToUnsafeRow(prefixKeyEncoded, numFields = numColsPrefixKey) - val remainingKeyDecoded = decodeToUnsafeRow(remainingKeyEncoded, - numFields = keySchema.length - numColsPrefixKey) + Platform.copyMemory( + keyData, Platform.BYTE_ARRAY_OFFSET + 4 + prefixKeyEncodedLen, + remainingKeyEncoded, Platform.BYTE_ARRAY_OFFSET, + remainingKeyEncodedLen + ) + // Decode both parts and combine + val prefixKeyDecoded = dataEncoder.decodeKey(prefixKeyEncoded) + val remainingKeyDecoded = dataEncoder.decodeRemainingKey(remainingKeyEncoded) restoreKeyProjection(joinedRowOnKey.withLeft(prefixKeyDecoded).withRight(remainingKeyDecoded)) } @@ -255,15 +1289,19 @@ class PrefixKeyScanStateEncoder( } override def encodePrefixKey(prefixKey: UnsafeRow): Array[Byte] = { - val prefixKeyEncoded = encodeUnsafeRow(prefixKey) - val (prefix, startingOffset) = encodeColumnFamilyPrefix( - prefixKeyEncoded.length + 4 + // First encode the prefix key part + val prefixKeyEncoded = dataEncoder.encodeKey(prefixKey) + + // Create array with length prefix + val dataWithLength = new Array[Byte](4 + prefixKeyEncoded.length) + Platform.putInt(dataWithLength, Platform.BYTE_ARRAY_OFFSET, prefixKeyEncoded.length) + Platform.copyMemory( + prefixKeyEncoded, Platform.BYTE_ARRAY_OFFSET, + dataWithLength, Platform.BYTE_ARRAY_OFFSET + 4, + prefixKeyEncoded.length ) - Platform.putInt(prefix, startingOffset, prefixKeyEncoded.length) - Platform.copyMemory(prefixKeyEncoded, Platform.BYTE_ARRAY_OFFSET, prefix, - startingOffset + 4, prefixKeyEncoded.length) - prefix + 
encodeStateRowWithPrefix(dataWithLength) } override def supportPrefixKeyScan: Boolean = true @@ -296,18 +1334,21 @@ class PrefixKeyScanStateEncoder( * the right lexicographical ordering. For the rationale around this, please check the link * here: https://en.wikipedia.org/wiki/IEEE_754#Design_rationale * + * @param dataEncoder - the encoder that handles the actual encoding/decoding of data * @param keySchema - schema of the key to be encoded * @param orderingOrdinals - the ordinals for which the range scan is constructed * @param useColumnFamilies - if column family is enabled for this encoder */ class RangeKeyScanStateEncoder( + dataEncoder: RocksDBDataEncoder, keySchema: StructType, orderingOrdinals: Seq[Int], useColumnFamilies: Boolean = false, - virtualColFamilyId: Option[Short] = None) - extends RocksDBKeyStateEncoderBase(useColumnFamilies, virtualColFamilyId) { - - import RocksDBStateEncoder._ + columnFamilyInfo: Option[ColumnFamilyInfo] = None) + extends StateRowPrefixEncoder( + useColumnFamilies, + columnFamilyInfo + ) with RocksDBKeyStateEncoder with Logging { private val rangeScanKeyFieldsWithOrdinal: Seq[(StructField, Int)] = { orderingOrdinals.map { ordinal => @@ -381,266 +1422,88 @@ class RangeKeyScanStateEncoder( rangeScanKeyProjection(key) } - // bit masks used for checking sign or flipping all bits for negative float/double values - private val floatFlipBitMask = 0xFFFFFFFF - private val floatSignBitMask = 0x80000000 - - private val doubleFlipBitMask = 0xFFFFFFFFFFFFFFFFL - private val doubleSignBitMask = 0x8000000000000000L - - // Byte markers used to identify whether the value is null, negative or positive - // To ensure sorted ordering, we use the lowest byte value for negative numbers followed by - // positive numbers and then null values. 
- private val negativeValMarker: Byte = 0x00.toByte - private val positiveValMarker: Byte = 0x01.toByte - private val nullValMarker: Byte = 0x02.toByte - - // Rewrite the unsafe row by replacing fixed size fields with BIG_ENDIAN encoding - // using byte arrays. - // To handle "null" values, we prepend a byte to the byte array indicating whether the value - // is null or not. If the value is null, we write the null byte followed by zero bytes. - // If the value is not null, we write the null byte followed by the value. - // Note that setting null for the index on the unsafeRow is not feasible as it would change - // the sorting order on iteration. - // Also note that the same byte is used to indicate whether the value is negative or not. - private def encodePrefixKeyForRangeScan(row: UnsafeRow): UnsafeRow = { - val writer = new UnsafeRowWriter(orderingOrdinals.length) - writer.resetRowWriter() - rangeScanKeyFieldsWithOrdinal.zipWithIndex.foreach { case (fieldWithOrdinal, idx) => - val field = fieldWithOrdinal._1 - val value = row.get(idx, field.dataType) - // Note that we cannot allocate a smaller buffer here even if the value is null - // because the effective byte array is considered variable size and needs to have - // the same size across all rows for the ordering to work as expected. 
- val bbuf = ByteBuffer.allocate(field.dataType.defaultSize + 1) - bbuf.order(ByteOrder.BIG_ENDIAN) - if (value == null) { - bbuf.put(nullValMarker) - writer.write(idx, bbuf.array()) - } else { - field.dataType match { - case BooleanType => - case ByteType => - val byteVal = value.asInstanceOf[Byte] - val signCol = if (byteVal < 0) { - negativeValMarker - } else { - positiveValMarker - } - bbuf.put(signCol) - bbuf.put(byteVal) - writer.write(idx, bbuf.array()) - - case ShortType => - val shortVal = value.asInstanceOf[Short] - val signCol = if (shortVal < 0) { - negativeValMarker - } else { - positiveValMarker - } - bbuf.put(signCol) - bbuf.putShort(shortVal) - writer.write(idx, bbuf.array()) - - case IntegerType => - val intVal = value.asInstanceOf[Int] - val signCol = if (intVal < 0) { - negativeValMarker - } else { - positiveValMarker - } - bbuf.put(signCol) - bbuf.putInt(intVal) - writer.write(idx, bbuf.array()) - - case LongType => - val longVal = value.asInstanceOf[Long] - val signCol = if (longVal < 0) { - negativeValMarker - } else { - positiveValMarker - } - bbuf.put(signCol) - bbuf.putLong(longVal) - writer.write(idx, bbuf.array()) - - case FloatType => - val floatVal = value.asInstanceOf[Float] - val rawBits = floatToRawIntBits(floatVal) - // perform sign comparison using bit manipulation to ensure NaN values are handled - // correctly - if ((rawBits & floatSignBitMask) != 0) { - // for negative values, we need to flip all the bits to ensure correct ordering - val updatedVal = rawBits ^ floatFlipBitMask - bbuf.put(negativeValMarker) - // convert the bits back to float - bbuf.putFloat(intBitsToFloat(updatedVal)) - } else { - bbuf.put(positiveValMarker) - bbuf.putFloat(floatVal) - } - writer.write(idx, bbuf.array()) - - case DoubleType => - val doubleVal = value.asInstanceOf[Double] - val rawBits = doubleToRawLongBits(doubleVal) - // perform sign comparison using bit manipulation to ensure NaN values are handled - // correctly - if ((rawBits & 
doubleSignBitMask) != 0) { - // for negative values, we need to flip all the bits to ensure correct ordering - val updatedVal = rawBits ^ doubleFlipBitMask - bbuf.put(negativeValMarker) - // convert the bits back to double - bbuf.putDouble(longBitsToDouble(updatedVal)) - } else { - bbuf.put(positiveValMarker) - bbuf.putDouble(doubleVal) - } - writer.write(idx, bbuf.array()) - } - } - } - writer.getRow() - } - - // Rewrite the unsafe row by converting back from BIG_ENDIAN byte arrays to the - // original data types. - // For decode, we extract the byte array from the UnsafeRow, and then read the first byte - // to determine if the value is null or not. If the value is null, we set the ordinal on - // the UnsafeRow to null. If the value is not null, we read the rest of the bytes to get the - // actual value. - // For negative float/double values, we need to flip all the bits back to get the original value. - private def decodePrefixKeyForRangeScan(row: UnsafeRow): UnsafeRow = { - val writer = new UnsafeRowWriter(orderingOrdinals.length) - writer.resetRowWriter() - rangeScanKeyFieldsWithOrdinal.zipWithIndex.foreach { case (fieldWithOrdinal, idx) => - val field = fieldWithOrdinal._1 - - val value = row.getBinary(idx) - val bbuf = ByteBuffer.wrap(value.asInstanceOf[Array[Byte]]) - bbuf.order(ByteOrder.BIG_ENDIAN) - val isNullOrSignCol = bbuf.get() - if (isNullOrSignCol == nullValMarker) { - // set the column to null and skip reading the next byte(s) - writer.setNullAt(idx) - } else { - field.dataType match { - case BooleanType => - case ByteType => - writer.write(idx, bbuf.get) - - case ShortType => - writer.write(idx, bbuf.getShort) - - case IntegerType => - writer.write(idx, bbuf.getInt) - - case LongType => - writer.write(idx, bbuf.getLong) - - case FloatType => - if (isNullOrSignCol == negativeValMarker) { - // if the number is negative, get the raw binary bits for the float - // and flip the bits back - val updatedVal = floatToRawIntBits(bbuf.getFloat) ^ 
floatFlipBitMask - writer.write(idx, intBitsToFloat(updatedVal)) - } else { - writer.write(idx, bbuf.getFloat) - } - - case DoubleType => - if (isNullOrSignCol == negativeValMarker) { - // if the number is negative, get the raw binary bits for the double - // and flip the bits back - val updatedVal = doubleToRawLongBits(bbuf.getDouble) ^ doubleFlipBitMask - writer.write(idx, longBitsToDouble(updatedVal)) - } else { - writer.write(idx, bbuf.getDouble) - } - } - } - } - writer.getRow() - } - override def encodeKey(row: UnsafeRow): Array[Byte] = { - // This prefix key has the columns specified by orderingOrdinals + // First encode the range scan ordered prefix val prefixKey = extractPrefixKey(row) - val rangeScanKeyEncoded = encodeUnsafeRow(encodePrefixKeyForRangeScan(prefixKey)) + val rangeScanKeyEncoded = dataEncoder.encodePrefixKeyForRangeScan(prefixKey) - val result = if (orderingOrdinals.length < keySchema.length) { - val remainingEncoded = encodeUnsafeRow(remainingKeyProjection(row)) - val (encodedBytes, startingOffset) = encodeColumnFamilyPrefix( - rangeScanKeyEncoded.length + remainingEncoded.length + 4 - ) + // We have remaining key parts to encode + val remainingEncoded = dataEncoder.encodeRemainingKey(remainingKeyProjection(row)) - Platform.putInt(encodedBytes, startingOffset, - rangeScanKeyEncoded.length) - Platform.copyMemory(rangeScanKeyEncoded, Platform.BYTE_ARRAY_OFFSET, - encodedBytes, startingOffset + 4, rangeScanKeyEncoded.length) - // NOTE: We don't put the length of remainingEncoded as we can calculate later - // on deserialization. - Platform.copyMemory(remainingEncoded, Platform.BYTE_ARRAY_OFFSET, - encodedBytes, startingOffset + 4 + rangeScanKeyEncoded.length, - remainingEncoded.length) - encodedBytes - } else { - // if the num of ordering cols is same as num of key schema cols, we don't need to - // encode the remaining key as it's empty. 
- val (encodedBytes, startingOffset) = encodeColumnFamilyPrefix( - rangeScanKeyEncoded.length + 4 - ) + // Combine range scan key and remaining key with length prefix + val combinedData = new Array[Byte](4 + rangeScanKeyEncoded.length + remainingEncoded.length) - Platform.putInt(encodedBytes, startingOffset, - rangeScanKeyEncoded.length) - Platform.copyMemory(rangeScanKeyEncoded, Platform.BYTE_ARRAY_OFFSET, - encodedBytes, startingOffset + 4, rangeScanKeyEncoded.length) - encodedBytes - } - result + // Write length of range scan key + Platform.putInt(combinedData, Platform.BYTE_ARRAY_OFFSET, rangeScanKeyEncoded.length) + + // Write range scan key + Platform.copyMemory( + rangeScanKeyEncoded, Platform.BYTE_ARRAY_OFFSET, + combinedData, Platform.BYTE_ARRAY_OFFSET + 4, + rangeScanKeyEncoded.length + ) + // Write remaining key + Platform.copyMemory( + remainingEncoded, Platform.BYTE_ARRAY_OFFSET, + combinedData, Platform.BYTE_ARRAY_OFFSET + 4 + rangeScanKeyEncoded.length, + remainingEncoded.length + ) + + encodeStateRowWithPrefix(combinedData) } override def decodeKey(keyBytes: Array[Byte]): UnsafeRow = { - val prefixKeyEncodedLen = Platform.getInt(keyBytes, decodeKeyStartOffset) + // First decode metadata prefixes to get the actual key data + val keyData = decodeStateRowData(keyBytes) + + // Get range scan key length and extract it + val prefixKeyEncodedLen = Platform.getInt(keyData, Platform.BYTE_ARRAY_OFFSET) val prefixKeyEncoded = new Array[Byte](prefixKeyEncodedLen) - Platform.copyMemory(keyBytes, decodeKeyStartOffset + 4, - prefixKeyEncoded, Platform.BYTE_ARRAY_OFFSET, prefixKeyEncodedLen) + Platform.copyMemory( + keyData, Platform.BYTE_ARRAY_OFFSET + 4, + prefixKeyEncoded, Platform.BYTE_ARRAY_OFFSET, + prefixKeyEncodedLen + ) - val prefixKeyDecodedForRangeScan = decodeToUnsafeRow(prefixKeyEncoded, - numFields = orderingOrdinals.length) - val prefixKeyDecoded = decodePrefixKeyForRangeScan(prefixKeyDecodedForRangeScan) + // Decode the range scan prefix key + val 
prefixKeyDecoded = dataEncoder.decodePrefixKeyForRangeScan(prefixKeyEncoded) if (orderingOrdinals.length < keySchema.length) { - // Here we calculate the remainingKeyEncodedLen leveraging the length of keyBytes - val remainingKeyEncodedLen = keyBytes.length - 4 - - prefixKeyEncodedLen - offsetForColFamilyPrefix - + // We have remaining key parts to decode + val remainingKeyEncodedLen = keyData.length - 4 - prefixKeyEncodedLen val remainingKeyEncoded = new Array[Byte](remainingKeyEncodedLen) - Platform.copyMemory(keyBytes, decodeKeyStartOffset + 4 + prefixKeyEncodedLen, + Platform.copyMemory( + keyData, Platform.BYTE_ARRAY_OFFSET + 4 + prefixKeyEncodedLen, remainingKeyEncoded, Platform.BYTE_ARRAY_OFFSET, - remainingKeyEncodedLen) + remainingKeyEncodedLen + ) - val remainingKeyDecoded = decodeToUnsafeRow(remainingKeyEncoded, - numFields = keySchema.length - orderingOrdinals.length) + // Decode remaining key + val remainingKeyDecoded = dataEncoder.decodeRemainingKey(remainingKeyEncoded) + // Combine the parts and restore full key val joined = joinedRowOnKey.withLeft(prefixKeyDecoded).withRight(remainingKeyDecoded) - val restored = restoreKeyProjection(joined) - restored + restoreKeyProjection(joined) } else { - // if the number of ordering cols is same as the number of key schema cols, we only - // return the prefix key decoded unsafe row. 
+ // No remaining key parts - return just the prefix key prefixKeyDecoded } } override def encodePrefixKey(prefixKey: UnsafeRow): Array[Byte] = { - val rangeScanKeyEncoded = encodeUnsafeRow(encodePrefixKeyForRangeScan(prefixKey)) - val (prefix, startingOffset) = encodeColumnFamilyPrefix(rangeScanKeyEncoded.length + 4) + // First encode the range scan ordered prefix + val rangeScanKeyEncoded = dataEncoder.encodePrefixKeyForRangeScan(prefixKey) + + // Add length prefix + val dataWithLength = new Array[Byte](4 + rangeScanKeyEncoded.length) + Platform.putInt(dataWithLength, Platform.BYTE_ARRAY_OFFSET, rangeScanKeyEncoded.length) + Platform.copyMemory( + rangeScanKeyEncoded, Platform.BYTE_ARRAY_OFFSET, + dataWithLength, Platform.BYTE_ARRAY_OFFSET + 4, + rangeScanKeyEncoded.length + ) - Platform.putInt(prefix, startingOffset, rangeScanKeyEncoded.length) - Platform.copyMemory(rangeScanKeyEncoded, Platform.BYTE_ARRAY_OFFSET, - prefix, startingOffset + 4, rangeScanKeyEncoded.length) - prefix + encodeStateRowWithPrefix(dataWithLength) } override def supportPrefixKeyScan: Boolean = true @@ -659,53 +1522,56 @@ class RangeKeyScanStateEncoder( * then the generated array byte will be N+1 bytes. 
*/ class NoPrefixKeyStateEncoder( + dataEncoder: RocksDBDataEncoder, keySchema: StructType, useColumnFamilies: Boolean = false, - virtualColFamilyId: Option[Short] = None) - extends RocksDBKeyStateEncoderBase(useColumnFamilies, virtualColFamilyId) { - - import RocksDBStateEncoder._ - - // Reusable objects - private val keyRow = new UnsafeRow(keySchema.size) + columnFamilyInfo: Option[ColumnFamilyInfo] = None) + extends StateRowPrefixEncoder( + useColumnFamilies, + columnFamilyInfo + ) with RocksDBKeyStateEncoder with Logging { override def encodeKey(row: UnsafeRow): Array[Byte] = { if (!useColumnFamilies) { - encodeUnsafeRow(row) + encodeStateRowWithPrefix(dataEncoder.encodeKey(row)) } else { - val bytesToEncode = row.getBytes - val (encodedBytes, startingOffset) = encodeColumnFamilyPrefix( - bytesToEncode.length + - STATE_ENCODING_NUM_VERSION_BYTES - ) + // First encode the row with the data encoder + val rowBytes = dataEncoder.encodeKey(row) - Platform.putByte(encodedBytes, startingOffset, STATE_ENCODING_VERSION) - // Platform.BYTE_ARRAY_OFFSET is the recommended way to memcopy b/w byte arrays. See Platform. + // Create data array with version byte + val dataWithVersion = new Array[Byte](STATE_ENCODING_NUM_VERSION_BYTES + rowBytes.length) + Platform.putByte(dataWithVersion, Platform.BYTE_ARRAY_OFFSET, STATE_ENCODING_VERSION) Platform.copyMemory( - bytesToEncode, Platform.BYTE_ARRAY_OFFSET, - encodedBytes, startingOffset + STATE_ENCODING_NUM_VERSION_BYTES, bytesToEncode.length) - encodedBytes + rowBytes, Platform.BYTE_ARRAY_OFFSET, + dataWithVersion, Platform.BYTE_ARRAY_OFFSET + STATE_ENCODING_NUM_VERSION_BYTES, + rowBytes.length + ) + + encodeStateRowWithPrefix(dataWithVersion) } } - /** - * Decode byte array for a key to a UnsafeRow. - * @note The UnsafeRow returned is reused across calls, and the UnsafeRow just points to - * the given byte array. 
- */ override def decodeKey(keyBytes: Array[Byte]): UnsafeRow = { - if (useColumnFamilies) { - if (keyBytes != null) { - // Platform.BYTE_ARRAY_OFFSET is the recommended way refer to the 1st offset. See Platform. - keyRow.pointTo( - keyBytes, - decodeKeyStartOffset + STATE_ENCODING_NUM_VERSION_BYTES, - keyBytes.length - STATE_ENCODING_NUM_VERSION_BYTES - VIRTUAL_COL_FAMILY_PREFIX_BYTES) - keyRow - } else { - null - } - } else decodeToUnsafeRow(keyBytes, keyRow) + if (!useColumnFamilies) { + dataEncoder.decodeKey(decodeStateRowData(keyBytes)) + } else if (keyBytes == null) { + null + } else { + // First decode the metadata prefixes + val dataWithVersion = decodeStateRowData(keyBytes) + + // Skip version byte to get to actual data + val dataLength = dataWithVersion.length - STATE_ENCODING_NUM_VERSION_BYTES + + // Extract data bytes and decode using data encoder + val dataBytes = new Array[Byte](dataLength) + Platform.copyMemory( + dataWithVersion, Platform.BYTE_ARRAY_OFFSET + STATE_ENCODING_NUM_VERSION_BYTES, + dataBytes, Platform.BYTE_ARRAY_OFFSET, + dataLength + ) + dataEncoder.decodeKey(dataBytes) + } } override def supportPrefixKeyScan: Boolean = false @@ -728,35 +1594,42 @@ class NoPrefixKeyStateEncoder( * merged in RocksDB using merge operation, and all merged values can be read using decodeValues * operation. 
*/ -class MultiValuedStateEncoder(valueSchema: StructType) +class MultiValuedStateEncoder( + dataEncoder: RocksDBDataEncoder, + valueSchema: StructType) extends RocksDBValueStateEncoder with Logging { - import RocksDBStateEncoder._ - - // Reusable objects - private val valueRow = new UnsafeRow(valueSchema.size) - override def encodeValue(row: UnsafeRow): Array[Byte] = { - val bytes = encodeUnsafeRow(row) - val numBytes = bytes.length + // First encode the row using either Avro or UnsafeRow encoding + val rowBytes = dataEncoder.encodeValue(row) - val encodedBytes = new Array[Byte](java.lang.Integer.BYTES + bytes.length) - Platform.putInt(encodedBytes, Platform.BYTE_ARRAY_OFFSET, numBytes) - Platform.copyMemory(bytes, Platform.BYTE_ARRAY_OFFSET, - encodedBytes, java.lang.Integer.BYTES + Platform.BYTE_ARRAY_OFFSET, bytes.length) + // Create data array with length prefix + val dataWithLength = new Array[Byte](java.lang.Integer.BYTES + rowBytes.length) + Platform.putInt(dataWithLength, Platform.BYTE_ARRAY_OFFSET, rowBytes.length) + Platform.copyMemory( + rowBytes, Platform.BYTE_ARRAY_OFFSET, + dataWithLength, Platform.BYTE_ARRAY_OFFSET + java.lang.Integer.BYTES, + rowBytes.length + ) - encodedBytes + dataWithLength } override def decodeValue(valueBytes: Array[Byte]): UnsafeRow = { if (valueBytes == null) { null } else { - val numBytes = Platform.getInt(valueBytes, Platform.BYTE_ARRAY_OFFSET) + // First decode the metadata prefixes + val dataWithLength = valueBytes + // Get the value length and extract value bytes + val numBytes = Platform.getInt(dataWithLength, Platform.BYTE_ARRAY_OFFSET) val encodedValue = new Array[Byte](numBytes) - Platform.copyMemory(valueBytes, java.lang.Integer.BYTES + Platform.BYTE_ARRAY_OFFSET, - encodedValue, Platform.BYTE_ARRAY_OFFSET, numBytes) - decodeToUnsafeRow(encodedValue, valueRow) + Platform.copyMemory( + dataWithLength, Platform.BYTE_ARRAY_OFFSET + java.lang.Integer.BYTES, + encodedValue, Platform.BYTE_ARRAY_OFFSET, + numBytes + ) + 
dataEncoder.decodeValue(encodedValue) } } @@ -768,21 +1641,23 @@ class MultiValuedStateEncoder(valueSchema: StructType) private var pos: Int = Platform.BYTE_ARRAY_OFFSET private val maxPos = Platform.BYTE_ARRAY_OFFSET + valueBytes.length - override def hasNext: Boolean = { - pos < maxPos - } + override def hasNext: Boolean = pos < maxPos override def next(): UnsafeRow = { + // Get value length val numBytes = Platform.getInt(valueBytes, pos) - pos += java.lang.Integer.BYTES - val encodedValue = new Array[Byte](numBytes) - Platform.copyMemory(valueBytes, pos, - encodedValue, Platform.BYTE_ARRAY_OFFSET, numBytes) + // Extract value bytes + val encodedValue = new Array[Byte](numBytes) + Platform.copyMemory( + valueBytes, pos, + encodedValue, Platform.BYTE_ARRAY_OFFSET, + numBytes + ) pos += numBytes pos += 1 // eat the delimiter character - decodeToUnsafeRow(encodedValue, valueRow) + dataEncoder.decodeValue(encodedValue) } } } @@ -803,24 +1678,23 @@ class MultiValuedStateEncoder(valueSchema: StructType) * (offset 0 is the version byte of value 0). That is, if the unsafe row has N bytes, * then the generated array byte will be N+1 bytes. */ -class SingleValueStateEncoder(valueSchema: StructType) - extends RocksDBValueStateEncoder { - - import RocksDBStateEncoder._ - - // Reusable objects - private val valueRow = new UnsafeRow(valueSchema.size) +class SingleValueStateEncoder( + dataEncoder: RocksDBDataEncoder, + valueSchema: StructType) + extends RocksDBValueStateEncoder with Logging { - override def encodeValue(row: UnsafeRow): Array[Byte] = encodeUnsafeRow(row) + override def encodeValue(row: UnsafeRow): Array[Byte] = { + dataEncoder.encodeValue(row) + } - /** - * Decode byte array for a value to a UnsafeRow. - * - * @note The UnsafeRow returned is reused across calls, and the UnsafeRow just points to - * the given byte array. 
- */ override def decodeValue(valueBytes: Array[Byte]): UnsafeRow = { - decodeToUnsafeRow(valueBytes, valueRow) + if (valueBytes == null) { + return null + } + // First decode the metadata prefixes + val data = valueBytes + // Decode the actual value using either Avro or UnsafeRow + dataEncoder.decodeValue(data) } override def supportsMultipleValuesPerKey: Boolean = false diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/RocksDBStateStoreProvider.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/RocksDBStateStoreProvider.scala index 1fc6ab5910c6c..9fc48a60d7c6a 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/RocksDBStateStoreProvider.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/RocksDBStateStoreProvider.scala @@ -18,7 +18,8 @@ package org.apache.spark.sql.execution.streaming.state import java.io._ -import java.util.concurrent.ConcurrentHashMap +import java.util.UUID +import java.util.concurrent.{ConcurrentHashMap, TimeUnit} import scala.util.control.NonFatal @@ -31,9 +32,9 @@ import org.apache.spark.internal.LogKeys._ import org.apache.spark.io.CompressionCodec import org.apache.spark.sql.catalyst.expressions.UnsafeRow import org.apache.spark.sql.errors.QueryExecutionErrors -import org.apache.spark.sql.execution.streaming.CheckpointFileManager +import org.apache.spark.sql.execution.streaming.{CheckpointFileManager, StreamExecution} import org.apache.spark.sql.types.StructType -import org.apache.spark.util.Utils +import org.apache.spark.util.{NonFateSharingCache, Utils} private[sql] class RocksDBStateStoreProvider extends StateStoreProvider with Logging with Closeable @@ -74,10 +75,30 @@ private[sql] class RocksDBStateStoreProvider isInternal: Boolean = false): Unit = { verifyColFamilyCreationOrDeletion("create_col_family", colFamilyName, isInternal) val newColFamilyId = rocksDB.createColFamilyIfAbsent(colFamilyName) - 
keyValueEncoderMap.putIfAbsent(colFamilyName, - (RocksDBStateEncoder.getKeyEncoder(keyStateEncoderSpec, useColumnFamilies, - Some(newColFamilyId)), RocksDBStateEncoder.getValueEncoder(valueSchema, - useMultipleValuesPerKey))) + val dataEncoderCacheKey = StateRowEncoderCacheKey( + queryRunId = getRunId(hadoopConf), + operatorId = stateStoreId.operatorId, + partitionId = stateStoreId.partitionId, + stateStoreName = stateStoreId.storeName, + colFamilyName = colFamilyName) + + val dataEncoder = getDataEncoder( + stateStoreEncoding, dataEncoderCacheKey, keyStateEncoderSpec, valueSchema) + + val columnFamilyInfo = Some(ColumnFamilyInfo(colFamilyName, newColFamilyId)) + + val keyEncoder = RocksDBStateEncoder.getKeyEncoder( + dataEncoder, + keyStateEncoderSpec, + useColumnFamilies, + columnFamilyInfo + ) + val valueEncoder = RocksDBStateEncoder.getValueEncoder( + dataEncoder, + valueSchema, + useMultipleValuesPerKey + ) + keyValueEncoderMap.putIfAbsent(colFamilyName, (keyEncoder, valueEncoder)) } override def get(key: UnsafeRow, colFamilyName: String): UnsafeRow = { @@ -364,6 +385,7 @@ private[sql] class RocksDBStateStoreProvider this.storeConf = storeConf this.hadoopConf = hadoopConf this.useColumnFamilies = useColumnFamilies + this.stateStoreEncoding = storeConf.stateStoreEncodingFormat if (useMultipleValuesPerKey) { require(useColumnFamilies, "Multiple values per key support requires column families to be" + @@ -373,14 +395,35 @@ private[sql] class RocksDBStateStoreProvider rocksDB // lazy initialization var defaultColFamilyId: Option[Short] = None - if (useColumnFamilies) { + val dataEncoderCacheKey = StateRowEncoderCacheKey( + queryRunId = getRunId(hadoopConf), + operatorId = stateStoreId.operatorId, + partitionId = stateStoreId.partitionId, + stateStoreName = stateStoreId.storeName, + colFamilyName = StateStore.DEFAULT_COL_FAMILY_NAME) + + val dataEncoder = getDataEncoder( + stateStoreEncoding, dataEncoderCacheKey, keyStateEncoderSpec, valueSchema) + + val 
columnFamilyInfo = if (useColumnFamilies) { defaultColFamilyId = Some(rocksDB.createColFamilyIfAbsent(StateStore.DEFAULT_COL_FAMILY_NAME)) + Some(ColumnFamilyInfo(StateStore.DEFAULT_COL_FAMILY_NAME, defaultColFamilyId.get)) + } else { + None } - keyValueEncoderMap.putIfAbsent(StateStore.DEFAULT_COL_FAMILY_NAME, - (RocksDBStateEncoder.getKeyEncoder(keyStateEncoderSpec, - useColumnFamilies, defaultColFamilyId), - RocksDBStateEncoder.getValueEncoder(valueSchema, useMultipleValuesPerKey))) + val keyEncoder = RocksDBStateEncoder.getKeyEncoder( + dataEncoder, + keyStateEncoderSpec, + useColumnFamilies, + columnFamilyInfo + ) + val valueEncoder = RocksDBStateEncoder.getValueEncoder( + dataEncoder, + valueSchema, + useMultipleValuesPerKey + ) + keyValueEncoderMap.putIfAbsent(StateStore.DEFAULT_COL_FAMILY_NAME, (keyEncoder, valueEncoder)) } override def stateStoreId: StateStoreId = stateStoreId_ @@ -396,7 +439,8 @@ private[sql] class RocksDBStateStoreProvider new RocksDBStateStore(version) } catch { - case e: SparkException if e.getCondition.contains("CANNOT_LOAD_STATE_STORE") => + case e: SparkException + if Option(e.getCondition).exists(_.contains("CANNOT_LOAD_STATE_STORE")) => throw e case e: OutOfMemoryError => throw QueryExecutionErrors.notEnoughMemoryToLoadStore( @@ -419,7 +463,8 @@ private[sql] class RocksDBStateStoreProvider new RocksDBStateStore(version) } catch { - case e: SparkException if e.getCondition.contains("CANNOT_LOAD_STATE_STORE") => + case e: SparkException + if Option(e.getCondition).exists(_.contains("CANNOT_LOAD_STATE_STORE")) => throw e case e: OutOfMemoryError => throw QueryExecutionErrors.notEnoughMemoryToLoadStore( @@ -458,6 +503,7 @@ private[sql] class RocksDBStateStoreProvider @volatile private var storeConf: StateStoreConf = _ @volatile private var hadoopConf: Configuration = _ @volatile private var useColumnFamilies: Boolean = _ + @volatile private var stateStoreEncoding: String = _ private[sql] lazy val rocksDB = { val dfsRootDir = 
stateStoreId.storeCheckpointLocation().toString @@ -587,11 +633,80 @@ private[sql] class RocksDBStateStoreProvider } } + +case class StateRowEncoderCacheKey( + queryRunId: String, + operatorId: Long, + partitionId: Int, + stateStoreName: String, + colFamilyName: String +) + object RocksDBStateStoreProvider { // Version as a single byte that specifies the encoding of the row data in RocksDB val STATE_ENCODING_NUM_VERSION_BYTES = 1 val STATE_ENCODING_VERSION: Byte = 0 val VIRTUAL_COL_FAMILY_PREFIX_BYTES = 2 + val SCHEMA_ID_PREFIX_BYTES = 2 + + private val MAX_AVRO_ENCODERS_IN_CACHE = 1000 + private val AVRO_ENCODER_LIFETIME_HOURS = 1L + private val DEFAULT_SCHEMA_IDS = StateSchemaInfo(0, 0) + + // Add the cache at companion object level so it persists across provider instances + private val dataEncoderCache: NonFateSharingCache[StateRowEncoderCacheKey, RocksDBDataEncoder] = + NonFateSharingCache( + maximumSize = MAX_AVRO_ENCODERS_IN_CACHE, + expireAfterAccessTime = AVRO_ENCODER_LIFETIME_HOURS, + expireAfterAccessTimeUnit = TimeUnit.HOURS + ) + + /** + * Creates and returns a data encoder for the state store based on the specified encoding type. + * This method handles caching of encoders to improve performance by reusing encoder instances + * when possible. 
+ * + * The method supports two encoding types: + * - Avro: Uses Apache Avro for serialization with schema evolution support + * - UnsafeRow: Uses Spark's internal row format for optimal performance + * + * @param stateStoreEncoding The encoding type to use ("avro" or "unsaferow") + * @param encoderCacheKey A unique key for caching the encoder instance, typically combining + * query ID, operator ID, partition ID, and column family name + * @param keyStateEncoderSpec Specification for how to encode keys, including schema and any + * prefix/range scan requirements + * @param valueSchema The schema for the values to be encoded + * @return A RocksDBDataEncoder instance configured for the specified encoding type + */ + def getDataEncoder( + stateStoreEncoding: String, + encoderCacheKey: StateRowEncoderCacheKey, + keyStateEncoderSpec: KeyStateEncoderSpec, + valueSchema: StructType): RocksDBDataEncoder = { + assert(Set("avro", "unsaferow").contains(stateStoreEncoding)) + RocksDBStateStoreProvider.dataEncoderCache.get( + encoderCacheKey, + new java.util.concurrent.Callable[RocksDBDataEncoder] { + override def call(): RocksDBDataEncoder = { + if (stateStoreEncoding == "avro") { + new AvroStateEncoder(keyStateEncoderSpec, valueSchema, Some(DEFAULT_SCHEMA_IDS)) + } else { + new UnsafeRowDataEncoder(keyStateEncoderSpec, valueSchema, None) + } + } + } + ) + } + + private def getRunId(hadoopConf: Configuration): String = { + val runId = hadoopConf.get(StreamExecution.RUN_ID_KEY) + if (runId != null) { + runId + } else { + assert(Utils.isTesting, "Failed to find query id/batch Id in task context") + UUID.randomUUID().toString + } + } // Native operation latencies report as latency in microseconds // as SQLMetrics support millis. 
Convert the value to millis diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/StateSchemaCompatibilityChecker.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/StateSchemaCompatibilityChecker.scala index 721d72b6a0991..48b15ac04f40b 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/StateSchemaCompatibilityChecker.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/StateSchemaCompatibilityChecker.scala @@ -24,6 +24,7 @@ import org.apache.hadoop.fs.Path import org.apache.spark.SparkUnsupportedOperationException import org.apache.spark.internal.{Logging, LogKeys, MDC} +import org.apache.spark.sql.avro.{AvroDeserializer, AvroSerializer} import org.apache.spark.sql.catalyst.util.UnsafeRowUtils import org.apache.spark.sql.execution.streaming.{CheckpointFileManager, StatefulOperatorStateInfo} import org.apache.spark.sql.execution.streaming.state.SchemaHelper.{SchemaReader, SchemaWriter} @@ -37,6 +38,30 @@ case class StateSchemaValidationResult( schemaPath: String ) +/** + * An Avro-based encoder used for serializing between UnsafeRow and Avro + * byte arrays in RocksDB state stores. + * + * This encoder is primarily utilized by [[RocksDBStateStoreProvider]] and [[RocksDBStateEncoder]] + * to handle serialization and deserialization of state store data. 
+ * + * @param keySerializer Serializer for converting state store keys to Avro format + * @param keyDeserializer Deserializer for converting Avro-encoded keys back to UnsafeRow + * @param valueSerializer Serializer for converting state store values to Avro format + * @param valueDeserializer Deserializer for converting Avro-encoded values back to UnsafeRow + * @param suffixKeySerializer Optional serializer for handling suffix keys in Avro format + * @param suffixKeyDeserializer Optional deserializer for converting Avro-encoded suffix + * keys back to UnsafeRow + */ +case class AvroEncoder( + keySerializer: AvroSerializer, + keyDeserializer: AvroDeserializer, + valueSerializer: AvroSerializer, + valueDeserializer: AvroDeserializer, + suffixKeySerializer: Option[AvroSerializer] = None, + suffixKeyDeserializer: Option[AvroDeserializer] = None +) extends Serializable + // Used to represent the schema of a column family in the state store case class StateStoreColFamilySchema( colFamilyName: String, diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/StateStore.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/StateStore.scala index 72bc3ca33054d..33df8ad42747c 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/StateStore.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/StateStore.scala @@ -37,10 +37,22 @@ import org.apache.spark.sql.catalyst.expressions.UnsafeRow import org.apache.spark.sql.catalyst.util.UnsafeRowUtils import org.apache.spark.sql.errors.QueryExecutionErrors import org.apache.spark.sql.execution.metric.{SQLMetric, SQLMetrics} -import org.apache.spark.sql.execution.streaming.StatefulOperatorStateInfo +import org.apache.spark.sql.execution.streaming.{StatefulOperatorStateInfo, StreamExecution} import org.apache.spark.sql.types.StructType import org.apache.spark.util.{NextIterator, ThreadUtils, Utils} +sealed trait 
StateStoreEncoding { + override def toString: String = this match { + case StateStoreEncoding.UnsafeRow => "unsaferow" + case StateStoreEncoding.Avro => "avro" + } +} + +object StateStoreEncoding { + case object UnsafeRow extends StateStoreEncoding + case object Avro extends StateStoreEncoding +} + /** * Base trait for a versioned key-value store which provides read operations. Each instance of a * `ReadStateStore` represents a specific version of state data, and such instances are created @@ -310,8 +322,22 @@ case class StateStoreCustomTimingMetric(name: String, desc: String) extends Stat } sealed trait KeyStateEncoderSpec { + def keySchema: StructType def jsonValue: JValue def json: String = compact(render(jsonValue)) + + /** + * Creates a RocksDBKeyStateEncoder for this specification. + * + * @param dataEncoder The encoder to handle the actual data encoding/decoding + * @param useColumnFamilies Whether to use RocksDB column families + * @param virtualColFamilyId Optional column family ID when column families are used + * @return A RocksDBKeyStateEncoder configured for this spec + */ + def toEncoder( + dataEncoder: RocksDBDataEncoder, + useColumnFamilies: Boolean, + columnFamilyInfo: Option[ColumnFamilyInfo]): RocksDBKeyStateEncoder } object KeyStateEncoderSpec { @@ -335,6 +361,14 @@ case class NoPrefixKeyStateEncoderSpec(keySchema: StructType) extends KeyStateEn override def jsonValue: JValue = { ("keyStateEncoderType" -> JString("NoPrefixKeyStateEncoderSpec")) } + + override def toEncoder( + dataEncoder: RocksDBDataEncoder, + useColumnFamilies: Boolean, + columnFamilyInfo: Option[ColumnFamilyInfo]): RocksDBKeyStateEncoder = { + new NoPrefixKeyStateEncoder( + dataEncoder, keySchema, useColumnFamilies, columnFamilyInfo) + } } case class PrefixKeyScanStateEncoderSpec( @@ -344,6 +378,15 @@ case class PrefixKeyScanStateEncoderSpec( throw StateStoreErrors.incorrectNumOrderingColsForPrefixScan(numColsPrefixKey.toString) } + override def toEncoder( + dataEncoder: 
RocksDBDataEncoder, + useColumnFamilies: Boolean, + columnFamilyInfo: Option[ColumnFamilyInfo]): RocksDBKeyStateEncoder = { + new PrefixKeyScanStateEncoder( + dataEncoder, keySchema, numColsPrefixKey, useColumnFamilies, columnFamilyInfo) + } + + override def jsonValue: JValue = { ("keyStateEncoderType" -> JString("PrefixKeyScanStateEncoderSpec")) ~ ("numColsPrefixKey" -> JInt(numColsPrefixKey)) @@ -358,6 +401,14 @@ case class RangeKeyScanStateEncoderSpec( throw StateStoreErrors.incorrectNumOrderingColsForRangeScan(orderingOrdinals.length.toString) } + override def toEncoder( + dataEncoder: RocksDBDataEncoder, + useColumnFamilies: Boolean, + columnFamilyInfo: Option[ColumnFamilyInfo]): RocksDBKeyStateEncoder = { + new RangeKeyScanStateEncoder( + dataEncoder, keySchema, orderingOrdinals, useColumnFamilies, columnFamilyInfo) + } + override def jsonValue: JValue = { ("keyStateEncoderType" -> JString("RangeKeyScanStateEncoderSpec")) ~ ("orderingOrdinals" -> orderingOrdinals.map(JInt(_))) @@ -746,6 +797,7 @@ object StateStore extends Logging { storeConf: StateStoreConf, hadoopConf: Configuration, useMultipleValuesPerKey: Boolean = false): ReadStateStore = { + hadoopConf.set(StreamExecution.RUN_ID_KEY, storeProviderId.queryRunId.toString) if (version < 0) { throw QueryExecutionErrors.unexpectedStateStoreVersion(version) } @@ -766,9 +818,10 @@ object StateStore extends Logging { storeConf: StateStoreConf, hadoopConf: Configuration, useMultipleValuesPerKey: Boolean = false): StateStore = { + hadoopConf.set(StreamExecution.RUN_ID_KEY, storeProviderId.queryRunId.toString) if (version < 0) { throw QueryExecutionErrors.unexpectedStateStoreVersion(version) } val storeProvider = getStateStoreProvider(storeProviderId, keySchema, valueSchema, keyStateEncoderSpec, useColumnFamilies, storeConf, hadoopConf, useMultipleValuesPerKey) storeProvider.getStore(version, stateStoreCkptId) @@ -923,7 +977,8 @@
object StateStore extends Logging { } finally { val duration = System.currentTimeMillis() - startTime val logMsg = - log"Finished maintenance task for provider=${MDC(LogKeys.STATE_STORE_PROVIDER, id)}" + + log"Finished maintenance task for " + + log"provider=${MDC(LogKeys.STATE_STORE_PROVIDER_ID, id)}" + log" in elapsed_time=${MDC(LogKeys.TIME_UNITS, duration)}\n" if (duration > 5000) { logInfo(logMsg) @@ -953,9 +1008,9 @@ object StateStore extends Logging { .map(_.reportActiveInstance(storeProviderId, host, executorId, otherProviderIds)) .getOrElse(Seq.empty[StateStoreProviderId]) logInfo(log"Reported that the loaded instance " + - log"${MDC(LogKeys.STATE_STORE_PROVIDER, storeProviderId)} is active") + log"${MDC(LogKeys.STATE_STORE_PROVIDER_ID, storeProviderId)} is active") logDebug(log"The loaded instances are going to unload: " + - log"${MDC(LogKeys.STATE_STORE_PROVIDER, providerIdsToUnload.mkString(", "))}") + log"${MDC(LogKeys.STATE_STORE_PROVIDER_IDS, providerIdsToUnload)}") providerIdsToUnload } else { Seq.empty[StateStoreProviderId] @@ -987,7 +1042,7 @@ object StateStore extends Logging { _coordRef = StateStoreCoordinatorRef.forExecutor(env) } logInfo(log"Retrieved reference to StateStoreCoordinator: " + - log"${MDC(LogKeys.STATE_STORE_PROVIDER, _coordRef)}") + log"${MDC(LogKeys.STATE_STORE_COORDINATOR, _coordRef)}") Some(_coordRef) } else { _coordRef = null diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/StateStoreChangelog.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/StateStoreChangelog.scala index 203af9d10217e..b4fbb5560f2f4 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/StateStoreChangelog.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/StateStoreChangelog.scala @@ -24,6 +24,8 @@ import scala.util.control.NonFatal import com.google.common.io.ByteStreams import org.apache.commons.io.IOUtils import 
org.apache.hadoop.fs.{FSError, Path} +import org.json4s._ +import org.json4s.jackson.Serialization import org.apache.spark.internal.{Logging, MDC} import org.apache.spark.internal.LogKeys._ @@ -78,6 +80,14 @@ object RecordType extends Enumeration { } } +/** + * Class for lineage item for checkpoint format V2. + */ +case class LineageItem( + version: Long, + checkpointUniqueId: String +) + /** * Base class for state store changelog writer * @param fm - checkpoint file manager used to manage streaming query checkpoint @@ -89,18 +99,27 @@ abstract class StateStoreChangelogWriter( file: Path, compressionCodec: CompressionCodec) extends Logging { + implicit val formats: Formats = DefaultFormats + private def compressStream(outputStream: DataOutputStream): DataOutputStream = { val compressed = compressionCodec.compressedOutputStream(outputStream) new DataOutputStream(compressed) } + protected var backingFileStream: CancellableFSDataOutputStream = + fm.createAtomic(file, overwriteIfPossible = true) + protected var compressedStream: DataOutputStream = compressStream(backingFileStream) + protected def writeVersion(): Unit = { compressedStream.writeUTF(s"v${version}") } - protected var backingFileStream: CancellableFSDataOutputStream = - fm.createAtomic(file, overwriteIfPossible = true) - protected var compressedStream: DataOutputStream = compressStream(backingFileStream) + protected def writeLineage(stateStoreCheckpointIdLineage: Array[LineageItem]): Unit = { + assert(version >= 3, + "writeLineage should only be invoked with state store checkpoint id enabled (version >= 3)") + val lineageStr = Serialization.write(stateStoreCheckpointIdLineage) + compressedStream.writeUTF(lineageStr) + } def version: Short @@ -115,9 +134,9 @@ abstract class StateStoreChangelogWriter( if (backingFileStream != null) backingFileStream.cancel() if (compressedStream != null) IOUtils.closeQuietly(compressedStream) } catch { - // Closing the compressedStream causes the stream to write/flush flush 
data into the + // Closing the compressedStream causes the stream to write/flush data into the // rawStream. Since the rawStream is already closed, there may be errors. - // Usually its an IOException. However, Hadoop's RawLocalFileSystem wraps + // Usually it's an IOException. However, Hadoop's RawLocalFileSystem wraps // IOException into FSError. case e: FSError if e.getCause.isInstanceOf[IOException] => case NonFatal(ex) => @@ -152,15 +171,15 @@ class StateStoreChangelogWriterV1( override def put(key: Array[Byte], value: Array[Byte]): Unit = { assert(compressedStream != null) - compressedStream.writeInt(key.size) + compressedStream.writeInt(key.length) compressedStream.write(key) - compressedStream.writeInt(value.size) + compressedStream.writeInt(value.length) compressedStream.write(value) } override def delete(key: Array[Byte]): Unit = { assert(compressedStream != null) - compressedStream.writeInt(key.size) + compressedStream.writeInt(key.length) compressedStream.write(key) // -1 in the value field means record deletion. compressedStream.writeInt(-1) @@ -206,7 +225,7 @@ class StateStoreChangelogWriterV2( override def version: Short = 2 - // append the version field to the changelog file starting from version 2 + // append the version field to the changelog file writeVersion() override def put(key: Array[Byte], value: Array[Byte]): Unit = { @@ -216,7 +235,7 @@ class StateStoreChangelogWriterV2( override def delete(key: Array[Byte]): Unit = { assert(compressedStream != null) compressedStream.write(RecordType.getRecordTypeAsByte(RecordType.DELETE_RECORD)) - compressedStream.writeInt(key.size) + compressedStream.writeInt(key.length) compressedStream.write(key) // -1 in the value field means record deletion. 
compressedStream.writeInt(-1) @@ -232,9 +251,9 @@ class StateStoreChangelogWriterV2( assert(recordType == RecordType.PUT_RECORD || recordType == RecordType.MERGE_RECORD) assert(compressedStream != null) compressedStream.write(RecordType.getRecordTypeAsByte(recordType)) - compressedStream.writeInt(key.size) + compressedStream.writeInt(key.length) compressedStream.write(key) - compressedStream.writeInt(value.size) + compressedStream.writeInt(value.length) compressedStream.write(value) } @@ -255,6 +274,128 @@ class StateStoreChangelogWriterV2( } } +/** + * Write changes to the key value state store instance to a changelog file. + * There are 2 types of records, put and delete. + * A put record is written as: | key length | key content | value length | value content | + * A delete record is written as: | key length | key content | -1 | + * Write an Int -1 to signal the end of file. + * The overall changelog format is: | put record | delete record | ... | put record | -1 | + * V3 is an extension of V1 for writing changelogs with version + * in the first line and lineage in the second line. + */ +class StateStoreChangelogWriterV3( + fm: CheckpointFileManager, + file: Path, + compressionCodec: CompressionCodec, + stateStoreCheckpointIdLineage: Array[LineageItem]) + extends StateStoreChangelogWriterV1(fm, file, compressionCodec) { + + override def version: Short = 3 + + // append the version field to the changelog file + writeVersion() + + // Also write lineage information to the changelog, it should appear + // in the second line for v3 because the first line is the version + writeLineage(stateStoreCheckpointIdLineage) +} + +/** + * Write changes to the key value state store instance to a changelog file. + * There are 3 types of data records, put, merge and delete.
+ * A put record or merge record is written as: | record type | key length + * | key content | value length | value content | -1 | + * A delete record is written as: | record type | key length | key content | -1 + * Write an EOF_RECORD to signal the end of file. + * The overall changelog format is: version | put record | delete record + * | ... | put record | eof record | + * V4 is an extension of V2 for writing changelogs with version + * in the first line and lineage in the second line. + */ +class StateStoreChangelogWriterV4( + fm: CheckpointFileManager, + file: Path, + compressionCodec: CompressionCodec, + stateStoreCheckpointIdLineage: Array[LineageItem]) + extends StateStoreChangelogWriterV2(fm, file, compressionCodec) { + + override def version: Short = 4 + + // Also write lineage information to the changelog, it should appear + // in the second line for v4 because the first line is the version + writeLineage(stateStoreCheckpointIdLineage) +} + +/** + * A factory class for constructing state store readers by reading the first line + * of the change log file, which stores the version. + * Note that for changelog version 1, there is no version written.
+ * + * @param fm - checkpoint file manager used to manage streaming query checkpoint + * @param fileToRead - name of file to use to read changelog + * @param compressionCodec - de-compression method used for reading changelog file + */ +class StateStoreChangelogReaderFactory( + fm: CheckpointFileManager, + fileToRead: Path, + compressionCodec: CompressionCodec) extends Logging { + + private def decompressStream(inputStream: DataInputStream): DataInputStream = { + val compressed = compressionCodec.compressedInputStream(inputStream) + new DataInputStream(compressed) + } + + private lazy val sourceStream = try { + fm.open(fileToRead) + } catch { + case f: FileNotFoundException => + throw QueryExecutionErrors.failedToReadStreamingStateFileError(fileToRead, f) + } + protected val input: DataInputStream = decompressStream(sourceStream) + + private lazy val changeLogVersion: Short = { + try { + val versionStr = input.readUTF() + // Versions in the first line are prefixed with "v", e.g. "v2" + // Since there is no version written for version 1, + // return 1 if first line doesn't start with "v" + if (!versionStr.startsWith("v")) { + 1 + } else { + versionStr.stripPrefix("v").toShort + } + } catch { + // When there is no record being written in the changelog file in V1, + // the file contains a single int -1 meaning EOF, then the above readUTF() + // throws with EOFException and we return version 1.
+ case _: java.io.EOFException => 1 + } + } + + /** + * Construct the change log reader based on the version stored in changelog file + * @return StateStoreChangelogReader + */ + def constructChangelogReader(): StateStoreChangelogReader = { + var reader: StateStoreChangelogReader = null + try { + reader = changeLogVersion match { + case 1 => new StateStoreChangelogReaderV1(fm, fileToRead, compressionCodec) + case 2 => new StateStoreChangelogReaderV2(fm, fileToRead, compressionCodec) + case 3 => new StateStoreChangelogReaderV3(fm, fileToRead, compressionCodec) + case 4 => new StateStoreChangelogReaderV4(fm, fileToRead, compressionCodec) + case version => throw QueryExecutionErrors.invalidChangeLogReaderVersion(version) + } + } finally { + if (input != null) { + input.close() + // input is not set to null because it is effectively lazy. + } + } + reader + } +} + /** * Base class for state store changelog reader * @param fm - checkpoint file manager used to manage streaming query checkpoint @@ -267,12 +408,14 @@ abstract class StateStoreChangelogReader( compressionCodec: CompressionCodec) extends NextIterator[(RecordType.Value, Array[Byte], Array[Byte])] with Logging { + implicit val formats: Formats = DefaultFormats + private def decompressStream(inputStream: DataInputStream): DataInputStream = { val compressed = compressionCodec.compressedInputStream(inputStream) new DataInputStream(compressed) } - private val sourceStream = try { + private lazy val sourceStream = try { fm.open(fileToRead) } catch { case f: FileNotFoundException => @@ -280,6 +423,26 @@ abstract class StateStoreChangelogReader( } protected val input: DataInputStream = decompressStream(sourceStream) + // This function is valid only when called upon initialization, + // because version is written in the first line only for version >= 2. 
+ protected def readVersion(): String = input.readUTF() + + protected def verifyVersion(): Unit = { + // ensure that the version read is correct, also updates file position + val changelogVersionStr = readVersion() + assert(changelogVersionStr == s"v${version}", + s"Changelog version mismatch: $changelogVersionStr != v${version}") + } + + private def readLineage(): Array[LineageItem] = { + assert(version >= 3, + "readLineage should only be invoked with state store checkpoint id enabled (version >= 3)") + val lineageStr = input.readUTF() + Serialization.read[Array[LineageItem]](lineageStr) + } + + lazy val lineage: Array[LineageItem] = readLineage() + def version: Short override protected def close(): Unit = { if (input != null) input.close() } @@ -352,10 +515,7 @@ class StateStoreChangelogReaderV2( override def version: Short = 2 - // ensure that the version read is v2 - val changelogVersionStr = input.readUTF() - assert(changelogVersionStr == "v2", - s"Changelog version mismatch: $changelogVersionStr != v2") + verifyVersion() override def getNext(): (RecordType.Value, Array[Byte], Array[Byte]) = { val recordType = RecordType.getRecordTypeFromByte(input.readByte()) @@ -388,6 +548,56 @@ } } +/** + * Read an iterator of change record from the changelog file. + * A record is represented by tuple(recordType: RecordType.Value, + * key: Array[Byte], value: Array[Byte]) + * A put record is returned as a tuple(recordType, key, value) + * A delete record is returned as a tuple(recordType, key, null) + * V3 is an extension of V1 for reading changelogs with version + * in the first line and lineage in the second line.
+ */ +class StateStoreChangelogReaderV3( + fm: CheckpointFileManager, + fileToRead: Path, + compressionCodec: CompressionCodec) + extends StateStoreChangelogReaderV1(fm, fileToRead, compressionCodec) { + + override def version: Short = 3 + + verifyVersion() + + // If the changelogFile is written when state store checkpoint unique id is enabled + // the first line would be the version and the second line would be the lineage. + // We should update the file position by reading from the lineage during + // the reader initialization. + lineage +} + +/** + * Read an iterator of change record from the changelog file. + * A record is represented by tuple(recordType: RecordType.Value, + * key: Array[Byte], value: Array[Byte]) + * A put or merge record is returned as a tuple(recordType, key, value) + * A delete record is returned as a tuple(recordType, key, null) + * V4 is an extension of V2 for reading changelogs with version + * in the first line and lineage in the second line. + */ +class StateStoreChangelogReaderV4( + fm: CheckpointFileManager, + fileToRead: Path, + compressionCodec: CompressionCodec) + extends StateStoreChangelogReaderV2(fm, fileToRead, compressionCodec) { + + override def version: Short = 4 + + // If the changelogFile is written when state store checkpoint unique id is enabled + // the first line would be the version and the second line would be the lineage. + // We should update the file position by reading from the lineage during + // the reader initialization. + lineage +} + /** * Base class representing a iterator that iterates over a range of changelog files in a state * store.
In each iteration, it will return a tuple of (changeType: [[RecordType]], diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/StateStoreConf.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/StateStoreConf.scala index c8af395e996d8..9d26bf8fdf2e7 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/StateStoreConf.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/StateStoreConf.scala @@ -83,6 +83,9 @@ class StateStoreConf( /** The interval of maintenance tasks. */ val maintenanceInterval = sqlConf.streamingMaintenanceInterval + /** The encoding format for state store rows ("avro" or "unsaferow"). */ + val stateStoreEncodingFormat = sqlConf.stateStoreEncodingFormat + /** * When creating new state store checkpoint, which format version to use. */ diff --git a/sql/core/src/main/scala/org/apache/spark/sql/internal/BaseSessionStateBuilder.scala b/sql/core/src/main/scala/org/apache/spark/sql/internal/BaseSessionStateBuilder.scala index 59a873ef982fe..bfd5915529118 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/internal/BaseSessionStateBuilder.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/internal/BaseSessionStateBuilder.scala @@ -20,6 +20,7 @@ import org.apache.spark.annotation.Unstable import org.apache.spark.sql.{ExperimentalMethods, SparkSession, UDFRegistration, _} import org.apache.spark.sql.artifact.ArtifactManager import org.apache.spark.sql.catalyst.analysis.{Analyzer, EvalSubqueriesForTimeTravel, FunctionRegistry, InvokeProcedures, ReplaceCharWithVarchar, ResolveSessionCatalog, ResolveTranspose, TableFunctionRegistry} +import org.apache.spark.sql.catalyst.analysis.resolver.ResolverExtension import org.apache.spark.sql.catalyst.catalog.{FunctionExpressionBuilder, SessionCatalog} import org.apache.spark.sql.catalyst.expressions.Expression import org.apache.spark.sql.catalyst.optimizer.Optimizer @@ -199,6 +200,15 @@ abstract class
BaseSessionStateBuilder( protected def analyzer: Analyzer = new Analyzer(catalogManager) { override val hintResolutionRules: Seq[Rule[LogicalPlan]] = customHintResolutionRules + + override val singlePassResolverExtensions: Seq[ResolverExtension] = Seq( + new DataSourceResolver(session) + ) + + override val singlePassMetadataResolverExtensions: Seq[ResolverExtension] = Seq( + new FileResolver(session) + ) + override val extendedResolutionRules: Seq[Rule[LogicalPlan]] = new FindDataSourceTable(session) +: new ResolveSQLOnFile(session) +: diff --git a/sql/core/src/main/scala/org/apache/spark/sql/internal/CatalogImpl.scala b/sql/core/src/main/scala/org/apache/spark/sql/internal/CatalogImpl.scala index 64689e75e2e5e..5fd88b417ac44 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/internal/CatalogImpl.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/internal/CatalogImpl.scala @@ -685,6 +685,7 @@ class CatalogImpl(sparkSession: SparkSession) extends Catalog { optionExpression = newOptions, location = location, comment = { if (description.isEmpty) None else Some(description) }, + collation = None, serde = None, external = tableType == CatalogTableType.EXTERNAL) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/internal/DataFrameWriterImpl.scala b/sql/core/src/main/scala/org/apache/spark/sql/internal/DataFrameWriterImpl.scala index 16f9fcf77d622..5a96db5e34bbd 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/internal/DataFrameWriterImpl.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/internal/DataFrameWriterImpl.scala @@ -209,6 +209,7 @@ final class DataFrameWriterImpl[T] private[sql](ds: Dataset[T]) extends DataFram optionExpression = OptionList(Seq.empty), location = extraOptions.get("path"), comment = extraOptions.get(TableCatalog.PROP_COMMENT), + collation = extraOptions.get(TableCatalog.PROP_COLLATION), serde = None, external = false) runCommand(df.sparkSession) { @@ -382,6 +383,11 @@ final class DataFrameWriterImpl[T] 
private[sql](ds: Dataset[T]) extends DataFram } } + private def hasCustomSessionCatalog: Boolean = { + df.sparkSession.sessionState.conf + .getConf(SQLConf.V2_SESSION_CATALOG_IMPLEMENTATION) != "builtin" + } + /** * Saves the content of the `DataFrame` as the specified table. * @@ -425,8 +431,7 @@ final class DataFrameWriterImpl[T] private[sql](ds: Dataset[T]) extends DataFram import org.apache.spark.sql.connector.catalog.CatalogV2Implicits._ val session = df.sparkSession - val canUseV2 = lookupV2Provider().isDefined || (df.sparkSession.sessionState.conf.getConf( - SQLConf.V2_SESSION_CATALOG_IMPLEMENTATION).isDefined && + val canUseV2 = lookupV2Provider().isDefined || (hasCustomSessionCatalog && !df.sparkSession.sessionState.catalogManager.catalog(CatalogManager.SESSION_CATALOG_NAME) .isInstanceOf[CatalogExtension]) @@ -469,6 +474,7 @@ final class DataFrameWriterImpl[T] private[sql](ds: Dataset[T]) extends DataFram optionExpression = OptionList(Seq.empty), location = extraOptions.get("path"), comment = extraOptions.get(TableCatalog.PROP_COMMENT), + collation = extraOptions.get(TableCatalog.PROP_COLLATION), serde = None, external = false) ReplaceTableAsSelect( @@ -489,6 +495,7 @@ final class DataFrameWriterImpl[T] private[sql](ds: Dataset[T]) extends DataFram optionExpression = OptionList(Seq.empty), location = extraOptions.get("path"), comment = extraOptions.get(TableCatalog.PROP_COMMENT), + collation = extraOptions.get(TableCatalog.PROP_COLLATION), serde = None, external = false) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/internal/DataFrameWriterV2Impl.scala b/sql/core/src/main/scala/org/apache/spark/sql/internal/DataFrameWriterV2Impl.scala index 0a19e6c47afa9..86ea55bc59b7b 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/internal/DataFrameWriterV2Impl.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/internal/DataFrameWriterV2Impl.scala @@ -150,6 +150,7 @@ final class DataFrameWriterV2Impl[T] private[sql](table: String, ds: 
Dataset[T]) optionExpression = OptionList(Seq.empty), location = None, comment = None, + collation = None, serde = None, external = false) runCommand( @@ -215,6 +216,7 @@ final class DataFrameWriterV2Impl[T] private[sql](table: String, ds: Dataset[T]) optionExpression = OptionList(Seq.empty), location = None, comment = None, + collation = None, serde = None, external = false) runCommand(ReplaceTableAsSelect( diff --git a/sql/core/src/main/scala/org/apache/spark/sql/internal/MergeIntoWriterImpl.scala b/sql/core/src/main/scala/org/apache/spark/sql/internal/MergeIntoWriterImpl.scala index bb8146e3e0e33..2f1a34648a470 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/internal/MergeIntoWriterImpl.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/internal/MergeIntoWriterImpl.scala @@ -44,7 +44,7 @@ class MergeIntoWriterImpl[T] private[sql] (table: String, ds: Dataset[T], on: Co private val df: DataFrame = ds.toDF() private[sql] val sparkSession = ds.sparkSession - import sparkSession.RichColumn + import sparkSession.toRichColumn private val tableName = sparkSession.sessionState.sqlParser.parseMultipartIdentifier(table) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/internal/RuntimeConfigImpl.scala b/sql/core/src/main/scala/org/apache/spark/sql/internal/RuntimeConfigImpl.scala index 1739b86c8dcb4..b2004215a99f6 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/internal/RuntimeConfigImpl.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/internal/RuntimeConfigImpl.scala @@ -21,7 +21,7 @@ import scala.jdk.CollectionConverters._ import org.apache.spark.SPARK_DOC_ROOT import org.apache.spark.annotation.Stable -import org.apache.spark.internal.config.{ConfigEntry, DEFAULT_PARALLELISM} +import org.apache.spark.internal.config.{ConfigEntry, DEFAULT_PARALLELISM, OptionalConfigEntry} import org.apache.spark.sql.RuntimeConfig import org.apache.spark.sql.errors.QueryCompilationErrors @@ -41,6 +41,12 @@ class RuntimeConfigImpl private[sql](val 
sqlConf: SQLConf = new SQLConf) extends sqlConf.setConfString(key, value) } + /** @inheritdoc */ + override private[sql] def set[T](entry: ConfigEntry[T], value: T): Unit = { + requireNonStaticConf(entry.key) + sqlConf.setConf(entry, value) + } + /** @inheritdoc */ @throws[NoSuchElementException]("if the key is not set and there is no default value") def get(key: String): String = { @@ -57,6 +63,18 @@ class RuntimeConfigImpl private[sql](val sqlConf: SQLConf = new SQLConf) extends sqlConf.getAllConfs } + /** @inheritdoc */ + override private[sql] def get[T](entry: ConfigEntry[T]): T = + sqlConf.getConf(entry) + + /** @inheritdoc */ + override private[sql] def get[T](entry: OptionalConfigEntry[T]): Option[T] = + sqlConf.getConf(entry) + + /** @inheritdoc */ + override private[sql] def get[T](entry: ConfigEntry[T], default: T): T = + sqlConf.getConf(entry, default) + private[sql] def getAllAsJava: java.util.Map[String, String] = { getAll.asJava } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/internal/columnNodeSupport.scala b/sql/core/src/main/scala/org/apache/spark/sql/internal/columnNodeSupport.scala index 64eacba1c6bf3..8f37f5c32de34 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/internal/columnNodeSupport.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/internal/columnNodeSupport.scala @@ -16,8 +16,6 @@ */ package org.apache.spark.sql.internal -import scala.language.implicitConversions - import UserDefinedFunctionUtils.toScalaUDF import org.apache.spark.SparkException @@ -88,9 +86,6 @@ private[sql] trait ColumnNodeToExpressionConverter extends (ColumnNode => Expres isDistinct = isDistinct, isInternal = isInternal) - case LazyOuterReference(nameParts, planId, _) => - convertLazyOuterReference(nameParts, planId) - case Alias(child, Seq(name), None, _) => expressions.Alias(apply(child), name)( nonInheritableMetadataKeys = Seq(Dataset.DATASET_ID_KEY, Dataset.COL_POS_KEY)) @@ -193,6 +188,9 @@ private[sql] trait 
ColumnNodeToExpressionConverter extends (ColumnNode => Expres case _ => transformed } + case l: LazyExpression => + analysis.LazyExpression(apply(l.child)) + case node => throw SparkException.internalError("Unsupported ColumnNode: " + node) } @@ -248,16 +246,6 @@ private[sql] trait ColumnNodeToExpressionConverter extends (ColumnNode => Expres } attribute } - - private def convertLazyOuterReference( - nameParts: Seq[String], - planId: Option[Long]): analysis.LazyOuterReference = { - val lazyOuterReference = analysis.LazyOuterReference(nameParts) - if (planId.isDefined) { - lazyOuterReference.setTagValue(LogicalPlan.PLAN_ID_TAG, planId.get) - } - lazyOuterReference - } } private[sql] object ColumnNodeToExpressionConverter extends ColumnNodeToExpressionConverter { @@ -285,6 +273,8 @@ private[sql] case class ExpressionColumnNode private( } override def sql: String = expression.sql + + override private[internal] def children: Seq[ColumnNodeLike] = Seq.empty } private[sql] object ExpressionColumnNode { @@ -312,13 +302,14 @@ private[spark] object ExpressionUtils { /** * Create an Expression backed Column. */ - implicit def column(e: Expression): Column = Column(ExpressionColumnNode(e)) + def column(e: Expression): Column = Column(ExpressionColumnNode(e)) /** - * Create an ColumnNode backed Expression. Please not that this has to be converted to an actual - * Expression before it is used. + * Create an ColumnNode backed Expression. This can only be used for expressions that will be + * used to construct a [[Column]]. In all other cases please use `SparkSession.expression(...)`, + * `SparkSession.toRichColumn(...)`, or `org.apache.spark.sql.classic.ColumnConversions`. */ - implicit def expression(c: Column): Expression = ColumnNodeExpression(c.node) + def expression(c: Column): Expression = ColumnNodeExpression(c.node) /** * Returns the expression either with an existing or auto assigned name. 
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/jdbc/PostgresDialect.scala b/sql/core/src/main/scala/org/apache/spark/sql/jdbc/PostgresDialect.scala index bce9c67042782..c1b79f8017419 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/jdbc/PostgresDialect.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/jdbc/PostgresDialect.scala @@ -310,6 +310,10 @@ private case class PostgresDialect() case _ => super.visitExtract(field, source) } } + + override def visitBinaryArithmetic(name: String, l: String, r: String): String = { + l + " " + name.replace('^', '#') + " " + r + } } override def compileExpression(expr: Expression): Option[String] = { diff --git a/sql/core/src/main/scala/org/apache/spark/sql/scripting/SqlScriptingExecution.scala b/sql/core/src/main/scala/org/apache/spark/sql/scripting/SqlScriptingExecution.scala new file mode 100644 index 0000000000000..2b15a6c55fa97 --- /dev/null +++ b/sql/core/src/main/scala/org/apache/spark/sql/scripting/SqlScriptingExecution.scala @@ -0,0 +1,114 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.scripting + +import org.apache.spark.sql.{DataFrame, SparkSession} +import org.apache.spark.sql.catalyst.expressions.Expression +import org.apache.spark.sql.catalyst.plans.logical.{CommandResult, CompoundBody} + +/** + * SQL scripting executor - executes script and returns result statements. + * This supports returning multiple result statements from a single script. + * The caller of the SqlScriptingExecution API must adhere to the contract of executing + * the returned statement before continuing iteration. Executing the statement needs to be done + * inside withErrorHandling block. + * + * @param sqlScript CompoundBody which need to be executed. + * @param session Spark session that SQL script is executed within. + * @param args A map of parameter names to SQL literal expressions. + */ +class SqlScriptingExecution( + sqlScript: CompoundBody, + session: SparkSession, + args: Map[String, Expression]) { + + private val interpreter = SqlScriptingInterpreter(session) + + // Frames to keep what is being executed. + private val context: SqlScriptingExecutionContext = { + val ctx = new SqlScriptingExecutionContext() + val executionPlan = interpreter.buildExecutionPlan(sqlScript, args, ctx) + // Add frame which represents SQL Script to the context. + ctx.frames.append(new SqlScriptingExecutionFrame(executionPlan.getTreeIterator)) + // Enter the scope of the top level compound. + // We don't need to exit this scope explicitly as it will be done automatically + // when the frame is removed during iteration. + executionPlan.enterScope() + ctx + } + + + /** Helper method to iterate get next statements from the first available frame. */ + private def getNextStatement: Option[CompoundStatementExec] = { + // Remove frames that are already executed. + while (context.frames.nonEmpty && !context.frames.last.hasNext) { + context.frames.remove(context.frames.size - 1) + } + // If there are still frames available, get the next statement. 
+ if (context.frames.nonEmpty) { + return Some(context.frames.last.next()) + } + None + } + + /** + * Advances through the script and executes statements until a result statement or + * end of script is encountered. + * + * To know if there is result statement available, the method has to advance through script and + * execute statements until the result statement or end of script is encountered. For that reason + * the returned result must be executed before subsequent calls. Multiple calls without executing + * the intermediate results will lead to incorrect behavior. + * + * @return Result DataFrame if it is available, otherwise None. + */ + def getNextResult: Option[DataFrame] = { + var currentStatement = getNextStatement + // While we don't have a result statement, execute the statements. + while (currentStatement.isDefined) { + currentStatement match { + case Some(stmt: SingleStatementExec) if !stmt.isExecuted => + withErrorHandling { + val df = stmt.buildDataFrame(session) + df.logicalPlan match { + case _: CommandResult => // pass + case _ => return Some(df) // If the statement is a result, return it to the caller. + } + } + case _ => // pass + } + currentStatement = getNextStatement + } + None + } + + private def handleException(e: Throwable): Unit = { + // Rethrow the exception. + // TODO: SPARK-48353 Add error handling for SQL scripts + throw e + } + + def withErrorHandling(f: => Unit): Unit = { + try { + f + } catch { + case e: Throwable => + handleException(e) + } + } +} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/scripting/SqlScriptingExecutionContext.scala b/sql/core/src/main/scala/org/apache/spark/sql/scripting/SqlScriptingExecutionContext.scala new file mode 100644 index 0000000000000..94462ab828f75 --- /dev/null +++ b/sql/core/src/main/scala/org/apache/spark/sql/scripting/SqlScriptingExecutionContext.scala @@ -0,0 +1,92 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.scripting + +import scala.collection.mutable.ListBuffer + +import org.apache.spark.SparkException + +/** + * SQL scripting execution context - keeps track of the current execution state. + */ +class SqlScriptingExecutionContext { + // List of frames that are currently active. + val frames: ListBuffer[SqlScriptingExecutionFrame] = ListBuffer.empty + + def enterScope(label: String): Unit = { + if (frames.isEmpty) { + throw SparkException.internalError("Cannot enter scope: no frames.") + } + frames.last.enterScope(label) + } + + def exitScope(label: String): Unit = { + if (frames.isEmpty) { + throw SparkException.internalError("Cannot exit scope: no frames.") + } + frames.last.exitScope(label) + } +} + +/** + * SQL scripting executor - executes script and returns result statements. + * This supports returning multiple result statements from a single script. + * + * @param executionPlan CompoundBody which need to be executed. + */ +class SqlScriptingExecutionFrame( + executionPlan: Iterator[CompoundStatementExec]) extends Iterator[CompoundStatementExec] { + + // List of scopes that are currently active. 
+ private val scopes: ListBuffer[SqlScriptingExecutionScope] = ListBuffer.empty + + override def hasNext: Boolean = executionPlan.hasNext + + override def next(): CompoundStatementExec = { + if (!hasNext) throw SparkException.internalError("No more elements to iterate through.") + executionPlan.next() + } + + def enterScope(label: String): Unit = { + scopes.append(new SqlScriptingExecutionScope(label)) + } + + def exitScope(label: String): Unit = { + if (scopes.isEmpty) { + throw SparkException.internalError("Cannot exit scope: no scopes to exit.") + } + + // Remove all scopes until the one with the given label. + while (scopes.nonEmpty && scopes.last.label != label) { + scopes.remove(scopes.length - 1) + } + + // Remove the scope with the given label. + if (scopes.nonEmpty) { + scopes.remove(scopes.length - 1) + } + } +} + +/** + * SQL scripting execution scope - keeps track of the current execution scope. + * + * @param label + * Label of the scope. + */ +class SqlScriptingExecutionScope(val label: String) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/scripting/SqlScriptingExecutionNode.scala b/sql/core/src/main/scala/org/apache/spark/sql/scripting/SqlScriptingExecutionNode.scala index 9129fc6ab00f3..58cbfb0feb015 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/scripting/SqlScriptingExecutionNode.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/scripting/SqlScriptingExecutionNode.scala @@ -17,10 +17,14 @@ package org.apache.spark.sql.scripting +import java.util + import org.apache.spark.SparkException import org.apache.spark.internal.Logging -import org.apache.spark.sql.{Dataset, SparkSession} -import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan +import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession} +import org.apache.spark.sql.catalyst.analysis.{NameParameterizedQuery, UnresolvedAttribute, UnresolvedIdentifier} +import org.apache.spark.sql.catalyst.expressions.{Alias, CreateArray, CreateMap, 
CreateNamedStruct, Expression, Literal} +import org.apache.spark.sql.catalyst.plans.logical.{CreateVariable, DefaultValueExpression, DropVariable, LogicalPlan, OneRowRelation, Project, SetVariable} import org.apache.spark.sql.catalyst.trees.{Origin, WithOrigin} import org.apache.spark.sql.errors.SqlScriptingErrors import org.apache.spark.sql.types.BooleanType @@ -77,7 +81,7 @@ trait NonLeafStatementExec extends CompoundStatementExec { // DataFrame evaluates to True if it is single row, single column // of boolean type with value True. - val df = Dataset.ofRows(session, statement.parsedPlan) + val df = statement.buildDataFrame(session) df.schema.fields match { case Array(field) if field.dataType == BooleanType => df.limit(2).collect() match { @@ -105,15 +109,21 @@ trait NonLeafStatementExec extends CompoundStatementExec { * Logical plan of the parsed statement. * @param origin * Origin descriptor for the statement. + * @param args + * A map of parameter names to SQL literal expressions. * @param isInternal * Whether the statement originates from the SQL script or it is created during the * interpretation. Example: DropVariable statements are automatically created at the end of each * compound. + * @param context + * SqlScriptingExecutionContext keeps the execution state of current script. */ class SingleStatementExec( var parsedPlan: LogicalPlan, override val origin: Origin, - override val isInternal: Boolean) + val args: Map[String, Expression], + override val isInternal: Boolean, + context: SqlScriptingExecutionContext) extends LeafStatementExec with WithOrigin { /** @@ -122,6 +132,17 @@ class SingleStatementExec( */ var isExecuted = false + /** + * Plan with named parameters. + */ + private lazy val preparedPlan: LogicalPlan = { + if (args.nonEmpty) { + NameParameterizedQuery(parsedPlan, args) + } else { + parsedPlan + } + } + /** * Get the SQL query text corresponding to this statement. 
* @return @@ -132,21 +153,82 @@ class SingleStatementExec( origin.sqlText.get.substring(origin.startIndex.get, origin.stopIndex.get + 1) } + /** + * Builds a DataFrame from the parsedPlan of this SingleStatementExec + * @param session The SparkSession on which the parsedPlan is built. + * @return + * The DataFrame. + */ + def buildDataFrame(session: SparkSession): DataFrame = { + Dataset.ofRows(session, preparedPlan) + } + override def reset(): Unit = isExecuted = false } +/** + * NO-OP leaf node, which does nothing when returned to the iterator. + * It is emitted by empty BEGIN END blocks. + */ +class NoOpStatementExec extends LeafStatementExec { + override def reset(): Unit = () +} + /** * Executable node for CompoundBody. * @param statements * Executable nodes for nested statements within the CompoundBody. * @param label * Label set by user to CompoundBody or None otherwise. + * @param isScope + * Flag indicating if the CompoundBody is a labeled scope. + * Scopes are used for grouping local variables and exception handlers. + * @param context + * SqlScriptingExecutionContext keeps the execution state of current script. */ -class CompoundBodyExec(statements: Seq[CompoundStatementExec], label: Option[String] = None) +class CompoundBodyExec( + statements: Seq[CompoundStatementExec], + label: Option[String] = None, + isScope: Boolean, + context: SqlScriptingExecutionContext) extends NonLeafStatementExec { + private object ScopeStatus extends Enumeration { + type ScopeStatus = Value + val NOT_ENTERED, INSIDE, EXITED = Value + } + private var localIterator = statements.iterator private var curr = if (localIterator.hasNext) Some(localIterator.next()) else None + private var scopeStatus = ScopeStatus.NOT_ENTERED + + /** + * Enter scope represented by this compound statement. 
+ * + * This operation needs to be idempotent because it is called multiple times during + * iteration, but it should be executed only once when compound body that represent + * scope is encountered for the first time. + */ + def enterScope(): Unit = { + // This check makes this operation idempotent. + if (isScope && scopeStatus == ScopeStatus.NOT_ENTERED) { + scopeStatus = ScopeStatus.INSIDE + context.enterScope(label.get) + } + } + + /** + * Exit scope represented by this compound statement. + * + * Even though this operation is called exactly once, we are making it idempotent. + */ + protected def exitScope(): Unit = { + // This check makes this operation idempotent. + if (isScope && scopeStatus == ScopeStatus.INSIDE) { + scopeStatus = ScopeStatus.EXITED + context.exitScope(label.get) + } + } /** Used to stop the iteration in cases when LEAVE statement is encountered. */ private var stopIteration = false @@ -182,6 +264,11 @@ class CompoundBodyExec(statements: Seq[CompoundStatementExec], label: Option[Str statement case Some(body: NonLeafStatementExec) => if (body.getTreeIterator.hasNext) { + body match { + // Scope will be entered only once per compound because enter scope is idempotent. + case compoundBodyExec: CompoundBodyExec => compoundBodyExec.enterScope() + case _ => // pass + } body.getTreeIterator.next() match { case leaveStatement: LeaveStatementExec => handleLeaveStatement(leaveStatement) @@ -192,6 +279,11 @@ class CompoundBodyExec(statements: Seq[CompoundStatementExec], label: Option[Str case other => other } } else { + body match { + // Exit scope when there are no more statements to iterate through. 
+ case compoundBodyExec: CompoundBodyExec => compoundBodyExec.exitScope() + case _ => // pass + } curr = if (localIterator.hasNext) Some(localIterator.next()) else None next() } @@ -208,6 +300,7 @@ class CompoundBodyExec(statements: Seq[CompoundStatementExec], label: Option[Str localIterator = statements.iterator curr = if (localIterator.hasNext) Some(localIterator.next()) else None stopIteration = false + scopeStatus = ScopeStatus.NOT_ENTERED } /** Actions to do when LEAVE statement is encountered, to stop the execution of this compound. */ @@ -216,6 +309,9 @@ class CompoundBodyExec(statements: Seq[CompoundStatementExec], label: Option[Str // Stop the iteration. stopIteration = true + // Exit scope if leave statement is encountered. + exitScope() + // TODO: Variable cleanup (once we add SQL script execution logic). // TODO: Add interpreter tests as well. @@ -232,6 +328,9 @@ class CompoundBodyExec(statements: Seq[CompoundStatementExec], label: Option[Str // Stop the iteration. stopIteration = true + // Exit scope if iterate statement is encountered. + exitScope() + // TODO: Variable cleanup (once we add SQL script execution logic). // TODO: Add interpreter tests as well. @@ -636,3 +735,233 @@ class LoopStatementExec( body.reset() } } + +/** + * Executable node for ForStatement. + * @param query Executable node for the query. + * @param variableName Name of variable used for accessing current row during iteration. + * @param body Executable node for the body. + * @param label Label set to ForStatement by user or None otherwise. + * @param session Spark session that SQL script is executed within. + * @param context SqlScriptingExecutionContext keeps the execution state of current script. 
+ */ +class ForStatementExec( + query: SingleStatementExec, + variableName: Option[String], + body: CompoundBodyExec, + val label: Option[String], + session: SparkSession, + context: SqlScriptingExecutionContext) extends NonLeafStatementExec { + + private object ForState extends Enumeration { + val VariableAssignment, Body, VariableCleanup = Value + } + private var state = ForState.VariableAssignment + private var areVariablesDeclared = false + + // map of all variables created internally by the for statement + // (variableName -> variableExpression) + private var variablesMap: Map[String, Expression] = Map() + + // compound body used for dropping variables while in ForState.VariableAssignment + private var dropVariablesExec: CompoundBodyExec = null + + private var queryResult: util.Iterator[Row] = _ + private var isResultCacheValid = false + private def cachedQueryResult(): util.Iterator[Row] = { + if (!isResultCacheValid) { + queryResult = query.buildDataFrame(session).toLocalIterator() + query.isExecuted = true + isResultCacheValid = true + } + queryResult + } + + /** + * For can be interrupted by LeaveStatementExec + */ + private var interrupted: Boolean = false + + private lazy val treeIterator: Iterator[CompoundStatementExec] = + new Iterator[CompoundStatementExec] { + + override def hasNext: Boolean = !interrupted && (state match { + case ForState.VariableAssignment => cachedQueryResult().hasNext + case ForState.Body => true + case ForState.VariableCleanup => dropVariablesExec.getTreeIterator.hasNext + }) + + @scala.annotation.tailrec + override def next(): CompoundStatementExec = state match { + + case ForState.VariableAssignment => + variablesMap = createVariablesMapFromRow(cachedQueryResult().next()) + + if (!areVariablesDeclared) { + // create and execute declare var statements + variablesMap.keys.toSeq + .map(colName => createDeclareVarExec(colName, variablesMap(colName))) + .foreach(declareVarExec => declareVarExec.buildDataFrame(session).collect()) + 
areVariablesDeclared = true + } + + // create and execute set var statements + variablesMap.keys.toSeq + .map(colName => createSetVarExec(colName, variablesMap(colName))) + .foreach(setVarExec => setVarExec.buildDataFrame(session).collect()) + + state = ForState.Body + body.reset() + next() + + case ForState.Body => + val retStmt = body.getTreeIterator.next() + + // Handle LEAVE or ITERATE statement if it has been encountered. + retStmt match { + case leaveStatementExec: LeaveStatementExec if !leaveStatementExec.hasBeenMatched => + if (label.contains(leaveStatementExec.label)) { + leaveStatementExec.hasBeenMatched = true + } + interrupted = true + // If this for statement encounters LEAVE, it will either not be executed + // again, or it will be reset before being executed. + // In either case, variables will not + // be dropped normally, from ForState.VariableCleanup, so we drop them here. + dropVars() + return retStmt + case iterStatementExec: IterateStatementExec if !iterStatementExec.hasBeenMatched => + if (label.contains(iterStatementExec.label)) { + iterStatementExec.hasBeenMatched = true + } else { + // if an outer loop is being iterated, this for statement will either not be + // executed again, or it will be reset before being executed. + // In either case, variables will not + // be dropped normally, from ForState.VariableCleanup, so we drop them here. + dropVars() + } + switchStateFromBody() + return retStmt + case _ => + } + + if (!body.getTreeIterator.hasNext) { + switchStateFromBody() + } + retStmt + + case ForState.VariableCleanup => + dropVariablesExec.getTreeIterator.next() + } + } + + /** + * Recursively creates a Catalyst expression from Scala value.
+ * See https://spark.apache.org/docs/latest/sql-ref-datatypes.html for Spark -> Scala mappings + */ + private def createExpressionFromValue(value: Any): Expression = value match { + case m: Map[_, _] => + // arguments of CreateMap are in the format: (key1, val1, key2, val2, ...) + val mapArgs = m.keys.toSeq.flatMap { key => + Seq(createExpressionFromValue(key), createExpressionFromValue(m(key))) + } + CreateMap(mapArgs, useStringTypeWhenEmpty = false) + + // structs and rows match this case + case s: Row => + // arguments of CreateNamedStruct are in the format: (name1, val1, name2, val2, ...) + val namedStructArgs = s.schema.names.toSeq.flatMap { colName => + val valueExpression = createExpressionFromValue(s.getAs(colName)) + Seq(Literal(colName), valueExpression) + } + CreateNamedStruct(namedStructArgs) + + // arrays match this case + case a: collection.Seq[_] => + val arrayArgs = a.toSeq.map(createExpressionFromValue(_)) + CreateArray(arrayArgs, useStringTypeWhenEmpty = false) + + case _ => Literal(value) + } + + private def createVariablesMapFromRow(row: Row): Map[String, Expression] = { + var variablesMap = row.schema.names.toSeq.map { colName => + colName -> createExpressionFromValue(row.getAs(colName)) + }.toMap + + if (variableName.isDefined) { + val namedStructArgs = variablesMap.keys.toSeq.flatMap { colName => + Seq(Literal(colName), variablesMap(colName)) + } + val forVariable = CreateNamedStruct(namedStructArgs) + variablesMap = variablesMap + (variableName.get -> forVariable) + } + variablesMap + } + + /** + * Create and immediately execute dropVariable exec nodes for all variables in variablesMap. 
+ */ + private def dropVars(): Unit = { + variablesMap.keys.toSeq + .map(colName => createDropVarExec(colName)) + .foreach(dropVarExec => dropVarExec.buildDataFrame(session).collect()) + areVariablesDeclared = false + } + + private def switchStateFromBody(): Unit = { + state = if (cachedQueryResult().hasNext) ForState.VariableAssignment + else { + // create compound body for dropping nodes after execution is complete + dropVariablesExec = new CompoundBodyExec( + variablesMap.keys.toSeq.map(colName => createDropVarExec(colName)), + None, + isScope = false, + context + ) + ForState.VariableCleanup + } + } + + private def createDeclareVarExec(varName: String, variable: Expression): SingleStatementExec = { + val defaultExpression = DefaultValueExpression(Literal(null, variable.dataType), "null") + val declareVariable = CreateVariable( + UnresolvedIdentifier(Seq(varName)), + defaultExpression, + replace = true + ) + new SingleStatementExec(declareVariable, Origin(), Map.empty, isInternal = true, context) + } + + private def createSetVarExec(varName: String, variable: Expression): SingleStatementExec = { + val projectNamedStruct = Project( + Seq(Alias(variable, varName)()), + OneRowRelation() + ) + val setIdentifierToCurrentRow = + SetVariable(Seq(UnresolvedAttribute(varName)), projectNamedStruct) + new SingleStatementExec( + setIdentifierToCurrentRow, + Origin(), + Map.empty, + isInternal = true, + context) + } + + private def createDropVarExec(varName: String): SingleStatementExec = { + val dropVar = DropVariable(UnresolvedIdentifier(Seq(varName)), ifExists = true) + new SingleStatementExec(dropVar, Origin(), Map.empty, isInternal = true, context) + } + + override def getTreeIterator: Iterator[CompoundStatementExec] = treeIterator + + override def reset(): Unit = { + state = ForState.VariableAssignment + isResultCacheValid = false + variablesMap = Map() + areVariablesDeclared = false + dropVariablesExec = null + interrupted = false + body.reset() + } +} diff --git 
a/sql/core/src/main/scala/org/apache/spark/sql/scripting/SqlScriptingInterpreter.scala b/sql/core/src/main/scala/org/apache/spark/sql/scripting/SqlScriptingInterpreter.scala index 1be75cb61c8b0..7d00bbb3538df 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/scripting/SqlScriptingInterpreter.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/scripting/SqlScriptingInterpreter.scala @@ -19,13 +19,17 @@ package org.apache.spark.sql.scripting import org.apache.spark.sql.SparkSession import org.apache.spark.sql.catalyst.analysis.UnresolvedIdentifier -import org.apache.spark.sql.catalyst.plans.logical.{CaseStatement, CompoundBody, CompoundPlanStatement, CreateVariable, DropVariable, IfElseStatement, IterateStatement, LeaveStatement, LogicalPlan, LoopStatement, RepeatStatement, SingleStatement, WhileStatement} +import org.apache.spark.sql.catalyst.expressions.Expression +import org.apache.spark.sql.catalyst.plans.logical.{CaseStatement, CompoundBody, CompoundPlanStatement, CreateVariable, DropVariable, ForStatement, IfElseStatement, IterateStatement, LeaveStatement, LogicalPlan, LoopStatement, RepeatStatement, SingleStatement, WhileStatement} import org.apache.spark.sql.catalyst.trees.Origin /** * SQL scripting interpreter - builds SQL script execution plan. + * + * @param session + * Spark session that SQL script is executed within. */ -case class SqlScriptingInterpreter() { +case class SqlScriptingInterpreter(session: SparkSession) { /** * Build execution plan and return statements that need to be executed, @@ -33,15 +37,17 @@ case class SqlScriptingInterpreter() { * * @param compound * CompoundBody for which to build the plan. - * @param session - * Spark session that SQL script is executed within. + * @param args + * A map of parameter names to SQL literal expressions. * @return - * Iterator through collection of statements to be executed. + * Top level CompoundBodyExec representing SQL Script to be executed. 
*/ def buildExecutionPlan( compound: CompoundBody, - session: SparkSession): Iterator[CompoundStatementExec] = { - transformTreeIntoExecutable(compound, session).asInstanceOf[CompoundBodyExec].getTreeIterator + args: Map[String, Expression], + context: SqlScriptingExecutionContext): CompoundBodyExec = { + transformTreeIntoExecutable(compound, args, context) + .asInstanceOf[CompoundBodyExec] } /** @@ -62,15 +68,17 @@ case class SqlScriptingInterpreter() { * * @param node * Root node of the parsed tree. - * @param session - * Spark session that SQL script is executed within. + * @param args + * A map of parameter names to SQL literal expressions. * @return * Executable statement. */ private def transformTreeIntoExecutable( - node: CompoundPlanStatement, session: SparkSession): CompoundStatementExec = + node: CompoundPlanStatement, + args: Map[String, Expression], + context: SqlScriptingExecutionContext): CompoundStatementExec = node match { - case CompoundBody(collection, label) => + case CompoundBody(collection, label, isScope) => // TODO [SPARK-48530]: Current logic doesn't support scoped variables and shadowing. 
val variables = collection.flatMap { case st: SingleStatement => getDeclareVarNameFromPlan(st.parsedPlan) @@ -78,51 +86,92 @@ case class SqlScriptingInterpreter() { } val dropVariables = variables .map(varName => DropVariable(varName, ifExists = true)) - .map(new SingleStatementExec(_, Origin(), isInternal = true)) + .map(new SingleStatementExec(_, Origin(), args, isInternal = true, context)) .reverse + + val statements = collection + .map(st => transformTreeIntoExecutable(st, args, context)) ++ dropVariables match { + case Nil => Seq(new NoOpStatementExec) + case s => s + } + new CompoundBodyExec( - collection.map(st => transformTreeIntoExecutable(st, session)) ++ dropVariables, - label) + statements, + label, + isScope, + context) case IfElseStatement(conditions, conditionalBodies, elseBody) => val conditionsExec = conditions.map(condition => - new SingleStatementExec(condition.parsedPlan, condition.origin, isInternal = false)) + new SingleStatementExec( + condition.parsedPlan, + condition.origin, + args, + isInternal = false, + context)) val conditionalBodiesExec = conditionalBodies.map(body => - transformTreeIntoExecutable(body, session).asInstanceOf[CompoundBodyExec]) + transformTreeIntoExecutable(body, args, context).asInstanceOf[CompoundBodyExec]) val unconditionalBodiesExec = elseBody.map(body => - transformTreeIntoExecutable(body, session).asInstanceOf[CompoundBodyExec]) + transformTreeIntoExecutable(body, args, context).asInstanceOf[CompoundBodyExec]) new IfElseStatementExec( conditionsExec, conditionalBodiesExec, unconditionalBodiesExec, session) case CaseStatement(conditions, conditionalBodies, elseBody) => val conditionsExec = conditions.map(condition => - // todo: what to put here for isInternal, in case of simple case statement - new SingleStatementExec(condition.parsedPlan, condition.origin, isInternal = false)) + new SingleStatementExec( + condition.parsedPlan, + condition.origin, + args, + isInternal = false, + context)) val conditionalBodiesExec 
= conditionalBodies.map(body => - transformTreeIntoExecutable(body, session).asInstanceOf[CompoundBodyExec]) + transformTreeIntoExecutable(body, args, context).asInstanceOf[CompoundBodyExec]) val unconditionalBodiesExec = elseBody.map(body => - transformTreeIntoExecutable(body, session).asInstanceOf[CompoundBodyExec]) + transformTreeIntoExecutable(body, args, context).asInstanceOf[CompoundBodyExec]) new CaseStatementExec( conditionsExec, conditionalBodiesExec, unconditionalBodiesExec, session) case WhileStatement(condition, body, label) => val conditionExec = - new SingleStatementExec(condition.parsedPlan, condition.origin, isInternal = false) + new SingleStatementExec( + condition.parsedPlan, + condition.origin, + args, + isInternal = false, + context) val bodyExec = - transformTreeIntoExecutable(body, session).asInstanceOf[CompoundBodyExec] + transformTreeIntoExecutable(body, args, context).asInstanceOf[CompoundBodyExec] new WhileStatementExec(conditionExec, bodyExec, label, session) case RepeatStatement(condition, body, label) => val conditionExec = - new SingleStatementExec(condition.parsedPlan, condition.origin, isInternal = false) + new SingleStatementExec( + condition.parsedPlan, + condition.origin, + args, + isInternal = false, + context) val bodyExec = - transformTreeIntoExecutable(body, session).asInstanceOf[CompoundBodyExec] + transformTreeIntoExecutable(body, args, context).asInstanceOf[CompoundBodyExec] new RepeatStatementExec(conditionExec, bodyExec, label, session) case LoopStatement(body, label) => - val bodyExec = transformTreeIntoExecutable(body, session).asInstanceOf[CompoundBodyExec] + val bodyExec = transformTreeIntoExecutable(body, args, context) + .asInstanceOf[CompoundBodyExec] new LoopStatementExec(bodyExec, label) + case ForStatement(query, variableNameOpt, body, label) => + val queryExec = + new SingleStatementExec( + query.parsedPlan, + query.origin, + args, + isInternal = false, + context) + val bodyExec = + 
transformTreeIntoExecutable(body, args, context).asInstanceOf[CompoundBodyExec] + new ForStatementExec(queryExec, variableNameOpt, bodyExec, label, session, context) + case leaveStatement: LeaveStatement => new LeaveStatementExec(leaveStatement.label) @@ -133,6 +182,8 @@ case class SqlScriptingInterpreter() { new SingleStatementExec( sparkStatement.parsedPlan, sparkStatement.origin, - isInternal = false) + args, + isInternal = false, + context) } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamWriter.scala b/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamWriter.scala index b0233d2c51b75..d41933c6a135c 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamWriter.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamWriter.scala @@ -175,6 +175,7 @@ final class DataStreamWriter[T] private[sql](ds: Dataset[T]) extends api.DataStr extraOptions.get("path"), None, None, + None, external = false) val cmd = CreateTable( UnresolvedIdentifier(originalMultipartIdentifier), diff --git a/sql/core/src/test/resources/sql-functions/sql-expression-schema.md b/sql/core/src/test/resources/sql-functions/sql-expression-schema.md index c54e09735a9be..39cefdaa892b2 100644 --- a/sql/core/src/test/resources/sql-functions/sql-expression-schema.md +++ b/sql/core/src/test/resources/sql-functions/sql-expression-schema.md @@ -425,6 +425,8 @@ | org.apache.spark.sql.catalyst.expressions.aggregate.Kurtosis | kurtosis | SELECT kurtosis(col) FROM VALUES (-10), (-20), (100), (1000) AS tab(col) | struct | | org.apache.spark.sql.catalyst.expressions.aggregate.Last | last | SELECT last(col) FROM VALUES (10), (5), (20) AS tab(col) | struct | | org.apache.spark.sql.catalyst.expressions.aggregate.Last | last_value | SELECT last_value(col) FROM VALUES (10), (5), (20) AS tab(col) | struct | +| org.apache.spark.sql.catalyst.expressions.aggregate.ListAgg | listagg | SELECT listagg(col) FROM VALUES ('a'), ('b'), 
('c') AS tab(col) | struct | +| org.apache.spark.sql.catalyst.expressions.aggregate.ListAgg | string_agg | SELECT string_agg(col) FROM VALUES ('a'), ('b'), ('c') AS tab(col) | struct | | org.apache.spark.sql.catalyst.expressions.aggregate.Max | max | SELECT max(col) FROM VALUES (10), (50), (20) AS tab(col) | struct | | org.apache.spark.sql.catalyst.expressions.aggregate.MaxBy | max_by | SELECT max_by(x, y) FROM VALUES ('a', 10), ('b', 50), ('c', 20) AS tab(x, y) | struct | | org.apache.spark.sql.catalyst.expressions.aggregate.Median | median | SELECT median(col) FROM VALUES (0), (10) AS tab(col) | struct | diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/collations.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/collations.sql.out index 2fd2261708c91..ca51aa50ac1bb 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/collations.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/collations.sql.out @@ -442,77 +442,77 @@ Project [array_except(array(collate(aaa, utf8_lcase)), array(collate(AAA, utf8_l -- !query select 'a' collate unicode < 'A' -- !query analysis -Project [(collate(a, unicode) < cast(A as string collate UNICODE)) AS (collate(a, unicode) < A)#x] +Project [(collate(a, unicode) < A) AS (collate(a, unicode) < 'A' collate UNICODE)#x] +- OneRowRelation -- !query select 'a' collate unicode_ci = 'A' -- !query analysis -Project [(collate(a, unicode_ci) = cast(A as string collate UNICODE_CI)) AS (collate(a, unicode_ci) = A)#x] +Project [(collate(a, unicode_ci) = A) AS (collate(a, unicode_ci) = 'A' collate UNICODE_CI)#x] +- OneRowRelation -- !query select 'a' collate unicode_ai = 'å' -- !query analysis -Project [(collate(a, unicode_ai) = cast(å as string collate UNICODE_AI)) AS (collate(a, unicode_ai) = å)#x] +Project [(collate(a, unicode_ai) = å) AS (collate(a, unicode_ai) = 'å' collate UNICODE_AI)#x] +- OneRowRelation -- !query select 'a' collate unicode_ci_ai = 'Å' -- !query analysis -Project 
[(collate(a, unicode_ci_ai) = cast(Å as string collate UNICODE_CI_AI)) AS (collate(a, unicode_ci_ai) = Å)#x] +Project [(collate(a, unicode_ci_ai) = Å) AS (collate(a, unicode_ci_ai) = 'Å' collate UNICODE_CI_AI)#x] +- OneRowRelation -- !query select 'a' collate en < 'A' -- !query analysis -Project [(collate(a, en) < cast(A as string collate en)) AS (collate(a, en) < A)#x] +Project [(collate(a, en) < A) AS (collate(a, en) < 'A' collate en)#x] +- OneRowRelation -- !query select 'a' collate en_ci = 'A' -- !query analysis -Project [(collate(a, en_ci) = cast(A as string collate en_CI)) AS (collate(a, en_ci) = A)#x] +Project [(collate(a, en_ci) = A) AS (collate(a, en_ci) = 'A' collate en_CI)#x] +- OneRowRelation -- !query select 'a' collate en_ai = 'å' -- !query analysis -Project [(collate(a, en_ai) = cast(å as string collate en_AI)) AS (collate(a, en_ai) = å)#x] +Project [(collate(a, en_ai) = å) AS (collate(a, en_ai) = 'å' collate en_AI)#x] +- OneRowRelation -- !query select 'a' collate en_ci_ai = 'Å' -- !query analysis -Project [(collate(a, en_ci_ai) = cast(Å as string collate en_CI_AI)) AS (collate(a, en_ci_ai) = Å)#x] +Project [(collate(a, en_ci_ai) = Å) AS (collate(a, en_ci_ai) = 'Å' collate en_CI_AI)#x] +- OneRowRelation -- !query select 'Kypper' collate sv < 'Köpfe' -- !query analysis -Project [(collate(Kypper, sv) < cast(Köpfe as string collate sv)) AS (collate(Kypper, sv) < Köpfe)#x] +Project [(collate(Kypper, sv) < Köpfe) AS (collate(Kypper, sv) < 'Köpfe' collate sv)#x] +- OneRowRelation -- !query select 'Kypper' collate de > 'Köpfe' -- !query analysis -Project [(collate(Kypper, de) > cast(Köpfe as string collate de)) AS (collate(Kypper, de) > Köpfe)#x] +Project [(collate(Kypper, de) > Köpfe) AS (collate(Kypper, de) > 'Köpfe' collate de)#x] +- OneRowRelation -- !query select 'I' collate tr_ci = 'ı' -- !query analysis -Project [(collate(I, tr_ci) = cast(ı as string collate tr_CI)) AS (collate(I, tr_ci) = ı)#x] +Project [(collate(I, tr_ci) = ı) AS (collate(I, 
tr_ci) = 'ı' collate tr_CI)#x] +- OneRowRelation @@ -826,7 +826,7 @@ InsertIntoHadoopFsRelationCommand file:[not included in comparison]/{warehouse_d -- !query select concat_ws(' ', utf8_lcase, utf8_lcase) from t5 -- !query analysis -Project [concat_ws(cast( as string collate UTF8_LCASE), utf8_lcase#x, utf8_lcase#x) AS concat_ws( , utf8_lcase, utf8_lcase)#x] +Project [concat_ws( , utf8_lcase#x, utf8_lcase#x) AS concat_ws( , utf8_lcase, utf8_lcase)#x] +- SubqueryAlias spark_catalog.default.t5 +- Relation spark_catalog.default.t5[s#x,utf8_binary#x,utf8_lcase#x] parquet @@ -868,7 +868,7 @@ Project [concat_ws(collate( , utf8_lcase), cast(utf8_binary#x as string collate -- !query select concat_ws(',', utf8_lcase, 'word'), concat_ws(',', utf8_binary, 'word') from t5 -- !query analysis -Project [concat_ws(cast(, as string collate UTF8_LCASE), utf8_lcase#x, cast(word as string collate UTF8_LCASE)) AS concat_ws(,, utf8_lcase, word)#x, concat_ws(,, utf8_binary#x, word) AS concat_ws(,, utf8_binary, word)#x] +Project [concat_ws(,, utf8_lcase#x, word) AS concat_ws(,, utf8_lcase, word)#x, concat_ws(,, utf8_binary#x, word) AS concat_ws(,, utf8_binary, word)#x] +- SubqueryAlias spark_catalog.default.t5 +- Relation spark_catalog.default.t5[s#x,utf8_binary#x,utf8_lcase#x] parquet @@ -876,7 +876,7 @@ Project [concat_ws(cast(, as string collate UTF8_LCASE), utf8_lcase#x, cast(word -- !query select concat_ws(',', utf8_lcase, 'word' collate utf8_binary), concat_ws(',', utf8_binary, 'word' collate utf8_lcase) from t5 -- !query analysis -Project [concat_ws(,, cast(utf8_lcase#x as string), collate(word, utf8_binary)) AS concat_ws(,, utf8_lcase, collate(word, utf8_binary))#x, concat_ws(cast(, as string collate UTF8_LCASE), cast(utf8_binary#x as string collate UTF8_LCASE), collate(word, utf8_lcase)) AS concat_ws(,, utf8_binary, collate(word, utf8_lcase))#x] +Project [concat_ws(,, cast(utf8_lcase#x as string), collate(word, utf8_binary)) AS concat_ws(,, utf8_lcase, collate(word, 
utf8_binary))#x, concat_ws(,, cast(utf8_binary#x as string collate UTF8_LCASE), collate(word, utf8_lcase)) AS concat_ws(,, utf8_binary, collate(word, utf8_lcase))#x] +- SubqueryAlias spark_catalog.default.t5 +- Relation spark_catalog.default.t5[s#x,utf8_binary#x,utf8_lcase#x] parquet @@ -934,7 +934,7 @@ Project [elt(1, collate(utf8_binary#x, utf8_binary), cast(utf8_lcase#x as string -- !query select elt(1, utf8_binary, 'word'), elt(1, utf8_lcase, 'word') from t5 -- !query analysis -Project [elt(1, utf8_binary#x, word, true) AS elt(1, utf8_binary, word)#x, elt(1, utf8_lcase#x, cast(word as string collate UTF8_LCASE), true) AS elt(1, utf8_lcase, word)#x] +Project [elt(1, utf8_binary#x, word, true) AS elt(1, utf8_binary, word)#x, elt(1, utf8_lcase#x, word, true) AS elt(1, utf8_lcase, 'word' collate UTF8_LCASE)#x] +- SubqueryAlias spark_catalog.default.t5 +- Relation spark_catalog.default.t5[s#x,utf8_binary#x,utf8_lcase#x] parquet @@ -1024,7 +1024,7 @@ org.apache.spark.sql.catalyst.ExtendedAnalysisException -- !query select split_part(utf8_binary, 'a', 3), split_part(utf8_lcase, 'a', 3) from t5 -- !query analysis -Project [split_part(utf8_binary#x, a, 3) AS split_part(utf8_binary, a, 3)#x, split_part(utf8_lcase#x, cast(a as string collate UTF8_LCASE), 3) AS split_part(utf8_lcase, a, 3)#x] +Project [split_part(utf8_binary#x, a, 3) AS split_part(utf8_binary, a, 3)#x, split_part(utf8_lcase#x, a, 3) AS split_part(utf8_lcase, a, 3)#x] +- SubqueryAlias spark_catalog.default.t5 +- Relation spark_catalog.default.t5[s#x,utf8_binary#x,utf8_lcase#x] parquet @@ -1122,7 +1122,7 @@ org.apache.spark.sql.catalyst.ExtendedAnalysisException -- !query select contains(utf8_binary, 'a'), contains(utf8_lcase, 'a') from t5 -- !query analysis -Project [Contains(utf8_binary#x, a) AS contains(utf8_binary, a)#x, Contains(utf8_lcase#x, cast(a as string collate UTF8_LCASE)) AS contains(utf8_lcase, a)#x] +Project [Contains(utf8_binary#x, a) AS contains(utf8_binary, a)#x, Contains(utf8_lcase#x, a) 
AS contains(utf8_lcase, a)#x] +- SubqueryAlias spark_catalog.default.t5 +- Relation spark_catalog.default.t5[s#x,utf8_binary#x,utf8_lcase#x] parquet @@ -1220,7 +1220,7 @@ org.apache.spark.sql.catalyst.ExtendedAnalysisException -- !query select substring_index(utf8_binary, 'a', 2), substring_index(utf8_lcase, 'a', 2) from t5 -- !query analysis -Project [substring_index(utf8_binary#x, a, 2) AS substring_index(utf8_binary, a, 2)#x, substring_index(utf8_lcase#x, cast(a as string collate UTF8_LCASE), 2) AS substring_index(utf8_lcase, a, 2)#x] +Project [substring_index(utf8_binary#x, a, 2) AS substring_index(utf8_binary, a, 2)#x, substring_index(utf8_lcase#x, a, 2) AS substring_index(utf8_lcase, a, 2)#x] +- SubqueryAlias spark_catalog.default.t5 +- Relation spark_catalog.default.t5[s#x,utf8_binary#x,utf8_lcase#x] parquet @@ -1318,7 +1318,7 @@ org.apache.spark.sql.catalyst.ExtendedAnalysisException -- !query select instr(utf8_binary, 'a'), instr(utf8_lcase, 'a') from t5 -- !query analysis -Project [instr(utf8_binary#x, a) AS instr(utf8_binary, a)#x, instr(utf8_lcase#x, cast(a as string collate UTF8_LCASE)) AS instr(utf8_lcase, a)#x] +Project [instr(utf8_binary#x, a) AS instr(utf8_binary, a)#x, instr(utf8_lcase#x, a) AS instr(utf8_lcase, a)#x] +- SubqueryAlias spark_catalog.default.t5 +- Relation spark_catalog.default.t5[s#x,utf8_binary#x,utf8_lcase#x] parquet @@ -1384,7 +1384,7 @@ Project [find_in_set(collate(utf8_binary#x, utf8_lcase), collate(utf8_lcase#x, u -- !query select find_in_set(utf8_binary, 'aaAaaAaA,i̇o'), find_in_set(utf8_lcase, 'aaAaaAaA,i̇o') from t5 -- !query analysis -Project [find_in_set(utf8_binary#x, aaAaaAaA,i̇o) AS find_in_set(utf8_binary, aaAaaAaA,i̇o)#x, find_in_set(utf8_lcase#x, cast(aaAaaAaA,i̇o as string collate UTF8_LCASE)) AS find_in_set(utf8_lcase, aaAaaAaA,i̇o)#x] +Project [find_in_set(utf8_binary#x, aaAaaAaA,i̇o) AS find_in_set(utf8_binary, aaAaaAaA,i̇o)#x, find_in_set(utf8_lcase#x, aaAaaAaA,i̇o) AS find_in_set(utf8_lcase, aaAaaAaA,i̇o)#x] 
+- SubqueryAlias spark_catalog.default.t5 +- Relation spark_catalog.default.t5[s#x,utf8_binary#x,utf8_lcase#x] parquet @@ -1482,7 +1482,7 @@ org.apache.spark.sql.catalyst.ExtendedAnalysisException -- !query select startswith(utf8_binary, 'aaAaaAaA'), startswith(utf8_lcase, 'aaAaaAaA') from t5 -- !query analysis -Project [StartsWith(utf8_binary#x, aaAaaAaA) AS startswith(utf8_binary, aaAaaAaA)#x, StartsWith(utf8_lcase#x, cast(aaAaaAaA as string collate UTF8_LCASE)) AS startswith(utf8_lcase, aaAaaAaA)#x] +Project [StartsWith(utf8_binary#x, aaAaaAaA) AS startswith(utf8_binary, aaAaaAaA)#x, StartsWith(utf8_lcase#x, aaAaaAaA) AS startswith(utf8_lcase, aaAaaAaA)#x] +- SubqueryAlias spark_catalog.default.t5 +- Relation spark_catalog.default.t5[s#x,utf8_binary#x,utf8_lcase#x] parquet @@ -1506,7 +1506,7 @@ Project [StartsWith(cast(utf8_binary#x as string collate UTF8_LCASE_RTRIM), coll -- !query select translate(utf8_lcase, utf8_lcase, '12345') from t5 -- !query analysis -Project [translate(utf8_lcase#x, utf8_lcase#x, cast(12345 as string collate UTF8_LCASE)) AS translate(utf8_lcase, utf8_lcase, 12345)#x] +Project [translate(utf8_lcase#x, utf8_lcase#x, 12345) AS translate(utf8_lcase, utf8_lcase, 12345)#x] +- SubqueryAlias spark_catalog.default.t5 +- Relation spark_catalog.default.t5[s#x,utf8_binary#x,utf8_lcase#x] parquet @@ -1572,7 +1572,7 @@ org.apache.spark.sql.catalyst.ExtendedAnalysisException -- !query select translate(utf8_lcase, 'aaAaaAaA', '12345'), translate(utf8_binary, 'aaAaaAaA', '12345') from t5 -- !query analysis -Project [translate(utf8_lcase#x, cast(aaAaaAaA as string collate UTF8_LCASE), cast(12345 as string collate UTF8_LCASE)) AS translate(utf8_lcase, aaAaaAaA, 12345)#x, translate(utf8_binary#x, aaAaaAaA, 12345) AS translate(utf8_binary, aaAaaAaA, 12345)#x] +Project [translate(utf8_lcase#x, aaAaaAaA, 12345) AS translate(utf8_lcase, aaAaaAaA, 12345)#x, translate(utf8_binary#x, aaAaaAaA, 12345) AS translate(utf8_binary, aaAaaAaA, 12345)#x] +- SubqueryAlias 
spark_catalog.default.t5 +- Relation spark_catalog.default.t5[s#x,utf8_binary#x,utf8_lcase#x] parquet @@ -1580,7 +1580,7 @@ Project [translate(utf8_lcase#x, cast(aaAaaAaA as string collate UTF8_LCASE), ca -- !query select translate(utf8_lcase, 'aBc' collate utf8_binary, '12345'), translate(utf8_binary, 'aBc' collate utf8_lcase, '12345') from t5 -- !query analysis -Project [translate(cast(utf8_lcase#x as string), collate(aBc, utf8_binary), 12345) AS translate(utf8_lcase, collate(aBc, utf8_binary), 12345)#x, translate(cast(utf8_binary#x as string collate UTF8_LCASE), collate(aBc, utf8_lcase), cast(12345 as string collate UTF8_LCASE)) AS translate(utf8_binary, collate(aBc, utf8_lcase), 12345)#x] +Project [translate(cast(utf8_lcase#x as string), collate(aBc, utf8_binary), 12345) AS translate(utf8_lcase, collate(aBc, utf8_binary), 12345)#x, translate(cast(utf8_binary#x as string collate UTF8_LCASE), collate(aBc, utf8_lcase), 12345) AS translate(utf8_binary, collate(aBc, utf8_lcase), 12345)#x] +- SubqueryAlias spark_catalog.default.t5 +- Relation spark_catalog.default.t5[s#x,utf8_binary#x,utf8_lcase#x] parquet @@ -1588,7 +1588,7 @@ Project [translate(cast(utf8_lcase#x as string), collate(aBc, utf8_binary), 1234 -- !query select translate(utf8_lcase, 'aBc ' collate utf8_binary_rtrim, '12345'), translate(utf8_binary, 'aBc' collate utf8_lcase, '12345') from t5 -- !query analysis -Project [translate(cast(utf8_lcase#x as string collate UTF8_BINARY_RTRIM), collate(aBc , utf8_binary_rtrim), cast(12345 as string collate UTF8_BINARY_RTRIM)) AS translate(utf8_lcase, collate(aBc , utf8_binary_rtrim), 12345)#x, translate(cast(utf8_binary#x as string collate UTF8_LCASE), collate(aBc, utf8_lcase), cast(12345 as string collate UTF8_LCASE)) AS translate(utf8_binary, collate(aBc, utf8_lcase), 12345)#x] +Project [translate(cast(utf8_lcase#x as string collate UTF8_BINARY_RTRIM), collate(aBc , utf8_binary_rtrim), 12345) AS translate(utf8_lcase, collate(aBc , utf8_binary_rtrim), 12345)#x, 
translate(cast(utf8_binary#x as string collate UTF8_LCASE), collate(aBc, utf8_lcase), 12345) AS translate(utf8_binary, collate(aBc, utf8_lcase), 12345)#x] +- SubqueryAlias spark_catalog.default.t5 +- Relation spark_catalog.default.t5[s#x,utf8_binary#x,utf8_lcase#x] parquet @@ -1638,7 +1638,7 @@ Project [replace(utf8_binary#x, collate(utf8_lcase#x, utf8_binary), abc) AS repl -- !query select replace(utf8_binary collate utf8_lcase, utf8_lcase collate utf8_lcase, 'abc') from t5 -- !query analysis -Project [replace(collate(utf8_binary#x, utf8_lcase), collate(utf8_lcase#x, utf8_lcase), cast(abc as string collate UTF8_LCASE)) AS replace(collate(utf8_binary, utf8_lcase), collate(utf8_lcase, utf8_lcase), abc)#x] +Project [replace(collate(utf8_binary#x, utf8_lcase), collate(utf8_lcase#x, utf8_lcase), abc) AS replace(collate(utf8_binary, utf8_lcase), collate(utf8_lcase, utf8_lcase), abc)#x] +- SubqueryAlias spark_catalog.default.t5 +- Relation spark_catalog.default.t5[s#x,utf8_binary#x,utf8_lcase#x] parquet @@ -1655,7 +1655,7 @@ org.apache.spark.sql.catalyst.ExtendedAnalysisException "inputType" : "\"STRING COLLATE UNICODE_AI\"", "paramIndex" : "first", "requiredType" : "\"STRING\"", - "sqlExpr" : "\"replace(collate(utf8_binary, unicode_ai), collate(utf8_lcase, unicode_ai), abc)\"" + "sqlExpr" : "\"replace(collate(utf8_binary, unicode_ai), collate(utf8_lcase, unicode_ai), 'abc' collate UNICODE_AI)\"" }, "queryContext" : [ { "objectType" : "", @@ -1670,7 +1670,7 @@ org.apache.spark.sql.catalyst.ExtendedAnalysisException -- !query select replace(utf8_binary, 'aaAaaAaA', 'abc'), replace(utf8_lcase, 'aaAaaAaA', 'abc') from t5 -- !query analysis -Project [replace(utf8_binary#x, aaAaaAaA, abc) AS replace(utf8_binary, aaAaaAaA, abc)#x, replace(utf8_lcase#x, cast(aaAaaAaA as string collate UTF8_LCASE), cast(abc as string collate UTF8_LCASE)) AS replace(utf8_lcase, aaAaaAaA, abc)#x] +Project [replace(utf8_binary#x, aaAaaAaA, abc) AS replace(utf8_binary, aaAaaAaA, abc)#x, 
replace(utf8_lcase#x, aaAaaAaA, abc) AS replace(utf8_lcase, aaAaaAaA, abc)#x] +- SubqueryAlias spark_catalog.default.t5 +- Relation spark_catalog.default.t5[s#x,utf8_binary#x,utf8_lcase#x] parquet @@ -1678,7 +1678,7 @@ Project [replace(utf8_binary#x, aaAaaAaA, abc) AS replace(utf8_binary, aaAaaAaA, -- !query select replace(utf8_binary, 'aaAaaAaA' collate utf8_lcase, 'abc'), replace(utf8_lcase, 'aaAaaAaA' collate utf8_binary, 'abc') from t5 -- !query analysis -Project [replace(cast(utf8_binary#x as string collate UTF8_LCASE), collate(aaAaaAaA, utf8_lcase), cast(abc as string collate UTF8_LCASE)) AS replace(utf8_binary, collate(aaAaaAaA, utf8_lcase), abc)#x, replace(cast(utf8_lcase#x as string), collate(aaAaaAaA, utf8_binary), abc) AS replace(utf8_lcase, collate(aaAaaAaA, utf8_binary), abc)#x] +Project [replace(cast(utf8_binary#x as string collate UTF8_LCASE), collate(aaAaaAaA, utf8_lcase), abc) AS replace(utf8_binary, collate(aaAaaAaA, utf8_lcase), abc)#x, replace(cast(utf8_lcase#x as string), collate(aaAaaAaA, utf8_binary), abc) AS replace(utf8_lcase, collate(aaAaaAaA, utf8_binary), abc)#x] +- SubqueryAlias spark_catalog.default.t5 +- Relation spark_catalog.default.t5[s#x,utf8_binary#x,utf8_lcase#x] parquet @@ -1686,7 +1686,7 @@ Project [replace(cast(utf8_binary#x as string collate UTF8_LCASE), collate(aaAaa -- !query select replace(utf8_binary, 'aaAaaAaA ' collate utf8_lcase_rtrim, 'abc'), replace(utf8_lcase, 'aaAaaAaA' collate utf8_binary, 'abc') from t5 -- !query analysis -Project [replace(cast(utf8_binary#x as string collate UTF8_LCASE_RTRIM), collate(aaAaaAaA , utf8_lcase_rtrim), cast(abc as string collate UTF8_LCASE_RTRIM)) AS replace(utf8_binary, collate(aaAaaAaA , utf8_lcase_rtrim), abc)#x, replace(cast(utf8_lcase#x as string), collate(aaAaaAaA, utf8_binary), abc) AS replace(utf8_lcase, collate(aaAaaAaA, utf8_binary), abc)#x] +Project [replace(cast(utf8_binary#x as string collate UTF8_LCASE_RTRIM), collate(aaAaaAaA , utf8_lcase_rtrim), abc) AS 
replace(utf8_binary, collate(aaAaaAaA , utf8_lcase_rtrim), abc)#x, replace(cast(utf8_lcase#x as string), collate(aaAaaAaA, utf8_binary), abc) AS replace(utf8_lcase, collate(aaAaaAaA, utf8_binary), abc)#x] +- SubqueryAlias spark_catalog.default.t5 +- Relation spark_catalog.default.t5[s#x,utf8_binary#x,utf8_lcase#x] parquet @@ -1768,7 +1768,7 @@ org.apache.spark.sql.catalyst.ExtendedAnalysisException -- !query select endswith(utf8_binary, 'aaAaaAaA'), endswith(utf8_lcase, 'aaAaaAaA') from t5 -- !query analysis -Project [EndsWith(utf8_binary#x, aaAaaAaA) AS endswith(utf8_binary, aaAaaAaA)#x, EndsWith(utf8_lcase#x, cast(aaAaaAaA as string collate UTF8_LCASE)) AS endswith(utf8_lcase, aaAaaAaA)#x] +Project [EndsWith(utf8_binary#x, aaAaaAaA) AS endswith(utf8_binary, aaAaaAaA)#x, EndsWith(utf8_lcase#x, aaAaaAaA) AS endswith(utf8_lcase, aaAaaAaA)#x] +- SubqueryAlias spark_catalog.default.t5 +- Relation spark_catalog.default.t5[s#x,utf8_binary#x,utf8_lcase#x] parquet @@ -2042,7 +2042,7 @@ Project [overlay(collate(utf8_binary#x, utf8_lcase), collate(utf8_lcase#x, utf8_ -- !query select overlay(utf8_binary, 'a', 2), overlay(utf8_lcase, 'a', 2) from t5 -- !query analysis -Project [overlay(utf8_binary#x, a, 2, -1) AS overlay(utf8_binary, a, 2, -1)#x, overlay(utf8_lcase#x, cast(a as string collate UTF8_LCASE), 2, -1) AS overlay(utf8_lcase, a, 2, -1)#x] +Project [overlay(utf8_binary#x, a, 2, -1) AS overlay(utf8_binary, a, 2, -1)#x, overlay(utf8_lcase#x, a, 2, -1) AS overlay(utf8_lcase, 'a' collate UTF8_LCASE, 2, -1)#x] +- SubqueryAlias spark_catalog.default.t5 +- Relation spark_catalog.default.t5[s#x,utf8_binary#x,utf8_lcase#x] parquet @@ -2143,6 +2143,14 @@ Project [octet_length(collate(utf8_binary#x, utf8_lcase)) AS octet_length(collat +- Relation spark_catalog.default.t5[s#x,utf8_binary#x,utf8_lcase#x] parquet +-- !query +select octet_length(utf8_binary collate utf8_lcase_rtrim), octet_length(utf8_lcase collate utf8_binary_rtrim) from t5 +-- !query analysis +Project 
[octet_length(collate(utf8_binary#x, utf8_lcase_rtrim)) AS octet_length(collate(utf8_binary, utf8_lcase_rtrim))#x, octet_length(collate(utf8_lcase#x, utf8_binary_rtrim)) AS octet_length(collate(utf8_lcase, utf8_binary_rtrim))#x] ++- SubqueryAlias spark_catalog.default.t5 + +- Relation spark_catalog.default.t5[s#x,utf8_binary#x,utf8_lcase#x] parquet + + -- !query select luhn_check(num) from t9 -- !query analysis @@ -2204,7 +2212,7 @@ Project [levenshtein(collate(utf8_binary#x, utf8_lcase), collate(utf8_lcase#x, u -- !query select levenshtein(utf8_binary, 'a'), levenshtein(utf8_lcase, 'a') from t5 -- !query analysis -Project [levenshtein(utf8_binary#x, a, None) AS levenshtein(utf8_binary, a)#x, levenshtein(utf8_lcase#x, cast(a as string collate UTF8_LCASE), None) AS levenshtein(utf8_lcase, a)#x] +Project [levenshtein(utf8_binary#x, a, None) AS levenshtein(utf8_binary, a)#x, levenshtein(utf8_lcase#x, a, None) AS levenshtein(utf8_lcase, a)#x] +- SubqueryAlias spark_catalog.default.t5 +- Relation spark_catalog.default.t5[s#x,utf8_binary#x,utf8_lcase#x] parquet @@ -2233,6 +2241,14 @@ Project [is_valid_utf8(collate(utf8_binary#x, utf8_lcase)) AS is_valid_utf8(coll +- Relation spark_catalog.default.t5[s#x,utf8_binary#x,utf8_lcase#x] parquet +-- !query +select is_valid_utf8(utf8_binary collate utf8_lcase_rtrim), is_valid_utf8(utf8_lcase collate utf8_binary_rtrim) from t5 +-- !query analysis +Project [is_valid_utf8(collate(utf8_binary#x, utf8_lcase_rtrim)) AS is_valid_utf8(collate(utf8_binary, utf8_lcase_rtrim))#x, is_valid_utf8(collate(utf8_lcase#x, utf8_binary_rtrim)) AS is_valid_utf8(collate(utf8_lcase, utf8_binary_rtrim))#x] ++- SubqueryAlias spark_catalog.default.t5 + +- Relation spark_catalog.default.t5[s#x,utf8_binary#x,utf8_lcase#x] parquet + + -- !query select make_valid_utf8(utf8_binary), make_valid_utf8(utf8_lcase) from t5 -- !query analysis @@ -2249,6 +2265,14 @@ Project [make_valid_utf8(collate(utf8_binary#x, utf8_lcase)) AS make_valid_utf8( +- Relation 
spark_catalog.default.t5[s#x,utf8_binary#x,utf8_lcase#x] parquet +-- !query +select make_valid_utf8(utf8_binary collate utf8_lcase_rtrim), make_valid_utf8(utf8_lcase collate utf8_binary_rtrim) from t5 +-- !query analysis +Project [make_valid_utf8(collate(utf8_binary#x, utf8_lcase_rtrim)) AS make_valid_utf8(collate(utf8_binary, utf8_lcase_rtrim))#x, make_valid_utf8(collate(utf8_lcase#x, utf8_binary_rtrim)) AS make_valid_utf8(collate(utf8_lcase, utf8_binary_rtrim))#x] ++- SubqueryAlias spark_catalog.default.t5 + +- Relation spark_catalog.default.t5[s#x,utf8_binary#x,utf8_lcase#x] parquet + + -- !query select validate_utf8(utf8_binary), validate_utf8(utf8_lcase) from t5 -- !query analysis @@ -2265,6 +2289,14 @@ Project [validate_utf8(collate(utf8_binary#x, utf8_lcase)) AS validate_utf8(coll +- Relation spark_catalog.default.t5[s#x,utf8_binary#x,utf8_lcase#x] parquet +-- !query +select validate_utf8(utf8_binary collate utf8_lcase_rtrim), validate_utf8(utf8_lcase collate utf8_binary_rtrim) from t5 +-- !query analysis +Project [validate_utf8(collate(utf8_binary#x, utf8_lcase_rtrim)) AS validate_utf8(collate(utf8_binary, utf8_lcase_rtrim))#x, validate_utf8(collate(utf8_lcase#x, utf8_binary_rtrim)) AS validate_utf8(collate(utf8_lcase, utf8_binary_rtrim))#x] ++- SubqueryAlias spark_catalog.default.t5 + +- Relation spark_catalog.default.t5[s#x,utf8_binary#x,utf8_lcase#x] parquet + + -- !query select try_validate_utf8(utf8_binary), try_validate_utf8(utf8_lcase) from t5 -- !query analysis @@ -2281,6 +2313,14 @@ Project [try_validate_utf8(collate(utf8_binary#x, utf8_lcase)) AS try_validate_u +- Relation spark_catalog.default.t5[s#x,utf8_binary#x,utf8_lcase#x] parquet +-- !query +select try_validate_utf8(utf8_binary collate utf8_lcase_rtrim), try_validate_utf8(utf8_lcase collate utf8_binary_rtrim) from t5 +-- !query analysis +Project [try_validate_utf8(collate(utf8_binary#x, utf8_lcase_rtrim)) AS try_validate_utf8(collate(utf8_binary, utf8_lcase_rtrim))#x, 
try_validate_utf8(collate(utf8_lcase#x, utf8_binary_rtrim)) AS try_validate_utf8(collate(utf8_lcase, utf8_binary_rtrim))#x] ++- SubqueryAlias spark_catalog.default.t5 + +- Relation spark_catalog.default.t5[s#x,utf8_binary#x,utf8_lcase#x] parquet + + -- !query select substr(utf8_binary, 2, 2), substr(utf8_lcase, 2, 2) from t5 -- !query analysis @@ -2390,7 +2430,7 @@ Project [lpad(collate(utf8_binary#x, utf8_binary_rtrim), 8, collate(utf8_lcase#x -- !query select rpad(utf8_binary, 8, 'a'), rpad(utf8_lcase, 8, 'a') from t5 -- !query analysis -Project [rpad(utf8_binary#x, 8, a) AS rpad(utf8_binary, 8, a)#x, rpad(utf8_lcase#x, 8, cast(a as string collate UTF8_LCASE)) AS rpad(utf8_lcase, 8, a)#x] +Project [rpad(utf8_binary#x, 8, a) AS rpad(utf8_binary, 8, a)#x, rpad(utf8_lcase#x, 8, a) AS rpad(utf8_lcase, 8, a)#x] +- SubqueryAlias spark_catalog.default.t5 +- Relation spark_catalog.default.t5[s#x,utf8_binary#x,utf8_lcase#x] parquet @@ -2464,7 +2504,7 @@ Project [lpad(collate(utf8_binary#x, utf8_binary_rtrim), 8, collate(utf8_lcase#x -- !query select lpad(utf8_binary, 8, 'a'), lpad(utf8_lcase, 8, 'a') from t5 -- !query analysis -Project [lpad(utf8_binary#x, 8, a) AS lpad(utf8_binary, 8, a)#x, lpad(utf8_lcase#x, 8, cast(a as string collate UTF8_LCASE)) AS lpad(utf8_lcase, 8, a)#x] +Project [lpad(utf8_binary#x, 8, a) AS lpad(utf8_binary, 8, a)#x, lpad(utf8_lcase#x, 8, a) AS lpad(utf8_lcase, 8, a)#x] +- SubqueryAlias spark_catalog.default.t5 +- Relation spark_catalog.default.t5[s#x,utf8_binary#x,utf8_lcase#x] parquet @@ -2554,7 +2594,7 @@ org.apache.spark.sql.catalyst.ExtendedAnalysisException -- !query select locate(utf8_binary, 'a'), locate(utf8_lcase, 'a') from t5 -- !query analysis -Project [locate(utf8_binary#x, a, 1) AS locate(utf8_binary, a, 1)#x, locate(utf8_lcase#x, cast(a as string collate UTF8_LCASE), 1) AS locate(utf8_lcase, a, 1)#x] +Project [locate(utf8_binary#x, a, 1) AS locate(utf8_binary, a, 1)#x, locate(utf8_lcase#x, a, 1) AS locate(utf8_lcase, a, 1)#x] +- 
SubqueryAlias spark_catalog.default.t5 +- Relation spark_catalog.default.t5[s#x,utf8_binary#x,utf8_lcase#x] parquet @@ -2660,7 +2700,7 @@ Project [trim(collate(utf8_lcase#x, utf8_binary_rtrim), Some(collate(utf8_binary -- !query select TRIM('ABc', utf8_binary), TRIM('ABc', utf8_lcase) from t5 -- !query analysis -Project [trim(utf8_binary#x, Some(ABc)) AS TRIM(BOTH ABc FROM utf8_binary)#x, trim(utf8_lcase#x, Some(cast(ABc as string collate UTF8_LCASE))) AS TRIM(BOTH ABc FROM utf8_lcase)#x] +Project [trim(utf8_binary#x, Some(ABc)) AS TRIM(BOTH ABc FROM utf8_binary)#x, trim(utf8_lcase#x, Some(ABc)) AS TRIM(BOTH ABc FROM utf8_lcase)#x] +- SubqueryAlias spark_catalog.default.t5 +- Relation spark_catalog.default.t5[s#x,utf8_binary#x,utf8_lcase#x] parquet @@ -2856,7 +2896,7 @@ Project [ltrim(collate(utf8_lcase#x, utf8_binary_rtrim), Some(collate(utf8_binar -- !query select LTRIM('ABc', utf8_binary), LTRIM('ABc', utf8_lcase) from t5 -- !query analysis -Project [ltrim(utf8_binary#x, Some(ABc)) AS TRIM(LEADING ABc FROM utf8_binary)#x, ltrim(utf8_lcase#x, Some(cast(ABc as string collate UTF8_LCASE))) AS TRIM(LEADING ABc FROM utf8_lcase)#x] +Project [ltrim(utf8_binary#x, Some(ABc)) AS TRIM(LEADING ABc FROM utf8_binary)#x, ltrim(utf8_lcase#x, Some(ABc)) AS TRIM(LEADING ABc FROM utf8_lcase)#x] +- SubqueryAlias spark_catalog.default.t5 +- Relation spark_catalog.default.t5[s#x,utf8_binary#x,utf8_lcase#x] parquet @@ -2954,7 +2994,7 @@ Project [rtrim(collate(utf8_lcase#x, utf8_binary_rtrim), Some(collate(utf8_binar -- !query select RTRIM('ABc', utf8_binary), RTRIM('ABc', utf8_lcase) from t5 -- !query analysis -Project [rtrim(utf8_binary#x, Some(ABc)) AS TRIM(TRAILING ABc FROM utf8_binary)#x, rtrim(utf8_lcase#x, Some(cast(ABc as string collate UTF8_LCASE))) AS TRIM(TRAILING ABc FROM utf8_lcase)#x] +Project [rtrim(utf8_binary#x, Some(ABc)) AS TRIM(TRAILING ABc FROM utf8_binary)#x, rtrim(utf8_lcase#x, Some(ABc)) AS TRIM(TRAILING ABc FROM utf8_lcase)#x] +- SubqueryAlias 
spark_catalog.default.t5 +- Relation spark_catalog.default.t5[s#x,utf8_binary#x,utf8_lcase#x] parquet diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/cte-command.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/cte-command.sql.out index 0b539267e720f..c12076b85b1df 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/cte-command.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/cte-command.sql.out @@ -10,7 +10,7 @@ CreateDataSourceTableAsSelectCommand `spark_catalog`.`default`.`cte_tbl`, ErrorI : +- OneRowRelation +- Project [col#x] +- SubqueryAlias s - +- CTERelationRef xxxx, true, [col#x], false + +- CTERelationRef xxxx, true, [col#x], false, false -- !query @@ -32,7 +32,7 @@ CreateViewCommand `cte_view`, WITH s AS (SELECT 42 AS col) SELECT * FROM s, fals : +- OneRowRelation +- Project [col#x] +- SubqueryAlias s - +- CTERelationRef xxxx, true, [col#x], false + +- CTERelationRef xxxx, true, [col#x], false, false -- !query @@ -49,7 +49,7 @@ Project [col#x] : +- OneRowRelation +- Project [col#x] +- SubqueryAlias s - +- CTERelationRef xxxx, true, [col#x], false + +- CTERelationRef xxxx, true, [col#x], false, false -- !query @@ -64,7 +64,7 @@ InsertIntoHadoopFsRelationCommand file:[not included in comparison]/{warehouse_d : +- OneRowRelation +- Project [col#x] +- SubqueryAlias S - +- CTERelationRef xxxx, true, [col#x], false + +- CTERelationRef xxxx, true, [col#x], false, false -- !query @@ -86,7 +86,7 @@ InsertIntoHadoopFsRelationCommand file:[not included in comparison]/{warehouse_d : +- OneRowRelation +- Project [col#x] +- SubqueryAlias s - +- CTERelationRef xxxx, true, [col#x], false + +- CTERelationRef xxxx, true, [col#x], false, false -- !query diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/cte-nested.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/cte-nested.sql.out index 2cbcbedff81b2..0d39ff7ad5101 100644 --- 
a/sql/core/src/test/resources/sql-tests/analyzer-results/cte-nested.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/cte-nested.sql.out @@ -15,10 +15,10 @@ WithCTE : +- SubqueryAlias t : +- Project [1#x] : +- SubqueryAlias t2 -: +- CTERelationRef xxxx, true, [1#x], false +: +- CTERelationRef xxxx, true, [1#x], false, false +- Project [1#x] +- SubqueryAlias t - +- CTERelationRef xxxx, true, [1#x], false + +- CTERelationRef xxxx, true, [1#x], false, false -- !query @@ -37,7 +37,7 @@ Aggregate [max(c#x) AS max(c)#x] : +- OneRowRelation +- Project [c#x] +- SubqueryAlias t - +- CTERelationRef xxxx, true, [c#x], false + +- CTERelationRef xxxx, true, [c#x], false, false -- !query @@ -54,7 +54,7 @@ Project [scalar-subquery#x [] AS scalarsubquery()#x] : : +- OneRowRelation : +- Project [1#x] : +- SubqueryAlias t -: +- CTERelationRef xxxx, true, [1#x], false +: +- CTERelationRef xxxx, true, [1#x], false, false +- OneRowRelation @@ -140,10 +140,10 @@ WithCTE : +- SubqueryAlias t2 : +- Project [2#x] : +- SubqueryAlias t -: +- CTERelationRef xxxx, true, [2#x], false +: +- CTERelationRef xxxx, true, [2#x], false, false +- Project [2#x] +- SubqueryAlias t2 - +- CTERelationRef xxxx, true, [2#x], false + +- CTERelationRef xxxx, true, [2#x], false, false -- !query @@ -178,11 +178,11 @@ WithCTE : : : +- OneRowRelation : : +- Project [c#x] : : +- SubqueryAlias t -: : +- CTERelationRef xxxx, true, [c#x], false +: : +- CTERelationRef xxxx, true, [c#x], false, false : +- OneRowRelation +- Project [scalarsubquery()#x] +- SubqueryAlias t2 - +- CTERelationRef xxxx, true, [scalarsubquery()#x], false + +- CTERelationRef xxxx, true, [scalarsubquery()#x], false, false -- !query @@ -215,15 +215,15 @@ WithCTE : +- SubqueryAlias t2 : +- Project [3#x] : +- SubqueryAlias t -: +- CTERelationRef xxxx, true, [3#x], false +: +- CTERelationRef xxxx, true, [3#x], false, false :- CTERelationDef xxxx, false : +- SubqueryAlias t2 : +- Project [3#x] : +- SubqueryAlias t2 -: +- 
CTERelationRef xxxx, true, [3#x], false +: +- CTERelationRef xxxx, true, [3#x], false, false +- Project [3#x] +- SubqueryAlias t2 - +- CTERelationRef xxxx, true, [3#x], false + +- CTERelationRef xxxx, true, [3#x], false, false -- !query @@ -248,7 +248,7 @@ WithCTE +- SubqueryAlias __auto_generated_subquery_name +- Project [c#x] +- SubqueryAlias t - +- CTERelationRef xxxx, true, [c#x], false + +- CTERelationRef xxxx, true, [c#x], false, false -- !query @@ -277,7 +277,7 @@ WithCTE +- SubqueryAlias __auto_generated_subquery_name +- Project [c#x] +- SubqueryAlias t - +- CTERelationRef xxxx, true, [c#x], false + +- CTERelationRef xxxx, true, [c#x], false, false -- !query @@ -312,7 +312,7 @@ WithCTE +- SubqueryAlias __auto_generated_subquery_name +- Project [c#x] +- SubqueryAlias t - +- CTERelationRef xxxx, true, [c#x], false + +- CTERelationRef xxxx, true, [c#x], false, false -- !query @@ -335,7 +335,7 @@ WithCTE : : +- OneRowRelation : +- Project [2#x] : +- SubqueryAlias t - : +- CTERelationRef xxxx, true, [2#x], false + : +- CTERelationRef xxxx, true, [2#x], false, false +- OneRowRelation @@ -362,7 +362,7 @@ WithCTE : : : +- OneRowRelation : : +- Project [2#x] : : +- SubqueryAlias t - : : +- CTERelationRef xxxx, true, [2#x], false + : : +- CTERelationRef xxxx, true, [2#x], false, false : +- OneRowRelation +- OneRowRelation @@ -396,7 +396,7 @@ WithCTE : : : +- OneRowRelation : : +- Project [3#x] : : +- SubqueryAlias t - : : +- CTERelationRef xxxx, true, [3#x], false + : : +- CTERelationRef xxxx, true, [3#x], false, false : +- OneRowRelation +- OneRowRelation @@ -425,9 +425,9 @@ WithCTE : : +- OneRowRelation : +- Project [c#x] : +- SubqueryAlias t - : +- CTERelationRef xxxx, true, [c#x], false + : +- CTERelationRef xxxx, true, [c#x], false, false +- SubqueryAlias t - +- CTERelationRef xxxx, true, [c#x], false + +- CTERelationRef xxxx, true, [c#x], false, false -- !query @@ -448,14 +448,14 @@ WithCTE : +- SubqueryAlias t : +- Project [1#x] : +- SubqueryAlias t2 -: +- 
CTERelationRef xxxx, true, [1#x], false +: +- CTERelationRef xxxx, true, [1#x], false, false :- CTERelationDef xxxx, false : +- SubqueryAlias t2 : +- Project [2 AS 2#x] : +- OneRowRelation +- Project [1#x] +- SubqueryAlias t - +- CTERelationRef xxxx, true, [1#x], false + +- CTERelationRef xxxx, true, [1#x], false, false -- !query @@ -480,10 +480,10 @@ WithCTE : +- SubqueryAlias t : +- Project [2#x] : +- SubqueryAlias aBC -: +- CTERelationRef xxxx, true, [2#x], false +: +- CTERelationRef xxxx, true, [2#x], false, false +- Project [2#x] +- SubqueryAlias t - +- CTERelationRef xxxx, true, [2#x], false + +- CTERelationRef xxxx, true, [2#x], false, false -- !query @@ -506,7 +506,7 @@ WithCTE : : +- OneRowRelation : +- Project [2#x] : +- SubqueryAlias aBC - : +- CTERelationRef xxxx, true, [2#x], false + : +- CTERelationRef xxxx, true, [2#x], false, false +- OneRowRelation @@ -530,15 +530,15 @@ WithCTE : +- SubqueryAlias t3 : +- Project [1#x] : +- SubqueryAlias t1 -: +- CTERelationRef xxxx, true, [1#x], false +: +- CTERelationRef xxxx, true, [1#x], false, false :- CTERelationDef xxxx, false : +- SubqueryAlias t2 : +- Project [1#x] : +- SubqueryAlias t3 -: +- CTERelationRef xxxx, true, [1#x], false +: +- CTERelationRef xxxx, true, [1#x], false, false +- Project [1#x] +- SubqueryAlias t2 - +- CTERelationRef xxxx, true, [1#x], false + +- CTERelationRef xxxx, true, [1#x], false, false -- !query @@ -561,12 +561,12 @@ WithCTE : +- SubqueryAlias cte_inner : +- Project [1#x] : +- SubqueryAlias cte_outer -: +- CTERelationRef xxxx, true, [1#x], false +: +- CTERelationRef xxxx, true, [1#x], false, false +- Project [1#x] +- SubqueryAlias __auto_generated_subquery_name +- Project [1#x] +- SubqueryAlias cte_inner - +- CTERelationRef xxxx, true, [1#x], false + +- CTERelationRef xxxx, true, [1#x], false, false -- !query @@ -594,19 +594,19 @@ WithCTE : +- SubqueryAlias cte_inner_inner : +- Project [1#x] : +- SubqueryAlias cte_outer -: +- CTERelationRef xxxx, true, [1#x], false +: +- 
CTERelationRef xxxx, true, [1#x], false, false :- CTERelationDef xxxx, false : +- SubqueryAlias cte_inner : +- Project [1#x] : +- SubqueryAlias __auto_generated_subquery_name : +- Project [1#x] : +- SubqueryAlias cte_inner_inner -: +- CTERelationRef xxxx, true, [1#x], false +: +- CTERelationRef xxxx, true, [1#x], false, false +- Project [1#x] +- SubqueryAlias __auto_generated_subquery_name +- Project [1#x] +- SubqueryAlias cte_inner - +- CTERelationRef xxxx, true, [1#x], false + +- CTERelationRef xxxx, true, [1#x], false, false -- !query diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/cte-nonlegacy.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/cte-nonlegacy.sql.out index 88d7bf9f929ad..633352a8a3b6f 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/cte-nonlegacy.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/cte-nonlegacy.sql.out @@ -15,10 +15,10 @@ WithCTE : +- SubqueryAlias t : +- Project [1#x] : +- SubqueryAlias t2 -: +- CTERelationRef xxxx, true, [1#x], false +: +- CTERelationRef xxxx, true, [1#x], false, false +- Project [1#x] +- SubqueryAlias t - +- CTERelationRef xxxx, true, [1#x], false + +- CTERelationRef xxxx, true, [1#x], false, false -- !query @@ -37,7 +37,7 @@ Aggregate [max(c#x) AS max(c)#x] : +- OneRowRelation +- Project [c#x] +- SubqueryAlias t - +- CTERelationRef xxxx, true, [c#x], false + +- CTERelationRef xxxx, true, [c#x], false, false -- !query @@ -54,7 +54,7 @@ Project [scalar-subquery#x [] AS scalarsubquery()#x] : : +- OneRowRelation : +- Project [1#x] : +- SubqueryAlias t -: +- CTERelationRef xxxx, true, [1#x], false +: +- CTERelationRef xxxx, true, [1#x], false, false +- OneRowRelation @@ -171,11 +171,11 @@ WithCTE : : : +- OneRowRelation : : +- Project [c#x] : : +- SubqueryAlias t -: : +- CTERelationRef xxxx, true, [c#x], false +: : +- CTERelationRef xxxx, true, [c#x], false, false : +- OneRowRelation +- Project [scalarsubquery()#x] +- SubqueryAlias t2 - +- 
CTERelationRef xxxx, true, [scalarsubquery()#x], false + +- CTERelationRef xxxx, true, [scalarsubquery()#x], false, false -- !query @@ -225,7 +225,7 @@ WithCTE +- SubqueryAlias __auto_generated_subquery_name +- Project [c#x] +- SubqueryAlias t - +- CTERelationRef xxxx, true, [c#x], false + +- CTERelationRef xxxx, true, [c#x], false, false -- !query @@ -254,7 +254,7 @@ WithCTE +- SubqueryAlias __auto_generated_subquery_name +- Project [c#x] +- SubqueryAlias t - +- CTERelationRef xxxx, true, [c#x], false + +- CTERelationRef xxxx, true, [c#x], false, false -- !query @@ -289,7 +289,7 @@ WithCTE +- SubqueryAlias __auto_generated_subquery_name +- Project [c#x] +- SubqueryAlias t - +- CTERelationRef xxxx, true, [c#x], false + +- CTERelationRef xxxx, true, [c#x], false, false -- !query @@ -392,14 +392,14 @@ WithCTE : +- SubqueryAlias t : +- Project [1#x] : +- SubqueryAlias t2 -: +- CTERelationRef xxxx, true, [1#x], false +: +- CTERelationRef xxxx, true, [1#x], false, false :- CTERelationDef xxxx, false : +- SubqueryAlias t2 : +- Project [2 AS 2#x] : +- OneRowRelation +- Project [1#x] +- SubqueryAlias t - +- CTERelationRef xxxx, true, [1#x], false + +- CTERelationRef xxxx, true, [1#x], false, false -- !query @@ -462,15 +462,15 @@ WithCTE : +- SubqueryAlias t3 : +- Project [1#x] : +- SubqueryAlias t1 -: +- CTERelationRef xxxx, true, [1#x], false +: +- CTERelationRef xxxx, true, [1#x], false, false :- CTERelationDef xxxx, false : +- SubqueryAlias t2 : +- Project [1#x] : +- SubqueryAlias t3 -: +- CTERelationRef xxxx, true, [1#x], false +: +- CTERelationRef xxxx, true, [1#x], false, false +- Project [1#x] +- SubqueryAlias t2 - +- CTERelationRef xxxx, true, [1#x], false + +- CTERelationRef xxxx, true, [1#x], false, false -- !query @@ -493,12 +493,12 @@ WithCTE : +- SubqueryAlias cte_inner : +- Project [1#x] : +- SubqueryAlias cte_outer -: +- CTERelationRef xxxx, true, [1#x], false +: +- CTERelationRef xxxx, true, [1#x], false, false +- Project [1#x] +- SubqueryAlias 
__auto_generated_subquery_name +- Project [1#x] +- SubqueryAlias cte_inner - +- CTERelationRef xxxx, true, [1#x], false + +- CTERelationRef xxxx, true, [1#x], false, false -- !query @@ -526,19 +526,19 @@ WithCTE : +- SubqueryAlias cte_inner_inner : +- Project [1#x] : +- SubqueryAlias cte_outer -: +- CTERelationRef xxxx, true, [1#x], false +: +- CTERelationRef xxxx, true, [1#x], false, false :- CTERelationDef xxxx, false : +- SubqueryAlias cte_inner : +- Project [1#x] : +- SubqueryAlias __auto_generated_subquery_name : +- Project [1#x] : +- SubqueryAlias cte_inner_inner -: +- CTERelationRef xxxx, true, [1#x], false +: +- CTERelationRef xxxx, true, [1#x], false, false +- Project [1#x] +- SubqueryAlias __auto_generated_subquery_name +- Project [1#x] +- SubqueryAlias cte_inner - +- CTERelationRef xxxx, true, [1#x], false + +- CTERelationRef xxxx, true, [1#x], false, false -- !query diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/cte.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/cte.sql.out index 885f34a28d67d..ded612ec8f8b6 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/cte.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/cte.sql.out @@ -17,6 +17,19 @@ CreateViewCommand `t2`, select * from values 0, 1 as t(id), false, false, LocalT +- LocalRelation [id#x] +-- !query +create temporary view t3 as select * from t +-- !query analysis +CreateViewCommand `t3`, select * from t, false, false, LocalTempView, UNSUPPORTED, true + +- Project [id#x] + +- SubqueryAlias t + +- View (`t`, [id#x]) + +- Project [cast(id#x as int) AS id#x] + +- Project [id#x] + +- SubqueryAlias t + +- LocalRelation [id#x] + + -- !query WITH s AS (SELECT 1 FROM s) SELECT * FROM s -- !query analysis @@ -73,7 +86,28 @@ WithCTE : +- LocalRelation [id#x] +- Project [1#x] +- SubqueryAlias t - +- CTERelationRef xxxx, true, [1#x], false + +- CTERelationRef xxxx, true, [1#x], false, false + + +-- !query +WITH t AS (SELECT 1) SELECT * 
FROM t3 +-- !query analysis +WithCTE +:- CTERelationDef xxxx, false +: +- SubqueryAlias t +: +- Project [1 AS 1#x] +: +- OneRowRelation ++- Project [id#x] + +- SubqueryAlias t3 + +- View (`t3`, [id#x]) + +- Project [cast(id#x as int) AS id#x] + +- Project [id#x] + +- SubqueryAlias t + +- View (`t`, [id#x]) + +- Project [cast(id#x as int) AS id#x] + +- Project [id#x] + +- SubqueryAlias t + +- LocalRelation [id#x] -- !query @@ -113,13 +147,13 @@ WithCTE : +- SubqueryAlias t2 : +- Project [2 AS 2#x] : +- SubqueryAlias t1 -: +- CTERelationRef xxxx, true, [id#x], false +: +- CTERelationRef xxxx, true, [id#x], false, false +- Project [id#x, 2#x] +- Join Cross :- SubqueryAlias t1 - : +- CTERelationRef xxxx, true, [id#x], false + : +- CTERelationRef xxxx, true, [id#x], false, false +- SubqueryAlias t2 - +- CTERelationRef xxxx, true, [2#x], false + +- CTERelationRef xxxx, true, [2#x], false, false -- !query @@ -157,10 +191,10 @@ WithCTE +- Join Cross :- SubqueryAlias t1 : +- SubqueryAlias CTE1 - : +- CTERelationRef xxxx, true, [id#x], false + : +- CTERelationRef xxxx, true, [id#x], false, false +- SubqueryAlias t2 +- SubqueryAlias CTE1 - +- CTERelationRef xxxx, true, [id#x], false + +- CTERelationRef xxxx, true, [id#x], false, false -- !query @@ -176,7 +210,7 @@ WithCTE +- Project [x#x] +- Filter (x#x = 1) +- SubqueryAlias t - +- CTERelationRef xxxx, true, [x#x], false + +- CTERelationRef xxxx, true, [x#x], false, false -- !query @@ -192,7 +226,7 @@ WithCTE +- Project [x#x, y#x] +- Filter ((x#x = 1) AND (y#x = 2)) +- SubqueryAlias t - +- CTERelationRef xxxx, true, [x#x, y#x], false + +- CTERelationRef xxxx, true, [x#x, y#x], false, false -- !query @@ -207,7 +241,7 @@ WithCTE : +- OneRowRelation +- Project [x#x, x#x] +- SubqueryAlias t - +- CTERelationRef xxxx, true, [x#x, x#x], false + +- CTERelationRef xxxx, true, [x#x, x#x], false, false -- !query @@ -310,46 +344,46 @@ WithCTE : +- Project [c8#x AS c7#x] : +- Project [c8#x] : +- SubqueryAlias w8 -: +- CTERelationRef xxxx, 
true, [c8#x], false +: +- CTERelationRef xxxx, true, [c8#x], false, false :- CTERelationDef xxxx, false : +- SubqueryAlias w6 : +- Project [c7#x AS c6#x] : +- Project [c7#x] : +- SubqueryAlias w7 -: +- CTERelationRef xxxx, true, [c7#x], false +: +- CTERelationRef xxxx, true, [c7#x], false, false :- CTERelationDef xxxx, false : +- SubqueryAlias w5 : +- Project [c6#x AS c5#x] : +- Project [c6#x] : +- SubqueryAlias w6 -: +- CTERelationRef xxxx, true, [c6#x], false +: +- CTERelationRef xxxx, true, [c6#x], false, false :- CTERelationDef xxxx, false : +- SubqueryAlias w4 : +- Project [c5#x AS c4#x] : +- Project [c5#x] : +- SubqueryAlias w5 -: +- CTERelationRef xxxx, true, [c5#x], false +: +- CTERelationRef xxxx, true, [c5#x], false, false :- CTERelationDef xxxx, false : +- SubqueryAlias w3 : +- Project [c4#x AS c3#x] : +- Project [c4#x] : +- SubqueryAlias w4 -: +- CTERelationRef xxxx, true, [c4#x], false +: +- CTERelationRef xxxx, true, [c4#x], false, false :- CTERelationDef xxxx, false : +- SubqueryAlias w2 : +- Project [c3#x AS c2#x] : +- Project [c3#x] : +- SubqueryAlias w3 -: +- CTERelationRef xxxx, true, [c3#x], false +: +- CTERelationRef xxxx, true, [c3#x], false, false :- CTERelationDef xxxx, false : +- SubqueryAlias w1 : +- Project [c2#x AS c1#x] : +- Project [c2#x] : +- SubqueryAlias w2 -: +- CTERelationRef xxxx, true, [c2#x], false +: +- CTERelationRef xxxx, true, [c2#x], false, false +- Project [c1#x] +- SubqueryAlias w1 - +- CTERelationRef xxxx, true, [c1#x], false + +- CTERelationRef xxxx, true, [c1#x], false, false -- !query @@ -386,7 +420,7 @@ WithCTE +- Project [42#x, 10#x] +- Join Inner :- SubqueryAlias same_name - : +- CTERelationRef xxxx, true, [42#x], false + : +- CTERelationRef xxxx, true, [42#x], false, false +- SubqueryAlias same_name +- Project [10 AS 10#x] +- OneRowRelation @@ -425,7 +459,7 @@ WithCTE : +- OneRowRelation +- Project [x#x, typeof(x#x) AS typeof(x)#x] +- SubqueryAlias q - +- CTERelationRef xxxx, true, [x#x], false + +- 
CTERelationRef xxxx, true, [x#x], false, false -- !query @@ -485,7 +519,7 @@ Project [y#x] : +- OneRowRelation +- Project [(x#x + 1) AS y#x] +- SubqueryAlias q - +- CTERelationRef xxxx, true, [x#x], false + +- CTERelationRef xxxx, true, [x#x], false, false -- !query @@ -499,7 +533,7 @@ Project [scalar-subquery#x [] AS scalarsubquery()#x] : : +- OneRowRelation : +- Project [x#x] : +- SubqueryAlias q -: +- CTERelationRef xxxx, true, [x#x], false +: +- CTERelationRef xxxx, true, [x#x], false, false +- OneRowRelation @@ -514,7 +548,7 @@ Project [1 IN (list#x []) AS (1 IN (listquery()))#x] : : +- OneRowRelation : +- Project [1#x] : +- SubqueryAlias q -: +- CTERelationRef xxxx, true, [1#x], false +: +- CTERelationRef xxxx, true, [1#x], false, false +- OneRowRelation @@ -562,14 +596,14 @@ WithCTE :- Join Inner : :- SubqueryAlias x : : +- SubqueryAlias T1 - : : +- CTERelationRef xxxx, true, [a#x], false + : : +- CTERelationRef xxxx, true, [a#x], false, false : +- SubqueryAlias y : +- Project [b#x] : +- SubqueryAlias T1 - : +- CTERelationRef xxxx, true, [b#x], false + : +- CTERelationRef xxxx, true, [b#x], false, false +- SubqueryAlias z +- SubqueryAlias T1 - +- CTERelationRef xxxx, true, [a#x], false + +- CTERelationRef xxxx, true, [a#x], false, false -- !query @@ -597,9 +631,9 @@ WithCTE +- Project [c#x, a#x] +- Join Inner :- SubqueryAlias ttTT - : +- CTERelationRef xxxx, true, [c#x], false + : +- CTERelationRef xxxx, true, [c#x], false, false +- SubqueryAlias tttT_2 - +- CTERelationRef xxxx, true, [a#x], false + +- CTERelationRef xxxx, true, [a#x], false, false -- !query @@ -615,7 +649,7 @@ Project [scalar-subquery#x [x#x] AS scalarsubquery(x)#x] : : +- OneRowRelation : +- Project [x#x] : +- SubqueryAlias q -: +- CTERelationRef xxxx, true, [x#x], false +: +- CTERelationRef xxxx, true, [x#x], false, false +- SubqueryAlias T +- Project [1 AS x#x, 2 AS y#x] +- OneRowRelation @@ -634,7 +668,7 @@ Project [scalar-subquery#x [x#x && y#x] AS scalarsubquery(x, y)#x] : : +- 
OneRowRelation : +- Project [((outer(x#x) + outer(y#x)) + z#x) AS ((outer(T.x) + outer(T.y)) + z)#x] : +- SubqueryAlias q -: +- CTERelationRef xxxx, true, [z#x], false +: +- CTERelationRef xxxx, true, [z#x], false, false +- SubqueryAlias T +- Project [1 AS x#x, 2 AS y#x] +- OneRowRelation @@ -654,12 +688,12 @@ WithCTE : +- SubqueryAlias q2 : +- Project [x#x] : +- SubqueryAlias q1 -: +- CTERelationRef xxxx, true, [x#x], false +: +- CTERelationRef xxxx, true, [x#x], false, false +- Project [x#x] +- SubqueryAlias __auto_generated_subquery_name +- Project [x#x] +- SubqueryAlias q2 - +- CTERelationRef xxxx, true, [x#x], false + +- CTERelationRef xxxx, true, [x#x], false, false -- !query @@ -676,12 +710,12 @@ WithCTE : +- SubqueryAlias q1 : +- Project [(x#x + 1) AS (x + 1)#x] : +- SubqueryAlias q1 -: +- CTERelationRef xxxx, true, [x#x], false +: +- CTERelationRef xxxx, true, [x#x], false, false +- Project [(x + 1)#x] +- SubqueryAlias __auto_generated_subquery_name +- Project [(x + 1)#x] +- SubqueryAlias q1 - +- CTERelationRef xxxx, true, [(x + 1)#x], false + +- CTERelationRef xxxx, true, [(x + 1)#x], false, false -- !query @@ -723,9 +757,9 @@ WithCTE : +- Aggregate [max(j#x) AS max(j)#x] : +- SubqueryAlias cte2 : +- SubqueryAlias cte1 - : +- CTERelationRef xxxx, true, [j#x], false + : +- CTERelationRef xxxx, true, [j#x], false, false +- SubqueryAlias cte1 - +- CTERelationRef xxxx, true, [j#x], false + +- CTERelationRef xxxx, true, [j#x], false, false -- !query @@ -778,3 +812,9 @@ DropTempViewCommand t DROP VIEW IF EXISTS t2 -- !query analysis DropTempViewCommand t2 + + +-- !query +DROP VIEW IF EXISTS t3 +-- !query analysis +DropTempViewCommand t3 diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/describe.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/describe.sql.out index d6f0953dcf90a..99f7326e5ef8e 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/describe.sql.out +++ 
b/sql/core/src/test/resources/sql-tests/analyzer-results/describe.sql.out @@ -56,6 +56,36 @@ DESCRIBE t DescribeTableCommand `spark_catalog`.`default`.`t`, false, [col_name#x, data_type#x, comment#x] +-- !query +DESCRIBE EXTENDED t AS JSON +-- !query analysis +DescribeRelationJsonCommand true, [json_metadata#x] ++- ResolvedTable V2SessionCatalog(spark_catalog), default.t, V1Table(default.t), [a#x, b#x, c#x, d#x] + + +-- !query +DESCRIBE t AS JSON +-- !query analysis +org.apache.spark.sql.catalyst.parser.ParseException +{ + "condition" : "DESCRIBE_JSON_NOT_EXTENDED", + "sqlState" : "0A000", + "messageParameters" : { + "tableName" : "t" + } +} + + +-- !query +DESC FORMATTED t a AS JSON +-- !query analysis +org.apache.spark.sql.catalyst.parser.ParseException +{ + "condition" : "UNSUPPORTED_FEATURE.DESC_TABLE_COLUMN_JSON", + "sqlState" : "0A000" +} + + -- !query DESC default.t -- !query analysis @@ -110,6 +140,13 @@ DESC t PARTITION (c='Us', d=1) DescribeTableCommand `spark_catalog`.`default`.`t`, [c=Us, d=1], false, [col_name#x, data_type#x, comment#x] +-- !query +DESC EXTENDED t PARTITION (c='Us', d=1) AS JSON +-- !query analysis +DescribeRelationJsonCommand [c=Us, d=1], true, [json_metadata#x] ++- ResolvedTable V2SessionCatalog(spark_catalog), default.t, V1Table(default.t), [a#x, b#x, c#x, d#x] + + -- !query DESC EXTENDED t PARTITION (c='Us', d=1) -- !query analysis @@ -290,6 +327,12 @@ EXPLAIN DESCRIBE t PARTITION (c='Us', d=2) ExplainCommand 'DescribeRelation [c=Us, d=2], false, [col_name#x, data_type#x, comment#x], SimpleMode +-- !query +EXPLAIN DESCRIBE EXTENDED t PARTITION (c='Us', d=2) AS JSON +-- !query analysis +ExplainCommand 'DescribeRelationJsonCommand [c=Us, d=2], true, [json_metadata#x], SimpleMode + + -- !query DROP TABLE t -- !query analysis diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/double-quoted-identifiers-enabled.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/double-quoted-identifiers-enabled.sql.out 
index 2edcd638120c5..f9f0067648fcf 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/double-quoted-identifiers-enabled.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/double-quoted-identifiers-enabled.sql.out @@ -418,7 +418,7 @@ CreateViewCommand `myview`, [(c1,None)], WITH "v"("a") AS (SELECT 1) SELECT "a" : +- OneRowRelation +- Project [a#x] +- SubqueryAlias v - +- CTERelationRef xxxx, true, [a#x], false + +- CTERelationRef xxxx, true, [a#x], false, false -- !query @@ -438,7 +438,7 @@ Project [a1#x AS a2#x] : +- OneRowRelation +- Project [a#x] +- SubqueryAlias v - +- CTERelationRef xxxx, true, [a#x], false + +- CTERelationRef xxxx, true, [a#x], false, false -- !query diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/group-analytics.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/group-analytics.sql.out index 0895fe788f84a..f085e47c08ecf 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/group-analytics.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/group-analytics.sql.out @@ -630,7 +630,7 @@ Aggregate [a#x, b#x, spark_grouping_id#xL, _gen_grouping_pos#x], [a#x, b#x, coun SELECT a, b, count(1) FROM testData GROUP BY a, CUBE(a, b), ROLLUP(a, b), GROUPING SETS((a, b), (a), ()) -- !query analysis Aggregate [a#x, b#x, spark_grouping_id#xL, _gen_grouping_pos#x], [a#x, b#x, count(1) AS count(1)#xL] -+- Expand [[a#x, b#x, a#x, b#x, 0, 0], [a#x, b#x, a#x, b#x, 0, 1], [a#x, b#x, a#x, b#x, 0, 2], [a#x, b#x, a#x, b#x, 0, 3], [a#x, b#x, a#x, b#x, 0, 4], [a#x, b#x, a#x, b#x, 0, 5], [a#x, b#x, a#x, b#x, 0, 6], [a#x, b#x, a#x, b#x, 0, 7], [a#x, b#x, a#x, b#x, 0, 8], [a#x, b#x, a#x, b#x, 0, 9], [a#x, b#x, a#x, b#x, 0, 10], [a#x, b#x, a#x, b#x, 0, 11], [a#x, b#x, a#x, b#x, 0, 12], [a#x, b#x, a#x, null, 1, 13], [a#x, b#x, a#x, null, 1, 14], [a#x, b#x, a#x, b#x, 0, 15], [a#x, b#x, a#x, null, 1, 16], [a#x, b#x, a#x, null, 1, 17], [a#x, b#x, a#x, b#x, 0, 18], [a#x, b#x, 
a#x, b#x, 0, 19], [a#x, b#x, a#x, b#x, 0, 20], [a#x, b#x, a#x, b#x, 0, 21], [a#x, b#x, a#x, b#x, 0, 22], [a#x, b#x, a#x, b#x, 0, 23], ... 12 more fields], [a#x, b#x, a#x, b#x, spark_grouping_id#xL, _gen_grouping_pos#x] ++- Expand [[a#x, b#x, a#x, b#x, 0, 0], [a#x, b#x, a#x, b#x, 0, 1], [a#x, b#x, a#x, b#x, 0, 2], [a#x, b#x, a#x, b#x, 0, 3], [a#x, b#x, a#x, b#x, 0, 4], [a#x, b#x, a#x, b#x, 0, 5], [a#x, b#x, a#x, b#x, 0, 6], [a#x, b#x, a#x, b#x, 0, 7], [a#x, b#x, a#x, b#x, 0, 8], [a#x, b#x, a#x, b#x, 0, 9], [a#x, b#x, a#x, b#x, 0, 10], [a#x, b#x, a#x, b#x, 0, 11], [a#x, b#x, a#x, b#x, 0, 12], [a#x, b#x, a#x, null, 1, 13], [a#x, b#x, a#x, null, 1, 14], [a#x, b#x, a#x, b#x, 0, 15], [a#x, b#x, a#x, null, 1, 16], [a#x, b#x, a#x, null, 1, 17], [a#x, b#x, a#x, b#x, 0, 18], [a#x, b#x, a#x, b#x, 0, 19], [a#x, b#x, a#x, b#x, 0, 20], [a#x, b#x, a#x, b#x, 0, 21], [a#x, b#x, a#x, b#x, 0, 22], [a#x, b#x, a#x, b#x, 0, 23], [a#x, b#x, a#x, b#x, 0, 24], ... 11 more fields], [a#x, b#x, a#x, b#x, spark_grouping_id#xL, _gen_grouping_pos#x] +- Project [a#x, b#x, a#x AS a#x, b#x AS b#x] +- SubqueryAlias testdata +- View (`testData`, [a#x, b#x]) diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/group-by.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/group-by.sql.out index c4839fd359d14..607b2401e853b 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/group-by.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/group-by.sql.out @@ -1055,6 +1055,15 @@ Aggregate [histogram_numeric(col#xL, 3, 0, 0) AS histogram_numeric(col, 3)#x] +- LocalRelation [col#xL] +-- !query +SELECT histogram_numeric(col, 3) FROM VALUES + (CAST(1 AS DECIMAL(4, 2))), (CAST(2 AS DECIMAL(4, 2))), (CAST(3 AS DECIMAL(4, 2))) AS tab(col) +-- !query analysis +Aggregate [histogram_numeric(col#x, 3, 0, 0) AS histogram_numeric(col, 3)#x] ++- SubqueryAlias tab + +- LocalRelation [col#x] + + -- !query SELECT histogram_numeric(col, 3) FROM VALUES (TIMESTAMP 
'2017-03-01 00:00:00'), (TIMESTAMP '2017-04-01 00:00:00'), (TIMESTAMP '2017-05-01 00:00:00') AS tab(col) diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/identifier-clause.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/identifier-clause.sql.out index 7bbad7f49fb25..e79a549f84062 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/identifier-clause.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/identifier-clause.sql.out @@ -1022,7 +1022,7 @@ WithCTE : +- LocalRelation [col1#x, col2#x] +- Aggregate [max(c1#x) AS max(c1)#x] +- SubqueryAlias T - +- CTERelationRef xxxx, true, [c1#x, c2#x], false + +- CTERelationRef xxxx, true, [c1#x, c2#x], false, false -- !query @@ -1041,7 +1041,7 @@ WithCTE : +- LocalRelation [col1#x, col2#x] +- Aggregate [max(c1#x) AS max(c1)#x] +- SubqueryAlias T - +- CTERelationRef xxxx, true, [c1#x, c2#x], false + +- CTERelationRef xxxx, true, [c1#x, c2#x], false, false -- !query @@ -1055,7 +1055,7 @@ WithCTE : +- LocalRelation [col1#x, col2#x] +- Aggregate [max(c1#x) AS max(c1)#x] +- SubqueryAlias ABC - +- CTERelationRef xxxx, true, [c1#x, c2#x], false + +- CTERelationRef xxxx, true, [c1#x, c2#x], false, false -- !query diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/join-lateral.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/join-lateral.sql.out index e4e23339134c4..6dfbf13ce3595 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/join-lateral.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/join-lateral.sql.out @@ -1377,10 +1377,10 @@ WithCTE : : +- Project [cast(col1#x as int) AS c1#x, cast(col2#x as int) AS c2#x] : : +- LocalRelation [col1#x, col2#x] : +- SubqueryAlias cte1 -: +- CTERelationRef xxxx, true, [c1#x], false +: +- CTERelationRef xxxx, true, [c1#x], false, false +- Project [c1#x, c2#x] +- SubqueryAlias cte2 - +- CTERelationRef xxxx, true, [c1#x, c2#x], false + +- CTERelationRef xxxx, 
true, [c1#x, c2#x], false, false -- !query diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/listagg-collations.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/listagg-collations.sql.out new file mode 100644 index 0000000000000..5bced5e897e22 --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/listagg-collations.sql.out @@ -0,0 +1,86 @@ +-- Automatically generated by SQLQueryTestSuite +-- !query +SELECT listagg(c1) WITHIN GROUP (ORDER BY c1 COLLATE utf8_binary) FROM (VALUES ('a'), ('A'), ('b'), ('B')) AS t(c1) +-- !query analysis +Aggregate [listagg(c1#x, null, collate(c1#x, utf8_binary) ASC NULLS FIRST, 0, 0) AS listagg(c1, NULL) WITHIN GROUP (ORDER BY collate(c1, utf8_binary) ASC NULLS FIRST)#x] ++- SubqueryAlias t + +- Project [col1#x AS c1#x] + +- LocalRelation [col1#x] + + +-- !query +SELECT listagg(c1) WITHIN GROUP (ORDER BY c1 COLLATE utf8_lcase) FROM (VALUES ('a'), ('A'), ('b'), ('B')) AS t(c1) +-- !query analysis +Aggregate [listagg(c1#x, null, collate(c1#x, utf8_lcase) ASC NULLS FIRST, 0, 0) AS listagg(c1, NULL) WITHIN GROUP (ORDER BY collate(c1, utf8_lcase) ASC NULLS FIRST)#x] ++- SubqueryAlias t + +- Project [col1#x AS c1#x] + +- LocalRelation [col1#x] + + +-- !query +SELECT listagg(DISTINCT c1 COLLATE utf8_binary) FROM (VALUES ('a'), ('A'), ('b'), ('B')) AS t(c1) +-- !query analysis +Aggregate [listagg(distinct collate(c1#x, utf8_binary), null, 0, 0) AS listagg(DISTINCT collate(c1, utf8_binary), NULL)#x] ++- SubqueryAlias t + +- Project [col1#x AS c1#x] + +- LocalRelation [col1#x] + + +-- !query +SELECT listagg(DISTINCT c1 COLLATE utf8_lcase) FROM (VALUES ('a'), ('A'), ('b'), ('B')) AS t(c1) +-- !query analysis +Aggregate [listagg(distinct collate(c1#x, utf8_lcase), null, 0, 0) AS listagg(DISTINCT collate(c1, utf8_lcase), NULL)#x] ++- SubqueryAlias t + +- Project [col1#x AS c1#x] + +- LocalRelation [col1#x] + + +-- !query +SELECT listagg(DISTINCT c1 COLLATE utf8_lcase) WITHIN GROUP (ORDER BY c1 
COLLATE utf8_lcase) FROM (VALUES ('a'), ('B'), ('b'), ('A')) AS t(c1) +-- !query analysis +Aggregate [listagg(distinct collate(c1#x, utf8_lcase), null, collate(c1#x, utf8_lcase) ASC NULLS FIRST, 0, 0) AS listagg(DISTINCT collate(c1, utf8_lcase), NULL) WITHIN GROUP (ORDER BY collate(c1, utf8_lcase) ASC NULLS FIRST)#x] ++- SubqueryAlias t + +- Project [col1#x AS c1#x] + +- LocalRelation [col1#x] + + +-- !query +SELECT listagg(DISTINCT c1 COLLATE unicode_rtrim) FROM (VALUES ('abc '), ('abc '), ('x'), ('abc')) AS t(c1) +-- !query analysis +Aggregate [listagg(distinct collate(c1#x, unicode_rtrim), null, 0, 0) AS listagg(DISTINCT collate(c1, unicode_rtrim), NULL)#x] ++- SubqueryAlias t + +- Project [col1#x AS c1#x] + +- LocalRelation [col1#x] + + +-- !query +SELECT listagg(c1) WITHIN GROUP (ORDER BY c1) FROM (VALUES ('abc '), ('abc '), ('abc\n'), ('abc'), ('x')) AS t(c1) +-- !query analysis +Aggregate [listagg(c1#x, null, c1#x ASC NULLS FIRST, 0, 0) AS listagg(c1, NULL) WITHIN GROUP (ORDER BY c1 ASC NULLS FIRST)#x] ++- SubqueryAlias t + +- Project [col1#x AS c1#x] + +- LocalRelation [col1#x] + + +-- !query +SELECT listagg(c1) WITHIN GROUP (ORDER BY c1 COLLATE unicode_rtrim) FROM (VALUES ('abc '), ('abc '), ('abc\n'), ('abc'), ('x')) AS t(c1) +-- !query analysis +Aggregate [listagg(c1#x, null, collate(c1#x, unicode_rtrim) ASC NULLS FIRST, 0, 0) AS listagg(c1, NULL) WITHIN GROUP (ORDER BY collate(c1, unicode_rtrim) ASC NULLS FIRST)#x] ++- SubqueryAlias t + +- Project [col1#x AS c1#x] + +- LocalRelation [col1#x] + + +-- !query +SELECT listagg(DISTINCT c1 COLLATE utf8_lcase) WITHIN GROUP (ORDER BY c1 COLLATE utf8_binary) FROM (VALUES ('a'), ('b'), ('A'), ('B')) AS t(c1) +-- !query analysis +org.apache.spark.sql.catalyst.ExtendedAnalysisException +{ + "condition" : "INVALID_WITHIN_GROUP_EXPRESSION.MISMATCH_WITH_DISTINCT_INPUT", + "sqlState" : "42K0K", + "messageParameters" : { + "funcArg" : "\"collate(c1, utf8_lcase)\"", + "funcName" : "`listagg`", + "orderingExpr" : 
"\"collate(c1, utf8_binary)\"" + } +} diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/listagg.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/listagg.sql.out new file mode 100644 index 0000000000000..9ad94bce3a2be --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/listagg.sql.out @@ -0,0 +1,435 @@ +-- Automatically generated by SQLQueryTestSuite +-- !query +CREATE TEMP VIEW df AS +SELECT * FROM (VALUES ('a', 'b'), ('a', 'c'), ('b', 'c'), ('b', 'd'), (NULL, NULL)) AS t(a, b) +-- !query analysis +CreateViewCommand `df`, SELECT * FROM (VALUES ('a', 'b'), ('a', 'c'), ('b', 'c'), ('b', 'd'), (NULL, NULL)) AS t(a, b), false, false, LocalTempView, UNSUPPORTED, true + +- Project [a#x, b#x] + +- SubqueryAlias t + +- Project [col1#x AS a#x, col2#x AS b#x] + +- LocalRelation [col1#x, col2#x] + + +-- !query +CREATE TEMP VIEW df2 AS +SELECT * FROM (VALUES (1, true), (2, false), (3, false)) AS t(a, b) +-- !query analysis +CreateViewCommand `df2`, SELECT * FROM (VALUES (1, true), (2, false), (3, false)) AS t(a, b), false, false, LocalTempView, UNSUPPORTED, true + +- Project [a#x, b#x] + +- SubqueryAlias t + +- Project [col1#x AS a#x, col2#x AS b#x] + +- LocalRelation [col1#x, col2#x] + + +-- !query +SELECT listagg(b) FROM df GROUP BY a +-- !query analysis +Aggregate [a#x], [listagg(b#x, null, 0, 0) AS listagg(b, NULL)#x] ++- SubqueryAlias df + +- View (`df`, [a#x, b#x]) + +- Project [cast(a#x as string) AS a#x, cast(b#x as string) AS b#x] + +- Project [a#x, b#x] + +- SubqueryAlias t + +- Project [col1#x AS a#x, col2#x AS b#x] + +- LocalRelation [col1#x, col2#x] + + +-- !query +SELECT string_agg(b) FROM df GROUP BY a +-- !query analysis +Aggregate [a#x], [string_agg(b#x, null, 0, 0) AS string_agg(b, NULL)#x] ++- SubqueryAlias df + +- View (`df`, [a#x, b#x]) + +- Project [cast(a#x as string) AS a#x, cast(b#x as string) AS b#x] + +- Project [a#x, b#x] + +- SubqueryAlias t + +- Project [col1#x AS a#x, col2#x AS b#x] + +- 
LocalRelation [col1#x, col2#x] + + +-- !query +SELECT listagg(b, NULL) FROM df GROUP BY a +-- !query analysis +Aggregate [a#x], [listagg(b#x, null, 0, 0) AS listagg(b, NULL)#x] ++- SubqueryAlias df + +- View (`df`, [a#x, b#x]) + +- Project [cast(a#x as string) AS a#x, cast(b#x as string) AS b#x] + +- Project [a#x, b#x] + +- SubqueryAlias t + +- Project [col1#x AS a#x, col2#x AS b#x] + +- LocalRelation [col1#x, col2#x] + + +-- !query +SELECT listagg(b) FROM df WHERE 1 != 1 +-- !query analysis +Aggregate [listagg(b#x, null, 0, 0) AS listagg(b, NULL)#x] ++- Filter NOT (1 = 1) + +- SubqueryAlias df + +- View (`df`, [a#x, b#x]) + +- Project [cast(a#x as string) AS a#x, cast(b#x as string) AS b#x] + +- Project [a#x, b#x] + +- SubqueryAlias t + +- Project [col1#x AS a#x, col2#x AS b#x] + +- LocalRelation [col1#x, col2#x] + + +-- !query +SELECT listagg(b, '|') FROM df GROUP BY a +-- !query analysis +Aggregate [a#x], [listagg(b#x, |, 0, 0) AS listagg(b, |)#x] ++- SubqueryAlias df + +- View (`df`, [a#x, b#x]) + +- Project [cast(a#x as string) AS a#x, cast(b#x as string) AS b#x] + +- Project [a#x, b#x] + +- SubqueryAlias t + +- Project [col1#x AS a#x, col2#x AS b#x] + +- LocalRelation [col1#x, col2#x] + + +-- !query +SELECT listagg(a) FROM df +-- !query analysis +Aggregate [listagg(a#x, null, 0, 0) AS listagg(a, NULL)#x] ++- SubqueryAlias df + +- View (`df`, [a#x, b#x]) + +- Project [cast(a#x as string) AS a#x, cast(b#x as string) AS b#x] + +- Project [a#x, b#x] + +- SubqueryAlias t + +- Project [col1#x AS a#x, col2#x AS b#x] + +- LocalRelation [col1#x, col2#x] + + +-- !query +SELECT listagg(DISTINCT a) FROM df +-- !query analysis +Aggregate [listagg(distinct a#x, null, 0, 0) AS listagg(DISTINCT a, NULL)#x] ++- SubqueryAlias df + +- View (`df`, [a#x, b#x]) + +- Project [cast(a#x as string) AS a#x, cast(b#x as string) AS b#x] + +- Project [a#x, b#x] + +- SubqueryAlias t + +- Project [col1#x AS a#x, col2#x AS b#x] + +- LocalRelation [col1#x, col2#x] + + +-- !query +SELECT 
listagg(a) WITHIN GROUP (ORDER BY a) FROM df +-- !query analysis +Aggregate [listagg(a#x, null, a#x ASC NULLS FIRST, 0, 0) AS listagg(a, NULL) WITHIN GROUP (ORDER BY a ASC NULLS FIRST)#x] ++- SubqueryAlias df + +- View (`df`, [a#x, b#x]) + +- Project [cast(a#x as string) AS a#x, cast(b#x as string) AS b#x] + +- Project [a#x, b#x] + +- SubqueryAlias t + +- Project [col1#x AS a#x, col2#x AS b#x] + +- LocalRelation [col1#x, col2#x] + + +-- !query +SELECT listagg(a) WITHIN GROUP (ORDER BY a DESC) FROM df +-- !query analysis +Aggregate [listagg(a#x, null, a#x DESC NULLS LAST, 0, 0) AS listagg(a, NULL) WITHIN GROUP (ORDER BY a DESC NULLS LAST)#x] ++- SubqueryAlias df + +- View (`df`, [a#x, b#x]) + +- Project [cast(a#x as string) AS a#x, cast(b#x as string) AS b#x] + +- Project [a#x, b#x] + +- SubqueryAlias t + +- Project [col1#x AS a#x, col2#x AS b#x] + +- LocalRelation [col1#x, col2#x] + + +-- !query +SELECT listagg(a) WITHIN GROUP (ORDER BY a DESC) OVER (PARTITION BY b) FROM df +-- !query analysis +Project [listagg(a, NULL) WITHIN GROUP (ORDER BY a DESC NULLS LAST) OVER (PARTITION BY b ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING)#x] ++- Project [a#x, b#x, listagg(a, NULL) WITHIN GROUP (ORDER BY a DESC NULLS LAST) OVER (PARTITION BY b ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING)#x, listagg(a, NULL) WITHIN GROUP (ORDER BY a DESC NULLS LAST) OVER (PARTITION BY b ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING)#x] + +- Window [listagg(a#x, null, a#x DESC NULLS LAST, 0, 0) windowspecdefinition(b#x, specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS listagg(a, NULL) WITHIN GROUP (ORDER BY a DESC NULLS LAST) OVER (PARTITION BY b ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING)#x], [b#x] + +- Project [a#x, b#x] + +- SubqueryAlias df + +- View (`df`, [a#x, b#x]) + +- Project [cast(a#x as string) AS a#x, cast(b#x as string) AS b#x] + +- Project [a#x, b#x] + +- SubqueryAlias t + +- Project [col1#x AS a#x, 
col2#x AS b#x] + +- LocalRelation [col1#x, col2#x] + + +-- !query +SELECT listagg(a) WITHIN GROUP (ORDER BY b) FROM df +-- !query analysis +Aggregate [listagg(a#x, null, b#x ASC NULLS FIRST, 0, 0) AS listagg(a, NULL) WITHIN GROUP (ORDER BY b ASC NULLS FIRST)#x] ++- SubqueryAlias df + +- View (`df`, [a#x, b#x]) + +- Project [cast(a#x as string) AS a#x, cast(b#x as string) AS b#x] + +- Project [a#x, b#x] + +- SubqueryAlias t + +- Project [col1#x AS a#x, col2#x AS b#x] + +- LocalRelation [col1#x, col2#x] + + +-- !query +SELECT listagg(a) WITHIN GROUP (ORDER BY b DESC) FROM df +-- !query analysis +Aggregate [listagg(a#x, null, b#x DESC NULLS LAST, 0, 0) AS listagg(a, NULL) WITHIN GROUP (ORDER BY b DESC NULLS LAST)#x] ++- SubqueryAlias df + +- View (`df`, [a#x, b#x]) + +- Project [cast(a#x as string) AS a#x, cast(b#x as string) AS b#x] + +- Project [a#x, b#x] + +- SubqueryAlias t + +- Project [col1#x AS a#x, col2#x AS b#x] + +- LocalRelation [col1#x, col2#x] + + +-- !query +SELECT listagg(a, '|') WITHIN GROUP (ORDER BY b DESC) FROM df +-- !query analysis +Aggregate [listagg(a#x, |, b#x DESC NULLS LAST, 0, 0) AS listagg(a, |) WITHIN GROUP (ORDER BY b DESC NULLS LAST)#x] ++- SubqueryAlias df + +- View (`df`, [a#x, b#x]) + +- Project [cast(a#x as string) AS a#x, cast(b#x as string) AS b#x] + +- Project [a#x, b#x] + +- SubqueryAlias t + +- Project [col1#x AS a#x, col2#x AS b#x] + +- LocalRelation [col1#x, col2#x] + + +-- !query +SELECT listagg(a) WITHIN GROUP (ORDER BY b DESC, a ASC) FROM df +-- !query analysis +Aggregate [listagg(a#x, null, b#x DESC NULLS LAST, a#x ASC NULLS FIRST, 0, 0) AS listagg(a, NULL) WITHIN GROUP (ORDER BY b DESC NULLS LAST, a ASC NULLS FIRST)#x] ++- SubqueryAlias df + +- View (`df`, [a#x, b#x]) + +- Project [cast(a#x as string) AS a#x, cast(b#x as string) AS b#x] + +- Project [a#x, b#x] + +- SubqueryAlias t + +- Project [col1#x AS a#x, col2#x AS b#x] + +- LocalRelation [col1#x, col2#x] + + +-- !query +SELECT listagg(a) WITHIN GROUP (ORDER BY b 
DESC, a DESC) FROM df +-- !query analysis +Aggregate [listagg(a#x, null, b#x DESC NULLS LAST, a#x DESC NULLS LAST, 0, 0) AS listagg(a, NULL) WITHIN GROUP (ORDER BY b DESC NULLS LAST, a DESC NULLS LAST)#x] ++- SubqueryAlias df + +- View (`df`, [a#x, b#x]) + +- Project [cast(a#x as string) AS a#x, cast(b#x as string) AS b#x] + +- Project [a#x, b#x] + +- SubqueryAlias t + +- Project [col1#x AS a#x, col2#x AS b#x] + +- LocalRelation [col1#x, col2#x] + + +-- !query +SELECT listagg(c1) FROM (VALUES (X'DEAD'), (X'BEEF')) AS t(c1) +-- !query analysis +Aggregate [listagg(c1#x, null, 0, 0) AS listagg(c1, NULL)#x] ++- SubqueryAlias t + +- Project [col1#x AS c1#x] + +- LocalRelation [col1#x] + + +-- !query +SELECT listagg(c1, NULL) FROM (VALUES (X'DEAD'), (X'BEEF')) AS t(c1) +-- !query analysis +Aggregate [listagg(c1#x, null, 0, 0) AS listagg(c1, NULL)#x] ++- SubqueryAlias t + +- Project [col1#x AS c1#x] + +- LocalRelation [col1#x] + + +-- !query +SELECT listagg(c1, X'42') FROM (VALUES (X'DEAD'), (X'BEEF')) AS t(c1) +-- !query analysis +Aggregate [listagg(c1#x, 0x42, 0, 0) AS listagg(c1, X'42')#x] ++- SubqueryAlias t + +- Project [col1#x AS c1#x] + +- LocalRelation [col1#x] + + +-- !query +SELECT listagg(a), listagg(b, ',') FROM df2 +-- !query analysis +Aggregate [listagg(cast(a#x as string), null, 0, 0) AS listagg(a, NULL)#x, listagg(cast(b#x as string), ,, 0, 0) AS listagg(b, ,)#x] ++- SubqueryAlias df2 + +- View (`df2`, [a#x, b#x]) + +- Project [cast(a#x as int) AS a#x, cast(b#x as boolean) AS b#x] + +- Project [a#x, b#x] + +- SubqueryAlias t + +- Project [col1#x AS a#x, col2#x AS b#x] + +- LocalRelation [col1#x, col2#x] + + +-- !query +SELECT listagg(c1) FROM (VALUES (ARRAY('a', 'b'))) AS t(c1) +-- !query analysis +org.apache.spark.sql.catalyst.ExtendedAnalysisException +{ + "condition" : "DATATYPE_MISMATCH.UNEXPECTED_INPUT_TYPE", + "sqlState" : "42K09", + "messageParameters" : { + "inputSql" : "\"c1\"", + "inputType" : "\"ARRAY\"", + "paramIndex" : "first", + 
"requiredType" : "(\"STRING\" or \"BINARY\")", + "sqlExpr" : "\"listagg(c1, NULL)\"" + }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 8, + "stopIndex" : 18, + "fragment" : "listagg(c1)" + } ] +} + + +-- !query +SELECT listagg(c1, ', ') FROM (VALUES (X'DEAD'), (X'BEEF')) AS t(c1) +-- !query analysis +org.apache.spark.sql.catalyst.ExtendedAnalysisException +{ + "condition" : "DATATYPE_MISMATCH.DATA_DIFF_TYPES", + "sqlState" : "42K09", + "messageParameters" : { + "dataType" : "(\"BINARY\" or \"STRING\")", + "functionName" : "`listagg`", + "sqlExpr" : "\"listagg(c1, , )\"" + }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 8, + "stopIndex" : 24, + "fragment" : "listagg(c1, ', ')" + } ] +} + + +-- !query +SELECT listagg(b, a) FROM df GROUP BY a +-- !query analysis +org.apache.spark.sql.catalyst.ExtendedAnalysisException +{ + "condition" : "DATATYPE_MISMATCH.NON_FOLDABLE_INPUT", + "sqlState" : "42K09", + "messageParameters" : { + "inputExpr" : "\"a\"", + "inputName" : "`delimiter`", + "inputType" : "\"STRING\"", + "sqlExpr" : "\"listagg(b, a)\"" + }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 8, + "stopIndex" : 20, + "fragment" : "listagg(b, a)" + } ] +} + + +-- !query +SELECT listagg(a) OVER (ORDER BY a) FROM df +-- !query analysis +Project [listagg(a, NULL) OVER (ORDER BY a ASC NULLS FIRST RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW)#x] ++- Project [a#x, listagg(a, NULL) OVER (ORDER BY a ASC NULLS FIRST RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW)#x, listagg(a, NULL) OVER (ORDER BY a ASC NULLS FIRST RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW)#x] + +- Window [listagg(a#x, null, 0, 0) windowspecdefinition(a#x ASC NULLS FIRST, specifiedwindowframe(RangeFrame, unboundedpreceding$(), currentrow$())) AS listagg(a, NULL) OVER (ORDER BY a ASC NULLS FIRST RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW)#x], [a#x ASC NULLS FIRST] + +- Project 
[a#x] + +- SubqueryAlias df + +- View (`df`, [a#x, b#x]) + +- Project [cast(a#x as string) AS a#x, cast(b#x as string) AS b#x] + +- Project [a#x, b#x] + +- SubqueryAlias t + +- Project [col1#x AS a#x, col2#x AS b#x] + +- LocalRelation [col1#x, col2#x] + + +-- !query +SELECT listagg(a) WITHIN GROUP (ORDER BY a) OVER (ORDER BY a) FROM df +-- !query analysis +org.apache.spark.sql.catalyst.ExtendedAnalysisException +{ + "condition" : "INVALID_WINDOW_SPEC_FOR_AGGREGATION_FUNC", + "sqlState" : "42601", + "messageParameters" : { + "aggFunc" : "\"listagg(a, NULL, a ASC NULLS FIRST)\"" + }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 8, + "stopIndex" : 61, + "fragment" : "listagg(a) WITHIN GROUP (ORDER BY a) OVER (ORDER BY a)" + } ] +} + + +-- !query +SELECT string_agg(a) WITHIN GROUP (ORDER BY a) OVER (ORDER BY a) FROM df +-- !query analysis +org.apache.spark.sql.catalyst.ExtendedAnalysisException +{ + "condition" : "INVALID_WINDOW_SPEC_FOR_AGGREGATION_FUNC", + "sqlState" : "42601", + "messageParameters" : { + "aggFunc" : "\"listagg(a, NULL, a ASC NULLS FIRST)\"" + }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 8, + "stopIndex" : 64, + "fragment" : "string_agg(a) WITHIN GROUP (ORDER BY a) OVER (ORDER BY a)" + } ] +} + + +-- !query +SELECT listagg(DISTINCT a) OVER (ORDER BY a) FROM df +-- !query analysis +org.apache.spark.sql.catalyst.ExtendedAnalysisException +{ + "condition" : "DISTINCT_WINDOW_FUNCTION_UNSUPPORTED", + "sqlState" : "0A000", + "messageParameters" : { + "windowExpr" : "\"listagg(DISTINCT a, NULL) OVER (ORDER BY a ASC NULLS FIRST RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW)\"" + }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 8, + "stopIndex" : 44, + "fragment" : "listagg(DISTINCT a) OVER (ORDER BY a)" + } ] +} + + +-- !query +SELECT listagg(DISTINCT a) WITHIN GROUP (ORDER BY b) FROM df +-- !query analysis 
+org.apache.spark.sql.catalyst.ExtendedAnalysisException +{ + "condition" : "INVALID_WITHIN_GROUP_EXPRESSION.MISMATCH_WITH_DISTINCT_INPUT", + "sqlState" : "42K0K", + "messageParameters" : { + "funcArg" : "\"a\"", + "funcName" : "`listagg`", + "orderingExpr" : "\"b\"" + } +} + + +-- !query +SELECT listagg(DISTINCT a) WITHIN GROUP (ORDER BY a, b) FROM df +-- !query analysis +org.apache.spark.sql.catalyst.ExtendedAnalysisException +{ + "condition" : "INVALID_WITHIN_GROUP_EXPRESSION.MISMATCH_WITH_DISTINCT_INPUT", + "sqlState" : "42K0K", + "messageParameters" : { + "funcArg" : "\"a\"", + "funcName" : "`listagg`", + "orderingExpr" : "\"a\", \"b\"" + } +} diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/mode.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/mode.sql.out index d103da1f6939f..95c2db670a87d 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/mode.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/mode.sql.out @@ -74,7 +74,7 @@ SELECT department, mode(DISTINCT salary) FROM basic_pays GROUP BY department ORD -- !query analysis org.apache.spark.sql.AnalysisException { - "condition" : "INVALID_INVERSE_DISTRIBUTION_FUNCTION.DISTINCT_UNSUPPORTED", + "condition" : "INVALID_WITHIN_GROUP_EXPRESSION.DISTINCT_UNSUPPORTED", "sqlState" : "42K0K", "messageParameters" : { "funcName" : "`mode`" @@ -379,7 +379,7 @@ FROM basic_pays -- !query analysis org.apache.spark.sql.AnalysisException { - "condition" : "INVALID_INVERSE_DISTRIBUTION_FUNCTION.DISTINCT_UNSUPPORTED", + "condition" : "INVALID_WITHIN_GROUP_EXPRESSION.DISTINCT_UNSUPPORTED", "sqlState" : "42K0K", "messageParameters" : { "funcName" : "`mode`" @@ -401,7 +401,7 @@ FROM basic_pays -- !query analysis org.apache.spark.sql.AnalysisException { - "condition" : "INVALID_INVERSE_DISTRIBUTION_FUNCTION.WITHIN_GROUP_MISSING", + "condition" : "INVALID_WITHIN_GROUP_EXPRESSION.WITHIN_GROUP_MISSING", "sqlState" : "42K0K", "messageParameters" : { "funcName" : 
"`mode`" @@ -423,7 +423,7 @@ FROM basic_pays -- !query analysis org.apache.spark.sql.AnalysisException { - "condition" : "INVALID_INVERSE_DISTRIBUTION_FUNCTION.WRONG_NUM_ORDERINGS", + "condition" : "INVALID_WITHIN_GROUP_EXPRESSION.WRONG_NUM_ORDERINGS", "sqlState" : "42K0K", "messageParameters" : { "actualNum" : "1", diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/non-excludable-rule.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/non-excludable-rule.sql.out index 6b2c60f25bae3..4a717488e0172 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/non-excludable-rule.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/non-excludable-rule.sql.out @@ -47,7 +47,7 @@ WithCTE +- Filter (id#xL > scalar-subquery#x []) : +- Aggregate [max(id#xL) AS max(id)#xL] : +- SubqueryAlias tmp - : +- CTERelationRef xxxx, true, [id#xL], false + : +- CTERelationRef xxxx, true, [id#xL], false, false +- Range (0, 3, step=1) diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/percentiles.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/percentiles.sql.out index 31e5f7b63c604..3088e93ead216 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/percentiles.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/percentiles.sql.out @@ -248,7 +248,7 @@ FROM aggr -- !query analysis org.apache.spark.sql.AnalysisException { - "condition" : "INVALID_INVERSE_DISTRIBUTION_FUNCTION.DISTINCT_UNSUPPORTED", + "condition" : "INVALID_WITHIN_GROUP_EXPRESSION.DISTINCT_UNSUPPORTED", "sqlState" : "42K0K", "messageParameters" : { "funcName" : "`percentile_cont`" @@ -270,7 +270,7 @@ FROM aggr -- !query analysis org.apache.spark.sql.AnalysisException { - "condition" : "INVALID_INVERSE_DISTRIBUTION_FUNCTION.DISTINCT_UNSUPPORTED", + "condition" : "INVALID_WITHIN_GROUP_EXPRESSION.DISTINCT_UNSUPPORTED", "sqlState" : "42K0K", "messageParameters" : { "funcName" : "`percentile_cont`" @@ -342,7 
+342,7 @@ FROM aggr -- !query analysis org.apache.spark.sql.AnalysisException { - "condition" : "INVALID_INVERSE_DISTRIBUTION_FUNCTION.WITHIN_GROUP_MISSING", + "condition" : "INVALID_WITHIN_GROUP_EXPRESSION.WITHIN_GROUP_MISSING", "sqlState" : "42K0K", "messageParameters" : { "funcName" : "`percentile_cont`" @@ -364,7 +364,7 @@ FROM aggr -- !query analysis org.apache.spark.sql.AnalysisException { - "condition" : "INVALID_INVERSE_DISTRIBUTION_FUNCTION.WITHIN_GROUP_MISSING", + "condition" : "INVALID_WITHIN_GROUP_EXPRESSION.WITHIN_GROUP_MISSING", "sqlState" : "42K0K", "messageParameters" : { "funcName" : "`percentile_cont`" @@ -386,7 +386,7 @@ FROM aggr -- !query analysis org.apache.spark.sql.AnalysisException { - "condition" : "INVALID_INVERSE_DISTRIBUTION_FUNCTION.WRONG_NUM_ORDERINGS", + "condition" : "INVALID_WITHIN_GROUP_EXPRESSION.WRONG_NUM_ORDERINGS", "sqlState" : "42K0K", "messageParameters" : { "actualNum" : "2", diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/pipe-operators.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/pipe-operators.sql.out index b296396c886be..8089d7c4e962a 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/pipe-operators.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/pipe-operators.sql.out @@ -265,11 +265,136 @@ CreateViewCommand `windowTestData`, select * from values +- LocalRelation [val#x, val_long#xL, val_double#x, val_date#x, val_timestamp#x, cate#x] +-- !query +from t +-- !query analysis +SubqueryAlias spark_catalog.default.t ++- Relation spark_catalog.default.t[x#x,y#x] csv + + -- !query table t +-- !query analysis +SubqueryAlias spark_catalog.default.t ++- Relation spark_catalog.default.t[x#x,y#x] csv + + +-- !query +from t |> select 1 as x -- !query analysis -Project [pipeexpression(1, false, SELECT) AS x#x] +Project [1 AS x#x] ++- SubqueryAlias spark_catalog.default.t + +- Relation spark_catalog.default.t[x#x,y#x] csv + + +-- !query +from t as t_alias 
+|> select t_alias.x +-- !query analysis +Project [x#x] ++- SubqueryAlias t_alias + +- SubqueryAlias spark_catalog.default.t + +- Relation spark_catalog.default.t[x#x,y#x] csv + + +-- !query +from t as t_alias +|> select t_alias.x as tx, t_alias.y as ty +|> where ty = 'def' +|> select tx +-- !query analysis +Project [tx#x] ++- Filter (ty#x = def) + +- PipeOperator + +- Project [x#x AS tx#x, y#x AS ty#x] + +- SubqueryAlias t_alias + +- SubqueryAlias spark_catalog.default.t + +- Relation spark_catalog.default.t[x#x,y#x] csv + + +-- !query +from t, other +|> select t.x + other.a as z +-- !query analysis +Project [(x#x + a#x) AS z#x] ++- Join Inner + :- SubqueryAlias spark_catalog.default.t + : +- Relation spark_catalog.default.t[x#x,y#x] csv + +- SubqueryAlias spark_catalog.default.other + +- Relation spark_catalog.default.other[a#x,b#x] json + + +-- !query +from t join other on (t.x = other.a) +|> select t.x + other.a as z +-- !query analysis +Project [(x#x + a#x) AS z#x] ++- Join Inner, (x#x = a#x) + :- SubqueryAlias spark_catalog.default.t + : +- Relation spark_catalog.default.t[x#x,y#x] csv + +- SubqueryAlias spark_catalog.default.other + +- Relation spark_catalog.default.other[a#x,b#x] json + + +-- !query +from t lateral view explode(array(100, 101)) as ly +|> select t.x + ly as z +-- !query analysis +Project [(x#x + ly#x) AS z#x] ++- Generate explode(array(100, 101)), false, as, [ly#x] + +- SubqueryAlias spark_catalog.default.t + +- Relation spark_catalog.default.t[x#x,y#x] csv + + +-- !query +from st +|> select col.i1 +-- !query analysis +Project [col#x.i1 AS i1#x] ++- SubqueryAlias spark_catalog.default.st + +- Relation spark_catalog.default.st[x#x,col#x] parquet + + +-- !query +from st as st_alias +|> select st_alias.col.i1 +-- !query analysis +Project [col#x.i1 AS i1#x] ++- SubqueryAlias st_alias + +- SubqueryAlias spark_catalog.default.st + +- Relation spark_catalog.default.st[x#x,col#x] parquet + + +-- !query +from values (0), (1) tab(col) +|> select col 
as x +-- !query analysis +Project [col#x AS x#x] ++- SubqueryAlias tab + +- LocalRelation [col#x] + + +-- !query +from t +|> from t +-- !query analysis +org.apache.spark.sql.catalyst.parser.ParseException +{ + "condition" : "PARSE_SYNTAX_ERROR", + "sqlState" : "42601", + "messageParameters" : { + "error" : "'from'", + "hint" : "" + } +} + + +-- !query +table t +|> select 1 as x +-- !query analysis +Project [1 AS x#x] +- SubqueryAlias spark_catalog.default.t +- Relation spark_catalog.default.t[x#x,y#x] csv @@ -288,7 +413,7 @@ table t |> select x, y |> select x + length(y) as z -- !query analysis -Project [pipeexpression((x#x + length(y#x)), false, SELECT) AS z#x] +Project [(x#x + length(y#x)) AS z#x] +- Project [x#x, y#x] +- SubqueryAlias spark_catalog.default.t +- Relation spark_catalog.default.t[x#x,y#x] csv @@ -298,7 +423,7 @@ Project [pipeexpression((x#x + length(y#x)), false, SELECT) AS z#x] values (0), (1) tab(col) |> select col * 2 as result -- !query analysis -Project [pipeexpression((col#x * 2), false, SELECT) AS result#x] +Project [(col#x * 2) AS result#x] +- SubqueryAlias tab +- LocalRelation [col#x] @@ -307,7 +432,7 @@ Project [pipeexpression((col#x * 2), false, SELECT) AS result#x] (select * from t union all select * from t) |> select x + length(y) as result -- !query analysis -Project [pipeexpression((x#x + length(y#x)), false, SELECT) AS result#x] +Project [(x#x + length(y#x)) AS result#x] +- Union false, false :- Project [x#x, y#x] : +- SubqueryAlias spark_catalog.default.t @@ -358,7 +483,7 @@ Project [col#x.i1 AS i1#x] table t |> select (select a from other where x = a limit 1) as result -- !query analysis -Project [pipeexpression(scalar-subquery#x [x#x], false, SELECT) AS result#x] +Project [scalar-subquery#x [x#x] AS result#x] : +- GlobalLimit 1 : +- LocalLimit 1 : +- Project [a#x] @@ -383,7 +508,7 @@ Project [scalar-subquery#x [] AS result#x] table t |> select (select any_value(a) from other where x = a limit 1) as result -- !query analysis 
-Project [pipeexpression(scalar-subquery#x [x#x], false, SELECT) AS result#x] +Project [scalar-subquery#x [x#x] AS result#x] : +- GlobalLimit 1 : +- LocalLimit 1 : +- Aggregate [any_value(a#x, false) AS any_value(a)#x] @@ -398,8 +523,8 @@ Project [pipeexpression(scalar-subquery#x [x#x], false, SELECT) AS result#x] table t |> select x + length(x) as z, z + 1 as plus_one -- !query analysis -Project [z#x, pipeexpression((z#x + 1), false, SELECT) AS plus_one#x] -+- Project [x#x, y#x, pipeexpression((x#x + length(cast(x#x as string))), false, SELECT) AS z#x] +Project [z#x, (z#x + 1) AS plus_one#x] ++- Project [x#x, y#x, (x#x + length(cast(x#x as string))) AS z#x] +- SubqueryAlias spark_catalog.default.t +- Relation spark_catalog.default.t[x#x,y#x] csv @@ -409,8 +534,8 @@ table t |> select first_value(x) over (partition by y) as result -- !query analysis Project [result#x] -+- Project [x#x, y#x, _we0#x, pipeexpression(_we0#x, false, SELECT) AS result#x] - +- Window [first_value(x#x, false) windowspecdefinition(y#x, specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS _we0#x], [y#x] ++- Project [x#x, y#x, result#x, result#x] + +- Window [first_value(x#x, false) windowspecdefinition(y#x, specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS result#x], [y#x] +- Project [x#x, y#x] +- SubqueryAlias spark_catalog.default.t +- Relation spark_catalog.default.t[x#x,y#x] csv @@ -426,8 +551,8 @@ select 1 x, 2 y, 3 z -- !query analysis Project [a2#x] +- Project [(1 + sum(x) OVER (ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING))#xL, avg(y) OVER (ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING)#x, x#x, a2#x] - +- Project [x#x, y#x, _w1#x, z#x, _we0#xL, avg(y) OVER (ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING)#x, _we2#x, (cast(1 as bigint) + _we0#xL) AS (1 + sum(x) OVER (ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING))#xL, avg(y) OVER (ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED 
FOLLOWING)#x, pipeexpression(_we2#x, false, SELECT) AS a2#x] - +- Window [avg(_w1#x) windowspecdefinition(y#x, z#x ASC NULLS FIRST, specifiedwindowframe(RangeFrame, unboundedpreceding$(), currentrow$())) AS _we2#x], [y#x], [z#x ASC NULLS FIRST] + +- Project [x#x, y#x, _w1#x, z#x, _we0#xL, avg(y) OVER (ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING)#x, a2#x, (cast(1 as bigint) + _we0#xL) AS (1 + sum(x) OVER (ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING))#xL, avg(y) OVER (ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING)#x, a2#x] + +- Window [avg(_w1#x) windowspecdefinition(y#x, z#x ASC NULLS FIRST, specifiedwindowframe(RangeFrame, unboundedpreceding$(), currentrow$())) AS a2#x], [y#x], [z#x ASC NULLS FIRST] +- Window [sum(x#x) windowspecdefinition(specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS _we0#xL, avg(y#x) windowspecdefinition(specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS avg(y) OVER (ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING)#x] +- Project [x#x, y#x, (x#x + 1) AS _w1#x, z#x] +- Project [1 AS x#x, 2 AS y#x, 3 AS z#x] @@ -513,204 +638,703 @@ table t -- !query analysis org.apache.spark.sql.AnalysisException { - "condition" : "PIPE_OPERATOR_CONTAINS_AGGREGATE_FUNCTION", - "sqlState" : "0A000", + "condition" : "PIPE_OPERATOR_CONTAINS_AGGREGATE_FUNCTION", + "sqlState" : "0A000", + "messageParameters" : { + "clause" : "SELECT", + "expr" : "sum(x#x)" + }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 19, + "stopIndex" : 24, + "fragment" : "sum(x)" + } ] +} + + +-- !query +table t +|> select y, length(y) + sum(x) as result +-- !query analysis +org.apache.spark.sql.AnalysisException +{ + "condition" : "PIPE_OPERATOR_CONTAINS_AGGREGATE_FUNCTION", + "sqlState" : "0A000", + "messageParameters" : { + "clause" : "SELECT", + "expr" : "sum(x#x)" + }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" 
: 34, + "stopIndex" : 39, + "fragment" : "sum(x)" + } ] +} + + +-- !query +table t +|> extend 1 as z +-- !query analysis +Project [x#x, y#x, 1 AS z#x] ++- SubqueryAlias spark_catalog.default.t + +- Relation spark_catalog.default.t[x#x,y#x] csv + + +-- !query +table t +|> extend 1 +-- !query analysis +Project [x#x, y#x, 1 AS 1#x] ++- SubqueryAlias spark_catalog.default.t + +- Relation spark_catalog.default.t[x#x,y#x] csv + + +-- !query +table t +|> extend x as z +-- !query analysis +Project [x#x, y#x, x#x AS z#x] ++- SubqueryAlias spark_catalog.default.t + +- Relation spark_catalog.default.t[x#x,y#x] csv + + +-- !query +table t +|> extend x + length(y) as z +-- !query analysis +Project [x#x, y#x, (x#x + length(y#x)) AS z#x] ++- SubqueryAlias spark_catalog.default.t + +- Relation spark_catalog.default.t[x#x,y#x] csv + + +-- !query +table t +|> extend x + length(y) as z, x + 1 as zz +-- !query analysis +Project [x#x, y#x, (x#x + length(y#x)) AS z#x, (x#x + 1) AS zz#x] ++- SubqueryAlias spark_catalog.default.t + +- Relation spark_catalog.default.t[x#x,y#x] csv + + +-- !query +table t +|> extend x + length(y) as z +|> extend z + 1 as zz +-- !query analysis +Project [x#x, y#x, z#x, (z#x + 1) AS zz#x] ++- Project [x#x, y#x, (x#x + length(y#x)) AS z#x] + +- SubqueryAlias spark_catalog.default.t + +- Relation spark_catalog.default.t[x#x,y#x] csv + + +-- !query +select col from st +|> extend col.i1 as z +-- !query analysis +Project [col#x, col#x.i1 AS z#x] ++- Project [col#x] + +- SubqueryAlias spark_catalog.default.st + +- Relation spark_catalog.default.st[x#x,col#x] parquet + + +-- !query +table t +|> extend (select a from other where x = a limit 1) as z +-- !query analysis +Project [x#x, y#x, scalar-subquery#x [x#x] AS z#x] +: +- GlobalLimit 1 +: +- LocalLimit 1 +: +- Project [a#x] +: +- Filter (outer(x#x) = a#x) +: +- SubqueryAlias spark_catalog.default.other +: +- Relation spark_catalog.default.other[a#x,b#x] json ++- SubqueryAlias spark_catalog.default.t + +- Relation 
spark_catalog.default.t[x#x,y#x] csv + + +-- !query +table t +|> where exists ( + table other + |> extend t.x + |> select * except (a, b)) +-- !query analysis +Filter exists#x [x#x] +: +- Project [x#x] +: +- Project [a#x, b#x, outer(x#x)] +: +- SubqueryAlias spark_catalog.default.other +: +- Relation spark_catalog.default.other[a#x,b#x] json ++- PipeOperator + +- SubqueryAlias spark_catalog.default.t + +- Relation spark_catalog.default.t[x#x,y#x] csv + + +-- !query +table t +|> extend 1 as x +-- !query analysis +Project [x#x, y#x, 1 AS x#x] ++- SubqueryAlias spark_catalog.default.t + +- Relation spark_catalog.default.t[x#x,y#x] csv + + +-- !query +table t +|> extend first_value(x) over (partition by y) as result +-- !query analysis +Project [x#x, y#x, result#x] ++- Project [x#x, y#x, result#x, result#x] + +- Window [first_value(x#x, false) windowspecdefinition(y#x, specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS result#x], [y#x] + +- Project [x#x, y#x] + +- SubqueryAlias spark_catalog.default.t + +- Relation spark_catalog.default.t[x#x,y#x] csv + + +-- !query +table t +|> extend x + length(y) as z, z + 1 as plus_one +-- !query analysis +Project [x#x, y#x, z#x, (z#x + 1) AS plus_one#x] ++- Project [x#x, y#x, (x#x + length(y#x)) AS z#x] + +- SubqueryAlias spark_catalog.default.t + +- Relation spark_catalog.default.t[x#x,y#x] csv + + +-- !query +table t +|> extend sum(x) as z +-- !query analysis +org.apache.spark.sql.AnalysisException +{ + "condition" : "PIPE_OPERATOR_CONTAINS_AGGREGATE_FUNCTION", + "sqlState" : "0A000", + "messageParameters" : { + "clause" : "EXTEND", + "expr" : "sum(x#x)" + }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 19, + "stopIndex" : 24, + "fragment" : "sum(x)" + } ] +} + + +-- !query +table t +|> extend distinct x as z +-- !query analysis +org.apache.spark.sql.catalyst.parser.ParseException +{ + "condition" : "PARSE_SYNTAX_ERROR", + "sqlState" : "42601", + 
"messageParameters" : { + "error" : "'as'", + "hint" : "" + } +} + + +-- !query +table t +|> extend * +-- !query analysis +org.apache.spark.sql.AnalysisException +{ + "condition" : "INVALID_USAGE_OF_STAR_OR_REGEX", + "sqlState" : "42000", + "messageParameters" : { + "elem" : "'*'", + "prettyName" : "expression `pipeexpression`" + } +} + + +-- !query +table t +|> set x = 1 +-- !query analysis +Project [1 AS x#x, y#x] ++- SubqueryAlias spark_catalog.default.t + +- Relation spark_catalog.default.t[x#x,y#x] csv + + +-- !query +table t +|> set y = x +-- !query analysis +Project [x#x, x#x AS y#x] ++- SubqueryAlias spark_catalog.default.t + +- Relation spark_catalog.default.t[x#x,y#x] csv + + +-- !query +table t +|> extend 1 as z +|> set z = x + length(y) +-- !query analysis +Project [x#x, y#x, (x#x + length(y#x)) AS z#x] ++- Project [x#x, y#x, 1 AS z#x] + +- SubqueryAlias spark_catalog.default.t + +- Relation spark_catalog.default.t[x#x,y#x] csv + + +-- !query +table t +|> extend 1 as z +|> extend 2 as zz +|> set z = x + length(y), zz = x + 1 +-- !query analysis +Project [x#x, y#x, z#x, (x#x + 1) AS zz#x] ++- Project [x#x, y#x, (x#x + length(y#x)) AS z#x, zz#x] + +- Project [x#x, y#x, z#x, 2 AS zz#x] + +- Project [x#x, y#x, 1 AS z#x] + +- SubqueryAlias spark_catalog.default.t + +- Relation spark_catalog.default.t[x#x,y#x] csv + + +-- !query +table other +|> extend 3 as c +|> set a = b, b = c +-- !query analysis +Project [a#x, c#x AS b#x, c#x] ++- Project [b#x AS a#x, b#x, c#x] + +- Project [a#x, b#x, 3 AS c#x] + +- SubqueryAlias spark_catalog.default.other + +- Relation spark_catalog.default.other[a#x,b#x] json + + +-- !query +table t +|> extend 1 as z +|> extend 2 as zz +|> set z = x + length(y), zz = z + 1 +-- !query analysis +Project [x#x, y#x, z#x, (z#x + 1) AS zz#x] ++- Project [x#x, y#x, (x#x + length(y#x)) AS z#x, zz#x] + +- Project [x#x, y#x, z#x, 2 AS zz#x] + +- Project [x#x, y#x, 1 AS z#x] + +- SubqueryAlias spark_catalog.default.t + +- Relation 
spark_catalog.default.t[x#x,y#x] csv + + +-- !query +table t +|> extend 1 as z +|> set z = x + length(y) +|> set z = z + 1 +-- !query analysis +Project [x#x, y#x, (z#x + 1) AS z#x] ++- Project [x#x, y#x, (x#x + length(y#x)) AS z#x] + +- Project [x#x, y#x, 1 AS z#x] + +- SubqueryAlias spark_catalog.default.t + +- Relation spark_catalog.default.t[x#x,y#x] csv + + +-- !query +table t +|> extend 1 as z +|> set z = x + length(y), z = z + 1 +-- !query analysis +Project [x#x, y#x, (z#x + 1) AS z#x] ++- Project [x#x, y#x, (x#x + length(y#x)) AS z#x] + +- Project [x#x, y#x, 1 AS z#x] + +- SubqueryAlias spark_catalog.default.t + +- Relation spark_catalog.default.t[x#x,y#x] csv + + +-- !query +select col from st +|> extend 1 as z +|> set z = col.i1 +-- !query analysis +Project [col#x, col#x.i1 AS z#x] ++- Project [col#x, 1 AS z#x] + +- Project [col#x] + +- SubqueryAlias spark_catalog.default.st + +- Relation spark_catalog.default.st[x#x,col#x] parquet + + +-- !query +table t +|> set y = (select a from other where x = a limit 1) +-- !query analysis +Project [x#x, scalar-subquery#x [x#x] AS y#x] +: +- GlobalLimit 1 +: +- LocalLimit 1 +: +- Project [a#x] +: +- Filter (outer(x#x) = a#x) +: +- SubqueryAlias spark_catalog.default.other +: +- Relation spark_catalog.default.other[a#x,b#x] json ++- SubqueryAlias spark_catalog.default.t + +- Relation spark_catalog.default.t[x#x,y#x] csv + + +-- !query +table t +|> extend 1 as `x.y.z` +|> set `x.y.z` = x + length(y) +-- !query analysis +Project [x#x, y#x, (x#x + length(y#x)) AS x.y.z#x] ++- Project [x#x, y#x, 1 AS x.y.z#x] + +- SubqueryAlias spark_catalog.default.t + +- Relation spark_catalog.default.t[x#x,y#x] csv + + +-- !query +table t +|> extend 1 as z +|> set z = first_value(x) over (partition by y) +-- !query analysis +Project [x#x, y#x, z#x] ++- Project [x#x, y#x, z#x, z#x] + +- Window [first_value(x#x, false) windowspecdefinition(y#x, specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS z#x], [y#x] + 
+- Project [x#x, y#x] + +- Project [x#x, y#x, 1 AS z#x] + +- SubqueryAlias spark_catalog.default.t + +- Relation spark_catalog.default.t[x#x,y#x] csv + + +-- !query +values (0), (1) lhs(a) +|> inner join values (1), (2) rhs(a) using (a) +|> extend lhs.a + rhs.a as z1 +|> extend lhs.a - rhs.a as z2 +|> drop z1 +|> where z2 = 0 +|> order by lhs.a, rhs.a, z2 +|> set z2 = 4 +|> limit 2 +|> select lhs.a, rhs.a, z2 +-- !query analysis +Project [a#x, a#x, z2#x] ++- GlobalLimit 2 + +- LocalLimit 2 + +- PipeOperator + +- Project [a#x, 4 AS z2#x, a#x] + +- Project [a#x, z2#x, a#x] + +- Sort [a#x ASC NULLS FIRST, a#x ASC NULLS FIRST, z2#x ASC NULLS FIRST], true + +- PipeOperator + +- Filter (z2#x = 0) + +- PipeOperator + +- Project [a#x, z2#x, a#x, a#x] + +- Project [a#x, z1#x, (a#x - a#x) AS z2#x, a#x, a#x] + +- Project [a#x, (a#x + a#x) AS z1#x, a#x, a#x, a#x] + +- Project [a#x, a#x, a#x, a#x, a#x] + +- Join Inner, (a#x = a#x) + :- SubqueryAlias lhs + : +- LocalRelation [a#x] + +- SubqueryAlias rhs + +- LocalRelation [a#x] + + +-- !query +table t +|> set z = 1 +-- !query analysis +org.apache.spark.sql.AnalysisException +{ + "condition" : "UNRESOLVED_COLUMN.WITH_SUGGESTION", + "sqlState" : "42703", + "messageParameters" : { + "objectName" : "`z`", + "proposal" : "`x`, `y`" + }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 1, + "stopIndex" : 20, + "fragment" : "table t\n|> set z = 1" + } ] +} + + +-- !query +table t +|> set x = 1 as z +-- !query analysis +org.apache.spark.sql.catalyst.parser.ParseException +{ + "condition" : "PARSE_SYNTAX_ERROR", + "sqlState" : "42601", + "messageParameters" : { + "error" : "'as'", + "hint" : "" + } +} + + +-- !query +select col from st +|> set col.i1 = 42 +-- !query analysis +org.apache.spark.sql.catalyst.parser.ParseException +{ + "condition" : "_LEGACY_ERROR_TEMP_0035", + "messageParameters" : { + "message" : "SQL pipe syntax |> SET operator with multi-part assignment key (only single-part keys are 
allowed)" + }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 27, + "stopIndex" : 37, + "fragment" : "col.i1 = 42" + } ] +} + + +-- !query +table t +|> drop y +-- !query analysis +Project [x#x] ++- SubqueryAlias spark_catalog.default.t + +- Relation spark_catalog.default.t[x#x,y#x] csv + + +-- !query +select 1 as x, 2 as y, 3 as z +|> drop z, y +-- !query analysis +Project [x#x] ++- Project [1 AS x#x, 2 AS y#x, 3 AS z#x] + +- OneRowRelation + + +-- !query +select 1 as x, 2 as y, 3 as z +|> drop z +|> drop y +-- !query analysis +Project [x#x] ++- Project [x#x, y#x] + +- Project [1 AS x#x, 2 AS y#x, 3 AS z#x] + +- OneRowRelation + + +-- !query +select x from t +|> drop x +-- !query analysis +Project ++- Project [x#x] + +- SubqueryAlias spark_catalog.default.t + +- Relation spark_catalog.default.t[x#x,y#x] csv + + +-- !query +table t +|> extend 1 as `x.y.z` +|> drop `x.y.z` +-- !query analysis +Project [x#x, y#x] ++- Project [x#x, y#x, 1 AS x.y.z#x] + +- SubqueryAlias spark_catalog.default.t + +- Relation spark_catalog.default.t[x#x,y#x] csv + + +-- !query +table t +|> drop z +-- !query analysis +org.apache.spark.sql.AnalysisException +{ + "condition" : "UNRESOLVED_COLUMN.WITH_SUGGESTION", + "sqlState" : "42703", + "messageParameters" : { + "objectName" : "`z`", + "proposal" : "`x`, `y`" + }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 1, + "stopIndex" : 17, + "fragment" : "table t\n|> drop z" + } ] +} + + +-- !query +table st +|> drop col.i1 +-- !query analysis +org.apache.spark.sql.catalyst.parser.ParseException +{ + "condition" : "PARSE_SYNTAX_ERROR", + "sqlState" : "42601", + "messageParameters" : { + "error" : "'.'", + "hint" : "" + } +} + + +-- !query +table st +|> drop `col.i1` +-- !query analysis +org.apache.spark.sql.AnalysisException +{ + "condition" : "UNRESOLVED_COLUMN.WITH_SUGGESTION", + "sqlState" : "42703", "messageParameters" : { - "clause" : "SELECT", - "expr" : "sum(x#x)" + 
"objectName" : "`col.i1`", + "proposal" : "`col`, `x`" }, "queryContext" : [ { "objectType" : "", "objectName" : "", - "startIndex" : 19, - "stopIndex" : 24, - "fragment" : "sum(x)" + "startIndex" : 1, + "stopIndex" : 25, + "fragment" : "table st\n|> drop `col.i1`" } ] } -- !query -table t -|> select y, length(y) + sum(x) as result +select 1 as x, 2 as y, 3 as z +|> drop z, y, z -- !query analysis org.apache.spark.sql.AnalysisException { - "condition" : "PIPE_OPERATOR_CONTAINS_AGGREGATE_FUNCTION", - "sqlState" : "0A000", + "condition" : "EXCEPT_OVERLAPPING_COLUMNS", + "sqlState" : "42702", "messageParameters" : { - "clause" : "SELECT", - "expr" : "sum(x#x)" + "columns" : "z, y, z" }, "queryContext" : [ { "objectType" : "", "objectName" : "", - "startIndex" : 34, - "stopIndex" : 39, - "fragment" : "sum(x)" + "startIndex" : 1, + "stopIndex" : 45, + "fragment" : "select 1 as x, 2 as y, 3 as z\n|> drop z, y, z" } ] } -- !query table t -|> extend 1 as z --- !query analysis -Project [x#x, y#x, pipeexpression(1, false, EXTEND) AS z#x] -+- SubqueryAlias spark_catalog.default.t - +- Relation spark_catalog.default.t[x#x,y#x] csv - - --- !query -table t -|> extend 1 --- !query analysis -Project [x#x, y#x, pipeexpression(1, false, EXTEND) AS pipeexpression(1)#x] -+- SubqueryAlias spark_catalog.default.t - +- Relation spark_catalog.default.t[x#x,y#x] csv - - --- !query -table t -|> extend x as z --- !query analysis -Project [x#x, y#x, pipeexpression(x#x, false, EXTEND) AS z#x] -+- SubqueryAlias spark_catalog.default.t - +- Relation spark_catalog.default.t[x#x,y#x] csv - - --- !query -table t -|> extend x + length(y) as z --- !query analysis -Project [x#x, y#x, pipeexpression((x#x + length(y#x)), false, EXTEND) AS z#x] -+- SubqueryAlias spark_catalog.default.t - +- Relation spark_catalog.default.t[x#x,y#x] csv - - --- !query -table t -|> extend x + length(y) as z, x + 1 as zz --- !query analysis -Project [x#x, y#x, pipeexpression((x#x + length(y#x)), false, EXTEND) AS z#x, 
pipeexpression((x#x + 1), false, EXTEND) AS zz#x] -+- SubqueryAlias spark_catalog.default.t - +- Relation spark_catalog.default.t[x#x,y#x] csv - - --- !query -table t -|> extend x + length(y) as z -|> extend z + 1 as zz +|> as u +|> select u.x, u.y -- !query analysis -Project [x#x, y#x, z#x, pipeexpression((z#x + 1), false, EXTEND) AS zz#x] -+- Project [x#x, y#x, pipeexpression((x#x + length(y#x)), false, EXTEND) AS z#x] +Project [x#x, y#x] ++- SubqueryAlias u +- SubqueryAlias spark_catalog.default.t +- Relation spark_catalog.default.t[x#x,y#x] csv -- !query -select col from st -|> extend col.i1 as z +select 1 as x, 2 as y +|> as u +|> select u.x, u.y -- !query analysis -Project [col#x, pipeexpression(col#x.i1, false, EXTEND) AS z#x] -+- Project [col#x] - +- SubqueryAlias spark_catalog.default.st - +- Relation spark_catalog.default.st[x#x,col#x] parquet +Project [x#x, y#x] ++- SubqueryAlias u + +- Project [1 AS x#x, 2 AS y#x] + +- OneRowRelation -- !query table t -|> extend (select a from other where x = a limit 1) as z +|> as `u.v` +|> select `u.v`.x, `u.v`.y -- !query analysis -Project [x#x, y#x, pipeexpression(scalar-subquery#x [x#x], false, EXTEND) AS z#x] -: +- GlobalLimit 1 -: +- LocalLimit 1 -: +- Project [a#x] -: +- Filter (outer(x#x) = a#x) -: +- SubqueryAlias spark_catalog.default.other -: +- Relation spark_catalog.default.other[a#x,b#x] json -+- SubqueryAlias spark_catalog.default.t - +- Relation spark_catalog.default.t[x#x,y#x] csv +Project [x#x, y#x] ++- SubqueryAlias `u.v` + +- SubqueryAlias spark_catalog.default.t + +- Relation spark_catalog.default.t[x#x,y#x] csv -- !query table t -|> where exists ( - table other - |> extend t.x - |> select * except (a, b)) +|> as u +|> as v +|> select v.x, v.y -- !query analysis -Filter exists#x [x#x] -: +- Project [pipeexpression(outer(spark_catalog.default.t.x))#x] -: +- Project [a#x, b#x, pipeexpression(outer(x#x), false, EXTEND) AS pipeexpression(outer(spark_catalog.default.t.x))#x] -: +- SubqueryAlias 
spark_catalog.default.other -: +- Relation spark_catalog.default.other[a#x,b#x] json -+- SubqueryAlias spark_catalog.default.t - +- Relation spark_catalog.default.t[x#x,y#x] csv +Project [x#x, y#x] ++- SubqueryAlias v + +- SubqueryAlias u + +- SubqueryAlias spark_catalog.default.t + +- Relation spark_catalog.default.t[x#x,y#x] csv -- !query table t -|> extend 1 as x +|> as u +|> where u.x = 1 -- !query analysis -Project [x#x, y#x, pipeexpression(1, false, EXTEND) AS x#x] -+- SubqueryAlias spark_catalog.default.t - +- Relation spark_catalog.default.t[x#x,y#x] csv +Filter (x#x = 1) ++- PipeOperator + +- SubqueryAlias u + +- SubqueryAlias spark_catalog.default.t + +- Relation spark_catalog.default.t[x#x,y#x] csv -- !query table t -|> extend first_value(x) over (partition by y) as result +|> as u, v -- !query analysis -Project [x#x, y#x, result#x] -+- Project [x#x, y#x, _we0#x, pipeexpression(_we0#x, false, EXTEND) AS result#x] - +- Window [first_value(x#x, false) windowspecdefinition(y#x, specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS _we0#x], [y#x] - +- Project [x#x, y#x] - +- SubqueryAlias spark_catalog.default.t - +- Relation spark_catalog.default.t[x#x,y#x] csv +org.apache.spark.sql.catalyst.parser.ParseException +{ + "condition" : "PARSE_SYNTAX_ERROR", + "sqlState" : "42601", + "messageParameters" : { + "error" : "','", + "hint" : "" + } +} -- !query table t -|> extend x + length(y) as z, z + 1 as plus_one +|> as 1 + 2 -- !query analysis -Project [x#x, y#x, z#x, pipeexpression((z#x + 1), false, EXTEND) AS plus_one#x] -+- Project [x#x, y#x, pipeexpression((x#x + length(y#x)), false, EXTEND) AS z#x] - +- SubqueryAlias spark_catalog.default.t - +- Relation spark_catalog.default.t[x#x,y#x] csv +org.apache.spark.sql.catalyst.parser.ParseException +{ + "condition" : "PARSE_SYNTAX_ERROR", + "sqlState" : "42601", + "messageParameters" : { + "error" : "'1'", + "hint" : "" + } +} -- !query table t -|> extend sum(x) as z +|> as u-v -- 
!query analysis -org.apache.spark.sql.AnalysisException +org.apache.spark.sql.catalyst.parser.ParseException { - "condition" : "PIPE_OPERATOR_CONTAINS_AGGREGATE_FUNCTION", - "sqlState" : "0A000", + "condition" : "INVALID_IDENTIFIER", + "sqlState" : "42602", "messageParameters" : { - "clause" : "EXTEND", - "expr" : "sum(x#x)" - }, - "queryContext" : [ { - "objectType" : "", - "objectName" : "", - "startIndex" : 19, - "stopIndex" : 24, - "fragment" : "sum(x)" - } ] + "ident" : "u-v" + } } -- !query table t -|> extend distinct x as z +|> as u@v -- !query analysis org.apache.spark.sql.catalyst.parser.ParseException { "condition" : "PARSE_SYNTAX_ERROR", "sqlState" : "42601", "messageParameters" : { - "error" : "'as'", + "error" : "'@'", "hint" : "" } } @@ -718,15 +1342,15 @@ org.apache.spark.sql.catalyst.parser.ParseException -- !query table t -|> extend * +|> as u#######v -- !query analysis -org.apache.spark.sql.AnalysisException +org.apache.spark.sql.catalyst.parser.ParseException { - "condition" : "INVALID_USAGE_OF_STAR_OR_REGEX", - "sqlState" : "42000", + "condition" : "PARSE_SYNTAX_ERROR", + "sqlState" : "42601", "messageParameters" : { - "elem" : "'*'", - "prettyName" : "expression `pipeexpression`" + "error" : "'#'", + "hint" : "" } } @@ -736,8 +1360,9 @@ table t |> where true -- !query analysis Filter true -+- SubqueryAlias spark_catalog.default.t - +- Relation spark_catalog.default.t[x#x,y#x] csv ++- PipeOperator + +- SubqueryAlias spark_catalog.default.t + +- Relation spark_catalog.default.t[x#x,y#x] csv -- !query @@ -745,8 +1370,9 @@ table t |> where x + length(y) < 4 -- !query analysis Filter ((x#x + length(y#x)) < 4) -+- SubqueryAlias spark_catalog.default.t - +- Relation spark_catalog.default.t[x#x,y#x] csv ++- PipeOperator + +- SubqueryAlias spark_catalog.default.t + +- Relation spark_catalog.default.t[x#x,y#x] csv -- !query @@ -755,10 +1381,11 @@ table t |> where x + length(y) < 3 -- !query analysis Filter ((x#x + length(y#x)) < 3) -+- SubqueryAlias 
__auto_generated_subquery_name ++- PipeOperator +- Filter ((x#x + length(y#x)) < 4) - +- SubqueryAlias spark_catalog.default.t - +- Relation spark_catalog.default.t[x#x,y#x] csv + +- PipeOperator + +- SubqueryAlias spark_catalog.default.t + +- Relation spark_catalog.default.t[x#x,y#x] csv -- !query @@ -766,7 +1393,7 @@ Filter ((x#x + length(y#x)) < 3) |> where x = 1 -- !query analysis Filter (x#x = 1) -+- SubqueryAlias __auto_generated_subquery_name ++- PipeOperator +- Aggregate [x#x], [x#x, sum(length(y#x)) AS sum_len#xL] +- SubqueryAlias spark_catalog.default.t +- Relation spark_catalog.default.t[x#x,y#x] csv @@ -777,8 +1404,9 @@ table t |> where t.x = 1 -- !query analysis Filter (x#x = 1) -+- SubqueryAlias spark_catalog.default.t - +- Relation spark_catalog.default.t[x#x,y#x] csv ++- PipeOperator + +- SubqueryAlias spark_catalog.default.t + +- Relation spark_catalog.default.t[x#x,y#x] csv -- !query @@ -786,8 +1414,9 @@ table t |> where spark_catalog.default.t.x = 1 -- !query analysis Filter (x#x = 1) -+- SubqueryAlias spark_catalog.default.t - +- Relation spark_catalog.default.t[x#x,y#x] csv ++- PipeOperator + +- SubqueryAlias spark_catalog.default.t + +- Relation spark_catalog.default.t[x#x,y#x] csv -- !query @@ -795,7 +1424,7 @@ Filter (x#x = 1) |> where col.i1 = 1 -- !query analysis Filter (col#x.i1 = 1) -+- SubqueryAlias __auto_generated_subquery_name ++- PipeOperator +- Project [col#x] +- SubqueryAlias spark_catalog.default.st +- Relation spark_catalog.default.st[x#x,col#x] parquet @@ -806,8 +1435,9 @@ table st |> where st.col.i1 = 2 -- !query analysis Filter (col#x.i1 = 2) -+- SubqueryAlias spark_catalog.default.st - +- Relation spark_catalog.default.st[x#x,col#x] parquet ++- PipeOperator + +- SubqueryAlias spark_catalog.default.st + +- Relation spark_catalog.default.st[x#x,col#x] parquet -- !query @@ -821,8 +1451,9 @@ Filter exists#x [x#x] : +- Filter (outer(x#x) = a#x) : +- SubqueryAlias spark_catalog.default.other : +- Relation 
spark_catalog.default.other[a#x,b#x] json -+- SubqueryAlias spark_catalog.default.t - +- Relation spark_catalog.default.t[x#x,y#x] csv ++- PipeOperator + +- SubqueryAlias spark_catalog.default.t + +- Relation spark_catalog.default.t[x#x,y#x] csv -- !query @@ -836,8 +1467,9 @@ Filter (scalar-subquery#x [x#x] = 1) : +- Filter (outer(x#x) = a#x) : +- SubqueryAlias spark_catalog.default.other : +- Relation spark_catalog.default.other[a#x,b#x] json -+- SubqueryAlias spark_catalog.default.t - +- Relation spark_catalog.default.t[x#x,y#x] csv ++- PipeOperator + +- SubqueryAlias spark_catalog.default.t + +- Relation spark_catalog.default.t[x#x,y#x] csv -- !query @@ -939,7 +1571,7 @@ org.apache.spark.sql.catalyst.ExtendedAnalysisException "sqlState" : "42703", "messageParameters" : { "objectName" : "`y`", - "proposal" : "`x`, `z`" + "proposal" : "`z`, `spark_catalog`.`default`.`t`.`x`" }, "queryContext" : [ { "objectType" : "", @@ -951,6 +1583,78 @@ org.apache.spark.sql.catalyst.ExtendedAnalysisException } +-- !query +table t +|> select x, length(y) as z +|> limit 1000 +|> where x + length(y) < 4 +-- !query analysis +org.apache.spark.sql.catalyst.ExtendedAnalysisException +{ + "condition" : "UNRESOLVED_COLUMN.WITH_SUGGESTION", + "sqlState" : "42703", + "messageParameters" : { + "objectName" : "`y`", + "proposal" : "`z`, `spark_catalog`.`default`.`t`.`x`" + }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 71, + "stopIndex" : 71, + "fragment" : "y" + } ] +} + + +-- !query +table t +|> select x, length(y) as z +|> limit 1000 offset 1 +|> where x + length(y) < 4 +-- !query analysis +org.apache.spark.sql.catalyst.ExtendedAnalysisException +{ + "condition" : "UNRESOLVED_COLUMN.WITH_SUGGESTION", + "sqlState" : "42703", + "messageParameters" : { + "objectName" : "`y`", + "proposal" : "`z`, `spark_catalog`.`default`.`t`.`x`" + }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 80, + "stopIndex" : 80, + "fragment" : 
"y" + } ] +} + + +-- !query +table t +|> select x, length(y) as z +|> order by x, y +|> where x + length(y) < 4 +-- !query analysis +org.apache.spark.sql.catalyst.ExtendedAnalysisException +{ + "condition" : "UNRESOLVED_COLUMN.WITH_SUGGESTION", + "sqlState" : "42703", + "messageParameters" : { + "objectName" : "`y`", + "proposal" : "`z`, `spark_catalog`.`default`.`t`.`x`" + }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 52, + "stopIndex" : 52, + "fragment" : "y" + } ] +} + + -- !query (select x, sum(length(y)) as sum_len from t group by x) |> where sum(length(y)) = 3 @@ -961,7 +1665,7 @@ org.apache.spark.sql.catalyst.ExtendedAnalysisException "sqlState" : "42703", "messageParameters" : { "objectName" : "`y`", - "proposal" : "`x`, `sum_len`" + "proposal" : "`sum_len`, `spark_catalog`.`default`.`t`.`x`" }, "queryContext" : [ { "objectType" : "", @@ -1004,7 +1708,7 @@ table courseSales Project [c#x, __pivot_sum(e) AS s AS `sum(e) AS s`#x[0] AS firstYear_s#xL, __pivot_avg(e) AS a AS `avg(e) AS a`#x[0] AS firstYear_a#x, __pivot_sum(e) AS s AS `sum(e) AS s`#x[1] AS secondYear_s#xL, __pivot_avg(e) AS a AS `avg(e) AS a`#x[1] AS secondYear_a#x] +- Aggregate [c#x], [c#x, pivotfirst(y#x, sum(e) AS s#xL, 2012, 2013, 0, 0) AS __pivot_sum(e) AS s AS `sum(e) AS s`#x, pivotfirst(y#x, avg(e) AS a#x, 2012, 2013, 0, 0) AS __pivot_avg(e) AS a AS `avg(e) AS a`#x] +- Aggregate [c#x, y#x], [c#x, y#x, sum(e#x) AS sum(e) AS s#xL, avg(e#x) AS avg(e) AS a#x] - +- Project [pipeexpression(year#x, false, SELECT) AS y#x, pipeexpression(course#x, false, SELECT) AS c#x, pipeexpression(earnings#x, false, SELECT) AS e#x] + +- Project [year#x AS y#x, course#x AS c#x, earnings#x AS e#x] +- SubqueryAlias coursesales +- View (`courseSales`, [course#x, year#x, earnings#x]) +- Project [cast(course#x as string) AS course#x, cast(year#x as int) AS year#x, cast(earnings#x as int) AS earnings#x] @@ -1923,7 +2627,7 @@ table natural_join_test_t1 |> where k = "one" -- !query 
analysis Filter (k#x = one) -+- SubqueryAlias __auto_generated_subquery_name ++- PipeOperator +- Project [k#x, v1#x, v2#x] +- Join Inner, (k#x = k#x) :- SubqueryAlias natural_join_test_t1 @@ -2110,21 +2814,34 @@ Union false, false -- !query -values (0, 1) tab(x, y) +values (2, 'xyz') tab(x, y) |> union table t |> where x = 0 -- !query analysis -Distinct -+- Union false, false - :- Project [x#x, cast(y#x as bigint) AS y#xL] - : +- SubqueryAlias tab - : +- LocalRelation [x#x, y#x] - +- Project [x#x, cast(y#x as bigint) AS y#xL] - +- Filter (x#x = 0) +Filter (x#x = 0) ++- PipeOperator + +- Distinct + +- Union false, false + :- SubqueryAlias tab + : +- LocalRelation [x#x, y#x] +- SubqueryAlias spark_catalog.default.t +- Relation spark_catalog.default.t[x#x,y#x] csv +-- !query +values (2, 'xyz') tab(x, y) +|> union table t +|> drop x +-- !query analysis +Project [y#x] ++- Distinct + +- Union false, false + :- SubqueryAlias tab + : +- LocalRelation [x#x, y#x] + +- SubqueryAlias spark_catalog.default.t + +- Relation spark_catalog.default.t[x#x,y#x] csv + + -- !query (select * from t) |> union all (select * from t) @@ -2260,8 +2977,9 @@ table t |> order by x -- !query analysis Sort [x#x ASC NULLS FIRST], true -+- SubqueryAlias spark_catalog.default.t - +- Relation spark_catalog.default.t[x#x,y#x] csv ++- PipeOperator + +- SubqueryAlias spark_catalog.default.t + +- Relation spark_catalog.default.t[x#x,y#x] csv -- !query @@ -2269,7 +2987,7 @@ Sort [x#x ASC NULLS FIRST], true |> order by x -- !query analysis Sort [x#x ASC NULLS FIRST], true -+- SubqueryAlias __auto_generated_subquery_name ++- PipeOperator +- Project [x#x, y#x] +- SubqueryAlias spark_catalog.default.t +- Relation spark_catalog.default.t[x#x,y#x] csv @@ -2280,8 +2998,9 @@ values (0, 'abc') tab(x, y) |> order by x -- !query analysis Sort [x#x ASC NULLS FIRST], true -+- SubqueryAlias tab - +- LocalRelation [x#x, y#x] ++- PipeOperator + +- SubqueryAlias tab + +- LocalRelation [x#x, y#x] -- !query @@ -2291,10 
+3010,11 @@ table t -- !query analysis GlobalLimit 1 +- LocalLimit 1 - +- SubqueryAlias __auto_generated_subquery_name + +- PipeOperator +- Sort [x#x ASC NULLS FIRST], true - +- SubqueryAlias spark_catalog.default.t - +- Relation spark_catalog.default.t[x#x,y#x] csv + +- PipeOperator + +- SubqueryAlias spark_catalog.default.t + +- Relation spark_catalog.default.t[x#x,y#x] csv -- !query @@ -2306,11 +3026,12 @@ table t GlobalLimit 2 +- LocalLimit 2 +- Offset 1 - +- SubqueryAlias __auto_generated_subquery_name + +- PipeOperator +- Project [y#x] +- Filter (x#x = 1) - +- SubqueryAlias spark_catalog.default.t - +- Relation spark_catalog.default.t[x#x,y#x] csv + +- PipeOperator + +- SubqueryAlias spark_catalog.default.t + +- Relation spark_catalog.default.t[x#x,y#x] csv -- !query @@ -2320,11 +3041,12 @@ table t |> offset 1 -- !query analysis Offset 1 -+- SubqueryAlias __auto_generated_subquery_name ++- PipeOperator +- Project [y#x] +- Filter (x#x = 1) - +- SubqueryAlias spark_catalog.default.t - +- Relation spark_catalog.default.t[x#x,y#x] csv + +- PipeOperator + +- SubqueryAlias spark_catalog.default.t + +- Relation spark_catalog.default.t[x#x,y#x] csv -- !query @@ -2332,8 +3054,9 @@ table t |> limit all offset 0 -- !query analysis Offset 0 -+- SubqueryAlias spark_catalog.default.t - +- Relation spark_catalog.default.t[x#x,y#x] csv ++- PipeOperator + +- SubqueryAlias spark_catalog.default.t + +- Relation spark_catalog.default.t[x#x,y#x] csv -- !query @@ -2341,8 +3064,9 @@ table t |> distribute by x -- !query analysis RepartitionByExpression [x#x] -+- SubqueryAlias spark_catalog.default.t - +- Relation spark_catalog.default.t[x#x,y#x] csv ++- PipeOperator + +- SubqueryAlias spark_catalog.default.t + +- Relation spark_catalog.default.t[x#x,y#x] csv -- !query @@ -2351,8 +3075,9 @@ table t -- !query analysis Sort [x#x ASC NULLS FIRST], false +- RepartitionByExpression [x#x] - +- SubqueryAlias spark_catalog.default.t - +- Relation spark_catalog.default.t[x#x,y#x] csv + +- 
PipeOperator + +- SubqueryAlias spark_catalog.default.t + +- Relation spark_catalog.default.t[x#x,y#x] csv -- !query @@ -2361,8 +3086,9 @@ table t -- !query analysis RepartitionByExpression [x#x] +- Sort [x#x ASC NULLS FIRST], false - +- SubqueryAlias spark_catalog.default.t - +- Relation spark_catalog.default.t[x#x,y#x] csv + +- PipeOperator + +- SubqueryAlias spark_catalog.default.t + +- Relation spark_catalog.default.t[x#x,y#x] csv -- !query @@ -2372,8 +3098,9 @@ order by y -- !query analysis Sort [y#x ASC NULLS FIRST], true +- Sort [x#x DESC NULLS LAST], true - +- SubqueryAlias spark_catalog.default.t - +- Relation spark_catalog.default.t[x#x,y#x] csv + +- PipeOperator + +- SubqueryAlias spark_catalog.default.t + +- Relation spark_catalog.default.t[x#x,y#x] csv -- !query @@ -2482,7 +3209,7 @@ org.apache.spark.sql.catalyst.parser.ParseException table other |> aggregate sum(b) as result group by a -- !query analysis -Aggregate [a#x], [a#x, pipeexpression(sum(b#x), true, AGGREGATE) AS result#xL] +Aggregate [a#x], [a#x, sum(b#x) AS result#xL] +- SubqueryAlias spark_catalog.default.other +- Relation spark_catalog.default.other[a#x,b#x] json @@ -2493,7 +3220,7 @@ table other |> select result -- !query analysis Project [result#xL] -+- Aggregate [a#x], [a#x, pipeexpression(sum(b#x), true, AGGREGATE) AS result#xL] ++- Aggregate [a#x], [a#x, sum(b#x) AS result#xL] +- SubqueryAlias spark_catalog.default.other +- Relation spark_catalog.default.other[a#x,b#x] json @@ -2504,7 +3231,7 @@ table other |> select gkey -- !query analysis Project [gkey#x] -+- Aggregate [(a#x + 1)], [(a#x + 1) AS gkey#x, pipeexpression(sum(b#x), true, AGGREGATE) AS pipeexpression(sum(b))#xL] ++- Aggregate [(a#x + 1)], [(a#x + 1) AS gkey#x, sum(b#x) AS sum(b)#xL] +- SubqueryAlias spark_catalog.default.other +- Relation spark_catalog.default.other[a#x,b#x] json @@ -2522,16 +3249,106 @@ Aggregate [x#x, y#x], [x#x, y#x] select 3 as x, 4 as y |> aggregate group by 1, 2 -- !query analysis -Aggregate [1, 
2], [1 AS 1#x, 2 AS 2#x] +Aggregate [x#x, y#x], [x#x, y#x] ++- Project [3 AS x#x, 4 AS y#x] + +- OneRowRelation + + +-- !query +values (3, 4) as tab(x, y) +|> aggregate sum(y) group by 1 +-- !query analysis +Aggregate [x#x], [x#x, sum(y#x) AS sum(y)#xL] ++- SubqueryAlias tab + +- LocalRelation [x#x, y#x] + + +-- !query +values (3, 4), (5, 4) as tab(x, y) +|> aggregate sum(y) group by 1 +-- !query analysis +Aggregate [x#x], [x#x, sum(y#x) AS sum(y)#xL] ++- SubqueryAlias tab + +- LocalRelation [x#x, y#x] + + +-- !query +select 3 as x, 4 as y +|> aggregate sum(y) group by 1, 1 +-- !query analysis +Aggregate [x#x, x#x], [x#x, x#x, sum(y#x) AS sum(y)#xL] ++- Project [3 AS x#x, 4 AS y#x] + +- OneRowRelation + + +-- !query +select 1 as `1`, 2 as `2` +|> aggregate sum(`2`) group by `1` +-- !query analysis +Aggregate [1#x], [1#x, sum(2#x) AS sum(2)#xL] ++- Project [1 AS 1#x, 2 AS 2#x] + +- OneRowRelation + + +-- !query +select 3 as x, 4 as y +|> aggregate sum(y) group by 2 +-- !query analysis +Aggregate [y#x], [y#x, sum(y#x) AS sum(y)#xL] +- Project [3 AS x#x, 4 AS y#x] +- OneRowRelation +-- !query +select 3 as x, 4 as y, 5 as z +|> aggregate sum(y) group by 2 +-- !query analysis +Aggregate [y#x], [y#x, sum(y#x) AS sum(y)#xL] ++- Project [3 AS x#x, 4 AS y#x, 5 AS z#x] + +- OneRowRelation + + +-- !query +select 3 as x, 4 as y, 5 as z +|> aggregate sum(y) group by 3 +-- !query analysis +Aggregate [z#x], [z#x, sum(y#x) AS sum(y)#xL] ++- Project [3 AS x#x, 4 AS y#x, 5 AS z#x] + +- OneRowRelation + + +-- !query +select 3 as x, 4 as y, 5 as z +|> aggregate sum(y) group by 2, 3 +-- !query analysis +Aggregate [y#x, z#x], [y#x, z#x, sum(y#x) AS sum(y)#xL] ++- Project [3 AS x#x, 4 AS y#x, 5 AS z#x] + +- OneRowRelation + + +-- !query +select 3 as x, 4 as y, 5 as z +|> aggregate sum(y) group by 1, 2, 3 +-- !query analysis +Aggregate [x#x, y#x, z#x], [x#x, y#x, z#x, sum(y#x) AS sum(y)#xL] ++- Project [3 AS x#x, 4 AS y#x, 5 AS z#x] + +- OneRowRelation + + +-- !query +select 3 as x, 4 as 
y, 5 as z +|> aggregate sum(y) group by x, 2, 3 +-- !query analysis +Aggregate [x#x, y#x, z#x], [x#x, y#x, z#x, sum(y#x) AS sum(y)#xL] ++- Project [3 AS x#x, 4 AS y#x, 5 AS z#x] + +- OneRowRelation + + -- !query table t |> aggregate sum(x) -- !query analysis -Aggregate [pipeexpression(sum(x#x), true, AGGREGATE) AS pipeexpression(sum(x))#xL] +Aggregate [sum(x#x) AS sum(x)#xL] +- SubqueryAlias spark_catalog.default.t +- Relation spark_catalog.default.t[x#x,y#x] csv @@ -2540,7 +3357,7 @@ Aggregate [pipeexpression(sum(x#x), true, AGGREGATE) AS pipeexpression(sum(x))#x table t |> aggregate sum(x) + 1 as result_plus_one -- !query analysis -Aggregate [pipeexpression((sum(x#x) + cast(1 as bigint)), true, AGGREGATE) AS result_plus_one#xL] +Aggregate [(sum(x#x) + cast(1 as bigint)) AS result_plus_one#xL] +- SubqueryAlias spark_catalog.default.t +- Relation spark_catalog.default.t[x#x,y#x] csv @@ -2551,7 +3368,7 @@ table other |> where a = 1 -- !query analysis Filter (a#x = 1) -+- SubqueryAlias __auto_generated_subquery_name ++- PipeOperator +- Aggregate [a#x], [a#x] +- SubqueryAlias spark_catalog.default.other +- Relation spark_catalog.default.other[a#x,b#x] json @@ -2590,9 +3407,9 @@ select 1 x, 2 y, 3 z |> aggregate avg(z) z group by x |> aggregate count(distinct z) c -- !query analysis -Aggregate [pipeexpression(count(distinct z#x), true, AGGREGATE) AS c#xL] -+- Aggregate [x#x], [x#x, pipeexpression(avg(z#xL), true, AGGREGATE) AS z#x] - +- Aggregate [x#x, y#x], [x#x, y#x, pipeexpression(sum(z#x), true, AGGREGATE) AS z#xL] +Aggregate [count(distinct z#x) AS c#xL] ++- Aggregate [x#x], [x#x, avg(z#xL) AS z#x] + +- Aggregate [x#x, y#x], [x#x, y#x, sum(z#x) AS z#xL] +- Project [1 AS x#x, 2 AS y#x, 3 AS z#x] +- OneRowRelation @@ -2603,7 +3420,7 @@ select 1 x, 3 z |> select x -- !query analysis Project [x#x] -+- Aggregate [x#x, z#x, x#x], [x#x, z#x, x#x, pipeexpression(count(1), true, AGGREGATE) AS pipeexpression(count(1))#xL] ++- Aggregate [x#x, z#x, x#x], [x#x, z#x, x#x, 
count(1) AS count(1)#xL] +- Project [1 AS x#x, 3 AS z#x] +- OneRowRelation @@ -2612,7 +3429,7 @@ Project [x#x] table other |> aggregate a + count(b) group by a -- !query analysis -Aggregate [a#x], [a#x, pipeexpression((cast(a#x as bigint) + count(b#x)), true, AGGREGATE) AS pipeexpression((a + count(b)))#xL] +Aggregate [a#x], [a#x, (cast(a#x as bigint) + count(b#x)) AS (a + count(b))#xL] +- SubqueryAlias spark_catalog.default.other +- Relation spark_catalog.default.other[a#x,b#x] json @@ -2895,7 +3712,7 @@ org.apache.spark.sql.catalyst.parser.ParseException "condition" : "UNSUPPORTED_FEATURE.PIPE_OPERATOR_AGGREGATE_UNSUPPORTED_CASE", "sqlState" : "0A000", "messageParameters" : { - "case" : "window functions" + "case" : "window functions; please update the query to move the window functions to a subsequent |> SELECT operator instead" }, "queryContext" : [ { "objectType" : "", @@ -2991,8 +3808,8 @@ Project [cate#x, val#x, sum_val#xL, first_value(cate) OVER (ORDER BY val ASC NUL +- Window [first_value(cate#x, false) windowspecdefinition(val#x ASC NULLS FIRST, specifiedwindowframe(RangeFrame, unboundedpreceding$(), currentrow$())) AS first_value(cate) OVER (ORDER BY val ASC NULLS FIRST RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW)#x], [val#x ASC NULLS FIRST] +- Project [cate#x, val#x, sum_val#xL] +- Project [cate#x, val#x, sum_val#xL] - +- Project [cate#x, val#x, _we0#xL, pipeexpression(_we0#xL, false, SELECT) AS sum_val#xL] - +- Window [sum(val#x) windowspecdefinition(cate#x, specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS _we0#xL], [cate#x] + +- Project [cate#x, val#x, sum_val#xL, sum_val#xL] + +- Window [sum(val#x) windowspecdefinition(cate#x, specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS sum_val#xL], [cate#x] +- Project [cate#x, val#x] +- SubqueryAlias windowtestdata +- View (`windowTestData`, [val#x, val_long#xL, val_double#x, val_date#x, val_timestamp#x, cate#x]) @@ -3159,6 +3976,709 @@ 
org.apache.spark.sql.catalyst.ExtendedAnalysisException } +-- !query +with customer_total_return as +(select + sr_customer_sk as ctr_customer_sk, + sr_store_sk as ctr_store_sk, + sum(sr_return_amt) as ctr_total_return + from store_returns, date_dim + where sr_returned_date_sk = d_date_sk and d_year = 2000 + group by sr_customer_sk, sr_store_sk) +select c_customer_id +from customer_total_return ctr1, store, customer +where ctr1.ctr_total_return > + (select avg(ctr_total_return) * 1.2 + from customer_total_return ctr2 + where ctr1.ctr_store_sk = ctr2.ctr_store_sk) + and s_store_sk = ctr1.ctr_store_sk + and s_state = 'tn' + and ctr1.ctr_customer_sk = c_customer_sk +order by c_customer_id +limit 100 +-- !query analysis +org.apache.spark.sql.catalyst.ExtendedAnalysisException +{ + "condition" : "TABLE_OR_VIEW_NOT_FOUND", + "sqlState" : "42P01", + "messageParameters" : { + "relationName" : "`store_returns`" + }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 161, + "stopIndex" : 173, + "fragment" : "store_returns" + } ] +} + + +-- !query +with customer_total_return as + (from store_returns + |> join date_dim + |> where sr_returned_date_sk = d_date_sk and d_year = 2000 + |> aggregate sum(sr_return_amt) as ctr_total_return + group by sr_customer_sk as ctr_customer_sk, sr_store_sk as ctr_store_sk) +from customer_total_return ctr1 +|> join store +|> join customer +|> where ctr1.ctr_total_return > + (table customer_total_return + |> as ctr2 + |> where ctr1.ctr_store_sk = ctr2.ctr_store_sk + |> aggregate avg(ctr_total_return) * 1.2) + and s_store_sk = ctr1.ctr_store_sk + and s_state = 'tn' + and ctr1.ctr_customer_sk = c_customer_sk +|> order by c_customer_id +|> limit 100 +|> select c_customer_id +-- !query analysis +org.apache.spark.sql.catalyst.ExtendedAnalysisException +{ + "condition" : "TABLE_OR_VIEW_NOT_FOUND", + "sqlState" : "42P01", + "messageParameters" : { + "relationName" : "`store_returns`" + }, + "queryContext" : [ { + 
"objectType" : "", + "objectName" : "", + "startIndex" : 39, + "stopIndex" : 51, + "fragment" : "store_returns" + } ] +} + + +-- !query +with wscs as +( select + sold_date_sk, + sales_price + from (select + ws_sold_date_sk sold_date_sk, + ws_ext_sales_price sales_price + from web_sales) x + union all + (select + cs_sold_date_sk sold_date_sk, + cs_ext_sales_price sales_price + from catalog_sales)), + wswscs as + ( select + d_week_seq, + sum(case when (d_day_name = 'sunday') + then sales_price + else null end) + sun_sales, + sum(case when (d_day_name = 'monday') + then sales_price + else null end) + mon_sales, + sum(case when (d_day_name = 'tuesday') + then sales_price + else null end) + tue_sales, + sum(case when (d_day_name = 'wednesday') + then sales_price + else null end) + wed_sales, + sum(case when (d_day_name = 'thursday') + then sales_price + else null end) + thu_sales, + sum(case when (d_day_name = 'friday') + then sales_price + else null end) + fri_sales, + sum(case when (d_day_name = 'saturday') + then sales_price + else null end) + sat_sales + from wscs, date_dim + where d_date_sk = sold_date_sk + group by d_week_seq) +select + d_week_seq1, + round(sun_sales1 / sun_sales2, 2), + round(mon_sales1 / mon_sales2, 2), + round(tue_sales1 / tue_sales2, 2), + round(wed_sales1 / wed_sales2, 2), + round(thu_sales1 / thu_sales2, 2), + round(fri_sales1 / fri_sales2, 2), + round(sat_sales1 / sat_sales2, 2) +from + (select + wswscs.d_week_seq d_week_seq1, + sun_sales sun_sales1, + mon_sales mon_sales1, + tue_sales tue_sales1, + wed_sales wed_sales1, + thu_sales thu_sales1, + fri_sales fri_sales1, + sat_sales sat_sales1 + from wswscs, date_dim + where date_dim.d_week_seq = wswscs.d_week_seq and d_year = 2001) y, + (select + wswscs.d_week_seq d_week_seq2, + sun_sales sun_sales2, + mon_sales mon_sales2, + tue_sales tue_sales2, + wed_sales wed_sales2, + thu_sales thu_sales2, + fri_sales fri_sales2, + sat_sales sat_sales2 + from wswscs, date_dim + where date_dim.d_week_seq 
= wswscs.d_week_seq and d_year = 2001 + 1) z +where d_week_seq1 = d_week_seq2 - 53 +order by d_week_seq1 +-- !query analysis +org.apache.spark.sql.catalyst.ExtendedAnalysisException +{ + "condition" : "TABLE_OR_VIEW_NOT_FOUND", + "sqlState" : "42P01", + "messageParameters" : { + "relationName" : "`web_sales`" + }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 148, + "stopIndex" : 156, + "fragment" : "web_sales" + } ] +} + + +-- !query +with wscs as + (table web_sales + |> select + ws_sold_date_sk sold_date_sk, + ws_ext_sales_price sales_price + |> as x + |> union all ( + table catalog_sales + |> select + cs_sold_date_sk sold_date_sk, + cs_ext_sales_price sales_price) + |> select + sold_date_sk, + sales_price), +wswscs as + (table wscs + |> join date_dim + |> where d_date_sk = sold_date_sk + |> aggregate + sum(case when (d_day_name = 'sunday') + then sales_price + else null end) + sun_sales, + sum(case when (d_day_name = 'monday') + then sales_price + else null end) + mon_sales, + sum(case when (d_day_name = 'tuesday') + then sales_price + else null end) + tue_sales, + sum(case when (d_day_name = 'wednesday') + then sales_price + else null end) + wed_sales, + sum(case when (d_day_name = 'thursday') + then sales_price + else null end) + thu_sales, + sum(case when (d_day_name = 'friday') + then sales_price + else null end) + fri_sales, + sum(case when (d_day_name = 'saturday') + then sales_price + else null end) + sat_sales + group by d_week_seq) +table wswscs +|> join date_dim +|> where date_dim.d_week_seq = wswscs.d_week_seq AND d_year = 2001 +|> select + wswscs.d_week_seq d_week_seq1, + sun_sales sun_sales1, + mon_sales mon_sales1, + tue_sales tue_sales1, + wed_sales wed_sales1, + thu_sales thu_sales1, + fri_sales fri_sales1, + sat_sales sat_sales1 +|> as y +|> join ( + table wswscs + |> join date_dim + |> where date_dim.d_week_seq = wswscs.d_week_seq AND d_year = 2001 + 1 + |> select + wswscs.d_week_seq d_week_seq2, + sun_sales 
sun_sales2, + mon_sales mon_sales2, + tue_sales tue_sales2, + wed_sales wed_sales2, + thu_sales thu_sales2, + fri_sales fri_sales2, + sat_sales sat_sales2 + |> as z) +|> where d_week_seq1 = d_week_seq2 - 53 +|> order by d_week_seq1 +|> select + d_week_seq1, + round(sun_sales1 / sun_sales2, 2), + round(mon_sales1 / mon_sales2, 2), + round(tue_sales1 / tue_sales2, 2), + round(wed_sales1 / wed_sales2, 2), + round(thu_sales1 / thu_sales2, 2), + round(fri_sales1 / fri_sales2, 2), + round(sat_sales1 / sat_sales2, 2) +-- !query analysis +org.apache.spark.sql.catalyst.ExtendedAnalysisException +{ + "condition" : "TABLE_OR_VIEW_NOT_FOUND", + "sqlState" : "42P01", + "messageParameters" : { + "relationName" : "`web_sales`" + }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 23, + "stopIndex" : 31, + "fragment" : "web_sales" + } ] +} + + +-- !query +select + dt.d_year, + item.i_brand_id brand_id, + item.i_brand brand, + sum(ss_ext_sales_price) sum_agg +from date_dim dt, store_sales, item +where dt.d_date_sk = store_sales.ss_sold_date_sk + and store_sales.ss_item_sk = item.i_item_sk + and item.i_manufact_id = 128 + and dt.d_moy = 11 +group by dt.d_year, item.i_brand, item.i_brand_id +order by dt.d_year, sum_agg desc, brand_id +limit 100 +-- !query analysis +org.apache.spark.sql.catalyst.ExtendedAnalysisException +{ + "condition" : "TABLE_OR_VIEW_NOT_FOUND", + "sqlState" : "42P01", + "messageParameters" : { + "relationName" : "`date_dim`" + }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 110, + "stopIndex" : 117, + "fragment" : "date_dim" + } ] +} + + +-- !query +table date_dim +|> as dt +|> join store_sales +|> join item +|> where dt.d_date_sk = store_sales.ss_sold_date_sk + and store_sales.ss_item_sk = item.i_item_sk + and item.i_manufact_id = 128 + and dt.d_moy = 11 +|> aggregate sum(ss_ext_sales_price) sum_agg + group by dt.d_year d_year, item.i_brand_id brand_id, item.i_brand brand +|> order by d_year, 
sum_agg desc, brand_id +|> limit 100 +-- !query analysis +org.apache.spark.sql.catalyst.ExtendedAnalysisException +{ + "condition" : "TABLE_OR_VIEW_NOT_FOUND", + "sqlState" : "42P01", + "messageParameters" : { + "relationName" : "`date_dim`" + }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 7, + "stopIndex" : 14, + "fragment" : "date_dim" + } ] +} + + +-- !query +select + i_item_desc, + i_category, + i_class, + i_current_price, + sum(ws_ext_sales_price) as itemrevenue, + sum(ws_ext_sales_price) * 100 / sum(sum(ws_ext_sales_price)) + over + (partition by i_class) as revenueratio +from + web_sales, item, date_dim +where + ws_item_sk = i_item_sk + and i_category in ('sports', 'books', 'home') + and ws_sold_date_sk = d_date_sk + and d_date between cast('1999-02-22' as date) + and (cast('1999-02-22' as date) + interval 30 days) +group by + i_item_id, i_item_desc, i_category, i_class, i_current_price +order by + i_category, i_class, i_item_id, i_item_desc, revenueratio +limit 100 +-- !query analysis +org.apache.spark.sql.catalyst.ExtendedAnalysisException +{ + "condition" : "TABLE_OR_VIEW_NOT_FOUND", + "sqlState" : "42P01", + "messageParameters" : { + "relationName" : "`web_sales`" + }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 227, + "stopIndex" : 235, + "fragment" : "web_sales" + } ] +} + + +-- !query +table web_sales +|> join item +|> join date_dim +|> where ws_item_sk = i_item_sk + and i_category in ('sports', 'books', 'home') + and ws_sold_date_sk = d_date_sk + and d_date between cast('1999-02-22' as date) + and (cast('1999-02-22' as date) + interval 30 days) +|> aggregate sum(ws_ext_sales_price) AS itemrevenue + group by i_item_id, i_item_desc, i_category, i_class, i_current_price +|> extend + itemrevenue * 100 / sum(itemrevenue) + over (partition by i_class) as revenueratio +|> order by i_category, i_class, i_item_id, i_item_desc, revenueratio +|> select i_item_desc, i_category, i_class, 
i_current_price, itemrevenue, revenueratio +|> limit 100 +-- !query analysis +org.apache.spark.sql.catalyst.ExtendedAnalysisException +{ + "condition" : "TABLE_OR_VIEW_NOT_FOUND", + "sqlState" : "42P01", + "messageParameters" : { + "relationName" : "`web_sales`" + }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 7, + "stopIndex" : 15, + "fragment" : "web_sales" + } ] +} + + +-- !query +select + asceding.rnk, + i1.i_product_name best_performing, + i2.i_product_name worst_performing +from (select * +from (select + item_sk, + rank() + over ( + order by rank_col asc) rnk +from (select + ss_item_sk item_sk, + avg(ss_net_profit) rank_col +from store_sales ss1 +where ss_store_sk = 4 +group by ss_item_sk +having avg(ss_net_profit) > 0.9 * (select avg(ss_net_profit) rank_col +from store_sales +where ss_store_sk = 4 + and ss_addr_sk is null +group by ss_store_sk)) v1) v11 +where rnk < 11) asceding, + (select * + from (select + item_sk, + rank() + over ( + order by rank_col desc) rnk + from (select + ss_item_sk item_sk, + avg(ss_net_profit) rank_col + from store_sales ss1 + where ss_store_sk = 4 + group by ss_item_sk + having avg(ss_net_profit) > 0.9 * (select avg(ss_net_profit) rank_col + from store_sales + where ss_store_sk = 4 + and ss_addr_sk is null + group by ss_store_sk)) v2) v21 + where rnk < 11) descending, + item i1, item i2 +where asceding.rnk = descending.rnk + and i1.i_item_sk = asceding.item_sk + and i2.i_item_sk = descending.item_sk +order by asceding.rnk +limit 100 +-- !query analysis +org.apache.spark.sql.catalyst.ExtendedAnalysisException +{ + "condition" : "TABLE_OR_VIEW_NOT_FOUND", + "sqlState" : "42P01", + "messageParameters" : { + "relationName" : "`store_sales`" + }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 256, + "stopIndex" : 266, + "fragment" : "store_sales" + } ] +} + + +-- !query +from store_sales ss1 +|> where ss_store_sk = 4 +|> aggregate avg(ss_net_profit) rank_col + 
group by ss_item_sk as item_sk +|> where rank_col > 0.9 * ( + from store_sales + |> where ss_store_sk = 4 + and ss_addr_sk is null + |> aggregate avg(ss_net_profit) rank_col + group by ss_store_sk + |> select rank_col) +|> as v1 +|> select + item_sk, + rank() over ( + order by rank_col asc) rnk +|> as v11 +|> where rnk < 11 +|> as asceding +|> join ( + from store_sales ss1 + |> where ss_store_sk = 4 + |> aggregate avg(ss_net_profit) rank_col + group by ss_item_sk as item_sk + |> where rank_col > 0.9 * ( + table store_sales + |> where ss_store_sk = 4 + and ss_addr_sk is null + |> aggregate avg(ss_net_profit) rank_col + group by ss_store_sk + |> select rank_col) + |> as v2 + |> select + item_sk, + rank() over ( + order by rank_col asc) rnk + |> as v21 + |> where rnk < 11) descending +|> join item i1 +|> join item i2 +|> where asceding.rnk = descending.rnk + and i1.i_item_sk = asceding.item_sk + and i2.i_item_sk = descending.item_sk +|> order by asceding.rnk +|> select + asceding.rnk, + i1.i_product_name best_performing, + i2.i_product_name worst_performing +-- !query analysis +org.apache.spark.sql.catalyst.ExtendedAnalysisException +{ + "condition" : "TABLE_OR_VIEW_NOT_FOUND", + "sqlState" : "42P01", + "messageParameters" : { + "relationName" : "`store_sales`" + }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 6, + "stopIndex" : 16, + "fragment" : "store_sales" + } ] +} + + +-- !query +with web_v1 as ( + select + ws_item_sk item_sk, + d_date, + sum(sum(ws_sales_price)) + over (partition by ws_item_sk + order by d_date + rows between unbounded preceding and current row) cume_sales + from web_sales, date_dim + where ws_sold_date_sk = d_date_sk + and d_month_seq between 1200 and 1200 + 11 + and ws_item_sk is not null + group by ws_item_sk, d_date), + store_v1 as ( + select + ss_item_sk item_sk, + d_date, + sum(sum(ss_sales_price)) + over (partition by ss_item_sk + order by d_date + rows between unbounded preceding and current row) 
cume_sales + from store_sales, date_dim + where ss_sold_date_sk = d_date_sk + and d_month_seq between 1200 and 1200 + 11 + and ss_item_sk is not null + group by ss_item_sk, d_date) +select * +from (select + item_sk, + d_date, + web_sales, + store_sales, + max(web_sales) + over (partition by item_sk + order by d_date + rows between unbounded preceding and current row) web_cumulative, + max(store_sales) + over (partition by item_sk + order by d_date + rows between unbounded preceding and current row) store_cumulative +from (select + case when web.item_sk is not null + then web.item_sk + else store.item_sk end item_sk, + case when web.d_date is not null + then web.d_date + else store.d_date end d_date, + web.cume_sales web_sales, + store.cume_sales store_sales +from web_v1 web full outer join store_v1 store on (web.item_sk = store.item_sk + and web.d_date = store.d_date) + ) x) y +where web_cumulative > store_cumulative +order by item_sk, d_date +limit 100 +-- !query analysis +org.apache.spark.sql.catalyst.ExtendedAnalysisException +{ + "condition" : "TABLE_OR_VIEW_NOT_FOUND", + "sqlState" : "42P01", + "messageParameters" : { + "relationName" : "`web_sales`" + }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 222, + "stopIndex" : 230, + "fragment" : "web_sales" + } ] +} + + +-- !query +with web_v1 as ( + table web_sales + |> join date_dim + |> where ws_sold_date_sk = d_date_sk + and d_month_seq between 1200 and 1200 + 11 + and ws_item_sk is not null + |> aggregate sum(ws_sales_price) as sum_ws_sales_price + group by ws_item_sk as item_sk, d_date + |> extend sum(sum_ws_sales_price) + over (partition by item_sk + order by d_date + rows between unbounded preceding and current row) + as cume_sales), +store_v1 as ( + table store_sales + |> join date_dim + |> where ss_sold_date_sk = d_date_sk + and d_month_seq between 1200 and 1200 + 11 + and ss_item_sk is not null + |> aggregate sum(ss_sales_price) as sum_ss_sales_price + group by 
ss_item_sk as item_sk, d_date + |> extend sum(sum_ss_sales_price) + over (partition by item_sk + order by d_date + rows between unbounded preceding and current row) + as cume_sales) +table web_v1 +|> as web +|> full outer join store_v1 store + on (web.item_sk = store.item_sk and web.d_date = store.d_date) +|> select + case when web.item_sk is not null + then web.item_sk + else store.item_sk end item_sk, + case when web.d_date is not null + then web.d_date + else store.d_date end d_date, + web.cume_sales web_sales, + store.cume_sales store_sales +|> as x +|> select + item_sk, + d_date, + web_sales, + store_sales, + max(web_sales) + over (partition by item_sk + order by d_date + rows between unbounded preceding and current row) web_cumulative, + max(store_sales) + over (partition by item_sk + order by d_date + rows between unbounded preceding and current row) store_cumulative +|> as y +|> where web_cumulative > store_cumulative +|> order by item_sk, d_date +|> limit 100 +-- !query analysis +org.apache.spark.sql.catalyst.ExtendedAnalysisException +{ + "condition" : "TABLE_OR_VIEW_NOT_FOUND", + "sqlState" : "42P01", + "messageParameters" : { + "relationName" : "`web_sales`" + }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 26, + "stopIndex" : 34, + "fragment" : "web_sales" + } ] +} + + -- !query drop table t -- !query analysis diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/postgreSQL/join.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/postgreSQL/join.sql.out index 1892741aa4232..6f862b36f9de6 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/postgreSQL/join.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/postgreSQL/join.sql.out @@ -1326,7 +1326,7 @@ Aggregate [count(1) AS count(1)#xL] +- Filter unique1#x IN (list#x []) : +- Project [unique1#x] : +- Filter (unique2#x = 42) - : +- Project [unique1#x, unique2#x, two#x, four#x, ten#x, twenty#x, hundred#x, 
thousand#x, twothousand#x, fivethous#x, tenthous#x, odd#x, even#x, stringu1#x, stringu2#x, string4#x, unique2#x, two#x, four#x, ten#x, twenty#x, hundred#x, thousand#x, twothousand#x, ... 7 more fields] + : +- Project [unique1#x, unique2#x, two#x, four#x, ten#x, twenty#x, hundred#x, thousand#x, twothousand#x, fivethous#x, tenthous#x, odd#x, even#x, stringu1#x, stringu2#x, string4#x, unique2#x, two#x, four#x, ten#x, twenty#x, hundred#x, thousand#x, twothousand#x, fivethous#x, ... 6 more fields] : +- Join Inner, (unique1#x = unique1#x) : :- SubqueryAlias b : : +- SubqueryAlias spark_catalog.default.tenk1 diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/postgreSQL/window_part3.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/postgreSQL/window_part3.sql.out index 87831f7f30384..db223603c8fd9 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/postgreSQL/window_part3.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/postgreSQL/window_part3.sql.out @@ -99,7 +99,7 @@ WithCTE +- Window [sum(x#xL) windowspecdefinition(x#xL ASC NULLS FIRST, specifiedwindowframe(RowFrame, -1, 1)) AS sum(x) OVER (ORDER BY x ASC NULLS FIRST ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING)#xL], [x#xL ASC NULLS FIRST] +- Project [x#xL] +- SubqueryAlias cte - +- CTERelationRef xxxx, true, [x#xL], false + +- CTERelationRef xxxx, true, [x#xL], false, false -- !query @@ -121,7 +121,7 @@ WithCTE +- Window [sum(x#xL) windowspecdefinition(x#xL ASC NULLS FIRST, specifiedwindowframe(RangeFrame, cast(-1 as bigint), cast(1 as bigint))) AS sum(x) OVER (ORDER BY x ASC NULLS FIRST RANGE BETWEEN (- 1) FOLLOWING AND 1 FOLLOWING)#xL], [x#xL ASC NULLS FIRST] +- Project [x#xL] +- SubqueryAlias cte - +- CTERelationRef xxxx, true, [x#xL], false + +- CTERelationRef xxxx, true, [x#xL], false, false -- !query @@ -154,7 +154,7 @@ WithCTE +- Window [sum(x#xL) windowspecdefinition(x#xL ASC NULLS FIRST, specifiedwindowframe(RowFrame, -1, 1)) AS sum(x) OVER (ORDER 
BY x ASC NULLS FIRST ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING)#xL], [x#xL ASC NULLS FIRST] +- Project [x#xL] +- SubqueryAlias cte - +- CTERelationRef xxxx, true, [x#xL], false + +- CTERelationRef xxxx, true, [x#xL], false, false -- !query @@ -187,7 +187,7 @@ WithCTE +- Window [sum(x#xL) windowspecdefinition(x#xL ASC NULLS FIRST, specifiedwindowframe(RangeFrame, cast(-1 as bigint), cast(1 as bigint))) AS sum(x) OVER (ORDER BY x ASC NULLS FIRST RANGE BETWEEN (- 1) FOLLOWING AND 1 FOLLOWING)#xL], [x#xL ASC NULLS FIRST] +- Project [x#xL] +- SubqueryAlias cte - +- CTERelationRef xxxx, true, [x#xL], false + +- CTERelationRef xxxx, true, [x#xL], false, false -- !query diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/postgreSQL/with.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/postgreSQL/with.sql.out index 8582043c1a375..4a220f59ac52a 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/postgreSQL/with.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/postgreSQL/with.sql.out @@ -12,10 +12,10 @@ WithCTE +- Project [x#x, y#x, x#x, y#x] +- Join Inner :- SubqueryAlias q1 - : +- CTERelationRef xxxx, true, [x#x, y#x], false + : +- CTERelationRef xxxx, true, [x#x, y#x], false, false +- SubqueryAlias q2 +- SubqueryAlias q1 - +- CTERelationRef xxxx, true, [x#x, y#x], false + +- CTERelationRef xxxx, true, [x#x, y#x], false, false -- !query @@ -194,7 +194,7 @@ WithCTE +- SubqueryAlias q +- Project [foo#x] +- SubqueryAlias cte - +- CTERelationRef xxxx, true, [foo#x], false + +- CTERelationRef xxxx, true, [foo#x], false, false -- !query @@ -222,13 +222,13 @@ WithCTE : +- Union false, false : :- Project [2#x] : : +- SubqueryAlias innermost -: : +- CTERelationRef xxxx, true, [2#x], false +: : +- CTERelationRef xxxx, true, [2#x], false, false : +- Project [3 AS 3#x] : +- OneRowRelation +- Sort [x#x ASC NULLS FIRST], true +- Project [x#x] +- SubqueryAlias outermost - +- CTERelationRef xxxx, true, [x#x], false + +- 
CTERelationRef xxxx, true, [x#x], false, false -- !query @@ -418,7 +418,7 @@ WithCTE : +- OneRowRelation +- Project [x#x] +- SubqueryAlias ordinality - +- CTERelationRef xxxx, true, [x#x], false + +- CTERelationRef xxxx, true, [x#x], false, false -- !query @@ -459,7 +459,7 @@ InsertIntoHadoopFsRelationCommand file:[not included in comparison]/{warehouse_d : +- OneRowRelation +- Project [42#x] +- SubqueryAlias test - +- CTERelationRef xxxx, true, [42#x], false + +- CTERelationRef xxxx, true, [42#x], false, false -- !query diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/random.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/random.sql.out index 4b945238dddaa..c46708e5e8661 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/random.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/random.sql.out @@ -119,6 +119,18 @@ SELECT uniform(0, 10L, 0) AS result [Analyzer test output redacted due to nondeterminism] +-- !query +SELECT uniform(0, cast(10 as tinyint), 0) AS result +-- !query analysis +[Analyzer test output redacted due to nondeterminism] + + +-- !query +SELECT uniform(0, cast(10 as smallint), 0) AS result +-- !query analysis +[Analyzer test output redacted due to nondeterminism] + + -- !query SELECT uniform(0, 10S, 0) AS result -- !query analysis @@ -137,6 +149,30 @@ SELECT uniform(10.0F, 20.0F, 0) AS result [Analyzer test output redacted due to nondeterminism] +-- !query +SELECT uniform(cast(10 as decimal(10, 3)), cast(20 as decimal(10, 3)), 0) AS result +-- !query analysis +[Analyzer test output redacted due to nondeterminism] + + +-- !query +SELECT uniform(cast(10 as decimal(10, 3)), cast(20 as decimal(11, 4)), 0) AS result +-- !query analysis +[Analyzer test output redacted due to nondeterminism] + + +-- !query +SELECT uniform(10, cast(20 as decimal(10, 3)), 0) AS result +-- !query analysis +[Analyzer test output redacted due to nondeterminism] + + +-- !query +SELECT uniform(cast(10 as 
decimal(10, 3)), 20, 0) AS result +-- !query analysis +[Analyzer test output redacted due to nondeterminism] + + -- !query SELECT uniform(10.0D, 20.0D, CAST(3 / 7 AS LONG)) AS result -- !query analysis @@ -161,24 +197,108 @@ SELECT uniform(10, 20.0F) IS NOT NULL AS result [Analyzer test output redacted due to nondeterminism] +-- !query +SELECT uniform(-10L, 10L, 0) AS result +-- !query analysis +[Analyzer test output redacted due to nondeterminism] + + +-- !query +SELECT uniform(-20L, -10L, 0) AS result +-- !query analysis +[Analyzer test output redacted due to nondeterminism] + + +-- !query +SELECT uniform(-20L, -10L, -10) AS result +-- !query analysis +[Analyzer test output redacted due to nondeterminism] + + -- !query SELECT uniform(NULL, 1, 0) AS result -- !query analysis [Analyzer test output redacted due to nondeterminism] +-- !query +SELECT uniform(cast(NULL AS int), 1, 0) AS result +-- !query analysis +[Analyzer test output redacted due to nondeterminism] + + +-- !query +SELECT uniform(cast(NULL AS float), 1, 0) AS result +-- !query analysis +[Analyzer test output redacted due to nondeterminism] + + -- !query SELECT uniform(0, NULL, 0) AS result -- !query analysis [Analyzer test output redacted due to nondeterminism] +-- !query +SELECT uniform(0, cast(NULL AS int), 0) AS result +-- !query analysis +[Analyzer test output redacted due to nondeterminism] + + +-- !query +SELECT uniform(0, cast(NULL AS float), 0) AS result +-- !query analysis +[Analyzer test output redacted due to nondeterminism] + + -- !query SELECT uniform(0, 1, NULL) AS result -- !query analysis [Analyzer test output redacted due to nondeterminism] +-- !query +SELECT uniform(NULL, NULL, 0) AS result +-- !query analysis +[Analyzer test output redacted due to nondeterminism] + + +-- !query +SELECT uniform(NULL, NULL, NULL) AS result +-- !query analysis +[Analyzer test output redacted due to nondeterminism] + + +-- !query +SELECT uniform(0, 1, cast(NULL as int)) AS result +-- !query analysis 
+[Analyzer test output redacted due to nondeterminism] + + +-- !query +SELECT uniform(0, 1, cast(NULL as float)) AS result +-- !query analysis +org.apache.spark.sql.catalyst.ExtendedAnalysisException +{ + "condition" : "DATATYPE_MISMATCH.UNEXPECTED_INPUT_TYPE", + "sqlState" : "42K09", + "messageParameters" : { + "inputSql" : "\"CAST(NULL AS FLOAT)\"", + "inputType" : "\"FLOAT\"", + "paramIndex" : "third", + "requiredType" : "(\"INT\" or \"BIGINT\")", + "sqlExpr" : "\"uniform(0, 1, CAST(NULL AS FLOAT))\"" + }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 8, + "stopIndex" : 41, + "fragment" : "uniform(0, 1, cast(NULL as float))" + } ] +} + + -- !query SELECT uniform(10, 20, col) AS result FROM VALUES (0), (1), (2) tab(col) -- !query analysis @@ -272,161 +392,251 @@ org.apache.spark.sql.AnalysisException -- !query -SELECT randstr(1, 0) AS result --- !query analysis -[Analyzer test output redacted due to nondeterminism] - - --- !query -SELECT randstr(5, 0) AS result --- !query analysis -[Analyzer test output redacted due to nondeterminism] - - --- !query -SELECT randstr(10, 0) AS result --- !query analysis -[Analyzer test output redacted due to nondeterminism] - - --- !query -SELECT randstr(10S, 0) AS result +SELECT uniform(10.0F, 20.0F, 0.0F) AS result -- !query analysis -[Analyzer test output redacted due to nondeterminism] - - --- !query -SELECT randstr(10, 0) AS result FROM VALUES (0), (1), (2) tab(col) --- !query analysis -[Analyzer test output redacted due to nondeterminism] - - --- !query -SELECT randstr(10) IS NOT NULL AS result --- !query analysis -[Analyzer test output redacted due to nondeterminism] +org.apache.spark.sql.catalyst.ExtendedAnalysisException +{ + "condition" : "DATATYPE_MISMATCH.UNEXPECTED_INPUT_TYPE", + "sqlState" : "42K09", + "messageParameters" : { + "inputSql" : "\"0.0\"", + "inputType" : "\"FLOAT\"", + "paramIndex" : "third", + "requiredType" : "(\"INT\" or \"BIGINT\")", + "sqlExpr" : "\"uniform(10.0, 
20.0, 0.0)\"" + }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 8, + "stopIndex" : 34, + "fragment" : "uniform(10.0F, 20.0F, 0.0F)" + } ] +} -- !query -SELECT randstr(10L, 0) AS result +SELECT uniform(10.0F, 20.0F, 0.0D) AS result -- !query analysis org.apache.spark.sql.catalyst.ExtendedAnalysisException { "condition" : "DATATYPE_MISMATCH.UNEXPECTED_INPUT_TYPE", "sqlState" : "42K09", "messageParameters" : { - "inputSql" : "\"10\"", - "inputType" : "\"BIGINT\"", - "paramIndex" : "first", - "requiredType" : "INT or SMALLINT", - "sqlExpr" : "\"randstr(10, 0)\"" + "inputSql" : "\"0.0\"", + "inputType" : "\"DOUBLE\"", + "paramIndex" : "third", + "requiredType" : "(\"INT\" or \"BIGINT\")", + "sqlExpr" : "\"uniform(10.0, 20.0, 0.0)\"" }, "queryContext" : [ { "objectType" : "", "objectName" : "", "startIndex" : 8, - "stopIndex" : 22, - "fragment" : "randstr(10L, 0)" + "stopIndex" : 34, + "fragment" : "uniform(10.0F, 20.0F, 0.0D)" } ] } -- !query -SELECT randstr(10.0F, 0) AS result +SELECT uniform(cast(10 as decimal(10, 3)), cast(20 as decimal(10, 3)), cast(0 as decimal(10, 3))) -- !query analysis org.apache.spark.sql.catalyst.ExtendedAnalysisException { "condition" : "DATATYPE_MISMATCH.UNEXPECTED_INPUT_TYPE", "sqlState" : "42K09", "messageParameters" : { - "inputSql" : "\"10.0\"", - "inputType" : "\"FLOAT\"", - "paramIndex" : "first", - "requiredType" : "INT or SMALLINT", - "sqlExpr" : "\"randstr(10.0, 0)\"" + "inputSql" : "\"CAST(0 AS DECIMAL(10,3))\"", + "inputType" : "\"DECIMAL(10,3)\"", + "paramIndex" : "third", + "requiredType" : "(\"INT\" or \"BIGINT\")", + "sqlExpr" : "\"uniform(CAST(10 AS DECIMAL(10,3)), CAST(20 AS DECIMAL(10,3)), CAST(0 AS DECIMAL(10,3)))\"" }, "queryContext" : [ { "objectType" : "", "objectName" : "", "startIndex" : 8, - "stopIndex" : 24, - "fragment" : "randstr(10.0F, 0)" + "stopIndex" : 97, + "fragment" : "uniform(cast(10 as decimal(10, 3)), cast(20 as decimal(10, 3)), cast(0 as decimal(10, 3)))" } ] } -- 
!query -SELECT randstr(10.0D, 0) AS result +SELECT uniform('abc', 10, 0) AS result -- !query analysis org.apache.spark.sql.catalyst.ExtendedAnalysisException { "condition" : "DATATYPE_MISMATCH.UNEXPECTED_INPUT_TYPE", "sqlState" : "42K09", "messageParameters" : { - "inputSql" : "\"10.0\"", - "inputType" : "\"DOUBLE\"", + "inputSql" : "\"abc\"", + "inputType" : "\"STRING\"", "paramIndex" : "first", - "requiredType" : "INT or SMALLINT", - "sqlExpr" : "\"randstr(10.0, 0)\"" + "requiredType" : "\"NUMERIC\"", + "sqlExpr" : "\"uniform(abc, 10, 0)\"" }, "queryContext" : [ { "objectType" : "", "objectName" : "", "startIndex" : 8, - "stopIndex" : 24, - "fragment" : "randstr(10.0D, 0)" + "stopIndex" : 28, + "fragment" : "uniform('abc', 10, 0)" } ] } -- !query -SELECT randstr(NULL, 0) AS result +SELECT uniform(0, 'def', 0) AS result -- !query analysis org.apache.spark.sql.catalyst.ExtendedAnalysisException { "condition" : "DATATYPE_MISMATCH.UNEXPECTED_INPUT_TYPE", "sqlState" : "42K09", "messageParameters" : { - "inputSql" : "\"NULL\"", - "inputType" : "\"VOID\"", - "paramIndex" : "first", - "requiredType" : "INT or SMALLINT", - "sqlExpr" : "\"randstr(NULL, 0)\"" + "inputSql" : "\"def\"", + "inputType" : "\"STRING\"", + "paramIndex" : "second", + "requiredType" : "\"NUMERIC\"", + "sqlExpr" : "\"uniform(0, def, 0)\"" }, "queryContext" : [ { "objectType" : "", "objectName" : "", "startIndex" : 8, - "stopIndex" : 23, - "fragment" : "randstr(NULL, 0)" + "stopIndex" : 27, + "fragment" : "uniform(0, 'def', 0)" } ] } -- !query -SELECT randstr(0, NULL) AS result +SELECT uniform(0, 10, 'ghi') AS result -- !query analysis org.apache.spark.sql.catalyst.ExtendedAnalysisException { "condition" : "DATATYPE_MISMATCH.UNEXPECTED_INPUT_TYPE", "sqlState" : "42K09", "messageParameters" : { - "inputSql" : "\"NULL\"", - "inputType" : "\"VOID\"", - "paramIndex" : "second", - "requiredType" : "INT or SMALLINT", - "sqlExpr" : "\"randstr(0, NULL)\"" + "inputSql" : "\"ghi\"", + "inputType" : 
"\"STRING\"", + "paramIndex" : "third", + "requiredType" : "(\"INT\" or \"BIGINT\")", + "sqlExpr" : "\"uniform(0, 10, ghi)\"" }, "queryContext" : [ { "objectType" : "", "objectName" : "", "startIndex" : 8, - "stopIndex" : 23, - "fragment" : "randstr(0, NULL)" + "stopIndex" : 28, + "fragment" : "uniform(0, 10, 'ghi')" } ] } +-- !query +SELECT randstr(1, 0) AS result +-- !query analysis +[Analyzer test output redacted due to nondeterminism] + + +-- !query +SELECT randstr(5, 0) AS result +-- !query analysis +[Analyzer test output redacted due to nondeterminism] + + +-- !query +SELECT randstr(10, 0) AS result +-- !query analysis +[Analyzer test output redacted due to nondeterminism] + + +-- !query +SELECT randstr(10S, 0) AS result +-- !query analysis +[Analyzer test output redacted due to nondeterminism] + + +-- !query +SELECT randstr(CAST(10 AS TINYINT), 0) AS result +-- !query analysis +[Analyzer test output redacted due to nondeterminism] + + +-- !query +SELECT randstr(CAST(10 AS BIGINT), 0) AS result +-- !query analysis +[Analyzer test output redacted due to nondeterminism] + + +-- !query +SELECT randstr(1.0F, 0) AS result +-- !query analysis +[Analyzer test output redacted due to nondeterminism] + + +-- !query +SELECT randstr(1.0D, 0) AS result +-- !query analysis +[Analyzer test output redacted due to nondeterminism] + + +-- !query +SELECT randstr(cast(1 AS DECIMAL(10, 2)), 0) AS result +-- !query analysis +[Analyzer test output redacted due to nondeterminism] + + +-- !query +SELECT randstr(10, 0) AS result FROM VALUES (0), (1), (2) tab(col) +-- !query analysis +[Analyzer test output redacted due to nondeterminism] + + +-- !query +SELECT randstr(10) IS NOT NULL AS result +-- !query analysis +[Analyzer test output redacted due to nondeterminism] + + +-- !query +SELECT randstr(1, -1) AS result +-- !query analysis +[Analyzer test output redacted due to nondeterminism] + + +-- !query +SELECT randstr(10L, 0) AS result +-- !query analysis +[Analyzer test output 
redacted due to nondeterminism] + + +-- !query +SELECT randstr(10.0F, 0) AS result +-- !query analysis +[Analyzer test output redacted due to nondeterminism] + + +-- !query +SELECT randstr(10.0D, 0) AS result +-- !query analysis +[Analyzer test output redacted due to nondeterminism] + + +-- !query +SELECT randstr(NULL, 0) AS result +-- !query analysis +[Analyzer test output redacted due to nondeterminism] + + +-- !query +SELECT randstr(0, NULL) AS result +-- !query analysis +[Analyzer test output redacted due to nondeterminism] + + -- !query SELECT randstr(col, 0) AS result FROM VALUES (0), (1), (2) tab(col) -- !query analysis @@ -437,7 +647,7 @@ org.apache.spark.sql.catalyst.ExtendedAnalysisException "messageParameters" : { "inputExpr" : "\"col\"", "inputName" : "`length`", - "inputType" : "INT or SMALLINT", + "inputType" : "integer", "sqlExpr" : "\"randstr(col, 0)\"" }, "queryContext" : [ { @@ -460,7 +670,7 @@ org.apache.spark.sql.catalyst.ExtendedAnalysisException "messageParameters" : { "inputExpr" : "\"col\"", "inputName" : "`seed`", - "inputType" : "INT or SMALLINT", + "inputType" : "integer", "sqlExpr" : "\"randstr(10, col)\"" }, "queryContext" : [ { @@ -494,3 +704,57 @@ org.apache.spark.sql.AnalysisException "fragment" : "randstr(10, 0, 1)" } ] } + + +-- !query +SELECT randstr(-1, 0) AS result +-- !query analysis +[Analyzer test output redacted due to nondeterminism] + + +-- !query +SELECT randstr(10, "a") AS result FROM VALUES (0) tab(a) +-- !query analysis +org.apache.spark.sql.catalyst.ExtendedAnalysisException +{ + "condition" : "DATATYPE_MISMATCH.UNEXPECTED_INPUT_TYPE", + "sqlState" : "42K09", + "messageParameters" : { + "inputSql" : "\"a\"", + "inputType" : "\"STRING\"", + "paramIndex" : "second", + "requiredType" : "(\"INT\" or \"BIGINT\")", + "sqlExpr" : "\"randstr(10, a)\"" + }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 8, + "stopIndex" : 23, + "fragment" : "randstr(10, \"a\")" + } ] +} + + +-- !query +SELECT 
randstr(10, 1.5) AS result FROM VALUES (0) tab(a) +-- !query analysis +org.apache.spark.sql.catalyst.ExtendedAnalysisException +{ + "condition" : "DATATYPE_MISMATCH.UNEXPECTED_INPUT_TYPE", + "sqlState" : "42K09", + "messageParameters" : { + "inputSql" : "\"1.5\"", + "inputType" : "\"DECIMAL(2,1)\"", + "paramIndex" : "second", + "requiredType" : "(\"INT\" or \"BIGINT\")", + "sqlExpr" : "\"randstr(10, 1.5)\"" + }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 8, + "stopIndex" : 23, + "fragment" : "randstr(10, 1.5)" + } ] +} diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/show-tables.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/show-tables.sql.out index bb1d695c4e546..fb985d6b6b0ca 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/show-tables.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/show-tables.sql.out @@ -166,10 +166,11 @@ SHOW TABLE EXTENDED LIKE 'show_t1' PARTITION(a='Us', d=1) -- !query analysis org.apache.spark.sql.AnalysisException { - "condition" : "_LEGACY_ERROR_TEMP_1231", + "condition" : "PARTITIONS_NOT_FOUND", + "sqlState" : "428FT", "messageParameters" : { - "key" : "a", - "tblName" : "`spark_catalog`.`showdb`.`show_t1`" + "partitionList" : "`a`", + "tableName" : "`spark_catalog`.`showdb`.`show_t1`" } } diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/sql-on-files.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/sql-on-files.sql.out index 441034ea65e9f..a70bdc9f05214 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/sql-on-files.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/sql-on-files.sql.out @@ -231,3 +231,25 @@ DROP DATABASE sql_on_files -- !query analysis DropNamespace false, false +- ResolvedNamespace V2SessionCatalog(spark_catalog), [sql_on_files] + + +-- !query +SELECT * FROM 
json.`https://raw.githubusercontent.com/apache/spark/refs/heads/master/examples/src/main/resources/employees.json` +-- !query analysis +org.apache.spark.sql.AnalysisException +{ + "condition" : "FAILED_READ_FILE.UNSUPPORTED_FILE_SYSTEM", + "sqlState" : "KD001", + "messageParameters" : { + "fileSystemClass" : "org.apache.hadoop.fs.http.HttpsFileSystem", + "method" : "listStatus", + "path" : "https://raw.githubusercontent.com/apache/spark/refs/heads/master/examples/src/main/resources/employees.json" + }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 15, + "stopIndex" : 128, + "fragment" : "json.`https://raw.githubusercontent.com/apache/spark/refs/heads/master/examples/src/main/resources/employees.json`" + } ] +} diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/sql-session-variables.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/sql-session-variables.sql.out index add7e79a98993..a18a889821500 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/sql-session-variables.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/sql-session-variables.sql.out @@ -2050,7 +2050,7 @@ WithCTE : +- OneRowRelation +- Project [c1#x AS 1#x] +- SubqueryAlias v1 - +- CTERelationRef xxxx, true, [c1#x], false + +- CTERelationRef xxxx, true, [c1#x], false, false -- !query diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/sql-udf.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/sql-udf.sql.out new file mode 100644 index 0000000000000..3316642de1f8b --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/sql-udf.sql.out @@ -0,0 +1,575 @@ +-- Automatically generated by SQLQueryTestSuite +-- !query +CREATE FUNCTION foo1a0() RETURNS INT RETURN 1 +-- !query analysis +org.apache.spark.sql.catalyst.analysis.FunctionAlreadyExistsException +{ + "condition" : "ROUTINE_ALREADY_EXISTS", + "sqlState" : "42723", + "messageParameters" : { + 
"existingRoutineType" : "routine", + "newRoutineType" : "routine", + "routineName" : "`default`.`foo1a0`" + } +} + + +-- !query +SELECT foo1a0() +-- !query analysis +Project [spark_catalog.default.foo1a0() AS spark_catalog.default.foo1a0()#x] ++- Project + +- OneRowRelation + + +-- !query +SELECT foo1a0(1) +-- !query analysis +org.apache.spark.sql.AnalysisException +{ + "condition" : "WRONG_NUM_ARGS.WITHOUT_SUGGESTION", + "sqlState" : "42605", + "messageParameters" : { + "actualNum" : "1", + "docroot" : "https://spark.apache.org/docs/latest", + "expectedNum" : "0", + "functionName" : "`spark_catalog`.`default`.`foo1a0`" + }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 8, + "stopIndex" : 16, + "fragment" : "foo1a0(1)" + } ] +} + + +-- !query +CREATE FUNCTION foo1a1(a INT) RETURNS INT RETURN 1 +-- !query analysis +org.apache.spark.sql.catalyst.analysis.FunctionAlreadyExistsException +{ + "condition" : "ROUTINE_ALREADY_EXISTS", + "sqlState" : "42723", + "messageParameters" : { + "existingRoutineType" : "routine", + "newRoutineType" : "routine", + "routineName" : "`default`.`foo1a1`" + } +} + + +-- !query +SELECT foo1a1(1) +-- !query analysis +Project [spark_catalog.default.foo1a1(a#x) AS spark_catalog.default.foo1a1(1)#x] ++- Project [cast(1 as int) AS a#x] + +- OneRowRelation + + +-- !query +SELECT foo1a1(1, 2) +-- !query analysis +org.apache.spark.sql.AnalysisException +{ + "condition" : "WRONG_NUM_ARGS.WITHOUT_SUGGESTION", + "sqlState" : "42605", + "messageParameters" : { + "actualNum" : "2", + "docroot" : "https://spark.apache.org/docs/latest", + "expectedNum" : "1", + "functionName" : "`spark_catalog`.`default`.`foo1a1`" + }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 8, + "stopIndex" : 19, + "fragment" : "foo1a1(1, 2)" + } ] +} + + +-- !query +CREATE FUNCTION foo1a2(a INT, b INT, c INT, d INT) RETURNS INT RETURN 1 +-- !query analysis 
+org.apache.spark.sql.catalyst.analysis.FunctionAlreadyExistsException +{ + "condition" : "ROUTINE_ALREADY_EXISTS", + "sqlState" : "42723", + "messageParameters" : { + "existingRoutineType" : "routine", + "newRoutineType" : "routine", + "routineName" : "`default`.`foo1a2`" + } +} + + +-- !query +SELECT foo1a2(1, 2, 3, 4) +-- !query analysis +Project [spark_catalog.default.foo1a2(a#x, b#x, c#x, d#x) AS spark_catalog.default.foo1a2(1, 2, 3, 4)#x] ++- Project [cast(1 as int) AS a#x, cast(2 as int) AS b#x, cast(3 as int) AS c#x, cast(4 as int) AS d#x] + +- OneRowRelation + + +-- !query +CREATE FUNCTION foo2_1a(a INT) RETURNS INT RETURN a +-- !query analysis +org.apache.spark.sql.catalyst.analysis.FunctionAlreadyExistsException +{ + "condition" : "ROUTINE_ALREADY_EXISTS", + "sqlState" : "42723", + "messageParameters" : { + "existingRoutineType" : "routine", + "newRoutineType" : "routine", + "routineName" : "`default`.`foo2_1a`" + } +} + + +-- !query +SELECT foo2_1a(5) +-- !query analysis +Project [spark_catalog.default.foo2_1a(a#x) AS spark_catalog.default.foo2_1a(5)#x] ++- Project [cast(5 as int) AS a#x] + +- OneRowRelation + + +-- !query +CREATE FUNCTION foo2_1b(a INT, b INT) RETURNS INT RETURN a + b +-- !query analysis +org.apache.spark.sql.catalyst.analysis.FunctionAlreadyExistsException +{ + "condition" : "ROUTINE_ALREADY_EXISTS", + "sqlState" : "42723", + "messageParameters" : { + "existingRoutineType" : "routine", + "newRoutineType" : "routine", + "routineName" : "`default`.`foo2_1b`" + } +} + + +-- !query +SELECT foo2_1b(5, 6) +-- !query analysis +Project [spark_catalog.default.foo2_1b(a#x, b#x) AS spark_catalog.default.foo2_1b(5, 6)#x] ++- Project [cast(5 as int) AS a#x, cast(6 as int) AS b#x] + +- OneRowRelation + + +-- !query +CREATE FUNCTION foo2_1c(a INT, b INT) RETURNS INT RETURN 10 * (a + b) + 100 * (a -b) +-- !query analysis +org.apache.spark.sql.catalyst.analysis.FunctionAlreadyExistsException +{ + "condition" : "ROUTINE_ALREADY_EXISTS", + "sqlState" : 
"42723", + "messageParameters" : { + "existingRoutineType" : "routine", + "newRoutineType" : "routine", + "routineName" : "`default`.`foo2_1c`" + } +} + + +-- !query +SELECT foo2_1c(5, 6) +-- !query analysis +Project [spark_catalog.default.foo2_1c(a#x, b#x) AS spark_catalog.default.foo2_1c(5, 6)#x] ++- Project [cast(5 as int) AS a#x, cast(6 as int) AS b#x] + +- OneRowRelation + + +-- !query +CREATE FUNCTION foo2_1d(a INT, b INT) RETURNS INT RETURN ABS(a) - LENGTH(CAST(b AS VARCHAR(10))) +-- !query analysis +org.apache.spark.sql.catalyst.analysis.FunctionAlreadyExistsException +{ + "condition" : "ROUTINE_ALREADY_EXISTS", + "sqlState" : "42723", + "messageParameters" : { + "existingRoutineType" : "routine", + "newRoutineType" : "routine", + "routineName" : "`default`.`foo2_1d`" + } +} + + +-- !query +SELECT foo2_1d(-5, 6) +-- !query analysis +Project [spark_catalog.default.foo2_1d(a#x, b#x) AS spark_catalog.default.foo2_1d(-5, 6)#x] ++- Project [cast(-5 as int) AS a#x, cast(6 as int) AS b#x] + +- OneRowRelation + + +-- !query +CREATE FUNCTION foo2_2a(a INT) RETURNS INT RETURN SELECT a +-- !query analysis +org.apache.spark.sql.catalyst.analysis.FunctionAlreadyExistsException +{ + "condition" : "ROUTINE_ALREADY_EXISTS", + "sqlState" : "42723", + "messageParameters" : { + "existingRoutineType" : "routine", + "newRoutineType" : "routine", + "routineName" : "`default`.`foo2_2a`" + } +} + + +-- !query +SELECT foo2_2a(5) +-- !query analysis +Project [spark_catalog.default.foo2_2a(a#x) AS spark_catalog.default.foo2_2a(5)#x] ++- Project [cast(5 as int) AS a#x] + +- OneRowRelation + + +-- !query +CREATE FUNCTION foo2_2b(a INT) RETURNS INT RETURN 1 + (SELECT a) +-- !query analysis +org.apache.spark.sql.catalyst.analysis.FunctionAlreadyExistsException +{ + "condition" : "ROUTINE_ALREADY_EXISTS", + "sqlState" : "42723", + "messageParameters" : { + "existingRoutineType" : "routine", + "newRoutineType" : "routine", + "routineName" : "`default`.`foo2_2b`" + } +} + + +-- !query 
+SELECT foo2_2b(5) +-- !query analysis +Project [spark_catalog.default.foo2_2b(a#x) AS spark_catalog.default.foo2_2b(5)#x] +: +- Project [outer(a#x)] +: +- OneRowRelation ++- Project [cast(5 as int) AS a#x] + +- OneRowRelation + + +-- !query +CREATE FUNCTION foo2_2c(a INT) RETURNS INT RETURN 1 + (SELECT (SELECT a)) +-- !query analysis +org.apache.spark.sql.catalyst.ExtendedAnalysisException +{ + "condition" : "UNRESOLVED_COLUMN.WITHOUT_SUGGESTION", + "sqlState" : "42703", + "messageParameters" : { + "objectName" : "`a`" + }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 21, + "stopIndex" : 21, + "fragment" : "a" + } ] +} + + +-- !query +CREATE FUNCTION foo2_2d(a INT) RETURNS INT RETURN 1 + (SELECT (SELECT (SELECT (SELECT a)))) +-- !query analysis +org.apache.spark.sql.catalyst.ExtendedAnalysisException +{ + "condition" : "UNRESOLVED_COLUMN.WITHOUT_SUGGESTION", + "sqlState" : "42703", + "messageParameters" : { + "objectName" : "`a`" + }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 37, + "stopIndex" : 37, + "fragment" : "a" + } ] +} + + +-- !query +CREATE FUNCTION foo2_2e(a INT) RETURNS INT RETURN +SELECT a FROM (VALUES 1) AS V(c1) WHERE c1 = 2 +UNION ALL +SELECT a + 1 FROM (VALUES 1) AS V(c1) +-- !query analysis +org.apache.spark.sql.catalyst.analysis.FunctionAlreadyExistsException +{ + "condition" : "ROUTINE_ALREADY_EXISTS", + "sqlState" : "42723", + "messageParameters" : { + "existingRoutineType" : "routine", + "newRoutineType" : "routine", + "routineName" : "`default`.`foo2_2e`" + } +} + + +-- !query +CREATE FUNCTION foo2_2f(a INT) RETURNS INT RETURN +SELECT a FROM (VALUES 1) AS V(c1) +EXCEPT +SELECT a + 1 FROM (VALUES 1) AS V(a) +-- !query analysis +org.apache.spark.sql.catalyst.analysis.FunctionAlreadyExistsException +{ + "condition" : "ROUTINE_ALREADY_EXISTS", + "sqlState" : "42723", + "messageParameters" : { + "existingRoutineType" : "routine", + "newRoutineType" : "routine", + 
"routineName" : "`default`.`foo2_2f`" + } +} + + +-- !query +CREATE FUNCTION foo2_2g(a INT) RETURNS INT RETURN +SELECT a FROM (VALUES 1) AS V(c1) +INTERSECT +SELECT a FROM (VALUES 1) AS V(a) +-- !query analysis +org.apache.spark.sql.catalyst.analysis.FunctionAlreadyExistsException +{ + "condition" : "ROUTINE_ALREADY_EXISTS", + "sqlState" : "42723", + "messageParameters" : { + "existingRoutineType" : "routine", + "newRoutineType" : "routine", + "routineName" : "`default`.`foo2_2g`" + } +} + + +-- !query +DROP TABLE IF EXISTS t1 +-- !query analysis +DropTable true, false ++- ResolvedIdentifier V2SessionCatalog(spark_catalog), default.t1 + + +-- !query +DROP TABLE IF EXISTS t2 +-- !query analysis +DropTable true, false ++- ResolvedIdentifier V2SessionCatalog(spark_catalog), default.t2 + + +-- !query +DROP TABLE IF EXISTS ts +-- !query analysis +DropTable true, false ++- ResolvedIdentifier V2SessionCatalog(spark_catalog), default.ts + + +-- !query +DROP TABLE IF EXISTS tm +-- !query analysis +DropTable true, false ++- ResolvedIdentifier V2SessionCatalog(spark_catalog), default.tm + + +-- !query +DROP TABLE IF EXISTS ta +-- !query analysis +DropTable true, false ++- ResolvedIdentifier V2SessionCatalog(spark_catalog), default.ta + + +-- !query +DROP TABLE IF EXISTS V1 +-- !query analysis +DropTable true, false ++- ResolvedIdentifier V2SessionCatalog(spark_catalog), default.V1 + + +-- !query +DROP TABLE IF EXISTS V2 +-- !query analysis +DropTable true, false ++- ResolvedIdentifier V2SessionCatalog(spark_catalog), default.V2 + + +-- !query +DROP VIEW IF EXISTS t1 +-- !query analysis +DropTableCommand `spark_catalog`.`default`.`t1`, true, true, false + + +-- !query +DROP VIEW IF EXISTS t2 +-- !query analysis +DropTableCommand `spark_catalog`.`default`.`t2`, true, true, false + + +-- !query +DROP VIEW IF EXISTS ts +-- !query analysis +DropTableCommand `spark_catalog`.`default`.`ts`, true, true, false + + +-- !query +DROP VIEW IF EXISTS tm +-- !query analysis 
+DropTableCommand `spark_catalog`.`default`.`tm`, true, true, false + + +-- !query +DROP VIEW IF EXISTS ta +-- !query analysis +DropTableCommand `spark_catalog`.`default`.`ta`, true, true, false + + +-- !query +DROP VIEW IF EXISTS V1 +-- !query analysis +DropTableCommand `spark_catalog`.`default`.`V1`, true, true, false + + +-- !query +DROP VIEW IF EXISTS V2 +-- !query analysis +DropTableCommand `spark_catalog`.`default`.`V2`, true, true, false + + +-- !query +CREATE FUNCTION foo2_3(a INT, b INT) RETURNS INT RETURN a + b +-- !query analysis +org.apache.spark.sql.catalyst.analysis.FunctionAlreadyExistsException +{ + "condition" : "ROUTINE_ALREADY_EXISTS", + "sqlState" : "42723", + "messageParameters" : { + "existingRoutineType" : "routine", + "newRoutineType" : "routine", + "routineName" : "`default`.`foo2_3`" + } +} + + +-- !query +CREATE VIEW V1(c1, c2) AS VALUES (1, 2), (3, 4), (5, 6) +-- !query analysis +CreateViewCommand `spark_catalog`.`default`.`V1`, [(c1,None), (c2,None)], VALUES (1, 2), (3, 4), (5, 6), false, false, PersistedView, COMPENSATION, true + +- LocalRelation [col1#x, col2#x] + + +-- !query +CREATE VIEW V2(c1, c2) AS VALUES (-1, -2), (-3, -4), (-5, -6) +-- !query analysis +CreateViewCommand `spark_catalog`.`default`.`V2`, [(c1,None), (c2,None)], VALUES (-1, -2), (-3, -4), (-5, -6), false, false, PersistedView, COMPENSATION, true + +- LocalRelation [col1#x, col2#x] + + +-- !query +SELECT foo2_3(c1, c2), foo2_3(c2, 1), foo2_3(c1, c2) - foo2_3(c2, c1 - 1) FROM V1 ORDER BY 1, 2, 3 +-- !query analysis +Sort [spark_catalog.default.foo2_3(c1, c2)#x ASC NULLS FIRST, spark_catalog.default.foo2_3(c2, 1)#x ASC NULLS FIRST, (spark_catalog.default.foo2_3(c1, c2) - spark_catalog.default.foo2_3(c2, (c1 - 1)))#x ASC NULLS FIRST], true ++- Project [spark_catalog.default.foo2_3(a#x, b#x) AS spark_catalog.default.foo2_3(c1, c2)#x, spark_catalog.default.foo2_3(a#x, b#x) AS spark_catalog.default.foo2_3(c2, 1)#x, (spark_catalog.default.foo2_3(a#x, b#x) - 
spark_catalog.default.foo2_3(a#x, b#x)) AS (spark_catalog.default.foo2_3(c1, c2) - spark_catalog.default.foo2_3(c2, (c1 - 1)))#x] + +- Project [c1#x, c2#x, cast(c1#x as int) AS a#x, cast(c2#x as int) AS b#x, cast(c2#x as int) AS a#x, cast(1 as int) AS b#x, cast(c1#x as int) AS a#x, cast(c2#x as int) AS b#x, cast(c2#x as int) AS a#x, cast((c1#x - 1) as int) AS b#x] + +- SubqueryAlias spark_catalog.default.v1 + +- View (`spark_catalog`.`default`.`v1`, [c1#x, c2#x]) + +- Project [cast(col1#x as int) AS c1#x, cast(col2#x as int) AS c2#x] + +- LocalRelation [col1#x, col2#x] + + +-- !query +SELECT * FROM V1 WHERE foo2_3(c1, 0) = c1 AND foo2_3(c1, c2) < 8 +-- !query analysis +Project [c1#x, c2#x] ++- Project [c1#x, c2#x] + +- Filter ((spark_catalog.default.foo2_3(a#x, b#x) = c1#x) AND (spark_catalog.default.foo2_3(a#x, b#x) < 8)) + +- Project [c1#x, c2#x, cast(c1#x as int) AS a#x, cast(0 as int) AS b#x, cast(c1#x as int) AS a#x, cast(c2#x as int) AS b#x] + +- SubqueryAlias spark_catalog.default.v1 + +- View (`spark_catalog`.`default`.`v1`, [c1#x, c2#x]) + +- Project [cast(col1#x as int) AS c1#x, cast(col2#x as int) AS c2#x] + +- LocalRelation [col1#x, col2#x] + + +-- !query +SELECT foo2_3(SUM(c1), SUM(c2)), SUM(c1) + SUM(c2), SUM(foo2_3(c1, c2) + foo2_3(c2, c1) - foo2_3(c2, c1)) +FROM V1 +-- !query analysis +Project [spark_catalog.default.foo2_3(a#x, b#x) AS spark_catalog.default.foo2_3(sum(c1), sum(c2))#x, (sum(c1) + sum(c2))#xL, sum(((spark_catalog.default.foo2_3(c1, c2) + spark_catalog.default.foo2_3(c2, c1)) - spark_catalog.default.foo2_3(c2, c1)))#xL] ++- Project [sum(c1)#xL, sum(c2)#xL, (sum(c1) + sum(c2))#xL, sum(((spark_catalog.default.foo2_3(c1, c2) + spark_catalog.default.foo2_3(c2, c1)) - spark_catalog.default.foo2_3(c2, c1)))#xL, cast(sum(c1)#xL as int) AS a#x, cast(sum(c2)#xL as int) AS b#x] + +- Aggregate [sum(c1#x) AS sum(c1)#xL, sum(c2#x) AS sum(c2)#xL, (sum(c1#x) + sum(c2#x)) AS (sum(c1) + sum(c2))#xL, sum(((spark_catalog.default.foo2_3(a#x, b#x) + 
spark_catalog.default.foo2_3(a#x, b#x)) - spark_catalog.default.foo2_3(a#x, b#x))) AS sum(((spark_catalog.default.foo2_3(c1, c2) + spark_catalog.default.foo2_3(c2, c1)) - spark_catalog.default.foo2_3(c2, c1)))#xL] + +- Project [c1#x, c2#x, cast(c1#x as int) AS a#x, cast(c2#x as int) AS b#x, cast(c2#x as int) AS a#x, cast(c1#x as int) AS b#x, cast(c2#x as int) AS a#x, cast(c1#x as int) AS b#x] + +- SubqueryAlias spark_catalog.default.v1 + +- View (`spark_catalog`.`default`.`v1`, [c1#x, c2#x]) + +- Project [cast(col1#x as int) AS c1#x, cast(col2#x as int) AS c2#x] + +- LocalRelation [col1#x, col2#x] + + +-- !query +CREATE FUNCTION foo2_4a(a ARRAY) RETURNS STRING RETURN +SELECT array_sort(a, (i, j) -> rank[i] - rank[j])[0] FROM (SELECT MAP('a', 1, 'b', 2) rank) +-- !query analysis +org.apache.spark.sql.catalyst.analysis.FunctionAlreadyExistsException +{ + "condition" : "ROUTINE_ALREADY_EXISTS", + "sqlState" : "42723", + "messageParameters" : { + "existingRoutineType" : "routine", + "newRoutineType" : "routine", + "routineName" : "`default`.`foo2_4a`" + } +} + + +-- !query +SELECT foo2_4a(ARRAY('a', 'b')) +-- !query analysis +Project [spark_catalog.default.foo2_4a(a#x) AS spark_catalog.default.foo2_4a(array(a, b))#x] +: +- Project [array_sort(outer(a#x), lambdafunction((rank#x[lambda i#x] - rank#x[lambda j#x]), lambda i#x, lambda j#x, false), false)[0] AS array_sort(outer(foo2_4a.a), lambdafunction((rank[namedlambdavariable()] - rank[namedlambdavariable()]), namedlambdavariable(), namedlambdavariable()))[0]#x] +: +- SubqueryAlias __auto_generated_subquery_name +: +- Project [map(a, 1, b, 2) AS rank#x] +: +- OneRowRelation ++- Project [cast(array(a, b) as array) AS a#x] + +- OneRowRelation + + +-- !query +CREATE FUNCTION foo2_4b(m MAP, k STRING) RETURNS STRING RETURN +SELECT v || ' ' || v FROM (SELECT upper(m[k]) AS v) +-- !query analysis +org.apache.spark.sql.catalyst.analysis.FunctionAlreadyExistsException +{ + "condition" : "ROUTINE_ALREADY_EXISTS", + "sqlState" : 
"42723", + "messageParameters" : { + "existingRoutineType" : "routine", + "newRoutineType" : "routine", + "routineName" : "`default`.`foo2_4b`" + } +} + + +-- !query +SELECT foo2_4b(map('a', 'hello', 'b', 'world'), 'a') +-- !query analysis +Project [spark_catalog.default.foo2_4b(m#x, k#x) AS spark_catalog.default.foo2_4b(map(a, hello, b, world), a)#x] +: +- Project [concat(concat(v#x, ), v#x) AS concat(concat(v, ), v)#x] +: +- SubqueryAlias __auto_generated_subquery_name +: +- Project [upper(outer(m#x)[outer(k#x)]) AS v#x] +: +- OneRowRelation ++- Project [cast(map(a, hello, b, world) as map) AS m#x, cast(a as string) AS k#x] + +- OneRowRelation + + +-- !query +DROP VIEW V2 +-- !query analysis +DropTableCommand `spark_catalog`.`default`.`V2`, false, true, false + + +-- !query +DROP VIEW V1 +-- !query analysis +DropTableCommand `spark_catalog`.`default`.`V1`, false, true, false diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/subquery/exists-subquery/exists-cte.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/subquery/exists-subquery/exists-cte.sql.out index 7c3678c66c117..abaf6a2432251 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/subquery/exists-subquery/exists-cte.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/subquery/exists-subquery/exists-cte.sql.out @@ -133,7 +133,7 @@ WithCTE : +- Filter (outer(emp_name#x) = emp_name#x) : +- SubqueryAlias b : +- SubqueryAlias bonus_cte - : +- CTERelationRef xxxx, true, [emp_name#x, bonus_amt#x], false + : +- CTERelationRef xxxx, true, [emp_name#x, bonus_amt#x], false, false +- SubqueryAlias a +- SubqueryAlias bonus +- View (`BONUS`, [emp_name#x, bonus_amt#x]) @@ -189,10 +189,10 @@ WithCTE : +- Join Inner, (dept_id#x = dept_id#x) : :- SubqueryAlias a : : +- SubqueryAlias emp_cte - : : +- CTERelationRef xxxx, true, [id#x, emp_name#x, hiredate#x, salary#x, dept_id#x], false + : : +- CTERelationRef xxxx, true, [id#x, emp_name#x, hiredate#x, salary#x, 
dept_id#x], false, false : +- SubqueryAlias b : +- SubqueryAlias dept_cte - : +- CTERelationRef xxxx, true, [dept_id#x, dept_name#x, state#x], false + : +- CTERelationRef xxxx, true, [dept_id#x, dept_name#x, state#x], false, false +- SubqueryAlias bonus +- View (`BONUS`, [emp_name#x, bonus_amt#x]) +- Project [cast(emp_name#x as string) AS emp_name#x, cast(bonus_amt#x as double) AS bonus_amt#x] @@ -253,10 +253,10 @@ WithCTE : +- Join LeftOuter, (dept_id#x = dept_id#x) : :- SubqueryAlias a : : +- SubqueryAlias emp_cte - : : +- CTERelationRef xxxx, true, [id#x, emp_name#x, hiredate#x, salary#x, dept_id#x], false + : : +- CTERelationRef xxxx, true, [id#x, emp_name#x, hiredate#x, salary#x, dept_id#x], false, false : +- SubqueryAlias b : +- SubqueryAlias dept_cte - : +- CTERelationRef xxxx, true, [dept_id#x, dept_name#x, state#x], false + : +- CTERelationRef xxxx, true, [dept_id#x, dept_name#x, state#x], false, false +- Join Inner :- Join Inner : :- SubqueryAlias b @@ -268,7 +268,7 @@ WithCTE : : +- LocalRelation [emp_name#x, bonus_amt#x] : +- SubqueryAlias e : +- SubqueryAlias emp_cte - : +- CTERelationRef xxxx, true, [id#x, emp_name#x, hiredate#x, salary#x, dept_id#x], false + : +- CTERelationRef xxxx, true, [id#x, emp_name#x, hiredate#x, salary#x, dept_id#x], false, false +- SubqueryAlias d +- SubqueryAlias dept +- View (`DEPT`, [dept_id#x, dept_name#x, state#x]) @@ -322,7 +322,7 @@ WithCTE : +- Filter (count(1)#xL > cast(1 as bigint)) : +- Aggregate [dept_id#x], [dept_id#x, max(salary#x) AS max(salary)#x, count(1) AS count(1)#xL] : +- SubqueryAlias empdept - : +- CTERelationRef xxxx, true, [id#x, salary#x, emp_name#x, dept_id#x], false + : +- CTERelationRef xxxx, true, [id#x, salary#x, emp_name#x, dept_id#x], false, false +- SubqueryAlias bonus +- View (`BONUS`, [emp_name#x, bonus_amt#x]) +- Project [cast(emp_name#x as string) AS emp_name#x, cast(bonus_amt#x as double) AS bonus_amt#x] @@ -375,7 +375,7 @@ WithCTE : +- Filter (count(1)#xL < cast(1 as bigint)) : +- 
Aggregate [dept_id#x], [dept_id#x, max(salary#x) AS max(salary)#x, count(1) AS count(1)#xL] : +- SubqueryAlias empdept - : +- CTERelationRef xxxx, true, [id#x, salary#x, emp_name#x, dept_id#x], false + : +- CTERelationRef xxxx, true, [id#x, salary#x, emp_name#x, dept_id#x], false, false +- SubqueryAlias bonus +- View (`BONUS`, [emp_name#x, bonus_amt#x]) +- Project [cast(emp_name#x as string) AS emp_name#x, cast(bonus_amt#x as double) AS bonus_amt#x] diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/subquery/in-subquery/in-multiple-columns.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/subquery/in-subquery/in-multiple-columns.sql.out index 39748a324e527..230ffc005e90d 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/subquery/in-subquery/in-multiple-columns.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/subquery/in-subquery/in-multiple-columns.sql.out @@ -330,7 +330,7 @@ WithCTE +- Project [t1a#x, t1b#x, t1a#x, t1b#x] +- Join Inner, (t1b#x = t1b#x) :- SubqueryAlias cte1 - : +- CTERelationRef xxxx, true, [t1a#x, t1b#x], false + : +- CTERelationRef xxxx, true, [t1a#x, t1b#x], false, false +- SubqueryAlias cte2 +- SubqueryAlias cte1 - +- CTERelationRef xxxx, true, [t1a#x, t1b#x], false + +- CTERelationRef xxxx, true, [t1a#x, t1b#x], false, false diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/subquery/in-subquery/in-with-cte.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/subquery/in-subquery/in-with-cte.sql.out index 0074991b4ea6a..199b876fb9a86 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/subquery/in-subquery/in-with-cte.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/subquery/in-subquery/in-with-cte.sql.out @@ -138,7 +138,7 @@ WithCTE : +- Project [t1b#x] : +- Filter (cast(t1b#x as int) > 0) : +- SubqueryAlias cte1 - : +- CTERelationRef xxxx, true, [t1a#x, t1b#x], false + : +- CTERelationRef xxxx, true, [t1a#x, 
t1b#x], false, false +- SubqueryAlias t1 +- View (`t1`, [t1a#x, t1b#x, t1c#x, t1d#xL, t1e#x, t1f#x, t1g#x, t1h#x, t1i#x]) +- Project [cast(t1a#x as string) AS t1a#x, cast(t1b#x as smallint) AS t1b#x, cast(t1c#x as int) AS t1c#x, cast(t1d#xL as bigint) AS t1d#xL, cast(t1e#x as float) AS t1e#x, cast(t1f#x as double) AS t1f#x, cast(t1g#x as double) AS t1g#x, cast(t1h#x as timestamp) AS t1h#x, cast(t1i#x as date) AS t1i#x] @@ -197,21 +197,21 @@ WithCTE : : : :- Project [t1b#x] : : : : +- Filter (cast(t1b#x as int) > 0) : : : : +- SubqueryAlias cte1 - : : : : +- CTERelationRef xxxx, true, [t1a#x, t1b#x], false + : : : : +- CTERelationRef xxxx, true, [t1a#x, t1b#x], false, false : : : +- Project [t1b#x] : : : +- Filter (cast(t1b#x as int) > 5) : : : +- SubqueryAlias cte1 - : : : +- CTERelationRef xxxx, true, [t1a#x, t1b#x], false + : : : +- CTERelationRef xxxx, true, [t1a#x, t1b#x], false, false : : +- Intersect false : : :- Project [t1b#x] : : : +- SubqueryAlias cte1 - : : : +- CTERelationRef xxxx, true, [t1a#x, t1b#x], false + : : : +- CTERelationRef xxxx, true, [t1a#x, t1b#x], false, false : : +- Project [t1b#x] : : +- SubqueryAlias cte1 - : : +- CTERelationRef xxxx, true, [t1a#x, t1b#x], false + : : +- CTERelationRef xxxx, true, [t1a#x, t1b#x], false, false : +- Project [t1b#x] : +- SubqueryAlias cte1 - : +- CTERelationRef xxxx, true, [t1a#x, t1b#x], false + : +- CTERelationRef xxxx, true, [t1a#x, t1b#x], false, false +- SubqueryAlias t1 +- View (`t1`, [t1a#x, t1b#x, t1c#x, t1d#xL, t1e#x, t1f#x, t1g#x, t1h#x, t1i#x]) +- Project [cast(t1a#x as string) AS t1a#x, cast(t1b#x as smallint) AS t1b#x, cast(t1c#x as int) AS t1c#x, cast(t1d#xL as bigint) AS t1d#xL, cast(t1e#x as float) AS t1e#x, cast(t1f#x as double) AS t1f#x, cast(t1g#x as double) AS t1g#x, cast(t1h#x as timestamp) AS t1h#x, cast(t1i#x as date) AS t1i#x] @@ -268,22 +268,22 @@ WithCTE : : : :- Join FullOuter, (t1c#x = t1c#x) : : : : :- Join Inner, (t1b#x > t1b#x) : : : : : :- SubqueryAlias cte1 - : : : : : : 
+- CTERelationRef xxxx, true, [t1a#x, t1b#x, t1c#x, t1d#xL, t1e#x], false + : : : : : : +- CTERelationRef xxxx, true, [t1a#x, t1b#x, t1c#x, t1d#xL, t1e#x], false, false : : : : : +- SubqueryAlias cte2 : : : : : +- SubqueryAlias cte1 - : : : : : +- CTERelationRef xxxx, true, [t1a#x, t1b#x, t1c#x, t1d#xL, t1e#x], false + : : : : : +- CTERelationRef xxxx, true, [t1a#x, t1b#x, t1c#x, t1d#xL, t1e#x], false, false : : : : +- SubqueryAlias cte3 : : : : +- SubqueryAlias cte1 - : : : : +- CTERelationRef xxxx, true, [t1a#x, t1b#x, t1c#x, t1d#xL, t1e#x], false + : : : : +- CTERelationRef xxxx, true, [t1a#x, t1b#x, t1c#x, t1d#xL, t1e#x], false, false : : : +- SubqueryAlias cte4 : : : +- SubqueryAlias cte1 - : : : +- CTERelationRef xxxx, true, [t1a#x, t1b#x, t1c#x, t1d#xL, t1e#x], false + : : : +- CTERelationRef xxxx, true, [t1a#x, t1b#x, t1c#x, t1d#xL, t1e#x], false, false : : +- SubqueryAlias cte5 : : +- SubqueryAlias cte1 - : : +- CTERelationRef xxxx, true, [t1a#x, t1b#x, t1c#x, t1d#xL, t1e#x], false + : : +- CTERelationRef xxxx, true, [t1a#x, t1b#x, t1c#x, t1d#xL, t1e#x], false, false : +- SubqueryAlias cte6 : +- SubqueryAlias cte1 - : +- CTERelationRef xxxx, true, [t1a#x, t1b#x, t1c#x, t1d#xL, t1e#x], false + : +- CTERelationRef xxxx, true, [t1a#x, t1b#x, t1c#x, t1d#xL, t1e#x], false, false +- SubqueryAlias t1 +- View (`t1`, [t1a#x, t1b#x, t1c#x, t1d#xL, t1e#x, t1f#x, t1g#x, t1h#x, t1i#x]) +- Project [cast(t1a#x as string) AS t1a#x, cast(t1b#x as smallint) AS t1b#x, cast(t1c#x as int) AS t1c#x, cast(t1d#xL as bigint) AS t1d#xL, cast(t1e#x as float) AS t1e#x, cast(t1f#x as double) AS t1f#x, cast(t1g#x as double) AS t1g#x, cast(t1h#x as timestamp) AS t1h#x, cast(t1i#x as date) AS t1i#x] @@ -354,16 +354,16 @@ WithCTE :- Join FullOuter, (t1a#x = t1a#x) : :- Join Inner, ((cast(t1b#x as int) > 5) AND (t1a#x = t1a#x)) : : :- SubqueryAlias cte1 - : : : +- CTERelationRef xxxx, true, [t1a#x, t1b#x], false + : : : +- CTERelationRef xxxx, true, [t1a#x, t1b#x], false, false : : +- 
SubqueryAlias cte2 : : +- SubqueryAlias cte1 - : : +- CTERelationRef xxxx, true, [t1a#x, t1b#x], false + : : +- CTERelationRef xxxx, true, [t1a#x, t1b#x], false, false : +- SubqueryAlias cte3 : +- SubqueryAlias cte1 - : +- CTERelationRef xxxx, true, [t1a#x, t1b#x], false + : +- CTERelationRef xxxx, true, [t1a#x, t1b#x], false, false +- SubqueryAlias cte4 +- SubqueryAlias cte1 - +- CTERelationRef xxxx, true, [t1a#x, t1b#x], false + +- CTERelationRef xxxx, true, [t1a#x, t1b#x], false, false -- !query @@ -424,10 +424,10 @@ WithCTE +- Project [t1a#x, t1b#x] +- Join Inner, (t1h#x >= t1h#x) :- SubqueryAlias cte1 - : +- CTERelationRef xxxx, true, [t1a#x, t1b#x, t1h#x], false + : +- CTERelationRef xxxx, true, [t1a#x, t1b#x, t1h#x], false, false +- SubqueryAlias cte2 +- SubqueryAlias cte1 - +- CTERelationRef xxxx, true, [t1a#x, t1b#x, t1h#x], false + +- CTERelationRef xxxx, true, [t1a#x, t1b#x, t1h#x], false, false -- !query @@ -485,16 +485,16 @@ WithCTE :- Join RightOuter, (t1b#x = t1b#x) : :- Join Inner, (t1a#x = t1a#x) : : :- SubqueryAlias cte1 - : : : +- CTERelationRef xxxx, true, [t1a#x, t1b#x, t1c#x], false + : : : +- CTERelationRef xxxx, true, [t1a#x, t1b#x, t1c#x], false, false : : +- SubqueryAlias cte2 : : +- SubqueryAlias cte1 - : : +- CTERelationRef xxxx, true, [t1a#x, t1b#x, t1c#x], false + : : +- CTERelationRef xxxx, true, [t1a#x, t1b#x, t1c#x], false, false : +- SubqueryAlias cte3 : +- SubqueryAlias cte1 - : +- CTERelationRef xxxx, true, [t1a#x, t1b#x, t1c#x], false + : +- CTERelationRef xxxx, true, [t1a#x, t1b#x, t1c#x], false, false +- SubqueryAlias cte4 +- SubqueryAlias cte1 - +- CTERelationRef xxxx, true, [t1a#x, t1b#x, t1c#x], false + +- CTERelationRef xxxx, true, [t1a#x, t1b#x, t1c#x], false, false -- !query @@ -538,10 +538,10 @@ WithCTE +- Project [t1a#x, t1b#x] +- Join RightOuter, (t1a#x = t1a#x) :- SubqueryAlias cte1 - : +- CTERelationRef xxxx, true, [t1a#x, t1b#x], false + : +- CTERelationRef xxxx, true, [t1a#x, t1b#x], false, false +- SubqueryAlias 
cte2 +- SubqueryAlias cte1 - +- CTERelationRef xxxx, true, [t1a#x, t1b#x], false + +- CTERelationRef xxxx, true, [t1a#x, t1b#x], false, false -- !query @@ -599,15 +599,15 @@ WithCTE : : +- SubqueryAlias t1 : : +- LocalRelation [t1a#x, t1b#x, t1c#x, t1d#xL, t1e#x, t1f#x, t1g#x, t1h#x, t1i#x] : +- SubqueryAlias cte1 - : +- CTERelationRef xxxx, true, [t1a#x, t1b#x], false + : +- CTERelationRef xxxx, true, [t1a#x, t1b#x], false, false +- SubqueryAlias s +- Project [t1b#x] +- Join LeftOuter, (t1b#x = t1b#x) :- SubqueryAlias cte1 - : +- CTERelationRef xxxx, true, [t1a#x, t1b#x], false + : +- CTERelationRef xxxx, true, [t1a#x, t1b#x], false, false +- SubqueryAlias cte2 +- SubqueryAlias cte1 - +- CTERelationRef xxxx, true, [t1a#x, t1b#x], false + +- CTERelationRef xxxx, true, [t1a#x, t1b#x], false, false -- !query @@ -642,7 +642,7 @@ WithCTE : +- Project [t1b#x] : +- Filter (cast(t1b#x as int) < 0) : +- SubqueryAlias cte1 - : +- CTERelationRef xxxx, true, [t1a#x, t1b#x], false + : +- CTERelationRef xxxx, true, [t1a#x, t1b#x], false, false +- SubqueryAlias t1 +- View (`t1`, [t1a#x, t1b#x, t1c#x, t1d#xL, t1e#x, t1f#x, t1g#x, t1h#x, t1i#x]) +- Project [cast(t1a#x as string) AS t1a#x, cast(t1b#x as smallint) AS t1b#x, cast(t1c#x as int) AS t1c#x, cast(t1d#xL as bigint) AS t1d#xL, cast(t1e#x as float) AS t1e#x, cast(t1f#x as double) AS t1f#x, cast(t1g#x as double) AS t1g#x, cast(t1h#x as timestamp) AS t1h#x, cast(t1i#x as date) AS t1i#x] @@ -722,16 +722,16 @@ WithCTE : :- Join RightOuter, (t1b#x = t1b#x) : : :- Join Inner, (t1a#x = t1a#x) : : : :- SubqueryAlias cte1 - : : : : +- CTERelationRef xxxx, true, [t1a#x, t1b#x, t1c#x, t1d#xL, t1h#x], false + : : : : +- CTERelationRef xxxx, true, [t1a#x, t1b#x, t1c#x, t1d#xL, t1h#x], false, false : : : +- SubqueryAlias cte2 : : : +- SubqueryAlias cte1 - : : : +- CTERelationRef xxxx, true, [t1a#x, t1b#x, t1c#x, t1d#xL, t1h#x], false + : : : +- CTERelationRef xxxx, true, [t1a#x, t1b#x, t1c#x, t1d#xL, t1h#x], false, false : : +- 
SubqueryAlias cte3 : : +- SubqueryAlias cte1 - : : +- CTERelationRef xxxx, true, [t1a#x, t1b#x, t1c#x, t1d#xL, t1h#x], false + : : +- CTERelationRef xxxx, true, [t1a#x, t1b#x, t1c#x, t1d#xL, t1h#x], false, false : +- SubqueryAlias cte4 : +- SubqueryAlias cte1 - : +- CTERelationRef xxxx, true, [t1a#x, t1b#x, t1c#x, t1d#xL, t1h#x], false + : +- CTERelationRef xxxx, true, [t1a#x, t1b#x, t1c#x, t1d#xL, t1h#x], false, false +- SubqueryAlias t1 +- View (`t1`, [t1a#x, t1b#x, t1c#x, t1d#xL, t1e#x, t1f#x, t1g#x, t1h#x, t1i#x]) +- Project [cast(t1a#x as string) AS t1a#x, cast(t1b#x as smallint) AS t1b#x, cast(t1c#x as int) AS t1c#x, cast(t1d#xL as bigint) AS t1d#xL, cast(t1e#x as float) AS t1e#x, cast(t1f#x as double) AS t1f#x, cast(t1g#x as double) AS t1g#x, cast(t1h#x as timestamp) AS t1h#x, cast(t1i#x as date) AS t1i#x] diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/subquery/scalar-subquery/scalar-subquery-select.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/subquery/scalar-subquery/scalar-subquery-select.sql.out index 72e230f9bb881..2a3a87e5cab81 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/subquery/scalar-subquery/scalar-subquery-select.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/subquery/scalar-subquery/scalar-subquery-select.sql.out @@ -623,7 +623,7 @@ Project [c1#x, scalar-subquery#x [c1#x] AS scalarsubquery(c1)#x] : : +- OneRowRelation : +- Project [(a#x + outer(c1#x)) AS (a + outer(t1.c1))#x] : +- SubqueryAlias t -: +- CTERelationRef xxxx, true, [a#x], false +: +- CTERelationRef xxxx, true, [a#x], false, false +- SubqueryAlias t1 +- View (`t1`, [c1#x, c2#x]) +- Project [cast(c1#x as int) AS c1#x, cast(c2#x as int) AS c2#x] @@ -647,7 +647,7 @@ Project [c1#x, scalar-subquery#x [c1#x] AS scalarsubquery(c1)#xL] : : +- LocalRelation [c1#x, c2#x] : +- Aggregate [sum(c2#x) AS sum(c2)#xL] : +- SubqueryAlias t -: +- CTERelationRef xxxx, true, [c1#x, c2#x], false +: +- CTERelationRef 
xxxx, true, [c1#x, c2#x], false, false +- SubqueryAlias t1 +- View (`t1`, [c1#x, c2#x]) +- Project [cast(c1#x as int) AS c1#x, cast(c2#x as int) AS c2#x] @@ -677,10 +677,10 @@ Project [c1#x, scalar-subquery#x [c1#x] AS scalarsubquery(c1)#xL] : : +- Project [c1#x, c2#x] : : +- Filter (outer(c1#x) = c1#x) : : +- SubqueryAlias t3 -: : +- CTERelationRef xxxx, true, [c1#x, c2#x], false +: : +- CTERelationRef xxxx, true, [c1#x, c2#x], false, false : +- Aggregate [sum(c2#x) AS sum(c2)#xL] : +- SubqueryAlias t4 -: +- CTERelationRef xxxx, true, [c1#x, c2#x], false +: +- CTERelationRef xxxx, true, [c1#x, c2#x], false, false +- SubqueryAlias t1 +- View (`t1`, [c1#x, c2#x]) +- Project [cast(c1#x as int) AS c1#x, cast(c2#x as int) AS c2#x] @@ -713,10 +713,10 @@ Project [c1#x, scalar-subquery#x [c1#x] AS scalarsubquery(c1)#xL] : +- Union false, false : :- Project [c1#x, c2#x] : : +- SubqueryAlias t -: : +- CTERelationRef xxxx, true, [c1#x, c2#x], false +: : +- CTERelationRef xxxx, true, [c1#x, c2#x], false, false : +- Project [c2#x, c1#x] : +- SubqueryAlias t -: +- CTERelationRef xxxx, true, [c1#x, c2#x], false +: +- CTERelationRef xxxx, true, [c1#x, c2#x], false, false +- SubqueryAlias t1 +- View (`t1`, [c1#x, c2#x]) +- Project [cast(c1#x as int) AS c1#x, cast(c2#x as int) AS c2#x] @@ -756,9 +756,9 @@ WithCTE : : +- Aggregate [sum(c2#x) AS sum(c2)#xL] : : +- Filter (c1#x = outer(c1#x)) : : +- SubqueryAlias t - : : +- CTERelationRef xxxx, true, [c1#x, c2#x], false + : : +- CTERelationRef xxxx, true, [c1#x, c2#x], false, false : +- SubqueryAlias v - : +- CTERelationRef xxxx, true, [c1#x, c2#x], false + : +- CTERelationRef xxxx, true, [c1#x, c2#x], false, false +- SubqueryAlias t1 +- View (`t1`, [c1#x, c2#x]) +- Project [cast(c1#x as int) AS c1#x, cast(c2#x as int) AS c2#x] @@ -779,7 +779,7 @@ WithCTE : +- Project [a#x] : +- Filter (a#x = outer(c1#x)) : +- SubqueryAlias t - : +- CTERelationRef xxxx, true, [a#x], false + : +- CTERelationRef xxxx, true, [a#x], false, false +- 
SubqueryAlias t1 +- View (`t1`, [c1#x, c2#x]) +- Project [cast(c1#x as int) AS c1#x, cast(c2#x as int) AS c2#x] @@ -1027,7 +1027,7 @@ WithCTE : +- Aggregate [sum(1) AS sum(1)#xL] : +- Filter ((a#x = cast(outer(col#x) as int)) OR (upper(cast(outer(col#x) as string)) = Y)) : +- SubqueryAlias T - : +- CTERelationRef xxxx, true, [a#x], false + : +- CTERelationRef xxxx, true, [a#x], false, false +- SubqueryAlias foo +- Project [null AS col#x] +- OneRowRelation diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/table-aliases.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/table-aliases.sql.out index afbdc4293e6a3..bca87e0b5da5f 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/table-aliases.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/table-aliases.sql.out @@ -217,3 +217,45 @@ Project [a#x, b#x, c#x, d#x] +- Project [id#x, v2#x] +- SubqueryAlias src2 +- LocalRelation [id#x, v2#x] + + +-- !query +SELECT src1.* FROM src1 a ORDER BY id LIMIT 1 +-- !query analysis +org.apache.spark.sql.AnalysisException +{ + "condition" : "CANNOT_RESOLVE_STAR_EXPAND", + "sqlState" : "42704", + "messageParameters" : { + "columns" : "`id`, `v1`", + "targetString" : "`src1`" + }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 8, + "stopIndex" : 13, + "fragment" : "src1.*" + } ] +} + + +-- !query +SELECT src1.id FROM (SELECT * FROM src1 ORDER BY id LIMIT 1) a +-- !query analysis +org.apache.spark.sql.catalyst.ExtendedAnalysisException +{ + "condition" : "UNRESOLVED_COLUMN.WITH_SUGGESTION", + "sqlState" : "42703", + "messageParameters" : { + "objectName" : "`src1`.`id`", + "proposal" : "`a`.`id`, `a`.`v1`" + }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 8, + "stopIndex" : 14, + "fragment" : "src1.id" + } ] +} diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/timestampNTZ/timestamp-ansi.sql.out 
b/sql/core/src/test/resources/sql-tests/analyzer-results/timestampNTZ/timestamp-ansi.sql.out index 6acd4e3774f78..3e185decc2578 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/timestampNTZ/timestamp-ansi.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/timestampNTZ/timestamp-ansi.sql.out @@ -759,14 +759,14 @@ Project [from_csv(StructField(t,TimestampNTZType,true), (timestampFormat,dd/MMMM -- !query select timestampadd(MONTH, -1, timestamp'2022-02-14 01:02:03') -- !query analysis -Project [timestampadd(MONTH, -1, 2022-02-14 01:02:03, Some(America/Los_Angeles)) AS timestampadd(MONTH, -1, TIMESTAMP_NTZ '2022-02-14 01:02:03')#x] +Project [timestampadd(MONTH, cast(-1 as bigint), 2022-02-14 01:02:03, Some(America/Los_Angeles)) AS timestampadd(MONTH, -1, TIMESTAMP_NTZ '2022-02-14 01:02:03')#x] +- OneRowRelation -- !query select timestampadd(MINUTE, 58, timestamp'2022-02-14 01:02:03') -- !query analysis -Project [timestampadd(MINUTE, 58, 2022-02-14 01:02:03, Some(America/Los_Angeles)) AS timestampadd(MINUTE, 58, TIMESTAMP_NTZ '2022-02-14 01:02:03')#x] +Project [timestampadd(MINUTE, cast(58 as bigint), 2022-02-14 01:02:03, Some(America/Los_Angeles)) AS timestampadd(MINUTE, 58, TIMESTAMP_NTZ '2022-02-14 01:02:03')#x] +- OneRowRelation diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/timestampNTZ/timestamp.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/timestampNTZ/timestamp.sql.out index dd2c2d5032798..0134892e0caab 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/timestampNTZ/timestamp.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/timestampNTZ/timestamp.sql.out @@ -819,14 +819,14 @@ Project [from_csv(StructField(t,TimestampNTZType,true), (timestampFormat,dd/MMMM -- !query select timestampadd(MONTH, -1, timestamp'2022-02-14 01:02:03') -- !query analysis -Project [timestampadd(MONTH, -1, 2022-02-14 01:02:03, Some(America/Los_Angeles)) AS timestampadd(MONTH, 
-1, TIMESTAMP_NTZ '2022-02-14 01:02:03')#x] +Project [timestampadd(MONTH, cast(-1 as bigint), 2022-02-14 01:02:03, Some(America/Los_Angeles)) AS timestampadd(MONTH, -1, TIMESTAMP_NTZ '2022-02-14 01:02:03')#x] +- OneRowRelation -- !query select timestampadd(MINUTE, 58, timestamp'2022-02-14 01:02:03') -- !query analysis -Project [timestampadd(MINUTE, 58, 2022-02-14 01:02:03, Some(America/Los_Angeles)) AS timestampadd(MINUTE, 58, TIMESTAMP_NTZ '2022-02-14 01:02:03')#x] +Project [timestampadd(MINUTE, cast(58 as bigint), 2022-02-14 01:02:03, Some(America/Los_Angeles)) AS timestampadd(MINUTE, 58, TIMESTAMP_NTZ '2022-02-14 01:02:03')#x] +- OneRowRelation diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/transform.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/transform.sql.out index 17c6797545c3d..2675008424872 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/transform.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/transform.sql.out @@ -888,10 +888,10 @@ WithCTE +- Join Inner, (b#x = b#x) :- SubqueryAlias t1 : +- SubqueryAlias temp - : +- CTERelationRef xxxx, true, [b#x], false + : +- CTERelationRef xxxx, true, [b#x], false, false +- SubqueryAlias t2 +- SubqueryAlias temp - +- CTERelationRef xxxx, true, [b#x], false + +- CTERelationRef xxxx, true, [b#x], false, false -- !query diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/typeCoercion/native/concat.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/typeCoercion/native/concat.sql.out index 6f3bc9ccb66f3..4a35fffe3191b 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/typeCoercion/native/concat.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/typeCoercion/native/concat.sql.out @@ -226,9 +226,9 @@ CreateViewCommand `various_arrays`, SELECT * FROM VALUES ( struct_array1, struct_array2, map_array1, map_array2 ), false, false, LocalTempView, UNSUPPORTED, true - +- Project 
[boolean_array1#x, boolean_array2#x, tinyint_array1#x, tinyint_array2#x, smallint_array1#x, smallint_array2#x, int_array1#x, int_array2#x, bigint_array1#x, bigint_array2#x, decimal_array1#x, decimal_array2#x, double_array1#x, double_array2#x, float_array1#x, float_array2#x, date_array1#x, data_array2#x, timestamp_array1#x, timestamp_array2#x, string_array1#x, string_array2#x, array_array1#x, array_array2#x, ... 4 more fields] + +- Project [boolean_array1#x, boolean_array2#x, tinyint_array1#x, tinyint_array2#x, smallint_array1#x, smallint_array2#x, int_array1#x, int_array2#x, bigint_array1#x, bigint_array2#x, decimal_array1#x, decimal_array2#x, double_array1#x, double_array2#x, float_array1#x, float_array2#x, date_array1#x, data_array2#x, timestamp_array1#x, timestamp_array2#x, string_array1#x, string_array2#x, array_array1#x, array_array2#x, struct_array1#x, ... 3 more fields] +- SubqueryAlias various_arrays - +- LocalRelation [boolean_array1#x, boolean_array2#x, tinyint_array1#x, tinyint_array2#x, smallint_array1#x, smallint_array2#x, int_array1#x, int_array2#x, bigint_array1#x, bigint_array2#x, decimal_array1#x, decimal_array2#x, double_array1#x, double_array2#x, float_array1#x, float_array2#x, date_array1#x, data_array2#x, timestamp_array1#x, timestamp_array2#x, string_array1#x, string_array2#x, array_array1#x, array_array2#x, ... 4 more fields] + +- LocalRelation [boolean_array1#x, boolean_array2#x, tinyint_array1#x, tinyint_array2#x, smallint_array1#x, smallint_array2#x, int_array1#x, int_array2#x, bigint_array1#x, bigint_array2#x, decimal_array1#x, decimal_array2#x, double_array1#x, double_array2#x, float_array1#x, float_array2#x, date_array1#x, data_array2#x, timestamp_array1#x, timestamp_array2#x, string_array1#x, string_array2#x, array_array1#x, array_array2#x, struct_array1#x, ... 
3 more fields] -- !query @@ -251,11 +251,11 @@ FROM various_arrays -- !query analysis Project [concat(boolean_array1#x, boolean_array2#x) AS boolean_array#x, concat(tinyint_array1#x, tinyint_array2#x) AS tinyint_array#x, concat(smallint_array1#x, smallint_array2#x) AS smallint_array#x, concat(int_array1#x, int_array2#x) AS int_array#x, concat(bigint_array1#x, bigint_array2#x) AS bigint_array#x, concat(decimal_array1#x, decimal_array2#x) AS decimal_array#x, concat(double_array1#x, double_array2#x) AS double_array#x, concat(float_array1#x, float_array2#x) AS float_array#x, concat(date_array1#x, data_array2#x) AS data_array#x, concat(timestamp_array1#x, timestamp_array2#x) AS timestamp_array#x, concat(string_array1#x, string_array2#x) AS string_array#x, concat(array_array1#x, array_array2#x) AS array_array#x, concat(struct_array1#x, struct_array2#x) AS struct_array#x, concat(map_array1#x, map_array2#x) AS map_array#x] +- SubqueryAlias various_arrays - +- View (`various_arrays`, [boolean_array1#x, boolean_array2#x, tinyint_array1#x, tinyint_array2#x, smallint_array1#x, smallint_array2#x, int_array1#x, int_array2#x, bigint_array1#x, bigint_array2#x, decimal_array1#x, decimal_array2#x, double_array1#x, double_array2#x, float_array1#x, float_array2#x, date_array1#x, data_array2#x, timestamp_array1#x, timestamp_array2#x, string_array1#x, string_array2#x, array_array1#x, array_array2#x, ... 
4 more fields]) - +- Project [cast(boolean_array1#x as array) AS boolean_array1#x, cast(boolean_array2#x as array) AS boolean_array2#x, cast(tinyint_array1#x as array) AS tinyint_array1#x, cast(tinyint_array2#x as array) AS tinyint_array2#x, cast(smallint_array1#x as array) AS smallint_array1#x, cast(smallint_array2#x as array) AS smallint_array2#x, cast(int_array1#x as array) AS int_array1#x, cast(int_array2#x as array) AS int_array2#x, cast(bigint_array1#x as array) AS bigint_array1#x, cast(bigint_array2#x as array) AS bigint_array2#x, cast(decimal_array1#x as array) AS decimal_array1#x, cast(decimal_array2#x as array) AS decimal_array2#x, cast(double_array1#x as array) AS double_array1#x, cast(double_array2#x as array) AS double_array2#x, cast(float_array1#x as array) AS float_array1#x, cast(float_array2#x as array) AS float_array2#x, cast(date_array1#x as array) AS date_array1#x, cast(data_array2#x as array) AS data_array2#x, cast(timestamp_array1#x as array) AS timestamp_array1#x, cast(timestamp_array2#x as array) AS timestamp_array2#x, cast(string_array1#x as array) AS string_array1#x, cast(string_array2#x as array) AS string_array2#x, cast(array_array1#x as array>) AS array_array1#x, cast(array_array2#x as array>) AS array_array2#x, ... 4 more fields] - +- Project [boolean_array1#x, boolean_array2#x, tinyint_array1#x, tinyint_array2#x, smallint_array1#x, smallint_array2#x, int_array1#x, int_array2#x, bigint_array1#x, bigint_array2#x, decimal_array1#x, decimal_array2#x, double_array1#x, double_array2#x, float_array1#x, float_array2#x, date_array1#x, data_array2#x, timestamp_array1#x, timestamp_array2#x, string_array1#x, string_array2#x, array_array1#x, array_array2#x, ... 
4 more fields] + +- View (`various_arrays`, [boolean_array1#x, boolean_array2#x, tinyint_array1#x, tinyint_array2#x, smallint_array1#x, smallint_array2#x, int_array1#x, int_array2#x, bigint_array1#x, bigint_array2#x, decimal_array1#x, decimal_array2#x, double_array1#x, double_array2#x, float_array1#x, float_array2#x, date_array1#x, data_array2#x, timestamp_array1#x, timestamp_array2#x, string_array1#x, string_array2#x, array_array1#x, array_array2#x, struct_array1#x, ... 3 more fields]) + +- Project [cast(boolean_array1#x as array) AS boolean_array1#x, cast(boolean_array2#x as array) AS boolean_array2#x, cast(tinyint_array1#x as array) AS tinyint_array1#x, cast(tinyint_array2#x as array) AS tinyint_array2#x, cast(smallint_array1#x as array) AS smallint_array1#x, cast(smallint_array2#x as array) AS smallint_array2#x, cast(int_array1#x as array) AS int_array1#x, cast(int_array2#x as array) AS int_array2#x, cast(bigint_array1#x as array) AS bigint_array1#x, cast(bigint_array2#x as array) AS bigint_array2#x, cast(decimal_array1#x as array) AS decimal_array1#x, cast(decimal_array2#x as array) AS decimal_array2#x, cast(double_array1#x as array) AS double_array1#x, cast(double_array2#x as array) AS double_array2#x, cast(float_array1#x as array) AS float_array1#x, cast(float_array2#x as array) AS float_array2#x, cast(date_array1#x as array) AS date_array1#x, cast(data_array2#x as array) AS data_array2#x, cast(timestamp_array1#x as array) AS timestamp_array1#x, cast(timestamp_array2#x as array) AS timestamp_array2#x, cast(string_array1#x as array) AS string_array1#x, cast(string_array2#x as array) AS string_array2#x, cast(array_array1#x as array>) AS array_array1#x, cast(array_array2#x as array>) AS array_array2#x, cast(struct_array1#x as array>) AS struct_array1#x, ... 
3 more fields] + +- Project [boolean_array1#x, boolean_array2#x, tinyint_array1#x, tinyint_array2#x, smallint_array1#x, smallint_array2#x, int_array1#x, int_array2#x, bigint_array1#x, bigint_array2#x, decimal_array1#x, decimal_array2#x, double_array1#x, double_array2#x, float_array1#x, float_array2#x, date_array1#x, data_array2#x, timestamp_array1#x, timestamp_array2#x, string_array1#x, string_array2#x, array_array1#x, array_array2#x, struct_array1#x, ... 3 more fields] +- SubqueryAlias various_arrays - +- LocalRelation [boolean_array1#x, boolean_array2#x, tinyint_array1#x, tinyint_array2#x, smallint_array1#x, smallint_array2#x, int_array1#x, int_array2#x, bigint_array1#x, bigint_array2#x, decimal_array1#x, decimal_array2#x, double_array1#x, double_array2#x, float_array1#x, float_array2#x, date_array1#x, data_array2#x, timestamp_array1#x, timestamp_array2#x, string_array1#x, string_array2#x, array_array1#x, array_array2#x, ... 4 more fields] + +- LocalRelation [boolean_array1#x, boolean_array2#x, tinyint_array1#x, tinyint_array2#x, smallint_array1#x, smallint_array2#x, int_array1#x, int_array2#x, bigint_array1#x, bigint_array2#x, decimal_array1#x, decimal_array2#x, double_array1#x, double_array2#x, float_array1#x, float_array2#x, date_array1#x, data_array2#x, timestamp_array1#x, timestamp_array2#x, string_array1#x, string_array2#x, array_array1#x, array_array2#x, struct_array1#x, ... 
3 more fields] -- !query @@ -273,8 +273,8 @@ FROM various_arrays -- !query analysis Project [concat(cast(tinyint_array1#x as array), smallint_array2#x) AS ts_array#x, concat(cast(smallint_array1#x as array), int_array2#x) AS si_array#x, concat(cast(int_array1#x as array), bigint_array2#x) AS ib_array#x, concat(cast(bigint_array1#x as array), cast(decimal_array2#x as array)) AS bd_array#x, concat(cast(decimal_array1#x as array), double_array2#x) AS dd_array#x, concat(double_array1#x, cast(float_array2#x as array)) AS df_array#x, concat(cast(string_array1#x as array), data_array2#x) AS std_array#x, concat(timestamp_array1#x, cast(string_array2#x as array)) AS tst_array#x, concat(cast(string_array1#x as array), cast(int_array2#x as array)) AS sti_array#x] +- SubqueryAlias various_arrays - +- View (`various_arrays`, [boolean_array1#x, boolean_array2#x, tinyint_array1#x, tinyint_array2#x, smallint_array1#x, smallint_array2#x, int_array1#x, int_array2#x, bigint_array1#x, bigint_array2#x, decimal_array1#x, decimal_array2#x, double_array1#x, double_array2#x, float_array1#x, float_array2#x, date_array1#x, data_array2#x, timestamp_array1#x, timestamp_array2#x, string_array1#x, string_array2#x, array_array1#x, array_array2#x, ... 
4 more fields]) - +- Project [cast(boolean_array1#x as array) AS boolean_array1#x, cast(boolean_array2#x as array) AS boolean_array2#x, cast(tinyint_array1#x as array) AS tinyint_array1#x, cast(tinyint_array2#x as array) AS tinyint_array2#x, cast(smallint_array1#x as array) AS smallint_array1#x, cast(smallint_array2#x as array) AS smallint_array2#x, cast(int_array1#x as array) AS int_array1#x, cast(int_array2#x as array) AS int_array2#x, cast(bigint_array1#x as array) AS bigint_array1#x, cast(bigint_array2#x as array) AS bigint_array2#x, cast(decimal_array1#x as array) AS decimal_array1#x, cast(decimal_array2#x as array) AS decimal_array2#x, cast(double_array1#x as array) AS double_array1#x, cast(double_array2#x as array) AS double_array2#x, cast(float_array1#x as array) AS float_array1#x, cast(float_array2#x as array) AS float_array2#x, cast(date_array1#x as array) AS date_array1#x, cast(data_array2#x as array) AS data_array2#x, cast(timestamp_array1#x as array) AS timestamp_array1#x, cast(timestamp_array2#x as array) AS timestamp_array2#x, cast(string_array1#x as array) AS string_array1#x, cast(string_array2#x as array) AS string_array2#x, cast(array_array1#x as array>) AS array_array1#x, cast(array_array2#x as array>) AS array_array2#x, ... 4 more fields] - +- Project [boolean_array1#x, boolean_array2#x, tinyint_array1#x, tinyint_array2#x, smallint_array1#x, smallint_array2#x, int_array1#x, int_array2#x, bigint_array1#x, bigint_array2#x, decimal_array1#x, decimal_array2#x, double_array1#x, double_array2#x, float_array1#x, float_array2#x, date_array1#x, data_array2#x, timestamp_array1#x, timestamp_array2#x, string_array1#x, string_array2#x, array_array1#x, array_array2#x, ... 
4 more fields] + +- View (`various_arrays`, [boolean_array1#x, boolean_array2#x, tinyint_array1#x, tinyint_array2#x, smallint_array1#x, smallint_array2#x, int_array1#x, int_array2#x, bigint_array1#x, bigint_array2#x, decimal_array1#x, decimal_array2#x, double_array1#x, double_array2#x, float_array1#x, float_array2#x, date_array1#x, data_array2#x, timestamp_array1#x, timestamp_array2#x, string_array1#x, string_array2#x, array_array1#x, array_array2#x, struct_array1#x, ... 3 more fields]) + +- Project [cast(boolean_array1#x as array) AS boolean_array1#x, cast(boolean_array2#x as array) AS boolean_array2#x, cast(tinyint_array1#x as array) AS tinyint_array1#x, cast(tinyint_array2#x as array) AS tinyint_array2#x, cast(smallint_array1#x as array) AS smallint_array1#x, cast(smallint_array2#x as array) AS smallint_array2#x, cast(int_array1#x as array) AS int_array1#x, cast(int_array2#x as array) AS int_array2#x, cast(bigint_array1#x as array) AS bigint_array1#x, cast(bigint_array2#x as array) AS bigint_array2#x, cast(decimal_array1#x as array) AS decimal_array1#x, cast(decimal_array2#x as array) AS decimal_array2#x, cast(double_array1#x as array) AS double_array1#x, cast(double_array2#x as array) AS double_array2#x, cast(float_array1#x as array) AS float_array1#x, cast(float_array2#x as array) AS float_array2#x, cast(date_array1#x as array) AS date_array1#x, cast(data_array2#x as array) AS data_array2#x, cast(timestamp_array1#x as array) AS timestamp_array1#x, cast(timestamp_array2#x as array) AS timestamp_array2#x, cast(string_array1#x as array) AS string_array1#x, cast(string_array2#x as array) AS string_array2#x, cast(array_array1#x as array>) AS array_array1#x, cast(array_array2#x as array>) AS array_array2#x, cast(struct_array1#x as array>) AS struct_array1#x, ... 
3 more fields] + +- Project [boolean_array1#x, boolean_array2#x, tinyint_array1#x, tinyint_array2#x, smallint_array1#x, smallint_array2#x, int_array1#x, int_array2#x, bigint_array1#x, bigint_array2#x, decimal_array1#x, decimal_array2#x, double_array1#x, double_array2#x, float_array1#x, float_array2#x, date_array1#x, data_array2#x, timestamp_array1#x, timestamp_array2#x, string_array1#x, string_array2#x, array_array1#x, array_array2#x, struct_array1#x, ... 3 more fields] +- SubqueryAlias various_arrays - +- LocalRelation [boolean_array1#x, boolean_array2#x, tinyint_array1#x, tinyint_array2#x, smallint_array1#x, smallint_array2#x, int_array1#x, int_array2#x, bigint_array1#x, bigint_array2#x, decimal_array1#x, decimal_array2#x, double_array1#x, double_array2#x, float_array1#x, float_array2#x, date_array1#x, data_array2#x, timestamp_array1#x, timestamp_array2#x, string_array1#x, string_array2#x, array_array1#x, array_array2#x, ... 4 more fields] + +- LocalRelation [boolean_array1#x, boolean_array2#x, tinyint_array1#x, tinyint_array2#x, smallint_array1#x, smallint_array2#x, int_array1#x, int_array2#x, bigint_array1#x, bigint_array2#x, decimal_array1#x, decimal_array2#x, double_array1#x, double_array2#x, float_array1#x, float_array2#x, date_array1#x, data_array2#x, timestamp_array1#x, timestamp_array2#x, string_array1#x, string_array2#x, array_array1#x, array_array2#x, struct_array1#x, ... 
3 more fields] diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/typeCoercion/native/mapconcat.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/typeCoercion/native/mapconcat.sql.out index 62bbdeba34c2e..7ae45bc0f241a 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/typeCoercion/native/mapconcat.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/typeCoercion/native/mapconcat.sql.out @@ -71,9 +71,9 @@ CreateViewCommand `various_maps`, SELECT * FROM VALUES ( string_int_map1, string_int_map2, int_string_map1, int_string_map2 ), false, false, LocalTempView, UNSUPPORTED, true - +- Project [boolean_map1#x, boolean_map2#x, tinyint_map1#x, tinyint_map2#x, smallint_map1#x, smallint_map2#x, int_map1#x, int_map2#x, bigint_map1#x, bigint_map2#x, decimal_map1#x, decimal_map2#x, double_map1#x, double_map2#x, float_map1#x, float_map2#x, date_map1#x, date_map2#x, timestamp_map1#x, timestamp_map2#x, string_map1#x, string_map2#x, array_map1#x, array_map2#x, ... 6 more fields] + +- Project [boolean_map1#x, boolean_map2#x, tinyint_map1#x, tinyint_map2#x, smallint_map1#x, smallint_map2#x, int_map1#x, int_map2#x, bigint_map1#x, bigint_map2#x, decimal_map1#x, decimal_map2#x, double_map1#x, double_map2#x, float_map1#x, float_map2#x, date_map1#x, date_map2#x, timestamp_map1#x, timestamp_map2#x, string_map1#x, string_map2#x, array_map1#x, array_map2#x, struct_map1#x, ... 5 more fields] +- SubqueryAlias various_maps - +- LocalRelation [boolean_map1#x, boolean_map2#x, tinyint_map1#x, tinyint_map2#x, smallint_map1#x, smallint_map2#x, int_map1#x, int_map2#x, bigint_map1#x, bigint_map2#x, decimal_map1#x, decimal_map2#x, double_map1#x, double_map2#x, float_map1#x, float_map2#x, date_map1#x, date_map2#x, timestamp_map1#x, timestamp_map2#x, string_map1#x, string_map2#x, array_map1#x, array_map2#x, ... 
6 more fields] + +- LocalRelation [boolean_map1#x, boolean_map2#x, tinyint_map1#x, tinyint_map2#x, smallint_map1#x, smallint_map2#x, int_map1#x, int_map2#x, bigint_map1#x, bigint_map2#x, decimal_map1#x, decimal_map2#x, double_map1#x, double_map2#x, float_map1#x, float_map2#x, date_map1#x, date_map2#x, timestamp_map1#x, timestamp_map2#x, string_map1#x, string_map2#x, array_map1#x, array_map2#x, struct_map1#x, ... 5 more fields] -- !query @@ -97,11 +97,11 @@ FROM various_maps -- !query analysis Project [map_concat(boolean_map1#x, boolean_map2#x) AS boolean_map#x, map_concat(tinyint_map1#x, tinyint_map2#x) AS tinyint_map#x, map_concat(smallint_map1#x, smallint_map2#x) AS smallint_map#x, map_concat(int_map1#x, int_map2#x) AS int_map#x, map_concat(bigint_map1#x, bigint_map2#x) AS bigint_map#x, map_concat(decimal_map1#x, decimal_map2#x) AS decimal_map#x, map_concat(float_map1#x, float_map2#x) AS float_map#x, map_concat(double_map1#x, double_map2#x) AS double_map#x, map_concat(date_map1#x, date_map2#x) AS date_map#x, map_concat(timestamp_map1#x, timestamp_map2#x) AS timestamp_map#x, map_concat(string_map1#x, string_map2#x) AS string_map#x, map_concat(array_map1#x, array_map2#x) AS array_map#x, map_concat(struct_map1#x, struct_map2#x) AS struct_map#x, map_concat(string_int_map1#x, string_int_map2#x) AS string_int_map#x, map_concat(int_string_map1#x, int_string_map2#x) AS int_string_map#x] +- SubqueryAlias various_maps - +- View (`various_maps`, [boolean_map1#x, boolean_map2#x, tinyint_map1#x, tinyint_map2#x, smallint_map1#x, smallint_map2#x, int_map1#x, int_map2#x, bigint_map1#x, bigint_map2#x, decimal_map1#x, decimal_map2#x, double_map1#x, double_map2#x, float_map1#x, float_map2#x, date_map1#x, date_map2#x, timestamp_map1#x, timestamp_map2#x, string_map1#x, string_map2#x, array_map1#x, array_map2#x, ... 
6 more fields]) - +- Project [cast(boolean_map1#x as map) AS boolean_map1#x, cast(boolean_map2#x as map) AS boolean_map2#x, cast(tinyint_map1#x as map) AS tinyint_map1#x, cast(tinyint_map2#x as map) AS tinyint_map2#x, cast(smallint_map1#x as map) AS smallint_map1#x, cast(smallint_map2#x as map) AS smallint_map2#x, cast(int_map1#x as map) AS int_map1#x, cast(int_map2#x as map) AS int_map2#x, cast(bigint_map1#x as map) AS bigint_map1#x, cast(bigint_map2#x as map) AS bigint_map2#x, cast(decimal_map1#x as map) AS decimal_map1#x, cast(decimal_map2#x as map) AS decimal_map2#x, cast(double_map1#x as map) AS double_map1#x, cast(double_map2#x as map) AS double_map2#x, cast(float_map1#x as map) AS float_map1#x, cast(float_map2#x as map) AS float_map2#x, cast(date_map1#x as map) AS date_map1#x, cast(date_map2#x as map) AS date_map2#x, cast(timestamp_map1#x as map) AS timestamp_map1#x, cast(timestamp_map2#x as map) AS timestamp_map2#x, cast(string_map1#x as map) AS string_map1#x, cast(string_map2#x as map) AS string_map2#x, cast(array_map1#x as map,array>) AS array_map1#x, cast(array_map2#x as map,array>) AS array_map2#x, ... 6 more fields] - +- Project [boolean_map1#x, boolean_map2#x, tinyint_map1#x, tinyint_map2#x, smallint_map1#x, smallint_map2#x, int_map1#x, int_map2#x, bigint_map1#x, bigint_map2#x, decimal_map1#x, decimal_map2#x, double_map1#x, double_map2#x, float_map1#x, float_map2#x, date_map1#x, date_map2#x, timestamp_map1#x, timestamp_map2#x, string_map1#x, string_map2#x, array_map1#x, array_map2#x, ... 6 more fields] + +- View (`various_maps`, [boolean_map1#x, boolean_map2#x, tinyint_map1#x, tinyint_map2#x, smallint_map1#x, smallint_map2#x, int_map1#x, int_map2#x, bigint_map1#x, bigint_map2#x, decimal_map1#x, decimal_map2#x, double_map1#x, double_map2#x, float_map1#x, float_map2#x, date_map1#x, date_map2#x, timestamp_map1#x, timestamp_map2#x, string_map1#x, string_map2#x, array_map1#x, array_map2#x, struct_map1#x, ... 
5 more fields]) + +- Project [cast(boolean_map1#x as map) AS boolean_map1#x, cast(boolean_map2#x as map) AS boolean_map2#x, cast(tinyint_map1#x as map) AS tinyint_map1#x, cast(tinyint_map2#x as map) AS tinyint_map2#x, cast(smallint_map1#x as map) AS smallint_map1#x, cast(smallint_map2#x as map) AS smallint_map2#x, cast(int_map1#x as map) AS int_map1#x, cast(int_map2#x as map) AS int_map2#x, cast(bigint_map1#x as map) AS bigint_map1#x, cast(bigint_map2#x as map) AS bigint_map2#x, cast(decimal_map1#x as map) AS decimal_map1#x, cast(decimal_map2#x as map) AS decimal_map2#x, cast(double_map1#x as map) AS double_map1#x, cast(double_map2#x as map) AS double_map2#x, cast(float_map1#x as map) AS float_map1#x, cast(float_map2#x as map) AS float_map2#x, cast(date_map1#x as map) AS date_map1#x, cast(date_map2#x as map) AS date_map2#x, cast(timestamp_map1#x as map) AS timestamp_map1#x, cast(timestamp_map2#x as map) AS timestamp_map2#x, cast(string_map1#x as map) AS string_map1#x, cast(string_map2#x as map) AS string_map2#x, cast(array_map1#x as map,array>) AS array_map1#x, cast(array_map2#x as map,array>) AS array_map2#x, cast(struct_map1#x as map,struct>) AS struct_map1#x, ... 5 more fields] + +- Project [boolean_map1#x, boolean_map2#x, tinyint_map1#x, tinyint_map2#x, smallint_map1#x, smallint_map2#x, int_map1#x, int_map2#x, bigint_map1#x, bigint_map2#x, decimal_map1#x, decimal_map2#x, double_map1#x, double_map2#x, float_map1#x, float_map2#x, date_map1#x, date_map2#x, timestamp_map1#x, timestamp_map2#x, string_map1#x, string_map2#x, array_map1#x, array_map2#x, struct_map1#x, ... 
5 more fields] +- SubqueryAlias various_maps - +- LocalRelation [boolean_map1#x, boolean_map2#x, tinyint_map1#x, tinyint_map2#x, smallint_map1#x, smallint_map2#x, int_map1#x, int_map2#x, bigint_map1#x, bigint_map2#x, decimal_map1#x, decimal_map2#x, double_map1#x, double_map2#x, float_map1#x, float_map2#x, date_map1#x, date_map2#x, timestamp_map1#x, timestamp_map2#x, string_map1#x, string_map2#x, array_map1#x, array_map2#x, ... 6 more fields] + +- LocalRelation [boolean_map1#x, boolean_map2#x, tinyint_map1#x, tinyint_map2#x, smallint_map1#x, smallint_map2#x, int_map1#x, int_map2#x, bigint_map1#x, bigint_map2#x, decimal_map1#x, decimal_map2#x, double_map1#x, double_map2#x, float_map1#x, float_map2#x, date_map1#x, date_map2#x, timestamp_map1#x, timestamp_map2#x, string_map1#x, string_map2#x, array_map1#x, array_map2#x, struct_map1#x, ... 5 more fields] -- !query diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/udf/postgreSQL/udf-join.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/udf/postgreSQL/udf-join.sql.out index 1d76b9a8be8b9..27f02a3dcdb64 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/udf/postgreSQL/udf-join.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/udf/postgreSQL/udf-join.sql.out @@ -1330,7 +1330,7 @@ Aggregate [cast(udf(cast(cast(udf(cast(count(1) as string)) as bigint) as string +- Filter cast(udf(cast(cast(udf(cast(unique1#x as string)) as int) as string)) as int) IN (list#x []) : +- Project [cast(udf(cast(unique1#x as string)) as int) AS udf(unique1)#x] : +- Filter (cast(udf(cast(cast(udf(cast(unique2#x as string)) as int) as string)) as int) = cast(udf(cast(42 as string)) as int)) - : +- Project [unique1#x, unique2#x, two#x, four#x, ten#x, twenty#x, hundred#x, thousand#x, twothousand#x, fivethous#x, tenthous#x, odd#x, even#x, stringu1#x, stringu2#x, string4#x, unique2#x, two#x, four#x, ten#x, twenty#x, hundred#x, thousand#x, twothousand#x, ... 
7 more fields] + : +- Project [unique1#x, unique2#x, two#x, four#x, ten#x, twenty#x, hundred#x, thousand#x, twothousand#x, fivethous#x, tenthous#x, odd#x, even#x, stringu1#x, stringu2#x, string4#x, unique2#x, two#x, four#x, ten#x, twenty#x, hundred#x, thousand#x, twothousand#x, fivethous#x, ... 6 more fields] : +- Join Inner, (unique1#x = unique1#x) : :- SubqueryAlias b : : +- SubqueryAlias spark_catalog.default.tenk1 diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/udf/udf-window.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/udf/udf-window.sql.out index f4e11d7628601..f9ff41cc81a6d 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/udf/udf-window.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/udf/udf-window.sql.out @@ -385,11 +385,11 @@ FROM testData WINDOW w AS (PARTITION BY udf(cate) ORDER BY udf(val)) ORDER BY cate, udf(val) -- !query analysis -Project [udf(val)#x, cate#x, max#x, min#x, min#x, count#xL, sum#xL, avg#x, stddev#x, first_value#x, first_value_ignore_null#x, first_value_contain_null#x, any_value#x, any_value_ignore_null#x, any_value_contain_null#x, last_value#x, last_value_ignore_null#x, last_value_contain_null#x, rank#x, dense_rank#x, cume_dist#x, percent_rank#x, ntile#x, row_number#x, ... 11 more fields] +Project [udf(val)#x, cate#x, max#x, min#x, min#x, count#xL, sum#xL, avg#x, stddev#x, first_value#x, first_value_ignore_null#x, first_value_contain_null#x, any_value#x, any_value_ignore_null#x, any_value_contain_null#x, last_value#x, last_value_ignore_null#x, last_value_contain_null#x, rank#x, dense_rank#x, cume_dist#x, percent_rank#x, ntile#x, row_number#x, var_pop#x, ... 
10 more fields] +- Sort [cate#x ASC NULLS FIRST, cast(udf(cast(val#x as string)) as int) ASC NULLS FIRST], true - +- Project [udf(val)#x, cate#x, max#x, min#x, min#x, count#xL, sum#xL, avg#x, stddev#x, first_value#x, first_value_ignore_null#x, first_value_contain_null#x, any_value#x, any_value_ignore_null#x, any_value_contain_null#x, last_value#x, last_value_ignore_null#x, last_value_contain_null#x, rank#x, dense_rank#x, cume_dist#x, percent_rank#x, ntile#x, row_number#x, ... 12 more fields] - +- Project [udf(val)#x, cate#x, _w0#x, _w1#x, _w2#x, _w3#x, _w4#x, max#x, min#x, min#x, count#xL, sum#xL, avg#x, stddev#x, first_value#x, first_value_ignore_null#x, first_value_contain_null#x, any_value#x, any_value_ignore_null#x, any_value_contain_null#x, last_value#x, last_value_ignore_null#x, last_value_contain_null#x, rank#x, ... 50 more fields] - +- Window [max(_w0#x) windowspecdefinition(_w1#x, _w0#x ASC NULLS FIRST, specifiedwindowframe(RangeFrame, unboundedpreceding$(), currentrow$())) AS max#x, min(_w0#x) windowspecdefinition(_w1#x, _w0#x ASC NULLS FIRST, specifiedwindowframe(RangeFrame, unboundedpreceding$(), currentrow$())) AS min#x, min(_w0#x) windowspecdefinition(_w1#x, _w0#x ASC NULLS FIRST, specifiedwindowframe(RangeFrame, unboundedpreceding$(), currentrow$())) AS min#x, count(_w0#x) windowspecdefinition(_w1#x, _w0#x ASC NULLS FIRST, specifiedwindowframe(RangeFrame, unboundedpreceding$(), currentrow$())) AS count#xL, sum(_w0#x) windowspecdefinition(_w1#x, _w0#x ASC NULLS FIRST, specifiedwindowframe(RangeFrame, unboundedpreceding$(), currentrow$())) AS sum#xL, avg(_w0#x) windowspecdefinition(_w1#x, _w0#x ASC NULLS FIRST, specifiedwindowframe(RangeFrame, unboundedpreceding$(), currentrow$())) AS avg#x, stddev(_w2#x) windowspecdefinition(_w1#x, _w0#x ASC NULLS FIRST, specifiedwindowframe(RangeFrame, unboundedpreceding$(), currentrow$())) AS stddev#x, first_value(_w0#x, false) windowspecdefinition(_w1#x, _w0#x ASC NULLS FIRST, specifiedwindowframe(RangeFrame, 
unboundedpreceding$(), currentrow$())) AS first_value#x, first_value(_w0#x, true) windowspecdefinition(_w1#x, _w0#x ASC NULLS FIRST, specifiedwindowframe(RangeFrame, unboundedpreceding$(), currentrow$())) AS first_value_ignore_null#x, first_value(_w0#x, false) windowspecdefinition(_w1#x, _w0#x ASC NULLS FIRST, specifiedwindowframe(RangeFrame, unboundedpreceding$(), currentrow$())) AS first_value_contain_null#x, any_value(_w0#x, false) windowspecdefinition(_w1#x, _w0#x ASC NULLS FIRST, specifiedwindowframe(RangeFrame, unboundedpreceding$(), currentrow$())) AS any_value#x, any_value(_w0#x, true) windowspecdefinition(_w1#x, _w0#x ASC NULLS FIRST, specifiedwindowframe(RangeFrame, unboundedpreceding$(), currentrow$())) AS any_value_ignore_null#x, any_value(_w0#x, false) windowspecdefinition(_w1#x, _w0#x ASC NULLS FIRST, specifiedwindowframe(RangeFrame, unboundedpreceding$(), currentrow$())) AS any_value_contain_null#x, last_value(_w0#x, false) windowspecdefinition(_w1#x, _w0#x ASC NULLS FIRST, specifiedwindowframe(RangeFrame, unboundedpreceding$(), currentrow$())) AS last_value#x, last_value(_w0#x, true) windowspecdefinition(_w1#x, _w0#x ASC NULLS FIRST, specifiedwindowframe(RangeFrame, unboundedpreceding$(), currentrow$())) AS last_value_ignore_null#x, last_value(_w0#x, false) windowspecdefinition(_w1#x, _w0#x ASC NULLS FIRST, specifiedwindowframe(RangeFrame, unboundedpreceding$(), currentrow$())) AS last_value_contain_null#x, rank(_w0#x) windowspecdefinition(_w1#x, _w0#x ASC NULLS FIRST, specifiedwindowframe(RowFrame, unboundedpreceding$(), currentrow$())) AS rank#x, dense_rank(_w0#x) windowspecdefinition(_w1#x, _w0#x ASC NULLS FIRST, specifiedwindowframe(RowFrame, unboundedpreceding$(), currentrow$())) AS dense_rank#x, cume_dist() windowspecdefinition(_w1#x, _w0#x ASC NULLS FIRST, specifiedwindowframe(RangeFrame, unboundedpreceding$(), currentrow$())) AS cume_dist#x, percent_rank(_w0#x) windowspecdefinition(_w1#x, _w0#x ASC NULLS FIRST, specifiedwindowframe(RowFrame, 
unboundedpreceding$(), currentrow$())) AS percent_rank#x, ntile(2) windowspecdefinition(_w1#x, _w0#x ASC NULLS FIRST, specifiedwindowframe(RowFrame, unboundedpreceding$(), currentrow$())) AS ntile#x, row_number() windowspecdefinition(_w1#x, _w0#x ASC NULLS FIRST, specifiedwindowframe(RowFrame, unboundedpreceding$(), currentrow$())) AS row_number#x, var_pop(_w2#x) windowspecdefinition(_w1#x, _w0#x ASC NULLS FIRST, specifiedwindowframe(RangeFrame, unboundedpreceding$(), currentrow$())) AS var_pop#x, var_samp(_w2#x) windowspecdefinition(_w1#x, _w0#x ASC NULLS FIRST, specifiedwindowframe(RangeFrame, unboundedpreceding$(), currentrow$())) AS var_samp#x, ... 9 more fields], [_w1#x], [_w0#x ASC NULLS FIRST] + +- Project [udf(val)#x, cate#x, max#x, min#x, min#x, count#xL, sum#xL, avg#x, stddev#x, first_value#x, first_value_ignore_null#x, first_value_contain_null#x, any_value#x, any_value_ignore_null#x, any_value_contain_null#x, last_value#x, last_value_ignore_null#x, last_value_contain_null#x, rank#x, dense_rank#x, cume_dist#x, percent_rank#x, ntile#x, row_number#x, var_pop#x, ... 11 more fields] + +- Project [udf(val)#x, cate#x, _w0#x, _w1#x, _w2#x, _w3#x, _w4#x, max#x, min#x, min#x, count#xL, sum#xL, avg#x, stddev#x, first_value#x, first_value_ignore_null#x, first_value_contain_null#x, any_value#x, any_value_ignore_null#x, any_value_contain_null#x, last_value#x, last_value_ignore_null#x, last_value_contain_null#x, rank#x, dense_rank#x, ... 
49 more fields] + +- Window [max(_w0#x) windowspecdefinition(_w1#x, _w0#x ASC NULLS FIRST, specifiedwindowframe(RangeFrame, unboundedpreceding$(), currentrow$())) AS max#x, min(_w0#x) windowspecdefinition(_w1#x, _w0#x ASC NULLS FIRST, specifiedwindowframe(RangeFrame, unboundedpreceding$(), currentrow$())) AS min#x, min(_w0#x) windowspecdefinition(_w1#x, _w0#x ASC NULLS FIRST, specifiedwindowframe(RangeFrame, unboundedpreceding$(), currentrow$())) AS min#x, count(_w0#x) windowspecdefinition(_w1#x, _w0#x ASC NULLS FIRST, specifiedwindowframe(RangeFrame, unboundedpreceding$(), currentrow$())) AS count#xL, sum(_w0#x) windowspecdefinition(_w1#x, _w0#x ASC NULLS FIRST, specifiedwindowframe(RangeFrame, unboundedpreceding$(), currentrow$())) AS sum#xL, avg(_w0#x) windowspecdefinition(_w1#x, _w0#x ASC NULLS FIRST, specifiedwindowframe(RangeFrame, unboundedpreceding$(), currentrow$())) AS avg#x, stddev(_w2#x) windowspecdefinition(_w1#x, _w0#x ASC NULLS FIRST, specifiedwindowframe(RangeFrame, unboundedpreceding$(), currentrow$())) AS stddev#x, first_value(_w0#x, false) windowspecdefinition(_w1#x, _w0#x ASC NULLS FIRST, specifiedwindowframe(RangeFrame, unboundedpreceding$(), currentrow$())) AS first_value#x, first_value(_w0#x, true) windowspecdefinition(_w1#x, _w0#x ASC NULLS FIRST, specifiedwindowframe(RangeFrame, unboundedpreceding$(), currentrow$())) AS first_value_ignore_null#x, first_value(_w0#x, false) windowspecdefinition(_w1#x, _w0#x ASC NULLS FIRST, specifiedwindowframe(RangeFrame, unboundedpreceding$(), currentrow$())) AS first_value_contain_null#x, any_value(_w0#x, false) windowspecdefinition(_w1#x, _w0#x ASC NULLS FIRST, specifiedwindowframe(RangeFrame, unboundedpreceding$(), currentrow$())) AS any_value#x, any_value(_w0#x, true) windowspecdefinition(_w1#x, _w0#x ASC NULLS FIRST, specifiedwindowframe(RangeFrame, unboundedpreceding$(), currentrow$())) AS any_value_ignore_null#x, any_value(_w0#x, false) windowspecdefinition(_w1#x, _w0#x ASC NULLS FIRST, 
specifiedwindowframe(RangeFrame, unboundedpreceding$(), currentrow$())) AS any_value_contain_null#x, last_value(_w0#x, false) windowspecdefinition(_w1#x, _w0#x ASC NULLS FIRST, specifiedwindowframe(RangeFrame, unboundedpreceding$(), currentrow$())) AS last_value#x, last_value(_w0#x, true) windowspecdefinition(_w1#x, _w0#x ASC NULLS FIRST, specifiedwindowframe(RangeFrame, unboundedpreceding$(), currentrow$())) AS last_value_ignore_null#x, last_value(_w0#x, false) windowspecdefinition(_w1#x, _w0#x ASC NULLS FIRST, specifiedwindowframe(RangeFrame, unboundedpreceding$(), currentrow$())) AS last_value_contain_null#x, rank(_w0#x) windowspecdefinition(_w1#x, _w0#x ASC NULLS FIRST, specifiedwindowframe(RowFrame, unboundedpreceding$(), currentrow$())) AS rank#x, dense_rank(_w0#x) windowspecdefinition(_w1#x, _w0#x ASC NULLS FIRST, specifiedwindowframe(RowFrame, unboundedpreceding$(), currentrow$())) AS dense_rank#x, cume_dist() windowspecdefinition(_w1#x, _w0#x ASC NULLS FIRST, specifiedwindowframe(RangeFrame, unboundedpreceding$(), currentrow$())) AS cume_dist#x, percent_rank(_w0#x) windowspecdefinition(_w1#x, _w0#x ASC NULLS FIRST, specifiedwindowframe(RowFrame, unboundedpreceding$(), currentrow$())) AS percent_rank#x, ntile(2) windowspecdefinition(_w1#x, _w0#x ASC NULLS FIRST, specifiedwindowframe(RowFrame, unboundedpreceding$(), currentrow$())) AS ntile#x, row_number() windowspecdefinition(_w1#x, _w0#x ASC NULLS FIRST, specifiedwindowframe(RowFrame, unboundedpreceding$(), currentrow$())) AS row_number#x, var_pop(_w2#x) windowspecdefinition(_w1#x, _w0#x ASC NULLS FIRST, specifiedwindowframe(RangeFrame, unboundedpreceding$(), currentrow$())) AS var_pop#x, var_samp(_w2#x) windowspecdefinition(_w1#x, _w0#x ASC NULLS FIRST, specifiedwindowframe(RangeFrame, unboundedpreceding$(), currentrow$())) AS var_samp#x, approx_count_distinct(_w0#x, 0.05, 0, 0) windowspecdefinition(_w1#x, _w0#x ASC NULLS FIRST, specifiedwindowframe(RangeFrame, unboundedpreceding$(), currentrow$())) AS 
approx_count_distinct#xL, ... 8 more fields], [_w1#x], [_w0#x ASC NULLS FIRST] +- Project [cast(udf(cast(val#x as string)) as int) AS udf(val)#x, cate#x, cast(udf(cast(val#x as string)) as int) AS _w0#x, cast(udf(cast(cate#x as string)) as string) AS _w1#x, cast(cast(udf(cast(val#x as string)) as int) as double) AS _w2#x, cast(cast(udf(cast(val_long#xL as string)) as bigint) as double) AS _w3#x, cast(udf(cast(val_double#x as string)) as double) AS _w4#x, val#x] +- SubqueryAlias testdata +- View (`testData`, [val#x, val_long#xL, val_double#x, val_date#x, val_timestamp#x, cate#x]) diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/using-join.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/using-join.sql.out index 5a74c4be107e3..d26c5ba4430da 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/using-join.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/using-join.sql.out @@ -833,6 +833,6 @@ WithCTE +- Project [coalesce(key#x, key#x) AS key#x, key#x, key#x, key#x] +- Join FullOuter, (key#x = key#x) :- SubqueryAlias t1 - : +- CTERelationRef xxxx, true, [key#x], false + : +- CTERelationRef xxxx, true, [key#x], false, false +- SubqueryAlias t2 - +- CTERelationRef xxxx, true, [key#x], false + +- CTERelationRef xxxx, true, [key#x], false, false diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/window.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/window.sql.out index c1638096312bd..f8a03652c02b6 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/window.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/window.sql.out @@ -583,9 +583,9 @@ WINDOW w AS (PARTITION BY cate ORDER BY val) ORDER BY cate, val -- !query analysis Sort [cate#x ASC NULLS FIRST, val#x ASC NULLS FIRST], true -+- Project [val#x, cate#x, max#x, min#x, min#x, count#xL, sum#xL, avg#x, stddev#x, first_value#x, first_value_ignore_null#x, first_value_contain_null#x, 
any_value#x, any_value_ignore_null#x, any_value_contain_null#x, last_value#x, last_value_ignore_null#x, last_value_contain_null#x, rank#x, dense_rank#x, cume_dist#x, percent_rank#x, ntile#x, row_number#x, ... 11 more fields] - +- Project [val#x, cate#x, _w0#x, _w1#x, val_double#x, max#x, min#x, min#x, count#xL, sum#xL, avg#x, stddev#x, first_value#x, first_value_ignore_null#x, first_value_contain_null#x, any_value#x, any_value_ignore_null#x, any_value_contain_null#x, last_value#x, last_value_ignore_null#x, last_value_contain_null#x, rank#x, dense_rank#x, cume_dist#x, ... 47 more fields] - +- Window [max(val#x) windowspecdefinition(cate#x, val#x ASC NULLS FIRST, specifiedwindowframe(RangeFrame, unboundedpreceding$(), currentrow$())) AS max#x, min(val#x) windowspecdefinition(cate#x, val#x ASC NULLS FIRST, specifiedwindowframe(RangeFrame, unboundedpreceding$(), currentrow$())) AS min#x, min(val#x) windowspecdefinition(cate#x, val#x ASC NULLS FIRST, specifiedwindowframe(RangeFrame, unboundedpreceding$(), currentrow$())) AS min#x, count(val#x) windowspecdefinition(cate#x, val#x ASC NULLS FIRST, specifiedwindowframe(RangeFrame, unboundedpreceding$(), currentrow$())) AS count#xL, sum(val#x) windowspecdefinition(cate#x, val#x ASC NULLS FIRST, specifiedwindowframe(RangeFrame, unboundedpreceding$(), currentrow$())) AS sum#xL, avg(val#x) windowspecdefinition(cate#x, val#x ASC NULLS FIRST, specifiedwindowframe(RangeFrame, unboundedpreceding$(), currentrow$())) AS avg#x, stddev(_w0#x) windowspecdefinition(cate#x, val#x ASC NULLS FIRST, specifiedwindowframe(RangeFrame, unboundedpreceding$(), currentrow$())) AS stddev#x, first_value(val#x, false) windowspecdefinition(cate#x, val#x ASC NULLS FIRST, specifiedwindowframe(RangeFrame, unboundedpreceding$(), currentrow$())) AS first_value#x, first_value(val#x, true) windowspecdefinition(cate#x, val#x ASC NULLS FIRST, specifiedwindowframe(RangeFrame, unboundedpreceding$(), currentrow$())) AS first_value_ignore_null#x, first_value(val#x, 
false) windowspecdefinition(cate#x, val#x ASC NULLS FIRST, specifiedwindowframe(RangeFrame, unboundedpreceding$(), currentrow$())) AS first_value_contain_null#x, any_value(val#x, false) windowspecdefinition(cate#x, val#x ASC NULLS FIRST, specifiedwindowframe(RangeFrame, unboundedpreceding$(), currentrow$())) AS any_value#x, any_value(val#x, true) windowspecdefinition(cate#x, val#x ASC NULLS FIRST, specifiedwindowframe(RangeFrame, unboundedpreceding$(), currentrow$())) AS any_value_ignore_null#x, any_value(val#x, false) windowspecdefinition(cate#x, val#x ASC NULLS FIRST, specifiedwindowframe(RangeFrame, unboundedpreceding$(), currentrow$())) AS any_value_contain_null#x, last_value(val#x, false) windowspecdefinition(cate#x, val#x ASC NULLS FIRST, specifiedwindowframe(RangeFrame, unboundedpreceding$(), currentrow$())) AS last_value#x, last_value(val#x, true) windowspecdefinition(cate#x, val#x ASC NULLS FIRST, specifiedwindowframe(RangeFrame, unboundedpreceding$(), currentrow$())) AS last_value_ignore_null#x, last_value(val#x, false) windowspecdefinition(cate#x, val#x ASC NULLS FIRST, specifiedwindowframe(RangeFrame, unboundedpreceding$(), currentrow$())) AS last_value_contain_null#x, rank(val#x) windowspecdefinition(cate#x, val#x ASC NULLS FIRST, specifiedwindowframe(RowFrame, unboundedpreceding$(), currentrow$())) AS rank#x, dense_rank(val#x) windowspecdefinition(cate#x, val#x ASC NULLS FIRST, specifiedwindowframe(RowFrame, unboundedpreceding$(), currentrow$())) AS dense_rank#x, cume_dist() windowspecdefinition(cate#x, val#x ASC NULLS FIRST, specifiedwindowframe(RangeFrame, unboundedpreceding$(), currentrow$())) AS cume_dist#x, percent_rank(val#x) windowspecdefinition(cate#x, val#x ASC NULLS FIRST, specifiedwindowframe(RowFrame, unboundedpreceding$(), currentrow$())) AS percent_rank#x, ntile(2) windowspecdefinition(cate#x, val#x ASC NULLS FIRST, specifiedwindowframe(RowFrame, unboundedpreceding$(), currentrow$())) AS ntile#x, row_number() windowspecdefinition(cate#x, 
val#x ASC NULLS FIRST, specifiedwindowframe(RowFrame, unboundedpreceding$(), currentrow$())) AS row_number#x, var_pop(_w0#x) windowspecdefinition(cate#x, val#x ASC NULLS FIRST, specifiedwindowframe(RangeFrame, unboundedpreceding$(), currentrow$())) AS var_pop#x, var_samp(_w0#x) windowspecdefinition(cate#x, val#x ASC NULLS FIRST, specifiedwindowframe(RangeFrame, unboundedpreceding$(), currentrow$())) AS var_samp#x, ... 9 more fields], [cate#x], [val#x ASC NULLS FIRST] ++- Project [val#x, cate#x, max#x, min#x, min#x, count#xL, sum#xL, avg#x, stddev#x, first_value#x, first_value_ignore_null#x, first_value_contain_null#x, any_value#x, any_value_ignore_null#x, any_value_contain_null#x, last_value#x, last_value_ignore_null#x, last_value_contain_null#x, rank#x, dense_rank#x, cume_dist#x, percent_rank#x, ntile#x, row_number#x, var_pop#x, ... 10 more fields] + +- Project [val#x, cate#x, _w0#x, _w1#x, val_double#x, max#x, min#x, min#x, count#xL, sum#xL, avg#x, stddev#x, first_value#x, first_value_ignore_null#x, first_value_contain_null#x, any_value#x, any_value_ignore_null#x, any_value_contain_null#x, last_value#x, last_value_ignore_null#x, last_value_contain_null#x, rank#x, dense_rank#x, cume_dist#x, percent_rank#x, ... 
46 more fields] + +- Window [max(val#x) windowspecdefinition(cate#x, val#x ASC NULLS FIRST, specifiedwindowframe(RangeFrame, unboundedpreceding$(), currentrow$())) AS max#x, min(val#x) windowspecdefinition(cate#x, val#x ASC NULLS FIRST, specifiedwindowframe(RangeFrame, unboundedpreceding$(), currentrow$())) AS min#x, min(val#x) windowspecdefinition(cate#x, val#x ASC NULLS FIRST, specifiedwindowframe(RangeFrame, unboundedpreceding$(), currentrow$())) AS min#x, count(val#x) windowspecdefinition(cate#x, val#x ASC NULLS FIRST, specifiedwindowframe(RangeFrame, unboundedpreceding$(), currentrow$())) AS count#xL, sum(val#x) windowspecdefinition(cate#x, val#x ASC NULLS FIRST, specifiedwindowframe(RangeFrame, unboundedpreceding$(), currentrow$())) AS sum#xL, avg(val#x) windowspecdefinition(cate#x, val#x ASC NULLS FIRST, specifiedwindowframe(RangeFrame, unboundedpreceding$(), currentrow$())) AS avg#x, stddev(_w0#x) windowspecdefinition(cate#x, val#x ASC NULLS FIRST, specifiedwindowframe(RangeFrame, unboundedpreceding$(), currentrow$())) AS stddev#x, first_value(val#x, false) windowspecdefinition(cate#x, val#x ASC NULLS FIRST, specifiedwindowframe(RangeFrame, unboundedpreceding$(), currentrow$())) AS first_value#x, first_value(val#x, true) windowspecdefinition(cate#x, val#x ASC NULLS FIRST, specifiedwindowframe(RangeFrame, unboundedpreceding$(), currentrow$())) AS first_value_ignore_null#x, first_value(val#x, false) windowspecdefinition(cate#x, val#x ASC NULLS FIRST, specifiedwindowframe(RangeFrame, unboundedpreceding$(), currentrow$())) AS first_value_contain_null#x, any_value(val#x, false) windowspecdefinition(cate#x, val#x ASC NULLS FIRST, specifiedwindowframe(RangeFrame, unboundedpreceding$(), currentrow$())) AS any_value#x, any_value(val#x, true) windowspecdefinition(cate#x, val#x ASC NULLS FIRST, specifiedwindowframe(RangeFrame, unboundedpreceding$(), currentrow$())) AS any_value_ignore_null#x, any_value(val#x, false) windowspecdefinition(cate#x, val#x ASC NULLS FIRST, 
specifiedwindowframe(RangeFrame, unboundedpreceding$(), currentrow$())) AS any_value_contain_null#x, last_value(val#x, false) windowspecdefinition(cate#x, val#x ASC NULLS FIRST, specifiedwindowframe(RangeFrame, unboundedpreceding$(), currentrow$())) AS last_value#x, last_value(val#x, true) windowspecdefinition(cate#x, val#x ASC NULLS FIRST, specifiedwindowframe(RangeFrame, unboundedpreceding$(), currentrow$())) AS last_value_ignore_null#x, last_value(val#x, false) windowspecdefinition(cate#x, val#x ASC NULLS FIRST, specifiedwindowframe(RangeFrame, unboundedpreceding$(), currentrow$())) AS last_value_contain_null#x, rank(val#x) windowspecdefinition(cate#x, val#x ASC NULLS FIRST, specifiedwindowframe(RowFrame, unboundedpreceding$(), currentrow$())) AS rank#x, dense_rank(val#x) windowspecdefinition(cate#x, val#x ASC NULLS FIRST, specifiedwindowframe(RowFrame, unboundedpreceding$(), currentrow$())) AS dense_rank#x, cume_dist() windowspecdefinition(cate#x, val#x ASC NULLS FIRST, specifiedwindowframe(RangeFrame, unboundedpreceding$(), currentrow$())) AS cume_dist#x, percent_rank(val#x) windowspecdefinition(cate#x, val#x ASC NULLS FIRST, specifiedwindowframe(RowFrame, unboundedpreceding$(), currentrow$())) AS percent_rank#x, ntile(2) windowspecdefinition(cate#x, val#x ASC NULLS FIRST, specifiedwindowframe(RowFrame, unboundedpreceding$(), currentrow$())) AS ntile#x, row_number() windowspecdefinition(cate#x, val#x ASC NULLS FIRST, specifiedwindowframe(RowFrame, unboundedpreceding$(), currentrow$())) AS row_number#x, var_pop(_w0#x) windowspecdefinition(cate#x, val#x ASC NULLS FIRST, specifiedwindowframe(RangeFrame, unboundedpreceding$(), currentrow$())) AS var_pop#x, var_samp(_w0#x) windowspecdefinition(cate#x, val#x ASC NULLS FIRST, specifiedwindowframe(RangeFrame, unboundedpreceding$(), currentrow$())) AS var_samp#x, approx_count_distinct(val#x, 0.05, 0, 0) windowspecdefinition(cate#x, val#x ASC NULLS FIRST, specifiedwindowframe(RangeFrame, unboundedpreceding$(), 
currentrow$())) AS approx_count_distinct#xL, ... 8 more fields], [cate#x], [val#x ASC NULLS FIRST] +- Project [val#x, cate#x, cast(val#x as double) AS _w0#x, cast(val_long#xL as double) AS _w1#x, val_double#x] +- SubqueryAlias testdata +- View (`testData`, [val#x, val_long#xL, val_double#x, val_date#x, val_timestamp#x, cate#x]) @@ -1009,7 +1009,7 @@ ORDER BY id -- !query analysis Sort [id#x ASC NULLS FIRST], true +- Project [content#x, id#x, v#x, lead_0#x, lead_1#x, lead_2#x, lead_3#x, lag_0#x, lag_1#x, lag_2#x, lag_3#x, lag_plus_3#x, nth_value_1#x, nth_value_2#x, nth_value_3#x, first_value#x, any_value#x, last_value#x] - +- Project [content#x, id#x, v#x, lead_0#x, lead_1#x, lead_2#x, lead_3#x, lag_0#x, lag_1#x, lag_2#x, lag_3#x, lag_plus_3#x, nth_value_1#x, nth_value_2#x, nth_value_3#x, first_value#x, any_value#x, last_value#x, lead_0#x, lead_1#x, lead_2#x, lead_3#x, lag_0#x, lag_1#x, ... 9 more fields] + +- Project [content#x, id#x, v#x, lead_0#x, lead_1#x, lead_2#x, lead_3#x, lag_0#x, lag_1#x, lag_2#x, lag_3#x, lag_plus_3#x, nth_value_1#x, nth_value_2#x, nth_value_3#x, first_value#x, any_value#x, last_value#x, lead_0#x, lead_1#x, lead_2#x, lead_3#x, lag_0#x, lag_1#x, lag_2#x, ... 
8 more fields] +- Window [lead(v#x, 0, null) windowspecdefinition(id#x ASC NULLS FIRST, specifiedwindowframe(RowFrame, 0, 0)) AS lead_0#x, lead(v#x, 1, null) windowspecdefinition(id#x ASC NULLS FIRST, specifiedwindowframe(RowFrame, 1, 1)) AS lead_1#x, lead(v#x, 2, null) windowspecdefinition(id#x ASC NULLS FIRST, specifiedwindowframe(RowFrame, 2, 2)) AS lead_2#x, lead(v#x, 3, null) windowspecdefinition(id#x ASC NULLS FIRST, specifiedwindowframe(RowFrame, 3, 3)) AS lead_3#x, lag(v#x, 0, null) windowspecdefinition(id#x ASC NULLS FIRST, specifiedwindowframe(RowFrame, 0, 0)) AS lag_0#x, lag(v#x, -1, null) windowspecdefinition(id#x ASC NULLS FIRST, specifiedwindowframe(RowFrame, -1, -1)) AS lag_1#x, lag(v#x, -2, null) windowspecdefinition(id#x ASC NULLS FIRST, specifiedwindowframe(RowFrame, -2, -2)) AS lag_2#x, lag(v#x, -3, null) windowspecdefinition(id#x ASC NULLS FIRST, specifiedwindowframe(RowFrame, -3, -3)) AS lag_3#x, lag(v#x, -3, null) windowspecdefinition(id#x ASC NULLS FIRST, specifiedwindowframe(RowFrame, -3, -3)) AS lag_plus_3#x, nth_value(v#x, 1, true) windowspecdefinition(id#x ASC NULLS FIRST, specifiedwindowframe(RangeFrame, unboundedpreceding$(), currentrow$())) AS nth_value_1#x, nth_value(v#x, 2, true) windowspecdefinition(id#x ASC NULLS FIRST, specifiedwindowframe(RangeFrame, unboundedpreceding$(), currentrow$())) AS nth_value_2#x, nth_value(v#x, 3, true) windowspecdefinition(id#x ASC NULLS FIRST, specifiedwindowframe(RangeFrame, unboundedpreceding$(), currentrow$())) AS nth_value_3#x, first(v#x, true) windowspecdefinition(id#x ASC NULLS FIRST, specifiedwindowframe(RangeFrame, unboundedpreceding$(), currentrow$())) AS first_value#x, any_value(v#x, true) windowspecdefinition(id#x ASC NULLS FIRST, specifiedwindowframe(RangeFrame, unboundedpreceding$(), currentrow$())) AS any_value#x, last(v#x, true) windowspecdefinition(id#x ASC NULLS FIRST, specifiedwindowframe(RangeFrame, unboundedpreceding$(), currentrow$())) AS last_value#x], [id#x ASC NULLS FIRST] +- 
Project [content#x, id#x, v#x] +- SubqueryAlias test_ignore_null diff --git a/sql/core/src/test/resources/sql-tests/inputs/collations.sql b/sql/core/src/test/resources/sql-tests/inputs/collations.sql index b4d33bb0196c9..df15adf2f8fe4 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/collations.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/collations.sql @@ -328,6 +328,7 @@ select bit_length(utf8_binary), bit_length(utf8_lcase) from t5; select bit_length(utf8_binary collate utf8_lcase), bit_length(utf8_lcase collate utf8_binary) from t5; select octet_length(utf8_binary), octet_length(utf8_lcase) from t5; select octet_length(utf8_binary collate utf8_lcase), octet_length(utf8_lcase collate utf8_binary) from t5; +select octet_length(utf8_binary collate utf8_lcase_rtrim), octet_length(utf8_lcase collate utf8_binary_rtrim) from t5; -- Luhncheck select luhn_check(num) from t9; @@ -344,18 +345,22 @@ select levenshtein(utf8_binary, 'AaAA' collate utf8_lcase, 3), levenshtein(utf8_ -- IsValidUTF8 select is_valid_utf8(utf8_binary), is_valid_utf8(utf8_lcase) from t5; select is_valid_utf8(utf8_binary collate utf8_lcase), is_valid_utf8(utf8_lcase collate utf8_binary) from t5; +select is_valid_utf8(utf8_binary collate utf8_lcase_rtrim), is_valid_utf8(utf8_lcase collate utf8_binary_rtrim) from t5; -- MakeValidUTF8 select make_valid_utf8(utf8_binary), make_valid_utf8(utf8_lcase) from t5; select make_valid_utf8(utf8_binary collate utf8_lcase), make_valid_utf8(utf8_lcase collate utf8_binary) from t5; +select make_valid_utf8(utf8_binary collate utf8_lcase_rtrim), make_valid_utf8(utf8_lcase collate utf8_binary_rtrim) from t5; -- ValidateUTF8 select validate_utf8(utf8_binary), validate_utf8(utf8_lcase) from t5; select validate_utf8(utf8_binary collate utf8_lcase), validate_utf8(utf8_lcase collate utf8_binary) from t5; +select validate_utf8(utf8_binary collate utf8_lcase_rtrim), validate_utf8(utf8_lcase collate utf8_binary_rtrim) from t5; -- TryValidateUTF8 select 
try_validate_utf8(utf8_binary), try_validate_utf8(utf8_lcase) from t5; select try_validate_utf8(utf8_binary collate utf8_lcase), try_validate_utf8(utf8_lcase collate utf8_binary) from t5; +select try_validate_utf8(utf8_binary collate utf8_lcase_rtrim), try_validate_utf8(utf8_lcase collate utf8_binary_rtrim) from t5; -- Left/Right/Substr select substr(utf8_binary, 2, 2), substr(utf8_lcase, 2, 2) from t5; diff --git a/sql/core/src/test/resources/sql-tests/inputs/cte.sql b/sql/core/src/test/resources/sql-tests/inputs/cte.sql index 67a94ce61617d..1e17529d545bf 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/cte.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/cte.sql @@ -1,5 +1,6 @@ create temporary view t as select * from values 0, 1, 2 as t(id); create temporary view t2 as select * from values 0, 1 as t(id); +create temporary view t3 as select * from t; -- WITH clause should not fall into infinite loop by referencing self WITH s AS (SELECT 1 FROM s) SELECT * FROM s; @@ -10,6 +11,9 @@ SELECT * FROM r; -- WITH clause should reference the base table WITH t AS (SELECT 1 FROM t) SELECT * FROM t; +-- Table `t` referenced by a view should take precedence over the top CTE `t` +WITH t AS (SELECT 1) SELECT * FROM t3; + -- WITH clause should not allow cross reference WITH s1 AS (SELECT 1 FROM s2), s2 AS (SELECT 1 FROM s1) SELECT * FROM s1, s2; @@ -175,3 +179,4 @@ with cte as (select * from cte) select * from cte; -- Clean up DROP VIEW IF EXISTS t; DROP VIEW IF EXISTS t2; +DROP VIEW IF EXISTS t3; diff --git a/sql/core/src/test/resources/sql-tests/inputs/describe.sql b/sql/core/src/test/resources/sql-tests/inputs/describe.sql index b37931456d00c..aa6f38defdecc 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/describe.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/describe.sql @@ -21,6 +21,14 @@ ALTER TABLE t ADD PARTITION (c='Us', d=1); DESCRIBE t; +DESCRIBE EXTENDED t AS JSON; + +-- AnalysisException: describe table as json must be extended 
+DESCRIBE t AS JSON; + +-- AnalysisException: describe col as json unsupported +DESC FORMATTED t a AS JSON; + DESC default.t; DESC TABLE t; @@ -39,6 +47,8 @@ DESC EXTENDED t; DESC t PARTITION (c='Us', d=1); +DESC EXTENDED t PARTITION (c='Us', d=1) AS JSON; + DESC EXTENDED t PARTITION (c='Us', d=1); DESC FORMATTED t PARTITION (c='Us', d=1); @@ -88,6 +98,7 @@ EXPLAIN DESC EXTENDED t; EXPLAIN EXTENDED DESC t; EXPLAIN DESCRIBE t b; EXPLAIN DESCRIBE t PARTITION (c='Us', d=2); +EXPLAIN DESCRIBE EXTENDED t PARTITION (c='Us', d=2) AS JSON; -- DROP TEST TABLES/VIEWS DROP TABLE t; @@ -119,3 +130,4 @@ DESC EXTENDED e; DESC TABLE EXTENDED e; DESC FORMATTED e; + diff --git a/sql/core/src/test/resources/sql-tests/inputs/group-by.sql b/sql/core/src/test/resources/sql-tests/inputs/group-by.sql index 6dd0adbc87221..0cc1f62b0583a 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/group-by.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/group-by.sql @@ -221,6 +221,8 @@ SELECT histogram_numeric(col, 3) FROM VALUES (CAST(1 AS SMALLINT)), (CAST(2 AS SMALLINT)), (CAST(3 AS SMALLINT)) AS tab(col); SELECT histogram_numeric(col, 3) FROM VALUES (CAST(1 AS BIGINT)), (CAST(2 AS BIGINT)), (CAST(3 AS BIGINT)) AS tab(col); +SELECT histogram_numeric(col, 3) FROM VALUES + (CAST(1 AS DECIMAL(4, 2))), (CAST(2 AS DECIMAL(4, 2))), (CAST(3 AS DECIMAL(4, 2))) AS tab(col); SELECT histogram_numeric(col, 3) FROM VALUES (TIMESTAMP '2017-03-01 00:00:00'), (TIMESTAMP '2017-04-01 00:00:00'), (TIMESTAMP '2017-05-01 00:00:00') AS tab(col); SELECT histogram_numeric(col, 3) FROM VALUES (INTERVAL '100-00' YEAR TO MONTH), diff --git a/sql/core/src/test/resources/sql-tests/inputs/listagg-collations.sql b/sql/core/src/test/resources/sql-tests/inputs/listagg-collations.sql new file mode 100644 index 0000000000000..35f86183c37b3 --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/inputs/listagg-collations.sql @@ -0,0 +1,12 @@ +-- Test cases with collations +SELECT listagg(c1) WITHIN GROUP (ORDER BY c1 
COLLATE utf8_binary) FROM (VALUES ('a'), ('A'), ('b'), ('B')) AS t(c1); +SELECT listagg(c1) WITHIN GROUP (ORDER BY c1 COLLATE utf8_lcase) FROM (VALUES ('a'), ('A'), ('b'), ('B')) AS t(c1); +SELECT listagg(DISTINCT c1 COLLATE utf8_binary) FROM (VALUES ('a'), ('A'), ('b'), ('B')) AS t(c1); +SELECT listagg(DISTINCT c1 COLLATE utf8_lcase) FROM (VALUES ('a'), ('A'), ('b'), ('B')) AS t(c1); +SELECT listagg(DISTINCT c1 COLLATE utf8_lcase) WITHIN GROUP (ORDER BY c1 COLLATE utf8_lcase) FROM (VALUES ('a'), ('B'), ('b'), ('A')) AS t(c1); +SELECT listagg(DISTINCT c1 COLLATE unicode_rtrim) FROM (VALUES ('abc '), ('abc '), ('x'), ('abc')) AS t(c1); +SELECT listagg(c1) WITHIN GROUP (ORDER BY c1) FROM (VALUES ('abc '), ('abc '), ('abc\n'), ('abc'), ('x')) AS t(c1); +SELECT listagg(c1) WITHIN GROUP (ORDER BY c1 COLLATE unicode_rtrim) FROM (VALUES ('abc '), ('abc '), ('abc\n'), ('abc'), ('x')) AS t(c1); + +-- Error case with collations +SELECT listagg(DISTINCT c1 COLLATE utf8_lcase) WITHIN GROUP (ORDER BY c1 COLLATE utf8_binary) FROM (VALUES ('a'), ('b'), ('A'), ('B')) AS t(c1); \ No newline at end of file diff --git a/sql/core/src/test/resources/sql-tests/inputs/listagg.sql b/sql/core/src/test/resources/sql-tests/inputs/listagg.sql new file mode 100644 index 0000000000000..15c8cfa823e9b --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/inputs/listagg.sql @@ -0,0 +1,38 @@ +-- Create temporary views +CREATE TEMP VIEW df AS +SELECT * FROM (VALUES ('a', 'b'), ('a', 'c'), ('b', 'c'), ('b', 'd'), (NULL, NULL)) AS t(a, b); + +CREATE TEMP VIEW df2 AS +SELECT * FROM (VALUES (1, true), (2, false), (3, false)) AS t(a, b); + +-- Test cases for listagg function +SELECT listagg(b) FROM df GROUP BY a; +SELECT string_agg(b) FROM df GROUP BY a; +SELECT listagg(b, NULL) FROM df GROUP BY a; +SELECT listagg(b) FROM df WHERE 1 != 1; +SELECT listagg(b, '|') FROM df GROUP BY a; +SELECT listagg(a) FROM df; +SELECT listagg(DISTINCT a) FROM df; +SELECT listagg(a) WITHIN GROUP (ORDER BY a) FROM df; 
+SELECT listagg(a) WITHIN GROUP (ORDER BY a DESC) FROM df; +SELECT listagg(a) WITHIN GROUP (ORDER BY a DESC) OVER (PARTITION BY b) FROM df; +SELECT listagg(a) WITHIN GROUP (ORDER BY b) FROM df; +SELECT listagg(a) WITHIN GROUP (ORDER BY b DESC) FROM df; +SELECT listagg(a, '|') WITHIN GROUP (ORDER BY b DESC) FROM df; +SELECT listagg(a) WITHIN GROUP (ORDER BY b DESC, a ASC) FROM df; +SELECT listagg(a) WITHIN GROUP (ORDER BY b DESC, a DESC) FROM df; +SELECT listagg(c1) FROM (VALUES (X'DEAD'), (X'BEEF')) AS t(c1); +SELECT listagg(c1, NULL) FROM (VALUES (X'DEAD'), (X'BEEF')) AS t(c1); +SELECT listagg(c1, X'42') FROM (VALUES (X'DEAD'), (X'BEEF')) AS t(c1); +SELECT listagg(a), listagg(b, ',') FROM df2; + +-- Error cases +SELECT listagg(c1) FROM (VALUES (ARRAY('a', 'b'))) AS t(c1); +SELECT listagg(c1, ', ') FROM (VALUES (X'DEAD'), (X'BEEF')) AS t(c1); +SELECT listagg(b, a) FROM df GROUP BY a; +SELECT listagg(a) OVER (ORDER BY a) FROM df; +SELECT listagg(a) WITHIN GROUP (ORDER BY a) OVER (ORDER BY a) FROM df; +SELECT string_agg(a) WITHIN GROUP (ORDER BY a) OVER (ORDER BY a) FROM df; +SELECT listagg(DISTINCT a) OVER (ORDER BY a) FROM df; +SELECT listagg(DISTINCT a) WITHIN GROUP (ORDER BY b) FROM df; +SELECT listagg(DISTINCT a) WITHIN GROUP (ORDER BY a, b) FROM df; \ No newline at end of file diff --git a/sql/core/src/test/resources/sql-tests/inputs/pipe-operators.sql b/sql/core/src/test/resources/sql-tests/inputs/pipe-operators.sql index b9224db129ea4..0cae29d722a8b 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/pipe-operators.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/pipe-operators.sql @@ -71,6 +71,60 @@ create temporary view windowTestData as select * from values (3, 1L, 1.0D, date("2017-08-01"), timestamp_seconds(1501545600), null) AS testData(val, val_long, val_double, val_date, val_timestamp, cate); +-- FROM operators: positive tests. +---------------------------------- + +-- FromClause alone. +from t; + +-- Table alone. 
+table t; + +-- Selecting from a constant. +from t +|> select 1 as x; + +-- Selecting using a table alias. +from t as t_alias +|> select t_alias.x; + +-- Selecting using a table alias. +from t as t_alias +|> select t_alias.x as tx, t_alias.y as ty +|> where ty = 'def' +|> select tx; + +-- Selecting from multiple relations. +from t, other +|> select t.x + other.a as z; + +-- Selecting from multiple relations with join. +from t join other on (t.x = other.a) +|> select t.x + other.a as z; + +-- Selecting from lateral view. +from t lateral view explode(array(100, 101)) as ly +|> select t.x + ly as z; + +-- Selecting struct fields. +from st +|> select col.i1; + +-- Selecting struct fields using a table alias. +from st as st_alias +|> select st_alias.col.i1; + +-- Selecting from a VALUES list. +from values (0), (1) tab(col) +|> select col as x; + +-- FROM operators: negative tests. +---------------------------------- + +-- It is not possible to use the FROM operator accepting an input relation. +from t +|> from t; + -- SELECT operators: positive tests. --------------------------------------- @@ -241,6 +295,190 @@ table t table t |> extend *; +-- SET operators: positive tests. +--------------------------------- + +-- Setting with a constant. +-- The indicated column is not the last column in the table, and the SET operator will replace it +-- with the new value in its existing position. +table t +|> set x = 1; + +-- Setting with an attribute. +table t +|> set y = x; + +-- Setting with an expression. +table t +|> extend 1 as z +|> set z = x + length(y); + +-- Setting two times. +table t +|> extend 1 as z +|> extend 2 as zz +|> set z = x + length(y), zz = x + 1; + +table other +|> extend 3 as c +|> set a = b, b = c; + +-- Setting two times with a lateral reference. +table t +|> extend 1 as z +|> extend 2 as zz +|> set z = x + length(y), zz = z + 1; + +-- Setting two times in sequence. 
+table t +|> extend 1 as z +|> set z = x + length(y) +|> set z = z + 1; + +-- SET assignments with duplicate keys. This is supported, and we can update the column as we go. +table t +|> extend 1 as z +|> set z = x + length(y), z = z + 1; + +-- Setting with a struct field. +select col from st +|> extend 1 as z +|> set z = col.i1; + +-- Setting with a subquery. +table t +|> set y = (select a from other where x = a limit 1); + +-- Setting with a backquoted column name with a dot inside. +table t +|> extend 1 as `x.y.z` +|> set `x.y.z` = x + length(y); + +-- Window functions are allowed in the pipe operator SET list. +table t +|> extend 1 as z +|> set z = first_value(x) over (partition by y); + +-- Any prior table aliases remain visible after a SET operator. +values (0), (1) lhs(a) +|> inner join values (1), (2) rhs(a) using (a) +|> extend lhs.a + rhs.a as z1 +|> extend lhs.a - rhs.a as z2 +|> drop z1 +|> where z2 = 0 +|> order by lhs.a, rhs.a, z2 +|> set z2 = 4 +|> limit 2 +|> select lhs.a, rhs.a, z2; + +-- SET operators: negative tests. +--------------------------------- + +-- SET with a column name that does not exist in the input relation. +table t +|> set z = 1; + +-- SET with an alias. +table t +|> set x = 1 as z; + +-- Setting nested fields in structs is not supported. +select col from st +|> set col.i1 = 42; + +-- DROP operators: positive tests. +------------------------------------ + +-- Dropping a column. +table t +|> drop y; + +-- Dropping two times. +select 1 as x, 2 as y, 3 as z +|> drop z, y; + +-- Dropping two times in sequence. +select 1 as x, 2 as y, 3 as z +|> drop z +|> drop y; + +-- Dropping all columns in the input relation. +select x from t +|> drop x; + +-- Dropping a backquoted column name with a dot inside. +table t +|> extend 1 as `x.y.z` +|> drop `x.y.z`; + +-- DROP operators: negative tests. +---------------------------------- + +-- Dropping a column that is not present in the input relation. 
+table t +|> drop z; + +-- Attempting to drop a struct field. +table st +|> drop col.i1; + +table st +|> drop `col.i1`; + +-- Duplicate fields in the drop list. +select 1 as x, 2 as y, 3 as z +|> drop z, y, z; + +-- AS operators: positive tests. +-------------------------------- + +-- Renaming a table. +table t +|> as u +|> select u.x, u.y; + +-- Renaming an input relation that is not a table. +select 1 as x, 2 as y +|> as u +|> select u.x, u.y; + +-- Renaming as a backquoted name including a period. +table t +|> as `u.v` +|> select `u.v`.x, `u.v`.y; + +-- Renaming two times. +table t +|> as u +|> as v +|> select v.x, v.y; + +-- Filtering by referring to the table or table subquery alias. +table t +|> as u +|> where u.x = 1; + +-- AS operators: negative tests. +-------------------------------- + +-- Multiple aliases are not supported. +table t +|> as u, v; + +-- Expressions are not supported. +table t +|> as 1 + 2; + +-- Renaming as an invalid name. +table t +|> as u-v; + +table t +|> as u@v; + +table t +|> as u#######v; + -- WHERE operators: positive tests. ----------------------------------- @@ -316,6 +554,21 @@ table t |> select x, length(y) as z |> where x + length(y) < 4; +table t +|> select x, length(y) as z +|> limit 1000 +|> where x + length(y) < 4; + +table t +|> select x, length(y) as z +|> limit 1000 offset 1 +|> where x + length(y) < 4; + +table t +|> select x, length(y) as z +|> order by x, y +|> where x + length(y) < 4; + -- If the WHERE clause wants to filter rows produced by an aggregation, it is not valid to try to -- refer to the aggregate functions directly; it is necessary to use aliases instead. (select x, sum(length(y)) as sum_len from t group by x) @@ -617,10 +870,17 @@ values (0, 'abc') tab(x, y) |> union all table t; -- Union distinct with a VALUES list. -values (0, 1) tab(x, y) +-- The |> WHERE operator applies to the result of the |> UNION operator, not to the "table t" input. 
+values (2, 'xyz') tab(x, y) |> union table t |> where x = 0; +-- Union distinct with a VALUES list. +-- The |> DROP operator applies to the result of the |> UNION operator, not to the "table t" input. +values (2, 'xyz') tab(x, y) +|> union table t +|> drop x; + -- Union all with a table subquery on both the source and target sides. (select * from t) |> union all (select * from t); @@ -772,6 +1032,36 @@ select 1 as x, 2 as y select 3 as x, 4 as y |> aggregate group by 1, 2; +values (3, 4) as tab(x, y) +|> aggregate sum(y) group by 1; + +values (3, 4), (5, 4) as tab(x, y) +|> aggregate sum(y) group by 1; + +select 3 as x, 4 as y +|> aggregate sum(y) group by 1, 1; + +select 1 as `1`, 2 as `2` +|> aggregate sum(`2`) group by `1`; + +select 3 as x, 4 as y +|> aggregate sum(y) group by 2; + +select 3 as x, 4 as y, 5 as z +|> aggregate sum(y) group by 2; + +select 3 as x, 4 as y, 5 as z +|> aggregate sum(y) group by 3; + +select 3 as x, 4 as y, 5 as z +|> aggregate sum(y) group by 2, 3; + +select 3 as x, 4 as y, 5 as z +|> aggregate sum(y) group by 1, 2, 3; + +select 3 as x, 4 as y, 5 as z +|> aggregate sum(y) group by x, 2, 3; + -- Basic table aggregation. table t |> aggregate sum(x); @@ -961,6 +1251,502 @@ table windowTestData |> select cate, val, sum(val) over w as sum_val window w as (order by val); +-- Exercise SQL compilation using a subset of TPC-DS table schemas. 
+------------------------------------------------------------------- + +-- Q1 +with customer_total_return as +(select + sr_customer_sk as ctr_customer_sk, + sr_store_sk as ctr_store_sk, + sum(sr_return_amt) as ctr_total_return + from store_returns, date_dim + where sr_returned_date_sk = d_date_sk and d_year = 2000 + group by sr_customer_sk, sr_store_sk) +select c_customer_id +from customer_total_return ctr1, store, customer +where ctr1.ctr_total_return > + (select avg(ctr_total_return) * 1.2 + from customer_total_return ctr2 + where ctr1.ctr_store_sk = ctr2.ctr_store_sk) + and s_store_sk = ctr1.ctr_store_sk + and s_state = 'tn' + and ctr1.ctr_customer_sk = c_customer_sk +order by c_customer_id +limit 100; + +with customer_total_return as + (from store_returns + |> join date_dim + |> where sr_returned_date_sk = d_date_sk and d_year = 2000 + |> aggregate sum(sr_return_amt) as ctr_total_return + group by sr_customer_sk as ctr_customer_sk, sr_store_sk as ctr_store_sk) +from customer_total_return ctr1 +|> join store +|> join customer +|> where ctr1.ctr_total_return > + (table customer_total_return + |> as ctr2 + |> where ctr1.ctr_store_sk = ctr2.ctr_store_sk + |> aggregate avg(ctr_total_return) * 1.2) + and s_store_sk = ctr1.ctr_store_sk + and s_state = 'tn' + and ctr1.ctr_customer_sk = c_customer_sk +|> order by c_customer_id +|> limit 100 +|> select c_customer_id; + +-- Q2 +with wscs as +( select + sold_date_sk, + sales_price + from (select + ws_sold_date_sk sold_date_sk, + ws_ext_sales_price sales_price + from web_sales) x + union all + (select + cs_sold_date_sk sold_date_sk, + cs_ext_sales_price sales_price + from catalog_sales)), + wswscs as + ( select + d_week_seq, + sum(case when (d_day_name = 'sunday') + then sales_price + else null end) + sun_sales, + sum(case when (d_day_name = 'monday') + then sales_price + else null end) + mon_sales, + sum(case when (d_day_name = 'tuesday') + then sales_price + else null end) + tue_sales, + sum(case when (d_day_name = 
'wednesday') + then sales_price + else null end) + wed_sales, + sum(case when (d_day_name = 'thursday') + then sales_price + else null end) + thu_sales, + sum(case when (d_day_name = 'friday') + then sales_price + else null end) + fri_sales, + sum(case when (d_day_name = 'saturday') + then sales_price + else null end) + sat_sales + from wscs, date_dim + where d_date_sk = sold_date_sk + group by d_week_seq) +select + d_week_seq1, + round(sun_sales1 / sun_sales2, 2), + round(mon_sales1 / mon_sales2, 2), + round(tue_sales1 / tue_sales2, 2), + round(wed_sales1 / wed_sales2, 2), + round(thu_sales1 / thu_sales2, 2), + round(fri_sales1 / fri_sales2, 2), + round(sat_sales1 / sat_sales2, 2) +from + (select + wswscs.d_week_seq d_week_seq1, + sun_sales sun_sales1, + mon_sales mon_sales1, + tue_sales tue_sales1, + wed_sales wed_sales1, + thu_sales thu_sales1, + fri_sales fri_sales1, + sat_sales sat_sales1 + from wswscs, date_dim + where date_dim.d_week_seq = wswscs.d_week_seq and d_year = 2001) y, + (select + wswscs.d_week_seq d_week_seq2, + sun_sales sun_sales2, + mon_sales mon_sales2, + tue_sales tue_sales2, + wed_sales wed_sales2, + thu_sales thu_sales2, + fri_sales fri_sales2, + sat_sales sat_sales2 + from wswscs, date_dim + where date_dim.d_week_seq = wswscs.d_week_seq and d_year = 2001 + 1) z +where d_week_seq1 = d_week_seq2 - 53 +order by d_week_seq1; + +with wscs as + (table web_sales + |> select + ws_sold_date_sk sold_date_sk, + ws_ext_sales_price sales_price + |> as x + |> union all ( + table catalog_sales + |> select + cs_sold_date_sk sold_date_sk, + cs_ext_sales_price sales_price) + |> select + sold_date_sk, + sales_price), +wswscs as + (table wscs + |> join date_dim + |> where d_date_sk = sold_date_sk + |> aggregate + sum(case when (d_day_name = 'sunday') + then sales_price + else null end) + sun_sales, + sum(case when (d_day_name = 'monday') + then sales_price + else null end) + mon_sales, + sum(case when (d_day_name = 'tuesday') + then sales_price + else null 
end) + tue_sales, + sum(case when (d_day_name = 'wednesday') + then sales_price + else null end) + wed_sales, + sum(case when (d_day_name = 'thursday') + then sales_price + else null end) + thu_sales, + sum(case when (d_day_name = 'friday') + then sales_price + else null end) + fri_sales, + sum(case when (d_day_name = 'saturday') + then sales_price + else null end) + sat_sales + group by d_week_seq) +table wswscs +|> join date_dim +|> where date_dim.d_week_seq = wswscs.d_week_seq AND d_year = 2001 +|> select + wswscs.d_week_seq d_week_seq1, + sun_sales sun_sales1, + mon_sales mon_sales1, + tue_sales tue_sales1, + wed_sales wed_sales1, + thu_sales thu_sales1, + fri_sales fri_sales1, + sat_sales sat_sales1 +|> as y +|> join ( + table wswscs + |> join date_dim + |> where date_dim.d_week_seq = wswscs.d_week_seq AND d_year = 2001 + 1 + |> select + wswscs.d_week_seq d_week_seq2, + sun_sales sun_sales2, + mon_sales mon_sales2, + tue_sales tue_sales2, + wed_sales wed_sales2, + thu_sales thu_sales2, + fri_sales fri_sales2, + sat_sales sat_sales2 + |> as z) +|> where d_week_seq1 = d_week_seq2 - 53 +|> order by d_week_seq1 +|> select + d_week_seq1, + round(sun_sales1 / sun_sales2, 2), + round(mon_sales1 / mon_sales2, 2), + round(tue_sales1 / tue_sales2, 2), + round(wed_sales1 / wed_sales2, 2), + round(thu_sales1 / thu_sales2, 2), + round(fri_sales1 / fri_sales2, 2), + round(sat_sales1 / sat_sales2, 2); + +-- Q3 +select + dt.d_year, + item.i_brand_id brand_id, + item.i_brand brand, + sum(ss_ext_sales_price) sum_agg +from date_dim dt, store_sales, item +where dt.d_date_sk = store_sales.ss_sold_date_sk + and store_sales.ss_item_sk = item.i_item_sk + and item.i_manufact_id = 128 + and dt.d_moy = 11 +group by dt.d_year, item.i_brand, item.i_brand_id +order by dt.d_year, sum_agg desc, brand_id +limit 100; + +table date_dim +|> as dt +|> join store_sales +|> join item +|> where dt.d_date_sk = store_sales.ss_sold_date_sk + and store_sales.ss_item_sk = item.i_item_sk + and 
item.i_manufact_id = 128 + and dt.d_moy = 11 +|> aggregate sum(ss_ext_sales_price) sum_agg + group by dt.d_year d_year, item.i_brand_id brand_id, item.i_brand brand +|> order by d_year, sum_agg desc, brand_id +|> limit 100; + +-- Q12 +select + i_item_desc, + i_category, + i_class, + i_current_price, + sum(ws_ext_sales_price) as itemrevenue, + sum(ws_ext_sales_price) * 100 / sum(sum(ws_ext_sales_price)) + over + (partition by i_class) as revenueratio +from + web_sales, item, date_dim +where + ws_item_sk = i_item_sk + and i_category in ('sports', 'books', 'home') + and ws_sold_date_sk = d_date_sk + and d_date between cast('1999-02-22' as date) + and (cast('1999-02-22' as date) + interval 30 days) +group by + i_item_id, i_item_desc, i_category, i_class, i_current_price +order by + i_category, i_class, i_item_id, i_item_desc, revenueratio +limit 100; + +table web_sales +|> join item +|> join date_dim +|> where ws_item_sk = i_item_sk + and i_category in ('sports', 'books', 'home') + and ws_sold_date_sk = d_date_sk + and d_date between cast('1999-02-22' as date) + and (cast('1999-02-22' as date) + interval 30 days) +|> aggregate sum(ws_ext_sales_price) AS itemrevenue + group by i_item_id, i_item_desc, i_category, i_class, i_current_price +|> extend + itemrevenue * 100 / sum(itemrevenue) + over (partition by i_class) as revenueratio +|> order by i_category, i_class, i_item_id, i_item_desc, revenueratio +|> select i_item_desc, i_category, i_class, i_current_price, itemrevenue, revenueratio +|> limit 100; + +-- Q44 +select + asceding.rnk, + i1.i_product_name best_performing, + i2.i_product_name worst_performing +from (select * +from (select + item_sk, + rank() + over ( + order by rank_col asc) rnk +from (select + ss_item_sk item_sk, + avg(ss_net_profit) rank_col +from store_sales ss1 +where ss_store_sk = 4 +group by ss_item_sk +having avg(ss_net_profit) > 0.9 * (select avg(ss_net_profit) rank_col +from store_sales +where ss_store_sk = 4 + and ss_addr_sk is null +group by 
ss_store_sk)) v1) v11 +where rnk < 11) asceding, + (select * + from (select + item_sk, + rank() + over ( + order by rank_col desc) rnk + from (select + ss_item_sk item_sk, + avg(ss_net_profit) rank_col + from store_sales ss1 + where ss_store_sk = 4 + group by ss_item_sk + having avg(ss_net_profit) > 0.9 * (select avg(ss_net_profit) rank_col + from store_sales + where ss_store_sk = 4 + and ss_addr_sk is null + group by ss_store_sk)) v2) v21 + where rnk < 11) descending, + item i1, item i2 +where asceding.rnk = descending.rnk + and i1.i_item_sk = asceding.item_sk + and i2.i_item_sk = descending.item_sk +order by asceding.rnk +limit 100; + +from store_sales ss1 +|> where ss_store_sk = 4 +|> aggregate avg(ss_net_profit) rank_col + group by ss_item_sk as item_sk +|> where rank_col > 0.9 * ( + from store_sales + |> where ss_store_sk = 4 + and ss_addr_sk is null + |> aggregate avg(ss_net_profit) rank_col + group by ss_store_sk + |> select rank_col) +|> as v1 +|> select + item_sk, + rank() over ( + order by rank_col asc) rnk +|> as v11 +|> where rnk < 11 +|> as asceding +|> join ( + from store_sales ss1 + |> where ss_store_sk = 4 + |> aggregate avg(ss_net_profit) rank_col + group by ss_item_sk as item_sk + |> where rank_col > 0.9 * ( + table store_sales + |> where ss_store_sk = 4 + and ss_addr_sk is null + |> aggregate avg(ss_net_profit) rank_col + group by ss_store_sk + |> select rank_col) + |> as v2 + |> select + item_sk, + rank() over ( + order by rank_col asc) rnk + |> as v21 + |> where rnk < 11) descending +|> join item i1 +|> join item i2 +|> where asceding.rnk = descending.rnk + and i1.i_item_sk = asceding.item_sk + and i2.i_item_sk = descending.item_sk +|> order by asceding.rnk +|> select + asceding.rnk, + i1.i_product_name best_performing, + i2.i_product_name worst_performing; + +-- Q51 +with web_v1 as ( + select + ws_item_sk item_sk, + d_date, + sum(sum(ws_sales_price)) + over (partition by ws_item_sk + order by d_date + rows between unbounded preceding and 
current row) cume_sales + from web_sales, date_dim + where ws_sold_date_sk = d_date_sk + and d_month_seq between 1200 and 1200 + 11 + and ws_item_sk is not null + group by ws_item_sk, d_date), + store_v1 as ( + select + ss_item_sk item_sk, + d_date, + sum(sum(ss_sales_price)) + over (partition by ss_item_sk + order by d_date + rows between unbounded preceding and current row) cume_sales + from store_sales, date_dim + where ss_sold_date_sk = d_date_sk + and d_month_seq between 1200 and 1200 + 11 + and ss_item_sk is not null + group by ss_item_sk, d_date) +select * +from (select + item_sk, + d_date, + web_sales, + store_sales, + max(web_sales) + over (partition by item_sk + order by d_date + rows between unbounded preceding and current row) web_cumulative, + max(store_sales) + over (partition by item_sk + order by d_date + rows between unbounded preceding and current row) store_cumulative +from (select + case when web.item_sk is not null + then web.item_sk + else store.item_sk end item_sk, + case when web.d_date is not null + then web.d_date + else store.d_date end d_date, + web.cume_sales web_sales, + store.cume_sales store_sales +from web_v1 web full outer join store_v1 store on (web.item_sk = store.item_sk + and web.d_date = store.d_date) + ) x) y +where web_cumulative > store_cumulative +order by item_sk, d_date +limit 100; + +with web_v1 as ( + table web_sales + |> join date_dim + |> where ws_sold_date_sk = d_date_sk + and d_month_seq between 1200 and 1200 + 11 + and ws_item_sk is not null + |> aggregate sum(ws_sales_price) as sum_ws_sales_price + group by ws_item_sk as item_sk, d_date + |> extend sum(sum_ws_sales_price) + over (partition by item_sk + order by d_date + rows between unbounded preceding and current row) + as cume_sales), +store_v1 as ( + table store_sales + |> join date_dim + |> where ss_sold_date_sk = d_date_sk + and d_month_seq between 1200 and 1200 + 11 + and ss_item_sk is not null + |> aggregate sum(ss_sales_price) as sum_ss_sales_price + 
group by ss_item_sk as item_sk, d_date + |> extend sum(sum_ss_sales_price) + over (partition by item_sk + order by d_date + rows between unbounded preceding and current row) + as cume_sales) +table web_v1 +|> as web +|> full outer join store_v1 store + on (web.item_sk = store.item_sk and web.d_date = store.d_date) +|> select + case when web.item_sk is not null + then web.item_sk + else store.item_sk end item_sk, + case when web.d_date is not null + then web.d_date + else store.d_date end d_date, + web.cume_sales web_sales, + store.cume_sales store_sales +|> as x +|> select + item_sk, + d_date, + web_sales, + store_sales, + max(web_sales) + over (partition by item_sk + order by d_date + rows between unbounded preceding and current row) web_cumulative, + max(store_sales) + over (partition by item_sk + order by d_date + rows between unbounded preceding and current row) store_cumulative +|> as y +|> where web_cumulative > store_cumulative +|> order by item_sk, d_date +|> limit 100; + -- Cleanup. 
----------- drop table t; diff --git a/sql/core/src/test/resources/sql-tests/inputs/random.sql b/sql/core/src/test/resources/sql-tests/inputs/random.sql index a71b0293295fc..95be99595cc8c 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/random.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/random.sql @@ -22,30 +22,59 @@ SELECT uniform(0, 1, 0) AS result; SELECT uniform(0, 10, 0) AS result; SELECT uniform(0L, 10L, 0) AS result; SELECT uniform(0, 10L, 0) AS result; +SELECT uniform(0, cast(10 as tinyint), 0) AS result; +SELECT uniform(0, cast(10 as smallint), 0) AS result; SELECT uniform(0, 10S, 0) AS result; SELECT uniform(10, 20, 0) AS result; SELECT uniform(10.0F, 20.0F, 0) AS result; +SELECT uniform(cast(10 as decimal(10, 3)), cast(20 as decimal(10, 3)), 0) AS result; +SELECT uniform(cast(10 as decimal(10, 3)), cast(20 as decimal(11, 4)), 0) AS result; +SELECT uniform(10, cast(20 as decimal(10, 3)), 0) AS result; +SELECT uniform(cast(10 as decimal(10, 3)), 20, 0) AS result; SELECT uniform(10.0D, 20.0D, CAST(3 / 7 AS LONG)) AS result; SELECT uniform(10, 20.0F, 0) AS result; SELECT uniform(10, 20, 0) AS result FROM VALUES (0), (1), (2) tab(col); SELECT uniform(10, 20.0F) IS NOT NULL AS result; --- Negative test cases for the uniform random number generator. +SELECT uniform(-10L, 10L, 0) AS result; +SELECT uniform(-20L, -10L, 0) AS result; +SELECT uniform(-20L, -10L, -10) AS result; SELECT uniform(NULL, 1, 0) AS result; +SELECT uniform(cast(NULL AS int), 1, 0) AS result; +SELECT uniform(cast(NULL AS float), 1, 0) AS result; SELECT uniform(0, NULL, 0) AS result; +SELECT uniform(0, cast(NULL AS int), 0) AS result; +SELECT uniform(0, cast(NULL AS float), 0) AS result; SELECT uniform(0, 1, NULL) AS result; +SELECT uniform(NULL, NULL, 0) AS result; +SELECT uniform(NULL, NULL, NULL) AS result; +-- Negative test cases for the uniform random number generator. 
+SELECT uniform(0, 1, cast(NULL as int)) AS result; +SELECT uniform(0, 1, cast(NULL as float)) AS result; SELECT uniform(10, 20, col) AS result FROM VALUES (0), (1), (2) tab(col); SELECT uniform(col, 10, 0) AS result FROM VALUES (0), (1), (2) tab(col); SELECT uniform(10) AS result; SELECT uniform(10, 20, 30, 40) AS result; +SELECT uniform(10.0F, 20.0F, 0.0F) AS result; +SELECT uniform(10.0F, 20.0F, 0.0D) AS result; +SELECT uniform(cast(10 as decimal(10, 3)), cast(20 as decimal(10, 3)), cast(0 as decimal(10, 3))); +SELECT uniform('abc', 10, 0) AS result; +SELECT uniform(0, 'def', 0) AS result; +SELECT uniform(0, 10, 'ghi') AS result; -- The randstr random string generation function supports generating random strings within a --- specified length. We use a seed of zero for these queries to keep tests deterministic. +-- specified length. We use a seed of zero for most queries to keep tests deterministic. SELECT randstr(1, 0) AS result; SELECT randstr(5, 0) AS result; SELECT randstr(10, 0) AS result; SELECT randstr(10S, 0) AS result; +SELECT randstr(CAST(10 AS TINYINT), 0) AS result; +SELECT randstr(CAST(10 AS BIGINT), 0) AS result; +SELECT randstr(1.0F, 0) AS result; +SELECT randstr(1.0D, 0) AS result; +SELECT randstr(cast(1 AS DECIMAL(10, 2)), 0) AS result; SELECT randstr(10, 0) AS result FROM VALUES (0), (1), (2) tab(col); SELECT randstr(10) IS NOT NULL AS result; +SELECT randstr(1, -1) AS result; -- Negative test cases for the randstr random number generator. 
SELECT randstr(10L, 0) AS result; SELECT randstr(10.0F, 0) AS result; @@ -55,3 +84,6 @@ SELECT randstr(0, NULL) AS result; SELECT randstr(col, 0) AS result FROM VALUES (0), (1), (2) tab(col); SELECT randstr(10, col) AS result FROM VALUES (0), (1), (2) tab(col); SELECT randstr(10, 0, 1) AS result; +SELECT randstr(-1, 0) AS result; +SELECT randstr(10, "a") AS result FROM VALUES (0) tab(a); +SELECT randstr(10, 1.5) AS result FROM VALUES (0) tab(a); diff --git a/sql/core/src/test/resources/sql-tests/inputs/sql-on-files.sql b/sql/core/src/test/resources/sql-tests/inputs/sql-on-files.sql index 8a00e4400e6b0..c3a16ca577ee9 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/sql-on-files.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/sql-on-files.sql @@ -28,3 +28,5 @@ SELECT * FROM json.`${spark.sql.warehouse.dir}/sql_on_files.db/test_json`; DROP TABLE sql_on_files.test_json; DROP DATABASE sql_on_files; + +SELECT * FROM json.`https://raw.githubusercontent.com/apache/spark/refs/heads/master/examples/src/main/resources/employees.json`; diff --git a/sql/core/src/test/resources/sql-tests/inputs/sql-udf.sql b/sql/core/src/test/resources/sql-tests/inputs/sql-udf.sql new file mode 100644 index 0000000000000..34cb41d726766 --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/inputs/sql-udf.sql @@ -0,0 +1,122 @@ +-- test cases for SQL User Defined Functions + +-- 1. CREATE FUNCTION +-- 1.1 Parameter +-- 1.1.a A scalar function with various numbers of parameter +-- Expect success +CREATE FUNCTION foo1a0() RETURNS INT RETURN 1; +-- Expect: 1 +SELECT foo1a0(); +-- Expect failure +SELECT foo1a0(1); + +CREATE FUNCTION foo1a1(a INT) RETURNS INT RETURN 1; +-- Expect: 1 +SELECT foo1a1(1); +-- Expect failure +SELECT foo1a1(1, 2); + +CREATE FUNCTION foo1a2(a INT, b INT, c INT, d INT) RETURNS INT RETURN 1; +-- Expect: 1 +SELECT foo1a2(1, 2, 3, 4); + +------------------------------- +-- 2. 
Scalar SQL UDF +-- 2.1 deterministic simple expressions +CREATE FUNCTION foo2_1a(a INT) RETURNS INT RETURN a; +SELECT foo2_1a(5); + +CREATE FUNCTION foo2_1b(a INT, b INT) RETURNS INT RETURN a + b; +SELECT foo2_1b(5, 6); + +CREATE FUNCTION foo2_1c(a INT, b INT) RETURNS INT RETURN 10 * (a + b) + 100 * (a -b); +SELECT foo2_1c(5, 6); + +CREATE FUNCTION foo2_1d(a INT, b INT) RETURNS INT RETURN ABS(a) - LENGTH(CAST(b AS VARCHAR(10))); +SELECT foo2_1d(-5, 6); + +-- 2.2 deterministic complex expression with subqueries +-- 2.2.1 Nested Scalar subqueries +CREATE FUNCTION foo2_2a(a INT) RETURNS INT RETURN SELECT a; +SELECT foo2_2a(5); + +CREATE FUNCTION foo2_2b(a INT) RETURNS INT RETURN 1 + (SELECT a); +SELECT foo2_2b(5); + +-- Expect error: deep correlation is not yet supported +CREATE FUNCTION foo2_2c(a INT) RETURNS INT RETURN 1 + (SELECT (SELECT a)); +-- SELECT foo2_2c(5); + +-- Expect error: deep correlation is not yet supported +CREATE FUNCTION foo2_2d(a INT) RETURNS INT RETURN 1 + (SELECT (SELECT (SELECT (SELECT a)))); +-- SELECT foo2_2d(5); + +-- 2.2.2 Set operations +-- Expect error: correlated scalar subquery must be aggregated. +CREATE FUNCTION foo2_2e(a INT) RETURNS INT RETURN +SELECT a FROM (VALUES 1) AS V(c1) WHERE c1 = 2 +UNION ALL +SELECT a + 1 FROM (VALUES 1) AS V(c1); +-- SELECT foo2_2e(5); + +-- Expect error: correlated scalar subquery must be aggregated. +CREATE FUNCTION foo2_2f(a INT) RETURNS INT RETURN +SELECT a FROM (VALUES 1) AS V(c1) +EXCEPT +SELECT a + 1 FROM (VALUES 1) AS V(a); +-- SELECT foo2_2f(5); + +-- Expect error: correlated scalar subquery must be aggregated. +CREATE FUNCTION foo2_2g(a INT) RETURNS INT RETURN +SELECT a FROM (VALUES 1) AS V(c1) +INTERSECT +SELECT a FROM (VALUES 1) AS V(a); +-- SELECT foo2_2g(5); + +-- Prepare by dropping views or tables if they already exist. 
+DROP TABLE IF EXISTS t1; +DROP TABLE IF EXISTS t2; +DROP TABLE IF EXISTS ts; +DROP TABLE IF EXISTS tm; +DROP TABLE IF EXISTS ta; +DROP TABLE IF EXISTS V1; +DROP TABLE IF EXISTS V2; +DROP VIEW IF EXISTS t1; +DROP VIEW IF EXISTS t2; +DROP VIEW IF EXISTS ts; +DROP VIEW IF EXISTS tm; +DROP VIEW IF EXISTS ta; +DROP VIEW IF EXISTS V1; +DROP VIEW IF EXISTS V2; + +-- 2.3 Calling Scalar UDF from various places +CREATE FUNCTION foo2_3(a INT, b INT) RETURNS INT RETURN a + b; +CREATE VIEW V1(c1, c2) AS VALUES (1, 2), (3, 4), (5, 6); +CREATE VIEW V2(c1, c2) AS VALUES (-1, -2), (-3, -4), (-5, -6); + +-- 2.3.1 Multiple times in the select list +SELECT foo2_3(c1, c2), foo2_3(c2, 1), foo2_3(c1, c2) - foo2_3(c2, c1 - 1) FROM V1 ORDER BY 1, 2, 3; + +-- 2.3.2 In the WHERE clause +SELECT * FROM V1 WHERE foo2_3(c1, 0) = c1 AND foo2_3(c1, c2) < 8; + +-- 2.3.3 Different places around an aggregate +SELECT foo2_3(SUM(c1), SUM(c2)), SUM(c1) + SUM(c2), SUM(foo2_3(c1, c2) + foo2_3(c2, c1) - foo2_3(c2, c1)) +FROM V1; + +-- 2.4 Scalar UDF with complex one row relation subquery +-- 2.4.1 higher order functions +CREATE FUNCTION foo2_4a(a ARRAY) RETURNS STRING RETURN +SELECT array_sort(a, (i, j) -> rank[i] - rank[j])[0] FROM (SELECT MAP('a', 1, 'b', 2) rank); + +SELECT foo2_4a(ARRAY('a', 'b')); + +-- 2.4.2 built-in functions +CREATE FUNCTION foo2_4b(m MAP, k STRING) RETURNS STRING RETURN +SELECT v || ' ' || v FROM (SELECT upper(m[k]) AS v); + +SELECT foo2_4b(map('a', 'hello', 'b', 'world'), 'a'); + +-- Clean up +DROP VIEW V2; +DROP VIEW V1; diff --git a/sql/core/src/test/resources/sql-tests/inputs/table-aliases.sql b/sql/core/src/test/resources/sql-tests/inputs/table-aliases.sql index 5b98f056ebc5a..d2aef1f83863b 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/table-aliases.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/table-aliases.sql @@ -33,3 +33,8 @@ CREATE OR REPLACE TEMPORARY VIEW src2 AS SELECT * FROM VALUES (2, 1.0), (3, 3.2) SELECT * FROM (src1 s1 INNER JOIN src2 s2 ON 
s1.id = s2.id) dst(a, b, c, d); SELECT dst.* FROM (src1 s1 INNER JOIN src2 s2 ON s1.id = s2.id) dst(a, b, c, d); + +-- Negative examples after aliasing +SELECT src1.* FROM src1 a ORDER BY id LIMIT 1; + +SELECT src1.id FROM (SELECT * FROM src1 ORDER BY id LIMIT 1) a; diff --git a/sql/core/src/test/resources/sql-tests/results/array.sql.out b/sql/core/src/test/resources/sql-tests/results/array.sql.out index 70de8585ef782..0c141c08d436f 100644 --- a/sql/core/src/test/resources/sql-tests/results/array.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/array.sql.out @@ -185,7 +185,6 @@ org.apache.spark.SparkArrayIndexOutOfBoundsException "condition" : "INVALID_ARRAY_INDEX_IN_ELEMENT_AT", "sqlState" : "22003", "messageParameters" : { - "ansiConfig" : "\"spark.sql.ansi.enabled\"", "arraySize" : "3", "indexValue" : "5" }, @@ -209,7 +208,6 @@ org.apache.spark.SparkArrayIndexOutOfBoundsException "condition" : "INVALID_ARRAY_INDEX_IN_ELEMENT_AT", "sqlState" : "22003", "messageParameters" : { - "ansiConfig" : "\"spark.sql.ansi.enabled\"", "arraySize" : "3", "indexValue" : "-5" }, @@ -252,7 +250,6 @@ org.apache.spark.SparkArrayIndexOutOfBoundsException "condition" : "INVALID_ARRAY_INDEX", "sqlState" : "22003", "messageParameters" : { - "ansiConfig" : "\"spark.sql.ansi.enabled\"", "arraySize" : "2", "indexValue" : "4" }, @@ -276,7 +273,6 @@ org.apache.spark.SparkArrayIndexOutOfBoundsException "condition" : "INVALID_ARRAY_INDEX", "sqlState" : "22003", "messageParameters" : { - "ansiConfig" : "\"spark.sql.ansi.enabled\"", "arraySize" : "2", "indexValue" : "0" }, @@ -300,7 +296,6 @@ org.apache.spark.SparkArrayIndexOutOfBoundsException "condition" : "INVALID_ARRAY_INDEX", "sqlState" : "22003", "messageParameters" : { - "ansiConfig" : "\"spark.sql.ansi.enabled\"", "arraySize" : "2", "indexValue" : "-1" }, @@ -356,7 +351,6 @@ org.apache.spark.SparkArrayIndexOutOfBoundsException "condition" : "INVALID_ARRAY_INDEX", "sqlState" : "22003", "messageParameters" : { - "ansiConfig" : 
"\"spark.sql.ansi.enabled\"", "arraySize" : "3", "indexValue" : "5" }, @@ -380,7 +374,6 @@ org.apache.spark.SparkArrayIndexOutOfBoundsException "condition" : "INVALID_ARRAY_INDEX", "sqlState" : "22003", "messageParameters" : { - "ansiConfig" : "\"spark.sql.ansi.enabled\"", "arraySize" : "3", "indexValue" : "-1" }, diff --git a/sql/core/src/test/resources/sql-tests/results/charvarchar.sql.out b/sql/core/src/test/resources/sql-tests/results/charvarchar.sql.out index ef084a8ce47d1..93ff8dd4b320b 100644 --- a/sql/core/src/test/resources/sql-tests/results/charvarchar.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/charvarchar.sql.out @@ -195,7 +195,7 @@ View Text select * from char_tbl View Original Text select * from char_tbl View Schema Mode COMPENSATION View Catalog and Namespace spark_catalog.default -View Query Output Columns [c, v] +View Query Output Columns [`c`, `v`] -- !query @@ -366,7 +366,7 @@ View Text select * from char_tbl2 View Original Text select * from char_tbl2 View Schema Mode COMPENSATION View Catalog and Namespace spark_catalog.default -View Query Output Columns [c, v] +View Query Output Columns [`c`, `v`] -- !query @@ -427,7 +427,7 @@ View Text select * from char_tbl2 View Original Text select * from char_tbl2 View Schema Mode COMPENSATION View Catalog and Namespace spark_catalog.default -View Query Output Columns [c, v] +View Query Output Columns [`c`, `v`] Table Properties [yes=no] @@ -488,7 +488,7 @@ View Text select * from char_tbl2 View Original Text select * from char_tbl2 View Schema Mode COMPENSATION View Catalog and Namespace spark_catalog.default -View Query Output Columns [c, v] +View Query Output Columns [`c`, `v`] -- !query diff --git a/sql/core/src/test/resources/sql-tests/results/collations.sql.out b/sql/core/src/test/resources/sql-tests/results/collations.sql.out index 245e1dd0b56de..8c150b1de03e9 100644 --- a/sql/core/src/test/resources/sql-tests/results/collations.sql.out +++ 
b/sql/core/src/test/resources/sql-tests/results/collations.sql.out @@ -479,7 +479,7 @@ struct +struct<(collate(a, unicode) < 'A' collate UNICODE):boolean> -- !query output true @@ -487,7 +487,7 @@ true -- !query select 'a' collate unicode_ci = 'A' -- !query schema -struct<(collate(a, unicode_ci) = A):boolean> +struct<(collate(a, unicode_ci) = 'A' collate UNICODE_CI):boolean> -- !query output true @@ -495,7 +495,7 @@ true -- !query select 'a' collate unicode_ai = 'å' -- !query schema -struct<(collate(a, unicode_ai) = å):boolean> +struct<(collate(a, unicode_ai) = 'å' collate UNICODE_AI):boolean> -- !query output true @@ -503,7 +503,7 @@ true -- !query select 'a' collate unicode_ci_ai = 'Å' -- !query schema -struct<(collate(a, unicode_ci_ai) = Å):boolean> +struct<(collate(a, unicode_ci_ai) = 'Å' collate UNICODE_CI_AI):boolean> -- !query output true @@ -511,7 +511,7 @@ true -- !query select 'a' collate en < 'A' -- !query schema -struct<(collate(a, en) < A):boolean> +struct<(collate(a, en) < 'A' collate en):boolean> -- !query output true @@ -519,7 +519,7 @@ true -- !query select 'a' collate en_ci = 'A' -- !query schema -struct<(collate(a, en_ci) = A):boolean> +struct<(collate(a, en_ci) = 'A' collate en_CI):boolean> -- !query output true @@ -527,7 +527,7 @@ true -- !query select 'a' collate en_ai = 'å' -- !query schema -struct<(collate(a, en_ai) = å):boolean> +struct<(collate(a, en_ai) = 'å' collate en_AI):boolean> -- !query output true @@ -535,7 +535,7 @@ true -- !query select 'a' collate en_ci_ai = 'Å' -- !query schema -struct<(collate(a, en_ci_ai) = Å):boolean> +struct<(collate(a, en_ci_ai) = 'Å' collate en_CI_AI):boolean> -- !query output true @@ -543,7 +543,7 @@ true -- !query select 'Kypper' collate sv < 'Köpfe' -- !query schema -struct<(collate(Kypper, sv) < Köpfe):boolean> +struct<(collate(Kypper, sv) < 'Köpfe' collate sv):boolean> -- !query output true @@ -551,7 +551,7 @@ true -- !query select 'Kypper' collate de > 'Köpfe' -- !query schema 
-struct<(collate(Kypper, de) > Köpfe):boolean> +struct<(collate(Kypper, de) > 'Köpfe' collate de):boolean> -- !query output true @@ -559,7 +559,7 @@ true -- !query select 'I' collate tr_ci = 'ı' -- !query schema -struct<(collate(I, tr_ci) = ı):boolean> +struct<(collate(I, tr_ci) = 'ı' collate tr_CI):boolean> -- !query output true @@ -1109,7 +1109,7 @@ kitten -- !query select elt(1, utf8_binary, 'word'), elt(1, utf8_lcase, 'word') from t5 -- !query schema -struct +struct -- !query output Hello, world! Nice day. Hello, world! Nice day. Something else. Nothing here. Something else. Nothing here. @@ -2492,7 +2492,7 @@ org.apache.spark.sql.catalyst.ExtendedAnalysisException "inputType" : "\"STRING COLLATE UNICODE_AI\"", "paramIndex" : "first", "requiredType" : "\"STRING\"", - "sqlExpr" : "\"replace(collate(utf8_binary, unicode_ai), collate(utf8_lcase, unicode_ai), abc)\"" + "sqlExpr" : "\"replace(collate(utf8_binary, unicode_ai), collate(utf8_lcase, unicode_ai), 'abc' collate UNICODE_AI)\"" }, "queryContext" : [ { "objectType" : "", @@ -3342,7 +3342,7 @@ ksitTing -- !query select overlay(utf8_binary, 'a', 2), overlay(utf8_lcase, 'a', 2) from t5 -- !query schema -struct +struct -- !query output Hallo, world! Nice day. Hallo, world! Nice day. 
Saark SaL @@ -3583,6 +3583,28 @@ struct +-- !query output +23 23 +29 29 +3 3 +3 3 +3 4 +3 4 +4 3 +4 4 +5 3 +6 7 +7 7 +8 1 +8 24 +8 8 +8 8 + + -- !query select luhn_check(num) from t9 -- !query schema @@ -3776,6 +3798,28 @@ true true true true +-- !query +select is_valid_utf8(utf8_binary collate utf8_lcase_rtrim), is_valid_utf8(utf8_lcase collate utf8_binary_rtrim) from t5 +-- !query schema +struct +-- !query output +true true +true true +true true +true true +true true +true true +true true +true true +true true +true true +true true +true true +true true +true true +true true + + -- !query select make_valid_utf8(utf8_binary), make_valid_utf8(utf8_lcase) from t5 -- !query schema @@ -3820,6 +3864,28 @@ kitten sitTing İo İo +-- !query +select make_valid_utf8(utf8_binary collate utf8_lcase_rtrim), make_valid_utf8(utf8_lcase collate utf8_binary_rtrim) from t5 +-- !query schema +struct +-- !query output +Hello, world! Nice day. Hello, world! Nice day. +Something else. Nothing here. Something else. Nothing here. +Spark SQL +aaAaAAaA aaAaAAaA +aaAaAAaA aaAaaAaA +aaAaAAaA aaAaaAaAaaAaaAaAaaAaaAaA +abc abc +abcdcba aBcDCbA +bbAbAAbA a +efd2 efd2 +kitten sitTing +İo i̇o +İo İo +İo İo +İo İo + + -- !query select validate_utf8(utf8_binary), validate_utf8(utf8_lcase) from t5 -- !query schema @@ -3864,6 +3930,28 @@ kitten sitTing İo İo +-- !query +select validate_utf8(utf8_binary collate utf8_lcase_rtrim), validate_utf8(utf8_lcase collate utf8_binary_rtrim) from t5 +-- !query schema +struct +-- !query output +Hello, world! Nice day. Hello, world! Nice day. +Something else. Nothing here. Something else. Nothing here. 
+Spark SQL +aaAaAAaA aaAaAAaA +aaAaAAaA aaAaaAaA +aaAaAAaA aaAaaAaAaaAaaAaAaaAaaAaA +abc abc +abcdcba aBcDCbA +bbAbAAbA a +efd2 efd2 +kitten sitTing +İo i̇o +İo İo +İo İo +İo İo + + -- !query select try_validate_utf8(utf8_binary), try_validate_utf8(utf8_lcase) from t5 -- !query schema @@ -3908,6 +3996,28 @@ kitten sitTing İo İo +-- !query +select try_validate_utf8(utf8_binary collate utf8_lcase_rtrim), try_validate_utf8(utf8_lcase collate utf8_binary_rtrim) from t5 +-- !query schema +struct +-- !query output +Hello, world! Nice day. Hello, world! Nice day. +Something else. Nothing here. Something else. Nothing here. +Spark SQL +aaAaAAaA aaAaAAaA +aaAaAAaA aaAaaAaA +aaAaAAaA aaAaaAaAaaAaaAaAaaAaaAaA +abc abc +abcdcba aBcDCbA +bbAbAAbA a +efd2 efd2 +kitten sitTing +İo i̇o +İo İo +İo İo +İo İo + + -- !query select substr(utf8_binary, 2, 2), substr(utf8_lcase, 2, 2) from t5 -- !query schema diff --git a/sql/core/src/test/resources/sql-tests/results/cte.sql.out b/sql/core/src/test/resources/sql-tests/results/cte.sql.out index 97ed7e2c4f06a..8b316207250ec 100644 --- a/sql/core/src/test/resources/sql-tests/results/cte.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/cte.sql.out @@ -15,6 +15,14 @@ struct<> +-- !query +create temporary view t3 as select * from t +-- !query schema +struct<> +-- !query output + + + -- !query WITH s AS (SELECT 1 FROM s) SELECT * FROM s -- !query schema @@ -70,6 +78,16 @@ struct<1:int> 1 +-- !query +WITH t AS (SELECT 1) SELECT * FROM t3 +-- !query schema +struct +-- !query output +0 +1 +2 + + -- !query WITH s1 AS (SELECT 1 FROM s2), s2 AS (SELECT 1 FROM s1) SELECT * FROM s1, s2 -- !query schema @@ -580,3 +598,11 @@ DROP VIEW IF EXISTS t2 struct<> -- !query output + + +-- !query +DROP VIEW IF EXISTS t3 +-- !query schema +struct<> +-- !query output + diff --git a/sql/core/src/test/resources/sql-tests/results/date.sql.out b/sql/core/src/test/resources/sql-tests/results/date.sql.out index 6dc33b1f853e4..66d9e5419dd36 100644 --- 
a/sql/core/src/test/resources/sql-tests/results/date.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/date.sql.out @@ -207,7 +207,7 @@ org.apache.spark.SparkDateTimeException "condition" : "CANNOT_PARSE_TIMESTAMP", "sqlState" : "22007", "messageParameters" : { - "ansiConfig" : "\"spark.sql.ansi.enabled\"", + "func" : "`try_to_date`", "message" : "Invalid date 'February 29' as '1970' is not a leap year" } } diff --git a/sql/core/src/test/resources/sql-tests/results/datetime-legacy.sql.out b/sql/core/src/test/resources/sql-tests/results/datetime-legacy.sql.out index 22f98512ca5d9..9f68bb87776ab 100644 --- a/sql/core/src/test/resources/sql-tests/results/datetime-legacy.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/datetime-legacy.sql.out @@ -207,7 +207,7 @@ org.apache.spark.SparkDateTimeException "condition" : "CANNOT_PARSE_TIMESTAMP", "sqlState" : "22007", "messageParameters" : { - "ansiConfig" : "\"spark.sql.ansi.enabled\"", + "func" : "`try_to_date`", "message" : "Unparseable date: \"02-29\"" } } @@ -1585,7 +1585,7 @@ org.apache.spark.SparkDateTimeException "condition" : "CANNOT_PARSE_TIMESTAMP", "sqlState" : "22007", "messageParameters" : { - "ansiConfig" : "\"spark.sql.ansi.enabled\"", + "func" : "`try_to_timestamp`", "message" : "Unparseable date: \"2019-10-06 10:11:12.\"" } } @@ -1601,7 +1601,7 @@ org.apache.spark.SparkDateTimeException "condition" : "CANNOT_PARSE_TIMESTAMP", "sqlState" : "22007", "messageParameters" : { - "ansiConfig" : "\"spark.sql.ansi.enabled\"", + "func" : "`try_to_timestamp`", "message" : "Unparseable date: \"2019-10-06 10:11:12.0\"" } } @@ -1617,7 +1617,7 @@ org.apache.spark.SparkDateTimeException "condition" : "CANNOT_PARSE_TIMESTAMP", "sqlState" : "22007", "messageParameters" : { - "ansiConfig" : "\"spark.sql.ansi.enabled\"", + "func" : "`try_to_timestamp`", "message" : "Unparseable date: \"2019-10-06 10:11:12.1\"" } } @@ -1633,7 +1633,7 @@ org.apache.spark.SparkDateTimeException "condition" : 
"CANNOT_PARSE_TIMESTAMP", "sqlState" : "22007", "messageParameters" : { - "ansiConfig" : "\"spark.sql.ansi.enabled\"", + "func" : "`try_to_timestamp`", "message" : "Unparseable date: \"2019-10-06 10:11:12.12\"" } } @@ -1649,7 +1649,7 @@ org.apache.spark.SparkDateTimeException "condition" : "CANNOT_PARSE_TIMESTAMP", "sqlState" : "22007", "messageParameters" : { - "ansiConfig" : "\"spark.sql.ansi.enabled\"", + "func" : "`try_to_timestamp`", "message" : "Unparseable date: \"2019-10-06 10:11:12.123UTC\"" } } @@ -1665,7 +1665,7 @@ org.apache.spark.SparkDateTimeException "condition" : "CANNOT_PARSE_TIMESTAMP", "sqlState" : "22007", "messageParameters" : { - "ansiConfig" : "\"spark.sql.ansi.enabled\"", + "func" : "`try_to_timestamp`", "message" : "Unparseable date: \"2019-10-06 10:11:12.1234\"" } } @@ -1681,7 +1681,7 @@ org.apache.spark.SparkDateTimeException "condition" : "CANNOT_PARSE_TIMESTAMP", "sqlState" : "22007", "messageParameters" : { - "ansiConfig" : "\"spark.sql.ansi.enabled\"", + "func" : "`try_to_timestamp`", "message" : "Unparseable date: \"2019-10-06 10:11:12.12345CST\"" } } @@ -1697,7 +1697,7 @@ org.apache.spark.SparkDateTimeException "condition" : "CANNOT_PARSE_TIMESTAMP", "sqlState" : "22007", "messageParameters" : { - "ansiConfig" : "\"spark.sql.ansi.enabled\"", + "func" : "`try_to_timestamp`", "message" : "Unparseable date: \"2019-10-06 10:11:12.123456PST\"" } } @@ -1713,7 +1713,7 @@ org.apache.spark.SparkDateTimeException "condition" : "CANNOT_PARSE_TIMESTAMP", "sqlState" : "22007", "messageParameters" : { - "ansiConfig" : "\"spark.sql.ansi.enabled\"", + "func" : "`try_to_timestamp`", "message" : "Unparseable date: \"2019-10-06 10:11:12.1234567PST\"" } } @@ -1729,7 +1729,7 @@ org.apache.spark.SparkDateTimeException "condition" : "CANNOT_PARSE_TIMESTAMP", "sqlState" : "22007", "messageParameters" : { - "ansiConfig" : "\"spark.sql.ansi.enabled\"", + "func" : "`try_to_timestamp`", "message" : "Unparseable date: \"123456 2019-10-06 10:11:12.123456PST\"" } 
} @@ -1745,7 +1745,7 @@ org.apache.spark.SparkDateTimeException "condition" : "CANNOT_PARSE_TIMESTAMP", "sqlState" : "22007", "messageParameters" : { - "ansiConfig" : "\"spark.sql.ansi.enabled\"", + "func" : "`try_to_timestamp`", "message" : "Unparseable date: \"223456 2019-10-06 10:11:12.123456PST\"" } } @@ -1761,7 +1761,7 @@ org.apache.spark.SparkDateTimeException "condition" : "CANNOT_PARSE_TIMESTAMP", "sqlState" : "22007", "messageParameters" : { - "ansiConfig" : "\"spark.sql.ansi.enabled\"", + "func" : "`try_to_timestamp`", "message" : "Unparseable date: \"2019-10-06 10:11:12.1234\"" } } @@ -1777,7 +1777,7 @@ org.apache.spark.SparkDateTimeException "condition" : "CANNOT_PARSE_TIMESTAMP", "sqlState" : "22007", "messageParameters" : { - "ansiConfig" : "\"spark.sql.ansi.enabled\"", + "func" : "`try_to_timestamp`", "message" : "Unparseable date: \"2019-10-06 10:11:12.123\"" } } @@ -1793,7 +1793,7 @@ org.apache.spark.SparkDateTimeException "condition" : "CANNOT_PARSE_TIMESTAMP", "sqlState" : "22007", "messageParameters" : { - "ansiConfig" : "\"spark.sql.ansi.enabled\"", + "func" : "`try_to_timestamp`", "message" : "Unparseable date: \"2019-10-06 10:11:12\"" } } @@ -1809,7 +1809,7 @@ org.apache.spark.SparkDateTimeException "condition" : "CANNOT_PARSE_TIMESTAMP", "sqlState" : "22007", "messageParameters" : { - "ansiConfig" : "\"spark.sql.ansi.enabled\"", + "func" : "`try_to_timestamp`", "message" : "Unparseable date: \"2019-10-06 10:11:12.12\"" } } @@ -1825,7 +1825,7 @@ org.apache.spark.SparkDateTimeException "condition" : "CANNOT_PARSE_TIMESTAMP", "sqlState" : "22007", "messageParameters" : { - "ansiConfig" : "\"spark.sql.ansi.enabled\"", + "func" : "`try_to_timestamp`", "message" : "Unparseable date: \"2019-10-06 10:11\"" } } @@ -1841,7 +1841,7 @@ org.apache.spark.SparkDateTimeException "condition" : "CANNOT_PARSE_TIMESTAMP", "sqlState" : "22007", "messageParameters" : { - "ansiConfig" : "\"spark.sql.ansi.enabled\"", + "func" : "`try_to_timestamp`", "message" : 
"Unparseable date: \"2019-10-06S10:11:12.12345\"" } } @@ -1857,7 +1857,7 @@ org.apache.spark.SparkDateTimeException "condition" : "CANNOT_PARSE_TIMESTAMP", "sqlState" : "22007", "messageParameters" : { - "ansiConfig" : "\"spark.sql.ansi.enabled\"", + "func" : "`try_to_timestamp`", "message" : "Unparseable date: \"12.12342019-10-06S10:11\"" } } @@ -1873,7 +1873,7 @@ org.apache.spark.SparkDateTimeException "condition" : "CANNOT_PARSE_TIMESTAMP", "sqlState" : "22007", "messageParameters" : { - "ansiConfig" : "\"spark.sql.ansi.enabled\"", + "func" : "`try_to_timestamp`", "message" : "Unparseable date: \"12.1232019-10-06S10:11\"" } } @@ -1889,7 +1889,7 @@ org.apache.spark.SparkDateTimeException "condition" : "CANNOT_PARSE_TIMESTAMP", "sqlState" : "22007", "messageParameters" : { - "ansiConfig" : "\"spark.sql.ansi.enabled\"", + "func" : "`try_to_timestamp`", "message" : "Unparseable date: \"12.1232019-10-06S10:11\"" } } @@ -1905,7 +1905,7 @@ org.apache.spark.SparkDateTimeException "condition" : "CANNOT_PARSE_TIMESTAMP", "sqlState" : "22007", "messageParameters" : { - "ansiConfig" : "\"spark.sql.ansi.enabled\"", + "func" : "`try_to_timestamp`", "message" : "Unparseable date: \"12.1234019-10-06S10:11\"" } } @@ -1977,7 +1977,7 @@ org.apache.spark.SparkDateTimeException "condition" : "CANNOT_PARSE_TIMESTAMP", "sqlState" : "22007", "messageParameters" : { - "ansiConfig" : "\"spark.sql.ansi.enabled\"", + "func" : "`try_to_timestamp`", "message" : "Unparseable date: \"02-29\"" } } @@ -2208,7 +2208,7 @@ org.apache.spark.SparkDateTimeException "condition" : "CANNOT_PARSE_TIMESTAMP", "sqlState" : "22007", "messageParameters" : { - "ansiConfig" : "\"spark.sql.ansi.enabled\"", + "func" : "`try_to_timestamp`", "message" : "Unparseable date: \"2019-10-06 A\"" } } diff --git a/sql/core/src/test/resources/sql-tests/results/datetime-parsing-invalid.sql.out b/sql/core/src/test/resources/sql-tests/results/datetime-parsing-invalid.sql.out index 736eba0adf713..3a7537221d98f 100644 --- 
a/sql/core/src/test/resources/sql-tests/results/datetime-parsing-invalid.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/datetime-parsing-invalid.sql.out @@ -18,7 +18,7 @@ org.apache.spark.SparkDateTimeException "condition" : "CANNOT_PARSE_TIMESTAMP", "sqlState" : "22007", "messageParameters" : { - "ansiConfig" : "\"spark.sql.ansi.enabled\"", + "func" : "`try_to_timestamp`", "message" : "Text '1' could not be parsed at index 0" } } @@ -34,7 +34,7 @@ org.apache.spark.SparkDateTimeException "condition" : "CANNOT_PARSE_TIMESTAMP", "sqlState" : "22007", "messageParameters" : { - "ansiConfig" : "\"spark.sql.ansi.enabled\"", + "func" : "`try_to_timestamp`", "message" : "Text '-12' could not be parsed at index 0" } } @@ -50,7 +50,7 @@ org.apache.spark.SparkDateTimeException "condition" : "CANNOT_PARSE_TIMESTAMP", "sqlState" : "22007", "messageParameters" : { - "ansiConfig" : "\"spark.sql.ansi.enabled\"", + "func" : "`try_to_timestamp`", "message" : "Text '123' could not be parsed, unparsed text found at index 2" } } @@ -66,7 +66,7 @@ org.apache.spark.SparkDateTimeException "condition" : "CANNOT_PARSE_TIMESTAMP", "sqlState" : "22007", "messageParameters" : { - "ansiConfig" : "\"spark.sql.ansi.enabled\"", + "func" : "`try_to_timestamp`", "message" : "Text '1' could not be parsed at index 0" } } @@ -99,7 +99,7 @@ org.apache.spark.SparkDateTimeException "condition" : "CANNOT_PARSE_TIMESTAMP", "sqlState" : "22007", "messageParameters" : { - "ansiConfig" : "\"spark.sql.ansi.enabled\"", + "func" : "`try_to_timestamp`", "message" : "Invalid date 'DayOfYear 366' as '1970' is not a leap year" } } @@ -115,7 +115,7 @@ org.apache.spark.SparkDateTimeException "condition" : "CANNOT_PARSE_TIMESTAMP", "sqlState" : "22007", "messageParameters" : { - "ansiConfig" : "\"spark.sql.ansi.enabled\"", + "func" : "`try_to_timestamp`", "message" : "Text '9' could not be parsed at index 0" } } @@ -131,7 +131,7 @@ org.apache.spark.SparkDateTimeException "condition" : 
"CANNOT_PARSE_TIMESTAMP", "sqlState" : "22007", "messageParameters" : { - "ansiConfig" : "\"spark.sql.ansi.enabled\"", + "func" : "`try_to_timestamp`", "message" : "Invalid date 'DayOfYear 366' as '1970' is not a leap year" } } @@ -147,7 +147,7 @@ org.apache.spark.SparkDateTimeException "condition" : "CANNOT_PARSE_TIMESTAMP", "sqlState" : "22007", "messageParameters" : { - "ansiConfig" : "\"spark.sql.ansi.enabled\"", + "func" : "`try_to_timestamp`", "message" : "Text '9' could not be parsed at index 0" } } @@ -163,7 +163,7 @@ org.apache.spark.SparkDateTimeException "condition" : "CANNOT_PARSE_TIMESTAMP", "sqlState" : "22007", "messageParameters" : { - "ansiConfig" : "\"spark.sql.ansi.enabled\"", + "func" : "`try_to_timestamp`", "message" : "Text '99' could not be parsed at index 0" } } @@ -179,7 +179,7 @@ org.apache.spark.SparkDateTimeException "condition" : "CANNOT_PARSE_TIMESTAMP", "sqlState" : "22007", "messageParameters" : { - "ansiConfig" : "\"spark.sql.ansi.enabled\"", + "func" : "`try_to_timestamp`", "message" : "Conflict found: Field DayOfMonth 30 differs from DayOfMonth 31 derived from 1970-12-31." } } @@ -195,7 +195,7 @@ org.apache.spark.SparkDateTimeException "condition" : "CANNOT_PARSE_TIMESTAMP", "sqlState" : "22007", "messageParameters" : { - "ansiConfig" : "\"spark.sql.ansi.enabled\"", + "func" : "`try_to_timestamp`", "message" : "Conflict found: Field MonthOfYear 11 differs from MonthOfYear 12 derived from 1970-12-31." 
} } @@ -211,7 +211,7 @@ org.apache.spark.SparkDateTimeException "condition" : "CANNOT_PARSE_TIMESTAMP", "sqlState" : "22007", "messageParameters" : { - "ansiConfig" : "\"spark.sql.ansi.enabled\"", + "func" : "`try_to_timestamp`", "message" : "Text '2019-366' could not be parsed: Invalid date 'DayOfYear 366' as '2019' is not a leap year" } } @@ -227,7 +227,7 @@ org.apache.spark.SparkDateTimeException "condition" : "CANNOT_PARSE_TIMESTAMP", "sqlState" : "22007", "messageParameters" : { - "ansiConfig" : "\"spark.sql.ansi.enabled\"", + "func" : "`try_to_timestamp`", "message" : "Conflict found: Field DayOfMonth 30 differs from DayOfMonth 31 derived from 1970-12-31." } } @@ -243,7 +243,7 @@ org.apache.spark.SparkDateTimeException "condition" : "CANNOT_PARSE_TIMESTAMP", "sqlState" : "22007", "messageParameters" : { - "ansiConfig" : "\"spark.sql.ansi.enabled\"", + "func" : "`try_to_timestamp`", "message" : "Text '2020-01-365' could not be parsed: Conflict found: Field DayOfMonth 30 differs from DayOfMonth 1 derived from 2020-12-30" } } @@ -259,7 +259,7 @@ org.apache.spark.SparkDateTimeException "condition" : "CANNOT_PARSE_TIMESTAMP", "sqlState" : "22007", "messageParameters" : { - "ansiConfig" : "\"spark.sql.ansi.enabled\"", + "func" : "`try_to_timestamp`", "message" : "Text '2020-10-350' could not be parsed: Conflict found: Field MonthOfYear 12 differs from MonthOfYear 10 derived from 2020-12-15" } } @@ -275,7 +275,7 @@ org.apache.spark.SparkDateTimeException "condition" : "CANNOT_PARSE_TIMESTAMP", "sqlState" : "22007", "messageParameters" : { - "ansiConfig" : "\"spark.sql.ansi.enabled\"", + "func" : "`try_to_timestamp`", "message" : "Text '2020-11-31-366' could not be parsed: Invalid date 'NOVEMBER 31'" } } @@ -299,7 +299,7 @@ org.apache.spark.SparkDateTimeException "condition" : "CANNOT_PARSE_TIMESTAMP", "sqlState" : "22007", "messageParameters" : { - "ansiConfig" : "\"spark.sql.ansi.enabled\"", + "func" : "`try_to_date`", "message" : "Text '2020-01-27T20:06:11.847' 
could not be parsed at index 10" } } @@ -315,7 +315,7 @@ org.apache.spark.SparkDateTimeException "condition" : "CANNOT_PARSE_TIMESTAMP", "sqlState" : "22007", "messageParameters" : { - "ansiConfig" : "\"spark.sql.ansi.enabled\"", + "func" : "`try_to_date`", "message" : "Text 'Unparseable' could not be parsed at index 0" } } @@ -331,7 +331,7 @@ org.apache.spark.SparkDateTimeException "condition" : "CANNOT_PARSE_TIMESTAMP", "sqlState" : "22007", "messageParameters" : { - "ansiConfig" : "\"spark.sql.ansi.enabled\"", + "func" : "`try_to_timestamp`", "message" : "Text '2020-01-27T20:06:11.847' could not be parsed at index 10" } } @@ -347,7 +347,7 @@ org.apache.spark.SparkDateTimeException "condition" : "CANNOT_PARSE_TIMESTAMP", "sqlState" : "22007", "messageParameters" : { - "ansiConfig" : "\"spark.sql.ansi.enabled\"", + "func" : "`try_to_timestamp`", "message" : "Text 'Unparseable' could not be parsed at index 0" } } @@ -363,7 +363,7 @@ org.apache.spark.SparkDateTimeException "condition" : "CANNOT_PARSE_TIMESTAMP", "sqlState" : "22007", "messageParameters" : { - "ansiConfig" : "\"spark.sql.ansi.enabled\"", + "func" : "`try_to_timestamp`", "message" : "Text '2020-01-27T20:06:11.847' could not be parsed at index 10" } } @@ -379,7 +379,7 @@ org.apache.spark.SparkDateTimeException "condition" : "CANNOT_PARSE_TIMESTAMP", "sqlState" : "22007", "messageParameters" : { - "ansiConfig" : "\"spark.sql.ansi.enabled\"", + "func" : "`try_to_timestamp`", "message" : "Text 'Unparseable' could not be parsed at index 0" } } @@ -395,7 +395,7 @@ org.apache.spark.SparkDateTimeException "condition" : "CANNOT_PARSE_TIMESTAMP", "sqlState" : "22007", "messageParameters" : { - "ansiConfig" : "\"spark.sql.ansi.enabled\"", + "func" : "`try_to_timestamp`", "message" : "Text '2020-01-27T20:06:11.847' could not be parsed at index 10" } } @@ -411,7 +411,7 @@ org.apache.spark.SparkDateTimeException "condition" : "CANNOT_PARSE_TIMESTAMP", "sqlState" : "22007", "messageParameters" : { - "ansiConfig" : 
"\"spark.sql.ansi.enabled\"", + "func" : "`try_to_timestamp`", "message" : "Text 'Unparseable' could not be parsed at index 0" } } diff --git a/sql/core/src/test/resources/sql-tests/results/describe.sql.out b/sql/core/src/test/resources/sql-tests/results/describe.sql.out index 39bf681d25a96..d945823191026 100644 --- a/sql/core/src/test/resources/sql-tests/results/describe.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/describe.sql.out @@ -71,6 +71,41 @@ c string d string +-- !query +DESCRIBE EXTENDED t AS JSON +-- !query schema +struct +-- !query output +{"table_name":"t","catalog_name":"spark_catalog","namespace":["default"],"schema_name":"default","columns":[{"name":"a","type":{"name":"string"},"nullable":true},{"name":"b","type":{"name":"int"},"nullable":true},{"name":"c","type":{"name":"string"},"nullable":true},{"name":"d","type":{"name":"string"},"nullable":true}],"num_buckets":2,"bucket_columns":["a"],"sort_columns":["b"],"location":"file:[not included in comparison]/{warehouse_dir}/t","storage_properties":{"a":"1","b":"2","password":"*********(redacted)"},"created_time [not included in comparison]":"None","last_access [not included in comparison]":"None","created_by [not included in comparison]":"None","type":"MANAGED","provider":"parquet","comment":"table_comment","table_properties":{"e":"3","password":"*********(redacted)","t":"test"},"partition_provider":"Catalog","partition_columns":["c","d"]} + + +-- !query +DESCRIBE t AS JSON +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.catalyst.parser.ParseException +{ + "condition" : "DESCRIBE_JSON_NOT_EXTENDED", + "sqlState" : "0A000", + "messageParameters" : { + "tableName" : "t" + } +} + + +-- !query +DESC FORMATTED t a AS JSON +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.catalyst.parser.ParseException +{ + "condition" : "UNSUPPORTED_FEATURE.DESC_TABLE_COLUMN_JSON", + "sqlState" : "0A000" +} + + -- !query DESC default.t -- !query schema @@ -263,6 +298,14 
@@ c string d string +-- !query +DESC EXTENDED t PARTITION (c='Us', d=1) AS JSON +-- !query schema +struct +-- !query output +{"table_name":"t","catalog_name":"spark_catalog","namespace":["default"],"schema_name":"default","columns":[{"name":"a","type":{"name":"string"},"nullable":true},{"name":"b","type":{"name":"int"},"nullable":true},{"name":"c","type":{"name":"string"},"nullable":true},{"name":"d","type":{"name":"string"},"nullable":true}],"partition_values":{"c":"Us","d":"1"},"location":"file:[not included in comparison]/{warehouse_dir}/t/c=Us/d=1","storage_properties":{"a":"1","b":"2","password":"*********(redacted)"},"created_time [not included in comparison]":"None","last_access [not included in comparison]":"None","created_by [not included in comparison]":"None","type":"MANAGED","provider":"parquet","num_buckets":2,"bucket_columns":["a"],"sort_columns":["b"],"table_properties":{"password":"*********(redacted)","t":"test"},"partition_provider":"Catalog","partition_columns":["c","d"]} + + -- !query DESC EXTENDED t PARTITION (c='Us', d=1) -- !query schema @@ -538,7 +581,7 @@ View Text SELECT * FROM t View Original Text SELECT * FROM t View Schema Mode COMPENSATION View Catalog and Namespace spark_catalog.default -View Query Output Columns [a, b, c, d] +View Query Output Columns [`a`, `b`, `c`, `d`] -- !query @@ -563,7 +606,7 @@ View Text SELECT * FROM t View Original Text SELECT * FROM t View Schema Mode COMPENSATION View Catalog and Namespace spark_catalog.default -View Query Output Columns [a, b, c, d] +View Query Output Columns [`a`, `b`, `c`, `d`] -- !query @@ -644,6 +687,17 @@ Execute DescribeTableCommand +- DescribeTableCommand `spark_catalog`.`default`.`t`, [c=Us, d=2], false, [col_name#x, data_type#x, comment#x] +-- !query +EXPLAIN DESCRIBE EXTENDED t PARTITION (c='Us', d=2) AS JSON +-- !query schema +struct +-- !query output +== Physical Plan == +Execute DescribeRelationJsonCommand + +- DescribeRelationJsonCommand [c=Us, d=2], true, [json_metadata#x] 
+ +- ResolvedTable V2SessionCatalog(spark_catalog), default.t, V1Table(default.t), [a#x, b#x, c#x, d#x] + + -- !query DROP TABLE t -- !query schema diff --git a/sql/core/src/test/resources/sql-tests/results/group-by.sql.out b/sql/core/src/test/resources/sql-tests/results/group-by.sql.out index 2a53427b57900..0f61924aa425e 100644 --- a/sql/core/src/test/resources/sql-tests/results/group-by.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/group-by.sql.out @@ -987,6 +987,15 @@ struct>> [{"x":1,"y":1.0},{"x":2,"y":1.0},{"x":3,"y":1.0}] +-- !query +SELECT histogram_numeric(col, 3) FROM VALUES + (CAST(1 AS DECIMAL(4, 2))), (CAST(2 AS DECIMAL(4, 2))), (CAST(3 AS DECIMAL(4, 2))) AS tab(col) +-- !query schema +struct>> +-- !query output +[{"x":1.00,"y":1.0},{"x":2.00,"y":1.0},{"x":3.00,"y":1.0}] + + -- !query SELECT histogram_numeric(col, 3) FROM VALUES (TIMESTAMP '2017-03-01 00:00:00'), (TIMESTAMP '2017-04-01 00:00:00'), (TIMESTAMP '2017-05-01 00:00:00') AS tab(col) diff --git a/sql/core/src/test/resources/sql-tests/results/keywords-enforced.sql.out b/sql/core/src/test/resources/sql-tests/results/keywords-enforced.sql.out index 7d96a3e98c832..521b0afe19264 100644 --- a/sql/core/src/test/resources/sql-tests/results/keywords-enforced.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/keywords-enforced.sql.out @@ -170,6 +170,7 @@ IS true ITEMS false ITERATE false JOIN true +JSON false KEYS false LANGUAGE false LAST false @@ -252,6 +253,7 @@ REAL false RECORDREADER false RECORDWRITER false RECOVER false +RECURSIVE true REDUCE false REFERENCES true REFRESH false @@ -432,6 +434,7 @@ ORDER OUTER OVERLAPS PRIMARY +RECURSIVE REFERENCES RIGHT SELECT diff --git a/sql/core/src/test/resources/sql-tests/results/keywords.sql.out b/sql/core/src/test/resources/sql-tests/results/keywords.sql.out index 6cbfe519a76f6..4d702588ad2b3 100644 --- a/sql/core/src/test/resources/sql-tests/results/keywords.sql.out +++ 
b/sql/core/src/test/resources/sql-tests/results/keywords.sql.out @@ -170,6 +170,7 @@ IS false ITEMS false ITERATE false JOIN false +JSON false KEYS false LANGUAGE false LAST false @@ -252,6 +253,7 @@ REAL false RECORDREADER false RECORDWRITER false RECOVER false +RECURSIVE false REDUCE false REFERENCES false REFRESH false diff --git a/sql/core/src/test/resources/sql-tests/results/listagg-collations.sql.out b/sql/core/src/test/resources/sql-tests/results/listagg-collations.sql.out new file mode 100644 index 0000000000000..a21c0ced7a124 --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/results/listagg-collations.sql.out @@ -0,0 +1,82 @@ +-- Automatically generated by SQLQueryTestSuite +-- !query +SELECT listagg(c1) WITHIN GROUP (ORDER BY c1 COLLATE utf8_binary) FROM (VALUES ('a'), ('A'), ('b'), ('B')) AS t(c1) +-- !query schema +struct +-- !query output +ABab + + +-- !query +SELECT listagg(c1) WITHIN GROUP (ORDER BY c1 COLLATE utf8_lcase) FROM (VALUES ('a'), ('A'), ('b'), ('B')) AS t(c1) +-- !query schema +struct +-- !query output +aAbB + + +-- !query +SELECT listagg(DISTINCT c1 COLLATE utf8_binary) FROM (VALUES ('a'), ('A'), ('b'), ('B')) AS t(c1) +-- !query schema +struct +-- !query output +aAbB + + +-- !query +SELECT listagg(DISTINCT c1 COLLATE utf8_lcase) FROM (VALUES ('a'), ('A'), ('b'), ('B')) AS t(c1) +-- !query schema +struct +-- !query output +ab + + +-- !query +SELECT listagg(DISTINCT c1 COLLATE utf8_lcase) WITHIN GROUP (ORDER BY c1 COLLATE utf8_lcase) FROM (VALUES ('a'), ('B'), ('b'), ('A')) AS t(c1) +-- !query schema +struct +-- !query output +aB + + +-- !query +SELECT listagg(DISTINCT c1 COLLATE unicode_rtrim) FROM (VALUES ('abc '), ('abc '), ('x'), ('abc')) AS t(c1) +-- !query schema +struct +-- !query output +abc x + + +-- !query +SELECT listagg(c1) WITHIN GROUP (ORDER BY c1) FROM (VALUES ('abc '), ('abc '), ('abc\n'), ('abc'), ('x')) AS t(c1) +-- !query schema +struct +-- !query output +abcabc +abc abc x + + +-- !query +SELECT listagg(c1) 
WITHIN GROUP (ORDER BY c1 COLLATE unicode_rtrim) FROM (VALUES ('abc '), ('abc '), ('abc\n'), ('abc'), ('x')) AS t(c1) +-- !query schema +struct +-- !query output +abc abc abcabc +x + + +-- !query +SELECT listagg(DISTINCT c1 COLLATE utf8_lcase) WITHIN GROUP (ORDER BY c1 COLLATE utf8_binary) FROM (VALUES ('a'), ('b'), ('A'), ('B')) AS t(c1) +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.catalyst.ExtendedAnalysisException +{ + "condition" : "INVALID_WITHIN_GROUP_EXPRESSION.MISMATCH_WITH_DISTINCT_INPUT", + "sqlState" : "42K0K", + "messageParameters" : { + "funcArg" : "\"collate(c1, utf8_lcase)\"", + "funcName" : "`listagg`", + "orderingExpr" : "\"collate(c1, utf8_binary)\"" + } +} diff --git a/sql/core/src/test/resources/sql-tests/results/listagg.sql.out b/sql/core/src/test/resources/sql-tests/results/listagg.sql.out new file mode 100644 index 0000000000000..4dce4cfc858d7 --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/results/listagg.sql.out @@ -0,0 +1,368 @@ +-- Automatically generated by SQLQueryTestSuite +-- !query +CREATE TEMP VIEW df AS +SELECT * FROM (VALUES ('a', 'b'), ('a', 'c'), ('b', 'c'), ('b', 'd'), (NULL, NULL)) AS t(a, b) +-- !query schema +struct<> +-- !query output + + + +-- !query +CREATE TEMP VIEW df2 AS +SELECT * FROM (VALUES (1, true), (2, false), (3, false)) AS t(a, b) +-- !query schema +struct<> +-- !query output + + + +-- !query +SELECT listagg(b) FROM df GROUP BY a +-- !query schema +struct +-- !query output +NULL +bc +cd + + +-- !query +SELECT string_agg(b) FROM df GROUP BY a +-- !query schema +struct +-- !query output +NULL +bc +cd + + +-- !query +SELECT listagg(b, NULL) FROM df GROUP BY a +-- !query schema +struct +-- !query output +NULL +bc +cd + + +-- !query +SELECT listagg(b) FROM df WHERE 1 != 1 +-- !query schema +struct +-- !query output +NULL + + +-- !query +SELECT listagg(b, '|') FROM df GROUP BY a +-- !query schema +struct +-- !query output +NULL +b|c +c|d + + +-- !query +SELECT listagg(a) FROM df +-- 
!query schema +struct +-- !query output +aabb + + +-- !query +SELECT listagg(DISTINCT a) FROM df +-- !query schema +struct +-- !query output +ab + + +-- !query +SELECT listagg(a) WITHIN GROUP (ORDER BY a) FROM df +-- !query schema +struct +-- !query output +aabb + + +-- !query +SELECT listagg(a) WITHIN GROUP (ORDER BY a DESC) FROM df +-- !query schema +struct +-- !query output +bbaa + + +-- !query +SELECT listagg(a) WITHIN GROUP (ORDER BY a DESC) OVER (PARTITION BY b) FROM df +-- !query schema +struct +-- !query output +NULL +a +b +ba +ba + + +-- !query +SELECT listagg(a) WITHIN GROUP (ORDER BY b) FROM df +-- !query schema +struct +-- !query output +aabb + + +-- !query +SELECT listagg(a) WITHIN GROUP (ORDER BY b DESC) FROM df +-- !query schema +struct +-- !query output +baba + + +-- !query +SELECT listagg(a, '|') WITHIN GROUP (ORDER BY b DESC) FROM df +-- !query schema +struct +-- !query output +b|a|b|a + + +-- !query +SELECT listagg(a) WITHIN GROUP (ORDER BY b DESC, a ASC) FROM df +-- !query schema +struct +-- !query output +baba + + +-- !query +SELECT listagg(a) WITHIN GROUP (ORDER BY b DESC, a DESC) FROM df +-- !query schema +struct +-- !query output +bbaa + + +-- !query +SELECT listagg(c1) FROM (VALUES (X'DEAD'), (X'BEEF')) AS t(c1) +-- !query schema +struct +-- !query output +ޭ�� + + +-- !query +SELECT listagg(c1, NULL) FROM (VALUES (X'DEAD'), (X'BEEF')) AS t(c1) +-- !query schema +struct +-- !query output +ޭ�� + + +-- !query +SELECT listagg(c1, X'42') FROM (VALUES (X'DEAD'), (X'BEEF')) AS t(c1) +-- !query schema +struct +-- !query output +ޭB�� + + +-- !query +SELECT listagg(a), listagg(b, ',') FROM df2 +-- !query schema +struct +-- !query output +123 true,false,false + + +-- !query +SELECT listagg(c1) FROM (VALUES (ARRAY('a', 'b'))) AS t(c1) +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.catalyst.ExtendedAnalysisException +{ + "condition" : "DATATYPE_MISMATCH.UNEXPECTED_INPUT_TYPE", + "sqlState" : "42K09", + "messageParameters" : { + 
"inputSql" : "\"c1\"", + "inputType" : "\"ARRAY\"", + "paramIndex" : "first", + "requiredType" : "(\"STRING\" or \"BINARY\")", + "sqlExpr" : "\"listagg(c1, NULL)\"" + }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 8, + "stopIndex" : 18, + "fragment" : "listagg(c1)" + } ] +} + + +-- !query +SELECT listagg(c1, ', ') FROM (VALUES (X'DEAD'), (X'BEEF')) AS t(c1) +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.catalyst.ExtendedAnalysisException +{ + "condition" : "DATATYPE_MISMATCH.DATA_DIFF_TYPES", + "sqlState" : "42K09", + "messageParameters" : { + "dataType" : "(\"BINARY\" or \"STRING\")", + "functionName" : "`listagg`", + "sqlExpr" : "\"listagg(c1, , )\"" + }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 8, + "stopIndex" : 24, + "fragment" : "listagg(c1, ', ')" + } ] +} + + +-- !query +SELECT listagg(b, a) FROM df GROUP BY a +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.catalyst.ExtendedAnalysisException +{ + "condition" : "DATATYPE_MISMATCH.NON_FOLDABLE_INPUT", + "sqlState" : "42K09", + "messageParameters" : { + "inputExpr" : "\"a\"", + "inputName" : "`delimiter`", + "inputType" : "\"STRING\"", + "sqlExpr" : "\"listagg(b, a)\"" + }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 8, + "stopIndex" : 20, + "fragment" : "listagg(b, a)" + } ] +} + + +-- !query +SELECT listagg(a) OVER (ORDER BY a) FROM df +-- !query schema +struct +-- !query output +NULL +aa +aa +aabb +aabb + + +-- !query +SELECT listagg(a) WITHIN GROUP (ORDER BY a) OVER (ORDER BY a) FROM df +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.catalyst.ExtendedAnalysisException +{ + "condition" : "INVALID_WINDOW_SPEC_FOR_AGGREGATION_FUNC", + "sqlState" : "42601", + "messageParameters" : { + "aggFunc" : "\"listagg(a, NULL, a ASC NULLS FIRST)\"" + }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 8, + 
"stopIndex" : 61, + "fragment" : "listagg(a) WITHIN GROUP (ORDER BY a) OVER (ORDER BY a)" + } ] +} + + +-- !query +SELECT string_agg(a) WITHIN GROUP (ORDER BY a) OVER (ORDER BY a) FROM df +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.catalyst.ExtendedAnalysisException +{ + "condition" : "INVALID_WINDOW_SPEC_FOR_AGGREGATION_FUNC", + "sqlState" : "42601", + "messageParameters" : { + "aggFunc" : "\"listagg(a, NULL, a ASC NULLS FIRST)\"" + }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 8, + "stopIndex" : 64, + "fragment" : "string_agg(a) WITHIN GROUP (ORDER BY a) OVER (ORDER BY a)" + } ] +} + + +-- !query +SELECT listagg(DISTINCT a) OVER (ORDER BY a) FROM df +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.catalyst.ExtendedAnalysisException +{ + "condition" : "DISTINCT_WINDOW_FUNCTION_UNSUPPORTED", + "sqlState" : "0A000", + "messageParameters" : { + "windowExpr" : "\"listagg(DISTINCT a, NULL) OVER (ORDER BY a ASC NULLS FIRST RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW)\"" + }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 8, + "stopIndex" : 44, + "fragment" : "listagg(DISTINCT a) OVER (ORDER BY a)" + } ] +} + + +-- !query +SELECT listagg(DISTINCT a) WITHIN GROUP (ORDER BY b) FROM df +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.catalyst.ExtendedAnalysisException +{ + "condition" : "INVALID_WITHIN_GROUP_EXPRESSION.MISMATCH_WITH_DISTINCT_INPUT", + "sqlState" : "42K0K", + "messageParameters" : { + "funcArg" : "\"a\"", + "funcName" : "`listagg`", + "orderingExpr" : "\"b\"" + } +} + + +-- !query +SELECT listagg(DISTINCT a) WITHIN GROUP (ORDER BY a, b) FROM df +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.catalyst.ExtendedAnalysisException +{ + "condition" : "INVALID_WITHIN_GROUP_EXPRESSION.MISMATCH_WITH_DISTINCT_INPUT", + "sqlState" : "42K0K", + "messageParameters" : { + "funcArg" : "\"a\"", + "funcName" : 
"`listagg`", + "orderingExpr" : "\"a\", \"b\"" + } +} diff --git a/sql/core/src/test/resources/sql-tests/results/mode.sql.out b/sql/core/src/test/resources/sql-tests/results/mode.sql.out index 77f008b6b0204..d5ab4509102b9 100644 --- a/sql/core/src/test/resources/sql-tests/results/mode.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/mode.sql.out @@ -51,7 +51,7 @@ struct<> -- !query output org.apache.spark.sql.AnalysisException { - "condition" : "INVALID_INVERSE_DISTRIBUTION_FUNCTION.DISTINCT_UNSUPPORTED", + "condition" : "INVALID_WITHIN_GROUP_EXPRESSION.DISTINCT_UNSUPPORTED", "sqlState" : "42K0K", "messageParameters" : { "funcName" : "`mode`" @@ -373,7 +373,7 @@ struct<> -- !query output org.apache.spark.sql.AnalysisException { - "condition" : "INVALID_INVERSE_DISTRIBUTION_FUNCTION.DISTINCT_UNSUPPORTED", + "condition" : "INVALID_WITHIN_GROUP_EXPRESSION.DISTINCT_UNSUPPORTED", "sqlState" : "42K0K", "messageParameters" : { "funcName" : "`mode`" @@ -397,7 +397,7 @@ struct<> -- !query output org.apache.spark.sql.AnalysisException { - "condition" : "INVALID_INVERSE_DISTRIBUTION_FUNCTION.WITHIN_GROUP_MISSING", + "condition" : "INVALID_WITHIN_GROUP_EXPRESSION.WITHIN_GROUP_MISSING", "sqlState" : "42K0K", "messageParameters" : { "funcName" : "`mode`" @@ -421,7 +421,7 @@ struct<> -- !query output org.apache.spark.sql.AnalysisException { - "condition" : "INVALID_INVERSE_DISTRIBUTION_FUNCTION.WRONG_NUM_ORDERINGS", + "condition" : "INVALID_WITHIN_GROUP_EXPRESSION.WRONG_NUM_ORDERINGS", "sqlState" : "42K0K", "messageParameters" : { "actualNum" : "1", diff --git a/sql/core/src/test/resources/sql-tests/results/nonansi/keywords.sql.out b/sql/core/src/test/resources/sql-tests/results/nonansi/keywords.sql.out index 6cbfe519a76f6..4d702588ad2b3 100644 --- a/sql/core/src/test/resources/sql-tests/results/nonansi/keywords.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/nonansi/keywords.sql.out @@ -170,6 +170,7 @@ IS false ITEMS false ITERATE false JOIN false +JSON 
false KEYS false LANGUAGE false LAST false @@ -252,6 +253,7 @@ REAL false RECORDREADER false RECORDWRITER false RECOVER false +RECURSIVE false REDUCE false REFERENCES false REFRESH false diff --git a/sql/core/src/test/resources/sql-tests/results/percentiles.sql.out b/sql/core/src/test/resources/sql-tests/results/percentiles.sql.out index 6f73e928e2345..5f052c8ff22c5 100644 --- a/sql/core/src/test/resources/sql-tests/results/percentiles.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/percentiles.sql.out @@ -222,7 +222,7 @@ struct<> -- !query output org.apache.spark.sql.AnalysisException { - "condition" : "INVALID_INVERSE_DISTRIBUTION_FUNCTION.DISTINCT_UNSUPPORTED", + "condition" : "INVALID_WITHIN_GROUP_EXPRESSION.DISTINCT_UNSUPPORTED", "sqlState" : "42K0K", "messageParameters" : { "funcName" : "`percentile_cont`" @@ -246,7 +246,7 @@ struct<> -- !query output org.apache.spark.sql.AnalysisException { - "condition" : "INVALID_INVERSE_DISTRIBUTION_FUNCTION.DISTINCT_UNSUPPORTED", + "condition" : "INVALID_WITHIN_GROUP_EXPRESSION.DISTINCT_UNSUPPORTED", "sqlState" : "42K0K", "messageParameters" : { "funcName" : "`percentile_cont`" @@ -324,7 +324,7 @@ struct<> -- !query output org.apache.spark.sql.AnalysisException { - "condition" : "INVALID_INVERSE_DISTRIBUTION_FUNCTION.WITHIN_GROUP_MISSING", + "condition" : "INVALID_WITHIN_GROUP_EXPRESSION.WITHIN_GROUP_MISSING", "sqlState" : "42K0K", "messageParameters" : { "funcName" : "`percentile_cont`" @@ -348,7 +348,7 @@ struct<> -- !query output org.apache.spark.sql.AnalysisException { - "condition" : "INVALID_INVERSE_DISTRIBUTION_FUNCTION.WITHIN_GROUP_MISSING", + "condition" : "INVALID_WITHIN_GROUP_EXPRESSION.WITHIN_GROUP_MISSING", "sqlState" : "42K0K", "messageParameters" : { "funcName" : "`percentile_cont`" @@ -372,7 +372,7 @@ struct<> -- !query output org.apache.spark.sql.AnalysisException { - "condition" : "INVALID_INVERSE_DISTRIBUTION_FUNCTION.WRONG_NUM_ORDERINGS", + "condition" : 
"INVALID_WITHIN_GROUP_EXPRESSION.WRONG_NUM_ORDERINGS", "sqlState" : "42K0K", "messageParameters" : { "actualNum" : "2", diff --git a/sql/core/src/test/resources/sql-tests/results/pipe-operators.sql.out b/sql/core/src/test/resources/sql-tests/results/pipe-operators.sql.out index 7ac81c6671a1c..8473fe0cec8ca 100644 --- a/sql/core/src/test/resources/sql-tests/results/pipe-operators.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/pipe-operators.sql.out @@ -232,6 +232,136 @@ struct<> +-- !query +from t +-- !query schema +struct +-- !query output +0 abc +1 def + + +-- !query +table t +-- !query schema +struct +-- !query output +0 abc +1 def + + +-- !query +from t +|> select 1 as x +-- !query schema +struct +-- !query output +1 +1 + + +-- !query +from t as t_alias +|> select t_alias.x +-- !query schema +struct +-- !query output +0 +1 + + +-- !query +from t as t_alias +|> select t_alias.x as tx, t_alias.y as ty +|> where ty = 'def' +|> select tx +-- !query schema +struct +-- !query output +1 + + +-- !query +from t, other +|> select t.x + other.a as z +-- !query schema +struct +-- !query output +1 +1 +2 +2 +2 +3 + + +-- !query +from t join other on (t.x = other.a) +|> select t.x + other.a as z +-- !query schema +struct +-- !query output +2 +2 + + +-- !query +from t lateral view explode(array(100, 101)) as ly +|> select t.x + ly as z +-- !query schema +struct +-- !query output +100 +101 +101 +102 + + +-- !query +from st +|> select col.i1 +-- !query schema +struct +-- !query output +2 + + +-- !query +from st as st_alias +|> select st_alias.col.i1 +-- !query schema +struct +-- !query output +2 + + +-- !query +from values (0), (1) tab(col) +|> select col as x +-- !query schema +struct +-- !query output +0 +1 + + +-- !query +from t +|> from t +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.catalyst.parser.ParseException +{ + "condition" : "PARSE_SYNTAX_ERROR", + "sqlState" : "42601", + "messageParameters" : { + "error" : "'from'", + "hint" : "" + 
} +} + + -- !query table t |> select 1 as x @@ -511,7 +641,7 @@ struct table t |> extend 1 -- !query schema -struct +struct -- !query output 0 abc 1 1 def 1 @@ -538,51 +668,532 @@ struct -- !query -table t -|> extend x + length(y) as z, x + 1 as zz +table t +|> extend x + length(y) as z, x + 1 as zz +-- !query schema +struct +-- !query output +0 abc 3 1 +1 def 4 2 + + +-- !query +table t +|> extend x + length(y) as z +|> extend z + 1 as zz +-- !query schema +struct +-- !query output +0 abc 3 4 +1 def 4 5 + + +-- !query +select col from st +|> extend col.i1 as z +-- !query schema +struct,z:int> +-- !query output +{"i1":2,"i2":3} 2 + + +-- !query +table t +|> extend (select a from other where x = a limit 1) as z +-- !query schema +struct +-- !query output +0 abc NULL +1 def 1 + + +-- !query +table t +|> where exists ( + table other + |> extend t.x + |> select * except (a, b)) +-- !query schema +struct +-- !query output +0 abc +1 def + + +-- !query +table t +|> extend 1 as x +-- !query schema +struct +-- !query output +0 abc 1 +1 def 1 + + +-- !query +table t +|> extend first_value(x) over (partition by y) as result +-- !query schema +struct +-- !query output +0 abc 0 +1 def 1 + + +-- !query +table t +|> extend x + length(y) as z, z + 1 as plus_one +-- !query schema +struct +-- !query output +0 abc 3 4 +1 def 4 5 + + +-- !query +table t +|> extend sum(x) as z +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException +{ + "condition" : "PIPE_OPERATOR_CONTAINS_AGGREGATE_FUNCTION", + "sqlState" : "0A000", + "messageParameters" : { + "clause" : "EXTEND", + "expr" : "sum(x#x)" + }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 19, + "stopIndex" : 24, + "fragment" : "sum(x)" + } ] +} + + +-- !query +table t +|> extend distinct x as z +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.catalyst.parser.ParseException +{ + "condition" : "PARSE_SYNTAX_ERROR", + "sqlState" : "42601", + 
"messageParameters" : { + "error" : "'as'", + "hint" : "" + } +} + + +-- !query +table t +|> extend * +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException +{ + "condition" : "INVALID_USAGE_OF_STAR_OR_REGEX", + "sqlState" : "42000", + "messageParameters" : { + "elem" : "'*'", + "prettyName" : "expression `pipeexpression`" + } +} + + +-- !query +table t +|> set x = 1 +-- !query schema +struct +-- !query output +1 abc +1 def + + +-- !query +table t +|> set y = x +-- !query schema +struct +-- !query output +0 0 +1 1 + + +-- !query +table t +|> extend 1 as z +|> set z = x + length(y) +-- !query schema +struct +-- !query output +0 abc 3 +1 def 4 + + +-- !query +table t +|> extend 1 as z +|> extend 2 as zz +|> set z = x + length(y), zz = x + 1 +-- !query schema +struct +-- !query output +0 abc 3 1 +1 def 4 2 + + +-- !query +table other +|> extend 3 as c +|> set a = b, b = c +-- !query schema +struct +-- !query output +1 3 3 +2 3 3 +4 3 3 + + +-- !query +table t +|> extend 1 as z +|> extend 2 as zz +|> set z = x + length(y), zz = z + 1 +-- !query schema +struct +-- !query output +0 abc 3 4 +1 def 4 5 + + +-- !query +table t +|> extend 1 as z +|> set z = x + length(y) +|> set z = z + 1 +-- !query schema +struct +-- !query output +0 abc 4 +1 def 5 + + +-- !query +table t +|> extend 1 as z +|> set z = x + length(y), z = z + 1 +-- !query schema +struct +-- !query output +0 abc 4 +1 def 5 + + +-- !query +select col from st +|> extend 1 as z +|> set z = col.i1 +-- !query schema +struct,z:int> +-- !query output +{"i1":2,"i2":3} 2 + + +-- !query +table t +|> set y = (select a from other where x = a limit 1) +-- !query schema +struct +-- !query output +0 NULL +1 1 + + +-- !query +table t +|> extend 1 as `x.y.z` +|> set `x.y.z` = x + length(y) +-- !query schema +struct +-- !query output +0 abc 3 +1 def 4 + + +-- !query +table t +|> extend 1 as z +|> set z = first_value(x) over (partition by y) +-- !query schema +struct +-- !query output +0 abc 0 +1 
def 1 + + +-- !query +values (0), (1) lhs(a) +|> inner join values (1), (2) rhs(a) using (a) +|> extend lhs.a + rhs.a as z1 +|> extend lhs.a - rhs.a as z2 +|> drop z1 +|> where z2 = 0 +|> order by lhs.a, rhs.a, z2 +|> set z2 = 4 +|> limit 2 +|> select lhs.a, rhs.a, z2 +-- !query schema +struct +-- !query output +1 1 4 + + +-- !query +table t +|> set z = 1 +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException +{ + "condition" : "UNRESOLVED_COLUMN.WITH_SUGGESTION", + "sqlState" : "42703", + "messageParameters" : { + "objectName" : "`z`", + "proposal" : "`x`, `y`" + }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 1, + "stopIndex" : 20, + "fragment" : "table t\n|> set z = 1" + } ] +} + + +-- !query +table t +|> set x = 1 as z +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.catalyst.parser.ParseException +{ + "condition" : "PARSE_SYNTAX_ERROR", + "sqlState" : "42601", + "messageParameters" : { + "error" : "'as'", + "hint" : "" + } +} + + +-- !query +select col from st +|> set col.i1 = 42 +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.catalyst.parser.ParseException +{ + "condition" : "_LEGACY_ERROR_TEMP_0035", + "messageParameters" : { + "message" : "SQL pipe syntax |> SET operator with multi-part assignment key (only single-part keys are allowed)" + }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 27, + "stopIndex" : 37, + "fragment" : "col.i1 = 42" + } ] +} + + +-- !query +table t +|> drop y +-- !query schema +struct +-- !query output +0 +1 + + +-- !query +select 1 as x, 2 as y, 3 as z +|> drop z, y +-- !query schema +struct +-- !query output +1 + + +-- !query +select 1 as x, 2 as y, 3 as z +|> drop z +|> drop y +-- !query schema +struct +-- !query output +1 + + +-- !query +select x from t +|> drop x +-- !query schema +struct<> +-- !query output + + + +-- !query +table t +|> extend 1 as `x.y.z` +|> drop `x.y.z` +-- !query 
schema +struct +-- !query output +0 abc +1 def + + +-- !query +table t +|> drop z +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException +{ + "condition" : "UNRESOLVED_COLUMN.WITH_SUGGESTION", + "sqlState" : "42703", + "messageParameters" : { + "objectName" : "`z`", + "proposal" : "`x`, `y`" + }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 1, + "stopIndex" : 17, + "fragment" : "table t\n|> drop z" + } ] +} + + +-- !query +table st +|> drop col.i1 +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.catalyst.parser.ParseException +{ + "condition" : "PARSE_SYNTAX_ERROR", + "sqlState" : "42601", + "messageParameters" : { + "error" : "'.'", + "hint" : "" + } +} + + +-- !query +table st +|> drop `col.i1` +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException +{ + "condition" : "UNRESOLVED_COLUMN.WITH_SUGGESTION", + "sqlState" : "42703", + "messageParameters" : { + "objectName" : "`col.i1`", + "proposal" : "`col`, `x`" + }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 1, + "stopIndex" : 25, + "fragment" : "table st\n|> drop `col.i1`" + } ] +} + + +-- !query +select 1 as x, 2 as y, 3 as z +|> drop z, y, z -- !query schema -struct +struct<> -- !query output -0 abc 3 1 -1 def 4 2 +org.apache.spark.sql.AnalysisException +{ + "condition" : "EXCEPT_OVERLAPPING_COLUMNS", + "sqlState" : "42702", + "messageParameters" : { + "columns" : "z, y, z" + }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 1, + "stopIndex" : 45, + "fragment" : "select 1 as x, 2 as y, 3 as z\n|> drop z, y, z" + } ] +} -- !query table t -|> extend x + length(y) as z -|> extend z + 1 as zz +|> as u +|> select u.x, u.y -- !query schema -struct +struct -- !query output -0 abc 3 4 -1 def 4 5 +0 abc +1 def -- !query -select col from st -|> extend col.i1 as z +select 1 as x, 2 as y +|> as u +|> select u.x, u.y -- !query schema 
-struct,z:int> +struct -- !query output -{"i1":2,"i2":3} 2 +1 2 -- !query table t -|> extend (select a from other where x = a limit 1) as z +|> as `u.v` +|> select `u.v`.x, `u.v`.y -- !query schema -struct +struct -- !query output -0 abc NULL -1 def 1 +0 abc +1 def -- !query table t -|> where exists ( - table other - |> extend t.x - |> select * except (a, b)) +|> as u +|> as v +|> select v.x, v.y -- !query schema struct -- !query output @@ -592,61 +1203,67 @@ struct -- !query table t -|> extend 1 as x +|> as u +|> where u.x = 1 -- !query schema -struct +struct -- !query output -0 abc 1 -1 def 1 +1 def -- !query table t -|> extend first_value(x) over (partition by y) as result +|> as u, v -- !query schema -struct +struct<> -- !query output -0 abc 0 -1 def 1 +org.apache.spark.sql.catalyst.parser.ParseException +{ + "condition" : "PARSE_SYNTAX_ERROR", + "sqlState" : "42601", + "messageParameters" : { + "error" : "','", + "hint" : "" + } +} -- !query table t -|> extend x + length(y) as z, z + 1 as plus_one +|> as 1 + 2 -- !query schema -struct +struct<> -- !query output -0 abc 3 4 -1 def 4 5 +org.apache.spark.sql.catalyst.parser.ParseException +{ + "condition" : "PARSE_SYNTAX_ERROR", + "sqlState" : "42601", + "messageParameters" : { + "error" : "'1'", + "hint" : "" + } +} -- !query table t -|> extend sum(x) as z +|> as u-v -- !query schema struct<> -- !query output -org.apache.spark.sql.AnalysisException +org.apache.spark.sql.catalyst.parser.ParseException { - "condition" : "PIPE_OPERATOR_CONTAINS_AGGREGATE_FUNCTION", - "sqlState" : "0A000", + "condition" : "INVALID_IDENTIFIER", + "sqlState" : "42602", "messageParameters" : { - "clause" : "EXTEND", - "expr" : "sum(x#x)" - }, - "queryContext" : [ { - "objectType" : "", - "objectName" : "", - "startIndex" : 19, - "stopIndex" : 24, - "fragment" : "sum(x)" - } ] + "ident" : "u-v" + } } -- !query table t -|> extend distinct x as z +|> as u@v -- !query schema struct<> -- !query output @@ -655,7 +1272,7 @@ 
org.apache.spark.sql.catalyst.parser.ParseException "condition" : "PARSE_SYNTAX_ERROR", "sqlState" : "42601", "messageParameters" : { - "error" : "'as'", + "error" : "'@'", "hint" : "" } } @@ -663,17 +1280,17 @@ org.apache.spark.sql.catalyst.parser.ParseException -- !query table t -|> extend * +|> as u#######v -- !query schema struct<> -- !query output -org.apache.spark.sql.AnalysisException +org.apache.spark.sql.catalyst.parser.ParseException { - "condition" : "INVALID_USAGE_OF_STAR_OR_REGEX", - "sqlState" : "42000", + "condition" : "PARSE_SYNTAX_ERROR", + "sqlState" : "42601", "messageParameters" : { - "elem" : "'*'", - "prettyName" : "expression `pipeexpression`" + "error" : "'#'", + "hint" : "" } } @@ -881,7 +1498,7 @@ org.apache.spark.sql.catalyst.ExtendedAnalysisException "sqlState" : "42703", "messageParameters" : { "objectName" : "`y`", - "proposal" : "`x`, `z`" + "proposal" : "`z`, `spark_catalog`.`default`.`t`.`x`" }, "queryContext" : [ { "objectType" : "", @@ -893,6 +1510,84 @@ org.apache.spark.sql.catalyst.ExtendedAnalysisException } +-- !query +table t +|> select x, length(y) as z +|> limit 1000 +|> where x + length(y) < 4 +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.catalyst.ExtendedAnalysisException +{ + "condition" : "UNRESOLVED_COLUMN.WITH_SUGGESTION", + "sqlState" : "42703", + "messageParameters" : { + "objectName" : "`y`", + "proposal" : "`z`, `spark_catalog`.`default`.`t`.`x`" + }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 71, + "stopIndex" : 71, + "fragment" : "y" + } ] +} + + +-- !query +table t +|> select x, length(y) as z +|> limit 1000 offset 1 +|> where x + length(y) < 4 +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.catalyst.ExtendedAnalysisException +{ + "condition" : "UNRESOLVED_COLUMN.WITH_SUGGESTION", + "sqlState" : "42703", + "messageParameters" : { + "objectName" : "`y`", + "proposal" : "`z`, `spark_catalog`.`default`.`t`.`x`" + }, + "queryContext" : 
[ { + "objectType" : "", + "objectName" : "", + "startIndex" : 80, + "stopIndex" : 80, + "fragment" : "y" + } ] +} + + +-- !query +table t +|> select x, length(y) as z +|> order by x, y +|> where x + length(y) < 4 +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.catalyst.ExtendedAnalysisException +{ + "condition" : "UNRESOLVED_COLUMN.WITH_SUGGESTION", + "sqlState" : "42703", + "messageParameters" : { + "objectName" : "`y`", + "proposal" : "`z`, `spark_catalog`.`default`.`t`.`x`" + }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 52, + "stopIndex" : 52, + "fragment" : "y" + } ] +} + + -- !query (select x, sum(length(y)) as sum_len from t group by x) |> where sum(length(y)) = 3 @@ -905,7 +1600,7 @@ org.apache.spark.sql.catalyst.ExtendedAnalysisException "sqlState" : "42703", "messageParameters" : { "objectName" : "`y`", - "proposal" : "`x`, `sum_len`" + "proposal" : "`sum_len`, `spark_catalog`.`default`.`t`.`x`" }, "queryContext" : [ { "objectType" : "", @@ -1762,29 +2457,25 @@ struct -- !query -values (0, 1) tab(x, y) +values (2, 'xyz') tab(x, y) |> union table t |> where x = 0 -- !query schema -struct<> +struct -- !query output -org.apache.spark.SparkNumberFormatException -{ - "condition" : "CAST_INVALID_INPUT", - "sqlState" : "22018", - "messageParameters" : { - "expression" : "'abc'", - "sourceType" : "\"STRING\"", - "targetType" : "\"BIGINT\"" - }, - "queryContext" : [ { - "objectType" : "", - "objectName" : "", - "startIndex" : 1, - "stopIndex" : 55, - "fragment" : "values (0, 1) tab(x, y)\n|> union table t\n|> where x = 0" - } ] -} +0 abc + + +-- !query +values (2, 'xyz') tab(x, y) +|> union table t +|> drop x +-- !query schema +struct +-- !query output +abc +def +xyz -- !query @@ -2179,16 +2870,107 @@ struct select 3 as x, 4 as y |> aggregate group by 1, 2 -- !query schema -struct<1:int,2:int> +struct +-- !query output +3 4 + + +-- !query +values (3, 4) as tab(x, y) +|> aggregate sum(y) group by 1 +-- 
!query schema +struct +-- !query output +3 4 + + +-- !query +values (3, 4), (5, 4) as tab(x, y) +|> aggregate sum(y) group by 1 +-- !query schema +struct +-- !query output +3 4 +5 4 + + +-- !query +select 3 as x, 4 as y +|> aggregate sum(y) group by 1, 1 +-- !query schema +struct +-- !query output +3 3 4 + + +-- !query +select 1 as `1`, 2 as `2` +|> aggregate sum(`2`) group by `1` +-- !query schema +struct<1:int,sum(2):bigint> -- !query output 1 2 +-- !query +select 3 as x, 4 as y +|> aggregate sum(y) group by 2 +-- !query schema +struct +-- !query output +4 4 + + +-- !query +select 3 as x, 4 as y, 5 as z +|> aggregate sum(y) group by 2 +-- !query schema +struct +-- !query output +4 4 + + +-- !query +select 3 as x, 4 as y, 5 as z +|> aggregate sum(y) group by 3 +-- !query schema +struct +-- !query output +5 4 + + +-- !query +select 3 as x, 4 as y, 5 as z +|> aggregate sum(y) group by 2, 3 +-- !query schema +struct +-- !query output +4 5 4 + + +-- !query +select 3 as x, 4 as y, 5 as z +|> aggregate sum(y) group by 1, 2, 3 +-- !query schema +struct +-- !query output +3 4 5 4 + + +-- !query +select 3 as x, 4 as y, 5 as z +|> aggregate sum(y) group by x, 2, 3 +-- !query schema +struct +-- !query output +3 4 5 4 + + -- !query table t |> aggregate sum(x) -- !query schema -struct +struct -- !query output 1 @@ -2264,7 +3046,7 @@ struct table other |> aggregate a + count(b) group by a -- !query schema -struct +struct -- !query output 1 3 2 3 @@ -2576,7 +3358,7 @@ org.apache.spark.sql.catalyst.parser.ParseException "condition" : "UNSUPPORTED_FEATURE.PIPE_OPERATOR_AGGREGATE_UNSUPPORTED_CASE", "sqlState" : "0A000", "messageParameters" : { - "case" : "window functions" + "case" : "window functions; please update the query to move the window functions to a subsequent |> SELECT operator instead" }, "queryContext" : [ { "objectType" : "", @@ -2846,6 +3628,565 @@ org.apache.spark.sql.catalyst.ExtendedAnalysisException } +-- !query +with customer_total_return as +(select + 
sr_customer_sk as ctr_customer_sk, + sr_store_sk as ctr_store_sk, + sum(sr_return_amt) as ctr_total_return + from store_returns, date_dim + where sr_returned_date_sk = d_date_sk and d_year = 2000 + group by sr_customer_sk, sr_store_sk) +select c_customer_id +from customer_total_return ctr1, store, customer +where ctr1.ctr_total_return > + (select avg(ctr_total_return) * 1.2 + from customer_total_return ctr2 + where ctr1.ctr_store_sk = ctr2.ctr_store_sk) + and s_store_sk = ctr1.ctr_store_sk + and s_state = 'tn' + and ctr1.ctr_customer_sk = c_customer_sk +order by c_customer_id +limit 100 +-- !query schema +struct +-- !query output + + + +-- !query +with customer_total_return as + (from store_returns + |> join date_dim + |> where sr_returned_date_sk = d_date_sk and d_year = 2000 + |> aggregate sum(sr_return_amt) as ctr_total_return + group by sr_customer_sk as ctr_customer_sk, sr_store_sk as ctr_store_sk) +from customer_total_return ctr1 +|> join store +|> join customer +|> where ctr1.ctr_total_return > + (table customer_total_return + |> as ctr2 + |> where ctr1.ctr_store_sk = ctr2.ctr_store_sk + |> aggregate avg(ctr_total_return) * 1.2) + and s_store_sk = ctr1.ctr_store_sk + and s_state = 'tn' + and ctr1.ctr_customer_sk = c_customer_sk +|> order by c_customer_id +|> limit 100 +|> select c_customer_id +-- !query schema +struct +-- !query output + + + +-- !query +with wscs as +( select + sold_date_sk, + sales_price + from (select + ws_sold_date_sk sold_date_sk, + ws_ext_sales_price sales_price + from web_sales) x + union all + (select + cs_sold_date_sk sold_date_sk, + cs_ext_sales_price sales_price + from catalog_sales)), + wswscs as + ( select + d_week_seq, + sum(case when (d_day_name = 'sunday') + then sales_price + else null end) + sun_sales, + sum(case when (d_day_name = 'monday') + then sales_price + else null end) + mon_sales, + sum(case when (d_day_name = 'tuesday') + then sales_price + else null end) + tue_sales, + sum(case when (d_day_name = 'wednesday') + 
then sales_price + else null end) + wed_sales, + sum(case when (d_day_name = 'thursday') + then sales_price + else null end) + thu_sales, + sum(case when (d_day_name = 'friday') + then sales_price + else null end) + fri_sales, + sum(case when (d_day_name = 'saturday') + then sales_price + else null end) + sat_sales + from wscs, date_dim + where d_date_sk = sold_date_sk + group by d_week_seq) +select + d_week_seq1, + round(sun_sales1 / sun_sales2, 2), + round(mon_sales1 / mon_sales2, 2), + round(tue_sales1 / tue_sales2, 2), + round(wed_sales1 / wed_sales2, 2), + round(thu_sales1 / thu_sales2, 2), + round(fri_sales1 / fri_sales2, 2), + round(sat_sales1 / sat_sales2, 2) +from + (select + wswscs.d_week_seq d_week_seq1, + sun_sales sun_sales1, + mon_sales mon_sales1, + tue_sales tue_sales1, + wed_sales wed_sales1, + thu_sales thu_sales1, + fri_sales fri_sales1, + sat_sales sat_sales1 + from wswscs, date_dim + where date_dim.d_week_seq = wswscs.d_week_seq and d_year = 2001) y, + (select + wswscs.d_week_seq d_week_seq2, + sun_sales sun_sales2, + mon_sales mon_sales2, + tue_sales tue_sales2, + wed_sales wed_sales2, + thu_sales thu_sales2, + fri_sales fri_sales2, + sat_sales sat_sales2 + from wswscs, date_dim + where date_dim.d_week_seq = wswscs.d_week_seq and d_year = 2001 + 1) z +where d_week_seq1 = d_week_seq2 - 53 +order by d_week_seq1 +-- !query schema +struct +-- !query output + + + +-- !query +with wscs as + (table web_sales + |> select + ws_sold_date_sk sold_date_sk, + ws_ext_sales_price sales_price + |> as x + |> union all ( + table catalog_sales + |> select + cs_sold_date_sk sold_date_sk, + cs_ext_sales_price sales_price) + |> select + sold_date_sk, + sales_price), +wswscs as + (table wscs + |> join date_dim + |> where d_date_sk = sold_date_sk + |> aggregate + sum(case when (d_day_name = 'sunday') + then sales_price + else null end) + sun_sales, + sum(case when (d_day_name = 'monday') + then sales_price + else null end) + mon_sales, + sum(case when (d_day_name = 
'tuesday') + then sales_price + else null end) + tue_sales, + sum(case when (d_day_name = 'wednesday') + then sales_price + else null end) + wed_sales, + sum(case when (d_day_name = 'thursday') + then sales_price + else null end) + thu_sales, + sum(case when (d_day_name = 'friday') + then sales_price + else null end) + fri_sales, + sum(case when (d_day_name = 'saturday') + then sales_price + else null end) + sat_sales + group by d_week_seq) +table wswscs +|> join date_dim +|> where date_dim.d_week_seq = wswscs.d_week_seq AND d_year = 2001 +|> select + wswscs.d_week_seq d_week_seq1, + sun_sales sun_sales1, + mon_sales mon_sales1, + tue_sales tue_sales1, + wed_sales wed_sales1, + thu_sales thu_sales1, + fri_sales fri_sales1, + sat_sales sat_sales1 +|> as y +|> join ( + table wswscs + |> join date_dim + |> where date_dim.d_week_seq = wswscs.d_week_seq AND d_year = 2001 + 1 + |> select + wswscs.d_week_seq d_week_seq2, + sun_sales sun_sales2, + mon_sales mon_sales2, + tue_sales tue_sales2, + wed_sales wed_sales2, + thu_sales thu_sales2, + fri_sales fri_sales2, + sat_sales sat_sales2 + |> as z) +|> where d_week_seq1 = d_week_seq2 - 53 +|> order by d_week_seq1 +|> select + d_week_seq1, + round(sun_sales1 / sun_sales2, 2), + round(mon_sales1 / mon_sales2, 2), + round(tue_sales1 / tue_sales2, 2), + round(wed_sales1 / wed_sales2, 2), + round(thu_sales1 / thu_sales2, 2), + round(fri_sales1 / fri_sales2, 2), + round(sat_sales1 / sat_sales2, 2) +-- !query schema +struct +-- !query output + + + +-- !query +select + dt.d_year, + item.i_brand_id brand_id, + item.i_brand brand, + sum(ss_ext_sales_price) sum_agg +from date_dim dt, store_sales, item +where dt.d_date_sk = store_sales.ss_sold_date_sk + and store_sales.ss_item_sk = item.i_item_sk + and item.i_manufact_id = 128 + and dt.d_moy = 11 +group by dt.d_year, item.i_brand, item.i_brand_id +order by dt.d_year, sum_agg desc, brand_id +limit 100 +-- !query schema +struct +-- !query output + + + +-- !query +table date_dim +|> as dt 
+|> join store_sales +|> join item +|> where dt.d_date_sk = store_sales.ss_sold_date_sk + and store_sales.ss_item_sk = item.i_item_sk + and item.i_manufact_id = 128 + and dt.d_moy = 11 +|> aggregate sum(ss_ext_sales_price) sum_agg + group by dt.d_year d_year, item.i_brand_id brand_id, item.i_brand brand +|> order by d_year, sum_agg desc, brand_id +|> limit 100 +-- !query schema +struct +-- !query output + + + +-- !query +select + i_item_desc, + i_category, + i_class, + i_current_price, + sum(ws_ext_sales_price) as itemrevenue, + sum(ws_ext_sales_price) * 100 / sum(sum(ws_ext_sales_price)) + over + (partition by i_class) as revenueratio +from + web_sales, item, date_dim +where + ws_item_sk = i_item_sk + and i_category in ('sports', 'books', 'home') + and ws_sold_date_sk = d_date_sk + and d_date between cast('1999-02-22' as date) + and (cast('1999-02-22' as date) + interval 30 days) +group by + i_item_id, i_item_desc, i_category, i_class, i_current_price +order by + i_category, i_class, i_item_id, i_item_desc, revenueratio +limit 100 +-- !query schema +struct +-- !query output + + + +-- !query +table web_sales +|> join item +|> join date_dim +|> where ws_item_sk = i_item_sk + and i_category in ('sports', 'books', 'home') + and ws_sold_date_sk = d_date_sk + and d_date between cast('1999-02-22' as date) + and (cast('1999-02-22' as date) + interval 30 days) +|> aggregate sum(ws_ext_sales_price) AS itemrevenue + group by i_item_id, i_item_desc, i_category, i_class, i_current_price +|> extend + itemrevenue * 100 / sum(itemrevenue) + over (partition by i_class) as revenueratio +|> order by i_category, i_class, i_item_id, i_item_desc, revenueratio +|> select i_item_desc, i_category, i_class, i_current_price, itemrevenue, revenueratio +|> limit 100 +-- !query schema +struct +-- !query output + + + +-- !query +select + asceding.rnk, + i1.i_product_name best_performing, + i2.i_product_name worst_performing +from (select * +from (select + item_sk, + rank() + over ( + order by 
rank_col asc) rnk +from (select + ss_item_sk item_sk, + avg(ss_net_profit) rank_col +from store_sales ss1 +where ss_store_sk = 4 +group by ss_item_sk +having avg(ss_net_profit) > 0.9 * (select avg(ss_net_profit) rank_col +from store_sales +where ss_store_sk = 4 + and ss_addr_sk is null +group by ss_store_sk)) v1) v11 +where rnk < 11) asceding, + (select * + from (select + item_sk, + rank() + over ( + order by rank_col desc) rnk + from (select + ss_item_sk item_sk, + avg(ss_net_profit) rank_col + from store_sales ss1 + where ss_store_sk = 4 + group by ss_item_sk + having avg(ss_net_profit) > 0.9 * (select avg(ss_net_profit) rank_col + from store_sales + where ss_store_sk = 4 + and ss_addr_sk is null + group by ss_store_sk)) v2) v21 + where rnk < 11) descending, + item i1, item i2 +where asceding.rnk = descending.rnk + and i1.i_item_sk = asceding.item_sk + and i2.i_item_sk = descending.item_sk +order by asceding.rnk +limit 100 +-- !query schema +struct +-- !query output + + + +-- !query +from store_sales ss1 +|> where ss_store_sk = 4 +|> aggregate avg(ss_net_profit) rank_col + group by ss_item_sk as item_sk +|> where rank_col > 0.9 * ( + from store_sales + |> where ss_store_sk = 4 + and ss_addr_sk is null + |> aggregate avg(ss_net_profit) rank_col + group by ss_store_sk + |> select rank_col) +|> as v1 +|> select + item_sk, + rank() over ( + order by rank_col asc) rnk +|> as v11 +|> where rnk < 11 +|> as asceding +|> join ( + from store_sales ss1 + |> where ss_store_sk = 4 + |> aggregate avg(ss_net_profit) rank_col + group by ss_item_sk as item_sk + |> where rank_col > 0.9 * ( + table store_sales + |> where ss_store_sk = 4 + and ss_addr_sk is null + |> aggregate avg(ss_net_profit) rank_col + group by ss_store_sk + |> select rank_col) + |> as v2 + |> select + item_sk, + rank() over ( + order by rank_col asc) rnk + |> as v21 + |> where rnk < 11) descending +|> join item i1 +|> join item i2 +|> where asceding.rnk = descending.rnk + and i1.i_item_sk = asceding.item_sk + 
and i2.i_item_sk = descending.item_sk +|> order by asceding.rnk +|> select + asceding.rnk, + i1.i_product_name best_performing, + i2.i_product_name worst_performing +-- !query schema +struct +-- !query output + + + +-- !query +with web_v1 as ( + select + ws_item_sk item_sk, + d_date, + sum(sum(ws_sales_price)) + over (partition by ws_item_sk + order by d_date + rows between unbounded preceding and current row) cume_sales + from web_sales, date_dim + where ws_sold_date_sk = d_date_sk + and d_month_seq between 1200 and 1200 + 11 + and ws_item_sk is not null + group by ws_item_sk, d_date), + store_v1 as ( + select + ss_item_sk item_sk, + d_date, + sum(sum(ss_sales_price)) + over (partition by ss_item_sk + order by d_date + rows between unbounded preceding and current row) cume_sales + from store_sales, date_dim + where ss_sold_date_sk = d_date_sk + and d_month_seq between 1200 and 1200 + 11 + and ss_item_sk is not null + group by ss_item_sk, d_date) +select * +from (select + item_sk, + d_date, + web_sales, + store_sales, + max(web_sales) + over (partition by item_sk + order by d_date + rows between unbounded preceding and current row) web_cumulative, + max(store_sales) + over (partition by item_sk + order by d_date + rows between unbounded preceding and current row) store_cumulative +from (select + case when web.item_sk is not null + then web.item_sk + else store.item_sk end item_sk, + case when web.d_date is not null + then web.d_date + else store.d_date end d_date, + web.cume_sales web_sales, + store.cume_sales store_sales +from web_v1 web full outer join store_v1 store on (web.item_sk = store.item_sk + and web.d_date = store.d_date) + ) x) y +where web_cumulative > store_cumulative +order by item_sk, d_date +limit 100 +-- !query schema +struct +-- !query output + + + +-- !query +with web_v1 as ( + table web_sales + |> join date_dim + |> where ws_sold_date_sk = d_date_sk + and d_month_seq between 1200 and 1200 + 11 + and ws_item_sk is not null + |> aggregate 
sum(ws_sales_price) as sum_ws_sales_price + group by ws_item_sk as item_sk, d_date + |> extend sum(sum_ws_sales_price) + over (partition by item_sk + order by d_date + rows between unbounded preceding and current row) + as cume_sales), +store_v1 as ( + table store_sales + |> join date_dim + |> where ss_sold_date_sk = d_date_sk + and d_month_seq between 1200 and 1200 + 11 + and ss_item_sk is not null + |> aggregate sum(ss_sales_price) as sum_ss_sales_price + group by ss_item_sk as item_sk, d_date + |> extend sum(sum_ss_sales_price) + over (partition by item_sk + order by d_date + rows between unbounded preceding and current row) + as cume_sales) +table web_v1 +|> as web +|> full outer join store_v1 store + on (web.item_sk = store.item_sk and web.d_date = store.d_date) +|> select + case when web.item_sk is not null + then web.item_sk + else store.item_sk end item_sk, + case when web.d_date is not null + then web.d_date + else store.d_date end d_date, + web.cume_sales web_sales, + store.cume_sales store_sales +|> as x +|> select + item_sk, + d_date, + web_sales, + store_sales, + max(web_sales) + over (partition by item_sk + order by d_date + rows between unbounded preceding and current row) web_cumulative, + max(store_sales) + over (partition by item_sk + order by d_date + rows between unbounded preceding and current row) store_cumulative +|> as y +|> where web_cumulative > store_cumulative +|> order by item_sk, d_date +|> limit 100 +-- !query schema +struct +-- !query output + + + -- !query drop table t -- !query schema diff --git a/sql/core/src/test/resources/sql-tests/results/postgreSQL/create_view.sql.out b/sql/core/src/test/resources/sql-tests/results/postgreSQL/create_view.sql.out index d14161e93a9f0..2583d14b512ba 100644 --- a/sql/core/src/test/resources/sql-tests/results/postgreSQL/create_view.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/postgreSQL/create_view.sql.out @@ -269,7 +269,7 @@ View Text SELECT * FROM base_table View Original Text 
SELECT * FROM base_table View Schema Mode COMPENSATION View Catalog and Namespace spark_catalog.temp_view_test -View Query Output Columns [a, id] +View Query Output Columns [`a`, `id`] -- !query @@ -335,7 +335,7 @@ View Text SELECT * FROM base_table View Original Text SELECT * FROM base_table View Schema Mode COMPENSATION View Catalog and Namespace spark_catalog.temp_view_test -View Query Output Columns [a, id] +View Query Output Columns [`a`, `id`] -- !query @@ -391,7 +391,7 @@ View Original Text SELECT t1.a AS t1_a, t2.a AS t2_a WHERE t1.id = t2.id View Schema Mode COMPENSATION View Catalog and Namespace spark_catalog.temp_view_test -View Query Output Columns [t1_a, t2_a] +View Query Output Columns [`t1_a`, `t2_a`] -- !query @@ -464,7 +464,7 @@ View Text SELECT * FROM base_table WHERE id IN (SELECT id FROM base_t View Original Text SELECT * FROM base_table WHERE id IN (SELECT id FROM base_table2) View Schema Mode COMPENSATION View Catalog and Namespace spark_catalog.temp_view_test -View Query Output Columns [a, id] +View Query Output Columns [`a`, `id`] -- !query @@ -495,7 +495,7 @@ View Text SELECT t1.id, t2.a FROM base_table t1, (SELECT * FROM base_ View Original Text SELECT t1.id, t2.a FROM base_table t1, (SELECT * FROM base_table2) t2 View Schema Mode COMPENSATION View Catalog and Namespace spark_catalog.temp_view_test -View Query Output Columns [id, a] +View Query Output Columns [`id`, `a`] -- !query @@ -526,7 +526,7 @@ View Text SELECT * FROM base_table WHERE EXISTS (SELECT 1 FROM base_t View Original Text SELECT * FROM base_table WHERE EXISTS (SELECT 1 FROM base_table2) View Schema Mode COMPENSATION View Catalog and Namespace spark_catalog.temp_view_test -View Query Output Columns [a, id] +View Query Output Columns [`a`, `id`] -- !query @@ -557,7 +557,7 @@ View Text SELECT * FROM base_table WHERE NOT EXISTS (SELECT 1 FROM ba View Original Text SELECT * FROM base_table WHERE NOT EXISTS (SELECT 1 FROM base_table2) View Schema Mode COMPENSATION View Catalog 
and Namespace spark_catalog.temp_view_test -View Query Output Columns [a, id] +View Query Output Columns [`a`, `id`] -- !query @@ -588,7 +588,7 @@ View Text SELECT * FROM base_table WHERE EXISTS (SELECT 1) View Original Text SELECT * FROM base_table WHERE EXISTS (SELECT 1) View Schema Mode COMPENSATION View Catalog and Namespace spark_catalog.temp_view_test -View Query Output Columns [a, id] +View Query Output Columns [`a`, `id`] -- !query @@ -800,7 +800,7 @@ View Text SELECT * FROM t1 CROSS JOIN t2 View Original Text SELECT * FROM t1 CROSS JOIN t2 View Schema Mode COMPENSATION View Catalog and Namespace spark_catalog.testviewschm2 -View Query Output Columns [num, name, num2, value] +View Query Output Columns [`num`, `name`, `num2`, `value`] -- !query @@ -851,7 +851,7 @@ View Text SELECT * FROM t1 INNER JOIN t2 ON t1.num = t2.num2 View Original Text SELECT * FROM t1 INNER JOIN t2 ON t1.num = t2.num2 View Schema Mode COMPENSATION View Catalog and Namespace spark_catalog.testviewschm2 -View Query Output Columns [num, name, num2, value] +View Query Output Columns [`num`, `name`, `num2`, `value`] -- !query @@ -902,7 +902,7 @@ View Text SELECT * FROM t1 LEFT JOIN t2 ON t1.num = t2.num2 View Original Text SELECT * FROM t1 LEFT JOIN t2 ON t1.num = t2.num2 View Schema Mode COMPENSATION View Catalog and Namespace spark_catalog.testviewschm2 -View Query Output Columns [num, name, num2, value] +View Query Output Columns [`num`, `name`, `num2`, `value`] -- !query @@ -953,7 +953,7 @@ View Text SELECT * FROM t1 LEFT JOIN t2 ON t1.num = t2.num2 AND t2.va View Original Text SELECT * FROM t1 LEFT JOIN t2 ON t1.num = t2.num2 AND t2.value = 'xxx' View Schema Mode COMPENSATION View Catalog and Namespace spark_catalog.testviewschm2 -View Query Output Columns [num, name, num2, value] +View Query Output Columns [`num`, `name`, `num2`, `value`] -- !query @@ -1074,7 +1074,7 @@ BETWEEN (SELECT d FROM tbl2 WHERE c = 1) AND (SELECT e FROM tbl3 WHERE f = 2) AND EXISTS (SELECT g FROM tbl4 LEFT 
JOIN tbl3 ON tbl4.h = tbl3.f) View Schema Mode COMPENSATION View Catalog and Namespace spark_catalog.testviewschm2 -View Query Output Columns [a, b] +View Query Output Columns [`a`, `b`] -- !query @@ -1114,7 +1114,7 @@ AND EXISTS (SELECT g FROM tbl4 LEFT JOIN tbl3 ON tbl4.h = tbl3.f) AND NOT EXISTS (SELECT g FROM tbl4 LEFT JOIN tmptbl ON tbl4.h = tmptbl.j) View Schema Mode COMPENSATION View Catalog and Namespace spark_catalog.testviewschm2 -View Query Output Columns [a, b] +View Query Output Columns [`a`, `b`] -- !query diff --git a/sql/core/src/test/resources/sql-tests/results/random.sql.out b/sql/core/src/test/resources/sql-tests/results/random.sql.out index d0bc5afe463dd..0f6f8dcb47561 100644 --- a/sql/core/src/test/resources/sql-tests/results/random.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/random.sql.out @@ -147,6 +147,22 @@ struct 7 +-- !query +SELECT uniform(0, cast(10 as tinyint), 0) AS result +-- !query schema +struct +-- !query output +7 + + +-- !query +SELECT uniform(0, cast(10 as smallint), 0) AS result +-- !query schema +struct +-- !query output +7 + + -- !query SELECT uniform(0, 10S, 0) AS result -- !query schema @@ -171,6 +187,38 @@ struct 17.604954 +-- !query +SELECT uniform(cast(10 as decimal(10, 3)), cast(20 as decimal(10, 3)), 0) AS result +-- !query schema +struct +-- !query output +17.605 + + +-- !query +SELECT uniform(cast(10 as decimal(10, 3)), cast(20 as decimal(11, 4)), 0) AS result +-- !query schema +struct +-- !query output +17.6050 + + +-- !query +SELECT uniform(10, cast(20 as decimal(10, 3)), 0) AS result +-- !query schema +struct +-- !query output +17.605 + + +-- !query +SELECT uniform(cast(10 as decimal(10, 3)), 20, 0) AS result +-- !query schema +struct +-- !query output +17.605 + + -- !query SELECT uniform(10.0D, 20.0D, CAST(3 / 7 AS LONG)) AS result -- !query schema @@ -205,10 +253,50 @@ struct true +-- !query +SELECT uniform(-10L, 10L, 0) AS result +-- !query schema +struct +-- !query output +5 + + +-- !query 
+SELECT uniform(-20L, -10L, 0) AS result +-- !query schema +struct +-- !query output +-12 + + +-- !query +SELECT uniform(-20L, -10L, -10) AS result +-- !query schema +struct +-- !query output +-17 + + -- !query SELECT uniform(NULL, 1, 0) AS result -- !query schema -struct +struct +-- !query output +NULL + + +-- !query +SELECT uniform(cast(NULL AS int), 1, 0) AS result +-- !query schema +struct +-- !query output +NULL + + +-- !query +SELECT uniform(cast(NULL AS float), 1, 0) AS result +-- !query schema +struct -- !query output NULL @@ -216,7 +304,23 @@ NULL -- !query SELECT uniform(0, NULL, 0) AS result -- !query schema -struct +struct +-- !query output +NULL + + +-- !query +SELECT uniform(0, cast(NULL AS int), 0) AS result +-- !query schema +struct +-- !query output +NULL + + +-- !query +SELECT uniform(0, cast(NULL AS float), 0) AS result +-- !query schema +struct -- !query output NULL @@ -224,11 +328,61 @@ NULL -- !query SELECT uniform(0, 1, NULL) AS result -- !query schema -struct +struct +-- !query output +0 + + +-- !query +SELECT uniform(NULL, NULL, 0) AS result +-- !query schema +struct +-- !query output +NULL + + +-- !query +SELECT uniform(NULL, NULL, NULL) AS result +-- !query schema +struct -- !query output NULL +-- !query +SELECT uniform(0, 1, cast(NULL as int)) AS result +-- !query schema +struct +-- !query output +0 + + +-- !query +SELECT uniform(0, 1, cast(NULL as float)) AS result +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.catalyst.ExtendedAnalysisException +{ + "condition" : "DATATYPE_MISMATCH.UNEXPECTED_INPUT_TYPE", + "sqlState" : "42K09", + "messageParameters" : { + "inputSql" : "\"CAST(NULL AS FLOAT)\"", + "inputType" : "\"FLOAT\"", + "paramIndex" : "third", + "requiredType" : "(\"INT\" or \"BIGINT\")", + "sqlExpr" : "\"uniform(0, 1, CAST(NULL AS FLOAT))\"" + }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 8, + "stopIndex" : 41, + "fragment" : "uniform(0, 1, cast(NULL as float))" + } ] 
+} + + -- !query SELECT uniform(10, 20, col) AS result FROM VALUES (0), (1), (2) tab(col) -- !query schema @@ -330,57 +484,33 @@ org.apache.spark.sql.AnalysisException -- !query -SELECT randstr(1, 0) AS result --- !query schema -struct --- !query output -c - - --- !query -SELECT randstr(5, 0) AS result --- !query schema -struct --- !query output -ceV0P - - --- !query -SELECT randstr(10, 0) AS result --- !query schema -struct --- !query output -ceV0PXaR2I - - --- !query -SELECT randstr(10S, 0) AS result +SELECT uniform(10.0F, 20.0F, 0.0F) AS result -- !query schema -struct --- !query output -ceV0PXaR2I - - --- !query -SELECT randstr(10, 0) AS result FROM VALUES (0), (1), (2) tab(col) --- !query schema -struct --- !query output -ceV0PXaR2I -fYxVfArnv7 -iSIv0VT2XL - - --- !query -SELECT randstr(10) IS NOT NULL AS result --- !query schema -struct +struct<> -- !query output -true +org.apache.spark.sql.catalyst.ExtendedAnalysisException +{ + "condition" : "DATATYPE_MISMATCH.UNEXPECTED_INPUT_TYPE", + "sqlState" : "42K09", + "messageParameters" : { + "inputSql" : "\"0.0\"", + "inputType" : "\"FLOAT\"", + "paramIndex" : "third", + "requiredType" : "(\"INT\" or \"BIGINT\")", + "sqlExpr" : "\"uniform(10.0, 20.0, 0.0)\"" + }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 8, + "stopIndex" : 34, + "fragment" : "uniform(10.0F, 20.0F, 0.0F)" + } ] +} -- !query -SELECT randstr(10L, 0) AS result +SELECT uniform(10.0F, 20.0F, 0.0D) AS result -- !query schema struct<> -- !query output @@ -389,24 +519,24 @@ org.apache.spark.sql.catalyst.ExtendedAnalysisException "condition" : "DATATYPE_MISMATCH.UNEXPECTED_INPUT_TYPE", "sqlState" : "42K09", "messageParameters" : { - "inputSql" : "\"10\"", - "inputType" : "\"BIGINT\"", - "paramIndex" : "first", - "requiredType" : "INT or SMALLINT", - "sqlExpr" : "\"randstr(10, 0)\"" + "inputSql" : "\"0.0\"", + "inputType" : "\"DOUBLE\"", + "paramIndex" : "third", + "requiredType" : "(\"INT\" or \"BIGINT\")", + 
"sqlExpr" : "\"uniform(10.0, 20.0, 0.0)\"" }, "queryContext" : [ { "objectType" : "", "objectName" : "", "startIndex" : 8, - "stopIndex" : 22, - "fragment" : "randstr(10L, 0)" + "stopIndex" : 34, + "fragment" : "uniform(10.0F, 20.0F, 0.0D)" } ] } -- !query -SELECT randstr(10.0F, 0) AS result +SELECT uniform(cast(10 as decimal(10, 3)), cast(20 as decimal(10, 3)), cast(0 as decimal(10, 3))) -- !query schema struct<> -- !query output @@ -415,24 +545,24 @@ org.apache.spark.sql.catalyst.ExtendedAnalysisException "condition" : "DATATYPE_MISMATCH.UNEXPECTED_INPUT_TYPE", "sqlState" : "42K09", "messageParameters" : { - "inputSql" : "\"10.0\"", - "inputType" : "\"FLOAT\"", - "paramIndex" : "first", - "requiredType" : "INT or SMALLINT", - "sqlExpr" : "\"randstr(10.0, 0)\"" + "inputSql" : "\"CAST(0 AS DECIMAL(10,3))\"", + "inputType" : "\"DECIMAL(10,3)\"", + "paramIndex" : "third", + "requiredType" : "(\"INT\" or \"BIGINT\")", + "sqlExpr" : "\"uniform(CAST(10 AS DECIMAL(10,3)), CAST(20 AS DECIMAL(10,3)), CAST(0 AS DECIMAL(10,3)))\"" }, "queryContext" : [ { "objectType" : "", "objectName" : "", "startIndex" : 8, - "stopIndex" : 24, - "fragment" : "randstr(10.0F, 0)" + "stopIndex" : 97, + "fragment" : "uniform(cast(10 as decimal(10, 3)), cast(20 as decimal(10, 3)), cast(0 as decimal(10, 3)))" } ] } -- !query -SELECT randstr(10.0D, 0) AS result +SELECT uniform('abc', 10, 0) AS result -- !query schema struct<> -- !query output @@ -441,24 +571,24 @@ org.apache.spark.sql.catalyst.ExtendedAnalysisException "condition" : "DATATYPE_MISMATCH.UNEXPECTED_INPUT_TYPE", "sqlState" : "42K09", "messageParameters" : { - "inputSql" : "\"10.0\"", - "inputType" : "\"DOUBLE\"", + "inputSql" : "\"abc\"", + "inputType" : "\"STRING\"", "paramIndex" : "first", - "requiredType" : "INT or SMALLINT", - "sqlExpr" : "\"randstr(10.0, 0)\"" + "requiredType" : "\"NUMERIC\"", + "sqlExpr" : "\"uniform(abc, 10, 0)\"" }, "queryContext" : [ { "objectType" : "", "objectName" : "", "startIndex" : 8, - "stopIndex" : 
24, - "fragment" : "randstr(10.0D, 0)" + "stopIndex" : 28, + "fragment" : "uniform('abc', 10, 0)" } ] } -- !query -SELECT randstr(NULL, 0) AS result +SELECT uniform(0, 'def', 0) AS result -- !query schema struct<> -- !query output @@ -467,24 +597,24 @@ org.apache.spark.sql.catalyst.ExtendedAnalysisException "condition" : "DATATYPE_MISMATCH.UNEXPECTED_INPUT_TYPE", "sqlState" : "42K09", "messageParameters" : { - "inputSql" : "\"NULL\"", - "inputType" : "\"VOID\"", - "paramIndex" : "first", - "requiredType" : "INT or SMALLINT", - "sqlExpr" : "\"randstr(NULL, 0)\"" + "inputSql" : "\"def\"", + "inputType" : "\"STRING\"", + "paramIndex" : "second", + "requiredType" : "\"NUMERIC\"", + "sqlExpr" : "\"uniform(0, def, 0)\"" }, "queryContext" : [ { "objectType" : "", "objectName" : "", "startIndex" : 8, - "stopIndex" : 23, - "fragment" : "randstr(NULL, 0)" + "stopIndex" : 27, + "fragment" : "uniform(0, 'def', 0)" } ] } -- !query -SELECT randstr(0, NULL) AS result +SELECT uniform(0, 10, 'ghi') AS result -- !query schema struct<> -- !query output @@ -493,22 +623,160 @@ org.apache.spark.sql.catalyst.ExtendedAnalysisException "condition" : "DATATYPE_MISMATCH.UNEXPECTED_INPUT_TYPE", "sqlState" : "42K09", "messageParameters" : { - "inputSql" : "\"NULL\"", - "inputType" : "\"VOID\"", - "paramIndex" : "second", - "requiredType" : "INT or SMALLINT", - "sqlExpr" : "\"randstr(0, NULL)\"" + "inputSql" : "\"ghi\"", + "inputType" : "\"STRING\"", + "paramIndex" : "third", + "requiredType" : "(\"INT\" or \"BIGINT\")", + "sqlExpr" : "\"uniform(0, 10, ghi)\"" }, "queryContext" : [ { "objectType" : "", "objectName" : "", "startIndex" : 8, - "stopIndex" : 23, - "fragment" : "randstr(0, NULL)" + "stopIndex" : 28, + "fragment" : "uniform(0, 10, 'ghi')" } ] } +-- !query +SELECT randstr(1, 0) AS result +-- !query schema +struct +-- !query output +c + + +-- !query +SELECT randstr(5, 0) AS result +-- !query schema +struct +-- !query output +ceV0P + + +-- !query +SELECT randstr(10, 0) AS result +-- 
!query schema +struct +-- !query output +ceV0PXaR2I + + +-- !query +SELECT randstr(10S, 0) AS result +-- !query schema +struct +-- !query output +ceV0PXaR2I + + +-- !query +SELECT randstr(CAST(10 AS TINYINT), 0) AS result +-- !query schema +struct +-- !query output +ceV0PXaR2I + + +-- !query +SELECT randstr(CAST(10 AS BIGINT), 0) AS result +-- !query schema +struct +-- !query output +ceV0PXaR2I + + +-- !query +SELECT randstr(1.0F, 0) AS result +-- !query schema +struct +-- !query output +c + + +-- !query +SELECT randstr(1.0D, 0) AS result +-- !query schema +struct +-- !query output +c + + +-- !query +SELECT randstr(cast(1 AS DECIMAL(10, 2)), 0) AS result +-- !query schema +struct +-- !query output +c + + +-- !query +SELECT randstr(10, 0) AS result FROM VALUES (0), (1), (2) tab(col) +-- !query schema +struct +-- !query output +ceV0PXaR2I +fYxVfArnv7 +iSIv0VT2XL + + +-- !query +SELECT randstr(10) IS NOT NULL AS result +-- !query schema +struct +-- !query output +true + + +-- !query +SELECT randstr(1, -1) AS result +-- !query schema +struct +-- !query output +S + + +-- !query +SELECT randstr(10L, 0) AS result +-- !query schema +struct +-- !query output +ceV0PXaR2I + + +-- !query +SELECT randstr(10.0F, 0) AS result +-- !query schema +struct +-- !query output +ceV0PXaR2I + + +-- !query +SELECT randstr(10.0D, 0) AS result +-- !query schema +struct +-- !query output +ceV0PXaR2I + + +-- !query +SELECT randstr(NULL, 0) AS result +-- !query schema +struct +-- !query output + + + +-- !query +SELECT randstr(0, NULL) AS result +-- !query schema +struct +-- !query output + + + -- !query SELECT randstr(col, 0) AS result FROM VALUES (0), (1), (2) tab(col) -- !query schema @@ -521,7 +789,7 @@ org.apache.spark.sql.catalyst.ExtendedAnalysisException "messageParameters" : { "inputExpr" : "\"col\"", "inputName" : "`length`", - "inputType" : "INT or SMALLINT", + "inputType" : "integer", "sqlExpr" : "\"randstr(col, 0)\"" }, "queryContext" : [ { @@ -546,7 +814,7 @@ 
org.apache.spark.sql.catalyst.ExtendedAnalysisException "messageParameters" : { "inputExpr" : "\"col\"", "inputName" : "`seed`", - "inputType" : "INT or SMALLINT", + "inputType" : "integer", "sqlExpr" : "\"randstr(10, col)\"" }, "queryContext" : [ { @@ -582,3 +850,72 @@ org.apache.spark.sql.AnalysisException "fragment" : "randstr(10, 0, 1)" } ] } + + +-- !query +SELECT randstr(-1, 0) AS result +-- !query schema +struct<> +-- !query output +org.apache.spark.SparkRuntimeException +{ + "condition" : "INVALID_PARAMETER_VALUE.LENGTH", + "sqlState" : "22023", + "messageParameters" : { + "functionName" : "`randstr`", + "length" : "-1", + "parameter" : "`length`" + } +} + + +-- !query +SELECT randstr(10, "a") AS result FROM VALUES (0) tab(a) +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.catalyst.ExtendedAnalysisException +{ + "condition" : "DATATYPE_MISMATCH.UNEXPECTED_INPUT_TYPE", + "sqlState" : "42K09", + "messageParameters" : { + "inputSql" : "\"a\"", + "inputType" : "\"STRING\"", + "paramIndex" : "second", + "requiredType" : "(\"INT\" or \"BIGINT\")", + "sqlExpr" : "\"randstr(10, a)\"" + }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 8, + "stopIndex" : 23, + "fragment" : "randstr(10, \"a\")" + } ] +} + + +-- !query +SELECT randstr(10, 1.5) AS result FROM VALUES (0) tab(a) +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.catalyst.ExtendedAnalysisException +{ + "condition" : "DATATYPE_MISMATCH.UNEXPECTED_INPUT_TYPE", + "sqlState" : "42K09", + "messageParameters" : { + "inputSql" : "\"1.5\"", + "inputType" : "\"DECIMAL(2,1)\"", + "paramIndex" : "second", + "requiredType" : "(\"INT\" or \"BIGINT\")", + "sqlExpr" : "\"randstr(10, 1.5)\"" + }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 8, + "stopIndex" : 23, + "fragment" : "randstr(10, 1.5)" + } ] +} diff --git a/sql/core/src/test/resources/sql-tests/results/show-tables.sql.out 
b/sql/core/src/test/resources/sql-tests/results/show-tables.sql.out index 4ecf65d0cc51a..0911efe3e09c6 100644 --- a/sql/core/src/test/resources/sql-tests/results/show-tables.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/show-tables.sql.out @@ -249,10 +249,11 @@ struct<> -- !query output org.apache.spark.sql.AnalysisException { - "condition" : "_LEGACY_ERROR_TEMP_1231", + "condition" : "PARTITIONS_NOT_FOUND", + "sqlState" : "428FT", "messageParameters" : { - "key" : "a", - "tblName" : "`spark_catalog`.`showdb`.`show_t1`" + "partitionList" : "`a`", + "tableName" : "`spark_catalog`.`showdb`.`show_t1`" } } diff --git a/sql/core/src/test/resources/sql-tests/results/sql-on-files.sql.out b/sql/core/src/test/resources/sql-tests/results/sql-on-files.sql.out index 9b9ac5a9edd55..13f43167c4dda 100644 --- a/sql/core/src/test/resources/sql-tests/results/sql-on-files.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/sql-on-files.sql.out @@ -257,3 +257,27 @@ DROP DATABASE sql_on_files struct<> -- !query output + + +-- !query +SELECT * FROM json.`https://raw.githubusercontent.com/apache/spark/refs/heads/master/examples/src/main/resources/employees.json` +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException +{ + "condition" : "FAILED_READ_FILE.UNSUPPORTED_FILE_SYSTEM", + "sqlState" : "KD001", + "messageParameters" : { + "fileSystemClass" : "org.apache.hadoop.fs.http.HttpsFileSystem", + "method" : "listStatus", + "path" : "https://raw.githubusercontent.com/apache/spark/refs/heads/master/examples/src/main/resources/employees.json" + }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 15, + "stopIndex" : 128, + "fragment" : "json.`https://raw.githubusercontent.com/apache/spark/refs/heads/master/examples/src/main/resources/employees.json`" + } ] +} diff --git a/sql/core/src/test/resources/sql-tests/results/sql-udf.sql.out b/sql/core/src/test/resources/sql-tests/results/sql-udf.sql.out new file mode 
100644 index 0000000000000..08f2d75cce9d7 --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/results/sql-udf.sql.out @@ -0,0 +1,484 @@ +-- Automatically generated by SQLQueryTestSuite +-- !query +CREATE FUNCTION foo1a0() RETURNS INT RETURN 1 +-- !query schema +struct<> +-- !query output + + + +-- !query +SELECT foo1a0() +-- !query schema +struct +-- !query output +1 + + +-- !query +SELECT foo1a0(1) +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException +{ + "condition" : "WRONG_NUM_ARGS.WITHOUT_SUGGESTION", + "sqlState" : "42605", + "messageParameters" : { + "actualNum" : "1", + "docroot" : "https://spark.apache.org/docs/latest", + "expectedNum" : "0", + "functionName" : "`spark_catalog`.`default`.`foo1a0`" + }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 8, + "stopIndex" : 16, + "fragment" : "foo1a0(1)" + } ] +} + + +-- !query +CREATE FUNCTION foo1a1(a INT) RETURNS INT RETURN 1 +-- !query schema +struct<> +-- !query output + + + +-- !query +SELECT foo1a1(1) +-- !query schema +struct +-- !query output +1 + + +-- !query +SELECT foo1a1(1, 2) +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException +{ + "condition" : "WRONG_NUM_ARGS.WITHOUT_SUGGESTION", + "sqlState" : "42605", + "messageParameters" : { + "actualNum" : "2", + "docroot" : "https://spark.apache.org/docs/latest", + "expectedNum" : "1", + "functionName" : "`spark_catalog`.`default`.`foo1a1`" + }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 8, + "stopIndex" : 19, + "fragment" : "foo1a1(1, 2)" + } ] +} + + +-- !query +CREATE FUNCTION foo1a2(a INT, b INT, c INT, d INT) RETURNS INT RETURN 1 +-- !query schema +struct<> +-- !query output + + + +-- !query +SELECT foo1a2(1, 2, 3, 4) +-- !query schema +struct +-- !query output +1 + + +-- !query +CREATE FUNCTION foo2_1a(a INT) RETURNS INT RETURN a +-- !query schema +struct<> +-- !query output + + + +-- !query +SELECT 
foo2_1a(5) +-- !query schema +struct +-- !query output +5 + + +-- !query +CREATE FUNCTION foo2_1b(a INT, b INT) RETURNS INT RETURN a + b +-- !query schema +struct<> +-- !query output + + + +-- !query +SELECT foo2_1b(5, 6) +-- !query schema +struct +-- !query output +11 + + +-- !query +CREATE FUNCTION foo2_1c(a INT, b INT) RETURNS INT RETURN 10 * (a + b) + 100 * (a -b) +-- !query schema +struct<> +-- !query output + + + +-- !query +SELECT foo2_1c(5, 6) +-- !query schema +struct +-- !query output +10 + + +-- !query +CREATE FUNCTION foo2_1d(a INT, b INT) RETURNS INT RETURN ABS(a) - LENGTH(CAST(b AS VARCHAR(10))) +-- !query schema +struct<> +-- !query output + + + +-- !query +SELECT foo2_1d(-5, 6) +-- !query schema +struct +-- !query output +4 + + +-- !query +CREATE FUNCTION foo2_2a(a INT) RETURNS INT RETURN SELECT a +-- !query schema +struct<> +-- !query output + + + +-- !query +SELECT foo2_2a(5) +-- !query schema +struct +-- !query output +5 + + +-- !query +CREATE FUNCTION foo2_2b(a INT) RETURNS INT RETURN 1 + (SELECT a) +-- !query schema +struct<> +-- !query output + + + +-- !query +SELECT foo2_2b(5) +-- !query schema +struct +-- !query output +6 + + +-- !query +CREATE FUNCTION foo2_2c(a INT) RETURNS INT RETURN 1 + (SELECT (SELECT a)) +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.catalyst.ExtendedAnalysisException +{ + "condition" : "UNRESOLVED_COLUMN.WITHOUT_SUGGESTION", + "sqlState" : "42703", + "messageParameters" : { + "objectName" : "`a`" + }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 21, + "stopIndex" : 21, + "fragment" : "a" + } ] +} + + +-- !query +CREATE FUNCTION foo2_2d(a INT) RETURNS INT RETURN 1 + (SELECT (SELECT (SELECT (SELECT a)))) +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.catalyst.ExtendedAnalysisException +{ + "condition" : "UNRESOLVED_COLUMN.WITHOUT_SUGGESTION", + "sqlState" : "42703", + "messageParameters" : { + "objectName" : "`a`" + }, + "queryContext" : [ { 
+ "objectType" : "", + "objectName" : "", + "startIndex" : 37, + "stopIndex" : 37, + "fragment" : "a" + } ] +} + + +-- !query +CREATE FUNCTION foo2_2e(a INT) RETURNS INT RETURN +SELECT a FROM (VALUES 1) AS V(c1) WHERE c1 = 2 +UNION ALL +SELECT a + 1 FROM (VALUES 1) AS V(c1) +-- !query schema +struct<> +-- !query output + + + +-- !query +CREATE FUNCTION foo2_2f(a INT) RETURNS INT RETURN +SELECT a FROM (VALUES 1) AS V(c1) +EXCEPT +SELECT a + 1 FROM (VALUES 1) AS V(a) +-- !query schema +struct<> +-- !query output + + + +-- !query +CREATE FUNCTION foo2_2g(a INT) RETURNS INT RETURN +SELECT a FROM (VALUES 1) AS V(c1) +INTERSECT +SELECT a FROM (VALUES 1) AS V(a) +-- !query schema +struct<> +-- !query output + + + +-- !query +DROP TABLE IF EXISTS t1 +-- !query schema +struct<> +-- !query output + + + +-- !query +DROP TABLE IF EXISTS t2 +-- !query schema +struct<> +-- !query output + + + +-- !query +DROP TABLE IF EXISTS ts +-- !query schema +struct<> +-- !query output + + + +-- !query +DROP TABLE IF EXISTS tm +-- !query schema +struct<> +-- !query output + + + +-- !query +DROP TABLE IF EXISTS ta +-- !query schema +struct<> +-- !query output + + + +-- !query +DROP TABLE IF EXISTS V1 +-- !query schema +struct<> +-- !query output + + + +-- !query +DROP TABLE IF EXISTS V2 +-- !query schema +struct<> +-- !query output + + + +-- !query +DROP VIEW IF EXISTS t1 +-- !query schema +struct<> +-- !query output + + + +-- !query +DROP VIEW IF EXISTS t2 +-- !query schema +struct<> +-- !query output + + + +-- !query +DROP VIEW IF EXISTS ts +-- !query schema +struct<> +-- !query output + + + +-- !query +DROP VIEW IF EXISTS tm +-- !query schema +struct<> +-- !query output + + + +-- !query +DROP VIEW IF EXISTS ta +-- !query schema +struct<> +-- !query output + + + +-- !query +DROP VIEW IF EXISTS V1 +-- !query schema +struct<> +-- !query output + + + +-- !query +DROP VIEW IF EXISTS V2 +-- !query schema +struct<> +-- !query output + + + +-- !query +CREATE FUNCTION foo2_3(a INT, b INT) RETURNS 
INT RETURN a + b +-- !query schema +struct<> +-- !query output + + + +-- !query +CREATE VIEW V1(c1, c2) AS VALUES (1, 2), (3, 4), (5, 6) +-- !query schema +struct<> +-- !query output + + + +-- !query +CREATE VIEW V2(c1, c2) AS VALUES (-1, -2), (-3, -4), (-5, -6) +-- !query schema +struct<> +-- !query output + + + +-- !query +SELECT foo2_3(c1, c2), foo2_3(c2, 1), foo2_3(c1, c2) - foo2_3(c2, c1 - 1) FROM V1 ORDER BY 1, 2, 3 +-- !query schema +struct +-- !query output +3 3 1 +7 5 1 +11 7 1 + + +-- !query +SELECT * FROM V1 WHERE foo2_3(c1, 0) = c1 AND foo2_3(c1, c2) < 8 +-- !query schema +struct +-- !query output +1 2 +3 4 + + +-- !query +SELECT foo2_3(SUM(c1), SUM(c2)), SUM(c1) + SUM(c2), SUM(foo2_3(c1, c2) + foo2_3(c2, c1) - foo2_3(c2, c1)) +FROM V1 +-- !query schema +struct +-- !query output +21 21 21 + + +-- !query +CREATE FUNCTION foo2_4a(a ARRAY) RETURNS STRING RETURN +SELECT array_sort(a, (i, j) -> rank[i] - rank[j])[0] FROM (SELECT MAP('a', 1, 'b', 2) rank) +-- !query schema +struct<> +-- !query output + + + +-- !query +SELECT foo2_4a(ARRAY('a', 'b')) +-- !query schema +struct +-- !query output +a + + +-- !query +CREATE FUNCTION foo2_4b(m MAP, k STRING) RETURNS STRING RETURN +SELECT v || ' ' || v FROM (SELECT upper(m[k]) AS v) +-- !query schema +struct<> +-- !query output + + + +-- !query +SELECT foo2_4b(map('a', 'hello', 'b', 'world'), 'a') +-- !query schema +struct +-- !query output +HELLO HELLO + + +-- !query +DROP VIEW V2 +-- !query schema +struct<> +-- !query output + + + +-- !query +DROP VIEW V1 +-- !query schema +struct<> +-- !query output + diff --git a/sql/core/src/test/resources/sql-tests/results/table-aliases.sql.out b/sql/core/src/test/resources/sql-tests/results/table-aliases.sql.out index d6d4a411ef59e..6fe77cd062253 100644 --- a/sql/core/src/test/resources/sql-tests/results/table-aliases.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/table-aliases.sql.out @@ -168,3 +168,49 @@ struct 1 a 1 8.5 2 b 2 1.0 3 c 3 3.2 + + +-- !query 
+SELECT src1.* FROM src1 a ORDER BY id LIMIT 1 +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException +{ + "condition" : "CANNOT_RESOLVE_STAR_EXPAND", + "sqlState" : "42704", + "messageParameters" : { + "columns" : "`id`, `v1`", + "targetString" : "`src1`" + }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 8, + "stopIndex" : 13, + "fragment" : "src1.*" + } ] +} + + +-- !query +SELECT src1.id FROM (SELECT * FROM src1 ORDER BY id LIMIT 1) a +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.catalyst.ExtendedAnalysisException +{ + "condition" : "UNRESOLVED_COLUMN.WITH_SUGGESTION", + "sqlState" : "42703", + "messageParameters" : { + "objectName" : "`src1`.`id`", + "proposal" : "`a`.`id`, `a`.`v1`" + }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 8, + "stopIndex" : 14, + "fragment" : "src1.id" + } ] +} diff --git a/sql/core/src/test/resources/sql-tests/results/timestamp.sql.out b/sql/core/src/test/resources/sql-tests/results/timestamp.sql.out index 020b97baa8eee..432f77c93f553 100644 --- a/sql/core/src/test/resources/sql-tests/results/timestamp.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/timestamp.sql.out @@ -395,7 +395,7 @@ org.apache.spark.SparkDateTimeException "condition" : "CANNOT_PARSE_TIMESTAMP", "sqlState" : "22007", "messageParameters" : { - "ansiConfig" : "\"spark.sql.ansi.enabled\"", + "func" : "`try_to_timestamp`", "message" : "Text '2019-10-06 10:11:12.' 
could not be parsed at index 20" } } @@ -467,7 +467,7 @@ org.apache.spark.SparkDateTimeException "condition" : "CANNOT_PARSE_TIMESTAMP", "sqlState" : "22007", "messageParameters" : { - "ansiConfig" : "\"spark.sql.ansi.enabled\"", + "func" : "`try_to_timestamp`", "message" : "Text '2019-10-06 10:11:12.1234567PST' could not be parsed, unparsed text found at index 26" } } @@ -491,7 +491,7 @@ org.apache.spark.SparkDateTimeException "condition" : "CANNOT_PARSE_TIMESTAMP", "sqlState" : "22007", "messageParameters" : { - "ansiConfig" : "\"spark.sql.ansi.enabled\"", + "func" : "`try_to_timestamp`", "message" : "Text '223456 2019-10-06 10:11:12.123456PST' could not be parsed at index 27" } } @@ -563,7 +563,7 @@ org.apache.spark.SparkDateTimeException "condition" : "CANNOT_PARSE_TIMESTAMP", "sqlState" : "22007", "messageParameters" : { - "ansiConfig" : "\"spark.sql.ansi.enabled\"", + "func" : "`try_to_timestamp`", "message" : "Text '12.1232019-10-06S10:11' could not be parsed at index 7" } } @@ -579,7 +579,7 @@ org.apache.spark.SparkDateTimeException "condition" : "CANNOT_PARSE_TIMESTAMP", "sqlState" : "22007", "messageParameters" : { - "ansiConfig" : "\"spark.sql.ansi.enabled\"", + "func" : "`try_to_timestamp`", "message" : "Text '12.1232019-10-06S10:11' could not be parsed at index 9" } } @@ -659,7 +659,7 @@ org.apache.spark.SparkDateTimeException "condition" : "CANNOT_PARSE_TIMESTAMP", "sqlState" : "22007", "messageParameters" : { - "ansiConfig" : "\"spark.sql.ansi.enabled\"", + "func" : "`try_to_timestamp`", "message" : "Invalid date 'February 29' as '1970' is not a leap year" } } diff --git a/sql/core/src/test/resources/sql-tests/results/timestampNTZ/timestamp-ansi.sql.out b/sql/core/src/test/resources/sql-tests/results/timestampNTZ/timestamp-ansi.sql.out index bf1e13a1a0239..b503287804bd4 100644 --- a/sql/core/src/test/resources/sql-tests/results/timestampNTZ/timestamp-ansi.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/timestampNTZ/timestamp-ansi.sql.out 
@@ -409,7 +409,7 @@ org.apache.spark.SparkDateTimeException "condition" : "CANNOT_PARSE_TIMESTAMP", "sqlState" : "22007", "messageParameters" : { - "ansiConfig" : "\"spark.sql.ansi.enabled\"", + "func" : "`try_to_timestamp`", "message" : "Text '2019-10-06 10:11:12.' could not be parsed at index 20" } } @@ -481,7 +481,7 @@ org.apache.spark.SparkDateTimeException "condition" : "CANNOT_PARSE_TIMESTAMP", "sqlState" : "22007", "messageParameters" : { - "ansiConfig" : "\"spark.sql.ansi.enabled\"", + "func" : "`try_to_timestamp`", "message" : "Text '2019-10-06 10:11:12.1234567PST' could not be parsed, unparsed text found at index 26" } } @@ -505,7 +505,7 @@ org.apache.spark.SparkDateTimeException "condition" : "CANNOT_PARSE_TIMESTAMP", "sqlState" : "22007", "messageParameters" : { - "ansiConfig" : "\"spark.sql.ansi.enabled\"", + "func" : "`try_to_timestamp`", "message" : "Text '223456 2019-10-06 10:11:12.123456PST' could not be parsed at index 27" } } @@ -577,7 +577,7 @@ org.apache.spark.SparkDateTimeException "condition" : "CANNOT_PARSE_TIMESTAMP", "sqlState" : "22007", "messageParameters" : { - "ansiConfig" : "\"spark.sql.ansi.enabled\"", + "func" : "`try_to_timestamp`", "message" : "Text '12.1232019-10-06S10:11' could not be parsed at index 7" } } @@ -593,7 +593,7 @@ org.apache.spark.SparkDateTimeException "condition" : "CANNOT_PARSE_TIMESTAMP", "sqlState" : "22007", "messageParameters" : { - "ansiConfig" : "\"spark.sql.ansi.enabled\"", + "func" : "`try_to_timestamp`", "message" : "Text '12.1232019-10-06S10:11' could not be parsed at index 9" } } @@ -673,7 +673,7 @@ org.apache.spark.SparkDateTimeException "condition" : "CANNOT_PARSE_TIMESTAMP", "sqlState" : "22007", "messageParameters" : { - "ansiConfig" : "\"spark.sql.ansi.enabled\"", + "func" : "`try_to_timestamp`", "message" : "Invalid date 'February 29' as '1970' is not a leap year" } } diff --git a/sql/core/src/test/resources/sql-tests/results/typeCoercion/native/stringCastAndExpressions.sql.out 
b/sql/core/src/test/resources/sql-tests/results/typeCoercion/native/stringCastAndExpressions.sql.out index f5dc87b7266de..01ba71a6a6782 100644 --- a/sql/core/src/test/resources/sql-tests/results/typeCoercion/native/stringCastAndExpressions.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/typeCoercion/native/stringCastAndExpressions.sql.out @@ -376,7 +376,7 @@ org.apache.spark.SparkDateTimeException "condition" : "CANNOT_PARSE_TIMESTAMP", "sqlState" : "22007", "messageParameters" : { - "ansiConfig" : "\"spark.sql.ansi.enabled\"", + "func" : "`try_to_timestamp`", "message" : "Text 'aa' could not be parsed at index 0" } } @@ -409,7 +409,7 @@ org.apache.spark.SparkDateTimeException "condition" : "CANNOT_PARSE_TIMESTAMP", "sqlState" : "22007", "messageParameters" : { - "ansiConfig" : "\"spark.sql.ansi.enabled\"", + "func" : "`try_to_timestamp`", "message" : "Text 'aa' could not be parsed at index 0" } } diff --git a/sql/core/src/test/resources/sql-tests/results/view-schema-binding-config.sql.out b/sql/core/src/test/resources/sql-tests/results/view-schema-binding-config.sql.out index 30ba31e71cc92..d59fc412d3f53 100644 --- a/sql/core/src/test/resources/sql-tests/results/view-schema-binding-config.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/view-schema-binding-config.sql.out @@ -137,7 +137,7 @@ Type VIEW View Text SELECT 1 View Original Text SELECT 1 View Catalog and Namespace spark_catalog.default -View Query Output Columns [1] +View Query Output Columns [`1`] -- !query @@ -155,7 +155,7 @@ Type: VIEW View Text: SELECT 1 View Original Text: SELECT 1 View Catalog and Namespace: spark_catalog.default -View Query Output Columns: [1] +View Query Output Columns: [`1`] Schema: root |-- 1: integer (nullable = false) @@ -206,7 +206,7 @@ Created By [not included in comparison] Type: VIEW View Text: SELECT 1 View Catalog and Namespace: spark_catalog.default -View Query Output Columns: [1] +View Query Output Columns: [`1`] Schema: root |-- 1: integer 
(nullable = false) @@ -269,7 +269,7 @@ Type VIEW View Text SELECT * FROM t View Original Text SELECT * FROM t View Catalog and Namespace spark_catalog.default -View Query Output Columns [c1] +View Query Output Columns [`c1`] -- !query @@ -324,7 +324,7 @@ Type VIEW View Text SELECT * FROM t View Original Text SELECT * FROM t View Catalog and Namespace spark_catalog.default -View Query Output Columns [c1] +View Query Output Columns [`c1`] -- !query @@ -402,7 +402,7 @@ View Text SELECT * FROM t View Original Text SELECT * FROM t View Schema Mode BINDING View Catalog and Namespace spark_catalog.default -View Query Output Columns [c1] +View Query Output Columns [`c1`] -- !query @@ -477,7 +477,7 @@ View Text SELECT * FROM t View Original Text SELECT * FROM t View Schema Mode BINDING View Catalog and Namespace spark_catalog.default -View Query Output Columns [c1] +View Query Output Columns [`c1`] -- !query @@ -550,7 +550,7 @@ View Text SELECT * FROM t View Original Text SELECT * FROM t View Schema Mode COMPENSATION View Catalog and Namespace spark_catalog.default -View Query Output Columns [c1] +View Query Output Columns [`c1`] -- !query @@ -615,7 +615,7 @@ View Text SELECT * FROM t View Original Text SELECT * FROM t View Schema Mode COMPENSATION View Catalog and Namespace spark_catalog.default -View Query Output Columns [c1] +View Query Output Columns [`c1`] -- !query @@ -680,7 +680,7 @@ View Text SELECT * FROM t View Original Text SELECT * FROM t View Schema Mode COMPENSATION View Catalog and Namespace spark_catalog.default -View Query Output Columns [c1] +View Query Output Columns [`c1`] -- !query @@ -774,7 +774,7 @@ View Text SELECT * FROM t View Original Text SELECT * FROM t View Schema Mode COMPENSATION View Catalog and Namespace spark_catalog.default -View Query Output Columns [c1] +View Query Output Columns [`c1`] -- !query @@ -837,7 +837,7 @@ View Text SELECT * FROM t View Original Text SELECT * FROM t View Schema Mode COMPENSATION View Catalog and Namespace 
spark_catalog.default -View Query Output Columns [c1, c2] +View Query Output Columns [`c1`, `c2`] -- !query @@ -895,7 +895,7 @@ View Text SELECT * FROM t View Original Text SELECT * FROM t View Schema Mode COMPENSATION View Catalog and Namespace spark_catalog.default -View Query Output Columns [c1, c2] +View Query Output Columns [`c1`, `c2`] -- !query @@ -953,7 +953,7 @@ View Text SELECT * FROM t View Original Text SELECT * FROM t View Schema Mode COMPENSATION View Catalog and Namespace spark_catalog.default -View Query Output Columns [c1, c2] +View Query Output Columns [`c1`, `c2`] -- !query @@ -1007,7 +1007,7 @@ View Text SELECT 1 View Original Text SELECT 1 View Schema Mode BINDING View Catalog and Namespace spark_catalog.default -View Query Output Columns [1] +View Query Output Columns [`1`] -- !query @@ -1026,7 +1026,7 @@ View Text: SELECT 1 View Original Text: SELECT 1 View Schema Mode: BINDING View Catalog and Namespace: spark_catalog.default -View Query Output Columns: [1] +View Query Output Columns: [`1`] Schema: root |-- 1: integer (nullable = false) @@ -1069,7 +1069,7 @@ View Text SELECT 1 View Original Text SELECT 1 View Schema Mode BINDING View Catalog and Namespace spark_catalog.default -View Query Output Columns [1] +View Query Output Columns [`1`] -- !query @@ -1088,7 +1088,7 @@ View Text: SELECT 1 View Original Text: SELECT 1 View Schema Mode: BINDING View Catalog and Namespace: spark_catalog.default -View Query Output Columns: [1] +View Query Output Columns: [`1`] Schema: root |-- 1: integer (nullable = false) @@ -1165,7 +1165,7 @@ Type: VIEW View Text: SELECT 1 View Schema Mode: BINDING View Catalog and Namespace: spark_catalog.default -View Query Output Columns: [1] +View Query Output Columns: [`1`] Schema: root |-- 1: integer (nullable = false) @@ -1199,7 +1199,7 @@ Type: VIEW View Text: SELECT 1 View Schema Mode: BINDING View Catalog and Namespace: spark_catalog.default -View Query Output Columns: [1] +View Query Output Columns: [`1`] Schema: 
root |-- 1: integer (nullable = false) diff --git a/sql/core/src/test/resources/sql-tests/results/view-schema-binding.sql.out b/sql/core/src/test/resources/sql-tests/results/view-schema-binding.sql.out index 7dd3d2114d99f..3e76ba1db9d0e 100644 --- a/sql/core/src/test/resources/sql-tests/results/view-schema-binding.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/view-schema-binding.sql.out @@ -50,7 +50,7 @@ View Text SELECT * FROM t View Original Text SELECT * FROM t View Schema Mode BINDING View Catalog and Namespace spark_catalog.default -View Query Output Columns [c1] +View Query Output Columns [`c1`] -- !query @@ -106,7 +106,7 @@ View Text SELECT * FROM t View Original Text SELECT * FROM t View Schema Mode BINDING View Catalog and Namespace spark_catalog.default -View Query Output Columns [c1] +View Query Output Columns [`c1`] -- !query @@ -161,7 +161,7 @@ View Text SELECT * FROM t View Original Text SELECT * FROM t View Schema Mode BINDING View Catalog and Namespace spark_catalog.default -View Query Output Columns [c1, c2] +View Query Output Columns [`c1`, `c2`] -- !query @@ -219,7 +219,7 @@ View Text SELECT * FROM t View Original Text SELECT * FROM t View Schema Mode BINDING View Catalog and Namespace spark_catalog.default -View Query Output Columns [c1, c2] +View Query Output Columns [`c1`, `c2`] -- !query @@ -281,7 +281,7 @@ View Text SELECT * FROM t View Original Text SELECT * FROM t View Schema Mode BINDING View Catalog and Namespace spark_catalog.default -View Query Output Columns [c1] +View Query Output Columns [`c1`] -- !query @@ -311,7 +311,7 @@ View Text SELECT * FROM t View Original Text SELECT * FROM t View Schema Mode BINDING View Catalog and Namespace spark_catalog.default -View Query Output Columns [c1] +View Query Output Columns [`c1`] -- !query @@ -367,7 +367,7 @@ View Text SELECT * FROM t View Original Text SELECT * FROM t View Schema Mode BINDING View Catalog and Namespace spark_catalog.default -View Query Output Columns [c1] 
+View Query Output Columns [`c1`] -- !query diff --git a/sql/core/src/test/resources/sql-tests/results/view-schema-compensation.sql.out b/sql/core/src/test/resources/sql-tests/results/view-schema-compensation.sql.out index a8bfe0891f72f..330c151b051db 100644 --- a/sql/core/src/test/resources/sql-tests/results/view-schema-compensation.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/view-schema-compensation.sql.out @@ -58,7 +58,7 @@ View Text SELECT * FROM t View Original Text SELECT * FROM t View Schema Mode COMPENSATION View Catalog and Namespace spark_catalog.default -View Query Output Columns [c1] +View Query Output Columns [`c1`] -- !query @@ -112,7 +112,7 @@ View Text SELECT * FROM t View Original Text SELECT * FROM t View Schema Mode COMPENSATION View Catalog and Namespace spark_catalog.default -View Query Output Columns [c1] +View Query Output Columns [`c1`] -- !query @@ -166,7 +166,7 @@ View Text SELECT * FROM t View Original Text SELECT * FROM t View Schema Mode COMPENSATION View Catalog and Namespace spark_catalog.default -View Query Output Columns [c1] +View Query Output Columns [`c1`] -- !query @@ -260,7 +260,7 @@ View Text SELECT * FROM t View Original Text SELECT * FROM t View Schema Mode COMPENSATION View Catalog and Namespace spark_catalog.default -View Query Output Columns [c1] +View Query Output Columns [`c1`] -- !query @@ -323,7 +323,7 @@ View Text SELECT * FROM t View Original Text SELECT * FROM t View Schema Mode COMPENSATION View Catalog and Namespace spark_catalog.default -View Query Output Columns [c1, c2] +View Query Output Columns [`c1`, `c2`] -- !query @@ -381,7 +381,7 @@ View Text SELECT * FROM t View Original Text SELECT * FROM t View Schema Mode COMPENSATION View Catalog and Namespace spark_catalog.default -View Query Output Columns [c1, c2] +View Query Output Columns [`c1`, `c2`] -- !query @@ -439,7 +439,7 @@ View Text SELECT * FROM t View Original Text SELECT * FROM t View Schema Mode COMPENSATION View Catalog and 
Namespace spark_catalog.default -View Query Output Columns [c1, c2] +View Query Output Columns [`c1`, `c2`] -- !query @@ -493,7 +493,7 @@ View Text SELECT * FROM t View Original Text SELECT * FROM t View Schema Mode BINDING View Catalog and Namespace spark_catalog.default -View Query Output Columns [c1] +View Query Output Columns [`c1`] -- !query @@ -565,7 +565,7 @@ View Text SELECT * FROM t View Original Text SELECT * FROM t View Schema Mode COMPENSATION View Catalog and Namespace spark_catalog.default -View Query Output Columns [c1] +View Query Output Columns [`c1`] -- !query diff --git a/sql/core/src/test/resources/sql-tests/results/view-schema-evolution.sql.out b/sql/core/src/test/resources/sql-tests/results/view-schema-evolution.sql.out index 92cde7735c96f..0b49aafe04932 100644 --- a/sql/core/src/test/resources/sql-tests/results/view-schema-evolution.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/view-schema-evolution.sql.out @@ -59,7 +59,7 @@ View Text SELECT * FROM t View Original Text SELECT * FROM t View Schema Mode EVOLUTION View Catalog and Namespace spark_catalog.default -View Query Output Columns [c1, c2] +View Query Output Columns [`c1`, `c2`] -- !query @@ -114,7 +114,7 @@ View Text SELECT * FROM t View Original Text SELECT * FROM t View Schema Mode EVOLUTION View Catalog and Namespace spark_catalog.default -View Query Output Columns [c4, c5] +View Query Output Columns [`c4`, `c5`] -- !query @@ -170,7 +170,7 @@ View Text SELECT * FROM t View Original Text SELECT * FROM t View Schema Mode EVOLUTION View Catalog and Namespace spark_catalog.default -View Query Output Columns [c4, c5, c6] +View Query Output Columns [`c4`, `c5`, `c6`] -- !query @@ -233,7 +233,7 @@ View Text SELECT * FROM t View Original Text SELECT * FROM t View Schema Mode EVOLUTION View Catalog and Namespace spark_catalog.default -View Query Output Columns [c1, c2] +View Query Output Columns [`c1`, `c2`] -- !query @@ -279,7 +279,7 @@ View Text SELECT * FROM t View Original 
Text SELECT * FROM t View Schema Mode EVOLUTION View Catalog and Namespace spark_catalog.default -View Query Output Columns [c1] +View Query Output Columns [`c1`] -- !query @@ -342,7 +342,7 @@ View Text SELECT * FROM t View Original Text SELECT * FROM t View Schema Mode TYPE EVOLUTION View Catalog and Namespace spark_catalog.default -View Query Output Columns [c1, c2] +View Query Output Columns [`c1`, `c2`] -- !query @@ -397,7 +397,7 @@ View Text SELECT * FROM t View Original Text SELECT * FROM t View Schema Mode TYPE EVOLUTION View Catalog and Namespace spark_catalog.default -View Query Output Columns [c1, c2] +View Query Output Columns [`c1`, `c2`] -- !query @@ -452,7 +452,7 @@ View Text SELECT * FROM t View Original Text SELECT * FROM t View Schema Mode TYPE EVOLUTION View Catalog and Namespace spark_catalog.default -View Query Output Columns [c1, c2] +View Query Output Columns [`c1`, `c2`] -- !query @@ -515,7 +515,7 @@ View Text SELECT * FROM t View Original Text SELECT * FROM t View Schema Mode TYPE EVOLUTION View Catalog and Namespace spark_catalog.default -View Query Output Columns [c1, c2] +View Query Output Columns [`c1`, `c2`] -- !query @@ -573,7 +573,7 @@ View Text SELECT * FROM t View Original Text SELECT * FROM t View Schema Mode TYPE EVOLUTION View Catalog and Namespace spark_catalog.default -View Query Output Columns [c1, c2] +View Query Output Columns [`c1`, `c2`] -- !query @@ -631,7 +631,7 @@ View Text SELECT * FROM t View Original Text SELECT * FROM t View Schema Mode TYPE EVOLUTION View Catalog and Namespace spark_catalog.default -View Query Output Columns [c1, c2] +View Query Output Columns [`c1`, `c2`] -- !query @@ -678,7 +678,7 @@ View Text SELECT * FROM t View Original Text SELECT * FROM t View Schema Mode TYPE EVOLUTION View Catalog and Namespace spark_catalog.default -View Query Output Columns [c1, c2] +View Query Output Columns [`c1`, `c2`] -- !query @@ -709,7 +709,7 @@ View Text SELECT * FROM t View Original Text SELECT * FROM t View 
Schema Mode TYPE EVOLUTION View Catalog and Namespace spark_catalog.default -View Query Output Columns [c1, c2] +View Query Output Columns [`c1`, `c2`] -- !query @@ -756,7 +756,7 @@ View Text SELECT * FROM t View Original Text SELECT * FROM t View Schema Mode TYPE EVOLUTION View Catalog and Namespace spark_catalog.default -View Query Output Columns [c1, c2] +View Query Output Columns [`c1`, `c2`] -- !query @@ -787,7 +787,7 @@ View Text SELECT * FROM t View Original Text SELECT * FROM t View Schema Mode TYPE EVOLUTION View Catalog and Namespace spark_catalog.default -View Query Output Columns [c1, c2] +View Query Output Columns [`c1`, `c2`] -- !query @@ -834,7 +834,7 @@ View Text SELECT * FROM t View Original Text SELECT * FROM t View Schema Mode TYPE EVOLUTION View Catalog and Namespace spark_catalog.default -View Query Output Columns [c1, c2] +View Query Output Columns [`c1`, `c2`] -- !query @@ -865,7 +865,7 @@ View Text SELECT * FROM t View Original Text SELECT * FROM t View Schema Mode EVOLUTION View Catalog and Namespace spark_catalog.default -View Query Output Columns [c1, c2] +View Query Output Columns [`c1`, `c2`] -- !query @@ -912,7 +912,7 @@ View Text SELECT * FROM t View Original Text SELECT * FROM t View Schema Mode EVOLUTION View Catalog and Namespace spark_catalog.default -View Query Output Columns [c1, c2] +View Query Output Columns [`c1`, `c2`] -- !query @@ -1094,7 +1094,7 @@ View Text SELECT * FROM t View Original Text SELECT * FROM t View Schema Mode EVOLUTION View Catalog and Namespace spark_catalog.default -View Query Output Columns [c1, c2] +View Query Output Columns [`c1`, `c2`] -- !query diff --git a/sql/core/src/test/resources/sql-tests/results/view-schema-type-evolution.sql.out b/sql/core/src/test/resources/sql-tests/results/view-schema-type-evolution.sql.out index 57193c610c0a9..de0655750d503 100644 --- a/sql/core/src/test/resources/sql-tests/results/view-schema-type-evolution.sql.out +++ 
b/sql/core/src/test/resources/sql-tests/results/view-schema-type-evolution.sql.out @@ -59,7 +59,7 @@ View Text SELECT * FROM t View Original Text SELECT * FROM t View Schema Mode TYPE EVOLUTION View Catalog and Namespace spark_catalog.default -View Query Output Columns [c1, c2] +View Query Output Columns [`c1`, `c2`] -- !query @@ -114,7 +114,7 @@ View Text SELECT * FROM t View Original Text SELECT * FROM t View Schema Mode TYPE EVOLUTION View Catalog and Namespace spark_catalog.default -View Query Output Columns [c1, c2] +View Query Output Columns [`c1`, `c2`] -- !query @@ -169,7 +169,7 @@ View Text SELECT * FROM t View Original Text SELECT * FROM t View Schema Mode TYPE EVOLUTION View Catalog and Namespace spark_catalog.default -View Query Output Columns [c1, c2] +View Query Output Columns [`c1`, `c2`] -- !query @@ -232,7 +232,7 @@ View Text SELECT * FROM t View Original Text SELECT * FROM t View Schema Mode TYPE EVOLUTION View Catalog and Namespace spark_catalog.default -View Query Output Columns [c1, c2] +View Query Output Columns [`c1`, `c2`] -- !query @@ -290,7 +290,7 @@ View Text SELECT * FROM t View Original Text SELECT * FROM t View Schema Mode TYPE EVOLUTION View Catalog and Namespace spark_catalog.default -View Query Output Columns [c1, c2] +View Query Output Columns [`c1`, `c2`] -- !query @@ -348,7 +348,7 @@ View Text SELECT * FROM t View Original Text SELECT * FROM t View Schema Mode TYPE EVOLUTION View Catalog and Namespace spark_catalog.default -View Query Output Columns [c1, c2] +View Query Output Columns [`c1`, `c2`] -- !query @@ -395,7 +395,7 @@ View Text SELECT * FROM t View Original Text SELECT * FROM t View Schema Mode TYPE EVOLUTION View Catalog and Namespace spark_catalog.default -View Query Output Columns [c1, c2] +View Query Output Columns [`c1`, `c2`] -- !query @@ -442,7 +442,7 @@ View Text SELECT * FROM t View Original Text SELECT * FROM t View Schema Mode TYPE EVOLUTION View Catalog and Namespace spark_catalog.default -View Query Output 
Columns [c1, c2] +View Query Output Columns [`c1`, `c2`] -- !query @@ -473,7 +473,7 @@ View Text SELECT * FROM t View Original Text SELECT * FROM t View Schema Mode TYPE EVOLUTION View Catalog and Namespace spark_catalog.default -View Query Output Columns [c1, c2] +View Query Output Columns [`c1`, `c2`] -- !query @@ -520,7 +520,7 @@ View Text SELECT * FROM t View Original Text SELECT * FROM t View Schema Mode TYPE EVOLUTION View Catalog and Namespace spark_catalog.default -View Query Output Columns [c1, c2] +View Query Output Columns [`c1`, `c2`] -- !query @@ -606,7 +606,7 @@ View Text SELECT * FROM t View Original Text SELECT * FROM t View Schema Mode COMPENSATION View Catalog and Namespace spark_catalog.default -View Query Output Columns [c1] +View Query Output Columns [`c1`] -- !query @@ -644,7 +644,7 @@ View Text SELECT * FROM t View Original Text SELECT * FROM t View Schema Mode TYPE EVOLUTION View Catalog and Namespace spark_catalog.default -View Query Output Columns [c1] +View Query Output Columns [`c1`] -- !query diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-avro/commits/.0.crc b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-avro/commits/.0.crc new file mode 100644 index 0000000000000..dd09db7ad216c Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-avro/commits/.0.crc differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-avro/commits/.1.crc b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-avro/commits/.1.crc new file mode 100644 index 0000000000000..dd09db7ad216c Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-avro/commits/.1.crc differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-avro/commits/0 
b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-avro/commits/0 new file mode 100644 index 0000000000000..7e7f3b21c4e78 --- /dev/null +++ b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-avro/commits/0 @@ -0,0 +1,2 @@ +v1 +{"nextBatchWatermarkMs":0,"stateUniqueIds":{}} \ No newline at end of file diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-avro/commits/1 b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-avro/commits/1 new file mode 100644 index 0000000000000..7e7f3b21c4e78 --- /dev/null +++ b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-avro/commits/1 @@ -0,0 +1,2 @@ +v1 +{"nextBatchWatermarkMs":0,"stateUniqueIds":{}} \ No newline at end of file diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-avro/metadata b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-avro/metadata new file mode 100644 index 0000000000000..6888983b0bc5d --- /dev/null +++ b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-avro/metadata @@ -0,0 +1 @@ +{"id":"f3f30619-9175-4329-97a7-f5629deaad89"} \ No newline at end of file diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-avro/offsets/.0.crc b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-avro/offsets/.0.crc new file mode 100644 index 0000000000000..400184017c910 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-avro/offsets/.0.crc differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-avro/offsets/.1.crc b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-avro/offsets/.1.crc new file mode 100644 index 0000000000000..397dde18c6d5a Binary files /dev/null and 
b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-avro/offsets/.1.crc differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-avro/offsets/0 b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-avro/offsets/0 new file mode 100644 index 0000000000000..8177241a333b1 --- /dev/null +++ b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-avro/offsets/0 @@ -0,0 +1,3 @@ +v1 +{"batchWatermarkMs":0,"batchTimestampMs":1734074255407,"conf":{"spark.sql.streaming.stateStore.providerClass":"org.apache.spark.sql.execution.streaming.state.RocksDBStateStoreProvider","spark.sql.streaming.stateStore.rocksdb.formatVersion":"5","spark.sql.streaming.stateStore.encodingFormat":"avro","spark.sql.streaming.statefulOperator.useStrictDistribution":"true","spark.sql.streaming.flatMapGroupsWithState.stateFormatVersion":"2","spark.sql.streaming.aggregation.stateFormatVersion":"2","spark.sql.shuffle.partitions":"5","spark.sql.streaming.join.stateFormatVersion":"2","spark.sql.streaming.stateStore.compression.codec":"lz4","spark.sql.streaming.multipleWatermarkPolicy":"min","spark.databricks.sql.optimizer.pruneFiltersCanPruneStreamingSubplan":"false"}} +0 \ No newline at end of file diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-avro/offsets/1 b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-avro/offsets/1 new file mode 100644 index 0000000000000..cf51e39873cd2 --- /dev/null +++ b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-avro/offsets/1 @@ -0,0 +1,3 @@ +v1 
+{"batchWatermarkMs":0,"batchTimestampMs":1734074257473,"conf":{"spark.sql.streaming.stateStore.providerClass":"org.apache.spark.sql.execution.streaming.state.RocksDBStateStoreProvider","spark.sql.streaming.stateStore.rocksdb.formatVersion":"5","spark.sql.streaming.stateStore.encodingFormat":"avro","spark.sql.streaming.statefulOperator.useStrictDistribution":"true","spark.sql.streaming.flatMapGroupsWithState.stateFormatVersion":"2","spark.sql.streaming.aggregation.stateFormatVersion":"2","spark.sql.shuffle.partitions":"5","spark.sql.streaming.join.stateFormatVersion":"2","spark.sql.streaming.stateStore.compression.codec":"lz4","spark.sql.streaming.multipleWatermarkPolicy":"min","spark.databricks.sql.optimizer.pruneFiltersCanPruneStreamingSubplan":"false"}} +1 \ No newline at end of file diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-avro/state/0/0/.1.changelog.crc b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-avro/state/0/0/.1.changelog.crc new file mode 100644 index 0000000000000..d03e8f51a6f7f Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-avro/state/0/0/.1.changelog.crc differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-avro/state/0/0/.1.zip.crc b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-avro/state/0/0/.1.zip.crc new file mode 100644 index 0000000000000..9fe6838ba35e9 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-avro/state/0/0/.1.zip.crc differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-avro/state/0/0/.2.changelog.crc b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-avro/state/0/0/.2.changelog.crc new file mode 100644 index 0000000000000..c790833be99a5 Binary files /dev/null and 
b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-avro/state/0/0/.2.changelog.crc differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-avro/state/0/0/1.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-avro/state/0/0/1.changelog new file mode 100644 index 0000000000000..a579fe940633b Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-avro/state/0/0/1.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-avro/state/0/0/1.zip b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-avro/state/0/0/1.zip new file mode 100644 index 0000000000000..1f2b15b840c20 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-avro/state/0/0/1.zip differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-avro/state/0/0/2.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-avro/state/0/0/2.changelog new file mode 100644 index 0000000000000..6df7672a3d0ec Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-avro/state/0/0/2.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-avro/state/0/0/SSTs/.000008-7b1ee246-6831-4c62-9fd7-7741cb534368.sst.crc b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-avro/state/0/0/SSTs/.000008-7b1ee246-6831-4c62-9fd7-7741cb534368.sst.crc new file mode 100644 index 0000000000000..2026a6dcab3bf Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-avro/state/0/0/SSTs/.000008-7b1ee246-6831-4c62-9fd7-7741cb534368.sst.crc differ diff --git 
a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-avro/state/0/0/SSTs/000008-7b1ee246-6831-4c62-9fd7-7741cb534368.sst b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-avro/state/0/0/SSTs/000008-7b1ee246-6831-4c62-9fd7-7741cb534368.sst new file mode 100644 index 0000000000000..c4f2175a47371 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-avro/state/0/0/SSTs/000008-7b1ee246-6831-4c62-9fd7-7741cb534368.sst differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-avro/state/0/1/.1.changelog.crc b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-avro/state/0/1/.1.changelog.crc new file mode 100644 index 0000000000000..22e87bcdbe201 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-avro/state/0/1/.1.changelog.crc differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-avro/state/0/1/.1.zip.crc b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-avro/state/0/1/.1.zip.crc new file mode 100644 index 0000000000000..0b52e7b4922a0 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-avro/state/0/1/.1.zip.crc differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-avro/state/0/1/.2.changelog.crc b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-avro/state/0/1/.2.changelog.crc new file mode 100644 index 0000000000000..889a96a47f0a3 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-avro/state/0/1/.2.changelog.crc differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-avro/state/0/1/1.changelog 
b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-avro/state/0/1/1.changelog new file mode 100644 index 0000000000000..85a6a13b976fc Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-avro/state/0/1/1.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-avro/state/0/1/1.zip b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-avro/state/0/1/1.zip new file mode 100644 index 0000000000000..3986c42e7e5ef Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-avro/state/0/1/1.zip differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-avro/state/0/1/2.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-avro/state/0/1/2.changelog new file mode 100644 index 0000000000000..13ec6c9cdd843 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-avro/state/0/1/2.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-avro/state/0/2/.1.changelog.crc b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-avro/state/0/2/.1.changelog.crc new file mode 100644 index 0000000000000..22e87bcdbe201 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-avro/state/0/2/.1.changelog.crc differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-avro/state/0/2/.1.zip.crc b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-avro/state/0/2/.1.zip.crc new file mode 100644 index 0000000000000..6e5b8098e6e40 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-avro/state/0/2/.1.zip.crc differ diff --git 
a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-avro/state/0/2/.2.changelog.crc b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-avro/state/0/2/.2.changelog.crc new file mode 100644 index 0000000000000..22e87bcdbe201 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-avro/state/0/2/.2.changelog.crc differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-avro/state/0/2/1.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-avro/state/0/2/1.changelog new file mode 100644 index 0000000000000..85a6a13b976fc Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-avro/state/0/2/1.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-avro/state/0/2/1.zip b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-avro/state/0/2/1.zip new file mode 100644 index 0000000000000..59318e5f734e9 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-avro/state/0/2/1.zip differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-avro/state/0/2/2.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-avro/state/0/2/2.changelog new file mode 100644 index 0000000000000..85a6a13b976fc Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-avro/state/0/2/2.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-avro/state/0/3/.1.changelog.crc b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-avro/state/0/3/.1.changelog.crc new file mode 100644 index 0000000000000..22e87bcdbe201 Binary files /dev/null and 
b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-avro/state/0/3/.1.changelog.crc differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-avro/state/0/3/.1.zip.crc b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-avro/state/0/3/.1.zip.crc new file mode 100644 index 0000000000000..4899bd7e696cb Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-avro/state/0/3/.1.zip.crc differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-avro/state/0/3/.2.changelog.crc b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-avro/state/0/3/.2.changelog.crc new file mode 100644 index 0000000000000..22e87bcdbe201 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-avro/state/0/3/.2.changelog.crc differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-avro/state/0/3/1.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-avro/state/0/3/1.changelog new file mode 100644 index 0000000000000..85a6a13b976fc Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-avro/state/0/3/1.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-avro/state/0/3/1.zip b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-avro/state/0/3/1.zip new file mode 100644 index 0000000000000..29a07a94b5dbc Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-avro/state/0/3/1.zip differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-avro/state/0/3/2.changelog 
b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-avro/state/0/3/2.changelog new file mode 100644 index 0000000000000..85a6a13b976fc Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-avro/state/0/3/2.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-avro/state/0/4/.1.changelog.crc b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-avro/state/0/4/.1.changelog.crc new file mode 100644 index 0000000000000..22e87bcdbe201 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-avro/state/0/4/.1.changelog.crc differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-avro/state/0/4/.1.zip.crc b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-avro/state/0/4/.1.zip.crc new file mode 100644 index 0000000000000..846c94b6dfcfc Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-avro/state/0/4/.1.zip.crc differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-avro/state/0/4/.2.changelog.crc b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-avro/state/0/4/.2.changelog.crc new file mode 100644 index 0000000000000..22e87bcdbe201 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-avro/state/0/4/.2.changelog.crc differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-avro/state/0/4/1.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-avro/state/0/4/1.changelog new file mode 100644 index 0000000000000..85a6a13b976fc Binary files /dev/null and 
b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-avro/state/0/4/1.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-avro/state/0/4/1.zip b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-avro/state/0/4/1.zip new file mode 100644 index 0000000000000..38e6a75814585 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-avro/state/0/4/1.zip differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-avro/state/0/4/2.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-avro/state/0/4/2.changelog new file mode 100644 index 0000000000000..85a6a13b976fc Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-avro/state/0/4/2.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-avro/state/0/_metadata/v2/.0.crc b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-avro/state/0/_metadata/v2/.0.crc new file mode 100644 index 0000000000000..9850f2cdfc9b6 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-avro/state/0/_metadata/v2/.0.crc differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-avro/state/0/_metadata/v2/0 b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-avro/state/0/_metadata/v2/0 new file mode 100644 index 0000000000000..5c53036530462 --- /dev/null +++ b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-avro/state/0/_metadata/v2/0 @@ -0,0 +1,2 @@ +v2 
+{"operatorInfo":{"operatorId":0,"operatorName":"transformWithStateExec"},"stateStoreInfo":[{"storeName":"default","numColsPrefixKey":0,"numPartitions":5,"stateSchemaFilePath":"file:/Users/anish.shrigondekar/spark/spark/target/tmp/spark-dcaeba6f-ff09-4f91-ba1b-4d14fe53cc9f/state/0/_stateSchema/default/0_6b12d3c5-57e6-4001-8321-3ae63d6be7a0"}],"operatorPropertiesJson":"{\"timeMode\":\"NoTime\",\"outputMode\":\"Update\",\"stateVariables\":[{\"stateName\":\"countState\",\"stateVariableType\":\"ValueState\",\"ttlEnabled\":false}]}"} \ No newline at end of file diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-avro/state/0/_stateSchema/default/.0_6b12d3c5-57e6-4001-8321-3ae63d6be7a0.crc b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-avro/state/0/_stateSchema/default/.0_6b12d3c5-57e6-4001-8321-3ae63d6be7a0.crc new file mode 100644 index 0000000000000..3c16c8244a3b7 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-avro/state/0/_stateSchema/default/.0_6b12d3c5-57e6-4001-8321-3ae63d6be7a0.crc differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-avro/state/0/_stateSchema/default/0_6b12d3c5-57e6-4001-8321-3ae63d6be7a0 b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-avro/state/0/_stateSchema/default/0_6b12d3c5-57e6-4001-8321-3ae63d6be7a0 new file mode 100644 index 0000000000000..cd3e8f6d96bf5 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-avro/state/0/_stateSchema/default/0_6b12d3c5-57e6-4001-8321-3ae63d6be7a0 differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-unsaferow/commits/.0.crc b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-unsaferow/commits/.0.crc new file mode 100644 index 0000000000000..dd09db7ad216c Binary files /dev/null 
and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-unsaferow/commits/.0.crc differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-unsaferow/commits/.1.crc b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-unsaferow/commits/.1.crc new file mode 100644 index 0000000000000..dd09db7ad216c Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-unsaferow/commits/.1.crc differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-unsaferow/commits/0 b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-unsaferow/commits/0 new file mode 100644 index 0000000000000..7e7f3b21c4e78 --- /dev/null +++ b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-unsaferow/commits/0 @@ -0,0 +1,2 @@ +v1 +{"nextBatchWatermarkMs":0,"stateUniqueIds":{}} \ No newline at end of file diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-unsaferow/commits/1 b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-unsaferow/commits/1 new file mode 100644 index 0000000000000..7e7f3b21c4e78 --- /dev/null +++ b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-unsaferow/commits/1 @@ -0,0 +1,2 @@ +v1 +{"nextBatchWatermarkMs":0,"stateUniqueIds":{}} \ No newline at end of file diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-unsaferow/metadata b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-unsaferow/metadata new file mode 100644 index 0000000000000..d236981545754 --- /dev/null +++ b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-unsaferow/metadata @@ -0,0 +1 @@ +{"id":"1341f9d1-5100-4426-876c-2754aeaca02b"} \ No newline at end of file diff --git 
a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-unsaferow/offsets/.0.crc b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-unsaferow/offsets/.0.crc new file mode 100644 index 0000000000000..15e5afc45b88a Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-unsaferow/offsets/.0.crc differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-unsaferow/offsets/.1.crc b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-unsaferow/offsets/.1.crc new file mode 100644 index 0000000000000..de2378b2b16e9 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-unsaferow/offsets/.1.crc differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-unsaferow/offsets/0 b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-unsaferow/offsets/0 new file mode 100644 index 0000000000000..2f2a25bda6322 --- /dev/null +++ b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-unsaferow/offsets/0 @@ -0,0 +1,3 @@ +v1 +{"batchWatermarkMs":0,"batchTimestampMs":1734074067729,"conf":{"spark.sql.streaming.stateStore.providerClass":"org.apache.spark.sql.execution.streaming.state.RocksDBStateStoreProvider","spark.sql.streaming.stateStore.rocksdb.formatVersion":"5","spark.sql.streaming.stateStore.encodingFormat":"unsaferow","spark.sql.streaming.statefulOperator.useStrictDistribution":"true","spark.sql.streaming.flatMapGroupsWithState.stateFormatVersion":"2","spark.sql.streaming.aggregation.stateFormatVersion":"2","spark.sql.shuffle.partitions":"5","spark.sql.streaming.join.stateFormatVersion":"2","spark.sql.streaming.stateStore.compression.codec":"lz4","spark.sql.streaming.multipleWatermarkPolicy":"min","spark.databricks.sql.optimizer.pruneFiltersCanPruneStreamingSubplan":"false"}} 
+0 \ No newline at end of file diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-unsaferow/offsets/1 b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-unsaferow/offsets/1 new file mode 100644 index 0000000000000..3295f1a1579b9 --- /dev/null +++ b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-unsaferow/offsets/1 @@ -0,0 +1,3 @@ +v1 +{"batchWatermarkMs":0,"batchTimestampMs":1734074071551,"conf":{"spark.sql.streaming.stateStore.providerClass":"org.apache.spark.sql.execution.streaming.state.RocksDBStateStoreProvider","spark.sql.streaming.stateStore.rocksdb.formatVersion":"5","spark.sql.streaming.stateStore.encodingFormat":"unsaferow","spark.sql.streaming.statefulOperator.useStrictDistribution":"true","spark.sql.streaming.flatMapGroupsWithState.stateFormatVersion":"2","spark.sql.streaming.aggregation.stateFormatVersion":"2","spark.sql.shuffle.partitions":"5","spark.sql.streaming.join.stateFormatVersion":"2","spark.sql.streaming.stateStore.compression.codec":"lz4","spark.sql.streaming.multipleWatermarkPolicy":"min","spark.databricks.sql.optimizer.pruneFiltersCanPruneStreamingSubplan":"false"}} +1 \ No newline at end of file diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-unsaferow/state/0/0/.1.changelog.crc b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-unsaferow/state/0/0/.1.changelog.crc new file mode 100644 index 0000000000000..a1ca2c3ed5c1c Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-unsaferow/state/0/0/.1.changelog.crc differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-unsaferow/state/0/0/.1.zip.crc b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-unsaferow/state/0/0/.1.zip.crc new file mode 100644 index 0000000000000..ad0d75698608c Binary files 
/dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-unsaferow/state/0/0/.1.zip.crc differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-unsaferow/state/0/0/.2.changelog.crc b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-unsaferow/state/0/0/.2.changelog.crc new file mode 100644 index 0000000000000..4ad793ade5782 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-unsaferow/state/0/0/.2.changelog.crc differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-unsaferow/state/0/0/1.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-unsaferow/state/0/0/1.changelog new file mode 100644 index 0000000000000..21cc1e7055f47 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-unsaferow/state/0/0/1.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-unsaferow/state/0/0/1.zip b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-unsaferow/state/0/0/1.zip new file mode 100644 index 0000000000000..2c6fbb713b436 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-unsaferow/state/0/0/1.zip differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-unsaferow/state/0/0/2.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-unsaferow/state/0/0/2.changelog new file mode 100644 index 0000000000000..2375a971fdc20 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-unsaferow/state/0/0/2.changelog differ diff --git 
a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-unsaferow/state/0/0/SSTs/.000008-9b6e23ce-e7de-4df8-b320-2b0378b53e52.sst.crc b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-unsaferow/state/0/0/SSTs/.000008-9b6e23ce-e7de-4df8-b320-2b0378b53e52.sst.crc new file mode 100644 index 0000000000000..d72d0acb543e1 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-unsaferow/state/0/0/SSTs/.000008-9b6e23ce-e7de-4df8-b320-2b0378b53e52.sst.crc differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-unsaferow/state/0/0/SSTs/000008-9b6e23ce-e7de-4df8-b320-2b0378b53e52.sst b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-unsaferow/state/0/0/SSTs/000008-9b6e23ce-e7de-4df8-b320-2b0378b53e52.sst new file mode 100644 index 0000000000000..bed4218de7ece Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-unsaferow/state/0/0/SSTs/000008-9b6e23ce-e7de-4df8-b320-2b0378b53e52.sst differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-unsaferow/state/0/1/.1.changelog.crc b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-unsaferow/state/0/1/.1.changelog.crc new file mode 100644 index 0000000000000..22e87bcdbe201 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-unsaferow/state/0/1/.1.changelog.crc differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-unsaferow/state/0/1/.1.zip.crc b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-unsaferow/state/0/1/.1.zip.crc new file mode 100644 index 0000000000000..2e4c44069438a Binary files /dev/null and 
b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-unsaferow/state/0/1/.1.zip.crc differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-unsaferow/state/0/1/.2.changelog.crc b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-unsaferow/state/0/1/.2.changelog.crc new file mode 100644 index 0000000000000..57fb3ea9d4a67 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-unsaferow/state/0/1/.2.changelog.crc differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-unsaferow/state/0/1/1.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-unsaferow/state/0/1/1.changelog new file mode 100644 index 0000000000000..85a6a13b976fc Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-unsaferow/state/0/1/1.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-unsaferow/state/0/1/1.zip b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-unsaferow/state/0/1/1.zip new file mode 100644 index 0000000000000..85388bed9dc40 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-unsaferow/state/0/1/1.zip differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-unsaferow/state/0/1/2.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-unsaferow/state/0/1/2.changelog new file mode 100644 index 0000000000000..daf1c187e8b70 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-unsaferow/state/0/1/2.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-unsaferow/state/0/2/.1.changelog.crc 
b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-unsaferow/state/0/2/.1.changelog.crc new file mode 100644 index 0000000000000..22e87bcdbe201 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-unsaferow/state/0/2/.1.changelog.crc differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-unsaferow/state/0/2/.1.zip.crc b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-unsaferow/state/0/2/.1.zip.crc new file mode 100644 index 0000000000000..03b4665cea145 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-unsaferow/state/0/2/.1.zip.crc differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-unsaferow/state/0/2/.2.changelog.crc b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-unsaferow/state/0/2/.2.changelog.crc new file mode 100644 index 0000000000000..22e87bcdbe201 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-unsaferow/state/0/2/.2.changelog.crc differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-unsaferow/state/0/2/1.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-unsaferow/state/0/2/1.changelog new file mode 100644 index 0000000000000..85a6a13b976fc Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-unsaferow/state/0/2/1.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-unsaferow/state/0/2/1.zip b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-unsaferow/state/0/2/1.zip new file mode 100644 index 0000000000000..bf6c8277bc195 Binary files /dev/null and 
b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-unsaferow/state/0/2/1.zip differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-unsaferow/state/0/2/2.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-unsaferow/state/0/2/2.changelog new file mode 100644 index 0000000000000..85a6a13b976fc Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-unsaferow/state/0/2/2.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-unsaferow/state/0/3/.1.changelog.crc b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-unsaferow/state/0/3/.1.changelog.crc new file mode 100644 index 0000000000000..22e87bcdbe201 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-unsaferow/state/0/3/.1.changelog.crc differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-unsaferow/state/0/3/.1.zip.crc b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-unsaferow/state/0/3/.1.zip.crc new file mode 100644 index 0000000000000..a28994f96defc Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-unsaferow/state/0/3/.1.zip.crc differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-unsaferow/state/0/3/.2.changelog.crc b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-unsaferow/state/0/3/.2.changelog.crc new file mode 100644 index 0000000000000..22e87bcdbe201 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-unsaferow/state/0/3/.2.changelog.crc differ diff --git 
a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-unsaferow/state/0/3/1.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-unsaferow/state/0/3/1.changelog new file mode 100644 index 0000000000000..85a6a13b976fc Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-unsaferow/state/0/3/1.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-unsaferow/state/0/3/1.zip b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-unsaferow/state/0/3/1.zip new file mode 100644 index 0000000000000..f4b734fa36955 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-unsaferow/state/0/3/1.zip differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-unsaferow/state/0/3/2.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-unsaferow/state/0/3/2.changelog new file mode 100644 index 0000000000000..85a6a13b976fc Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-unsaferow/state/0/3/2.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-unsaferow/state/0/4/.1.changelog.crc b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-unsaferow/state/0/4/.1.changelog.crc new file mode 100644 index 0000000000000..22e87bcdbe201 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-unsaferow/state/0/4/.1.changelog.crc differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-unsaferow/state/0/4/.1.zip.crc b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-unsaferow/state/0/4/.1.zip.crc new file mode 100644 index 
0000000000000..80ad867da34f3 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-unsaferow/state/0/4/.1.zip.crc differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-unsaferow/state/0/4/.2.changelog.crc b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-unsaferow/state/0/4/.2.changelog.crc new file mode 100644 index 0000000000000..22e87bcdbe201 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-unsaferow/state/0/4/.2.changelog.crc differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-unsaferow/state/0/4/1.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-unsaferow/state/0/4/1.changelog new file mode 100644 index 0000000000000..85a6a13b976fc Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-unsaferow/state/0/4/1.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-unsaferow/state/0/4/1.zip b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-unsaferow/state/0/4/1.zip new file mode 100644 index 0000000000000..e91c9a6741613 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-unsaferow/state/0/4/1.zip differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-unsaferow/state/0/4/2.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-unsaferow/state/0/4/2.changelog new file mode 100644 index 0000000000000..85a6a13b976fc Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-unsaferow/state/0/4/2.changelog differ diff --git 
a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-unsaferow/state/0/_metadata/v2/.0.crc b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-unsaferow/state/0/_metadata/v2/.0.crc new file mode 100644 index 0000000000000..257c878a1611b Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-unsaferow/state/0/_metadata/v2/.0.crc differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-unsaferow/state/0/_metadata/v2/0 b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-unsaferow/state/0/_metadata/v2/0 new file mode 100644 index 0000000000000..313c13df69acb --- /dev/null +++ b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-unsaferow/state/0/_metadata/v2/0 @@ -0,0 +1,2 @@ +v2 +{"operatorInfo":{"operatorId":0,"operatorName":"transformWithStateExec"},"stateStoreInfo":[{"storeName":"default","numColsPrefixKey":0,"numPartitions":5,"stateSchemaFilePath":"file:/Users/anish.shrigondekar/spark/spark/target/tmp/spark-ae28252a-e696-4653-a9a5-7a9a0766f4c1/state/0/_stateSchema/default/0_2e8e6b52-e3c3-4184-b8ef-8d391b75d751"}],"operatorPropertiesJson":"{\"timeMode\":\"NoTime\",\"outputMode\":\"Update\",\"stateVariables\":[{\"stateName\":\"countState\",\"stateVariableType\":\"ValueState\",\"ttlEnabled\":false}]}"} \ No newline at end of file diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-unsaferow/state/0/_stateSchema/default/.0_2e8e6b52-e3c3-4184-b8ef-8d391b75d751.crc b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-unsaferow/state/0/_stateSchema/default/.0_2e8e6b52-e3c3-4184-b8ef-8d391b75d751.crc new file mode 100644 index 0000000000000..3c16c8244a3b7 Binary files /dev/null and 
b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-unsaferow/state/0/_stateSchema/default/.0_2e8e6b52-e3c3-4184-b8ef-8d391b75d751.crc differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-unsaferow/state/0/_stateSchema/default/0_2e8e6b52-e3c3-4184-b8ef-8d391b75d751 b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-unsaferow/state/0/_stateSchema/default/0_2e8e6b52-e3c3-4184-b8ef-8d391b75d751 new file mode 100644 index 0000000000000..cd3e8f6d96bf5 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0-tws-unsaferow/state/0/_stateSchema/default/0_2e8e6b52-e3c3-4184-b8ef-8d391b75d751 differ diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/resources/testCommitLogV1/testCommitLog b/sql/core/src/test/resources/structured-streaming/testCommitLogV1/testCommitLog similarity index 100% rename from sql/core/src/test/scala/org/apache/spark/sql/streaming/resources/testCommitLogV1/testCommitLog rename to sql/core/src/test/resources/structured-streaming/testCommitLogV1/testCommitLog diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/resources/testCommitLogV2/testCommitLog b/sql/core/src/test/resources/structured-streaming/testCommitLogV2/testCommitLog similarity index 100% rename from sql/core/src/test/scala/org/apache/spark/sql/streaming/resources/testCommitLogV2/testCommitLog rename to sql/core/src/test/resources/structured-streaming/testCommitLogV2/testCommitLog diff --git a/sql/core/src/test/scala/org/apache/spark/sql/AggregateHashMapSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/AggregateHashMapSuite.scala index b253c4a70bbf9..e1b0676831549 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/AggregateHashMapSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/AggregateHashMapSuite.scala @@ -21,7 +21,9 @@ import org.scalatest.BeforeAndAfter import 
org.apache.spark.SparkConf import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.tags.SlowSQLTest +@SlowSQLTest class SingleLevelAggregateHashMapSuite extends DataFrameAggregateSuite with BeforeAndAfter { override protected def sparkConf: SparkConf = super.sparkConf .set(SQLConf.CODEGEN_FALLBACK.key, "false") @@ -37,6 +39,7 @@ class SingleLevelAggregateHashMapSuite extends DataFrameAggregateSuite with Befo } } +@SlowSQLTest class TwoLevelAggregateHashMapSuite extends DataFrameAggregateSuite with BeforeAndAfter { override protected def sparkConf: SparkConf = super.sparkConf .set(SQLConf.CODEGEN_FALLBACK.key, "false") @@ -52,6 +55,7 @@ class TwoLevelAggregateHashMapSuite extends DataFrameAggregateSuite with BeforeA } } +@SlowSQLTest class TwoLevelAggregateHashMapWithVectorizedMapSuite extends DataFrameAggregateSuite with BeforeAndAfter { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/CTEInlineSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/CTEInlineSuite.scala index f22d90d9f35d7..e8b9ffe284940 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/CTEInlineSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/CTEInlineSuite.scala @@ -17,7 +17,8 @@ package org.apache.spark.sql -import org.apache.spark.sql.catalyst.expressions.{And, GreaterThan, LessThan, Literal, Or} +import org.apache.spark.sql.catalyst.expressions.{Alias, And, GreaterThan, LessThan, Literal, Or, Rand} +import org.apache.spark.sql.catalyst.optimizer.InlineCTE import org.apache.spark.sql.catalyst.plans.logical._ import org.apache.spark.sql.execution.adaptive._ import org.apache.spark.sql.execution.exchange.ReusedExchangeExec @@ -715,7 +716,7 @@ abstract class CTEInlineSuiteBase checkAnswer(df, Row(1)) } - test("SPARK-49816: should only update out-going-ref-count for referenced outer CTE relation") { + test("SPARK-49816: detect self-contained WithCTE nodes") { withView("v") { sql( """ @@ -735,6 +736,86 @@ abstract class CTEInlineSuiteBase 
checkAnswer(df, Row(1)) } } + + test("SPARK-49816: complicated reference count") { + // Manually build the logical plan for + // WITH + // r1 AS (SELECT random()), + // r2 AS ( + // WITH + // t1 AS (SELECT * FROM r1), + // t2 AS (SELECT * FROM r1) + // SELECT * FROM t2 + // ) + // SELECT * FROM r2 + // r1 should be inlined as it's only referenced once: main query -> r2 -> t2 -> r1 + val r1 = CTERelationDef(Project(Seq(Alias(Rand(Literal(0)), "r")()), OneRowRelation())) + val r1Ref = CTERelationRef(r1.id, r1.resolved, r1.output, r1.isStreaming) + val t1 = CTERelationDef(Project(r1.output, r1Ref)) + val t2 = CTERelationDef(Project(r1.output, r1Ref)) + val t2Ref = CTERelationRef(t2.id, t2.resolved, t2.output, t2.isStreaming) + val r2 = CTERelationDef(WithCTE(Project(t2.output, t2Ref), Seq(t1, t2))) + val r2Ref = CTERelationRef(r2.id, r2.resolved, r2.output, r2.isStreaming) + val query = WithCTE(Project(r2.output, r2Ref), Seq(r1, r2)) + val inlined = InlineCTE().apply(query) + assert(!inlined.exists(_.isInstanceOf[WithCTE])) + } + + test("SPARK-49816: complicated reference count 2") { + // Manually build the logical plan for + // WITH + // r1 AS (SELECT random()), + // r2 AS ( + // WITH + // t1 AS (SELECT * FROM r1), + // t2 AS (SELECT * FROM t1) + // SELECT * FROM t2 + // ) + // SELECT * FROM r1 + // This is similar to the previous test case, but t2 reference t1 instead of r1, and the main + // query references r1. r1 should be inlined as r2 is not referenced at all. 
+ val r1 = CTERelationDef(Project(Seq(Alias(Rand(Literal(0)), "r")()), OneRowRelation())) + val r1Ref = CTERelationRef(r1.id, r1.resolved, r1.output, r1.isStreaming) + val t1 = CTERelationDef(Project(r1.output, r1Ref)) + val t1Ref = CTERelationRef(t1.id, t1.resolved, t1.output, t1.isStreaming) + val t2 = CTERelationDef(Project(t1.output, t1Ref)) + val t2Ref = CTERelationRef(t2.id, t2.resolved, t2.output, t2.isStreaming) + val r2 = CTERelationDef(WithCTE(Project(t2.output, t2Ref), Seq(t1, t2))) + val query = WithCTE(Project(r1.output, r1Ref), Seq(r1, r2)) + val inlined = InlineCTE().apply(query) + assert(!inlined.exists(_.isInstanceOf[WithCTE])) + } + + test("SPARK-49816: complicated reference count 3") { + // Manually build the logical plan for + // WITH + // r1 AS ( + // WITH + // t1 AS (SELECT random()), + // t2 AS (SELECT * FROM t1) + // SELECT * FROM t2 + // ), + // r2 AS ( + // WITH + // t1 AS (SELECT random()), + // t2 AS (SELECT * FROM r1) + // SELECT * FROM t2 + // ) + // SELECT * FROM r1 UNION ALL SELECT * FROM r2 + // The inner WITH in r1 and r2 should become `SELECT random()` and r1/r2 should be inlined. 
+ val t1 = CTERelationDef(Project(Seq(Alias(Rand(Literal(0)), "r")()), OneRowRelation())) + val t1Ref = CTERelationRef(t1.id, t1.resolved, t1.output, t1.isStreaming) + val t2 = CTERelationDef(Project(t1.output, t1Ref)) + val t2Ref = CTERelationRef(t2.id, t2.resolved, t2.output, t2.isStreaming) + val cte = WithCTE(Project(t2.output, t2Ref), Seq(t1, t2)) + val r1 = CTERelationDef(cte) + val r1Ref = CTERelationRef(r1.id, r1.resolved, r1.output, r1.isStreaming) + val r2 = CTERelationDef(cte) + val r2Ref = CTERelationRef(r2.id, r2.resolved, r2.output, r2.isStreaming) + val query = WithCTE(Union(r1Ref, r2Ref), Seq(r1, r2)) + val inlined = InlineCTE().apply(query) + assert(!inlined.exists(_.isInstanceOf[WithCTE])) + } } class CTEInlineSuiteAEOff extends CTEInlineSuiteBase with DisableAdaptiveExecutionSuite diff --git a/sql/core/src/test/scala/org/apache/spark/sql/CharVarcharTestSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/CharVarcharTestSuite.scala index d3b11274fe1c8..47ebd387e89a3 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/CharVarcharTestSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/CharVarcharTestSuite.scala @@ -86,6 +86,27 @@ trait CharVarcharTestSuite extends QueryTest with SQLTestUtils { } } + test("preserve char/varchar type info") { + Seq(CharType(5), VarcharType(5)).foreach { typ => + for { + char_varchar_as_string <- Seq(false, true) + preserve_char_varchar <- Seq(false, true) + } { + withSQLConf(SQLConf.LEGACY_CHAR_VARCHAR_AS_STRING.key -> char_varchar_as_string.toString, + SQLConf.PRESERVE_CHAR_VARCHAR_TYPE_INFO.key -> preserve_char_varchar.toString) { + withTable("t") { + val name = typ.typeName + sql(s"CREATE TABLE t(i STRING, c $name) USING $format") + val schema = spark.table("t").schema + assert(schema.fields(0).dataType == StringType) + val expectedType = if (preserve_char_varchar) typ else StringType + assert(schema.fields(1).dataType == expectedType) + } + } + } + } + } + test("char type values should be 
padded or trimmed: partitioned columns") { // via dynamic partitioned columns withTable("t") { @@ -674,6 +695,90 @@ trait CharVarcharTestSuite extends QueryTest with SQLTestUtils { } } } + + test(s"insert string literal into char/varchar column when " + + s"${SQLConf.PRESERVE_CHAR_VARCHAR_TYPE_INFO.key} is true") { + withSQLConf(SQLConf.PRESERVE_CHAR_VARCHAR_TYPE_INFO.key -> "true") { + withTable("t") { + sql(s"CREATE TABLE t(c1 CHAR(5), c2 VARCHAR(5)) USING $format") + sql("INSERT INTO t VALUES ('1234', '1234')") + checkAnswer(spark.table("t"), Row("1234 ", "1234")) + assertLengthCheckFailure("INSERT INTO t VALUES ('123456', '1')") + assertLengthCheckFailure("INSERT INTO t VALUES ('1', '123456')") + } + } + } + + test(s"insert from string column into char/varchar column when " + + s"${SQLConf.PRESERVE_CHAR_VARCHAR_TYPE_INFO.key} is true") { + withSQLConf(SQLConf.PRESERVE_CHAR_VARCHAR_TYPE_INFO.key -> "true") { + withTable("a", "b") { + sql(s"CREATE TABLE a AS SELECT '1234' as c1, '1234' as c2") + sql(s"CREATE TABLE b(c1 CHAR(5), c2 VARCHAR(5)) USING $format") + sql("INSERT INTO b SELECT * FROM a") + checkAnswer(spark.table("b"), Row("1234 ", "1234")) + spark.table("b").show() + } + } + } + + test(s"cast from char/varchar when ${SQLConf.PRESERVE_CHAR_VARCHAR_TYPE_INFO.key} is true") { + withSQLConf(SQLConf.PRESERVE_CHAR_VARCHAR_TYPE_INFO.key -> "true") { + Seq("char(5)", "varchar(5)").foreach { typ => + Seq( + "int" -> ("123", 123), + "long" -> ("123 ", 123L), + "boolean" -> ("true ", true), + "boolean" -> ("false", false), + "double" -> ("1.2", 1.2) + ).foreach { case (toType, (from, to)) => + assert(sql(s"select cast($from :: $typ as $toType)").collect() === Array(Row(to))) + } + } + } + } + + test(s"cast to char/varchar when ${SQLConf.PRESERVE_CHAR_VARCHAR_TYPE_INFO.key} is true") { + withSQLConf(SQLConf.PRESERVE_CHAR_VARCHAR_TYPE_INFO.key -> "true") { + Seq("char(10)", "varchar(10)").foreach { typ => + Seq( + 123 -> "123", + 123L-> "123", + true -> "true", + 
false -> "false", + 1.2 -> "1.2" + ).foreach { case (from, to) => + val paddedTo = if (typ == "char(10)") { + to.padTo(10, ' ') + } else { + to + } + sql(s"select cast($from as $typ)").collect() === Array(Row(paddedTo)) + } + } + } + } + + test("implicitly cast char/varchar into atomics") { + Seq("char", "varchar").foreach { typ => + withSQLConf(SQLConf.PRESERVE_CHAR_VARCHAR_TYPE_INFO.key -> "true", + SQLConf.ANSI_ENABLED.key -> "true") { + checkAnswer(sql( + s""" + |SELECT + |NOT('false'::$typ(5)), + |1 + ('4'::$typ(5)), + |2L + ('4'::$typ(5)), + |3S + ('4'::$typ(5)), + |4Y - ('4'::$typ(5)), + |1.2 / ('0.6'::$typ(5)), + |MINUTE('2009-07-30 12:58:59'::$typ(30)), + |if(true, '0'::$typ(5), 1), + |if(false, '0'::$typ(5), 1) + """.stripMargin), Row(true, 5, 6, 7, 0, 2.0, 58, 0, 1)) + } + } + } } // Some basic char/varchar tests which doesn't rely on table implementation. diff --git a/sql/core/src/test/scala/org/apache/spark/sql/CollationExpressionWalkerSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/CollationExpressionWalkerSuite.scala index bc62fa5fdd331..1f9589c1c9ce4 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/CollationExpressionWalkerSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/CollationExpressionWalkerSuite.scala @@ -23,7 +23,7 @@ import org.apache.spark.{SparkFunSuite, SparkRuntimeException} import org.apache.spark.sql.catalyst.analysis.ExpressionBuilder import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.expressions.variant.ParseJson -import org.apache.spark.sql.internal.SqlApiConf +import org.apache.spark.sql.internal.{SqlApiConf, SQLConf} import org.apache.spark.sql.internal.types._ import org.apache.spark.sql.test.SharedSparkSession import org.apache.spark.sql.types._ @@ -104,6 +104,7 @@ class CollationExpressionWalkerSuite extends SparkFunSuite with SharedSparkSessi Literal.create("DuMmY sTrInG".getBytes) } case BooleanType => Literal(true) + case ByteType => Literal(5.toByte) 
case _: DatetimeType => Literal(Timestamp.valueOf("2009-07-30 12:58:59")) case DecimalType => Literal((new Decimal).set(5)) case _: DecimalType => Literal((new Decimal).set(5)) @@ -183,6 +184,7 @@ class CollationExpressionWalkerSuite extends SparkFunSuite with SharedSparkSessi case Utf8Lcase => "Cast('DuMmY sTrInG' collate utf8_lcase as BINARY)" } case BooleanType => "True" + case ByteType => "cast(5 as tinyint)" case _: DatetimeType => "date'2016-04-08'" case DecimalType => "5.0" case _: DecimalType => "5.0" @@ -243,6 +245,7 @@ class CollationExpressionWalkerSuite extends SparkFunSuite with SharedSparkSessi case AnyTimestampType => "TIMESTAMP" case BinaryType => "BINARY" case BooleanType => "BOOLEAN" + case ByteType => "TINYINT" case _: DatetimeType => "DATE" case DecimalType => "DECIMAL(2, 1)" case _: DecimalType => "DECIMAL(2, 1)" @@ -636,48 +639,49 @@ class CollationExpressionWalkerSuite extends SparkFunSuite with SharedSparkSessi val expr = headConstructor.newInstance(args: _*).asInstanceOf[ExpectsInputTypes] withTable("tbl", "tbl_lcase") { + withSQLConf(SQLConf.ALLOW_COLLATIONS_IN_MAP_KEYS.key -> "true") { + val utf8_df = generateTableData(expr.inputTypes.take(2), Utf8Binary) + val utf8_lcase_df = generateTableData(expr.inputTypes.take(2), Utf8Lcase) + + val utf8BinaryResult = try { + val df = utf8_df.selectExpr(transformExpressionToString(expr, Utf8Binary)) + df.getRows(1, 0) + scala.util.Right(df) + } catch { + case e: Throwable => scala.util.Left(e) + } + val utf8LcaseResult = try { + val df = utf8_lcase_df.selectExpr(transformExpressionToString(expr, Utf8Lcase)) + df.getRows(1, 0) + scala.util.Right(df) + } catch { + case e: Throwable => scala.util.Left(e) + } - val utf8_df = generateTableData(expr.inputTypes.take(2), Utf8Binary) - val utf8_lcase_df = generateTableData(expr.inputTypes.take(2), Utf8Lcase) - - val utf8BinaryResult = try { - val df = utf8_df.selectExpr(transformExpressionToString(expr, Utf8Binary)) - df.getRows(1, 0) - scala.util.Right(df) - 
} catch { - case e: Throwable => scala.util.Left(e) - } - val utf8LcaseResult = try { - val df = utf8_lcase_df.selectExpr(transformExpressionToString(expr, Utf8Lcase)) - df.getRows(1, 0) - scala.util.Right(df) - } catch { - case e: Throwable => scala.util.Left(e) - } - - assert(utf8BinaryResult.isLeft === utf8LcaseResult.isLeft) + assert(utf8BinaryResult.isLeft === utf8LcaseResult.isLeft) - if (utf8BinaryResult.isRight) { - val utf8BinaryResultChecked = utf8BinaryResult.getOrElse(null) - val utf8LcaseResultChecked = utf8LcaseResult.getOrElse(null) + if (utf8BinaryResult.isRight) { + val utf8BinaryResultChecked = utf8BinaryResult.getOrElse(null) + val utf8LcaseResultChecked = utf8LcaseResult.getOrElse(null) - val dt = utf8BinaryResultChecked.schema.fields.head.dataType + val dt = utf8BinaryResultChecked.schema.fields.head.dataType - dt match { - case st if utf8BinaryResultChecked != null && utf8LcaseResultChecked != null && - hasStringType(st) => - // scalastyle:off caselocale - assert(utf8BinaryResultChecked.getRows(1, 0).map(_.map(_.toLowerCase))(1) === - utf8LcaseResultChecked.getRows(1, 0).map(_.map(_.toLowerCase))(1)) + dt match { + case st if utf8BinaryResultChecked != null && utf8LcaseResultChecked != null && + hasStringType(st) => + // scalastyle:off caselocale + assert(utf8BinaryResultChecked.getRows(1, 0).map(_.map(_.toLowerCase))(1) === + utf8LcaseResultChecked.getRows(1, 0).map(_.map(_.toLowerCase))(1)) // scalastyle:on caselocale - case _ => - assert(utf8BinaryResultChecked.getRows(1, 0)(1) === - utf8LcaseResultChecked.getRows(1, 0)(1)) + case _ => + assert(utf8BinaryResultChecked.getRows(1, 0)(1) === + utf8LcaseResultChecked.getRows(1, 0)(1)) + } + } + else { + assert(utf8BinaryResult.getOrElse(new Exception()).getClass + == utf8LcaseResult.getOrElse(new Exception()).getClass) } - } - else { - assert(utf8BinaryResult.getOrElse(new Exception()).getClass - == utf8LcaseResult.getOrElse(new Exception()).getClass) } } } @@ -728,6 +732,7 @@ class 
CollationExpressionWalkerSuite extends SparkFunSuite with SharedSparkSessi // other functions which are not yet supported "to_avro", "from_avro", + "schema_of_avro", "to_protobuf", "from_protobuf" ) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/CollationSQLExpressionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/CollationSQLExpressionsSuite.scala index 6feb4587b816f..384411a0fd342 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/CollationSQLExpressionsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/CollationSQLExpressionsSuite.scala @@ -26,6 +26,7 @@ import org.apache.spark.{SparkConf, SparkException, SparkIllegalArgumentExceptio import org.apache.spark.sql.catalyst.{ExtendedAnalysisException, InternalRow} import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.expressions.aggregate.Mode +import org.apache.spark.sql.catalyst.util.CollationFactory import org.apache.spark.sql.internal.{SqlApiConf, SQLConf} import org.apache.spark.sql.test.SharedSparkSession import org.apache.spark.sql.types._ @@ -38,8 +39,19 @@ class CollationSQLExpressionsSuite with SharedSparkSession with ExpressionEvalHelper { - private val testSuppCollations = Seq("UTF8_BINARY", "UTF8_LCASE", "UNICODE", "UNICODE_CI") - private val testAdditionalCollations = Seq("UNICODE", "SR", "SR_CI", "SR_AI", "SR_CI_AI") + private val testSuppCollations = + Seq( + "UTF8_BINARY", + "UTF8_BINARY_RTRIM", + "UTF8_LCASE", + "UTF8_LCASE_RTRIM", + "UNICODE", + "UNICODE_RTRIM", + "UNICODE_CI", + "UNICODE_CI_RTRIM") + private val testAdditionalCollations = Seq("UNICODE", + "SR", "SR_RTRIM", "SR_CI", "SR_AI", "SR_CI_AI") + private val fullyQualifiedPrefix = s"${CollationFactory.CATALOG}.${CollationFactory.SCHEMA}." 
test("Support Md5 hash expression with collation") { case class Md5TestCase( @@ -262,11 +274,19 @@ class CollationSQLExpressionsSuite val testCases = Seq( UrlEncodeTestCase("https://spark.apache.org", "UTF8_BINARY", "https%3A%2F%2Fspark.apache.org"), + UrlEncodeTestCase("https://spark.apache.org", "UTF8_BINARY_RTRIM", + "https%3A%2F%2Fspark.apache.org"), UrlEncodeTestCase("https://spark.apache.org", "UTF8_LCASE", "https%3A%2F%2Fspark.apache.org"), + UrlEncodeTestCase("https://spark.apache.org", "UTF8_LCASE_RTRIM", + "https%3A%2F%2Fspark.apache.org"), UrlEncodeTestCase("https://spark.apache.org", "UNICODE", "https%3A%2F%2Fspark.apache.org"), + UrlEncodeTestCase("https://spark.apache.org", "UNICODE_RTRIM", + "https%3A%2F%2Fspark.apache.org"), UrlEncodeTestCase("https://spark.apache.org", "UNICODE_CI", + "https%3A%2F%2Fspark.apache.org"), + UrlEncodeTestCase("https://spark.apache.org", "UNICODE_CI_RTRIM", "https%3A%2F%2Fspark.apache.org") ) @@ -296,11 +316,19 @@ class CollationSQLExpressionsSuite val testCases = Seq( UrlDecodeTestCase("https%3A%2F%2Fspark.apache.org", "UTF8_BINARY", "https://spark.apache.org"), + UrlDecodeTestCase("https%3A%2F%2Fspark.apache.org", "UTF8_BINARY_RTRIM", + "https://spark.apache.org"), UrlDecodeTestCase("https%3A%2F%2Fspark.apache.org", "UTF8_LCASE", "https://spark.apache.org"), + UrlDecodeTestCase("https%3A%2F%2Fspark.apache.org", "UTF8_LCASE_RTRIM", + "https://spark.apache.org"), UrlDecodeTestCase("https%3A%2F%2Fspark.apache.org", "UNICODE", "https://spark.apache.org"), + UrlDecodeTestCase("https%3A%2F%2Fspark.apache.org", "UNICODE_RTRIM", + "https://spark.apache.org"), UrlDecodeTestCase("https%3A%2F%2Fspark.apache.org", "UNICODE_CI", + "https://spark.apache.org"), + UrlDecodeTestCase("https%3A%2F%2Fspark.apache.org", "UNICODE_CI_RTRIM", "https://spark.apache.org") ) @@ -331,11 +359,19 @@ class CollationSQLExpressionsSuite val testCases = Seq( ParseUrlTestCase("http://spark.apache.org/path?query=1", "UTF8_BINARY", "HOST", 
"spark.apache.org"), + ParseUrlTestCase("http://spark.apache.org/path?query=1", "UTF8_BINARY_RTRIM", "HOST", + "spark.apache.org"), ParseUrlTestCase("http://spark.apache.org/path?query=2", "UTF8_LCASE", "PATH", "/path"), + ParseUrlTestCase("http://spark.apache.org/path?query=2", "UTF8_LCASE_RTRIM", "PATH", + "/path"), ParseUrlTestCase("http://spark.apache.org/path?query=3", "UNICODE", "QUERY", "query=3"), + ParseUrlTestCase("http://spark.apache.org/path?query=3", "UNICODE_RTRIM", "QUERY", + "query=3"), ParseUrlTestCase("http://spark.apache.org/path?query=4", "UNICODE_CI", "PROTOCOL", + "http"), + ParseUrlTestCase("http://spark.apache.org/path?query=4", "UNICODE_CI_RTRIM", "PROTOCOL", "http") ) @@ -370,15 +406,32 @@ class CollationSQLExpressionsSuite Row(1), Seq( StructField("a", IntegerType, nullable = true) )), + CsvToStructsTestCase("1", "UTF8_BINARY_RTRIM", "'a INT'", "", + Row(1), Seq( + StructField("a", IntegerType, nullable = true) + )), CsvToStructsTestCase("true, 0.8", "UTF8_LCASE", "'A BOOLEAN, B DOUBLE'", "", Row(true, 0.8), Seq( StructField("A", BooleanType, nullable = true), StructField("B", DoubleType, nullable = true) )), + CsvToStructsTestCase("true, 0.8", "UTF8_LCASE_RTRIM", "'A BOOLEAN, B DOUBLE'", "", + Row(true, 0.8), Seq( + StructField("A", BooleanType, nullable = true), + StructField("B", DoubleType, nullable = true) + )), CsvToStructsTestCase("\"Spark\"", "UNICODE", "'a STRING'", "", + Row("Spark"), Seq( + StructField("a", StringType, nullable = true) + )), + CsvToStructsTestCase("\"Spark\"", "UTF8_BINARY", "'a STRING COLLATE UNICODE'", "", Row("Spark"), Seq( StructField("a", StringType("UNICODE"), nullable = true) )), + CsvToStructsTestCase("\"Spark\"", "UNICODE_RTRIM", "'a STRING COLLATE UNICODE_RTRIM'", "", + Row("Spark"), Seq( + StructField("a", StringType("UNICODE_RTRIM"), nullable = true) + )), CsvToStructsTestCase("26/08/2015", "UTF8_BINARY", "'time Timestamp'", ", map('timestampFormat', 'dd/MM/yyyy')", Row( new 
SimpleDateFormat("yyyy-MM-dd HH:mm:ss.S").parse("2015-08-26 00:00:00.0") @@ -413,10 +466,16 @@ class CollationSQLExpressionsSuite val testCases = Seq( SchemaOfCsvTestCase("1", "UTF8_BINARY", "STRUCT<_c0: INT>"), + SchemaOfCsvTestCase("1", "UTF8_BINARY_RTRIM", "STRUCT<_c0: INT>"), SchemaOfCsvTestCase("true,0.8", "UTF8_LCASE", "STRUCT<_c0: BOOLEAN, _c1: DOUBLE>"), + SchemaOfCsvTestCase("true,0.8", "UTF8_LCASE_RTRIM", + "STRUCT<_c0: BOOLEAN, _c1: DOUBLE>"), SchemaOfCsvTestCase("2015-08-26", "UNICODE", "STRUCT<_c0: DATE>"), + SchemaOfCsvTestCase("2015-08-26", "UNICODE_RTRIM", "STRUCT<_c0: DATE>"), SchemaOfCsvTestCase("abc", "UNICODE_CI", + "STRUCT<_c0: STRING>"), + SchemaOfCsvTestCase("abc", "UNICODE_CI_RTRIM", "STRUCT<_c0: STRING>") ) @@ -445,9 +504,14 @@ class CollationSQLExpressionsSuite val testCases = Seq( StructsToCsvTestCase("named_struct('a', 1, 'b', 2)", "UTF8_BINARY", "1,2"), + StructsToCsvTestCase("named_struct('a', 1, 'b', 2)", "UTF8_BINARY_RTRIM", "1,2"), StructsToCsvTestCase("named_struct('A', true, 'B', 2.0)", "UTF8_LCASE", "true,2.0"), + StructsToCsvTestCase("named_struct('A', true, 'B', 2.0)", "UTF8_LCASE_RTRIM", "true,2.0"), StructsToCsvTestCase("named_struct()", "UNICODE", null), + StructsToCsvTestCase("named_struct()", "UNICODE_RTRIM", null), StructsToCsvTestCase("named_struct('time', to_timestamp('2015-08-26'))", "UNICODE_CI", + "2015-08-26T00:00:00.000-07:00"), + StructsToCsvTestCase("named_struct('time', to_timestamp('2015-08-26'))", "UNICODE_CI_RTRIM", "2015-08-26T00:00:00.000-07:00") ) @@ -478,9 +542,13 @@ class CollationSQLExpressionsSuite val testCases = Seq( ConvTestCase("100", "2", "10", "UTF8_BINARY", "4"), + ConvTestCase("100", "2", "10", "UTF8_BINARY_RTRIM", "4"), ConvTestCase("100", "2", "10", "UTF8_LCASE", "4"), + ConvTestCase("100", "2", "10", "UTF8_LCASE_RTRIM", "4"), ConvTestCase("100", "2", "10", "UNICODE", "4"), - ConvTestCase("100", "2", "10", "UNICODE_CI", "4") + ConvTestCase("100", "2", "10", "UNICODE_RTRIM", "4"), + 
ConvTestCase("100", "2", "10", "UNICODE_CI", "4"), + ConvTestCase("100", "2", "10", "UNICODE_CI_RTRIM", "4") ) testCases.foreach(t => { val query = @@ -502,9 +570,13 @@ class CollationSQLExpressionsSuite val testCases = Seq( BinTestCase("13", "UTF8_BINARY", "1101"), + BinTestCase("13", "UTF8_BINARY_RTRIM", "1101"), BinTestCase("13", "UTF8_LCASE", "1101"), + BinTestCase("13", "UTF8_LCASE_RTRIM", "1101"), BinTestCase("13", "UNICODE", "1101"), - BinTestCase("13", "UNICODE_CI", "1101") + BinTestCase("13", "UNICODE_RTRIM", "1101"), + BinTestCase("13", "UNICODE_CI", "1101"), + BinTestCase("13", "UNICODE_CI_RTRIM", "1101") ) testCases.foreach(t => { val query = @@ -527,9 +599,13 @@ class CollationSQLExpressionsSuite val testCases = Seq( HexTestCase("13", "UTF8_BINARY", "D"), + HexTestCase("13", "UTF8_BINARY_RTRIM", "D"), HexTestCase("13", "UTF8_LCASE", "D"), + HexTestCase("13", "UTF8_LCASE_RTRIM", "D"), HexTestCase("13", "UNICODE", "D"), - HexTestCase("13", "UNICODE_CI", "D") + HexTestCase("13", "UNICODE_RTRIM", "D"), + HexTestCase("13", "UNICODE_CI", "D"), + HexTestCase("13", "UNICODE_CI_RTRIM", "D") ) testCases.foreach(t => { val query = @@ -552,10 +628,15 @@ class CollationSQLExpressionsSuite val testCases = Seq( HexTestCase("Spark SQL", "UTF8_BINARY", "537061726B2053514C"), + HexTestCase("Spark SQL", "UTF8_BINARY_RTRIM", "537061726B2053514C"), HexTestCase("Spark SQL", "UTF8_LCASE", "537061726B2053514C"), + HexTestCase("Spark SQL", "UTF8_LCASE_RTRIM", "537061726B2053514C"), HexTestCase("Spark SQL", "UNICODE", "537061726B2053514C"), + HexTestCase("Spark SQL", "UNICODE_RTRIM", "537061726B2053514C"), HexTestCase("Spark SQL", "UNICODE_CI", "537061726B2053514C"), - HexTestCase("Spark SQL", "DE_CI_AI", "537061726B2053514C") + HexTestCase("Spark SQL", "UNICODE_CI_RTRIM", "537061726B2053514C"), + HexTestCase("Spark SQL", "DE_CI_AI", "537061726B2053514C"), + HexTestCase("Spark SQL", "DE_CI_AI_RTRIM", "537061726B2053514C") ) testCases.foreach(t => { val query = @@ -576,9 +657,13 
@@ class CollationSQLExpressionsSuite val testCases = Seq( UnHexTestCase("537061726B2053514C", "UTF8_BINARY", "Spark SQL"), + UnHexTestCase("537061726B2053514C", "UTF8_BINARY_RTRIM", "Spark SQL"), UnHexTestCase("537061726B2053514C", "UTF8_LCASE", "Spark SQL"), + UnHexTestCase("537061726B2053514C", "UTF8_LCASE_RTRIM", "Spark SQL"), UnHexTestCase("537061726B2053514C", "UNICODE", "Spark SQL"), + UnHexTestCase("537061726B2053514C", "UNICODE_RTRIM", "Spark SQL"), UnHexTestCase("537061726B2053514C", "UNICODE_CI", "Spark SQL"), + UnHexTestCase("537061726B2053514C", "UNICODE_CI_RTRIM", "Spark SQL"), UnHexTestCase("537061726B2053514C", "DE", "Spark SQL") ) testCases.foreach(t => { @@ -607,16 +692,30 @@ class CollationSQLExpressionsSuite "xpath_boolean", "UTF8_BINARY", true, BooleanType), XPathTestCase("12", "sum(A/B)", "xpath_short", "UTF8_BINARY", 3, ShortType), + XPathTestCase("1", "a/b", + "xpath_boolean", "UTF8_BINARY_RTRIM", true, BooleanType), + XPathTestCase("12", "sum(A/B)", + "xpath_short", "UTF8_BINARY_RTRIM", 3, ShortType), XPathTestCase("34", "sum(a/b)", "xpath_int", "UTF8_LCASE", 7, IntegerType), XPathTestCase("56", "sum(A/B)", "xpath_long", "UTF8_LCASE", 11, LongType), + XPathTestCase("34", "sum(a/b)", + "xpath_int", "UTF8_LCASE_RTRIM", 7, IntegerType), + XPathTestCase("56", "sum(A/B)", + "xpath_long", "UTF8_LCASE_RTRIM", 11, LongType), XPathTestCase("78", "sum(a/b)", "xpath_float", "UNICODE", 15.0, FloatType), XPathTestCase("90", "sum(A/B)", "xpath_double", "UNICODE", 9.0, DoubleType), + XPathTestCase("78", "sum(a/b)", + "xpath_float", "UNICODE_RTRIM", 15.0, FloatType), + XPathTestCase("90", "sum(A/B)", + "xpath_double", "UNICODE_RTRIM", 9.0, DoubleType), XPathTestCase("bcc", "a/c", "xpath_string", "UNICODE_CI", "cc", StringType("UNICODE_CI")), + XPathTestCase("bcc ", "a/c", + "xpath_string", "UNICODE_CI_RTRIM", "cc ", StringType("UNICODE_CI_RTRIM")), XPathTestCase("b1b2b3c1c2", "a/b/text()", "xpath", "UNICODE_CI", Array("b1", "b2", "b3"), 
ArrayType(StringType("UNICODE_CI"))) ) @@ -645,10 +744,15 @@ class CollationSQLExpressionsSuite val testCases = Seq( StringSpaceTestCase(1, "UTF8_BINARY", " "), + StringSpaceTestCase(1, "UTF8_BINARY_RTRIM", " "), StringSpaceTestCase(2, "UTF8_LCASE", " "), + StringSpaceTestCase(2, "UTF8_LCASE_RTRIM", " "), StringSpaceTestCase(3, "UNICODE", " "), + StringSpaceTestCase(3, "UNICODE_RTRIM", " "), StringSpaceTestCase(4, "UNICODE_CI", " "), - StringSpaceTestCase(5, "AF_CI_AI", " ") + StringSpaceTestCase(4, "UNICODE_CI_RTRIM", " "), + StringSpaceTestCase(5, "AF_CI_AI", " "), + StringSpaceTestCase(5, "AF_CI_AI_RTRIM", " ") ) // Supported collations @@ -678,9 +782,13 @@ class CollationSQLExpressionsSuite val testCases = Seq( ToNumberTestCase("123", "UTF8_BINARY", "999", 123, DecimalType(3, 0)), + ToNumberTestCase("123", "UTF8_BINARY_RTRIM", "999", 123, DecimalType(3, 0)), ToNumberTestCase("1", "UTF8_LCASE", "0.00", 1.00, DecimalType(3, 2)), + ToNumberTestCase("1", "UTF8_LCASE_RTRIM", "0.00", 1.00, DecimalType(3, 2)), ToNumberTestCase("99,999", "UNICODE", "99,999", 99999, DecimalType(5, 0)), - ToNumberTestCase("$14.99", "UNICODE_CI", "$99.99", 14.99, DecimalType(4, 2)) + ToNumberTestCase("99,999", "UNICODE_RTRIM", "99,999", 99999, DecimalType(5, 0)), + ToNumberTestCase("$14.99", "UNICODE_CI", "$99.99", 14.99, DecimalType(4, 2)), + ToNumberTestCase("$14.99", "UNICODE_CI_RTRIM", "$99.99", 14.99, DecimalType(4, 2)) ) // Supported collations (ToNumber) @@ -748,9 +856,13 @@ class CollationSQLExpressionsSuite val testCases = Seq( ToCharTestCase(12, "UTF8_BINARY", "999", " 12"), + ToCharTestCase(12, "UTF8_BINARY_RTRIM", "999", " 12"), ToCharTestCase(34, "UTF8_LCASE", "000D00", "034.00"), + ToCharTestCase(34, "UTF8_LCASE_RTRIM", "000D00", "034.00"), ToCharTestCase(56, "UNICODE", "$99.99", "$56.00"), - ToCharTestCase(78, "UNICODE_CI", "99D9S", "78.0+") + ToCharTestCase(56, "UNICODE_RTRIM", "$99.99", "$56.00"), + ToCharTestCase(78, "UNICODE_CI", "99D9S", "78.0+"), + ToCharTestCase(78, 
"UNICODE_CI_RTRIM", "99D9S", "78.0+") ) // Supported collations @@ -779,9 +891,13 @@ class CollationSQLExpressionsSuite val testCases = Seq( GetJsonObjectTestCase("{\"a\":\"b\"}", "$.a", "UTF8_BINARY", "b"), + GetJsonObjectTestCase("{\"a\":\"b\"}", "$.a", "UTF8_BINARY_RTRIM", "b"), GetJsonObjectTestCase("{\"A\":\"1\"}", "$.A", "UTF8_LCASE", "1"), + GetJsonObjectTestCase("{\"A\":\"1\"}", "$.A", "UTF8_LCASE_RTRIM", "1"), GetJsonObjectTestCase("{\"x\":true}", "$.x", "UNICODE", "true"), - GetJsonObjectTestCase("{\"X\":1}", "$.X", "UNICODE_CI", "1") + GetJsonObjectTestCase("{\"x\":true}", "$.x", "UNICODE_RTRIM", "true"), + GetJsonObjectTestCase("{\"X\":1}", "$.X", "UNICODE_CI", "1"), + GetJsonObjectTestCase("{\"X\":1}", "$.X", "UNICODE_CI_RTRIM", "1") ) // Supported collations @@ -811,10 +927,16 @@ class CollationSQLExpressionsSuite val testCases = Seq( JsonTupleTestCase("{\"a\":1, \"b\":2}", "'a', 'b'", "UTF8_BINARY", Row("1", "2")), + JsonTupleTestCase("{\"a\":1, \"b\":2}", "'a', 'b'", "UTF8_BINARY_RTRIM", + Row("1", "2")), JsonTupleTestCase("{\"A\":\"3\", \"B\":\"4\"}", "'A', 'B'", "UTF8_LCASE", Row("3", "4")), + JsonTupleTestCase("{\"A\":\"3\", \"B\":\"4\"}", "'A', 'B'", "UTF8_LCASE_RTRIM", + Row("3", "4")), JsonTupleTestCase("{\"x\":true, \"y\":false}", "'x', 'y'", "UNICODE", Row("true", "false")), + JsonTupleTestCase("{\"x\":true, \"y\":false}", "'x', 'y'", "UNICODE_RTRIM", + Row("true", "false")), JsonTupleTestCase("{\"X\":null, \"Y\":null}", "'X', 'Y'", "UNICODE_CI", Row(null, null)) ) @@ -846,12 +968,20 @@ class CollationSQLExpressionsSuite val testCases = Seq( JsonToStructsTestCase("{\"a\":1, \"b\":2.0}", "a INT, b DOUBLE", "UTF8_BINARY", Row(Row(1, 2.0))), + JsonToStructsTestCase("{\"a\":1, \"b\":2.0}", "a INT, b DOUBLE", + "UTF8_BINARY_RTRIM", Row(Row(1, 2.0))), JsonToStructsTestCase("{\"A\":\"3\", \"B\":4}", "A STRING COLLATE UTF8_LCASE, B INT", "UTF8_LCASE", Row(Row("3", 4))), + JsonToStructsTestCase("{\"A\":\"3\", \"B\":4}", "A STRING COLLATE UTF8_LCASE, 
B INT", + "UTF8_LCASE_RTRIM", Row(Row("3", 4))), JsonToStructsTestCase("{\"x\":true, \"y\":null}", "x BOOLEAN, y VOID", "UNICODE", Row(Row(true, null))), + JsonToStructsTestCase("{\"x\":true, \"y\":null}", "x BOOLEAN, y VOID", + "UNICODE_RTRIM", Row(Row(true, null))), + JsonToStructsTestCase("{\"X\":null, \"Y\":false}", "X VOID, Y BOOLEAN", + "UNICODE_CI", Row(Row(null, false))), JsonToStructsTestCase("{\"X\":null, \"Y\":false}", "X VOID, Y BOOLEAN", - "UNICODE_CI", Row(Row(null, false))) + "UNICODE_CI_RTRIM", Row(Row(null, false))) ) // Supported collations @@ -880,12 +1010,20 @@ class CollationSQLExpressionsSuite val testCases = Seq( StructsToJsonTestCase("named_struct('a', 1, 'b', 2)", "UTF8_BINARY", Row("{\"a\":1,\"b\":2}")), + StructsToJsonTestCase("named_struct('a', 1, 'b', 2)", + "UTF8_BINARY_RTRIM", Row("{\"a\":1,\"b\":2}")), StructsToJsonTestCase("array(named_struct('a', 1, 'b', 2))", "UTF8_LCASE", Row("[{\"a\":1,\"b\":2}]")), + StructsToJsonTestCase("array(named_struct('a', 1, 'b', 2))", + "UTF8_LCASE_RTRIM", Row("[{\"a\":1,\"b\":2}]")), StructsToJsonTestCase("map('a', named_struct('b', 1))", "UNICODE", Row("{\"a\":{\"b\":1}}")), + StructsToJsonTestCase("map('a', named_struct('b', 1))", + "UNICODE_RTRIM", Row("{\"a\":{\"b\":1}}")), StructsToJsonTestCase("array(map('a', 1))", - "UNICODE_CI", Row("[{\"a\":1}]")) + "UNICODE_CI", Row("[{\"a\":1}]")), + StructsToJsonTestCase("array(map('a', 1))", + "UNICODE_CI_RTRIM", Row("[{\"a\":1}]")) ) // Supported collations @@ -913,9 +1051,13 @@ class CollationSQLExpressionsSuite val testCases = Seq( LengthOfJsonArrayTestCase("'[1,2,3,4]'", "UTF8_BINARY", Row(4)), + LengthOfJsonArrayTestCase("'[1,2,3,4]'", "UTF8_BINARY_RTRIM", Row(4)), LengthOfJsonArrayTestCase("'[1,2,3,{\"f1\":1,\"f2\":[5,6]},4]'", "UTF8_LCASE", Row(5)), + LengthOfJsonArrayTestCase("'[1,2,3,{\"f1\":1,\"f2\":[5,6]},4]'", "UTF8_LCASE_RTRIM", Row(5)), LengthOfJsonArrayTestCase("'[1,2'", "UNICODE", Row(null)), - LengthOfJsonArrayTestCase("'['", 
"UNICODE_CI", Row(null)) + LengthOfJsonArrayTestCase("'[1,2'", "UNICODE_RTRIM", Row(null)), + LengthOfJsonArrayTestCase("'['", "UNICODE_CI", Row(null)), + LengthOfJsonArrayTestCase("'['", "UNICODE_CI_RTRIM", Row(null)) ) // Supported collations @@ -943,11 +1085,19 @@ class CollationSQLExpressionsSuite val testCases = Seq( JsonObjectKeysJsonArrayTestCase("{}", "UTF8_BINARY", Row(Seq())), + JsonObjectKeysJsonArrayTestCase("{}", "UTF8_BINARY_RTRIM", + Row(Seq())), JsonObjectKeysJsonArrayTestCase("{\"k\":", "UTF8_LCASE", Row(null)), + JsonObjectKeysJsonArrayTestCase("{\"k\":", "UTF8_LCASE_RTRIM", + Row(null)), JsonObjectKeysJsonArrayTestCase("{\"k1\": \"v1\"}", "UNICODE", Row(Seq("k1"))), + JsonObjectKeysJsonArrayTestCase("{\"k1\": \"v1\"}", "UNICODE_RTRIM", + Row(Seq("k1"))), JsonObjectKeysJsonArrayTestCase("{\"k1\":1,\"k2\":{\"k3\":3, \"k4\":4}}", "UNICODE_CI", + Row(Seq("k1", "k2"))), + JsonObjectKeysJsonArrayTestCase("{\"k1\":1,\"k2\":{\"k3\":3, \"k4\":4}}", "UNICODE_CI_RTRIM", Row(Seq("k1", "k2"))) ) @@ -977,12 +1127,20 @@ class CollationSQLExpressionsSuite val testCases = Seq( SchemaOfJsonTestCase("'[{\"col\":0}]'", "UTF8_BINARY", Row("ARRAY>")), + SchemaOfJsonTestCase("'[{\"col\":0}]'", + "UTF8_BINARY_RTRIM", Row("ARRAY>")), SchemaOfJsonTestCase("'[{\"col\":01}]', map('allowNumericLeadingZeros', 'true')", "UTF8_LCASE", Row("ARRAY>")), + SchemaOfJsonTestCase("'[{\"col\":01}]', map('allowNumericLeadingZeros', 'true')", + "UTF8_LCASE_RTRIM", Row("ARRAY>")), SchemaOfJsonTestCase("'[]'", "UNICODE", Row("ARRAY")), + SchemaOfJsonTestCase("'[]'", + "UNICODE_RTRIM", Row("ARRAY")), SchemaOfJsonTestCase("''", - "UNICODE_CI", Row("STRING")) + "UNICODE_CI", Row("STRING")), + SchemaOfJsonTestCase("''", + "UNICODE_CI_RTRIM", Row("STRING")) ) // Supported collations @@ -1023,10 +1181,7 @@ class CollationSQLExpressionsSuite Map("c" -> "1", "č" -> "2", "ć" -> "3")) ) val unsupportedTestCases = Seq( - StringToMapTestCase("a:1,b:2,c:3", "?", "?", "UNICODE_AI", null), - 
StringToMapTestCase("a:1,b:2,c:3", "?", "?", "UNICODE_RTRIM", null), - StringToMapTestCase("a:1,b:2,c:3", "?", "?", "UTF8_BINARY_RTRIM", null), - StringToMapTestCase("a:1,b:2,c:3", "?", "?", "UTF8_LCASE_RTRIM", null)) + StringToMapTestCase("a:1,b:2,c:3", "?", "?", "UNICODE_AI", null)) testCases.foreach(t => { // Unit test. val text = Literal.create(t.text, StringType(t.collation)) @@ -1073,9 +1228,13 @@ class CollationSQLExpressionsSuite case class RaiseErrorTestCase(errorMessage: String, collationName: String) val testCases = Seq( RaiseErrorTestCase("custom error message 1", "UTF8_BINARY"), + RaiseErrorTestCase("custom error message 1", "UTF8_BINARY_RTRIM"), RaiseErrorTestCase("custom error message 2", "UTF8_LCASE"), + RaiseErrorTestCase("custom error message 2", "UTF8_LCASE_RTRIM"), RaiseErrorTestCase("custom error message 3", "UNICODE"), - RaiseErrorTestCase("custom error message 4", "UNICODE_CI") + RaiseErrorTestCase("custom error message 3", "UNICODE_RTRIM"), + RaiseErrorTestCase("custom error message 4", "UNICODE_CI"), + RaiseErrorTestCase("custom error message 4", "UNICODE_CI_RTRIM") ) testCases.foreach(t => { withSQLConf(SqlApiConf.DEFAULT_COLLATION -> t.collationName) { @@ -1094,7 +1253,13 @@ class CollationSQLExpressionsSuite test("Support CurrentDatabase/Catalog/User expressions with collation") { // Supported collations - Seq("UTF8_LCASE", "UNICODE", "UNICODE_CI", "SR_CI_AI").foreach(collationName => + Seq( + "UTF8_LCASE", + "UTF8_LCASE_RTRIM", + "UNICODE", + "UNICODE_RTRIM", + "UNICODE_CI", + "SR_CI_AI").foreach(collationName => withSQLConf(SqlApiConf.DEFAULT_COLLATION -> collationName) { val queryDatabase = sql("SELECT current_schema()") val queryCatalog = sql("SELECT current_catalog()") @@ -1110,7 +1275,14 @@ class CollationSQLExpressionsSuite test("Support Uuid misc expression with collation") { // Supported collations - Seq("UTF8_LCASE", "UNICODE", "UNICODE_CI", "NO_CI_AI").foreach(collationName => + Seq( + "UTF8_LCASE", + "UTF8_LCASE_RTRIM", + 
"UNICODE", + "UNICODE_RTRIM", + "UNICODE_CI", + "UNICODE_CI_RTRIM", + "NO_CI_AI").foreach(collationName => withSQLConf(SqlApiConf.DEFAULT_COLLATION -> collationName) { val query = s"SELECT uuid()" // Result & data type @@ -1285,15 +1457,33 @@ class CollationSQLExpressionsSuite Row(1), Seq( StructField("a", IntegerType, nullable = true) )), + XmlToStructsTestCase("

1

", "UTF8_BINARY_RTRIM", "'a INT'", "", + Row(1), Seq( + StructField("a", IntegerType, nullable = true) + )), XmlToStructsTestCase("

true0.8

", "UTF8_LCASE", "'A BOOLEAN, B DOUBLE'", "", Row(true, 0.8), Seq( StructField("A", BooleanType, nullable = true), StructField("B", DoubleType, nullable = true) )), + XmlToStructsTestCase("

true0.8

", "UTF8_LCASE_RTRIM", + "'A BOOLEAN, B DOUBLE'", "", Row(true, 0.8), Seq( + StructField("A", BooleanType, nullable = true), + StructField("B", DoubleType, nullable = true) + )), XmlToStructsTestCase("

Spark

", "UNICODE", "'s STRING'", "", + Row("Spark"), Seq( + StructField("s", StringType, nullable = true) + )), + XmlToStructsTestCase("

Spark

", "UTF8_BINARY", "'s STRING COLLATE UNICODE'", "", Row("Spark"), Seq( StructField("s", StringType("UNICODE"), nullable = true) )), + XmlToStructsTestCase("

Spark

", "UNICODE_RTRIM", + "'s STRING COLLATE UNICODE_RTRIM'", "", + Row("Spark"), Seq( + StructField("s", StringType("UNICODE_RTRIM"), nullable = true) + )), XmlToStructsTestCase("

", "UNICODE_CI", "'time Timestamp'", ", map('timestampFormat', 'dd/MM/yyyy')", Row( new SimpleDateFormat("yyyy-MM-dd HH:mm:ss.S").parse("2015-08-26 00:00:00.0") @@ -1327,10 +1517,16 @@ class CollationSQLExpressionsSuite val testCases = Seq( SchemaOfXmlTestCase("

1

", "UTF8_BINARY", "STRUCT"), + SchemaOfXmlTestCase("

1

", "UTF8_BINARY_RTRIM", "STRUCT"), SchemaOfXmlTestCase("

true0.8

", "UTF8_LCASE", "STRUCT"), + SchemaOfXmlTestCase("

true0.8

", "UTF8_LCASE_RTRIM", + "STRUCT"), SchemaOfXmlTestCase("

", "UNICODE", "STRUCT<>"), + SchemaOfXmlTestCase("

", "UNICODE_RTRIM", "STRUCT<>"), SchemaOfXmlTestCase("

123

", "UNICODE_CI", + "STRUCT>"), + SchemaOfXmlTestCase("

123

", "UNICODE_CI_RTRIM", "STRUCT>") ) @@ -1363,6 +1559,11 @@ class CollationSQLExpressionsSuite | 1 | 2 |""".stripMargin), + StructsToXmlTestCase("named_struct('a', 1, 'b', 2)", "UTF8_BINARY_RTRIM", + s""" + | 1 + | 2 + |""".stripMargin), StructsToXmlTestCase("named_struct('A', true, 'B', 2.0)", "UTF8_LCASE", s""" | true @@ -1373,6 +1574,11 @@ class CollationSQLExpressionsSuite | aa | bb |""".stripMargin), + StructsToXmlTestCase("named_struct('A', 'aa', 'B', 'bb')", "UTF8_LCASE_RTRIM", + s""" + | aa + | bb + |""".stripMargin), StructsToXmlTestCase("named_struct('A', 'aa', 'B', 'bb')", "UTF8_BINARY", s""" | aa @@ -1380,6 +1586,8 @@ class CollationSQLExpressionsSuite |""".stripMargin), StructsToXmlTestCase("named_struct()", "UNICODE", ""), + StructsToXmlTestCase("named_struct()", "UNICODE_RTRIM", + ""), StructsToXmlTestCase("named_struct('time', to_timestamp('2015-08-26'))", "UNICODE_CI", s""" | @@ -1411,9 +1619,13 @@ class CollationSQLExpressionsSuite val testCases = Seq( ParseJsonTestCase("{\"a\":1,\"b\":2}", "UTF8_BINARY", "{\"a\":1,\"b\":2}"), + ParseJsonTestCase("{\"a\":1,\"b\":2}", "UTF8_BINARY_RTRIM", "{\"a\":1,\"b\":2}"), ParseJsonTestCase("{\"A\":3,\"B\":4}", "UTF8_LCASE", "{\"A\":3,\"B\":4}"), + ParseJsonTestCase("{\"A\":3,\"B\":4}", "UTF8_LCASE_RTRIM", "{\"A\":3,\"B\":4}"), ParseJsonTestCase("{\"c\":5,\"d\":6}", "UNICODE", "{\"c\":5,\"d\":6}"), - ParseJsonTestCase("{\"C\":7,\"D\":8}", "UNICODE_CI", "{\"C\":7,\"D\":8}") + ParseJsonTestCase("{\"c\":5,\"d\":6}", "UNICODE_RTRIM", "{\"c\":5,\"d\":6}"), + ParseJsonTestCase("{\"C\":7,\"D\":8}", "UNICODE_CI", "{\"C\":7,\"D\":8}"), + ParseJsonTestCase("{\"C\":7,\"D\":8}", "UNICODE_CI_RTRIM", "{\"C\":7,\"D\":8}") ) // Supported collations (ParseJson) @@ -1483,9 +1695,13 @@ class CollationSQLExpressionsSuite val testCases = Seq( IsVariantNullTestCase("'null'", "UTF8_BINARY", result = true), + IsVariantNullTestCase("'null'", "UTF8_BINARY_RTRIM", result = true), IsVariantNullTestCase("'\"null\"'", "UTF8_LCASE", result = 
false), + IsVariantNullTestCase("'\"null\"'", "UTF8_LCASE_RTRIM", result = false), IsVariantNullTestCase("'13'", "UNICODE", result = false), - IsVariantNullTestCase("null", "UNICODE_CI", result = false) + IsVariantNullTestCase("'13'", "UNICODE_RTRIM", result = false), + IsVariantNullTestCase("null", "UNICODE_CI", result = false), + IsVariantNullTestCase("null", "UNICODE_CI_RTRIM", result = false) ) // Supported collations @@ -1514,9 +1730,15 @@ class CollationSQLExpressionsSuite val testCases = Seq( VariantGetTestCase("{\"a\": 1}", "$.a", "int", "UTF8_BINARY", 1, IntegerType), + VariantGetTestCase("{\"a\": 1}", "$.a", "int", "UTF8_BINARY_RTRIM", 1, IntegerType), VariantGetTestCase("{\"a\": 1}", "$.b", "int", "UTF8_LCASE", null, IntegerType), - VariantGetTestCase("[1, \"2\"]", "$[1]", "string", "UNICODE", "2", StringType("UNICODE")), + VariantGetTestCase("[1, \"2\"]", "$[1]", "string", "UNICODE", "2", + StringType), + VariantGetTestCase("[1, \"2\"]", "$[1]", "string collate unicode", "UTF8_BINARY", "2", + StringType("UNICODE")), VariantGetTestCase("[1, \"2\"]", "$[2]", "string", "UNICODE_CI", null, + StringType), + VariantGetTestCase("[1, \"2\"]", "$[2]", "string collate unicode_CI", "UTF8_BINARY", null, StringType("UNICODE_CI")) ) @@ -1595,6 +1817,14 @@ class CollationSQLExpressionsSuite StructField("value", VariantType, nullable = false) ) ), + VariantExplodeTestCase("[\"hello\", \"world\"]", "UTF8_BINARY_RTRIM", + Row(0, "null", "\"hello\"").toString() + Row(1, "null", "\"world\"").toString(), + Seq[StructField]( + StructField("pos", IntegerType, nullable = false), + StructField("key", StringType("UTF8_BINARY_RTRIM")), + StructField("value", VariantType, nullable = false) + ) + ), VariantExplodeTestCase("[\"Spark\", \"SQL\"]", "UTF8_LCASE", Row(0, "null", "\"Spark\"").toString() + Row(1, "null", "\"SQL\"").toString(), Seq[StructField]( @@ -1603,6 +1833,14 @@ class CollationSQLExpressionsSuite StructField("value", VariantType, nullable = false) ) ), + 
VariantExplodeTestCase("[\"Spark\", \"SQL\"]", "UTF8_LCASE_RTRIM", + Row(0, "null", "\"Spark\"").toString() + Row(1, "null", "\"SQL\"").toString(), + Seq[StructField]( + StructField("pos", IntegerType, nullable = false), + StructField("key", StringType("UTF8_LCASE_RTRIM")), + StructField("value", VariantType, nullable = false) + ) + ), VariantExplodeTestCase("{\"a\": true, \"b\": 3.14}", "UNICODE", Row(0, "a", "true").toString() + Row(1, "b", "3.14").toString(), Seq[StructField]( @@ -1611,6 +1849,14 @@ class CollationSQLExpressionsSuite StructField("value", VariantType, nullable = false) ) ), + VariantExplodeTestCase("{\"a\": true, \"b\": 3.14}", "UNICODE_RTRIM", + Row(0, "a", "true").toString() + Row(1, "b", "3.14").toString(), + Seq[StructField]( + StructField("pos", IntegerType, nullable = false), + StructField("key", StringType("UNICODE_RTRIM")), + StructField("value", VariantType, nullable = false) + ) + ), VariantExplodeTestCase("{\"A\": 9.99, \"B\": false}", "UNICODE_CI", Row(0, "A", "9.99").toString() + Row(1, "B", "false").toString(), Seq[StructField]( @@ -1646,11 +1892,17 @@ class CollationSQLExpressionsSuite val testCases = Seq( SchemaOfVariantTestCase("null", "UTF8_BINARY", "VOID"), + SchemaOfVariantTestCase("null", "UTF8_BINARY_RTRIM", "VOID"), SchemaOfVariantTestCase("[]", "UTF8_LCASE", "ARRAY"), + SchemaOfVariantTestCase("[]", "UTF8_LCASE_RTRIM", "ARRAY"), SchemaOfVariantTestCase("[{\"a\":true,\"b\":0}]", "UNICODE", "ARRAY>"), + SchemaOfVariantTestCase("[{\"a\":true,\"b\":0}]", "UNICODE_RTRIM", + "ARRAY>"), SchemaOfVariantTestCase("[{\"A\":\"x\",\"B\":-1.00}]", "UNICODE_CI", - "ARRAY>") + "ARRAY>"), + SchemaOfVariantTestCase("[{\"A\":\"x\",\"B\":-1.00}]", "UNICODE_CI_RTRIM", + "ARRAY>") ) // Supported collations @@ -1677,11 +1929,18 @@ class CollationSQLExpressionsSuite val testCases = Seq( SchemaOfVariantAggTestCase("('1'), ('2'), ('3')", "UTF8_BINARY", "BIGINT"), + SchemaOfVariantAggTestCase("('1'), ('2'), ('3')", "UTF8_BINARY_RTRIM", "BIGINT"), 
SchemaOfVariantAggTestCase("('true'), ('false'), ('true')", "UTF8_LCASE", "BOOLEAN"), + SchemaOfVariantAggTestCase("('true'), ('false'), ('true')", "UTF8_LCASE_RTRIM", "BOOLEAN"), SchemaOfVariantAggTestCase("('{\"a\": 1}'), ('{\"b\": true}'), ('{\"c\": 1.23}')", "UNICODE", "OBJECT"), + SchemaOfVariantAggTestCase("('{\"a\": 1}'), ('{\"b\": true}'), ('{\"c\": 1.23}')", + "UNICODE_RTRIM", "OBJECT"), SchemaOfVariantAggTestCase("('{\"A\": \"x\"}'), ('{\"B\": 9.99}'), ('{\"C\": 0}')", - "UNICODE_CI", "OBJECT") + "UNICODE_CI", "OBJECT"), + SchemaOfVariantAggTestCase("('{\"A\": \"x\"}'), ('{\"B\": 9.99}'), ('{\"C\": 0}')", + "UNICODE_CI_RTRIM", "OBJECT" + ) ) // Supported collations @@ -1701,7 +1960,16 @@ class CollationSQLExpressionsSuite test("Support InputFileName expression with collation") { // Supported collations - Seq("UTF8_BINARY", "UTF8_LCASE", "UNICODE", "UNICODE_CI", "MT_CI_AI").foreach(collationName => { + Seq( + "UTF8_BINARY", + "UTF8_BINARY_RTRIM", + "UTF8_LCASE", + "UTF8_LCASE_RTRIM", + "UNICODE", + "UNICODE_RTRIM", + "UNICODE_CI", + "UNICODE_CI_RTRIM", + "MT_CI_AI").foreach(collationName => { val query = s""" |select input_file_name() @@ -1720,9 +1988,13 @@ class CollationSQLExpressionsSuite case class DateFormatTestCase[R](date: String, format: String, collation: String, result: R) val testCases = Seq( DateFormatTestCase("2021-01-01", "yyyy-MM-dd", "UTF8_BINARY", "2021-01-01"), + DateFormatTestCase("2021-01-01", "yyyy-MM-dd", "UTF8_BINARY_RTRIM", "2021-01-01"), DateFormatTestCase("2021-01-01", "yyyy-dd", "UTF8_LCASE", "2021-01"), + DateFormatTestCase("2021-01-01", "yyyy-dd", "UTF8_LCASE_RTRIM", "2021-01"), DateFormatTestCase("2021-01-01", "yyyy-MM-dd", "UNICODE", "2021-01-01"), - DateFormatTestCase("2021-01-01", "yyyy", "UNICODE_CI", "2021") + DateFormatTestCase("2021-01-01", "yyyy-MM-dd", "UNICODE_RTRIM", "2021-01-01"), + DateFormatTestCase("2021-01-01", "yyyy", "UNICODE_CI", "2021"), + DateFormatTestCase("2021-01-01", "yyyy", "UNICODE_CI_RTRIM", "2021") 
) for { @@ -1749,7 +2021,16 @@ class CollationSQLExpressionsSuite } test("Support mode for string expression with collation - Basic Test") { - Seq("utf8_binary", "UTF8_LCASE", "unicode_ci", "unicode", "NL_AI").foreach { collationId => + Seq( + "utf8_binary", + "utf8_binary_rtrim", + "UTF8_LCASE", + "UTF8_LCASE_RTRIM", + "unicode_ci", + "unicode_ci_rtrim", + "unicode", + "unicode_rtrim", + "NL_AI").foreach { collationId => val query = s"SELECT mode(collate('abc', '${collationId}'))" checkAnswer(sql(query), Row("abc")) assert(sql(query).schema.fields.head.dataType.sameType(StringType(collationId))) @@ -1760,9 +2041,13 @@ class CollationSQLExpressionsSuite case class ModeTestCase[R](collationId: String, bufferValues: Map[String, Long], result: R) val testCases = Seq( ModeTestCase("utf8_binary", Map("a" -> 3L, "b" -> 2L, "B" -> 2L), "a"), + ModeTestCase("utf8_binary_rtrim", Map("a" -> 3L, "b" -> 2L, "B" -> 2L), "a"), ModeTestCase("UTF8_LCASE", Map("a" -> 3L, "b" -> 2L, "B" -> 2L), "b"), + ModeTestCase("UTF8_LCASE_RTRIM", Map("a" -> 3L, "b" -> 2L, "B" -> 2L), "b"), ModeTestCase("unicode_ci", Map("a" -> 3L, "b" -> 2L, "B" -> 2L), "b"), + ModeTestCase("unicode_ci_rtrim", Map("a" -> 3L, "b" -> 2L, "B" -> 2L), "b"), ModeTestCase("unicode", Map("a" -> 3L, "b" -> 2L, "B" -> 2L), "a"), + ModeTestCase("unicode_rtrim", Map("a" -> 3L, "b" -> 2L, "B" -> 2L), "a"), ModeTestCase("SR", Map("c" -> 3L, "č" -> 2L, "Č" -> 2L), "c") ) testCases.foreach(t => { @@ -1797,9 +2082,14 @@ class CollationSQLExpressionsSuite val testCasesUTF8String = Seq( UTF8StringModeTestCase("utf8_binary", bufferValuesUTF8String, "a"), + UTF8StringModeTestCase("utf8_binary_rtrim", bufferValuesUTF8String, "a"), UTF8StringModeTestCase("UTF8_LCASE", bufferValuesUTF8String, "b"), + UTF8StringModeTestCase("UTF8_LCASE_RTRIM", bufferValuesUTF8String, "b"), UTF8StringModeTestCase("unicode_ci", bufferValuesUTF8String, "b"), - UTF8StringModeTestCase("unicode", bufferValuesUTF8String, "a")) + 
UTF8StringModeTestCase("unicode_ci_rtrim", bufferValuesUTF8String, "b"), + UTF8StringModeTestCase("unicode", bufferValuesUTF8String, "a"), + UTF8StringModeTestCase("unicode_rtrim", bufferValuesUTF8String, "a") + ) testCasesUTF8String.foreach ( t => { val buffer = new OpenHashMap[AnyRef, Long](5) @@ -1827,9 +2117,13 @@ class CollationSQLExpressionsSuite } val testCasesUTF8String = Seq( UTF8StringModeTestCase("utf8_binary", bufferValuesComplex, "[a,a,a]"), + UTF8StringModeTestCase("utf8_binary_rtrim", bufferValuesComplex, "[a,a,a]"), UTF8StringModeTestCase("UTF8_LCASE", bufferValuesComplex, "[b,b,b]"), + UTF8StringModeTestCase("UTF8_LCASE_rtrim", bufferValuesComplex, "[b,b,b]"), UTF8StringModeTestCase("unicode_ci", bufferValuesComplex, "[b,b,b]"), - UTF8StringModeTestCase("unicode", bufferValuesComplex, "[a,a,a]")) + UTF8StringModeTestCase("unicode_ci_rtrim", bufferValuesComplex, "[b,b,b]"), + UTF8StringModeTestCase("unicode", bufferValuesComplex, "[a,a,a]"), + UTF8StringModeTestCase("unicode_rtrim", bufferValuesComplex, "[a,a,a]")) testCasesUTF8String.foreach { t => val buffer = new OpenHashMap[AnyRef, Long](5) @@ -1847,9 +2141,13 @@ class CollationSQLExpressionsSuite case class ModeTestCase[R](collationId: String, bufferValues: Map[String, Long], result: R) val testCases = Seq( ModeTestCase("utf8_binary", Map("a" -> 3L, "b" -> 2L, "B" -> 2L), "a"), + ModeTestCase("utf8_binary_rtrim", Map("a" -> 3L, "b" -> 2L, "B" -> 2L), "a"), ModeTestCase("UTF8_LCASE", Map("a" -> 3L, "b" -> 2L, "B" -> 2L), "b"), + ModeTestCase("UTF8_LCASE_RTRIM", Map("a" -> 3L, "b" -> 2L, "B" -> 2L), "b"), ModeTestCase("unicode", Map("a" -> 3L, "b" -> 2L, "B" -> 2L), "a"), - ModeTestCase("unicode_ci", Map("a" -> 3L, "b" -> 2L, "B" -> 2L), "b") + ModeTestCase("unicode_rtrim", Map("a" -> 3L, "b" -> 2L, "B" -> 2L), "a"), + ModeTestCase("unicode_ci", Map("a" -> 3L, "b" -> 2L, "B" -> 2L), "b"), + ModeTestCase("unicode_ci_rtrim", Map("a" -> 3L, "b" -> 2L, "B" -> 2L), "b") ) testCases.foreach(t => { val 
valuesToAdd = t.bufferValues.map { case (elt, numRepeats) => @@ -1872,9 +2170,13 @@ class CollationSQLExpressionsSuite case class ModeTestCase[R](collationId: String, bufferValues: Map[String, Long], result: R) val testCases = Seq( ModeTestCase("utf8_binary", Map("a" -> 3L, "b" -> 2L, "B" -> 2L), "a"), + ModeTestCase("utf8_binary_rtrim", Map("a" -> 3L, "b" -> 2L, "B" -> 2L), "a"), ModeTestCase("UTF8_LCASE", Map("a" -> 3L, "b" -> 2L, "B" -> 2L), "b"), + ModeTestCase("UTF8_LCASE_rtrim", Map("a" -> 3L, "b" -> 2L, "B" -> 2L), "b"), ModeTestCase("unicode", Map("a" -> 3L, "b" -> 2L, "B" -> 2L), "a"), - ModeTestCase("unicode_ci", Map("a" -> 3L, "b" -> 2L, "B" -> 2L), "b") + ModeTestCase("unicode_rtrim", Map("a" -> 3L, "b" -> 2L, "B" -> 2L), "a"), + ModeTestCase("unicode_ci", Map("a" -> 3L, "b" -> 2L, "B" -> 2L), "b"), + ModeTestCase("unicode_ci_rtrim", Map("a" -> 3L, "b" -> 2L, "B" -> 2L), "b") ) testCases.foreach { t => val valuesToAdd = t.bufferValues.map { case (elt, numRepeats) => @@ -1897,9 +2199,13 @@ class CollationSQLExpressionsSuite case class ModeTestCase[R](collationId: String, bufferValues: Map[String, Long], result: R) val testCases = Seq( ModeTestCase("utf8_binary", Map("a" -> 3L, "b" -> 2L, "B" -> 2L), "a"), + ModeTestCase("utf8_binary_rtrim", Map("a" -> 3L, "b" -> 2L, "B" -> 2L), "a"), ModeTestCase("UTF8_LCASE", Map("a" -> 3L, "b" -> 2L, "B" -> 2L), "b"), + ModeTestCase("UTF8_LCASE_rtrim", Map("a" -> 3L, "b" -> 2L, "B" -> 2L), "b"), ModeTestCase("unicode", Map("a" -> 3L, "b" -> 2L, "B" -> 2L), "a"), - ModeTestCase("unicode_ci", Map("a" -> 3L, "b" -> 2L, "B" -> 2L), "b") + ModeTestCase("unicode_rtrim", Map("a" -> 3L, "b" -> 2L, "B" -> 2L), "a"), + ModeTestCase("unicode_ci", Map("a" -> 3L, "b" -> 2L, "B" -> 2L), "b"), + ModeTestCase("unicode_ci_rtrim", Map("a" -> 3L, "b" -> 2L, "B" -> 2L), "b") ) testCases.foreach { t => val valuesToAdd = t.bufferValues.map { case (elt, numRepeats) => @@ -1923,9 +2229,13 @@ class CollationSQLExpressionsSuite case class 
ModeTestCase[R](collationId: String, bufferValues: Map[String, Long], result: R) val testCases = Seq( ModeTestCase("utf8_binary", Map("a" -> 3L, "b" -> 2L, "B" -> 2L), "a"), + ModeTestCase("utf8_binary_rtrim", Map("a" -> 3L, "b" -> 2L, "B" -> 2L), "a"), ModeTestCase("UTF8_LCASE", Map("a" -> 3L, "b" -> 2L, "B" -> 2L), "b"), + ModeTestCase("UTF8_LCASE_rtrim", Map("a" -> 3L, "b" -> 2L, "B" -> 2L), "b"), ModeTestCase("unicode", Map("a" -> 3L, "b" -> 2L, "B" -> 2L), "a"), - ModeTestCase("unicode_ci", Map("a" -> 3L, "b" -> 2L, "B" -> 2L), "b") + ModeTestCase("unicode_rtrim", Map("a" -> 3L, "b" -> 2L, "B" -> 2L), "a"), + ModeTestCase("unicode_ci", Map("a" -> 3L, "b" -> 2L, "B" -> 2L), "b"), + ModeTestCase("unicode_ci_rtrim", Map("a" -> 3L, "b" -> 2L, "B" -> 2L), "b") ) testCases.foreach { t => val valuesToAdd = t.bufferValues.map { case (elt, numRepeats) => @@ -1949,9 +2259,13 @@ class CollationSQLExpressionsSuite case class ModeTestCase[R](collationId: String, bufferValues: Map[String, Long], result: R) val testCases = Seq( ModeTestCase("utf8_binary", Map("a" -> 3L, "b" -> 2L, "B" -> 2L), "a"), + ModeTestCase("utf8_binary_rtrim", Map("a" -> 3L, "b" -> 2L, "B" -> 2L), "a"), ModeTestCase("UTF8_LCASE", Map("a" -> 3L, "b" -> 2L, "B" -> 2L), "b"), + ModeTestCase("UTF8_LCASE_rtrim", Map("a" -> 3L, "b" -> 2L, "B" -> 2L), "b"), ModeTestCase("unicode", Map("a" -> 3L, "b" -> 2L, "B" -> 2L), "a"), - ModeTestCase("unicode_ci", Map("a" -> 3L, "b" -> 2L, "B" -> 2L), "b") + ModeTestCase("unicode_rtrim", Map("a" -> 3L, "b" -> 2L, "B" -> 2L), "a"), + ModeTestCase("unicode_ci", Map("a" -> 3L, "b" -> 2L, "B" -> 2L), "b"), + ModeTestCase("unicode_ci_rtrim", Map("a" -> 3L, "b" -> 2L, "B" -> 2L), "b") ) testCases.foreach { t => val valuesToAdd = t.bufferValues.map { case (elt, numRepeats) => @@ -1976,8 +2290,11 @@ class CollationSQLExpressionsSuite case class ModeTestCase(collationId: String, bufferValues: Map[String, Long], result: String) Seq( ModeTestCase("utf8_binary", Map("a" -> 3L, "b" 
-> 2L, "B" -> 2L), "{a -> 1}"), + ModeTestCase("utf8_binary_rtrim", Map("a" -> 3L, "b" -> 2L, "B" -> 2L), "{a -> 1}"), ModeTestCase("unicode", Map("a" -> 3L, "b" -> 2L, "B" -> 2L), "{a -> 1}"), + ModeTestCase("unicode_rtrim", Map("a" -> 3L, "b" -> 2L, "B" -> 2L), "{a -> 1}"), ModeTestCase("utf8_lcase", Map("a" -> 3L, "b" -> 2L, "B" -> 2L), "{b -> 1}"), + ModeTestCase("utf8_lcase_rtrim", Map("a" -> 3L, "b" -> 2L, "B" -> 2L), "{b -> 1}"), ModeTestCase("unicode_ci", Map("a" -> 3L, "b" -> 2L, "B" -> 2L), "{b -> 1}") ).foreach { t1 => def getValuesToAdd(t: ModeTestCase): String = { @@ -1991,9 +2308,11 @@ class CollationSQLExpressionsSuite } val tableName = s"t_${t1.collationId}_mode_nested_map_struct1" withTable(tableName) { - sql(s"CREATE TABLE ${tableName}(" + - s"i STRUCT>) USING parquet") - sql(s"INSERT INTO ${tableName} VALUES ${getValuesToAdd(t1)}") + withSQLConf(SQLConf.ALLOW_COLLATIONS_IN_MAP_KEYS.key -> "true") { + sql(s"CREATE TABLE ${tableName}(" + + s"i STRUCT>) USING parquet") + sql(s"INSERT INTO ${tableName} VALUES ${getValuesToAdd(t1)}") + } val query = "SELECT lower(cast(mode(i).m1 as string))" + s" FROM ${tableName}" val queryResult = sql(query) @@ -2006,7 +2325,12 @@ class CollationSQLExpressionsSuite for { collateKey <- Seq(true, false) collateVal <- Seq(true, false) - defaultCollation <- Seq("UTF8_BINARY", "UTF8_LCASE", "UNICODE") + defaultCollation <- Seq( + "UTF8_BINARY", + "UTF8_BINARY_RTRIM", + "UTF8_LCASE", + "UTF8_LCASE_RTRIM", + "UNICODE") } { val mapKey = if (collateKey) "'a' collate utf8_lcase" else "'a'" val mapVal = if (collateVal) "'b' collate utf8_lcase" else "'b'" @@ -2014,11 +2338,11 @@ class CollationSQLExpressionsSuite val queryExtractor = s"select collation(map($mapKey, $mapVal)[$mapKey])" val queryElementAt = s"select collation(element_at(map($mapKey, $mapVal), $mapKey))" - checkAnswer(sql(queryExtractor), Row(collation)) - checkAnswer(sql(queryElementAt), Row(collation)) + checkAnswer(sql(queryExtractor), Row(fullyQualifiedPrefix 
+ collation)) + checkAnswer(sql(queryElementAt), Row(fullyQualifiedPrefix + collation)) withSQLConf(SqlApiConf.DEFAULT_COLLATION -> defaultCollation) { - val res = if (collateVal) "UTF8_LCASE" else defaultCollation + val res = fullyQualifiedPrefix + (if (collateVal) "UTF8_LCASE" else defaultCollation) checkAnswer(sql(queryExtractor), Row(res)) checkAnswer(sql(queryElementAt), Row(res)) } @@ -2403,7 +2727,8 @@ class CollationSQLExpressionsSuite "a5cf6c42-0c85-418f-af6c-3e4e5b1328f2", "utf8_binary", true), ReflectExpressions("a5cf6c42-0c85-418f-af6c-3e4e5b1328f2", "utf8_binary", "A5Cf6c42-0c85-418f-af6c-3e4e5b1328f2", "utf8_binary", false), - + ReflectExpressions("a5cf6c42-0c85-418f-af6c-3e4e5b1328f2", "utf8_binary", + "a5cf6c42-0c85-418f-af6c-3e4e5b1328f2", "utf8_binary_rtrim", true), ReflectExpressions("A5cf6C42-0C85-418f-af6c-3E4E5b1328f2", "utf8_binary", "a5cf6c42-0c85-418f-af6c-3e4e5b1328f2", "utf8_lcase", true), ReflectExpressions("A5cf6C42-0C85-418f-af6c-3E4E5b1328f2", "utf8_binary", @@ -3149,14 +3474,22 @@ class CollationSQLExpressionsSuite ) val testCases = Seq( - HyperLogLogPlusPlusTestCase("utf8_binary", Seq("a", "a", "A", "z", "zz", "ZZ", "w", "AA", - "aA", "Aa", "aa"), Seq(Row(10))), - HyperLogLogPlusPlusTestCase("utf8_lcase", Seq("a", "a", "A", "z", "zz", "ZZ", "w", "AA", - "aA", "Aa", "aa"), Seq(Row(5))), + HyperLogLogPlusPlusTestCase("utf8_binary", Seq("a", "a", "A", "z", "zz", "ZZ", "w", + "AA", "aA", "Aa", "aa"), Seq(Row(10))), + HyperLogLogPlusPlusTestCase("utf8_binary_rtrim", Seq("a ", "a", "a", "A", "z", "zz", "ZZ", + "w", "AA", "aA", "Aa", "aa"), Seq(Row(10))), + HyperLogLogPlusPlusTestCase("utf8_lcase", Seq("a", "a", "A", "z", "zz", "ZZ", "w", + "AA", "aA", "Aa", "aa"), Seq(Row(5))), + HyperLogLogPlusPlusTestCase("utf8_lcase_rtrim", Seq("a ", "a", "a", "A", "z", "zz", "ZZ", "w", + "AA", "aA", "Aa", "aa"), Seq(Row(5))), HyperLogLogPlusPlusTestCase("UNICODE", Seq("a", "a", "A", "z", "zz", "ZZ", "w", "AA", "aA", "Aa", "aa"), Seq(Row(9))), + 
HyperLogLogPlusPlusTestCase("UNICODE_RTRIM", Seq("a ", "a", "a", "A", "z", "zz", "ZZ", "w", + "AA", "aA", "Aa", "aa"), Seq(Row(9))), HyperLogLogPlusPlusTestCase("UNICODE_CI", Seq("a", "a", "A", "z", "zz", "ZZ", "w", "AA", - "aA", "Aa", "aa"), Seq(Row(5))) + "aA", "Aa", "aa"), Seq(Row(5))), + HyperLogLogPlusPlusTestCase("UNICODE_CI_RTRIM", Seq("a ", "a", "a", "A", "z", "zz", "ZZ", "w", + "AA", "aA", "Aa", "aa"), Seq(Row(5))) ) testCases.foreach( t => { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/CollationSQLRegexpSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/CollationSQLRegexpSuite.scala index 5bb8511d0d935..8d831e4ca1668 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/CollationSQLRegexpSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/CollationSQLRegexpSuite.scala @@ -111,15 +111,17 @@ class CollationSQLRegexpSuite } val tableNameLcase = "T_LCASE" withTable(tableNameLcase) { - withSQLConf(SqlApiConf.DEFAULT_COLLATION -> "UTF8_LCASE") { - sql(s"CREATE TABLE IF NOT EXISTS $tableNameLcase(c STRING) using PARQUET") - sql(s"INSERT INTO $tableNameLcase(c) VALUES('ABC')") - checkAnswer(sql(s"select c like 'ab%' FROM $tableNameLcase"), Row(true)) - checkAnswer(sql(s"select c like '%bc' FROM $tableNameLcase"), Row(true)) - checkAnswer(sql(s"select c like 'a%c' FROM $tableNameLcase"), Row(true)) - checkAnswer(sql(s"select c like '%b%' FROM $tableNameLcase"), Row(true)) - checkAnswer(sql(s"select c like 'abc' FROM $tableNameLcase"), Row(true)) - } + sql(s""" + |CREATE TABLE IF NOT EXISTS $tableNameLcase( + | c STRING COLLATE UTF8_LCASE + |) using PARQUET + |""".stripMargin) + sql(s"INSERT INTO $tableNameLcase(c) VALUES('ABC')") + checkAnswer(sql(s"select c like 'ab%' FROM $tableNameLcase"), Row(true)) + checkAnswer(sql(s"select c like '%bc' FROM $tableNameLcase"), Row(true)) + checkAnswer(sql(s"select c like 'a%c' FROM $tableNameLcase"), Row(true)) + checkAnswer(sql(s"select c like '%b%' FROM $tableNameLcase"), Row(true)) + 
checkAnswer(sql(s"select c like 'abc' FROM $tableNameLcase"), Row(true)) } } @@ -448,7 +450,8 @@ class CollationSQLRegexpSuite }, condition = "DATATYPE_MISMATCH.UNEXPECTED_INPUT_TYPE", parameters = Map( - "sqlExpr" -> "\"regexp_replace(collate(ABCDE, UNICODE_CI), .c., FFF, 1)\"", + "sqlExpr" -> + """"regexp_replace(collate(ABCDE, UNICODE_CI), .c., 'FFF' collate UNICODE_CI, 1)"""", "paramIndex" -> "first", "inputSql" -> "\"collate(ABCDE, UNICODE_CI)\"", "inputType" -> "\"STRING COLLATE UNICODE_CI\"", diff --git a/sql/core/src/test/scala/org/apache/spark/sql/CollationStringExpressionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/CollationStringExpressionsSuite.scala index 2a0b84c075079..ee9734ebaa5bb 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/CollationStringExpressionsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/CollationStringExpressionsSuite.scala @@ -198,8 +198,10 @@ class CollationStringExpressionsSuite checkError( exception = intercept[AnalysisException] { val expr = StringSplitSQL( - Collate(Literal.create("1a2"), "UTF8_BINARY"), - Collate(Literal.create("a"), "UTF8_LCASE")) + Collate(Literal.create("1a2", StringType("UTF8_BINARY")), + ResolvedCollation("UTF8_BINARY")), + Collate(Literal.create("a", StringType("UTF8_BINARY")), + ResolvedCollation("UTF8_LCASE"))) CollationTypeCasts.transform(expr) }, condition = "COLLATION_MISMATCH.EXPLICIT", diff --git a/sql/core/src/test/scala/org/apache/spark/sql/CollationSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/CollationSuite.scala index 1707820053837..1571433a37e16 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/CollationSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/CollationSuite.scala @@ -22,7 +22,6 @@ import scala.jdk.CollectionConverters.MapHasAsJava import org.apache.spark.SparkException import org.apache.spark.sql.catalyst.ExtendedAnalysisException import org.apache.spark.sql.catalyst.expressions._ -import 
org.apache.spark.sql.catalyst.parser.ParseException import org.apache.spark.sql.catalyst.util.CollationFactory import org.apache.spark.sql.connector.{DatasourceV2SQLBase, FakeV2ProviderWithCustomSchema} import org.apache.spark.sql.connector.catalog.{Identifier, InMemoryTable} @@ -34,6 +33,7 @@ import org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanHelper import org.apache.spark.sql.execution.aggregate.{HashAggregateExec, ObjectHashAggregateExec} import org.apache.spark.sql.execution.columnar.InMemoryTableScanExec import org.apache.spark.sql.execution.joins._ +import org.apache.spark.sql.functions.col import org.apache.spark.sql.internal.{SqlApiConf, SQLConf} import org.apache.spark.sql.types.{ArrayType, IntegerType, MapType, Metadata, MetadataBuilder, StringType, StructField, StructType} @@ -43,6 +43,7 @@ class CollationSuite extends DatasourceV2SQLBase with AdaptiveSparkPlanHelper { private val collationPreservingSources = Seq("parquet") private val collationNonPreservingSources = Seq("orc", "csv", "json", "text") private val allFileBasedDataSources = collationPreservingSources ++ collationNonPreservingSources + private val fullyQualifiedPrefix = s"${CollationFactory.CATALOG}.${CollationFactory.SCHEMA}." 
@inline private def isSortMergeForced: Boolean = { @@ -117,7 +118,7 @@ class CollationSuite extends DatasourceV2SQLBase with AdaptiveSparkPlanHelper { ).foreach { collationName => checkAnswer( sql(s"select collation('aaa' collate $collationName)"), - Row(collationName.toUpperCase()) + Row(fullyQualifiedPrefix + collationName.toUpperCase()) ) } } @@ -209,7 +210,7 @@ class CollationSuite extends DatasourceV2SQLBase with AdaptiveSparkPlanHelper { } test("collation expression returns default collation") { - checkAnswer(sql(s"select collation('aaa')"), Row("UTF8_BINARY")) + checkAnswer(sql(s"select collation('aaa')"), Row(fullyQualifiedPrefix + "UTF8_BINARY")) } test("invalid collation name throws exception") { @@ -220,23 +221,54 @@ class CollationSuite extends DatasourceV2SQLBase with AdaptiveSparkPlanHelper { parameters = Map("collationName" -> "UTF8_BS", "proposals" -> "UTF8_LCASE")) } + test("fail on table creation with collated strings as map key") { + withTable("table_1", "table_2") { + checkError( + exception = intercept[AnalysisException] { + sql("CREATE TABLE table_1 (col MAP) USING parquet") + }, + condition = "UNSUPPORTED_FEATURE.COLLATIONS_IN_MAP_KEYS" + ) + withSQLConf(SQLConf.ALLOW_COLLATIONS_IN_MAP_KEYS.key -> "true") { + sql("CREATE TABLE table_2 (col MAP) USING parquet") + } + } + } + + test("fail on adding column with collated map key") { + withTable("table_1") { + sql("CREATE TABLE table_1 (id INTEGER) USING parquet") + checkError( + exception = intercept[AnalysisException] { + sql("ALTER TABLE table_1 ADD COLUMN col1 MAP, INTEGER>") + }, + condition = "UNSUPPORTED_FEATURE.COLLATIONS_IN_MAP_KEYS" + ) + withSQLConf(SQLConf.ALLOW_COLLATIONS_IN_MAP_KEYS.key -> "true") { + sql("ALTER TABLE table_1 ADD COLUMN col1 MAP, INTEGER>") + } + } + } + test("disable bucketing on collated string column") { def createTable(bucketColumns: String*): Unit = { val tableName = "test_partition_tbl" withTable(tableName) { - sql( - s""" - |CREATE TABLE $tableName ( - | id 
INT, - | c1 STRING COLLATE UNICODE, - | c2 STRING, - | struct_col STRUCT, - | array_col ARRAY, - | map_col MAP - |) USING parquet - |CLUSTERED BY (${bucketColumns.mkString(",")}) - |INTO 4 BUCKETS""".stripMargin - ) + withSQLConf(SQLConf.ALLOW_COLLATIONS_IN_MAP_KEYS.key -> "true") { + sql( + s""" + |CREATE TABLE $tableName ( + | id INT, + | c1 STRING COLLATE UNICODE, + | c2 STRING, + | struct_col STRUCT, + | array_col ARRAY, + | map_col MAP + |) USING parquet + |CLUSTERED BY (${bucketColumns.mkString(",")}) + |INTO 4 BUCKETS""".stripMargin + ) + } } } // should work fine on default collated columns @@ -477,7 +509,8 @@ class CollationSuite extends DatasourceV2SQLBase with AdaptiveSparkPlanHelper { sql(s"INSERT INTO $tableName VALUES ('aaa')") sql(s"INSERT INTO $tableName VALUES ('AAA')") - checkAnswer(sql(s"SELECT DISTINCT COLLATION(c1) FROM $tableName"), Seq(Row(collationName))) + checkAnswer(sql(s"SELECT DISTINCT COLLATION(c1) FROM $tableName"), + Seq(Row(fullyQualifiedPrefix + collationName))) assert(sql(s"select c1 FROM $tableName").schema.head.dataType == StringType(collationId)) } } @@ -501,7 +534,7 @@ class CollationSuite extends DatasourceV2SQLBase with AdaptiveSparkPlanHelper { checkAnswer(readback, Row("aaa")) checkAnswer( readback.selectExpr(s"collation(${readback.columns.head})"), - Row(readbackCollation)) + Row(fullyQualifiedPrefix + readbackCollation)) } } } @@ -523,7 +556,7 @@ class CollationSuite extends DatasourceV2SQLBase with AdaptiveSparkPlanHelper { sql(s"INSERT INTO $tableName VALUES ('AAA')") checkAnswer(sql(s"SELECT DISTINCT COLLATION(c1) FROM $tableName"), - Seq(Row(defaultCollation))) + Seq(Row(fullyQualifiedPrefix + defaultCollation))) sql( s""" @@ -535,7 +568,7 @@ class CollationSuite extends DatasourceV2SQLBase with AdaptiveSparkPlanHelper { sql(s"INSERT INTO $tableName VALUES ('AAA', 'AAA')") checkAnswer(sql(s"SELECT DISTINCT COLLATION(c2) FROM $tableName"), - Seq(Row(collationName))) + Seq(Row(fullyQualifiedPrefix + collationName))) 
assert(sql(s"select c2 FROM $tableName").schema.head.dataType == StringType(collationId)) } } @@ -558,7 +591,8 @@ class CollationSuite extends DatasourceV2SQLBase with AdaptiveSparkPlanHelper { sql(s"ALTER TABLE $tableName ALTER COLUMN c4.t TYPE STRING COLLATE UNICODE") checkAnswer(sql(s"SELECT collation(c1), collation(c2[0]), " + s"collation(c3[1]), collation(c4.t) FROM $tableName"), - Seq(Row("UTF8_LCASE", "UNICODE_CI", "UTF8_BINARY", "UNICODE"))) + Seq(Row(fullyQualifiedPrefix + "UTF8_LCASE", fullyQualifiedPrefix + "UNICODE_CI", + fullyQualifiedPrefix + "UTF8_BINARY", fullyQualifiedPrefix + "UNICODE"))) } } @@ -662,6 +696,11 @@ class CollationSuite extends DatasourceV2SQLBase with AdaptiveSparkPlanHelper { s"IN (COLLATE('aa', 'UTF8_LCASE'))"), Seq(Row("a"), Row("A"))) checkAnswer(sql(s"SELECT c1 FROM $tableName where (c1 || 'a') " + s"IN (COLLATE('aa', 'UTF8_BINARY'))"), Seq(Row("a"))) + checkAnswer(sql(s"SELECT c1 FROM $tableName where c1 || 'a' " + + s"IN (COLLATE('aa', 'UTF8_LCASE_RTRIM'))"), Seq(Row("a"), Row("A"))) + checkAnswer(sql(s"SELECT c1 FROM $tableName where (c1 || 'a') " + + s"IN (COLLATE('aa', 'UTF8_BINARY_RTRIM'))"), Seq(Row("a"))) + // columns have different collation checkError( @@ -772,6 +811,16 @@ class CollationSuite extends DatasourceV2SQLBase with AdaptiveSparkPlanHelper { ) ) + checkError( + exception = intercept[AnalysisException] { + sql(s"SELECT array('A', 'a' COLLATE UNICODE) == array('b' COLLATE UNICODE_CI_RTRIM)") + }, + condition = "COLLATION_MISMATCH.EXPLICIT", + parameters = Map( + "explicitTypes" -> """"STRING COLLATE UNICODE", "STRING COLLATE UNICODE_CI_RTRIM"""" + ) + ) + checkError( exception = intercept[AnalysisException] { sql("SELECT array_join(array('a', 'b' collate UNICODE), 'c' collate UNICODE_CI)") @@ -829,7 +878,7 @@ class CollationSuite extends DatasourceV2SQLBase with AdaptiveSparkPlanHelper { """EXECUTE IMMEDIATE stmtStr1 USING | 'a' AS var1, | 'b' AS var2;""".stripMargin), - Seq(Row("UTF8_BINARY")) + 
Seq(Row(fullyQualifiedPrefix + "UTF8_BINARY")) ) withSQLConf(SqlApiConf.DEFAULT_COLLATION -> "UNICODE") { @@ -838,7 +887,7 @@ class CollationSuite extends DatasourceV2SQLBase with AdaptiveSparkPlanHelper { """EXECUTE IMMEDIATE stmtStr1 USING | 'a' AS var1, | 'b' AS var2;""".stripMargin), - Seq(Row("UNICODE")) + Seq(Row(fullyQualifiedPrefix + "UNICODE")) ) } @@ -846,7 +895,7 @@ class CollationSuite extends DatasourceV2SQLBase with AdaptiveSparkPlanHelper { sql( """EXECUTE IMMEDIATE stmtStr2 USING | 'a' AS var1;""".stripMargin), - Seq(Row("UNICODE")) + Seq(Row(fullyQualifiedPrefix + "UNICODE")) ) withSQLConf(SqlApiConf.DEFAULT_COLLATION -> "UNICODE") { @@ -854,7 +903,7 @@ class CollationSuite extends DatasourceV2SQLBase with AdaptiveSparkPlanHelper { sql( """EXECUTE IMMEDIATE stmtStr2 USING | 'a' AS var1;""".stripMargin), - Seq(Row("UNICODE")) + Seq(Row(fullyQualifiedPrefix + "UNICODE")) ) } } @@ -941,7 +990,7 @@ class CollationSuite extends DatasourceV2SQLBase with AdaptiveSparkPlanHelper { sql(s"INSERT INTO $tableName VALUES ('a'), ('A')") checkAnswer(sql(s"SELECT DISTINCT COLLATION(c1) FROM $tableName"), - Seq(Row(collationName))) + Seq(Row(fullyQualifiedPrefix + collationName))) assert(sql(s"select c1 FROM $tableName").schema.head.dataType == StringType(collationId)) } } @@ -1053,69 +1102,46 @@ class CollationSuite extends DatasourceV2SQLBase with AdaptiveSparkPlanHelper { test("SPARK-47431: Default collation set to UNICODE, literal test") { withSQLConf(SqlApiConf.DEFAULT_COLLATION -> "UNICODE") { - checkAnswer(sql(s"SELECT collation('aa')"), Seq(Row("UNICODE"))) + checkAnswer(sql(s"SELECT collation('aa')"), Seq(Row(fullyQualifiedPrefix + "UNICODE"))) } } - test("SPARK-47972: Cast expression limitation for collations") { - checkError( - exception = intercept[ParseException] - (sql("SELECT cast(1 as string collate unicode)")), - condition = "UNSUPPORTED_DATATYPE", - parameters = Map( - "typeName" -> toSQLType(StringType("UNICODE"))), - context = - 
ExpectedContext(fragment = s"cast(1 as string collate unicode)", start = 7, stop = 39) - ) + test("Cast expression for collations") { + checkAnswer( + sql(s"SELECT collation(cast('a' as string collate utf8_lcase))"), + Seq(Row(fullyQualifiedPrefix + "UTF8_LCASE"))) - checkError( - exception = intercept[ParseException] - (sql("SELECT 'A' :: string collate unicode")), - condition = "UNSUPPORTED_DATATYPE", - parameters = Map( - "typeName" -> toSQLType(StringType("UNICODE"))), - context = ExpectedContext(fragment = s"'A' :: string collate unicode", start = 7, stop = 35) - ) + checkAnswer( + sql(s"SELECT collation('a' :: string collate utf8_lcase)"), + Seq(Row(fullyQualifiedPrefix + "UTF8_LCASE"))) checkAnswer(sql(s"SELECT cast(1 as string)"), Seq(Row("1"))) checkAnswer(sql(s"SELECT cast('A' as string)"), Seq(Row("A"))) withSQLConf(SqlApiConf.DEFAULT_COLLATION -> "UNICODE") { - checkError( - exception = intercept[ParseException] - (sql("SELECT cast(1 as string collate unicode)")), - condition = "UNSUPPORTED_DATATYPE", - parameters = Map( - "typeName" -> toSQLType(StringType("UNICODE"))), - context = - ExpectedContext(fragment = s"cast(1 as string collate unicode)", start = 7, stop = 39) - ) - + checkAnswer( + sql(s"SELECT collation(cast(1 as string collate unicode))"), + Seq(Row(fullyQualifiedPrefix + "UNICODE"))) checkAnswer(sql(s"SELECT cast(1 as string)"), Seq(Row("1"))) - checkAnswer(sql(s"SELECT collation(cast(1 as string))"), Seq(Row("UNICODE"))) + checkAnswer(sql(s"SELECT collation(cast(1 as string))"), + Seq(Row(fullyQualifiedPrefix + "UNICODE"))) } } - test("SPARK-47431: Default collation set to UNICODE, column type test") { - withTable("t") { - withSQLConf(SqlApiConf.DEFAULT_COLLATION -> "UNICODE") { - sql(s"CREATE TABLE t(c1 STRING) USING PARQUET") - sql(s"INSERT INTO t VALUES ('a')") - checkAnswer(sql(s"SELECT collation(c1) FROM t"), Seq(Row("UNICODE"))) - } - } - } + test("cast using the dataframe api") { + val tableName = "cast_table" + 
withTable(tableName) { + sql(s"CREATE TABLE $tableName (name STRING COLLATE UTF8_LCASE) USING PARQUET") - test("SPARK-47431: Create table with UTF8_BINARY, make sure collation persists on read") { - withTable("t") { - withSQLConf(SqlApiConf.DEFAULT_COLLATION -> "UTF8_BINARY") { - sql("CREATE TABLE t(c1 STRING) USING PARQUET") - sql("INSERT INTO t VALUES ('a')") - checkAnswer(sql("SELECT collation(c1) FROM t"), Seq(Row("UTF8_BINARY"))) - } - withSQLConf(SqlApiConf.DEFAULT_COLLATION -> "UNICODE") { - checkAnswer(sql("SELECT collation(c1) FROM t"), Seq(Row("UTF8_BINARY"))) - } + var df = spark.read.table(tableName) + .withColumn("name", col("name").cast("STRING COLLATE UNICODE")) + + assert(df.schema.fields.head.dataType === StringType("UNICODE")) + + df = spark.read.table(tableName) + .withColumn("name", col("name").cast("STRING COLLATE UTF8_BINARY")) + + assert(df.schema.fields.head.dataType === StringType) } } @@ -1143,7 +1169,9 @@ class CollationSuite extends DatasourceV2SQLBase with AdaptiveSparkPlanHelper { } // map doesn't support aggregation withTable(table) { - sql(s"create table $table (m map) using parquet") + withSQLConf(SQLConf.ALLOW_COLLATIONS_IN_MAP_KEYS.key -> "true") { + sql(s"create table $table (m map) using parquet") + } val query = s"select distinct m from $table" checkError( exception = intercept[ExtendedAnalysisException](sql(query)), @@ -1185,8 +1213,10 @@ class CollationSuite extends DatasourceV2SQLBase with AdaptiveSparkPlanHelper { } // map doesn't support joins withTable(tableLeft, tableRight) { - Seq(tableLeft, tableRight).map(tab => - sql(s"create table $tab (m map) using parquet")) + withSQLConf(SQLConf.ALLOW_COLLATIONS_IN_MAP_KEYS.key -> "true") { + Seq(tableLeft, tableRight).map(tab => + sql(s"create table $tab (m map) using parquet")) + } val query = s"select $tableLeft.m from $tableLeft join $tableRight on $tableLeft.m = $tableRight.m" val ctx = s"$tableLeft.m = $tableRight.m" @@ -1437,7 +1467,10 @@ class CollationSuite extends 
DatasourceV2SQLBase with AdaptiveSparkPlanHelper { val tableName = "t" withTable(tableName) { - withSQLConf(SQLConf.CODEGEN_FACTORY_MODE.key -> codeGen) { + withSQLConf( + SQLConf.CODEGEN_FACTORY_MODE.key -> codeGen, + SQLConf.ALLOW_COLLATIONS_IN_MAP_KEYS.key -> "true" + ) { sql(s"create table $tableName" + s" (m map)") sql(s"insert into $tableName values (map('aaa', 'AAA'))") @@ -1462,7 +1495,10 @@ class CollationSuite extends DatasourceV2SQLBase with AdaptiveSparkPlanHelper { val tableName = "t" withTable(tableName) { - withSQLConf(SQLConf.CODEGEN_FACTORY_MODE.key -> codeGen) { + withSQLConf( + SQLConf.CODEGEN_FACTORY_MODE.key -> codeGen, + SQLConf.ALLOW_COLLATIONS_IN_MAP_KEYS.key -> "true" + ) { sql(s"create table $tableName" + s" (m map, " + s"struct>)") @@ -1489,7 +1525,10 @@ class CollationSuite extends DatasourceV2SQLBase with AdaptiveSparkPlanHelper { val tableName = "t" withTable(tableName) { - withSQLConf(SQLConf.CODEGEN_FACTORY_MODE.key -> codeGen) { + withSQLConf( + SQLConf.CODEGEN_FACTORY_MODE.key -> codeGen, + SQLConf.ALLOW_COLLATIONS_IN_MAP_KEYS.key -> "true" + ) { sql(s"create table $tableName " + s"(m map, array>)") sql(s"insert into $tableName values (map(array('aaa', 'bbb'), array('ccc', 'ddd')))") @@ -1512,7 +1551,10 @@ class CollationSuite extends DatasourceV2SQLBase with AdaptiveSparkPlanHelper { test(s"Check that order by on map with$collationSetup strings fails ($codeGen)") { val tableName = "t" withTable(tableName) { - withSQLConf(SQLConf.CODEGEN_FACTORY_MODE.key -> codeGen) { + withSQLConf( + SQLConf.CODEGEN_FACTORY_MODE.key -> codeGen, + SQLConf.ALLOW_COLLATIONS_IN_MAP_KEYS.key -> "true" + ) { sql(s"create table $tableName" + s" (m map, " + s" c integer)") @@ -2087,4 +2129,49 @@ class CollationSuite extends DatasourceV2SQLBase with AdaptiveSparkPlanHelper { checkAnswer(sql("SELECT NAME FROM collations() WHERE ICU_VERSION is null"), Seq(Row("UTF8_BINARY"), Row("UTF8_LCASE"))) } + + test("fully qualified name") { + Seq("UTF8_BINARY", 
"UTF8_LCASE", "UNICODE", "UNICODE_CI_AI").foreach { collation => + // Make sure that the collation expression returns the correct fully qualified name. + val df = sql(s"SELECT collation('a' collate $collation)") + checkAnswer(df, + Seq(Row(s"${CollationFactory.CATALOG}.${CollationFactory.SCHEMA}.$collation"))) + + // Make sure the user can specify the fully qualified name as a collation name. + Seq("contains", "startswith", "endswith").foreach{ binaryFunction => + val dfRegularName = sql( + s"SELECT $binaryFunction('a' collate $collation, 'A' collate $collation)") + val dfFullyQualifiedName = sql( + s"SELECT $binaryFunction('a' collate system.builtin.$collation, 'A' collate $collation)") + checkAnswer(dfRegularName, dfFullyQualifiedName) + } + } + + // Wrong collation names raise a Spark exception. + Seq( + ("system.builtin2.UTF8_BINARY", "UTF8_BINARY"), + ("system.UTF8_BINARY", "UTF8_BINARY"), + ("builtin.UTF8_LCASE", "UTF8_LCASE") + ).foreach { case(collationName, proposal) => + checkError( + exception = intercept[SparkException] { + sql(s"SELECT 'a' COLLATE ${collationName}") + }, + condition = "COLLATION_INVALID_NAME", + sqlState = "42704", + parameters = Map("collationName" -> collationName.split("\\.").last, + "proposals" -> proposal)) + } + + // Case insensitive fully qualified names are supported. + checkAnswer( + sql("SELECT 'a' collate sYstEm.bUiltIn.utf8_lCAse = 'A'"), + Seq(Row(true)) + ) + + // Make sure DDLs can use fully qualified names. 
+ withTable("t") { + sql(s"CREATE TABLE t (c STRING COLLATE system.builtin.UTF8_LCASE)") + } + } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameAggregateSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameAggregateSuite.scala index 6348e5f315395..141d6b219f2a7 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameAggregateSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameAggregateSuite.scala @@ -39,10 +39,12 @@ import org.apache.spark.sql.test.SQLTestData.DecimalData import org.apache.spark.sql.types._ import org.apache.spark.sql.types.DayTimeIntervalType.{DAY, HOUR, MINUTE, SECOND} import org.apache.spark.sql.types.YearMonthIntervalType.{MONTH, YEAR} +import org.apache.spark.tags.SlowSQLTest import org.apache.spark.unsafe.types.CalendarInterval case class Fact(date: Int, hour: Int, minute: Int, room_name: String, temp: Double) +@SlowSQLTest class DataFrameAggregateSuite extends QueryTest with SharedSparkSession with AdaptiveSparkPlanHelper { @@ -621,6 +623,41 @@ class DataFrameAggregateSuite extends QueryTest ) } + test("listagg function") { + // normal case + val df = Seq(("a", "b"), ("b", "c"), ("c", "d")).toDF("a", "b") + checkAnswer( + df.selectExpr("listagg(a)", "listagg(b)"), + Seq(Row("abc", "bcd")) + ) + checkAnswer( + df.select(listagg($"a"), listagg($"b")), + Seq(Row("abc", "bcd")) + ) + + // distinct case + val df2 = Seq(("a", "b"), ("a", "b"), ("b", "d")).toDF("a", "b") + checkAnswer( + df2.select(listagg_distinct($"a"), listagg_distinct($"b")), + Seq(Row("ab", "bd")) + ) + + // null case + val df3 = Seq(("a", "b", null), ("a", "b", null), (null, null, null)).toDF("a", "b", "c") + checkAnswer( + df3.select(listagg_distinct($"a"), listagg($"a"), listagg_distinct($"b"), listagg($"b"), + listagg($"c")), + Seq(Row("a", "aa", "b", "bb", null)) + ) + + // custom delimiter + val df4 = Seq(("a", "b"), ("b", "c"), ("c", "d")).toDF("a", "b") + checkAnswer( + df4.selectExpr("listagg(a, 
'|')", "listagg(b, '|')"), + Seq(Row("a|b|c", "b|c|d")) + ) + } + test("SPARK-31500: collect_set() of BinaryType returns duplicate elements") { val bytesTest1 = "test1".getBytes val bytesTest2 = "test2".getBytes diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameComplexTypeSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameComplexTypeSuite.scala index 48ea0e01a4372..8024b579e5d0c 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameComplexTypeSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameComplexTypeSuite.scala @@ -27,7 +27,6 @@ import org.apache.spark.sql.catalyst.expressions.Expression import org.apache.spark.sql.catalyst.expressions.objects.MapObjects import org.apache.spark.sql.catalyst.util.DateTimeUtils import org.apache.spark.sql.functions._ -import org.apache.spark.sql.internal.ExpressionUtils.column import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.test.SharedSparkSession import org.apache.spark.sql.types.{ArrayType, BooleanType, Decimal, DoubleType, IntegerType, MapType, StringType, StructField, StructType} @@ -92,8 +91,8 @@ class DataFrameComplexTypeSuite extends QueryTest with SharedSparkSession { // items: Seq[Int] => items.map { item => Seq(Struct(item)) } val result = df.select( - column(MapObjects( - (item: Expression) => array(struct(column(item))).expr, + Column(MapObjects( + (item: Expression) => array(struct(Column(item))).expr, $"items".expr, df.schema("items").dataType.asInstanceOf[ArrayType].elementType )) as "items" diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala index 975a82e26f4eb..fc6d3023ed072 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala @@ -24,7 +24,7 @@ import scala.reflect.runtime.universe.runtimeMirror 
import scala.util.Random import org.apache.spark.{QueryContextType, SPARK_DOC_ROOT, SparkException, SparkRuntimeException} -import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.catalyst.{ExtendedAnalysisException, InternalRow} import org.apache.spark.sql.catalyst.analysis.FunctionRegistry import org.apache.spark.sql.catalyst.expressions.{Expression, Literal, UnaryExpression} import org.apache.spark.sql.catalyst.expressions.Cast._ @@ -32,7 +32,6 @@ import org.apache.spark.sql.catalyst.expressions.codegen.CodegenFallback import org.apache.spark.sql.catalyst.plans.logical.OneRowRelation import org.apache.spark.sql.catalyst.util.DateTimeTestUtils.{withDefaultTimeZone, UTC} import org.apache.spark.sql.functions._ -import org.apache.spark.sql.internal.ExpressionUtils.column import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.test.SharedSparkSession import org.apache.spark.sql.types._ @@ -73,7 +72,9 @@ class DataFrameFunctionsSuite extends QueryTest with SharedSparkSession { "sum_distinct", // equivalent to sum(distinct foo) "typedLit", "typedlit", // Scala only "udaf", "udf", // create function statement in sql - "call_function" // moot in SQL as you just call the function directly + "call_function", // moot in SQL as you just call the function directly + "listagg_distinct", // equivalent to listagg(distinct foo) + "string_agg_distinct" // equivalent to string_agg(distinct foo) ) val excludedSqlFunctions = Set.empty[String] @@ -404,7 +405,7 @@ class DataFrameFunctionsSuite extends QueryTest with SharedSparkSession { callSitePattern = "", startIndex = 0, stopIndex = 0)) - expr = nullifzero(Literal.create(20201231, DateType)) + expr = nullifzero(Column(Literal.create(20201231, DateType))) checkError( intercept[AnalysisException](df.select(expr)), condition = "DATATYPE_MISMATCH.BINARY_OP_DIFF_TYPES", @@ -457,14 +458,14 @@ class DataFrameFunctionsSuite extends QueryTest with SharedSparkSession { val df = Seq((0)).toDF("a") var 
expr = randstr(lit(10), lit("a")) checkError( - intercept[AnalysisException](df.select(expr)), + intercept[ExtendedAnalysisException](df.select(expr).collect()), condition = "DATATYPE_MISMATCH.UNEXPECTED_INPUT_TYPE", parameters = Map( "sqlExpr" -> "\"randstr(10, a)\"", "paramIndex" -> "second", "inputSql" -> "\"a\"", "inputType" -> "\"STRING\"", - "requiredType" -> "INT or SMALLINT"), + "requiredType" -> "(\"INT\" or \"BIGINT\")"), context = ExpectedContext( contextType = QueryContextType.DataFrame, fragment = "randstr", @@ -479,7 +480,7 @@ class DataFrameFunctionsSuite extends QueryTest with SharedSparkSession { condition = "DATATYPE_MISMATCH.NON_FOLDABLE_INPUT", parameters = Map( "inputName" -> "`length`", - "inputType" -> "INT or SMALLINT", + "inputType" -> "integer", "inputExpr" -> "\"a\"", "sqlExpr" -> "\"randstr(a, 10)\""), context = ExpectedContext( @@ -516,7 +517,7 @@ class DataFrameFunctionsSuite extends QueryTest with SharedSparkSession { "paramIndex" -> "second", "inputSql" -> "\"a\"", "inputType" -> "\"STRING\"", - "requiredType" -> "integer or floating-point"), + "requiredType" -> "\"NUMERIC\""), context = ExpectedContext( contextType = QueryContextType.DataFrame, fragment = "uniform", @@ -586,7 +587,7 @@ class DataFrameFunctionsSuite extends QueryTest with SharedSparkSession { callSitePattern = "", startIndex = 0, stopIndex = 0)) - expr = zeroifnull(Literal.create(20201231, DateType)) + expr = zeroifnull(Column(Literal.create(20201231, DateType))) checkError( intercept[AnalysisException](df.select(expr)), condition = "DATATYPE_MISMATCH.DATA_DIFF_TYPES", @@ -5735,7 +5736,7 @@ class DataFrameFunctionsSuite extends QueryTest with SharedSparkSession { import DataFrameFunctionsSuite.CodegenFallbackExpr for ((codegenFallback, wholeStage) <- Seq((true, false), (false, false), (false, true))) { val c = if (codegenFallback) { - column(CodegenFallbackExpr(v.expr)) + Column(CodegenFallbackExpr(v.expr)) } else { v } diff --git 
a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSelfJoinSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSelfJoinSuite.scala index f0ed2241fd286..0e9b1c9d2104e 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSelfJoinSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSelfJoinSuite.scala @@ -22,7 +22,6 @@ import org.apache.spark.sql.catalyst.expressions.{Alias, Ascending, AttributeRef import org.apache.spark.sql.catalyst.plans.logical.{Expand, Generate, ScriptInputOutputSchema, ScriptTransformation, Window => WindowPlan} import org.apache.spark.sql.expressions.Window import org.apache.spark.sql.functions.{col, count, explode, sum, year} -import org.apache.spark.sql.internal.ExpressionUtils.column import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.test.SharedSparkSession import org.apache.spark.sql.test.SQLTestData.TestData @@ -375,7 +374,7 @@ class DataFrameSelfJoinSuite extends QueryTest with SharedSparkSession { Seq.empty, PythonEvalType.SQL_MAP_PANDAS_ITER_UDF, true) - val df7 = df1.mapInPandas(mapInPandasUDF) + val df7 = df1.mapInPandas(Column(mapInPandasUDF)) val df8 = df7.filter($"x" > 0) assertAmbiguousSelfJoin(df7.join(df8, df7("x") === df8("y"))) assertAmbiguousSelfJoin(df8.join(df7, df7("x") === df8("y"))) @@ -386,7 +385,7 @@ class DataFrameSelfJoinSuite extends QueryTest with SharedSparkSession { Seq.empty, PythonEvalType.SQL_GROUPED_MAP_PANDAS_UDF, true) - val df9 = df1.groupBy($"key1").flatMapGroupsInPandas(flatMapGroupsInPandasUDF) + val df9 = df1.groupBy($"key1").flatMapGroupsInPandas(Column(flatMapGroupsInPandasUDF)) val df10 = df9.filter($"x" > 0) assertAmbiguousSelfJoin(df9.join(df10, df9("x") === df10("y"))) assertAmbiguousSelfJoin(df10.join(df9, df9("x") === df10("y"))) @@ -398,7 +397,7 @@ class DataFrameSelfJoinSuite extends QueryTest with SharedSparkSession { PythonEvalType.SQL_COGROUPED_MAP_PANDAS_UDF, true) val df11 = 
df1.groupBy($"key1").flatMapCoGroupsInPandas( - df1.groupBy($"key2"), flatMapCoGroupsInPandasUDF) + df1.groupBy($"key2"), Column(flatMapCoGroupsInPandasUDF)) val df12 = df11.filter($"x" > 0) assertAmbiguousSelfJoin(df11.join(df12, df11("x") === df12("y"))) assertAmbiguousSelfJoin(df12.join(df11, df11("x") === df12("y"))) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSetOperationsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSetOperationsSuite.scala index 5ff737d2b57cb..9c182be0f7dd6 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSetOperationsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSetOperationsSuite.scala @@ -350,6 +350,84 @@ class DataFrameSetOperationsSuite extends QueryTest dates.intersect(widenTypedRows).collect() } + test("SPARK-50373 - cannot run set operations with variant type") { + val df = sql("select parse_json(case when id = 0 then 'null' else '1' end)" + + " as v, id % 5 as id from range(0, 100, 1, 5)") + checkError( + exception = intercept[AnalysisException](df.intersect(df)), + condition = "UNSUPPORTED_FEATURE.SET_OPERATION_ON_VARIANT_TYPE", + parameters = Map( + "colName" -> "`v`", + "dataType" -> "\"VARIANT\"") + ) + checkError( + exception = intercept[AnalysisException](df.except(df)), + condition = "UNSUPPORTED_FEATURE.SET_OPERATION_ON_VARIANT_TYPE", + parameters = Map( + "colName" -> "`v`", + "dataType" -> "\"VARIANT\"") + ) + checkError( + exception = intercept[AnalysisException](df.distinct()), + condition = "UNSUPPORTED_FEATURE.SET_OPERATION_ON_VARIANT_TYPE", + parameters = Map( + "colName" -> "`v`", + "dataType" -> "\"VARIANT\"")) + checkError( + exception = intercept[AnalysisException](df.dropDuplicates()), + condition = "UNSUPPORTED_FEATURE.SET_OPERATION_ON_VARIANT_TYPE", + parameters = Map( + "colName" -> "`v`", + "dataType" -> "\"VARIANT\"")) + withTempView("tv") { + df.createOrReplaceTempView("tv") + checkError( + exception = 
intercept[AnalysisException](sql("SELECT DISTINCT v FROM tv")), + condition = "UNSUPPORTED_FEATURE.SET_OPERATION_ON_VARIANT_TYPE", + parameters = Map( + "colName" -> "`v`", + "dataType" -> "\"VARIANT\""), + context = ExpectedContext( + fragment = "SELECT DISTINCT v FROM tv", + start = 0, + stop = 24) + ) + checkError( + exception = intercept[AnalysisException](sql("SELECT DISTINCT STRUCT(v) FROM tv")), + condition = "UNSUPPORTED_FEATURE.SET_OPERATION_ON_VARIANT_TYPE", + parameters = Map( + "colName" -> "`struct(v)`", + "dataType" -> "\"STRUCT\""), + context = ExpectedContext( + fragment = "SELECT DISTINCT STRUCT(v) FROM tv", + start = 0, + stop = 32) + ) + checkError( + exception = intercept[AnalysisException](sql("SELECT DISTINCT ARRAY(v) FROM tv")), + condition = "UNSUPPORTED_FEATURE.SET_OPERATION_ON_VARIANT_TYPE", + parameters = Map( + "colName" -> "`array(v)`", + "dataType" -> "\"ARRAY\""), + context = ExpectedContext( + fragment = "SELECT DISTINCT ARRAY(v) FROM tv", + start = 0, + stop = 31) + ) + checkError( + exception = intercept[AnalysisException](sql("SELECT DISTINCT MAP('m', v) FROM tv")), + condition = "UNSUPPORTED_FEATURE.SET_OPERATION_ON_MAP_TYPE", + parameters = Map( + "colName" -> "`map(m, v)`", + "dataType" -> "\"MAP\""), + context = ExpectedContext( + fragment = "SELECT DISTINCT MAP('m', v) FROM tv", + start = 0, + stop = 34) + ) + } + } + test("SPARK-19893: cannot run set operations with map type") { val df = spark.range(1).select(map(lit("key"), $"id").as("m")) checkError( diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSubquerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSubquerySuite.scala index d656c36ce842a..621d468454d40 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSubquerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSubquerySuite.scala @@ -53,23 +53,15 @@ class DataFrameSubquerySuite extends QueryTest with SharedSparkSession { 
r.createOrReplaceTempView("r") } - test("unanalyzable expression") { - val sub = spark.range(1).select($"id" === $"id".outer()) - - checkError( - intercept[AnalysisException](sub.schema), - condition = "UNANALYZABLE_EXPRESSION", - parameters = Map("expr" -> "\"outer(id)\""), - queryContext = - Array(ExpectedContext(fragment = "outer", callSitePattern = getCurrentClassCallSitePattern)) - ) - + test("noop outer()") { + checkAnswer(spark.range(1).select($"id".outer()), Row(0)) checkError( - intercept[AnalysisException](sub.encoder), - condition = "UNANALYZABLE_EXPRESSION", - parameters = Map("expr" -> "\"outer(id)\""), - queryContext = - Array(ExpectedContext(fragment = "outer", callSitePattern = getCurrentClassCallSitePattern)) + intercept[AnalysisException](spark.range(1).select($"outer_col".outer()).collect()), + "UNRESOLVED_COLUMN.WITH_SUGGESTION", + parameters = Map("objectName" -> "`outer_col`", "proposal" -> "`id`"), + context = ExpectedContext( + fragment = "$", + callSitePattern = getCurrentClassCallSitePattern) ) } @@ -148,6 +140,64 @@ class DataFrameSubquerySuite extends QueryTest with SharedSparkSession { } } + test("correlated scalar subquery in SELECT with outer() function") { + val df1 = spark.table("l").as("t1") + val df2 = spark.table("l").as("t2") + // We can use the `.outer()` function to wrap either the outer column, or the entire condition, + // or the SQL string of the condition. + Seq( + $"t1.a" === $"t2.a".outer(), + ($"t1.a" === $"t2.a").outer(), + expr("t1.a = t2.a").outer()).foreach { cond => + checkAnswer( + df1.select( + $"a", + df2.where(cond).select(sum($"b")).scalar().as("sum_b") + ), + sql("select a, (select sum(b) from l t1 where t1.a = t2.a) sum_b from l t2") + ) + } + } + + test("correlated scalar subquery in WHERE with outer() function") { + // We can use the `.outer()` function to wrap either the outer column, or the entire condition, + // or the SQL string of the condition. 
+ Seq( + $"a".outer() === $"c", + ($"a" === $"c").outer(), + expr("a = c").outer()).foreach { cond => + checkAnswer( + spark.table("l").where( + $"b" < spark.table("r").where(cond).select(max($"d")).scalar() + ), + sql("select * from l where b < (select max(d) from r where a = c)") + ) + } + } + + test("EXISTS predicate subquery with outer() function") { + // We can use the `.outer()` function to wrap either the outer column, or the entire condition, + // or the SQL string of the condition. + Seq( + $"a".outer() === $"c", + ($"a" === $"c").outer(), + expr("a = c").outer()).foreach { cond => + checkAnswer( + spark.table("l").where( + spark.table("r").where(cond).exists() + ), + sql("select * from l where exists (select * from r where l.a = r.c)") + ) + + checkAnswer( + spark.table("l").where( + spark.table("r").where(cond).exists() && $"a" <= lit(2) + ), + sql("select * from l where exists (select * from r where l.a = r.c) and l.a <= 2") + ) + } + } + test("SPARK-15677: Queries against local relations with scalar subquery in Select list") { withTempView("t1", "t2") { Seq((1, 1), (2, 2)).toDF("c1", "c2").createOrReplaceTempView("t1") @@ -192,22 +242,6 @@ class DataFrameSubquerySuite extends QueryTest with SharedSparkSession { } } - test("EXISTS predicate subquery") { - checkAnswer( - spark.table("l").where( - spark.table("r").where($"a".outer() === $"c").exists() - ), - sql("select * from l where exists (select * from r where l.a = r.c)") - ) - - checkAnswer( - spark.table("l").where( - spark.table("r").where($"a".outer() === $"c").exists() && $"a" <= lit(2) - ), - sql("select * from l where exists (select * from r where l.a = r.c) and l.a <= 2") - ) - } - test("NOT EXISTS predicate subquery") { checkAnswer( spark.table("l").where( @@ -244,32 +278,15 @@ class DataFrameSubquerySuite extends QueryTest with SharedSparkSession { ) } - test("correlated scalar subquery in where") { + test("correlated scalar subquery in select (null safe equal)") { + val df1 = 
spark.table("l").as("t1") + val df2 = spark.table("l").as("t2") checkAnswer( - spark.table("l").where( - $"b" < spark.table("r").where($"a".outer() === $"c").select(max($"d")).scalar() - ), - sql("select * from l where b < (select max(d) from r where a = c)") - ) - } - - test("correlated scalar subquery in select") { - checkAnswer( - spark.table("l").select( + df1.select( $"a", - spark.table("l").where($"a" === $"a".outer()).select(sum($"b")).scalar().as("sum_b") + df2.where($"t2.a" <=> $"t1.a".outer()).select(sum($"b")).scalar().as("sum_b") ), - sql("select a, (select sum(b) from l l2 where l2.a = l1.a) sum_b from l l1") - ) - } - - test("correlated scalar subquery in select (null safe)") { - checkAnswer( - spark.table("l").select( - $"a", - spark.table("l").where($"a" <=> $"a".outer()).select(sum($"b")).scalar().as("sum_b") - ), - sql("select a, (select sum(b) from l l2 where l2.a <=> l1.a) sum_b from l l1") + sql("select a, (select sum(b) from l t2 where t2.a <=> t1.a) sum_b from l t1") ) } @@ -300,10 +317,12 @@ class DataFrameSubquerySuite extends QueryTest with SharedSparkSession { } test("non-aggregated correlated scalar subquery") { + val df1 = spark.table("l").as("t1") + val df2 = spark.table("l").as("t2") val exception1 = intercept[SparkRuntimeException] { - spark.table("l").select( + df1.select( $"a", - spark.table("l").where($"a" === $"a".outer()).select($"b").scalar().as("sum_b") + df2.where($"t1.a" === $"t2.a".outer()).select($"b").scalar().as("sum_b") ).collect() } checkError( @@ -313,12 +332,14 @@ class DataFrameSubquerySuite extends QueryTest with SharedSparkSession { } test("non-equal correlated scalar subquery") { + val df1 = spark.table("l").as("t1") + val df2 = spark.table("l").as("t2") checkAnswer( - spark.table("l").select( + df1.select( $"a", - spark.table("l").where($"a" < $"a".outer()).select(sum($"b")).scalar().as("sum_b") + df2.where($"t2.a" < $"t1.a".outer()).select(sum($"b")).scalar().as("sum_b") ), - sql("select a, (select sum(b) from 
l l2 where l2.a < l1.a) sum_b from l l1") + sql("select a, (select sum(b) from l t2 where t2.a < t1.a) sum_b from l t1") ) } @@ -346,7 +367,7 @@ class DataFrameSubquerySuite extends QueryTest with SharedSparkSession { spark.table("l").select( $"a", spark.table("r").where($"c" === $"a").select(sum($"d")).scalar() - ).collect() + ) } checkError( exception1, @@ -355,35 +376,468 @@ class DataFrameSubquerySuite extends QueryTest with SharedSparkSession { queryContext = Array(ExpectedContext(fragment = "$", callSitePattern = getCurrentClassCallSitePattern)) ) + } - // Extra `outer()` - val exception2 = intercept[AnalysisException] { - spark.table("l").select( - $"a", - spark.table("r").where($"c".outer() === $"a".outer()).select(sum($"d")).scalar() - ).collect() + private def table1() = { + sql("CREATE VIEW t1(c1, c2) AS VALUES (0, 1), (1, 2)") + spark.table("t1") + } + + private def table2() = { + sql("CREATE VIEW t2(c1, c2) AS VALUES (0, 2), (0, 3)") + spark.table("t2") + } + + private def table3() = { + sql("CREATE VIEW t3(c1, c2) AS " + + "VALUES (0, ARRAY(0, 1)), (1, ARRAY(2)), (2, ARRAY()), (null, ARRAY(4))") + spark.table("t3") + } + + test("lateral join with single column select") { + withView("t1", "t2") { + val t1 = table1() + val t2 = table2() + + checkAnswer( + t1.lateralJoin(spark.range(1).select($"c1".outer())), + sql("SELECT * FROM t1, LATERAL (SELECT c1)") + ) + checkAnswer( + t1.lateralJoin(t2.select($"c1")), + sql("SELECT * FROM t1, LATERAL (SELECT c1 FROM t2)") + ) + checkAnswer( + t1.lateralJoin(t2.select($"t1.c1".outer())), + sql("SELECT * FROM t1, LATERAL (SELECT t1.c1 FROM t2)") + ) + checkAnswer( + t1.lateralJoin(t2.select($"t1.c1".outer() + $"t2.c1")), + sql("SELECT * FROM t1, LATERAL (SELECT t1.c1 + t2.c1 FROM t2)") + ) } - checkError( - exception2, - condition = "UNRESOLVED_COLUMN.WITH_SUGGESTION", - parameters = Map("objectName" -> "`c`", "proposal" -> "`a`, `b`"), - queryContext = - Array(ExpectedContext(fragment = "outer", callSitePattern = 
getCurrentClassCallSitePattern)) - ) + } - // Missing `outer()` for another outer - val exception3 = intercept[AnalysisException] { - spark.table("l").select( - $"a", - spark.table("r").where($"b" === $"a".outer()).select(sum($"d")).scalar() - ).collect() + test("lateral join with star expansion") { + withView("t1", "t2") { + val t1 = table1() + val t2 = table2() + + checkAnswer( + t1.lateralJoin(spark.range(1).select().select($"*")), + sql("SELECT * FROM t1, LATERAL (SELECT *)") + ) + checkAnswer( + t1.lateralJoin(t2.select($"*")), + sql("SELECT * FROM t1, LATERAL (SELECT * FROM t2)") + ) + checkAnswer( + t1.lateralJoin(t2.select($"t1.*".outer(), $"t2.*")), + sql("SELECT * FROM t1, LATERAL (SELECT t1.*, t2.* FROM t2)") + ) + checkAnswer( + t1.lateralJoin(t2.alias("t1").select($"t1.*")), + sql("SELECT * FROM t1, LATERAL (SELECT t1.* FROM t2 AS t1)") + ) + } + } + + test("lateral join with different join types") { + withView("t1") { + val t1 = table1() + + checkAnswer( + t1.lateralJoin( + spark.range(1).select(($"c1".outer() + $"c2".outer()).as("c3")), + $"c2" === $"c3"), + sql("SELECT * FROM t1 JOIN LATERAL (SELECT c1 + c2 AS c3) ON c2 = c3") + ) + checkAnswer( + t1.lateralJoin( + spark.range(1).select(($"c1".outer() + $"c2".outer()).as("c3")), + $"c2" === $"c3", + "left"), + sql("SELECT * FROM t1 LEFT JOIN LATERAL (SELECT c1 + c2 AS c3) ON c2 = c3") + ) + checkAnswer( + t1.lateralJoin( + spark.range(1).select(($"c1".outer() + $"c2".outer()).as("c3")), + "cross"), + sql("SELECT * FROM t1 CROSS JOIN LATERAL (SELECT c1 + c2 AS c3)") + ) + } + } + + test("lateral join with subquery alias") { + withView("t1") { + val t1 = table1() + + checkAnswer( + t1.lateralJoin(spark.range(1).select($"c1".outer(), $"c2".outer()).toDF("a", "b").as("s")) + .select("a", "b"), + sql("SELECT a, b FROM t1, LATERAL (SELECT c1, c2) s(a, b)") + ) + } + } + + test("lateral join with correlated equality / non-equality predicates") { + withView("t1", "t2") { + val t1 = table1() + val t2 = 
table2() + + checkAnswer( + t1.lateralJoin(t2.where($"t1.c1".outer() === $"t2.c1").select($"c2")), + sql("SELECT * FROM t1, LATERAL (SELECT c2 FROM t2 WHERE t1.c1 = t2.c1)") + ) + checkAnswer( + t1.lateralJoin(t2.where($"t1.c1".outer() < $"t2.c1").select($"c2")), + sql("SELECT * FROM t1, LATERAL (SELECT c2 FROM t2 WHERE t1.c1 < t2.c1)") + ) + } + } + + test("lateral join with aggregation and correlated non-equality predicates") { + withView("t1", "t2") { + val t1 = table1() + val t2 = table2() + + checkAnswer( + t1.lateralJoin(t2.where($"t1.c2".outer() < $"t2.c2").select(max($"c2").as("m"))), + sql("SELECT * FROM t1, LATERAL (SELECT max(c2) AS m FROM t2 WHERE t1.c2 < t2.c2)") + ) + } + } + + test("lateral join can reference preceding FROM clause items") { + withView("t1", "t2") { + val t1 = table1() + val t2 = table2() + + checkAnswer( + t1.join(t2).lateralJoin( + spark.range(1).select($"t1.c2".outer() + $"t2.c2".outer()) + ), + sql("SELECT * FROM t1 JOIN t2 JOIN LATERAL (SELECT t1.c2 + t2.c2)") + ) + } + } + + test("multiple lateral joins") { + withView("t1") { + val t1 = table1() + + checkAnswer( + t1.lateralJoin( + spark.range(1).select(($"c1".outer() + $"c2".outer()).as("a")) + ).lateralJoin( + spark.range(1).select(($"c1".outer() - $"c2".outer()).as("b")) + ).lateralJoin( + spark.range(1).select(($"a".outer() * $"b".outer()).as("c")) + ), + sql( + """ + |SELECT * FROM t1, + |LATERAL (SELECT c1 + c2 AS a), + |LATERAL (SELECT c1 - c2 AS b), + |LATERAL (SELECT a * b AS c) + |""".stripMargin) + ) + } + } + + test("lateral join in between regular joins") { + withView("t1", "t2") { + val t1 = table1() + val t2 = table2() + + checkAnswer( + t1.lateralJoin( + t2.where($"t1.c1".outer() === $"t2.c1").select($"c2").as("s"), "left" + ).join(t1.as("t3"), $"s.c2" === $"t3.c2", "left"), + sql( + """ + |SELECT * FROM t1 + |LEFT OUTER JOIN LATERAL (SELECT c2 FROM t2 WHERE t1.c1 = t2.c1) s + |LEFT OUTER JOIN t1 t3 ON s.c2 = t3.c2 + |""".stripMargin) + ) + } + } + + test("nested 
lateral joins") { + withView("t1", "t2") { + val t1 = table1() + val t2 = table2() + + checkAnswer( + t1.lateralJoin( + t2.lateralJoin(spark.range(1).select($"c1".outer())) + ), + sql("SELECT * FROM t1, LATERAL (SELECT * FROM t2, LATERAL (SELECT c1))") + ) + checkAnswer( + t1.lateralJoin( + spark.range(1).select(($"c1".outer() + lit(1)).as("c1")) + .lateralJoin(spark.range(1).select($"c1".outer())) + ), + sql("SELECT * FROM t1, LATERAL (SELECT * FROM (SELECT c1 + 1 AS c1), LATERAL (SELECT c1))") + ) + } + } + + test("scalar subquery inside lateral join") { + withView("t1", "t2") { + val t1 = table1() + val t2 = table2() + + // uncorrelated + checkAnswer( + t1.lateralJoin( + spark.range(1).select( + $"c2".outer(), + t2.select(min($"c2")).scalar() + ) + ), + sql("SELECT * FROM t1, LATERAL (SELECT c2, (SELECT MIN(c2) FROM t2))") + ) + + // correlated + checkAnswer( + t1.lateralJoin( + spark.range(1).select($"c1".outer().as("a")) + .select(t2.where($"c1" === $"a".outer()).select(sum($"c2")).scalar()) + ), + sql( + """ + |SELECT * FROM t1, LATERAL ( + | SELECT (SELECT SUM(c2) FROM t2 WHERE c1 = a) FROM (SELECT c1 AS a) + |) + |""".stripMargin) + ) + } + } + + test("lateral join inside subquery") { + withView("t1", "t2") { + val t1 = table1() + val t2 = table2() + + // uncorrelated + checkAnswer( + t1.where( + $"c1" === t2.lateralJoin( + spark.range(1).select($"c1".outer().as("a"))).select(min($"a") + ).scalar() + ), + sql("SELECT * FROM t1 WHERE c1 = (SELECT MIN(a) FROM t2, LATERAL (SELECT c1 AS a))") + ) + // correlated + checkAnswer( + t1.where( + $"c1" === t2.lateralJoin( + spark.range(1).select($"c1".outer().as("a"))) + .where($"c1" === $"t1.c1".outer()) + .select(min($"a")) + .scalar() + ), + sql("SELECT * FROM t1 " + + "WHERE c1 = (SELECT MIN(a) FROM t2, LATERAL (SELECT c1 AS a) WHERE c1 = t1.c1)") + ) + } + } + + test("lateral join with table-valued functions") { + withView("t1", "t3") { + val t1 = table1() + val t3 = table3() + + checkAnswer( + 
t1.lateralJoin(spark.tvf.range(3)), + sql("SELECT * FROM t1, LATERAL RANGE(3)") + ) + checkAnswer( + t1.lateralJoin(spark.tvf.explode(array($"c1".outer(), $"c2".outer()))), + sql("SELECT * FROM t1, LATERAL EXPLODE(ARRAY(c1, c2)) t2(c3)") + ) + checkAnswer( + t3.lateralJoin(spark.tvf.explode_outer($"c2".outer())), + sql("SELECT * FROM t3, LATERAL EXPLODE_OUTER(c2) t2(v)") + ) + checkAnswer( + spark.tvf.explode(array(lit(1), lit(2))).toDF("v") + .lateralJoin(spark.range(1).select($"v".outer() + 1)), + sql("SELECT * FROM EXPLODE(ARRAY(1, 2)) t(v), LATERAL (SELECT v + 1)") + ) + } + } + + test("lateral join with table-valued functions and join conditions") { + withView("t1", "t3") { + val t1 = table1() + val t3 = table3() + + checkAnswer( + t1.lateralJoin( + spark.tvf.explode(array($"c1".outer(), $"c2".outer())), + $"c1" === $"col" + ), + sql("SELECT * FROM t1 JOIN LATERAL EXPLODE(ARRAY(c1, c2)) t(c3) ON t1.c1 = c3") + ) + checkAnswer( + t3.lateralJoin( + spark.tvf.explode($"c2".outer()), + $"c1" === $"col" + ), + sql("SELECT * FROM t3 JOIN LATERAL EXPLODE(c2) t(c3) ON t3.c1 = c3") + ) + checkAnswer( + t3.lateralJoin( + spark.tvf.explode($"c2".outer()), + $"c1" === $"col", + "left" + ), + sql("SELECT * FROM t3 LEFT JOIN LATERAL EXPLODE(c2) t(c3) ON t3.c1 = c3") + ) + } + } + + test("subquery with generator / table-valued functions") { + withView("t1") { + val t1 = table1() + + checkAnswer( + spark.range(1).select(explode(t1.select(collect_list("c2")).scalar())), + sql("SELECT EXPLODE((SELECT COLLECT_LIST(c2) FROM t1))") + ) + checkAnswer( + spark.tvf.explode(t1.select(collect_list("c2")).scalar()), + sql("SELECT * FROM EXPLODE((SELECT COLLECT_LIST(c2) FROM t1))") + ) + } + } + + test("subquery in join condition") { + withView("t1", "t2") { + val t1 = table1() + val t2 = table2() + + checkAnswer( + t1.join(t2, $"t1.c1" === t1.select(max("c1")).scalar()), + sql("SELECT * FROM t1 JOIN t2 ON t1.c1 = (SELECT MAX(c1) FROM t1)") + ) + } + } + + test("subquery in unpivot") { + 
withView("t1", "t2") { + val t1 = table1() + val t2 = table2() + + checkError( + intercept[AnalysisException] { + t1.unpivot(Array(t2.exists()), "c1", "c2").collect() + }, + "UNSUPPORTED_SUBQUERY_EXPRESSION_CATEGORY.UNSUPPORTED_IN_EXISTS_SUBQUERY", + parameters = Map("treeNode" -> "(?s)'Unpivot.*"), + matchPVals = true, + queryContext = Array(ExpectedContext( + fragment = "exists", + callSitePattern = getCurrentClassCallSitePattern)) + ) + checkError( + intercept[AnalysisException] { + t1.unpivot(Array($"c1"), Array(t2.exists()), "c1", "c2").collect() + }, + "UNSUPPORTED_SUBQUERY_EXPRESSION_CATEGORY.UNSUPPORTED_IN_EXISTS_SUBQUERY", + parameters = Map("treeNode" -> "(?s)Expand.*"), + matchPVals = true, + queryContext = Array(ExpectedContext( + fragment = "exists", + callSitePattern = getCurrentClassCallSitePattern)) + ) + } + } + + test("subquery in transpose") { + withView("t1") { + val t1 = table1() + + checkError( + intercept[AnalysisException] { + t1.transpose(t1.select(max("c1")).scalar()).collect() + }, + "TRANSPOSE_INVALID_INDEX_COLUMN", + parameters = Map("reason" -> "Index column must be an atomic attribute") + ) + } + } + + test("subquery in withColumns") { + withView("t1") { + val t1 = table1() + + checkAnswer( + t1.withColumn( + "scalar", + spark + .range(1) + .select($"c1".outer() + $"c2".outer()) + .scalar()), + t1.select($"*", ($"c1" + $"c2").as("scalar"))) + + checkAnswer( + t1.withColumn( + "scalar", + spark + .range(1) + .withColumn("c1", $"c1".outer()) + .select($"c1" + $"c2".outer()) + .scalar()), + t1.select($"*", ($"c1" + $"c2").as("scalar"))) + + checkAnswer( + t1.withColumn( + "scalar", + spark + .range(1) + .select($"c1".outer().as("c1")) + .withColumn("c2", $"c2".outer()) + .select($"c1" + $"c2") + .scalar()), + t1.select($"*", ($"c1" + $"c2").as("scalar"))) + } + } + + test("subquery in withColumnsRenamed") { + withView("t1") { + val t1 = table1() + + checkAnswer( + t1.withColumn( + "scalar", + spark + .range(1) + 
.select($"c1".outer().as("c1"), $"c2".outer().as("c2")) + .withColumnsRenamed(Map("c1" -> "x", "c2" -> "y")) + .select($"x" + $"y") + .scalar()), + t1.select($"*", ($"c1".as("x") + $"c2".as("y")).as("scalar"))) + } + } + + test("subquery in drop") { + withView("t1") { + val t1 = table1() + + checkAnswer(t1.drop(spark.range(1).select(lit("c1")).scalar()), t1) + } + } + + test("subquery in repartition") { + withView("t1") { + val t1 = table1() + + checkAnswer(t1.repartition(spark.range(1).select(lit(1)).scalar()), t1) } - checkError( - exception3, - condition = "UNRESOLVED_COLUMN.WITH_SUGGESTION", - parameters = Map("objectName" -> "`b`", "proposal" -> "`c`, `d`"), - queryContext = - Array(ExpectedContext(fragment = "$", callSitePattern = getCurrentClassCallSitePattern)) - ) } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala index ff251ddbbfb52..71d55b007aa17 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala @@ -43,7 +43,6 @@ import org.apache.spark.sql.execution.aggregate.HashAggregateExec import org.apache.spark.sql.execution.exchange.{BroadcastExchangeExec, ReusedExchangeExec, ShuffleExchangeExec, ShuffleExchangeLike} import org.apache.spark.sql.expressions.{Aggregator, Window} import org.apache.spark.sql.functions._ -import org.apache.spark.sql.internal.ExpressionUtils.column import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.test.{ExamplePoint, ExamplePointUDT, SharedSparkSession} import org.apache.spark.sql.test.SQLTestData.{ArrayStringWrapper, ContainerStringWrapper, StringWrapper, TestData2} @@ -309,6 +308,69 @@ class DataFrameSuite extends QueryTest testData.select("key").collect().toSeq) } + test("SPARK-50503 - cannot partition by variant columns") { + val df = sql("select parse_json(case when id = 0 then 'null' else '1' end)" + + " as v, 
id % 5 as id, named_struct('v', parse_json(id::string)) s from range(0, 100, 1, 5)") + // variant column + checkError( + exception = intercept[AnalysisException](df.repartition(5, col("v"))), + condition = "UNSUPPORTED_FEATURE.PARTITION_BY_VARIANT", + parameters = Map( + "expr" -> "\"v\"", + "dataType" -> "\"VARIANT\"") + ) + // nested variant column + checkError( + exception = intercept[AnalysisException](df.repartition(5, col("s"))), + condition = "UNSUPPORTED_FEATURE.PARTITION_BY_VARIANT", + parameters = Map( + "expr" -> "\"s\"", + "dataType" -> "\"STRUCT\"") + ) + // variant producing expression + checkError( + exception = + intercept[AnalysisException](df.repartition(5, parse_json(col("id").cast("string")))), + condition = "UNSUPPORTED_FEATURE.PARTITION_BY_VARIANT", + parameters = Map( + "expr" -> "\"parse_json(CAST(id AS STRING))\"", + "dataType" -> "\"VARIANT\"") + ) + // Partitioning by non-variant column works + try { + df.repartition(5, col("id")).collect() + } catch { + case e: Exception => + fail(s"Expected no exception to be thrown but an exception was thrown: ${e.getMessage}") + } + // SQL + withTempView("tv") { + df.createOrReplaceTempView("tv") + checkError( + exception = intercept[AnalysisException](sql("SELECT * FROM tv DISTRIBUTE BY v")), + condition = "UNSUPPORTED_FEATURE.PARTITION_BY_VARIANT", + parameters = Map( + "expr" -> "\"v\"", + "dataType" -> "\"VARIANT\""), + context = ExpectedContext( + fragment = "DISTRIBUTE BY v", + start = 17, + stop = 31) + ) + checkError( + exception = intercept[AnalysisException](sql("SELECT * FROM tv DISTRIBUTE BY s")), + condition = "UNSUPPORTED_FEATURE.PARTITION_BY_VARIANT", + parameters = Map( + "expr" -> "\"s\"", + "dataType" -> "\"STRUCT\""), + context = ExpectedContext( + fragment = "DISTRIBUTE BY s", + start = 17, + stop = 31) + ) + } + } + test("repartition with SortOrder") { // passing SortOrder expressions to .repartition() should result in an informative error @@ -366,6 +428,35 @@ class DataFrameSuite 
extends QueryTest } } + test("repartition by MapType") { + Seq("int", "long", "float", "double", "decimal(10, 2)", "string", "varchar(6)").foreach { dt => + withSQLConf(SQLConf.ANSI_ENABLED.key -> "true") { + val df = spark.range(20) + .withColumn("c1", + when(col("id") % 3 === 1, typedLit(Map(1 -> 1))) + .when(col("id") % 3 === 2, typedLit(Map(1 -> 1, 2 -> 2))) + .otherwise(typedLit(Map(2 -> 2, 1 -> 1))).cast(s"map<$dt, $dt>")) + .withColumn("c2", typedLit(Map(1 -> null)).cast(s"map<$dt, $dt>")) + .withColumn("c3", lit(null).cast(s"map<$dt, $dt>")) + + assertPartitionNumber(df.repartition(4, col("c1")), 2) + assertPartitionNumber(df.repartition(4, col("c2")), 1) + assertPartitionNumber(df.repartition(4, col("c3")), 1) + assertPartitionNumber(df.repartition(4, col("c1"), col("c2")), 2) + assertPartitionNumber(df.repartition(4, col("c1"), col("c3")), 2) + assertPartitionNumber(df.repartition(4, col("c1"), col("c2"), col("c3")), 2) + assertPartitionNumber(df.repartition(4, col("c2"), col("c3")), 2) + } + } + } + + private def assertPartitionNumber(df: => DataFrame, max: Int): Unit = { + val dfGrouped = df.groupBy(spark_partition_id()).count() + // Result number of partition can be lower or equal to max, + // but no more than that. 
+ assert(dfGrouped.count() <= max, dfGrouped.queryExecution.simpleString) + } + test("coalesce") { intercept[IllegalArgumentException] { testData.select("key").coalesce(0) @@ -1567,7 +1658,7 @@ class DataFrameSuite extends QueryTest test("SPARK-46794: exclude subqueries from LogicalRDD constraints") { withTempDir { checkpointDir => val subquery = - column(ScalarSubquery(spark.range(10).selectExpr("max(id)").logicalPlan)) + Column(ScalarSubquery(spark.range(10).selectExpr("max(id)").logicalPlan)) val df = spark.range(1000).filter($"id" === subquery) assert(df.logicalPlan.constraints.exists(_.exists(_.isInstanceOf[ScalarSubquery]))) @@ -2054,18 +2145,18 @@ class DataFrameSuite extends QueryTest // the number of keys must match val exception1 = intercept[IllegalArgumentException] { df1.groupBy($"key1", $"key2").flatMapCoGroupsInPandas( - df2.groupBy($"key2"), flatMapCoGroupsInPandasUDF) + df2.groupBy($"key2"), Column(flatMapCoGroupsInPandasUDF)) } assert(exception1.getMessage.contains("Cogroup keys must have same size: 2 != 1")) val exception2 = intercept[IllegalArgumentException] { df1.groupBy($"key1").flatMapCoGroupsInPandas( - df2.groupBy($"key1", $"key2"), flatMapCoGroupsInPandasUDF) + df2.groupBy($"key1", $"key2"), Column(flatMapCoGroupsInPandasUDF)) } assert(exception2.getMessage.contains("Cogroup keys must have same size: 1 != 2")) // but different keys are allowed val actual = df1.groupBy($"key1").flatMapCoGroupsInPandas( - df2.groupBy($"key2"), flatMapCoGroupsInPandasUDF) + df2.groupBy($"key2"), Column(flatMapCoGroupsInPandasUDF)) // can't evaluate the DataFrame as there is no PythonFunction given assert(actual != null) } @@ -2419,7 +2510,7 @@ class DataFrameSuite extends QueryTest | SELECT a, b FROM (SELECT a, b FROM VALUES (1, 2) AS t(a, b)) |) |""".stripMargin) - val stringCols = df.logicalPlan.output.map(column(_).cast(StringType)) + val stringCols = df.logicalPlan.output.map(Column(_).cast(StringType)) val castedDf = df.select(stringCols: _*) 
checkAnswer(castedDf, Row("1", "1") :: Row("1", "2") :: Nil) } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameTableValuedFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameTableValuedFunctionsSuite.scala index c2f53ff56d1aa..637e0cf964fe5 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameTableValuedFunctionsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameTableValuedFunctionsSuite.scala @@ -21,6 +21,7 @@ import org.apache.spark.sql.functions._ import org.apache.spark.sql.test.SharedSparkSession class DataFrameTableValuedFunctionsSuite extends QueryTest with SharedSparkSession { + import testImplicits._ test("explode") { val actual1 = spark.tvf.explode(array(lit(1), lit(2))) @@ -50,6 +51,30 @@ class DataFrameTableValuedFunctionsSuite extends QueryTest with SharedSparkSessi checkAnswer(actual6, expected6) } + test("explode - lateral join") { + withView("t1", "t3") { + sql("CREATE VIEW t1(c1, c2) AS VALUES (0, 1), (1, 2)") + sql("CREATE VIEW t3(c1, c2) AS " + + "VALUES (0, ARRAY(0, 1)), (1, ARRAY(2)), (2, ARRAY()), (null, ARRAY(4))") + val t1 = spark.table("t1") + val t3 = spark.table("t3") + + checkAnswer( + t1.lateralJoin(spark.tvf.explode(array($"c1".outer(), $"c2".outer())).toDF("c3").as("t2")), + sql("SELECT * FROM t1, LATERAL EXPLODE(ARRAY(c1, c2)) t2(c3)") + ) + checkAnswer( + t3.lateralJoin(spark.tvf.explode($"c2".outer()).toDF("v").as("t2")), + sql("SELECT * FROM t3, LATERAL EXPLODE(c2) t2(v)") + ) + checkAnswer( + spark.tvf.explode(array(lit(1), lit(2))).toDF("v") + .lateralJoin(spark.range(1).select($"v".outer() + lit(1))), + sql("SELECT * FROM EXPLODE(ARRAY(1, 2)) t(v), LATERAL (SELECT v + 1)") + ) + } + } + test("explode_outer") { val actual1 = spark.tvf.explode_outer(array(lit(1), lit(2))) val expected1 = spark.sql("SELECT * FROM explode_outer(array(1, 2))") @@ -78,6 +103,31 @@ class DataFrameTableValuedFunctionsSuite extends QueryTest with SharedSparkSessi 
checkAnswer(actual6, expected6) } + test("explode_outer - lateral join") { + withView("t1", "t3") { + sql("CREATE VIEW t1(c1, c2) AS VALUES (0, 1), (1, 2)") + sql("CREATE VIEW t3(c1, c2) AS " + + "VALUES (0, ARRAY(0, 1)), (1, ARRAY(2)), (2, ARRAY()), (null, ARRAY(4))") + val t1 = spark.table("t1") + val t3 = spark.table("t3") + + checkAnswer( + t1.lateralJoin( + spark.tvf.explode_outer(array($"c1".outer(), $"c2".outer())).toDF("c3").as("t2")), + sql("SELECT * FROM t1, LATERAL EXPLODE_OUTER(ARRAY(c1, c2)) t2(c3)") + ) + checkAnswer( + t3.lateralJoin(spark.tvf.explode_outer($"c2".outer()).toDF("v").as("t2")), + sql("SELECT * FROM t3, LATERAL EXPLODE_OUTER(c2) t2(v)") + ) + checkAnswer( + spark.tvf.explode_outer(array(lit(1), lit(2))).toDF("v") + .lateralJoin(spark.range(1).select($"v".outer() + lit(1))), + sql("SELECT * FROM EXPLODE_OUTER(ARRAY(1, 2)) t(v), LATERAL (SELECT v + 1)") + ) + } + } + test("inline") { val actual1 = spark.tvf.inline(array(struct(lit(1), lit("a")), struct(lit(2), lit("b")))) val expected1 = spark.sql("SELECT * FROM inline(array(struct(1, 'a'), struct(2, 'b')))") @@ -98,6 +148,32 @@ class DataFrameTableValuedFunctionsSuite extends QueryTest with SharedSparkSessi checkAnswer(actual3, expected3) } + test("inline - lateral join") { + withView("array_struct") { + sql( + """ + |CREATE VIEW array_struct(id, arr) AS VALUES + | (1, ARRAY(STRUCT(1, 'a'), STRUCT(2, 'b'))), + | (2, ARRAY()), + | (3, ARRAY(STRUCT(3, 'c'))) + |""".stripMargin) + val arrayStruct = spark.table("array_struct") + + checkAnswer( + arrayStruct.lateralJoin(spark.tvf.inline($"arr".outer())), + sql("SELECT * FROM array_struct JOIN LATERAL INLINE(arr)") + ) + checkAnswer( + arrayStruct.lateralJoin( + spark.tvf.inline($"arr".outer()).toDF("k", "v").as("t"), + $"id" === $"k", + "left" + ), + sql("SELECT * FROM array_struct LEFT JOIN LATERAL INLINE(arr) t(k, v) ON id = k") + ) + } + } + test("inline_outer") { val actual1 = spark.tvf.inline_outer(array(struct(lit(1), lit("a")), 
struct(lit(2), lit("b")))) val expected1 = spark.sql("SELECT * FROM inline_outer(array(struct(1, 'a'), struct(2, 'b')))") @@ -118,6 +194,32 @@ class DataFrameTableValuedFunctionsSuite extends QueryTest with SharedSparkSessi checkAnswer(actual3, expected3) } + test("inline_outer - lateral join") { + withView("array_struct") { + sql( + """ + |CREATE VIEW array_struct(id, arr) AS VALUES + | (1, ARRAY(STRUCT(1, 'a'), STRUCT(2, 'b'))), + | (2, ARRAY()), + | (3, ARRAY(STRUCT(3, 'c'))) + |""".stripMargin) + val arrayStruct = spark.table("array_struct") + + checkAnswer( + arrayStruct.lateralJoin(spark.tvf.inline_outer($"arr".outer())), + sql("SELECT * FROM array_struct JOIN LATERAL INLINE_OUTER(arr)") + ) + checkAnswer( + arrayStruct.lateralJoin( + spark.tvf.inline_outer($"arr".outer()).toDF("k", "v").as("t"), + $"id" === $"k", + "left" + ), + sql("SELECT * FROM array_struct LEFT JOIN LATERAL INLINE_OUTER(arr) t(k, v) ON id = k") + ) + } + } + test("json_tuple") { val actual = spark.tvf.json_tuple(lit("""{"a":1,"b":2}"""), lit("a"), lit("b")) val expected = spark.sql("""SELECT * FROM json_tuple('{"a":1,"b":2}', 'a', 'b')""") @@ -130,6 +232,43 @@ class DataFrameTableValuedFunctionsSuite extends QueryTest with SharedSparkSessi assert(ex.messageParameters("functionName") == "`json_tuple`") } + test("json_tuple - lateral join") { + withView("json_table") { + sql( + """ + |CREATE OR REPLACE TEMP VIEW json_table(key, jstring) AS VALUES + | ('1', '{"f1": "1", "f2": "2", "f3": 3, "f5": 5.23}'), + | ('2', '{"f1": "1", "f3": "3", "f2": 2, "f4": 4.01}'), + | ('3', '{"f1": 3, "f4": "4", "f3": "3", "f2": 2, "f5": 5.01}'), + | ('4', cast(null as string)), + | ('5', '{"f1": null, "f5": ""}'), + | ('6', '[invalid JSON string]') + |""".stripMargin) + val jsonTable = spark.table("json_table") + + checkAnswer( + jsonTable.as("t1").lateralJoin( + spark.tvf.json_tuple( + $"t1.jstring".outer(), + lit("f1"), lit("f2"), lit("f3"), lit("f4"), lit("f5")).as("t2") + ).select($"t1.key", $"t2.*"), + 
sql("SELECT t1.key, t2.* FROM json_table t1, " + + "LATERAL json_tuple(t1.jstring, 'f1', 'f2', 'f3', 'f4', 'f5') t2") + ) + checkAnswer( + jsonTable.as("t1").lateralJoin( + spark.tvf.json_tuple( + $"jstring".outer(), + lit("f1"), lit("f2"), lit("f3"), lit("f4"), lit("f5")).as("t2") + ).where($"t2.c0".isNotNull) + .select($"t1.key", $"t2.*"), + sql("SELECT t1.key, t2.* FROM json_table t1, " + + "LATERAL json_tuple(t1.jstring, 'f1', 'f2', 'f3', 'f4', 'f5') t2 " + + "WHERE t2.c0 IS NOT NULL") + ) + } + } + test("posexplode") { val actual1 = spark.tvf.posexplode(array(lit(1), lit(2))) val expected1 = spark.sql("SELECT * FROM posexplode(array(1, 2))") @@ -158,6 +297,30 @@ class DataFrameTableValuedFunctionsSuite extends QueryTest with SharedSparkSessi checkAnswer(actual6, expected6) } + test("posexplode - lateral join") { + withView("t1", "t3") { + sql("CREATE VIEW t1(c1, c2) AS VALUES (0, 1), (1, 2)") + sql("CREATE VIEW t3(c1, c2) AS " + + "VALUES (0, ARRAY(0, 1)), (1, ARRAY(2)), (2, ARRAY()), (null, ARRAY(4))") + val t1 = spark.table("t1") + val t3 = spark.table("t3") + + checkAnswer( + t1.lateralJoin(spark.tvf.posexplode(array($"c1".outer(), $"c2".outer()))), + sql("SELECT * FROM t1, LATERAL POSEXPLODE(ARRAY(c1, c2))") + ) + checkAnswer( + t3.lateralJoin(spark.tvf.posexplode($"c2".outer())), + sql("SELECT * FROM t3, LATERAL POSEXPLODE(c2)") + ) + checkAnswer( + spark.tvf.posexplode(array(lit(1), lit(2))).toDF("p", "v") + .lateralJoin(spark.range(1).select($"v".outer() + lit(1))), + sql("SELECT * FROM POSEXPLODE(ARRAY(1, 2)) t(p, v), LATERAL (SELECT v + 1)") + ) + } + } + test("posexplode_outer") { val actual1 = spark.tvf.posexplode_outer(array(lit(1), lit(2))) val expected1 = spark.sql("SELECT * FROM posexplode_outer(array(1, 2))") @@ -186,12 +349,66 @@ class DataFrameTableValuedFunctionsSuite extends QueryTest with SharedSparkSessi checkAnswer(actual6, expected6) } + test("posexplode_outer - lateral join") { + withView("t1", "t3") { + sql("CREATE VIEW t1(c1, c2) AS 
VALUES (0, 1), (1, 2)") + sql("CREATE VIEW t3(c1, c2) AS " + + "VALUES (0, ARRAY(0, 1)), (1, ARRAY(2)), (2, ARRAY()), (null, ARRAY(4))") + val t1 = spark.table("t1") + val t3 = spark.table("t3") + + checkAnswer( + t1.lateralJoin(spark.tvf.posexplode_outer(array($"c1".outer(), $"c2".outer()))), + sql("SELECT * FROM t1, LATERAL POSEXPLODE_OUTER(ARRAY(c1, c2))") + ) + checkAnswer( + t3.lateralJoin(spark.tvf.posexplode_outer($"c2".outer())), + sql("SELECT * FROM t3, LATERAL POSEXPLODE_OUTER(c2)") + ) + checkAnswer( + spark.tvf.posexplode_outer(array(lit(1), lit(2))).toDF("p", "v") + .lateralJoin(spark.range(1).select($"v".outer() + lit(1))), + sql("SELECT * FROM POSEXPLODE_OUTER(ARRAY(1, 2)) t(p, v), LATERAL (SELECT v + 1)") + ) + } + } + test("stack") { val actual = spark.tvf.stack(lit(2), lit(1), lit(2), lit(3)) val expected = spark.sql("SELECT * FROM stack(2, 1, 2, 3)") checkAnswer(actual, expected) } + test("stack - lateral join") { + withView("t1", "t3") { + sql("CREATE VIEW t1(c1, c2) AS VALUES (0, 1), (1, 2)") + sql("CREATE VIEW t3(c1, c2) AS " + + "VALUES (0, ARRAY(0, 1)), (1, ARRAY(2)), (2, ARRAY()), (null, ARRAY(4))") + val t1 = spark.table("t1") + val t3 = spark.table("t3") + + checkAnswer( + t1.lateralJoin( + spark.tvf.stack(lit(2), lit("Key"), $"c1".outer(), lit("Value"), $"c2".outer()).as("t") + ).select($"t.*"), + sql("SELECT t.* FROM t1, LATERAL stack(2, 'Key', c1, 'Value', c2) t") + ) + checkAnswer( + t1.lateralJoin( + spark.tvf.stack(lit(1), $"c1".outer(), $"c2".outer()).toDF("x", "y").as("t") + ).select($"t.*"), + sql("SELECT t.* FROM t1 JOIN LATERAL stack(1, c1, c2) t(x, y)") + ) + checkAnswer( + t1.join(t3, $"t1.c1" === $"t3.c1") + .lateralJoin( + spark.tvf.stack(lit(1), $"t1.c2".outer(), $"t3.c2".outer()).as("t") + ).select($"t.*"), + sql("SELECT t.* FROM t1 JOIN t3 ON t1.c1 = t3.c1 JOIN LATERAL stack(1, t1.c2, t3.c2) t") + ) + } + } + test("collations") { val actual = spark.tvf.collations() val expected = spark.sql("SELECT * FROM collations()") 
@@ -235,6 +452,28 @@ class DataFrameTableValuedFunctionsSuite extends QueryTest with SharedSparkSessi checkAnswer(actual6, expected6) } + test("variant_explode - lateral join") { + withView("variant_table") { + sql( + """ + |CREATE VIEW variant_table(id, v) AS + |SELECT id, parse_json(v) AS v FROM VALUES + |(0, '["hello", "world"]'), (1, '{"a": true, "b": 3.14}'), + |(2, '[]'), (3, '{}'), + |(4, NULL), (5, '1') + |AS t(id, v) + |""".stripMargin) + val variantTable = spark.table("variant_table") + + checkAnswer( + variantTable.as("t1").lateralJoin( + spark.tvf.variant_explode($"v".outer()).as("t") + ).select($"t1.id", $"t.*"), + sql("SELECT t1.id, t.* FROM variant_table AS t1, LATERAL variant_explode(v) AS t") + ) + } + } + test("variant_explode_outer") { val actual1 = spark.tvf.variant_explode_outer(parse_json(lit("""["hello", "world"]"""))) val expected1 = spark.sql( @@ -265,4 +504,26 @@ class DataFrameTableValuedFunctionsSuite extends QueryTest with SharedSparkSessi val expected6 = spark.sql("SELECT * FROM variant_explode_outer(parse_json('1'))") checkAnswer(actual6, expected6) } + + test("variant_explode_outer - lateral join") { + withView("variant_table") { + sql( + """ + |CREATE VIEW variant_table(id, v) AS + |SELECT id, parse_json(v) AS v FROM VALUES + |(0, '["hello", "world"]'), (1, '{"a": true, "b": 3.14}'), + |(2, '[]'), (3, '{}'), + |(4, NULL), (5, '1') + |AS t(id, v) + |""".stripMargin) + val variantTable = spark.table("variant_table") + + checkAnswer( + variantTable.as("t1").lateralJoin( + spark.tvf.variant_explode_outer($"v".outer()).as("t") + ).select($"t1.id", $"t.*"), + sql("SELECT t1.id, t.* FROM variant_table AS t1, LATERAL variant_explode_outer(v) AS t") + ) + } + } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameTransposeSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameTransposeSuite.scala index 51de8553216c6..ce1c8d7ceb64a 100644 --- 
a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameTransposeSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameTransposeSuite.scala @@ -17,6 +17,7 @@ package org.apache.spark.sql +import org.apache.spark.sql.functions._ import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.test.SharedSparkSession import org.apache.spark.sql.types._ @@ -173,4 +174,20 @@ class DataFrameTransposeSuite extends QueryTest with SharedSparkSession { ) assertResult(Array("key", "A", "B"))(transposedDf.columns) } + + test("SPARK-50602: invalid index columns") { + val df = Seq( + ("A", 1, 2), + ("B", 3, 4), + (null, 5, 6) + ).toDF("id", "val1", "val2") + + checkError( + exception = intercept[AnalysisException] { + df.transpose($"id" + lit(1)) + }, + condition = "TRANSPOSE_INVALID_INDEX_COLUMN", + parameters = Map("reason" -> "Index column must be an atomic attribute") + ) + } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameWindowFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameWindowFunctionsSuite.scala index 8a86aa10887c0..01e72daead440 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameWindowFunctionsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameWindowFunctionsSuite.scala @@ -29,7 +29,6 @@ import org.apache.spark.sql.execution.exchange.{ENSURE_REQUIREMENTS, Exchange, S import org.apache.spark.sql.execution.window.WindowExec import org.apache.spark.sql.expressions.{Aggregator, MutableAggregationBuffer, UserDefinedAggregateFunction, Window} import org.apache.spark.sql.functions._ -import org.apache.spark.sql.internal.ExpressionUtils.column import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.test.SharedSparkSession import org.apache.spark.sql.types._ @@ -862,7 +861,7 @@ class DataFrameWindowFunctionsSuite extends QueryTest lead($"value", 2, null, true).over(window), lead($"value", 3, null, true).over(window), lead(concat($"value", $"key"), 
1, null, true).over(window), - column(Lag($"value".expr, NonFoldableLiteral(1), Literal(null), true)).over(window), + Column(Lag($"value".expr, NonFoldableLiteral(1), Literal(null), true)).over(window), lag($"value", 2).over(window), lag($"value", 0, null, true).over(window), lag($"value", 1, null, true).over(window), diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DatasetCacheSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DatasetCacheSuite.scala index bda8c7f26082f..9d8aaf8d90e32 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DatasetCacheSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DatasetCacheSuite.scala @@ -25,6 +25,7 @@ import org.apache.spark.sql.execution.columnar.{InMemoryRelation, InMemoryTableS import org.apache.spark.sql.functions._ import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.test.SharedSparkSession +import org.apache.spark.sql.types.Metadata import org.apache.spark.storage.StorageLevel import org.apache.spark.tags.SlowSQLTest @@ -312,4 +313,19 @@ class DatasetCacheSuite extends QueryTest } } } + + test("SPARK-50682: inner Alias should be canonicalized") { + // Put a metadata in the Alias so that it won't be removed by the analyzer. + val metadata = Metadata.fromJson("""{"k": "v"}""") + val df1 = spark.range(5).select(struct($"id".as("name", metadata))) + df1.cache() + // This is exactly the same as df1. 
+ val df2 = spark.range(5).select(struct($"id".as("name", metadata))) + assert(df2.queryExecution.executedPlan.exists(_.isInstanceOf[InMemoryTableScanExec])) + + val metadata2 = Metadata.fromJson("""{"k2": "v2"}""") + // Same with df1 except for the Alias metadata + val df3 = spark.range(5).select(struct($"id".as("name", metadata2))) + assert(!df3.queryExecution.executedPlan.exists(_.isInstanceOf[InMemoryTableScanExec])) + } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DatasetOptimizationSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DatasetOptimizationSuite.scala index 81d7de856f881..5db3990b67c8b 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DatasetOptimizationSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DatasetOptimizationSuite.scala @@ -184,16 +184,18 @@ class DatasetOptimizationSuite extends QueryTest with SharedSparkSession { assert(count3 == count2) } - withClue("array type") { - checkCodegenCache(() => Seq(Seq("abc")).toDS()) - } + withSQLConf(SQLConf.ARTIFACTS_SESSION_ISOLATION_ALWAYS_APPLY_CLASSLOADER.key -> "true") { + withClue("array type") { + checkCodegenCache(() => Seq(Seq("abc")).toDS()) + } - withClue("map type") { - checkCodegenCache(() => Seq(Map("abc" -> 1)).toDS()) - } + withClue("map type") { + checkCodegenCache(() => Seq(Map("abc" -> 1)).toDS()) + } - withClue("array of map") { - checkCodegenCache(() => Seq(Seq(Map("abc" -> 1))).toDS()) + withClue("array of map") { + checkCodegenCache(() => Seq(Seq(Map("abc" -> 1))).toDS()) + } } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/ExpressionsSchemaSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/ExpressionsSchemaSuite.scala index 8c0231fddf39f..0468ceb9f967c 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/ExpressionsSchemaSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/ExpressionsSchemaSuite.scala @@ -118,7 +118,8 @@ class ExpressionsSchemaSuite extends QueryTest with SharedSparkSession { // 
SET spark.sql.parser.escapedStringLiterals=true example.split(" > ").tail.filterNot(_.trim.startsWith("SET")).take(1).foreach { case _ if funcName == "from_avro" || funcName == "to_avro" || - funcName == "from_protobuf" || funcName == "to_protobuf" => + funcName == "schema_of_avro" || funcName == "from_protobuf" || + funcName == "to_protobuf" => // Skip running the example queries for the from_avro, to_avro, from_protobuf and // to_protobuf functions because these functions dynamically load the // AvroDataToCatalyst or CatalystDataToAvro classes which are not available in this diff --git a/sql/core/src/test/scala/org/apache/spark/sql/IntegratedUDFTestUtils.scala b/sql/core/src/test/scala/org/apache/spark/sql/IntegratedUDFTestUtils.scala index cdea4446d9461..22f55819d1d4c 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/IntegratedUDFTestUtils.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/IntegratedUDFTestUtils.scala @@ -31,10 +31,11 @@ import org.apache.spark.broadcast.Broadcast import org.apache.spark.sql.catalyst.analysis.FunctionRegistry import org.apache.spark.sql.catalyst.expressions.{Cast, Expression, ExprId, PythonUDF} import org.apache.spark.sql.catalyst.plans.SQLHelper +import org.apache.spark.sql.classic.ClassicConversions._ import org.apache.spark.sql.execution.datasources.v2.python.UserDefinedPythonDataSource import org.apache.spark.sql.execution.python.{UserDefinedPythonFunction, UserDefinedPythonTableFunction} import org.apache.spark.sql.expressions.SparkUserDefinedFunction -import org.apache.spark.sql.internal.ExpressionUtils.{column, expression} +import org.apache.spark.sql.internal.ExpressionUtils.expression import org.apache.spark.sql.internal.UserDefinedFunctionUtils.toScalaUDF import org.apache.spark.sql.types.{DataType, IntegerType, NullType, StringType, StructType, VariantType} import org.apache.spark.util.ArrayImplicits._ @@ -1592,7 +1593,7 @@ object IntegratedUDFTestUtils extends SQLHelper { Cast(toScalaUDF(udf, Cast(expr, 
StringType) :: Nil), rt) } - def apply(exprs: Column*): Column = builder(exprs.map(expression)) + def apply(exprs: Column*): Column = Column(builder(exprs.map(expression))) val prettyName: String = "Scala UDF" } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/JsonFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/JsonFunctionsSuite.scala index 84408d8e2495d..ea185b6b4901a 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/JsonFunctionsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/JsonFunctionsSuite.scala @@ -29,7 +29,6 @@ import org.apache.spark.sql.catalyst.expressions.Cast._ import org.apache.spark.sql.catalyst.expressions.Literal import org.apache.spark.sql.execution.WholeStageCodegenExec import org.apache.spark.sql.functions._ -import org.apache.spark.sql.internal.ExpressionUtils.column import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.test.SharedSparkSession import org.apache.spark.sql.types._ @@ -1394,7 +1393,7 @@ class JsonFunctionsSuite extends QueryTest with SharedSparkSession { val df = Seq(1).toDF("a") val schema = StructType(StructField("b", ObjectType(classOf[java.lang.Integer])) :: Nil) val row = InternalRow.fromSeq(Seq(Integer.valueOf(1))) - val structData = column(Literal.create(row, schema)) + val structData = Column(Literal.create(row, schema)) checkError( exception = intercept[AnalysisException] { df.select($"a").withColumn("c", to_json(structData)).collect() @@ -1456,4 +1455,28 @@ class JsonFunctionsSuite extends QueryTest with SharedSparkSession { assert(plan.isInstanceOf[WholeStageCodegenExec]) checkAnswer(df, Row(null)) } + + test("function json_tuple - field names foldable") { + withTempView("t") { + val json = """{"a":1, "b":2, "c":3}""" + val df = Seq((json, "a", "b", "c")).toDF("json", "c1", "c2", "c3") + df.createOrReplaceTempView("t") + + // Json and all field names are foldable. 
+ val df1 = sql(s"SELECT json_tuple('$json', 'a', 'b', 'c') from t") + checkAnswer(df1, Row("1", "2", "3")) + + // All field names are foldable. + val df2 = sql("SELECT json_tuple(json, 'a', 'b', 'c') from t") + checkAnswer(df2, Row("1", "2", "3")) + + // The field names some foldable, some non-foldable. + val df3 = sql("SELECT json_tuple(json, 'a', c2, 'c') from t") + checkAnswer(df3, Row("1", "2", "3")) + + // All field names are non-foldable. + val df4 = sql("SELECT json_tuple(json, c1, c2, c3) from t") + checkAnswer(df4, Row("1", "2", "3")) + } + } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/LateralColumnAliasSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/LateralColumnAliasSuite.scala index d7177e19a6177..3def42cd7ee55 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/LateralColumnAliasSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/LateralColumnAliasSuite.scala @@ -1365,4 +1365,41 @@ class LateralColumnAliasSuite extends LateralColumnAliasSuiteBase { // the states are cleared - a subsequent correct query should succeed sql("select 1 as a, a").queryExecution.assertAnalyzed() } + + test("SPARK-49349: Improve error message for LCA with Generate") { + checkError( + exception = intercept[AnalysisException] { + sql( + s""" + |SELECT + | explode(split(name , ',')) AS new_name, + | new_name like 'a%' + |FROM $testTable + |""".stripMargin) + }, + condition = "UNSUPPORTED_FEATURE.LATERAL_COLUMN_ALIAS_IN_GENERATOR", + sqlState = "0A000", + parameters = Map( + "lca" -> "`new_name`", + "generatorExpr" -> "\"unresolvedalias(lateralAliasReference(new_name) LIKE a%)\"")) + + checkError( + exception = intercept[AnalysisException] { + sql( + s""" + |SELECT + | explode_outer(from_json(name,'array>')) as newName, + | size(from_json(newName.values,'array')) + + | size(array(from_json(newName.values,'map'))) as size + |FROM $testTable + |""".stripMargin) + }, + condition = "UNSUPPORTED_FEATURE.LATERAL_COLUMN_ALIAS_IN_GENERATOR", + 
sqlState = "0A000", + parameters = Map( + "lca" -> "`newName.values`", + "generatorExpr" -> ("\"(size(from_json(lateralAliasReference(newName.values), " + + "array)) + size(array(from_json(lateralAliasReference(newName.values), " + + "map)))) AS size\""))) + } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/LogQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/LogQuerySuite.scala index 873337e7a4242..861b0bf0f3945 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/LogQuerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/LogQuerySuite.scala @@ -33,12 +33,18 @@ class LogQuerySuite extends QueryTest with SharedSparkSession with Logging { new File(pwd + "/target/LogQuerySuite.log") } + override def beforeAll(): Unit = { + super.beforeAll() + Logging.enableStructuredLogging() + } + override def afterAll(): Unit = { super.afterAll() // Clear the log file if (logFile.exists()) { logFile.delete() } + Logging.disableStructuredLogging() } private def createTempView(viewName: String): Unit = { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/ParametersSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/ParametersSuite.scala index 791bcc91d5094..bb1363f1c58c0 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/ParametersSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/ParametersSuite.scala @@ -758,4 +758,47 @@ class ParametersSuite extends QueryTest with SharedSparkSession with PlanTest { checkAnswer(spark.sql(query("?"), args = Array("tt1")), Row(1)) } } + + test("SPARK-50441: parameterized identifier referencing a CTE") { + def query(p: String): String = { + s""" + |WITH t1 AS (SELECT 1) + |SELECT * FROM IDENTIFIER($p)""".stripMargin + } + + checkAnswer(spark.sql(query(":cte"), args = Map("cte" -> "t1")), Row(1)) + checkAnswer(spark.sql(query("?"), args = Array("t1")), Row(1)) + } + + test("SPARK-50403: parameterized execute immediate") { + checkAnswer(spark.sql("execute immediate 'select ?' 
using ?", Array(1)), Row(1)) + checkAnswer(spark.sql("execute immediate 'select ?, ?' using ?, 2", Array(1)), Row(1, 2)) + checkError( + exception = intercept[AnalysisException] { + spark.sql("execute immediate 'select ?, ?' using 1", Array(2)) + }, + condition = "UNBOUND_SQL_PARAMETER", + parameters = Map("name" -> "_10"), + context = ExpectedContext("?", 10, 10)) + + checkAnswer(spark.sql("execute immediate 'select ?' using 1", Map("param1" -> "1")), Row(1)) + checkAnswer(spark.sql("execute immediate 'select :param1' using :param2 as param1", + Map("param2" -> 42)), Row(42)) + checkAnswer(spark.sql( + "execute immediate 'select :param1, :param2' using :param2 as param1, 43 as param2", + Map("param2" -> 42)), Row(42, 43)) + checkAnswer(spark.sql("execute immediate 'select :param' using 0 as param", + Map("param" -> 42)), Row(0)) + checkError( + exception = intercept[AnalysisException] { + spark.sql("execute immediate 'select :param1, :param2' using 1 as param1", + Map("param2" -> 2)) + }, + condition = "UNBOUND_SQL_PARAMETER", + parameters = Map("name" -> "param2"), + context = ExpectedContext(":param2", 16, 22)) + + checkAnswer(spark.sql("execute immediate 'select ?' using :param", Map("param" -> 2)), Row(2)) + checkAnswer(spark.sql("execute immediate 'select :param' using ? 
as param", Array(3)), Row(3)) + } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/QueryTest.scala b/sql/core/src/test/scala/org/apache/spark/sql/QueryTest.scala index 30180d48da71a..b59c83c23d3c3 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/QueryTest.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/QueryTest.scala @@ -27,7 +27,7 @@ import org.scalatest.Assertions import org.apache.spark.sql.catalyst.ExtendedAnalysisException import org.apache.spark.sql.catalyst.plans._ import org.apache.spark.sql.catalyst.util._ -import org.apache.spark.sql.execution.{QueryExecution, SparkPlan, SQLExecution} +import org.apache.spark.sql.execution.{QueryExecution, SQLExecution} import org.apache.spark.sql.execution.columnar.InMemoryRelation import org.apache.spark.sql.util.QueryExecutionListener import org.apache.spark.storage.StorageLevel @@ -449,12 +449,12 @@ object QueryTest extends Assertions { } } - def withPhysicalPlansCaptured(spark: SparkSession, thunk: => Unit): Seq[SparkPlan] = { - var capturedPlans = Seq.empty[SparkPlan] + def withQueryExecutionsCaptured(spark: SparkSession)(thunk: => Unit): Seq[QueryExecution] = { + var capturedQueryExecutions = Seq.empty[QueryExecution] val listener = new QueryExecutionListener { override def onSuccess(funcName: String, qe: QueryExecution, durationNs: Long): Unit = { - capturedPlans = capturedPlans :+ qe.executedPlan + capturedQueryExecutions = capturedQueryExecutions :+ qe } override def onFailure(funcName: String, qe: QueryExecution, exception: Exception): Unit = {} } @@ -468,7 +468,7 @@ object QueryTest extends Assertions { spark.listenerManager.unregister(listener) } - capturedPlans + capturedQueryExecutions } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/RuntimeConfigSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/RuntimeConfigSuite.scala index c80787c40c487..ce3ac9b8834bf 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/RuntimeConfigSuite.scala +++ 
b/sql/core/src/test/scala/org/apache/spark/sql/RuntimeConfigSuite.scala @@ -108,4 +108,26 @@ class RuntimeConfigSuite extends SparkFunSuite { // this set should not fail conf.set(DEFAULT_PARALLELISM.key, "1") } + + test("config entry") { + val conf = newConf() + + val entry = SQLConf.FILES_MAX_PARTITION_NUM + assert(conf.get(entry.key) === null) + assert(conf.get(entry).isEmpty) + assert(conf.get(entry, Option(55)) === Option(55)) + conf.set(entry, Option(33)) + assert(conf.get(entry.key) === "33") + assert(conf.get(entry) === Option(33)) + assert(conf.get(entry, Option(55)) === Option(33)) + + val entryWithDefault = SQLConf.RUNTIME_FILTER_NUMBER_THRESHOLD + assert(conf.get(entryWithDefault.key) === "10") + assert(conf.get(entryWithDefault) === 10) + assert(conf.get(entryWithDefault, 11) === 11) + conf.set(entryWithDefault, 12) + assert(conf.get(entryWithDefault.key) === "12") + assert(conf.get(entryWithDefault) === 12) + assert(conf.get(entryWithDefault, 11) === 12) + } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SQLContextSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SQLContextSuite.scala index d81768c0077eb..ea0d405d2a8f7 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/SQLContextSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/SQLContextSuite.scala @@ -99,6 +99,29 @@ class SQLContextSuite extends SparkFunSuite with SharedSparkContext { assert(sqlContext.tables().filter("tableName = 'listtablessuitetable'").count() === 0) } + test("get tables from a database") { + val sqlContext = SQLContext.getOrCreate(sc) + + try { + sqlContext.sql("CREATE DATABASE IF NOT EXISTS temp_db_1") + sqlContext.sql("CREATE TABLE temp_db_1.temp_table_1 (key int)") + sqlContext.sql("INSERT INTO temp_db_1.temp_table_1 VALUES (1)") + + assert(sqlContext.tableNames("temp_db_1").sameElements(Array("temp_table_1"))) + + assert(sqlContext.tables("temp_db_1").collect().toSeq == + Row("temp_db_1", "temp_table_1", false) :: Nil) + + 
assert(sqlContext.tables().collect().toSeq == Nil) + sqlContext.sql("USE temp_db_1") + assert(sqlContext.tableNames().sameElements(Array("temp_table_1"))) + assert(sqlContext.tables().collect().toSeq == Row("temp_db_1", "temp_table_1", false) :: Nil) + } finally { + sqlContext.sql("USE default") + sqlContext.sql("DROP DATABASE IF EXISTS temp_db_1 CASCADE") + } + } + test("getting all tables with a database name has no impact on returned table names") { val sqlContext = SQLContext.getOrCreate(sc) val df = sqlContext.range(10) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SQLQueryTestHelper.scala b/sql/core/src/test/scala/org/apache/spark/sql/SQLQueryTestHelper.scala index 7daf2c6b1b58b..04f274e4af592 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/SQLQueryTestHelper.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/SQLQueryTestHelper.scala @@ -60,7 +60,18 @@ trait SQLQueryTestHelper extends Logging { .replaceAll("CTERelationDef \\d+,", s"CTERelationDef xxxx,") .replaceAll("CTERelationRef \\d+,", s"CTERelationRef xxxx,") .replaceAll("@\\w*,", s"@xxxxxxxx,") - .replaceAll("\\*\\(\\d+\\) ", "*") // remove the WholeStageCodegen codegenStageIds + .replaceAll("\\*\\(\\d+\\) ", "*") + .replaceAll( + s""""location":.*?$clsName/""", + s""""location": "$notIncludedMsg/{warehouse_dir}/""") + .replaceAll(s""""created_by":".*?"""", s""""created_by $notIncludedMsg":"None"""") + .replaceAll(s""""created_time":".*?"""", s""""created_time $notIncludedMsg":"None"""") + .replaceAll(s""""last_access":".*?"""", s""""last_access $notIncludedMsg":"None"""") + .replaceAll(s""""owner":".*?"""", s""""owner $notIncludedMsg":"None"""") + .replaceAll(s""""partition_statistics":"\\d+"""", + s""""partition_statistics $notIncludedMsg":"None"""") + .replaceAll("cterelationdef \\d+,", "cterelationdef xxxx,") + .replaceAll("cterelationref \\d+,", "cterelationref xxxx,") } /** diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SQLQueryTestSuite.scala 
b/sql/core/src/test/scala/org/apache/spark/sql/SQLQueryTestSuite.scala index 5c56377f21c20..575a4ae69d1a9 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/SQLQueryTestSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/SQLQueryTestSuite.scala @@ -132,7 +132,7 @@ import org.apache.spark.util.Utils // scalastyle:on line.size.limit @ExtendedSQLTest class SQLQueryTestSuite extends QueryTest with SharedSparkSession with SQLHelper - with SQLQueryTestHelper { + with SQLQueryTestHelper with TPCDSSchema { import IntegratedUDFTestUtils._ @@ -165,13 +165,17 @@ class SQLQueryTestSuite extends QueryTest with SharedSparkSession with SQLHelper protected def ignoreList: Set[String] = Set( "ignored.sql" // Do NOT remove this one. It is here to test the ignore functionality. ) ++ otherIgnoreList + /** List of test cases that require TPCDS table schemas to be loaded. */ + private def requireTPCDSCases: Seq[String] = Seq("pipe-operators.sql") + /** List of TPCDS table names and schemas to load from the [[TPCDSSchema]] base class. */ + private val tpcDSTableNamesToSchemas: Map[String, String] = tableColumns // Create all the test cases. listTestCases.foreach(createScalaTestCase) protected def createScalaTestCase(testCase: TestCase): Unit = { if (ignoreList.exists(t => - testCase.name.toLowerCase(Locale.ROOT).contains(t.toLowerCase(Locale.ROOT)))) { + testCase.name.toLowerCase(Locale.ROOT).contains(t.toLowerCase(Locale.ROOT)))) { // Create a test case to ignore this case. ignore(testCase.name) { /* Do nothing */ } } else testCase match { @@ -322,6 +326,15 @@ class SQLQueryTestSuite extends QueryTest with SharedSparkSession with SQLHelper setOperations.foreach(localSparkSession.sql) } + // Load TPCDS table schemas for the test case if required. 
+ val lowercaseTestCase = testCase.name.toLowerCase(Locale.ROOT) + if (requireTPCDSCases.contains(lowercaseTestCase)) { + tpcDSTableNamesToSchemas.foreach { case (name: String, schema: String) => + localSparkSession.sql(s"DROP TABLE IF EXISTS $name") + localSparkSession.sql(s"CREATE TABLE `$name` ($schema) USING parquet") + } + } + // Run the SQL queries preparing them for comparison. val outputs: Seq[QueryTestOutput] = queries.map { sql => testCase match { @@ -348,6 +361,13 @@ class SQLQueryTestSuite extends QueryTest with SharedSparkSession with SQLHelper } } + // Drop TPCDS tables after the test case if required. + if (requireTPCDSCases.contains(lowercaseTestCase)) { + tpcDSTableNamesToSchemas.foreach { case (name: String, schema: String) => + localSparkSession.sql(s"DROP TABLE IF EXISTS $name") + } + } + if (regenerateGoldenFiles) { // Again, we are explicitly not using multi-line string due to stripMargin removing "|". val goldenOutput = { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/TPCDSCollationQueryTestSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/TPCDSCollationQueryTestSuite.scala index 46a24acb475c4..43e6111fc99ca 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/TPCDSCollationQueryTestSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/TPCDSCollationQueryTestSuite.scala @@ -67,6 +67,7 @@ class TPCDSCollationQueryTestSuite extends QueryTest with TPCDSBase with SQLQuer // To make output results deterministic override protected def sparkConf: SparkConf = super.sparkConf .set(SQLConf.SHUFFLE_PARTITIONS.key, "1") + .remove("spark.hadoop.fs.file.impl") protected override def createSparkSession: TestSparkSession = { new TestSparkSession(new SparkContext("local[1]", this.getClass.getSimpleName, sparkConf)) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/TPCDSQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/TPCDSQuerySuite.scala index ffd15eb46a48e..e8b36d8b130cf 100644 --- 
a/sql/core/src/test/scala/org/apache/spark/sql/TPCDSQuerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/TPCDSQuerySuite.scala @@ -33,8 +33,7 @@ class TPCDSQuerySuite extends BenchmarkQueryTest with TPCDSBase { // Disable read-side char padding so that the generated code is less than 8000. super.sparkConf.set(SQLConf.READ_SIDE_CHAR_PADDING, false) - // q72 is skipped due to GitHub Actions' memory limit. - tpcdsQueries.filterNot(sys.env.contains("GITHUB_ACTIONS") && _ == "q72").foreach { name => + tpcdsQueries.foreach { name => val queryString = resourceToString(s"tpcds/$name.sql", classLoader = Thread.currentThread().getContextClassLoader) test(name) { @@ -44,8 +43,7 @@ class TPCDSQuerySuite extends BenchmarkQueryTest with TPCDSBase { } } - // q72 is skipped due to GitHub Actions' memory limit. - tpcdsQueriesV2_7_0.filterNot(sys.env.contains("GITHUB_ACTIONS") && _ == "q72").foreach { name => + tpcdsQueriesV2_7_0.foreach { name => val queryString = resourceToString(s"tpcds-v2.7.0/$name.sql", classLoader = Thread.currentThread().getContextClassLoader) test(s"$name-v2.7") { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/TPCDSQueryTestSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/TPCDSQueryTestSuite.scala index bde6155529872..c1246a167b8cc 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/TPCDSQueryTestSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/TPCDSQueryTestSuite.scala @@ -62,6 +62,7 @@ class TPCDSQueryTestSuite extends QueryTest with TPCDSBase with SQLQueryTestHelp // To make output results deterministic override protected def sparkConf: SparkConf = super.sparkConf .set(SQLConf.SHUFFLE_PARTITIONS.key, "1") + .remove("spark.hadoop.fs.file.impl") protected override def createSparkSession: TestSparkSession = { new TestSparkSession(new SparkContext("local[1]", this.getClass.getSimpleName, sparkConf)) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/TypedImperativeAggregateSuite.scala 
b/sql/core/src/test/scala/org/apache/spark/sql/TypedImperativeAggregateSuite.scala index 624bae70ce09c..662eead137c40 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/TypedImperativeAggregateSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/TypedImperativeAggregateSuite.scala @@ -27,7 +27,6 @@ import org.apache.spark.sql.catalyst.trees.UnaryLike import org.apache.spark.sql.execution.aggregate.HashAggregateExec import org.apache.spark.sql.expressions.Window import org.apache.spark.sql.functions._ -import org.apache.spark.sql.internal.ExpressionUtils.{column => toColumn, expression} import org.apache.spark.sql.test.SharedSparkSession import org.apache.spark.sql.types._ @@ -89,7 +88,7 @@ class TypedImperativeAggregateSuite extends QueryTest with SharedSparkSession { test("dataframe aggregate with object aggregate buffer, should not use HashAggregate") { val df = data.toDF("a", "b") - val max = TypedMax($"a") + val max = Column(TypedMax($"a".expr)) // Always uses SortAggregateExec val sparkPlan = df.select(max).queryExecution.sparkPlan @@ -212,9 +211,10 @@ class TypedImperativeAggregateSuite extends QueryTest with SharedSparkSession { checkAnswer(query, expected) } - private def typedMax(column: Column): Column = TypedMax(column) + private def typedMax(column: Column): Column = Column(TypedMax(column.expr)) - private def nullableTypedMax(column: Column): Column = TypedMax(column, nullable = true) + private def nullableTypedMax(column: Column): Column = + Column(TypedMax(column.expr, nullable = true)) } object TypedImperativeAggregateSuite { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/VariantShreddingSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/VariantShreddingSuite.scala new file mode 100644 index 0000000000000..3443028ba45b0 --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/VariantShreddingSuite.scala @@ -0,0 +1,383 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * 
contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql + +import java.io.File +import java.sql.{Date, Timestamp} +import java.time.LocalDateTime + +import org.apache.spark.SparkThrowable +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.catalyst.expressions.variant.VariantExpressionEvalUtils +import org.apache.spark.sql.catalyst.util.DateTimeConstants._ +import org.apache.spark.sql.catalyst.util.DateTimeUtils._ +import org.apache.spark.sql.execution.datasources.parquet.{ParquetTest, SparkShreddingUtils} +import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.test.SharedSparkSession +import org.apache.spark.sql.types._ +import org.apache.spark.types.variant._ +import org.apache.spark.unsafe.types.{UTF8String, VariantVal} + +class VariantShreddingSuite extends QueryTest with SharedSparkSession with ParquetTest { + def parseJson(s: String): VariantVal = { + val v = VariantBuilder.parseJson(s, false) + new VariantVal(v.getValue, v.getMetadata) + } + + // Make a variant value binary by parsing a JSON string. + def value(s: String): Array[Byte] = VariantBuilder.parseJson(s, false).getValue + + // Make a variant metadata binary that includes a set of keys. 
+ def metadata(keys: Seq[String]): Array[Byte] = { + val builder = new VariantBuilder(false) + keys.foreach(builder.addKey) + builder.result().getMetadata + } + + // Build a shredded variant value binary. Its IDs refer to the metadata built from `metadataKeys`, + // which can include more keys than the JSON string contains. + def shreddedValue(s: String, metadataKeys: Seq[String]): Array[Byte] = { + val builder = new VariantBuilder(false) + metadataKeys.foreach(builder.addKey) + builder.appendVariant(VariantBuilder.parseJson(s, false)) + builder.result().getValue + } + + // Given an expected schema of a Variant value, return a write schema with a single column `v` + // with the corresponding shredding schema. + def writeSchema(schema: DataType): StructType = + StructType(Array(StructField("v", SparkShreddingUtils.variantShreddingSchema(schema)))) + + def withPushConfigs(pushConfigs: Seq[Boolean] = Seq(true, false))(fn: => Unit): Unit = { + for (push <- pushConfigs) { + withSQLConf(SQLConf.PUSH_VARIANT_INTO_SCAN.key -> push.toString) { + fn + } + } + } + + def isPushEnabled: Boolean = SQLConf.get.getConf(SQLConf.PUSH_VARIANT_INTO_SCAN) + + def testWithTempPath(name: String)(block: File => Unit): Unit = test(name) { + withPushConfigs() { + withTempPath { path => + block(path) + } + } + } + + def writeRows(path: File, schema: StructType, rows: Row*): Unit = + spark.createDataFrame(spark.sparkContext.parallelize(rows.map(Row(_)), numSlices = 1), schema) + .write.mode("overwrite").parquet(path.getAbsolutePath) + + def writeRows(path: File, schema: String, rows: Row*): Unit = + writeRows(path, StructType.fromDDL(schema), rows: _*) + + def read(path: File): DataFrame = + spark.read.schema("v variant").parquet(path.getAbsolutePath) + + def checkExpr(path: File, expr: String, expected: Any*): Unit = withAllParquetReaders { + checkAnswer(read(path).selectExpr(expr), expected.map(Row(_))) + } + + def checkException(path: File, expr: String, msg: String): Unit = 
withAllParquetReaders { + val ex = intercept[Exception with SparkThrowable] { + read(path).selectExpr(expr).collect() + } + // When reading with the parquet-mr reader, the expected message can be nested in + // `ex.getCause.getCause`. + assert(ex.getMessage.contains(msg) || ex.getCause.getMessage.contains(msg) + || ex.getCause.getCause.getMessage.contains(msg)) + } + + testWithTempPath("scalar types rebuild") { path => + val scalarTypes = Array( + BooleanType, ByteType, ShortType, IntegerType, LongType, FloatType, DoubleType, + TimestampType, TimestampNTZType, DateType, + StringType, BinaryType, + DecimalType(9, 3), DecimalType(18, 6), DecimalType(22, 9)) + val schema = StructType(scalarTypes.zipWithIndex.map { case (t, i) => + StructField(i.toString, t) + }) + + val values = Seq[Any]( + true, 1.toByte, 2.toShort, 3, 4L, 5.5F, 6.6, + new Timestamp(7), LocalDateTime.of(1, 1, 1, 0, 0, 8, 0), new Date(9), + "str10", Array[Byte](11), + Decimal("12.12"), Decimal("13.13"), Decimal("14.14")).map(Row(null, _)) + val row = Row(metadata(scalarTypes.indices.map(_.toString)), null, Row.fromSeq(values)) + + writeRows(path, writeSchema(schema), row) + for (tz <- Seq("Etc/UTC", "America/Los_Angeles")) { + withSQLConf(SQLConf.SESSION_LOCAL_TIMEZONE.key -> tz) { + val timestamp = if (tz == "Etc/UTC") { + "1970-01-01 00:00:00.007+00:00" + } else { + "1969-12-31 16:00:00.007-08:00" + } + checkExpr(path, "to_json(v)", + """{"0":true,"1":1,"10":"str10","11":"Cw==","12":12.12,"13":13.13,"14":14.14,""" + + s""""2":2,"3":3,"4":4,"5":5.5,"6":6.6,"7":"$timestamp",""" + + """"8":"0001-01-01 00:00:08","9":"1969-12-31"}""") + checkExpr(path, "variant_get(v, '$.0', 'int')", 1) + checkExpr(path, "variant_get(v, '$.2', 'boolean')", true) + checkExpr(path, "variant_get(v, '$.6', 'float')", 6.6F) + checkExpr(path, "variant_get(v, '$.11', 'string')", new String(Array[Byte](11))) + checkExpr(path, "variant_get(v, '$.14', 'decimal(9, 1)')", BigDecimal("14.1")) + } + } + } + + testWithTempPath("object 
rebuild") { path => + writeRows(path, writeSchema(StructType.fromDDL("b int, d int")), + Row(metadata(Seq("b", "d")), null, Row(Row(null, 1), Row(null, null))), + Row(metadata(Seq("b", "d")), null, Row(Row(null, 1), Row(value("null"), null))), + Row(metadata(Seq("a", "b", "c", "d")), + shreddedValue("""{"a": 1, "c": 3}""", Seq("a", "b", "c", "d")), + Row(Row(null, 2), Row(value("4"), null))), + Row(metadata(Nil), value("null"), null), + null) + checkExpr(path, "to_json(v)", """{"b":1}""", """{"b":1,"d":null}""", + """{"a":1,"b":2,"c":3,"d":4}""", "null", null) + checkExpr(path, "variant_get(v, '$.b', 'string')", "1", "1", "2", null, null) + checkExpr(path, "variant_get(v, '$.d', 'string')", null, null, "4", null, null) + } + + testWithTempPath("array rebuild") { path => + writeRows(path, writeSchema(ArrayType(IntegerType)), + Row(metadata(Nil), null, Array(Row(null, 1), Row(null, 2), Row(value("3"), null))), + Row(metadata(Seq("a", "b")), null, Array( + Row(shreddedValue("""{"a": 1}""", Seq("a", "b")), null), + Row(shreddedValue("""{"b": 2}""", Seq("a", "b")), null))), + Row(metadata(Seq("a", "b")), value("""{"a": 1, "b": 2}"""), null)) + checkExpr(path, "to_json(v)", """[1,2,3]""", """[{"a":1},{"b":2}]""", """{"a":1,"b":2}""") + checkExpr(path, "variant_get(v, '$[2]', 'int')", 3, null, null) + checkExpr(path, "variant_get(v, '$[1].b', 'int')", null, 2, null) + checkExpr(path, "variant_get(v, '$.a', 'long')", null, null, 1L) + } + + testWithTempPath("malformed input") { path => + // Top-level variant must not be missing. + writeRows(path, writeSchema(IntegerType), Row(metadata(Nil), null, null)) + checkException(path, "v", "MALFORMED_VARIANT") + + // Array-element variant must not be missing. + writeRows(path, writeSchema(ArrayType(IntegerType)), + Row(metadata(Nil), null, Array(Row(null, null)))) + checkException(path, "v", "MALFORMED_VARIANT") + checkException(path, "variant_get(v, '$[0]')", "MALFORMED_VARIANT") + + // Shredded field must not be null. 
+ // Construct the schema manually, because SparkShreddingUtils.variantShreddingSchema will make + // `a` non-nullable, which would prevent us from writing the file. + val schema = StructType(Seq(StructField("v", StructType(Seq( + StructField("metadata", BinaryType), + StructField("value", BinaryType), + StructField("typed_value", StructType(Seq( + StructField("a", StructType(Seq( + StructField("value", BinaryType), + StructField("typed_value", BinaryType)))))))))))) + writeRows(path, schema, Row(metadata(Seq("a")), null, Row(null))) + checkException(path, "v", "MALFORMED_VARIANT") + checkException(path, "variant_get(v, '$.a')", "MALFORMED_VARIANT") + + // `value` must not contain any shredded field. + writeRows(path, writeSchema(StructType.fromDDL("a int")), + Row(metadata(Seq("a")), value("""{"a": 1}"""), Row(Row(null, null)))) + checkException(path, "v", "MALFORMED_VARIANT") + checkException(path, "cast(v as map)", "MALFORMED_VARIANT") + if (isPushEnabled) { + checkExpr(path, "cast(v as struct)", Row(null)) + checkExpr(path, "variant_get(v, '$.a', 'int')", null) + } else { + checkException(path, "cast(v as struct)", "MALFORMED_VARIANT") + checkException(path, "variant_get(v, '$.a', 'int')", "MALFORMED_VARIANT") + } + + // Scalar reader reads from `typed_value` if both `value` and `typed_value` are not null. + // Cast from `value` succeeds, cast from `typed_value` fails. + writeRows(path, "v struct", + Row(metadata(Nil), value("1"), "invalid")) + checkException(path, "cast(v as int)", "INVALID_VARIANT_CAST") + checkExpr(path, "try_cast(v as int)", null) + + // Cast from `value` fails, cast from `typed_value` succeeds. 
+ writeRows(path, "v struct", + Row(metadata(Nil), value("\"invalid\""), "1")) + checkExpr(path, "cast(v as int)", 1) + checkExpr(path, "try_cast(v as int)", 1) + } + + testWithTempPath("extract from shredded object") { path => + val keys1 = Seq("a", "b", "c", "d") + val keys2 = Seq("a", "b", "c", "e", "f") + writeRows(path, "v struct, b struct," + + "c struct>>", + // {"a":1,"b":"2","c":3.3,"d":4.4}, d is in the left over value. + Row(metadata(keys1), shreddedValue("""{"d": 4.4}""", keys1), + Row(Row(null, 1), Row(value("\"2\"")), Row(Decimal("3.3")))), + // {"a":5.4,"b":-6,"e":{"f":[true]}}, e is in the left over value. + Row(metadata(keys2), shreddedValue("""{"e": {"f": [true]}}""", keys2), + Row(Row(value("5.4"), null), Row(value("-6")), Row(null))), + // [{"a":1}], the unshredded array at the top-level is put into `value` as a whole. + Row(metadata(Seq("a")), value("""[{"a": 1}]"""), null)) + + checkAnswer(read(path).selectExpr("variant_get(v, '$.a', 'int')", + "variant_get(v, '$.b', 'long')", "variant_get(v, '$.c', 'double')", + "variant_get(v, '$.d', 'decimal(9, 4)')"), + Seq(Row(1, 2L, 3.3, BigDecimal("4.4")), Row(5, -6L, null, null), Row(null, null, null, null))) + checkExpr(path, "variant_get(v, '$.e.f[0]', 'boolean')", null, true, null) + checkExpr(path, "variant_get(v, '$[0].a', 'boolean')", null, null, true) + checkExpr(path, "try_cast(v as struct)", + Row(1.0F, null), Row(5.4F, parseJson("""{"f": [true]}""")), null) + + // String "2" cannot be cast into boolean. + checkException(path, "variant_get(v, '$.b', 'boolean')", "INVALID_VARIANT_CAST") + // Decimal cannot be cast into date. + checkException(path, "variant_get(v, '$.c', 'date')", "INVALID_VARIANT_CAST") + // The value of `c` doesn't fit into `decimal(1, 1)`. + checkException(path, "variant_get(v, '$.c', 'decimal(1, 1)')", "INVALID_VARIANT_CAST") + checkExpr(path, "try_variant_get(v, '$.b', 'boolean')", null, true, null) + // Scalar cannot be cast into struct. 
+ checkException(path, "variant_get(v, '$.a', 'struct')", "INVALID_VARIANT_CAST") + checkExpr(path, "try_variant_get(v, '$.a', 'struct')", null, null, null) + + checkExpr(path, "try_cast(v as map)", + Map("a" -> 1.0, "b" -> 2.0, "c" -> 3.3, "d" -> 4.4), + Map("a" -> 5.4, "b" -> -6.0, "e" -> null), null) + checkExpr(path, "try_cast(v as array)", null, null, Seq("""{"a":1}""")) + + val strings = Seq("""{"a":1,"b":"2","c":3.3,"d":4.4}""", + """{"a":5.4,"b":-6,"e":{"f":[true]}}""", """[{"a":1}]""") + checkExpr(path, "cast(v as string)", strings: _*) + checkExpr(path, "v", + VariantExpressionEvalUtils.castToVariant( + InternalRow(1, UTF8String.fromString("2"), Decimal("3.3000000000"), Decimal("4.4")), + StructType.fromDDL("a int, b string, c decimal(20, 10), d decimal(2, 1)") + ), + parseJson(strings(1)), + parseJson(strings(2)) + ) + } + + testWithTempPath("extract from shredded array") { path => + val keys = Seq("a", "b") + writeRows(path, "v struct>>>>", + // [{"a":"2000-01-01"},{"a":"1000-01-01","b":[7]}], b is in the left over value. + Row(metadata(keys), null, Array( + Row(null, Row(Row(null, "2000-01-01"))), + Row(shreddedValue("""{"b": [7]}""", keys), Row(Row(null, "1000-01-01"))))), + // [null,{"a":null},{"a":"null"},{}] + Row(metadata(keys), null, Array( + Row(value("null"), null), + Row(null, Row(Row(value("null"), null))), + Row(null, Row(Row(null, "null"))), + Row(null, Row(Row(null, null)))))) + + val date1 = Date.valueOf("2000-01-01") + val date2 = Date.valueOf("1000-01-01") + checkExpr(path, "variant_get(v, '$[0].a', 'date')", date1, null) + // try_cast succeeds. + checkExpr(path, "try_variant_get(v, '$[1].a', 'date')", date2, null) + // The first array returns null because of out-of-bound index. + // The second array returns "null". + checkExpr(path, "variant_get(v, '$[2].a', 'string')", null, "null") + // Return null because of invalid cast. 
+ checkExpr(path, "try_variant_get(v, '$[1].a', 'int')", null, null) + + checkExpr(path, "variant_get(v, '$[0].b[0]', 'int')", null, null) + checkExpr(path, "variant_get(v, '$[1].b[0]', 'int')", 7, null) + // Validate timestamp-related casts uses the session time zone correctly. + Seq("Etc/UTC", "America/Los_Angeles").foreach { tz => + withSQLConf(SQLConf.SESSION_LOCAL_TIMEZONE.key -> tz) { + val expected = sql("select timestamp'1000-01-01', timestamp_ntz'1000-01-01'").head() + checkAnswer(read(path).selectExpr("variant_get(v, '$[1].a', 'timestamp')", + "variant_get(v, '$[1].a', 'timestamp_ntz')"), Seq(expected, Row(null, null))) + } + } + checkException(path, "variant_get(v, '$[0]', 'int')", "INVALID_VARIANT_CAST") + // An out-of-bound array access produces null. It never causes an invalid cast. + checkExpr(path, "variant_get(v, '$[4]', 'int')", null, null) + + checkExpr(path, "cast(v as array>>)", + Seq(Row("2000-01-01", null), Row("1000-01-01", Seq(7))), + Seq(null, Row(null, null), Row("null", null), Row(null, null))) + checkExpr(path, "cast(v as array>)", + Seq(Map("a" -> "2000-01-01"), Map("a" -> "1000-01-01", "b" -> "[7]")), + Seq(null, Map("a" -> null), Map("a" -> "null"), Map())) + checkExpr(path, "try_cast(v as array>)", + Seq(Map("a" -> date1), Map("a" -> date2, "b" -> null)), + Seq(null, Map("a" -> null), Map("a" -> null), Map())) + + val strings = Seq("""[{"a":"2000-01-01"},{"a":"1000-01-01","b":[7]}]""", + """[null,{"a":null},{"a":"null"},{}]""") + checkExpr(path, "cast(v as string)", strings: _*) + checkExpr(path, "v", strings.map(parseJson): _*) + } + + testWithTempPath("missing fields") { path => + writeRows(path, "v struct, b struct>>", + Row(metadata(Nil), Row(Row(null, null), Row(null))), + Row(metadata(Nil), Row(Row(value("null"), null), Row(null))), + Row(metadata(Nil), Row(Row(null, 1), Row(null))), + Row(metadata(Nil), Row(Row(null, null), Row(2))), + Row(metadata(Nil), Row(Row(value("null"), null), Row(2))), + Row(metadata(Nil), 
Row(Row(null, 3), Row(4)))) + + val strings = Seq("{}", """{"a":null}""", """{"a":1}""", """{"b":2}""", """{"a":null,"b":2}""", + """{"a":3,"b":4}""") + checkExpr(path, "cast(v as string)", strings: _*) + checkExpr(path, "v", strings.map(parseJson): _*) + + checkExpr(path, "variant_get(v, '$.a', 'string')", null, null, "1", null, null, "3") + checkExpr(path, "variant_get(v, '$.a')", null, parseJson("null"), parseJson("1"), null, + parseJson("null"), parseJson("3")) + } + + testWithTempPath("custom casts") { path => + writeRows(path, writeSchema(LongType), + Row(metadata(Nil), null, Long.MaxValue / MICROS_PER_SECOND + 1), + Row(metadata(Nil), null, Long.MaxValue / MICROS_PER_SECOND)) + + // long -> timestamp + checkException(path, "cast(v as timestamp)", "INVALID_VARIANT_CAST") + checkExpr(path, "try_cast(v as timestamp)", + null, toJavaTimestamp(Long.MaxValue / MICROS_PER_SECOND * MICROS_PER_SECOND)) + + writeRows(path, writeSchema(DecimalType(38, 19)), + Row(metadata(Nil), null, Decimal("1E18")), + Row(metadata(Nil), null, Decimal("100")), + Row(metadata(Nil), null, Decimal("10")), + Row(metadata(Nil), null, Decimal("1")), + Row(metadata(Nil), null, Decimal("0")), + Row(metadata(Nil), null, Decimal("0.1")), + Row(metadata(Nil), null, Decimal("0.01")), + Row(metadata(Nil), null, Decimal("1E-18"))) + + checkException(path, "cast(v as timestamp)", "INVALID_VARIANT_CAST") + // decimal -> timestamp + checkExpr(path, "try_cast(v as timestamp)", + (null +: Seq(100000000, 10000000, 1000000, 0, 100000, 10000, 0).map(toJavaTimestamp(_))): _*) + // decimal -> string + checkExpr(path, "cast(v as string)", + "1000000000000000000", "100", "10", "1", "0", "0.1", "0.01", "0.000000000000000001") + } +} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/VariantSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/VariantSuite.scala index 5d59a3e0f8256..09b29b668b134 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/VariantSuite.scala +++ 
b/sql/core/src/test/scala/org/apache/spark/sql/VariantSuite.scala @@ -33,6 +33,7 @@ import org.apache.spark.sql.functions._ import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.test.SharedSparkSession import org.apache.spark.sql.types._ +import org.apache.spark.types.variant.VariantUtil._ import org.apache.spark.unsafe.types.{UTF8String, VariantVal} import org.apache.spark.util.ArrayImplicits._ @@ -117,7 +118,8 @@ class VariantSuite extends QueryTest with SharedSparkSession with ExpressionEval rand.nextBytes(value) val metadata = new Array[Byte](rand.nextInt(50)) rand.nextBytes(metadata) - new VariantVal(value, metadata) + // Generate a valid metadata, otherwise the shredded reader will fail. + new VariantVal(value, Array[Byte](VERSION, 0, 0) ++ metadata) } } @@ -151,7 +153,8 @@ class VariantSuite extends QueryTest with SharedSparkSession with ExpressionEval val metadata = new Array[Byte](rand.nextInt(50)) rand.nextBytes(metadata) val numElements = 3 // rand.nextInt(10) - Seq.fill(numElements)(new VariantVal(value, metadata)) + // Generate a valid metadata, otherwise the shredded reader will fail. 
+ Seq.fill(numElements)(new VariantVal(value, Array[Byte](VERSION, 0, 0) ++ metadata)) } } @@ -299,7 +302,9 @@ class VariantSuite extends QueryTest with SharedSparkSession with ExpressionEval df.write.parquet(file) val schema = StructType(Seq(StructField("v", VariantType))) val result = spark.read.schema(schema).parquet(file).selectExpr("to_json(v)") - val e = intercept[org.apache.spark.SparkException](result.collect()) + val e = withSQLConf(SQLConf.VARIANT_ALLOW_READING_SHREDDED.key -> "false") { + intercept[org.apache.spark.SparkException](result.collect()) + } checkError( exception = e.getCause.asInstanceOf[AnalysisException], condition = condition, @@ -429,26 +434,26 @@ class VariantSuite extends QueryTest with SharedSparkSession with ExpressionEval checkError( exception = intercept[AnalysisException] { - spark.sql("select parse_json('') order by 1") + spark.sql("select parse_json('') v order by 1") }, condition = "DATATYPE_MISMATCH.INVALID_ORDERING_TYPE", parameters = Map( "functionName" -> "`sortorder`", "dataType" -> "\"VARIANT\"", - "sqlExpr" -> "\"parse_json() ASC NULLS FIRST\""), - context = ExpectedContext(fragment = "order by 1", start = 22, stop = 31) + "sqlExpr" -> "\"v ASC NULLS FIRST\""), + context = ExpectedContext(fragment = "order by 1", start = 24, stop = 33) ) checkError( exception = intercept[AnalysisException] { - spark.sql("select parse_json('') sort by 1") + spark.sql("select parse_json('') v sort by 1") }, condition = "DATATYPE_MISMATCH.INVALID_ORDERING_TYPE", parameters = Map( "functionName" -> "`sortorder`", "dataType" -> "\"VARIANT\"", - "sqlExpr" -> "\"parse_json() ASC NULLS FIRST\""), - context = ExpectedContext(fragment = "sort by 1", start = 22, stop = 30) + "sqlExpr" -> "\"v ASC NULLS FIRST\""), + context = ExpectedContext(fragment = "sort by 1", start = 24, stop = 32) ) checkError( diff --git a/sql/core/src/test/scala/org/apache/spark/sql/VariantWriteShreddingSuite.scala 
b/sql/core/src/test/scala/org/apache/spark/sql/VariantWriteShreddingSuite.scala index ed66ddb1f0f44..d31bf109af6c7 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/VariantWriteShreddingSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/VariantWriteShreddingSuite.scala @@ -67,6 +67,36 @@ class VariantWriteShreddingSuite extends SparkFunSuite with ExpressionEvalHelper private val emptyMetadata: Array[Byte] = parseJson("null").getMetadata + test("variantShreddingSchema") { + // Validate the schema produced by SparkShreddingUtils.variantShreddingSchema for a few simple + // cases. + // metadata is always non-nullable. + assert(SparkShreddingUtils.variantShreddingSchema(IntegerType) == + StructType(Seq( + StructField("metadata", BinaryType, nullable = false), + StructField("value", BinaryType, nullable = true), + StructField("typed_value", IntegerType, nullable = true)))) + + val fieldA = StructType(Seq( + StructField("value", BinaryType, nullable = true), + StructField("typed_value", TimestampNTZType, nullable = true))) + val arrayType = ArrayType(StructType(Seq( + StructField("value", BinaryType, nullable = true), + StructField("typed_value", StringType, nullable = true))), containsNull = false) + val fieldB = StructType(Seq( + StructField("value", BinaryType, nullable = true), + StructField("typed_value", arrayType, nullable = true))) + val objectType = StructType(Seq( + StructField("a", fieldA, nullable = false), + StructField("b", fieldB, nullable = false))) + val structSchema = DataType.fromDDL("a timestamp_ntz, b array") + assert(SparkShreddingUtils.variantShreddingSchema(structSchema) == + StructType(Seq( + StructField("metadata", BinaryType, nullable = false), + StructField("value", BinaryType, nullable = true), + StructField("typed_value", objectType, nullable = true)))) + } + test("shredding as fixed numeric types") { /* Cast integer to any wider numeric type. 
*/ testWithSchema("1", IntegerType, Row(emptyMetadata, null, 1)) @@ -179,6 +209,17 @@ class VariantWriteShreddingSuite extends SparkFunSuite with ExpressionEvalHelper // Not an object testWithSchema(obj, ArrayType(StructType.fromDDL("a int, b string")), Row(obj.getMetadata, untypedValue(obj), null)) + + // Similar to the case above where "b" was not in the shredding schema, but with the unshredded + // value being an object. Check that the copied value has correct dictionary IDs. + val obj2 = parseJson("""{"a": 1, "b": {"c": "hello"}}""") + val residual2 = untypedValue("""{"b": {"c": "hello"}}""") + // First byte is the type, second is number of fields, and the third is the ID for "b" + residual2(2) = 1 + // Followed by 2 bytes for offsets, inner object type and number of fields, then ID for "c". + residual2(7) = 2 + testWithSchema(obj2, StructType.fromDDL("a int, c string"), + Row(obj2.getMetadata, residual2, Row(Row(null, 1), Row(null, null)))) } test("shredding as array") { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/XPathFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/XPathFunctionsSuite.scala index f08466e8f8d9d..f2a86cbf54152 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/XPathFunctionsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/XPathFunctionsSuite.scala @@ -17,6 +17,8 @@ package org.apache.spark.sql +import org.apache.spark.sql.catalyst.expressions.IsNotNull +import org.apache.spark.sql.execution.FilterExec import org.apache.spark.sql.functions._ import org.apache.spark.sql.test.SharedSparkSession @@ -76,4 +78,38 @@ class XPathFunctionsSuite extends QueryTest with SharedSparkSession { checkAnswer(df.select(xpath(col("xml"), lit("a/*/text()"))), Row(Seq("b1", "b2", "b3", "c1", "c2"))) } + + test("The replacement of `xpath*` functions should be NullIntolerant") { + def check(df: DataFrame, expected: Seq[Row]): Unit = { + val filter = df.queryExecution + .sparkPlan + 
.find(_.isInstanceOf[FilterExec]) + .get.asInstanceOf[FilterExec] + assert(filter.condition.find(_.isInstanceOf[IsNotNull]).nonEmpty) + checkAnswer(df, expected) + } + withTable("t") { + sql("CREATE TABLE t AS SELECT * FROM VALUES ('1'), (NULL) T(xml)") + check(sql("SELECT * FROM t WHERE xpath_boolean(xml, 'a/b') = true"), + Seq(Row("1"))) + check(sql("SELECT * FROM t WHERE xpath_short(xml, 'a/b') = 1"), + Seq(Row("1"))) + check(sql("SELECT * FROM t WHERE xpath_int(xml, 'a/b') = 1"), + Seq(Row("1"))) + check(sql("SELECT * FROM t WHERE xpath_long(xml, 'a/b') = 1"), + Seq(Row("1"))) + check(sql("SELECT * FROM t WHERE xpath_float(xml, 'a/b') = 1"), + Seq(Row("1"))) + check(sql("SELECT * FROM t WHERE xpath_double(xml, 'a/b') = 1"), + Seq(Row("1"))) + check(sql("SELECT * FROM t WHERE xpath_string(xml, 'a/b') = '1'"), + Seq(Row("1"))) + } + withTable("t") { + sql("CREATE TABLE t AS SELECT * FROM VALUES " + + "('b1b2b3c1c2'), (NULL) T(xml)") + check(sql("SELECT * FROM t WHERE xpath(xml, 'a/b/text()') = array('b1', 'b2', 'b3')"), + Seq(Row("b1b2b3c1c2"))) + } + } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/analysis/resolver/ExplicitlyUnsupportedResolverFeatureSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/analysis/resolver/ExplicitlyUnsupportedResolverFeatureSuite.scala new file mode 100644 index 0000000000000..7fd7d570ecfc1 --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/analysis/resolver/ExplicitlyUnsupportedResolverFeatureSuite.scala @@ -0,0 +1,91 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.analysis.resolver + +import org.apache.spark.sql.QueryTest +import org.apache.spark.sql.catalyst.analysis.resolver.Resolver +import org.apache.spark.sql.test.SharedSparkSession + +class ExplicitlyUnsupportedResolverFeatureSuite extends QueryTest with SharedSparkSession { + test("Unsupported table types") { + withTable("csv_table") { + spark.sql("CREATE TABLE csv_table (col1 INT) USING CSV;").collect() + checkResolution("SELECT * FROM csv_table;", shouldPass = true) + } + withTable("json_table") { + spark.sql("CREATE TABLE json_table (col1 INT) USING JSON;").collect() + checkResolution("SELECT * FROM json_table;", shouldPass = true) + } + withTable("parquet_table") { + spark.sql("CREATE TABLE parquet_table (col1 INT) USING PARQUET;").collect() + checkResolution("SELECT * FROM parquet_table;", shouldPass = true) + } + withTable("orc_table") { + spark.sql("CREATE TABLE orc_table (col1 INT) USING ORC;").collect() + checkResolution("SELECT * FROM orc_table;", shouldPass = true) + } + } + + test("Unsupported view types") { + withTable("src_table") { + spark.sql("CREATE TABLE src_table (col1 INT) USING PARQUET;").collect() + + withView("temporary_view") { + spark.sql("CREATE TEMPORARY VIEW temporary_view AS SELECT * FROM src_table;").collect() + checkResolution("SELECT * FROM temporary_view;") + } + + withView("persistent_view") { + spark.sql("CREATE VIEW persistent_view AS SELECT * FROM src_table;").collect() + checkResolution("SELECT * FROM persistent_view;") + } + } + } + + test("Unsupported char type padding") { + 
withTable("char_type_padding") { + spark.sql(s"CREATE TABLE t1 (c1 CHAR(3), c2 STRING) USING PARQUET") + checkResolution("SELECT c1 = '12', c1 = '12 ', c1 = '12 ' FROM t1 WHERE c2 = '12'") + } + } + + test("Unsupported lateral column alias") { + checkResolution("SELECT 1 AS a, a AS b") + checkResolution("SELECT sum(1), `sum(1)` + 1 AS a") + } + + private def checkResolution(sqlText: String, shouldPass: Boolean = false): Unit = { + def noopWrapper(body: => Unit) = body + + val wrapper = if (shouldPass) { + noopWrapper _ + } else { + intercept[Throwable] _ + } + + val unresolvedPlan = spark.sql(sqlText).queryExecution.logical + + val resolver = new Resolver( + spark.sessionState.catalogManager, + extensions = spark.sessionState.analyzer.singlePassResolverExtensions + ) + wrapper { + resolver.lookupMetadataAndResolve(unresolvedPlan) + } + } +} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/analysis/resolver/HybridAnalyzerSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/analysis/resolver/HybridAnalyzerSuite.scala new file mode 100644 index 0000000000000..587725093f0e5 --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/analysis/resolver/HybridAnalyzerSuite.scala @@ -0,0 +1,404 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.analysis.resolver + +import org.scalactic.source.Position +import org.scalatest.Tag + +import org.apache.spark.sql.{AnalysisException, QueryTest} +import org.apache.spark.sql.catalyst.{ + AliasIdentifier, + ExtendedAnalysisException, + QueryPlanningTracker +} +import org.apache.spark.sql.catalyst.analysis.{ + AnalysisContext, + Analyzer, + UnresolvedAttribute, + UnresolvedStar +} +import org.apache.spark.sql.catalyst.analysis.resolver.{ + AnalyzerBridgeState, + ExplicitlyUnsupportedResolverFeature, + HybridAnalyzer, + Resolver, + ResolverGuard +} +import org.apache.spark.sql.catalyst.expressions.AttributeReference +import org.apache.spark.sql.catalyst.plans.logical.{ + LocalRelation, + LogicalPlan, + Project, + SubqueryAlias +} +import org.apache.spark.sql.errors.QueryCompilationErrors +import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.test.SharedSparkSession +import org.apache.spark.sql.types.{IntegerType, MetadataBuilder} + +class HybridAnalyzerSuite extends QueryTest with SharedSparkSession { + private val col1Integer = AttributeReference("col1", IntegerType)() + private val col2Integer = AttributeReference("col2", IntegerType)() + private val col2IntegerWithMetadata = AttributeReference( + "col2", + IntegerType, + metadata = (new MetadataBuilder).putString("comment", "this is an integer").build() + )() + + private def validateSinglePassResolverBridgeState(bridgeRelations: Boolean): Unit = { + assert(bridgeRelations == AnalysisContext.get.getSinglePassResolverBridgeState.isDefined) + } + + private class BrokenResolver(ex: Throwable, bridgeRelations: Boolean) + extends Resolver(spark.sessionState.catalogManager) { + override def lookupMetadataAndResolve( + plan: LogicalPlan, + analyzerBridgeState: Option[AnalyzerBridgeState] = None): LogicalPlan = { + 
validateSinglePassResolverBridgeState(bridgeRelations) + throw ex + } + } + + private class ValidatingResolver(bridgeRelations: Boolean) + extends Resolver(spark.sessionState.catalogManager) { + override def lookupMetadataAndResolve( + plan: LogicalPlan, + analyzerBridgeState: Option[AnalyzerBridgeState] = None): LogicalPlan = { + validateSinglePassResolverBridgeState(bridgeRelations) + super.lookupMetadataAndResolve(plan, analyzerBridgeState) + } + } + + private class HardCodedResolver(resolvedPlan: LogicalPlan, bridgeRelations: Boolean) + extends Resolver(spark.sessionState.catalogManager) { + override def lookupMetadataAndResolve( + plan: LogicalPlan, + analyzerBridgeState: Option[AnalyzerBridgeState] = None): LogicalPlan = { + validateSinglePassResolverBridgeState(bridgeRelations) + resolvedPlan + } + } + + private class ValidatingAnalyzer(bridgeRelations: Boolean) + extends Analyzer(spark.sessionState.catalogManager) { + override def executeAndTrack(plan: LogicalPlan, tracker: QueryPlanningTracker): LogicalPlan = { + validateSinglePassResolverBridgeState(bridgeRelations) + super.executeAndTrack(plan, tracker) + } + } + + private class BrokenAnalyzer(ex: Throwable, bridgeRelations: Boolean) + extends Analyzer(spark.sessionState.catalogManager) { + override def executeAndTrack(plan: LogicalPlan, tracker: QueryPlanningTracker): LogicalPlan = { + validateSinglePassResolverBridgeState(bridgeRelations) + throw ex + } + } + + private class CustomAnalyzer(customCode: () => Unit, bridgeRelations: Boolean) + extends Analyzer(spark.sessionState.catalogManager) { + override def executeAndTrack(plan: LogicalPlan, tracker: QueryPlanningTracker): LogicalPlan = { + validateSinglePassResolverBridgeState(bridgeRelations) + customCode() + super.executeAndTrack(plan, tracker) + } + } + + override protected def test(testName: String, testTags: Tag*)(testFun: => Any)( + implicit pos: Position): Unit = { + super.test(testName) { + withSQLConf( + 
SQLConf.ANALYZER_DUAL_RUN_LEGACY_AND_SINGLE_PASS_RESOLVER.key -> "true" + ) { + testFun + } + } + } + + test("Both fixed-point and single-pass analyzers pass") { + val plan: LogicalPlan = { + Project( + Seq(UnresolvedStar(None)), + LocalRelation(col1Integer) + ) + } + val resolvedPlan = + Project( + Seq(col1Integer), + LocalRelation(Seq(col1Integer)) + ) + assert( + new HybridAnalyzer( + new ValidatingAnalyzer(bridgeRelations = true), + new ResolverGuard(spark.sessionState.catalogManager), + new ValidatingResolver(bridgeRelations = true) + ).apply(plan, null) + == + resolvedPlan + ) + } + + test("Fixed-point analyzer passes, single-pass analyzer fails") { + val plan: LogicalPlan = + Project(Seq(UnresolvedStar(None)), LocalRelation(col1Integer)) + checkError( + exception = intercept[AnalysisException]( + new HybridAnalyzer( + new ValidatingAnalyzer(bridgeRelations = true), + new ResolverGuard(spark.sessionState.catalogManager), + new BrokenResolver( + QueryCompilationErrors.unsupportedSinglePassAnalyzerFeature("test"), + bridgeRelations = true + ) + ).apply(plan, null) + ), + condition = "UNSUPPORTED_SINGLE_PASS_ANALYZER_FEATURE", + parameters = Map("feature" -> "test") + ) + } + + test("Fixed-point analyzer fails, single-pass analyzer passes") { + val plan: LogicalPlan = + Project( + Seq(UnresolvedAttribute("nonexistent_col")), + LocalRelation(col1Integer) + ) + val resolvedPlan = + Project( + Seq(col1Integer), + LocalRelation(Seq(col1Integer)) + ) + checkError( + exception = intercept[AnalysisException]( + new HybridAnalyzer( + new ValidatingAnalyzer(bridgeRelations = true), + new ResolverGuard(spark.sessionState.catalogManager), + new HardCodedResolver(resolvedPlan, bridgeRelations = true) + ).apply(plan, null) + ), + condition = "HYBRID_ANALYZER_EXCEPTION.FIXED_POINT_FAILED_SINGLE_PASS_SUCCEEDED", + parameters = Map("singlePassOutput" -> resolvedPlan.toString) + ) + } + + test("Both fixed-point and single-pass analyzers fail") { + val plan: LogicalPlan = + 
Project( + Seq(UnresolvedAttribute("nonexistent_col")), + LocalRelation(col1Integer) + ) + checkError( + exception = intercept[ExtendedAnalysisException]( + new HybridAnalyzer( + new ValidatingAnalyzer(bridgeRelations = true), + new ResolverGuard(spark.sessionState.catalogManager), + new ValidatingResolver(bridgeRelations = true) + ).apply(plan, null) + ), + condition = "UNRESOLVED_COLUMN.WITH_SUGGESTION", + parameters = Map( + "objectName" -> "`nonexistent_col`", + "proposal" -> "`col1`" + ) + ) + } + + test("Plan mismatch") { + val plan: LogicalPlan = + Project( + Seq(UnresolvedAttribute("col1")), + SubqueryAlias( + AliasIdentifier("t", Seq.empty), + LocalRelation(Seq(col1Integer)) + ) + ) + val resolvedPlan = + Project( + Seq(col1Integer), + LocalRelation(Seq(col1Integer)) + ) + val expectedResolvedPlan = + Project( + Seq(col1Integer), + SubqueryAlias( + AliasIdentifier("t", Seq.empty), + LocalRelation(Seq(col1Integer)) + ) + ) + checkError( + exception = intercept[AnalysisException]( + new HybridAnalyzer( + new ValidatingAnalyzer(bridgeRelations = true), + new ResolverGuard(spark.sessionState.catalogManager), + new HardCodedResolver(resolvedPlan, bridgeRelations = true) + ).apply(plan, null) + ), + condition = "HYBRID_ANALYZER_EXCEPTION.LOGICAL_PLAN_COMPARISON_MISMATCH", + parameters = Map( + "singlePassOutput" -> resolvedPlan.toString, + "fixedPointOutput" -> expectedResolvedPlan.toString + ) + ) + } + + test("Missing metadata in output schema") { + val plan: LogicalPlan = + Project( + Seq(UnresolvedAttribute("col2")), + LocalRelation(col2IntegerWithMetadata) + ) + val resolvedPlan = + Project( + Seq(col2Integer), + LocalRelation(Seq(col2Integer)) + ) + checkError( + exception = intercept[AnalysisException]( + new HybridAnalyzer( + new ValidatingAnalyzer(bridgeRelations = true), + new ResolverGuard(spark.sessionState.catalogManager), + new HardCodedResolver(resolvedPlan, bridgeRelations = true) + ).apply(plan, null) + ), + condition = 
"HYBRID_ANALYZER_EXCEPTION.OUTPUT_SCHEMA_COMPARISON_MISMATCH", + parameters = Map( + "singlePassOutputSchema" -> "(col2,IntegerType,true,{})", + "fixedPointOutputSchema" -> "(col2,IntegerType,true,{\"comment\":\"this is an integer\"})" + ) + ) + } + + test("Explicitly unsupported resolver feature") { + val plan: LogicalPlan = { + Project( + Seq(UnresolvedStar(None)), + LocalRelation(col1Integer) + ) + } + checkAnswer( + new HybridAnalyzer( + new ValidatingAnalyzer(bridgeRelations = true), + new ResolverGuard(spark.sessionState.catalogManager), + new BrokenResolver( + new ExplicitlyUnsupportedResolverFeature("FAILURE"), + bridgeRelations = true + ) + ).apply(plan, null), + plan + ) + } + + test("Fixed-point only run") { + val plan = Project( + Seq(UnresolvedStar(None)), + LocalRelation(col1Integer) + ) + val resolvedPlan = Project( + Seq(col1Integer), + LocalRelation(Seq(col1Integer)) + ) + assert( + withSQLConf( + SQLConf.ANALYZER_DUAL_RUN_LEGACY_AND_SINGLE_PASS_RESOLVER.key -> "false" + ) { + new HybridAnalyzer( + new ValidatingAnalyzer(bridgeRelations = false), + new ResolverGuard(spark.sessionState.catalogManager), + new BrokenResolver( + new Exception("Single-pass resolver should not be invoked"), + bridgeRelations = false + ) + ).apply(plan, null) + } + == + resolvedPlan + ) + } + + test("Single-pass only run") { + val plan = Project( + Seq(UnresolvedStar(None)), + LocalRelation(col1Integer) + ) + val resolvedPlan = Project( + Seq(col1Integer), + LocalRelation(Seq(col1Integer)) + ) + assert( + withSQLConf( + SQLConf.ANALYZER_DUAL_RUN_LEGACY_AND_SINGLE_PASS_RESOLVER.key -> "false", + SQLConf.ANALYZER_SINGLE_PASS_RESOLVER_ENABLED.key -> "true" + ) { + new HybridAnalyzer( + new BrokenAnalyzer( + new Exception("Fixed-point analyzer should not be invoked"), + bridgeRelations = false + ), + new ResolverGuard(spark.sessionState.catalogManager), + new ValidatingResolver(bridgeRelations = false) + ).apply(plan, null) + } + == + resolvedPlan + ) + } + + test("Nested 
invocations") { + val plan = Project( + Seq(UnresolvedStar(None)), + LocalRelation(col1Integer) + ) + val resolvedPlan = Project( + Seq(col1Integer), + LocalRelation(Seq(col1Integer)) + ) + + val nestedAnalysis = () => { + assert( + withSQLConf( + SQLConf.ANALYZER_DUAL_RUN_LEGACY_AND_SINGLE_PASS_RESOLVER.key -> "false", + SQLConf.ANALYZER_SINGLE_PASS_RESOLVER_ENABLED.key -> "true" + ) { + new HybridAnalyzer( + new BrokenAnalyzer( + new Exception("Fixed-point analyzer should not be invoked"), + bridgeRelations = false + ), + new ResolverGuard(spark.sessionState.catalogManager), + new ValidatingResolver(bridgeRelations = false) + ).apply(plan, null) + } + == + resolvedPlan + ) + } + + assert( + new HybridAnalyzer( + new CustomAnalyzer( + customCode = () => { nestedAnalysis() }, + bridgeRelations = true + ), + new ResolverGuard(spark.sessionState.catalogManager), + new ValidatingResolver(bridgeRelations = true) + ).apply(plan, null) + == + resolvedPlan + ) + } +} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/analysis/resolver/MetadataResolverSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/analysis/resolver/MetadataResolverSuite.scala new file mode 100644 index 0000000000000..5fd21d7543b33 --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/analysis/resolver/MetadataResolverSuite.scala @@ -0,0 +1,277 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.analysis.resolver + +import scala.collection.mutable + +import org.apache.spark.sql.QueryTest +import org.apache.spark.sql.catalyst.{AliasIdentifier, TableIdentifier} +import org.apache.spark.sql.catalyst.analysis.UnresolvedRelation +import org.apache.spark.sql.catalyst.analysis.resolver.{ + AnalyzerBridgeState, + BridgedRelationMetadataProvider, + MetadataResolver, + RelationId, + Resolver +} +import org.apache.spark.sql.catalyst.catalog.UnresolvedCatalogRelation +import org.apache.spark.sql.catalyst.expressions.{Expression, PlanExpression} +import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, SubqueryAlias} +import org.apache.spark.sql.execution.datasources.{FileResolver, HadoopFsRelation, LogicalRelation} +import org.apache.spark.sql.test.{SharedSparkSession, SQLTestUtils} +import org.apache.spark.sql.types.{IntegerType, LongType, StringType, StructField, StructType} +import org.apache.spark.sql.util.CaseInsensitiveStringMap + +class MetadataResolverSuite extends QueryTest with SharedSparkSession with SQLTestUtils { + private val keyValueTableSchema = StructType( + Seq( + StructField("key", IntegerType, true), + StructField("value", StringType, true) + ) + ) + private val fileTableSchema = StructType( + Seq( + StructField("id", LongType, true) + ) + ) + + test("Single CSV relation") { + withTable("src_csv") { + spark.sql("CREATE TABLE src_csv (key INT, value STRING) USING CSV;").collect() + + checkResolveUnresolvedCatalogRelation( + sqlText = "SELECT * FROM src_csv", + expectedTableData = 
Seq(createTableData("src_csv")) + ) + } + } + + test("Single ORC relation") { + withTable("src_orc") { + spark.sql("CREATE TABLE src_orc (key INT, value STRING) USING ORC;").collect() + + checkResolveUnresolvedCatalogRelation( + sqlText = "SELECT * FROM src_orc", + expectedTableData = Seq(createTableData("src_orc")) + ) + } + } + + test("Relation inside an EXISTS subquery") { + withTable("src") { + spark.sql("CREATE TABLE src (key INT, value STRING) USING PARQUET;").collect() + + checkResolveUnresolvedCatalogRelation( + sqlText = "SELECT * FROM VALUES (1) WHERE EXISTS (SELECT col1 FROM src)", + expectedTableData = Seq(createTableData("src")) + ) + } + } + + test("Relation inside an IN subquery") { + withTable("src") { + spark.sql("CREATE TABLE src (key INT, value STRING) USING PARQUET;").collect() + + checkResolveUnresolvedCatalogRelation( + sqlText = "SELECT * FROM VALUES (1) WHERE col1 IN (SELECT col1 FROM src)", + expectedTableData = Seq(createTableData("src")) + ) + } + } + + test("Relation inside a nested subquery expression") { + withTable("src") { + spark.sql("CREATE TABLE src (key INT, value STRING) USING PARQUET;").collect() + + checkResolveUnresolvedCatalogRelation( + sqlText = """ + SELECT + col1 + ( + SELECT 35 * ( + SELECT key FROM src LIMIT 1 + ) * col1 FROM VALUES (2) + ) + FROM + VALUES (1) + """, + expectedTableData = Seq(createTableData("src")) + ) + } + } + + test("Relation from a file") { + val df = spark.range(100).toDF() + withTempPath(f => { + df.write.json(f.getCanonicalPath) + checkResolveLogicalRelation( + sqlText = s"select id from json.`${f.getCanonicalPath}`", + expectedTableData = Seq( + RelationId( + multipartIdentifier = Seq("spark_catalog", "json", s"${f.getCanonicalPath}") + ) -> TestTableData( + name = s"file:${f.getCanonicalPath}", + schema = fileTableSchema + ) + ) + ) + }) + } + + test("Relation bridged from legacy Analyzer") { + withTable("src") { + spark.sql("CREATE TABLE src (key INT, value STRING) USING PARQUET;").collect() 
+ + val analyzerBridgeState = new AnalyzerBridgeState + analyzerBridgeState.relationsWithResolvedMetadata.put( + UnresolvedRelation(Seq("src")), + createUnresolvedCatalogRelation("src") + ) + + checkResolveUnresolvedCatalogRelation( + sqlText = "SELECT * FROM src", + expectedTableData = Seq(createTableData("src")), + analyzerBridgeState = Some(analyzerBridgeState) + ) + } + } + + test("Relation not bridged from legacy Analyzer") { + withTable("src") { + spark.sql("CREATE TABLE src (key INT, value STRING) USING PARQUET;").collect() + + checkResolveUnresolvedCatalogRelation( + sqlText = "SELECT * FROM src", + expectedTableData = Seq.empty, + analyzerBridgeState = Some(new AnalyzerBridgeState) + ) + } + } + + private def checkResolveUnresolvedCatalogRelation( + sqlText: String, + expectedTableData: Seq[(RelationId, TestTableData)], + analyzerBridgeState: Option[AnalyzerBridgeState] = None): Unit = { + checkResolve( + sqlText, + expectedTableData, + relation => + relation.asInstanceOf[UnresolvedCatalogRelation].tableMeta.identifier.unquotedString, + relation => relation.asInstanceOf[UnresolvedCatalogRelation].tableMeta.schema, + analyzerBridgeState + ) + } + + private def checkResolveLogicalRelation( + sqlText: String, + expectedTableData: Seq[(RelationId, TestTableData)], + analyzerBridgeState: Option[AnalyzerBridgeState] = None): Unit = { + checkResolve( + sqlText, + expectedTableData, + relation => + relation + .asInstanceOf[LogicalRelation] + .relation + .asInstanceOf[HadoopFsRelation] + .location + .rootPaths + .mkString(","), + relation => relation.asInstanceOf[LogicalRelation].relation.schema, + analyzerBridgeState + ) + } + + private def checkResolve( + sqlText: String, + expectedTableData: Seq[(RelationId, TestTableData)], + getTableName: LogicalPlan => String, + getTableSchema: LogicalPlan => StructType, + analyzerBridgeState: Option[AnalyzerBridgeState]): Unit = { + val unresolvedPlan = spark.sql(sqlText).queryExecution.logical + + val metadataResolver = 
analyzerBridgeState match { + case Some(analyzerBridgeState) => + new BridgedRelationMetadataProvider( + spark.sessionState.catalogManager, + Resolver.createRelationResolution(spark.sessionState.catalogManager), + analyzerBridgeState + ) + case None => + val metadataResolver = new MetadataResolver( + spark.sessionState.catalogManager, + Resolver.createRelationResolution(spark.sessionState.catalogManager), + Seq(new FileResolver(spark)) + ) + metadataResolver.resolve(unresolvedPlan) + metadataResolver + } + + val actualTableData = new mutable.HashMap[RelationId, TestTableData] + + def findUnresolvedRelations(unresolvedPlan: LogicalPlan): Unit = unresolvedPlan.foreach { + case unresolvedRelation: UnresolvedRelation => + metadataResolver.getRelationWithResolvedMetadata(unresolvedRelation) match { + case Some(plan) => + val relationId = metadataResolver.relationIdFromUnresolvedRelation(unresolvedRelation) + val relation = plan match { + case SubqueryAlias(_, relation) => relation + case relation => relation + } + + actualTableData(relationId) = + TestTableData(getTableName(relation), getTableSchema(relation)) + case None => + } + case unresolvedPlan => + def traverseExpressions(expression: Expression): Unit = expression match { + case planExpression: PlanExpression[_] => + planExpression.plan match { + case plan: LogicalPlan => + findUnresolvedRelations(plan) + case _ => + } + case expression => + expression.children.foreach(traverseExpressions) + } + + unresolvedPlan.expressions.foreach(traverseExpressions) + } + + findUnresolvedRelations(unresolvedPlan) + + assert(actualTableData == mutable.HashMap(expectedTableData: _*)) + } + + private def createTableData(name: String) = + RelationId( + multipartIdentifier = Seq("spark_catalog", "default", name) + ) -> TestTableData( + name = s"spark_catalog.default.$name", + schema = keyValueTableSchema + ) + + private def createUnresolvedCatalogRelation(name: String) = SubqueryAlias( + AliasIdentifier(name), + 
UnresolvedCatalogRelation( + spark.sessionState.catalog.getTableMetadata(TableIdentifier(name)), + CaseInsensitiveStringMap.empty + ) + ) + + private case class TestTableData(name: String, schema: StructType) +} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/analysis/resolver/NameScopeSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/analysis/resolver/NameScopeSuite.scala new file mode 100644 index 0000000000000..ec744af89f000 --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/analysis/resolver/NameScopeSuite.scala @@ -0,0 +1,659 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.analysis.resolver + +import org.apache.spark.sql.AnalysisException +import org.apache.spark.sql.catalyst.SQLConfHelper +import org.apache.spark.sql.catalyst.analysis.UnresolvedStar +import org.apache.spark.sql.catalyst.analysis.resolver.{NameScope, NameScopeStack, NameTarget} +import org.apache.spark.sql.catalyst.expressions.{ + AttributeReference, + GetArrayItem, + GetArrayStructFields, + GetMapValue, + GetStructField, + Literal +} +import org.apache.spark.sql.catalyst.plans.PlanTest +import org.apache.spark.sql.types.{ + ArrayType, + BooleanType, + IntegerType, + MapType, + StringType, + StructField, + StructType +} + +class NameScopeSuite extends PlanTest with SQLConfHelper { + private val col1Integer = AttributeReference(name = "col1", dataType = IntegerType)() + private val col1IntegerOther = AttributeReference(name = "col1", dataType = IntegerType)() + private val col2Integer = AttributeReference(name = "col2", dataType = IntegerType)() + private val col3Boolean = AttributeReference(name = "col3", dataType = BooleanType)() + private val col4String = AttributeReference(name = "col4", dataType = StringType)() + private val col5String = AttributeReference(name = "col5", dataType = StringType)() + private val col6IntegerWithQualifier = AttributeReference( + name = "col6", + dataType = IntegerType + )(qualifier = Seq("catalog", "database", "table")) + private val col6IntegerOtherWithQualifier = AttributeReference( + name = "col6", + dataType = IntegerType + )(qualifier = Seq("catalog", "database", "table")) + private val col7StringWithQualifier = AttributeReference( + name = "col7", + dataType = IntegerType + )(qualifier = Seq("catalog", "database", "table")) + private val col8Struct = AttributeReference( + name = "col8", + dataType = StructType(Seq(StructField("field", IntegerType, true))) + )() + private val col9NestedStruct = AttributeReference( + name = "col9", + dataType = StructType( + Seq( + StructField( + "field", + 
StructType( + Seq( + StructField("subfield", IntegerType) + ) + ) + ) + ) + ) + )() + private val col10Map = AttributeReference( + name = "col10", + dataType = MapType(StringType, IntegerType) + )() + private val col11MapWithStruct = AttributeReference( + name = "col11", + dataType = MapType( + StringType, + StructType(Seq(StructField("field", StringType))) + ) + )() + private val col12Array = AttributeReference( + name = "col12", + dataType = ArrayType(IntegerType) + )() + private val col13ArrayWithStruct = AttributeReference( + name = "col13", + dataType = ArrayType( + StructType(Seq(StructField("field", StringType))) + ) + )() + + test("Empty scope") { + val nameScope = new NameScope + + assert(nameScope.getAllAttributes.isEmpty) + + assert(nameScope.matchMultipartName(Seq("col1")) == NameTarget(candidates = Seq.empty)) + } + + test("Single unnamed plan") { + val nameScope = new NameScope + + nameScope += Seq(col1Integer, col2Integer, col3Boolean) + + assert(nameScope.getAllAttributes == Seq(col1Integer, col2Integer, col3Boolean)) + + assert( + nameScope.matchMultipartName(Seq("col1")) == NameTarget( + candidates = Seq(col1Integer), + allAttributes = Seq(col1Integer, col2Integer, col3Boolean) + ) + ) + assert( + nameScope.matchMultipartName(Seq("col2")) == NameTarget( + candidates = Seq(col2Integer), + allAttributes = Seq(col1Integer, col2Integer, col3Boolean) + ) + ) + assert( + nameScope.matchMultipartName(Seq("col3")) == NameTarget( + candidates = Seq(col3Boolean), + allAttributes = Seq(col1Integer, col2Integer, col3Boolean) + ) + ) + assert( + nameScope.matchMultipartName(Seq("col4")) == NameTarget( + candidates = Seq.empty, + allAttributes = Seq(col1Integer, col2Integer, col3Boolean) + ) + ) + } + + test("Several unnamed plans") { + val nameScope = new NameScope + + nameScope += Seq(col1Integer) + nameScope += Seq(col2Integer, col3Boolean) + nameScope += Seq(col4String) + + assert(nameScope.getAllAttributes == Seq(col1Integer, col2Integer, col3Boolean, 
col4String)) + + assert( + nameScope.matchMultipartName(Seq("col1")) == NameTarget( + candidates = Seq(col1Integer), + allAttributes = Seq(col1Integer, col2Integer, col3Boolean, col4String) + ) + ) + assert( + nameScope.matchMultipartName(Seq("col2")) == NameTarget( + candidates = Seq(col2Integer), + allAttributes = Seq(col1Integer, col2Integer, col3Boolean, col4String) + ) + ) + assert( + nameScope.matchMultipartName(Seq("col3")) == NameTarget( + candidates = Seq(col3Boolean), + allAttributes = Seq(col1Integer, col2Integer, col3Boolean, col4String) + ) + ) + assert( + nameScope.matchMultipartName(Seq("col4")) == NameTarget( + candidates = Seq(col4String), + allAttributes = Seq(col1Integer, col2Integer, col3Boolean, col4String) + ) + ) + assert( + nameScope.matchMultipartName(Seq("col5")) == NameTarget( + candidates = Seq.empty, + allAttributes = Seq(col1Integer, col2Integer, col3Boolean, col4String) + ) + ) + } + + test("Single named plan") { + val nameScope = new NameScope + + nameScope("table1") = Seq(col1Integer, col2Integer, col3Boolean) + + assert(nameScope.getAllAttributes == Seq(col1Integer, col2Integer, col3Boolean)) + + assert( + nameScope.matchMultipartName(Seq("col1")) == NameTarget( + candidates = Seq(col1Integer), + allAttributes = Seq(col1Integer, col2Integer, col3Boolean) + ) + ) + assert( + nameScope.matchMultipartName(Seq("col2")) == NameTarget( + candidates = Seq(col2Integer), + allAttributes = Seq(col1Integer, col2Integer, col3Boolean) + ) + ) + assert( + nameScope.matchMultipartName(Seq("col3")) == NameTarget( + candidates = Seq(col3Boolean), + allAttributes = Seq(col1Integer, col2Integer, col3Boolean) + ) + ) + assert( + nameScope.matchMultipartName(Seq("col4")) == NameTarget( + candidates = Seq.empty, + allAttributes = Seq(col1Integer, col2Integer, col3Boolean) + ) + ) + } + + test("Several named plans") { + val nameScope = new NameScope + + nameScope("table1") = Seq(col1Integer) + nameScope("table2") = Seq(col2Integer, col3Boolean) + 
nameScope("table2") = Seq(col4String) + nameScope("table3") = Seq(col5String) + + assert( + nameScope.getAllAttributes == Seq( + col1Integer, + col2Integer, + col3Boolean, + col4String, + col5String + ) + ) + + assert( + nameScope.matchMultipartName(Seq("col1")) == NameTarget( + candidates = Seq(col1Integer), + allAttributes = Seq(col1Integer, col2Integer, col3Boolean, col4String, col5String) + ) + ) + assert( + nameScope.matchMultipartName(Seq("col2")) == NameTarget( + candidates = Seq(col2Integer), + allAttributes = Seq(col1Integer, col2Integer, col3Boolean, col4String, col5String) + ) + ) + assert( + nameScope.matchMultipartName(Seq("col3")) == NameTarget( + candidates = Seq(col3Boolean), + allAttributes = Seq(col1Integer, col2Integer, col3Boolean, col4String, col5String) + ) + ) + assert( + nameScope.matchMultipartName(Seq("col4")) == NameTarget( + candidates = Seq(col4String), + allAttributes = Seq(col1Integer, col2Integer, col3Boolean, col4String, col5String) + ) + ) + assert( + nameScope.matchMultipartName(Seq("col5")) == NameTarget( + candidates = Seq(col5String), + allAttributes = Seq(col1Integer, col2Integer, col3Boolean, col4String, col5String) + ) + ) + assert( + nameScope.matchMultipartName(Seq("col6")) == NameTarget( + candidates = Seq.empty, + allAttributes = Seq(col1Integer, col2Integer, col3Boolean, col4String, col5String) + ) + ) + } + + test("Named and unnamed plans with case insensitive comparison") { + val col1Integer = AttributeReference(name = "Col1", dataType = IntegerType)() + val col2Integer = AttributeReference(name = "col2", dataType = IntegerType)() + val col3Boolean = AttributeReference(name = "coL3", dataType = BooleanType)() + val col4String = AttributeReference(name = "Col4", dataType = StringType)() + + val nameScope = new NameScope + + nameScope("TaBle1") = Seq(col1Integer) + nameScope("table2") = Seq(col2Integer, col3Boolean) + nameScope += Seq(col4String) + + assert(nameScope.getAllAttributes == Seq(col1Integer, col2Integer, 
col3Boolean, col4String)) + + assert( + nameScope.matchMultipartName(Seq("cOL1")) == NameTarget( + candidates = Seq(col1Integer.withName("cOL1")), + allAttributes = Seq(col1Integer, col2Integer, col3Boolean, col4String) + ) + ) + assert( + nameScope.matchMultipartName(Seq("CoL2")) == NameTarget( + candidates = Seq(col2Integer.withName("CoL2")), + allAttributes = Seq(col1Integer, col2Integer, col3Boolean, col4String) + ) + ) + assert( + nameScope.matchMultipartName(Seq("col3")) == NameTarget( + candidates = Seq(col3Boolean.withName("col3")), + allAttributes = Seq(col1Integer, col2Integer, col3Boolean, col4String) + ) + ) + assert( + nameScope.matchMultipartName(Seq("COL4")) == NameTarget( + candidates = Seq(col4String.withName("COL4")), + allAttributes = Seq(col1Integer, col2Integer, col3Boolean, col4String) + ) + ) + assert( + nameScope.matchMultipartName(Seq("col5")) == NameTarget( + candidates = Seq.empty, + allAttributes = Seq(col1Integer, col2Integer, col3Boolean, col4String) + ) + ) + } + + test("Duplicate attribute names from one plan") { + val nameScope = new NameScope + + nameScope("table1") = Seq(col1Integer, col1Integer) + nameScope("table1") = Seq(col1IntegerOther) + + assert(nameScope.getAllAttributes == Seq(col1Integer, col1Integer, col1IntegerOther)) + + nameScope.matchMultipartName(Seq("col1")) == NameTarget( + candidates = Seq(col1Integer, col1IntegerOther) + ) + } + + test("Duplicate attribute names from several plans") { + val nameScope = new NameScope + + nameScope("table1") = Seq(col1Integer, col1IntegerOther) + nameScope("table2") = Seq(col1Integer, col1IntegerOther) + + assert( + nameScope.getAllAttributes == Seq( + col1Integer, + col1IntegerOther, + col1Integer, + col1IntegerOther + ) + ) + + nameScope.matchMultipartName(Seq("col1")) == NameTarget( + candidates = Seq( + col1Integer, + col1IntegerOther, + col1Integer, + col1IntegerOther + ) + ) + } + + test("Expand star") { + val nameScope = new NameScope + + nameScope("table") = + 
Seq(col6IntegerWithQualifier, col6IntegerOtherWithQualifier, col7StringWithQualifier) + + Seq(Seq("table"), Seq("database", "table"), Seq("catalog", "database", "table")) + .foreach(tableQualifier => { + assert( + nameScope.expandStar(UnresolvedStar(Some(tableQualifier))) + == Seq(col6IntegerWithQualifier, col6IntegerOtherWithQualifier, col7StringWithQualifier) + ) + }) + + checkError( + exception = intercept[AnalysisException]( + nameScope.expandStar(UnresolvedStar(Some(Seq("database", "table_fail")))) + ), + condition = "CANNOT_RESOLVE_STAR_EXPAND", + parameters = Map( + "targetString" -> "`database`.`table_fail`", + "columns" -> "`col6`, `col6`, `col7`" + ) + ) + + nameScope("table2") = Seq(col6IntegerWithQualifier) + + checkError( + exception = intercept[AnalysisException]( + nameScope.expandStar(UnresolvedStar(Some(Seq("table2")))) + ), + condition = "INVALID_USAGE_OF_STAR_OR_REGEX", + parameters = Map( + "elem" -> "'*'", + "prettyName" -> "query" + ) + ) + } + + test("Multipart attribute names") { + val nameScope = new NameScope + + nameScope("table") = Seq(col6IntegerWithQualifier) + + for (multipartIdentifier <- Seq( + Seq("catalog", "database", "table", "col6"), + Seq("database", "table", "col6"), + Seq("table", "col6") + )) { + assert( + nameScope.matchMultipartName(multipartIdentifier) == NameTarget( + candidates = Seq( + col6IntegerWithQualifier + ), + allAttributes = Seq(col6IntegerWithQualifier) + ) + ) + } + + for (multipartIdentifier <- Seq( + Seq("catalog.database.table", "col6"), + Seq("`database`.`table`.`col6`"), + Seq("table.col6") + )) { + assert( + nameScope.matchMultipartName(multipartIdentifier) == NameTarget( + candidates = Seq.empty, + allAttributes = Seq(col6IntegerWithQualifier) + ) + ) + } + } + + test("Nested fields") { + val nameScope = new NameScope + + nameScope("table") = Seq( + col8Struct, + col9NestedStruct, + col10Map, + col11MapWithStruct, + col12Array, + col13ArrayWithStruct + ) + + var matchedStructs = 
nameScope.matchMultipartName(Seq("col8", "field")) + assert( + matchedStructs == NameTarget( + candidates = Seq( + GetStructField(col8Struct, 0, Some("field")) + ), + aliasName = Some("field"), + allAttributes = Seq( + col8Struct, + col9NestedStruct, + col10Map, + col11MapWithStruct, + col12Array, + col13ArrayWithStruct + ) + ) + ) + + matchedStructs = nameScope.matchMultipartName(Seq("col9", "field", "subfield")) + assert( + matchedStructs == NameTarget( + candidates = Seq( + GetStructField( + GetStructField( + col9NestedStruct, + 0, + Some("field") + ), + 0, + Some("subfield") + ) + ), + aliasName = Some("subfield"), + allAttributes = Seq( + col8Struct, + col9NestedStruct, + col10Map, + col11MapWithStruct, + col12Array, + col13ArrayWithStruct + ) + ) + ) + + var matchedMaps = nameScope.matchMultipartName(Seq("col10", "key")) + assert( + matchedMaps == NameTarget( + candidates = Seq(GetMapValue(col10Map, Literal("key"))), + aliasName = Some("key"), + allAttributes = Seq( + col8Struct, + col9NestedStruct, + col10Map, + col11MapWithStruct, + col12Array, + col13ArrayWithStruct + ) + ) + ) + + matchedMaps = nameScope.matchMultipartName(Seq("col11", "key")) + assert( + matchedMaps == NameTarget( + candidates = Seq(GetMapValue(col11MapWithStruct, Literal("key"))), + aliasName = Some("key"), + allAttributes = Seq( + col8Struct, + col9NestedStruct, + col10Map, + col11MapWithStruct, + col12Array, + col13ArrayWithStruct + ) + ) + ) + + var matchedArrays = nameScope.matchMultipartName(Seq("col12", "element")) + assert( + matchedArrays == NameTarget( + candidates = Seq(GetArrayItem(col12Array, Literal("element"))), + aliasName = Some("element"), + allAttributes = Seq( + col8Struct, + col9NestedStruct, + col10Map, + col11MapWithStruct, + col12Array, + col13ArrayWithStruct + ) + ) + ) + + matchedArrays = nameScope.matchMultipartName(Seq("col13", "field")) + assert( + matchedArrays == NameTarget( + candidates = Seq( + GetArrayStructFields( + col13ArrayWithStruct, + 
StructField("field", StringType, true), + 0, + 1, + true + ) + ), + aliasName = Some("field"), + allAttributes = Seq( + col8Struct, + col9NestedStruct, + col10Map, + col11MapWithStruct, + col12Array, + col13ArrayWithStruct + ) + ) + ) + + nameScope("table2") = Seq(col8Struct) + matchedStructs = nameScope.matchMultipartName(Seq("col8", "field")) + assert( + matchedStructs == NameTarget( + candidates = Seq( + GetStructField( + col8Struct, + 0, + Some("field") + ), + GetStructField( + col8Struct, + 0, + Some("field") + ) + ), + aliasName = Some("field"), + allAttributes = Seq( + col8Struct, + col9NestedStruct, + col10Map, + col11MapWithStruct, + col12Array, + col13ArrayWithStruct, + col8Struct + ) + ) + ) + } +} + +class NameScopeStackSuite extends PlanTest { + private val col1Integer = AttributeReference(name = "col1", dataType = IntegerType)() + private val col2String = AttributeReference(name = "col2", dataType = StringType)() + private val col3Integer = AttributeReference(name = "col3", dataType = IntegerType)() + private val col4String = AttributeReference(name = "col4", dataType = StringType)() + + test("Empty stack") { + val stack = new NameScopeStack + + assert(stack.top.getAllAttributes.isEmpty) + } + + test("Overwrite top of the stack containing single scope") { + val stack = new NameScopeStack + + stack.top.update("table1", Seq(col1Integer, col2String)) + assert(stack.top.getAllAttributes == Seq(col1Integer, col2String)) + + stack.overwriteTop("table2", Seq(col3Integer, col4String)) + assert(stack.top.getAllAttributes == Seq(col3Integer, col4String)) + + stack.overwriteTop(Seq(col2String)) + assert(stack.top.getAllAttributes == Seq(col2String)) + } + + test("Overwrite top of the stack containing several scopes") { + val stack = new NameScopeStack + + stack.top.update("table2", Seq(col3Integer)) + + stack.withNewScope { + assert(stack.top.getAllAttributes.isEmpty) + + stack.top.update("table1", Seq(col1Integer, col2String)) + 
assert(stack.top.getAllAttributes == Seq(col1Integer, col2String)) + + stack.overwriteTop("table2", Seq(col3Integer, col4String)) + assert(stack.top.getAllAttributes == Seq(col3Integer, col4String)) + + stack.overwriteTop(Seq(col2String)) + assert(stack.top.getAllAttributes == Seq(col2String)) + } + } + + test("Scope stacking") { + val stack = new NameScopeStack + + stack.top.update("table1", Seq(col1Integer)) + + stack.withNewScope { + stack.top.update("table2", Seq(col2String)) + + stack.withNewScope { + stack.top.update("table3", Seq(col3Integer)) + + stack.withNewScope { + stack.top.update("table4", Seq(col4String)) + + assert(stack.top.getAllAttributes == Seq(col4String)) + } + + assert(stack.top.getAllAttributes == Seq(col3Integer)) + } + + assert(stack.top.getAllAttributes == Seq(col2String)) + } + + assert(stack.top.getAllAttributes == Seq(col1Integer)) + } +} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/analysis/resolver/ResolverGuardSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/analysis/resolver/ResolverGuardSuite.scala new file mode 100644 index 0000000000000..d512adbb0af37 --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/analysis/resolver/ResolverGuardSuite.scala @@ -0,0 +1,207 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.analysis.resolver + +import org.apache.spark.sql.QueryTest +import org.apache.spark.sql.catalyst.analysis.resolver.ResolverGuard +import org.apache.spark.sql.test.SharedSparkSession + +class ResolverGuardSuite extends QueryTest with SharedSparkSession { + + // Queries that should pass the OperatorResolverGuard + + test("Select * from an inline table") { + checkResolverGuard("SELECT * FROM VALUES(1,2,3)", shouldPass = true) + } + + test("Select the named parameters from an inline table") { + checkResolverGuard("SELECT col1,col2,col3 FROM VALUES(1,2,3)", shouldPass = true) + } + + test("Inline table as a top level operator") { + checkResolverGuard("VALUES(1,2,3)", shouldPass = true) + } + + test("Select one row") { + checkResolverGuard("SELECT 'Hello world!'", shouldPass = true) + } + + test("Where clause with a literal") { + checkResolverGuard( + "SELECT * FROM VALUES(1, 2, false), (3, 4, true) WHERE true", + shouldPass = true + ) + } + + test("Where clause with an attribute") { + checkResolverGuard( + "SELECT * FROM VALUES(1, 2, false), (3, 4, true) WHERE col3", + shouldPass = true + ) + } + + test("Explicit cast with auto-alias") { + checkResolverGuard( + "SELECT CAST(1 AS DECIMAL(3,2))", + shouldPass = true + ) + } + + test("Multipart attribute name") { + checkResolverGuard("SELECT table.col1 FROM VALUES(1) AS table", shouldPass = true) + } + + test("Predicates") { + checkResolverGuard("SELECT true and false", shouldPass = true) + checkResolverGuard("SELECT true or false", shouldPass = true) + checkResolverGuard( + "SELECT col1 from VALUES(1,2) where true and false or true", + shouldPass = true + ) + checkResolverGuard("SELECT 1 = 2", shouldPass = true) + checkResolverGuard("SELECT 1 != 2", shouldPass = true) + checkResolverGuard("SELECT 1 IN (1,2,3)", shouldPass = true) + checkResolverGuard("SELECT 1 NOT IN (1,2,3)", 
shouldPass = true) + checkResolverGuard("SELECT 1 IS NULL", shouldPass = true) + checkResolverGuard("SELECT 1 IS NOT NULL", shouldPass = true) + checkResolverGuard("SELECT INTERVAL '1' DAY > INTERVAL '1' HOUR", shouldPass = true) + } + + test("Star target") { + checkResolverGuard("SELECT table.* FROM VALUES(1) as table", shouldPass = true) + } + + test("Binary arithmetic") { + checkResolverGuard("SELECT col1+col2 FROM VALUES(1,2)", shouldPass = true) + checkResolverGuard("SELECT 1 + 2.3 / 2 - 3 DIV 2 + 3.0 * 10.0", shouldPass = true) + checkResolverGuard( + "SELECT TIMESTAMP'2011-11-11 11:11:11' - TIMESTAMP'2011-11-11 11:11:10'", + shouldPass = true + ) + checkResolverGuard( + "SELECT DATE'2020-01-01' - TIMESTAMP'2019-10-06 10:11:12.345678'", + shouldPass = true + ) + checkResolverGuard("SELECT DATE'2012-01-01' - INTERVAL 3 HOURS", shouldPass = true) + checkResolverGuard( + "SELECT DATE'2012-01-01' + INTERVAL '12:12:12' HOUR TO SECOND", + shouldPass = true + ) + checkResolverGuard("SELECT DATE'2012-01-01' + 1", shouldPass = true) + checkResolverGuard("SELECT 2 * INTERVAL 2 YEAR", shouldPass = true) + } + + test("Supported recursive types") { + Seq("ARRAY", "MAP", "STRUCT").foreach { typeName => + checkResolverGuard( + s"SELECT col1 FROM VALUES($typeName(1,2),3)", + shouldPass = true + ) + } + } + + test("Recursive types related functions") { + checkResolverGuard("SELECT NAMED_STRUCT('a', 1)", shouldPass = true) + checkResolverGuard("SELECT MAP_CONTAINS_KEY(MAP(1, 'a', 2, 'b'), 2)", shouldPass = true) + checkResolverGuard("SELECT ARRAY_CONTAINS(ARRAY(1, 2, 3), 2);", shouldPass = true) + } + + test("Conditional expressions") { + checkResolverGuard("SELECT COALESCE(NULL, 1)", shouldPass = true) + checkResolverGuard("SELECT col1, IF(col1 > 1, 1, 0) FROM VALUES(1,2),(2,3)", shouldPass = true) + checkResolverGuard( + "SELECT col1, CASE WHEN col1 > 1 THEN 1 ELSE 0 END FROM VALUES(1,2),(2,3)", + shouldPass = true + ) + } + + test("User specified alias") { + 
checkResolverGuard("SELECT 1 AS alias", shouldPass = true) + } + + // Queries that shouldn't pass the OperatorResolverGuard + + test("Select from table") { + withTable("test_table") { + sql("CREATE TABLE test_table (col1 INT, col2 INT)") + checkResolverGuard("SELECT * FROM test_table", shouldPass = true) + } + } + + test("Single-layer subquery") { + checkResolverGuard("SELECT * FROM (SELECT * FROM VALUES(1))", shouldPass = true) + } + + test("Multi-layer subquery") { + checkResolverGuard("SELECT * FROM (SELECT * FROM (SELECT * FROM VALUES(1)))", shouldPass = true) + } + + test("Scalar subquery") { + checkResolverGuard("SELECT (SELECT * FROM VALUES(1))", shouldPass = false) + } + + test("EXISTS subquery") { + checkResolverGuard( + "SELECT * FROM VALUES (1) WHERE EXISTS (SELECT * FROM VALUES(1))", + shouldPass = false + ) + } + + test("IN subquery") { + checkResolverGuard( + "SELECT * FROM VALUES (1) WHERE col1 IN (SELECT * FROM VALUES(1))", + shouldPass = false + ) + } + + test("Function") { + checkResolverGuard("SELECT current_date()", shouldPass = false) + } + + test("Function without the braces") { + checkResolverGuard("SELECT current_date", shouldPass = false) + } + + test("Session variables") { + withSessionVariable { + checkResolverGuard("SELECT session_variable", shouldPass = false) + } + } + + test("Case sensitive analysis") { + withSQLConf("spark.sql.caseSensitive" -> "true") { + checkResolverGuard("SELECT 1", shouldPass = false) + } + } + + private def checkResolverGuard(query: String, shouldPass: Boolean): Unit = { + val resolverGuard = new ResolverGuard(spark.sessionState.catalogManager) + assert( + resolverGuard.apply(sql(query).queryExecution.logical) == shouldPass + ) + } + + private def withSessionVariable(body: => Unit): Unit = { + sql("DECLARE session_variable = 1;") + try { + body + } finally { + sql("DROP TEMPORARY VARIABLE session_variable;") + } + } +} diff --git 
a/sql/core/src/test/scala/org/apache/spark/sql/analysis/resolver/ResolverSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/analysis/resolver/ResolverSuite.scala new file mode 100644 index 0000000000000..057724758d332 --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/analysis/resolver/ResolverSuite.scala @@ -0,0 +1,157 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.analysis.resolver + +import org.apache.spark.sql.{AnalysisException, QueryTest} +import org.apache.spark.sql.catalyst.analysis.UnresolvedAttribute +import org.apache.spark.sql.catalyst.analysis.resolver.{Resolver, ResolverExtension} +import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference} +import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, Project} +import org.apache.spark.sql.test.SharedSparkSession +import org.apache.spark.sql.types.IntegerType + +class ResolverSuite extends QueryTest with SharedSparkSession { + private val col1Integer = AttributeReference("col1", IntegerType)() + + test("Node matched the extension") { + val resolver = createResolver( + Seq( + new NoopResolver, + new TestRelationResolver + ) + ) + + val result = resolver.lookupMetadataAndResolve( + Project( + Seq(UnresolvedAttribute("col1")), + TestRelation(resolutionDone = false, output = Seq(col1Integer)) + ) + ) + assert( + result == Project( + Seq(col1Integer), + TestRelation(resolutionDone = true, output = Seq(col1Integer)) + ) + ) + } + + test("Node didn't match the extension") { + val resolver = createResolver( + Seq( + new NoopResolver, + new TestRelationResolver + ) + ) + + checkError( + exception = intercept[AnalysisException]( + resolver.lookupMetadataAndResolve( + Project( + Seq(UnresolvedAttribute("col1")), + UnknownRelation(output = Seq(col1Integer)) + ) + ) + ), + condition = "UNSUPPORTED_SINGLE_PASS_ANALYZER_FEATURE", + parameters = Map( + "feature" -> ("class " + + "org.apache.spark.sql.analysis.resolver.ResolverSuite$UnknownRelation operator resolution") + ) + ) + } + + test("Ambiguous extensions") { + val resolver = createResolver( + Seq( + new NoopResolver, + new TestRelationResolver, + new TestRelationBrokenResolver + ) + ) + + checkError( + exception = intercept[AnalysisException]( + resolver.lookupMetadataAndResolve( + Project( + Seq(UnresolvedAttribute("col1")), + TestRelation(resolutionDone = 
false, output = Seq(col1Integer)) + ) + ) + ), + condition = "AMBIGUOUS_RESOLVER_EXTENSION", + parameters = Map( + "operator" -> "org.apache.spark.sql.analysis.resolver.ResolverSuite$TestRelation", + "extensions" -> "TestRelationResolver, TestRelationBrokenResolver" + ) + ) + } + + private def createResolver(extensions: Seq[ResolverExtension] = Seq.empty): Resolver = { + new Resolver(spark.sessionState.catalogManager, extensions) + } + + private class TestRelationResolver extends ResolverExtension { + var timesCalled = 0 + + override def resolveOperator: PartialFunction[LogicalPlan, LogicalPlan] = { + case testNode: TestRelation if countTimesCalled() => + testNode.copy(resolutionDone = true) + } + + private def countTimesCalled(): Boolean = { + timesCalled += 1 + assert(timesCalled == 1) + true + } + } + + private class TestRelationBrokenResolver extends ResolverExtension { + override def resolveOperator: PartialFunction[LogicalPlan, LogicalPlan] = { + case testNode: TestRelation => + assert(false) + testNode + } + } + + private class NoopResolver extends ResolverExtension { + override def resolveOperator: PartialFunction[LogicalPlan, LogicalPlan] = { + case node: LogicalPlan if false => + assert(false) + node + } + } + + private case class TestRelation( + resolutionDone: Boolean, + override val output: Seq[Attribute], + override val children: Seq[LogicalPlan] = Seq.empty) + extends LogicalPlan { + override protected def withNewChildrenInternal( + newChildren: IndexedSeq[LogicalPlan]): TestRelation = + copy(children = newChildren) + } + + private case class UnknownRelation( + override val output: Seq[Attribute], + override val children: Seq[LogicalPlan] = Seq.empty) + extends LogicalPlan { + override protected def withNewChildrenInternal( + newChildren: IndexedSeq[LogicalPlan]): UnknownRelation = + copy(children = newChildren) + } +} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/analysis/resolver/TracksResolvedNodesSuite.scala 
b/sql/core/src/test/scala/org/apache/spark/sql/analysis/resolver/TracksResolvedNodesSuite.scala new file mode 100644 index 0000000000000..b7bf73f326fa8 --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/analysis/resolver/TracksResolvedNodesSuite.scala @@ -0,0 +1,135 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.analysis.resolver + +import org.apache.spark.SparkException +import org.apache.spark.sql.QueryTest +import org.apache.spark.sql.catalyst.analysis.FunctionResolution +import org.apache.spark.sql.catalyst.analysis.resolver.{ + ExpressionResolver, + NameScopeStack, + PlanLogger, + Resolver +} +import org.apache.spark.sql.catalyst.expressions.{AttributeReference, Cast, ExprId} +import org.apache.spark.sql.catalyst.plans.logical.{OneRowRelation, Project} +import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.test.SharedSparkSession +import org.apache.spark.sql.types.{BooleanType, StringType} + +class TracksResolvedNodesSuite extends QueryTest with SharedSparkSession { + + override def beforeAll(): Unit = { + super.beforeAll() + spark.conf.set(SQLConf.ANALYZER_SINGLE_PASS_TRACK_RESOLVED_NODES_ENABLED.key, "true") + } + + test("Single-pass contract preserved for equal expressions with different memory addresses") { + val expressionResolver = createExpressionResolver() + val columnObjFirst = + AttributeReference(name = "column", dataType = BooleanType)(exprId = ExprId(0)) + val columnObjSecond = + AttributeReference(name = "column", dataType = BooleanType)(exprId = ExprId(0)) + + expressionResolver.resolve(columnObjFirst) + expressionResolver.resolve(columnObjSecond) + } + + test("Single-pass contract broken for operators") { + val resolver = createResolver() + + val project = Project( + projectList = Seq(), + child = Project( + projectList = Seq(), + child = OneRowRelation() + ) + ) + + val resolvedProject = resolver.lookupMetadataAndResolve(project) + + checkError( + exception = intercept[SparkException]({ + resolver.lookupMetadataAndResolve(resolvedProject.children.head) + }), + condition = "INTERNAL_ERROR", + parameters = Map( + "message" -> ("Single-pass resolver attempted to resolve the same " + + "node more than once: Project\n+- OneRowRelation\n") + ) + ) + checkError( + exception = 
intercept[SparkException]({ + resolver.lookupMetadataAndResolve(resolvedProject) + }), + condition = "INTERNAL_ERROR", + parameters = Map( + "message" -> ("Single-pass resolver attempted to resolve the same " + + "node more than once: Project\n+- Project\n +- OneRowRelation\n") + ) + ) + } + + test("Single-pass contract broken for expressions") { + val expressionResolver = createExpressionResolver() + + val cast = Cast( + child = AttributeReference(name = "column", dataType = BooleanType)(exprId = ExprId(0)), + dataType = StringType + ) + + val resolvedCast = expressionResolver.resolve(cast) + + checkError( + exception = intercept[SparkException]({ + expressionResolver.resolve(resolvedCast.children.head) + }), + condition = "INTERNAL_ERROR", + parameters = Map( + "message" -> ("Single-pass resolver attempted " + + "to resolve the same node more than once: column#0") + ) + ) + checkError( + exception = intercept[SparkException]({ + expressionResolver.resolve(resolvedCast) + }), + condition = "INTERNAL_ERROR", + parameters = Map( + "message" -> ("Single-pass resolver attempted " + + "to resolve the same node more than once: cast(column#0 as string)") + ) + ) + } + + private def createResolver(): Resolver = { + new Resolver(spark.sessionState.catalogManager) + } + + private def createExpressionResolver(): ExpressionResolver = { + new ExpressionResolver( + createResolver(), + new NameScopeStack, + new FunctionResolution( + spark.sessionState.catalogManager, + Resolver.createRelationResolution(spark.sessionState.catalogManager) + ), + new PlanLogger + ) + } +} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/artifact/ArtifactManagerSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/artifact/ArtifactManagerSuite.scala index e935af8b8bf8c..a24982aea1585 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/artifact/ArtifactManagerSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/artifact/ArtifactManagerSuite.scala @@ -23,6 +23,7 @@ import 
java.nio.file.{Files, Path, Paths} import org.apache.commons.io.FileUtils import org.apache.spark.{SparkConf, SparkException} +import org.apache.spark.metrics.source.CodegenMetrics import org.apache.spark.sql.SparkSession import org.apache.spark.sql.functions.col import org.apache.spark.sql.internal.SQLConf @@ -47,7 +48,7 @@ class ArtifactManagerSuite extends SharedSparkSession { private def sessionUUID: String = spark.sessionUUID override def afterEach(): Unit = { - artifactManager.cleanUpResources() + artifactManager.cleanUpResourcesForTesting() super.afterEach() } @@ -208,7 +209,7 @@ class ArtifactManagerSuite extends SharedSparkSession { assert(expectedPath.toFile.exists()) // Remove resources - artifactManager.cleanUpResources() + artifactManager.cleanUpResourcesForTesting() assert(blockManager.getLocalBytes(blockId).isEmpty) assert(!expectedPath.toFile.exists()) @@ -293,7 +294,7 @@ class ArtifactManagerSuite extends SharedSparkSession { val sessionDirectory = artifactManager.artifactPath.toFile assert(sessionDirectory.exists()) - artifactManager.cleanUpResources() + artifactManager.cleanUpResourcesForTesting() assert(!sessionDirectory.exists()) assert(ArtifactManager.artifactRootDirectory.toFile.exists()) } @@ -447,4 +448,58 @@ class ArtifactManagerSuite extends SharedSparkSession { assert(msg == "Hello Talon! 
Nice to meet you!") } } + + test("Codegen cache should be invalid when artifacts are added - class artifact") { + withTempDir { dir => + runCodegenTest("class artifact") { + val randomFilePath = dir.toPath.resolve("random.class") + val testBytes = "test".getBytes(StandardCharsets.UTF_8) + Files.write(randomFilePath, testBytes) + spark.addArtifact(randomFilePath.toString) + } + } + } + + test("Codegen cache should be invalid when artifacts are added - JAR artifact") { + withTempDir { dir => + runCodegenTest("JAR artifact") { + val randomFilePath = dir.toPath.resolve("random.jar") + val testBytes = "test".getBytes(StandardCharsets.UTF_8) + Files.write(randomFilePath, testBytes) + spark.addArtifact(randomFilePath.toString) + } + } + } + + private def getCodegenCount: Long = CodegenMetrics.METRIC_COMPILATION_TIME.getCount + + private def runCodegenTest(msg: String)(addOneArtifact: => Unit): Unit = { + withSQLConf(SQLConf.ARTIFACTS_SESSION_ISOLATION_ALWAYS_APPLY_CLASSLOADER.key -> "true") { + val s = spark + import s.implicits._ + + val count1 = getCodegenCount + // trigger codegen for Dataset + Seq(Seq("abc")).toDS().collect() + val count2 = getCodegenCount + // codegen happens + assert(count2 > count1, s"$msg: codegen should happen at the first time") + + // add one artifact, codegen cache should be invalid after this + addOneArtifact + + // trigger codegen for another Dataset of same type + Seq(Seq("abc")).toDS().collect() + // codegen cache should not work for Datasets of same type. 
+ val count3 = getCodegenCount + assert(count3 > count2, s"$msg: codegen should happen again after adding artifact") + + // trigger again + Seq(Seq("abc")).toDS().collect() + // codegen should work now as classloader is not changed + val count4 = getCodegenCount + assert(count4 == count3, + s"$msg: codegen should not happen again as classloader is not changed") + } + } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/collation/CollatedFilterPushDownToParquetSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/collation/CollatedFilterPushDownToParquetSuite.scala index 9b54fe4bb052c..8bb4a1c803e8e 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/collation/CollatedFilterPushDownToParquetSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/collation/CollatedFilterPushDownToParquetSuite.scala @@ -43,7 +43,7 @@ abstract class CollatedFilterPushDownToParquetSuite extends QueryTest val collatedStructNestedCol = "f1" val collatedStructFieldAccess = s"$collatedStructCol.$collatedStructNestedCol" val collatedArrayCol = "c3" - val collatedMapCol = "c4" + val nonCollatedMapCol = "c4" val lcaseCollation = "'UTF8_LCASE'" @@ -69,7 +69,7 @@ abstract class CollatedFilterPushDownToParquetSuite extends QueryTest | named_struct('$collatedStructNestedCol', | COLLATE(c, $lcaseCollation)) as $collatedStructCol, | array(COLLATE(c, $lcaseCollation)) as $collatedArrayCol, - | map(COLLATE(c, $lcaseCollation), 1) as $collatedMapCol + | map(c, 1) as $nonCollatedMapCol |FROM VALUES ('aaa'), ('AAA'), ('bbb') |as data(c) |""".stripMargin) @@ -215,9 +215,9 @@ abstract class CollatedFilterPushDownToParquetSuite extends QueryTest test("map - parquet does not support null check on complex types") { testPushDown( - filterString = s"map_keys($collatedMapCol) != array(collate('aaa', $lcaseCollation))", + filterString = s"map_keys($nonCollatedMapCol) != array('aaa')", expectedPushedFilters = Seq.empty, - expectedRowCount = 1) + expectedRowCount = 2) } } diff --git 
a/sql/core/src/test/scala/org/apache/spark/sql/collation/CollationTypePrecedenceSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/collation/CollationTypePrecedenceSuite.scala index 4a904a85e0a7b..7df54b372e8a7 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/collation/CollationTypePrecedenceSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/collation/CollationTypePrecedenceSuite.scala @@ -19,11 +19,17 @@ package org.apache.spark.sql.collation import org.apache.spark.SparkThrowable import org.apache.spark.sql.{DataFrame, QueryTest, Row} +import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.test.SharedSparkSession +import org.apache.spark.sql.types._ class CollationTypePrecedenceSuite extends QueryTest with SharedSparkSession { val dataSource: String = "parquet" + val UNICODE_COLLATION_NAME = "SYSTEM.BUILTIN.UNICODE" + val UNICODE_CI_COLLATION_NAME = "SYSTEM.BUILTIN.UNICODE_CI" + val UTF8_BINARY_COLLATION_NAME = "SYSTEM.BUILTIN.UTF8_BINARY" + val UTF8_LCASE_COLLATION_NAME = "SYSTEM.BUILTIN.UTF8_LCASE" private def assertThrowsError(df: => DataFrame, errorClass: String): Unit = { val exception = intercept[SparkThrowable] { @@ -38,22 +44,27 @@ class CollationTypePrecedenceSuite extends QueryTest with SharedSparkSession { private def assertImplicitMismatch(df: => DataFrame): Unit = assertThrowsError(df, "COLLATION_MISMATCH.IMPLICIT") + private def assertQuerySchema(df: => DataFrame, expectedSchema: DataType): Unit = { + val querySchema = df.schema.fields.head.dataType + assert(DataType.equalsIgnoreNullability(querySchema, expectedSchema)) + } + test("explicit collation propagates up") { checkAnswer( sql(s"SELECT COLLATION('a' collate unicode)"), - Row("UNICODE")) + Row(UNICODE_COLLATION_NAME)) checkAnswer( sql(s"SELECT COLLATION('a' collate unicode || 'b')"), - Row("UNICODE")) + Row(UNICODE_COLLATION_NAME)) checkAnswer( sql(s"SELECT COLLATION(SUBSTRING('a' collate unicode, 0, 1))"), - Row("UNICODE")) + 
Row(UNICODE_COLLATION_NAME)) checkAnswer( sql(s"SELECT COLLATION(SUBSTRING('a' collate unicode, 0, 1) || 'b')"), - Row("UNICODE")) + Row(UNICODE_COLLATION_NAME)) assertExplicitMismatch( sql(s"SELECT COLLATION('a' collate unicode || 'b' collate utf8_lcase)")) @@ -68,9 +79,9 @@ class CollationTypePrecedenceSuite extends QueryTest with SharedSparkSession { test("implicit collation in columns") { val tableName = "implicit_coll_tbl" - val c1Collation = "UNICODE" - val c2Collation = "UNICODE_CI" - val structCollation = "UTF8_LCASE" + val c1Collation = UNICODE_COLLATION_NAME + val c2Collation = UNICODE_CI_COLLATION_NAME + val structCollation = UTF8_LCASE_COLLATION_NAME withTable(tableName) { sql(s""" |CREATE TABLE $tableName ( @@ -99,9 +110,57 @@ class CollationTypePrecedenceSuite extends QueryTest with SharedSparkSession { } } + test("lateral alias has implicit strength") { + checkAnswer( + sql(""" + |SELECT + | a collate unicode as col1, + | COLLATION(col1 || 'b') + |FROM VALUES ('a') AS t(a) + |""".stripMargin), + Row("a", UNICODE_COLLATION_NAME)) + + assertImplicitMismatch( + sql(""" + |SELECT + | a collate unicode as col1, + | a collate utf8_lcase as col2, + | col1 = col2 + |FROM VALUES ('a') AS t(a) + |""".stripMargin)) + + checkAnswer( + sql(""" + |SELECT + | a collate unicode as col1, + | COLLATION(col1 || 'b' collate UTF8_LCASE) + |FROM VALUES ('a') AS t(a) + |""".stripMargin), + Row("a", UTF8_LCASE_COLLATION_NAME)) + } + + test("outer reference has implicit strength") { + val tableName = "outer_ref_tbl" + withTable(tableName) { + sql(s"CREATE TABLE $tableName (c STRING COLLATE UNICODE_CI, c1 STRING) USING $dataSource") + sql(s"INSERT INTO $tableName VALUES ('a', 'a'), ('A', 'A')") + + checkAnswer( + sql(s"SELECT DISTINCT (SELECT COLLATION(c || 'a')) FROM $tableName"), + Seq(Row(UNICODE_CI_COLLATION_NAME))) + + assertImplicitMismatch( + sql(s"SELECT DISTINCT (SELECT COLLATION(c || c1)) FROM $tableName")) + + checkAnswer( + sql(s"SELECT DISTINCT (SELECT 
COLLATION(c || 'a' collate utf8_lcase)) FROM $tableName"), + Seq(Row(UTF8_LCASE_COLLATION_NAME))) + } + } + test("variables have implicit collation") { - val v1Collation = "UTF8_BINARY" - val v2Collation = "UTF8_LCASE" + val v1Collation = UTF8_BINARY_COLLATION_NAME + val v2Collation = UTF8_LCASE_COLLATION_NAME sql(s"DECLARE v1 = 'a'") sql(s"DECLARE v2 = 'b' collate $v2Collation") @@ -115,7 +174,7 @@ class CollationTypePrecedenceSuite extends QueryTest with SharedSparkSession { checkAnswer( sql(s"SELECT COLLATION(v2 || 'a' COLLATE UTF8_BINARY)"), - Row("UTF8_BINARY")) + Row(UTF8_BINARY_COLLATION_NAME)) checkAnswer( sql(s"SELECT COLLATION(SUBSTRING(v2, 0, 1) || 'a')"), @@ -137,33 +196,130 @@ class CollationTypePrecedenceSuite extends QueryTest with SharedSparkSession { // Simple subquery with explicit collation checkAnswer( sql(s"SELECT COLLATION((SELECT 'text' COLLATE UTF8_BINARY) || 'suffix')"), - Row("UTF8_BINARY") + Row(UTF8_BINARY_COLLATION_NAME) ) checkAnswer( sql(s"SELECT COLLATION((SELECT 'text' COLLATE UTF8_LCASE) || 'suffix')"), - Row("UTF8_LCASE") + Row(UTF8_LCASE_COLLATION_NAME) ) // Nested subquery should retain the collation of the deepest expression checkAnswer( sql(s"SELECT COLLATION((SELECT (SELECT 'inner' COLLATE UTF8_LCASE) || 'outer'))"), - Row("UTF8_LCASE") + Row(UTF8_LCASE_COLLATION_NAME) ) checkAnswer( sql(s"SELECT COLLATION((SELECT (SELECT 'inner' COLLATE UTF8_BINARY) || 'outer'))"), - Row("UTF8_BINARY") + Row(UTF8_BINARY_COLLATION_NAME) ) // Subqueries with mixed collations should follow collation precedence rules checkAnswer( sql(s"SELECT COLLATION((SELECT 'string1' COLLATE UTF8_LCASE || " + s"(SELECT 'string2' COLLATE UTF8_BINARY)))"), - Row("UTF8_LCASE") + Row(UTF8_LCASE_COLLATION_NAME) ) } + test("in subquery expression") { + val tableName = "subquery_tbl" + withTable(tableName) { + sql(s""" + |CREATE TABLE $tableName ( + | c1 STRING COLLATE UTF8_LCASE, + | c2 STRING COLLATE UNICODE + |) USING $dataSource + |""".stripMargin) + + 
sql(s"INSERT INTO $tableName VALUES ('a', 'A')") + + assertImplicitMismatch( + sql(s""" + |SELECT * FROM $tableName + |WHERE c1 IN (SELECT c2 FROM $tableName) + |""".stripMargin)) + + // this fails since subquery expression collation is implicit by default + assertImplicitMismatch( + sql(s""" + |SELECT * FROM $tableName + |WHERE c1 IN (SELECT c2 collate unicode FROM $tableName) + |""".stripMargin)) + + checkAnswer( + sql(s""" + |SELECT COUNT(*) FROM $tableName + |WHERE c1 collate utf8_lcase IN (SELECT c2 collate unicode FROM $tableName) + |""".stripMargin), + Seq(Row(1))) + + checkAnswer( + sql(s""" + |SELECT COUNT(*) FROM $tableName + |WHERE c1 collate utf8_lcase IN (SELECT c2 FROM $tableName) + |""".stripMargin), + Seq(Row(1))) + + checkAnswer( + sql(s""" + |SELECT COUNT(*) FROM $tableName + |WHERE c1 collate unicode IN (SELECT c2 FROM $tableName) + |""".stripMargin), + Seq(Row(0))) + + checkAnswer( + sql(s""" + |SELECT COUNT(*) FROM $tableName + |WHERE c1 collate unicode IN (SELECT c2 FROM $tableName + | WHERE c2 collate unicode IN (SELECT c1 FROM $tableName)) + |""".stripMargin), + Seq(Row(0))) + } + } + + test("scalar subquery") { + val tableName = "scalar_subquery_tbl" + withTable(tableName) { + sql(s""" + |CREATE TABLE $tableName ( + | c1 STRING COLLATE UTF8_LCASE, + | c2 STRING COLLATE UNICODE + |) USING $dataSource + |""".stripMargin) + + sql(s"INSERT INTO $tableName VALUES ('a', 'A')") + + assertImplicitMismatch( + sql(s""" + |SELECT * FROM $tableName + |WHERE c1 = (SELECT MAX(c2) FROM $tableName) + |""".stripMargin)) + + checkAnswer( + sql(s""" + |SELECT COUNT(*) FROM $tableName + |WHERE c1 collate utf8_lcase = (SELECT MAX(c2) collate unicode FROM $tableName) + |""".stripMargin), + Seq(Row(1))) + + checkAnswer( + sql(s""" + |SELECT COUNT(*) FROM $tableName + |WHERE c1 collate utf8_lcase = (SELECT MAX(c2) FROM $tableName) + |""".stripMargin), + Seq(Row(1))) + + checkAnswer( + sql(s""" + |SELECT COUNT(*) FROM $tableName + |WHERE c1 collate unicode = 
(SELECT MAX(c2) FROM $tableName) + |""".stripMargin), + Seq(Row(0))) + } + } + test("struct test") { val tableName = "struct_tbl" val c1Collation = "UNICODE_CI" @@ -180,28 +336,28 @@ class CollationTypePrecedenceSuite extends QueryTest with SharedSparkSession { checkAnswer( sql(s"SELECT COLLATION(c2.col1.col1 || 'a') FROM $tableName"), - Seq(Row(c2Collation))) + Seq(Row(UNICODE_COLLATION_NAME))) checkAnswer( sql(s"SELECT COLLATION(c1.col1 || 'a') FROM $tableName"), - Seq(Row(c1Collation))) + Seq(Row(UNICODE_CI_COLLATION_NAME))) checkAnswer( sql(s"SELECT COLLATION(c1.col1 || 'a' collate UNICODE) FROM $tableName"), - Seq(Row("UNICODE"))) + Seq(Row(UNICODE_COLLATION_NAME))) checkAnswer( sql(s"SELECT COLLATION(struct('a').col1 || 'a' collate UNICODE) FROM $tableName"), - Seq(Row("UNICODE"))) + Seq(Row(UNICODE_COLLATION_NAME))) checkAnswer( sql(s"SELECT COLLATION(struct('a' collate UNICODE).col1 || 'a') FROM $tableName"), - Seq(Row("UNICODE"))) + Seq(Row(UNICODE_COLLATION_NAME))) checkAnswer( sql(s"SELECT COLLATION(struct('a').col1 collate UNICODE || 'a' collate UNICODE) " + s"FROM $tableName"), - Seq(Row("UNICODE"))) + Seq(Row(UNICODE_COLLATION_NAME))) assertExplicitMismatch( sql(s"SELECT COLLATION(struct('a').col1 collate UNICODE || 'a' collate UTF8_LCASE) " + @@ -229,7 +385,7 @@ class CollationTypePrecedenceSuite extends QueryTest with SharedSparkSession { checkAnswer( sql(s"SELECT collation(element_at(array('a', 'b' collate utf8_lcase), 1))"), - Seq(Row("UTF8_LCASE"))) + Seq(Row(UTF8_LCASE_COLLATION_NAME))) assertExplicitMismatch( sql(s"SELECT collation(element_at(array('a' collate unicode, 'b' collate utf8_lcase), 1))") @@ -238,17 +394,17 @@ class CollationTypePrecedenceSuite extends QueryTest with SharedSparkSession { checkAnswer( sql(s"SELECT collation(element_at(array('a', 'b' collate utf8_lcase), 1) || c1)" + s"from $tableName"), - Seq(Row("UTF8_LCASE"))) + Seq(Row(UTF8_LCASE_COLLATION_NAME))) checkAnswer( sql(s"SELECT collation(element_at(array_append(c2, 
'd'), 1)) FROM $tableName"), - Seq(Row(arrayCollation)) + Seq(Row(UNICODE_CI_COLLATION_NAME)) ) checkAnswer( sql(s"SELECT collation(element_at(array_append(c2, 'd' collate utf8_lcase), 1))" + s"FROM $tableName"), - Seq(Row("UTF8_LCASE")) + Seq(Row(UTF8_LCASE_COLLATION_NAME)) ) } } @@ -262,65 +418,55 @@ class CollationTypePrecedenceSuite extends QueryTest with SharedSparkSession { checkAnswer( sql(s"SELECT COLLATION(c1[0]) FROM $tableName"), - Seq(Row(columnCollation))) + Seq(Row(UNICODE_COLLATION_NAME))) checkAnswer( sql(s"SELECT COLLATION(cast(c1 AS ARRAY)[0]) FROM $tableName"), - Seq(Row("UTF8_BINARY"))) + Seq(Row(UTF8_BINARY_COLLATION_NAME))) checkAnswer( sql(s"SELECT COLLATION(cast(c1 AS ARRAY)[0]) FROM $tableName"), - Seq(Row("UTF8_LCASE"))) + Seq(Row(UTF8_LCASE_COLLATION_NAME))) } } test("user defined cast") { val tableName = "dflt_coll_tbl" - val columnCollation = "UNICODE" + val columnCollation = UNICODE_COLLATION_NAME withTable(tableName) { sql(s"CREATE TABLE $tableName (c1 STRING COLLATE $columnCollation) USING $dataSource") sql(s"INSERT INTO $tableName VALUES ('a')") - // only for non string inputs cast results in default collation checkAnswer( - sql(s"SELECT COLLATION(c1 || CAST(to_char(DATE'2016-04-08', 'y') AS STRING)) " + - s"FROM $tableName"), - Seq(Row(columnCollation))) + sql(s"SELECT COLLATION(CAST(5 AS STRING)) FROM $tableName"), + Seq(Row(UTF8_BINARY_COLLATION_NAME))) checkAnswer( - sql(s"SELECT COLLATION(CAST(to_char(DATE'2016-04-08', 'y') AS STRING)) " + - s"FROM $tableName"), - Seq(Row("UTF8_BINARY"))) - - // for string inputs collation is of the child expression - checkAnswer( - sql(s"SELECT COLLATION(CAST('a' AS STRING)) FROM $tableName"), - Seq(Row("UTF8_BINARY"))) + sql(s"SELECT c1 = cast(5 AS STRING) FROM $tableName"), + Seq(Row(false))) checkAnswer( sql(s"SELECT COLLATION(CAST(c1 AS STRING)) FROM $tableName"), - Seq(Row(columnCollation))) + Seq(Row(UTF8_BINARY_COLLATION_NAME))) checkAnswer( - sql(s"SELECT COLLATION(CAST(c1 collate 
UTF8_LCASE AS STRING)) FROM $tableName"), - Seq(Row("UTF8_LCASE"))) + sql(s"SELECT c1 = cast(c1 as STRING COLLATE UNICODE) FROM $tableName"), + Seq(Row(true))) checkAnswer( - sql(s"SELECT COLLATION(c1 || CAST('a' AS STRING)) FROM $tableName"), - Seq(Row(columnCollation))) + sql(s"SELECT c1 = cast(5 as STRING COLLATE UNICODE) FROM $tableName"), + Seq(Row(false))) checkAnswer( - sql(s"SELECT COLLATION(c1 || CAST('a' collate UTF8_LCASE AS STRING)) FROM $tableName"), - Seq(Row("UTF8_LCASE"))) + sql(s"SELECT COLLATION(CAST(c1 collate UTF8_LCASE AS STRING)) FROM $tableName"), + Seq(Row(UTF8_BINARY_COLLATION_NAME))) - checkAnswer( - sql(s"SELECT COLLATION(c1 || CAST(c1 AS STRING)) FROM $tableName"), - Seq(Row(columnCollation))) + assertImplicitMismatch( + sql(s"SELECT c1 = CAST(c1 AS STRING) FROM $tableName")) - checkAnswer( - sql(s"SELECT COLLATION(c1 || SUBSTRING(CAST(c1 AS STRING), 0, 1)) FROM $tableName"), - Seq(Row(columnCollation))) - } + assertImplicitMismatch( + sql(s"SELECT c1 = CAST(to_char(DATE'2016-04-08', 'y') AS STRING) FROM $tableName")) + } } test("str fns without params have default strength") { @@ -332,26 +478,26 @@ class CollationTypePrecedenceSuite extends QueryTest with SharedSparkSession { checkAnswer( sql(s"SELECT COLLATION('a' collate utf8_lcase || current_database()) FROM $tableName"), - Seq(Row("UTF8_LCASE"))) + Seq(Row(UTF8_LCASE_COLLATION_NAME))) checkAnswer( sql(s"SELECT COLLATION(c1 || current_database()) FROM $tableName"), - Seq(Row(columnCollation))) + Seq(Row(UNICODE_COLLATION_NAME))) checkAnswer( sql(s"SELECT COLLATION('a' || current_database()) FROM $tableName"), - Seq(Row("UTF8_BINARY"))) + Seq(Row(UTF8_BINARY_COLLATION_NAME))) } } test("functions that contain both string and non string params") { checkAnswer( sql(s"SELECT COLLATION(elt(2, 'a', 'b'))"), - Row("UTF8_BINARY")) + Row(UTF8_BINARY_COLLATION_NAME)) checkAnswer( sql(s"SELECT COLLATION(elt(2, 'a' collate UTF8_LCASE, 'b'))"), - Row("UTF8_LCASE")) + 
Row(UTF8_LCASE_COLLATION_NAME)) assertExplicitMismatch( sql(s"SELECT COLLATION(elt(2, 'a' collate UTF8_LCASE, 'b' collate UNICODE))")) @@ -377,13 +523,219 @@ class CollationTypePrecedenceSuite extends QueryTest with SharedSparkSession { s"'name2' collate utf8_lcase, 'value2' collate unicode)"), Row(Row("value1", "value2"))) - assertExplicitMismatch( + checkAnswer( sql(s"SELECT named_struct" + - s"('name1' collate unicode, 'value1', 'name2' collate utf8_lcase, 'value2')")) + s"('name1' collate unicode, 'value1', 'name2' collate utf8_lcase, 'value2')"), + Row(Row("value1", "value2"))) - assertExplicitMismatch( + checkAnswer( sql(s"SELECT named_struct" + - s"('name1', 'value1' collate unicode, 'name2', 'value2' collate utf8_lcase)")) + s"('name1', 'value1' collate unicode, 'name2', 'value2' collate utf8_lcase)"), + Row(Row("value1", "value2"))) + } + + test("coercing structs") { + assertQuerySchema( + sql(s"SELECT array(struct(1, 'a'), struct(2, 'b' collate utf8_lcase))"), + ArrayType( + StructType( + Seq(StructField("col1", IntegerType), StructField("col2", StringType("UTF8_LCASE")))))) + + assertQuerySchema( + sql(s"SELECT array(struct(1, 'a' collate utf8_lcase), struct(2, 'b' collate utf8_lcase))"), + ArrayType( + StructType( + Seq(StructField("col1", IntegerType), StructField("col2", StringType("UTF8_LCASE")))))) + + assertExplicitMismatch( + sql(s"SELECT array(struct(1, 'a' collate utf8_lcase), struct(2, 'b' collate unicode))")) + + assertImplicitMismatch(sql(s""" + |SELECT array(struct(1, c1), struct(2, c2)) + |FROM VALUES ('a' collate unicode, 'b' collate utf8_lcase) AS t(c1, c2) + |""".stripMargin)) + } + + test("coercing maps") { + assertQuerySchema( + sql(s"SELECT map('key1', 'val1', 'key2', 'val2')"), + MapType(StringType, StringType)) + + assertQuerySchema( + sql(s"SELECT map('key1' collate utf8_lcase, 'val1', 'key2', 'val2' collate unicode)"), + MapType(StringType("UTF8_LCASE"), StringType("UNICODE"))) + + assertQuerySchema( + sql(s"SELECT 
ARRAY(map('key1', 'val1'), map('key2' collate UNICODE, 'val2'))"), + ArrayType(MapType(StringType("UNICODE"), StringType))) + + assertExplicitMismatch( + sql(s"SELECT map('key1', 'val1' collate utf8_lcase, 'key2', 'val2' collate unicode)")) + } + + test("user defined cast on maps") { + checkAnswer( + sql(s""" + |SELECT map_contains_key( + | map('a' collate utf8_lcase, 'b'), + | 'A' collate utf8_lcase) + |""".stripMargin), + Seq(Row(true))) + + checkAnswer( + sql(s""" + |SELECT map_contains_key( + | CAST(map('a' collate utf8_lcase, 'b') AS MAP), + | 'A') + |""".stripMargin), + Seq(Row(false))) + + checkAnswer( + sql(s""" + |SELECT map_contains_key( + | CAST(map('a' collate utf8_lcase, 'b') AS MAP), + | 'A' COLLATE UNICODE) + |""".stripMargin), + Seq(Row(false))) + } + + test("maps of structs") { + assertQuerySchema( + sql(s"SELECT map('key1', struct(1, 'a' collate unicode), 'key2', struct(2, 'b'))"), + MapType( + StringType, + StructType( + Seq(StructField("col1", IntegerType), StructField("col2", StringType("UNICODE")))))) + + checkAnswer( + sql( + s"SELECT map('key1', struct(1, 'a' collate unicode_ci)," + + s"'key2', struct(2, 'b'))['key1'].col2 = 'A'"), + Seq(Row(true))) + } + + test("coercing arrays") { + assertQuerySchema(sql(s"SELECT array('a', 'b')"), ArrayType(StringType)) + + assertQuerySchema( + sql(s"SELECT array('a' collate utf8_lcase, 'b')"), + ArrayType(StringType("UTF8_LCASE"))) + + assertQuerySchema( + sql(s"SELECT array('a' collate utf8_lcase, 'b' collate utf8_lcase)"), + ArrayType(StringType("UTF8_LCASE"))) + + assertExplicitMismatch(sql(s"SELECT array('a' collate utf8_lcase, 'b' collate unicode)")) + + assertQuerySchema( + sql(s"SELECT array(array('a', 'b'), array('c' collate utf8_lcase, 'd'))"), + ArrayType(ArrayType(StringType("UTF8_LCASE")))) + + checkAnswer( + sql(s"SELECT array('a', 'b') = array('A' collate utf8_lcase, 'B')"), + Seq(Row(true))) + + checkAnswer( + sql(s"SELECT array('a', 'b')[0] = array('A' collate utf8_lcase, 'B')[1]"), + 
Seq(Row(false))) + + assertExplicitMismatch( + sql(s"SELECT array('a', 'b' collate unicode) = array('A' collate utf8_lcase, 'B')")) + } + + test("user defined cast on arrays") { + checkAnswer( + sql(s""" + |SELECT array_contains( + | array('a', 'b' collate utf8_lcase), + | 'A') + |""".stripMargin), + Seq(Row(true))) + + // should be false because ARRAY should take precedence + // over UTF8_LCASE in array creation + checkAnswer( + sql(s""" + |SELECT array_contains( + | CAST(array('a', 'b' collate utf8_lcase) AS ARRAY), + | 'A') + |""".stripMargin), + Seq(Row(false))) + + checkAnswer( + sql(s""" + |SELECT array_contains( + | CAST(array('a', 'b' collate utf8_lcase) AS ARRAY), + | 'A') + |""".stripMargin), + Seq(Row(false))) + + checkAnswer( + sql(s""" + |SELECT array_contains( + | CAST(array('a', 'b' collate utf8_lcase) AS ARRAY), + | 'A' collate unicode) + |""".stripMargin), + Seq(Row(false))) + } + + test("array of structs") { + assertQuerySchema( + sql(s"SELECT array(struct(1, 'a' collate unicode), struct(2, 'b'))[0]"), + StructType( + Seq(StructField("col1", IntegerType), StructField("col2", StringType("UNICODE"))))) + + checkAnswer( + sql(s"SELECT array(struct(1, 'a' collate unicode_ci), struct(2, 'b'))[0].col2 = 'A'"), + Seq(Row(true))) + } + + test("coercing deeply nested complex types") { + assertQuerySchema( + sql(s""" + |SELECT struct( + | struct(1, 'nested' collate unicode), + | array( + | struct(1, 'a' collate utf8_lcase), + | struct(2, 'b' collate utf8_lcase) + | ) + |) + |""".stripMargin), + StructType( + Seq( + StructField( + "col1", + StructType( + Seq(StructField("col1", IntegerType), StructField("col2", StringType("UNICODE"))))), + StructField( + "col2", + ArrayType( + StructType(Seq( + StructField("col1", IntegerType), + StructField("col2", StringType("UTF8_LCASE"))))))))) + + assertQuerySchema( + sql(s""" + |SELECT struct( + | struct( + | array( + | map('key1' collate utf8_lcase, 'val1', + | 'key2', 'val2'), + | map('key3', 'val3' collate unicode) 
+ | ) + | ), + | 42 + |) + |""".stripMargin), + StructType( + Seq( + StructField( + "col1", + StructType( + Seq(StructField( + "col1", + ArrayType(MapType(StringType("UTF8_LCASE"), StringType("UNICODE"))))))), + StructField("col2", IntegerType)))) } test("access collated map via literal") { @@ -393,27 +745,30 @@ class CollationTypePrecedenceSuite extends QueryTest with SharedSparkSession { sql(s"SELECT c1 FROM $tableName WHERE $condition = 'B'") withTable(tableName) { - sql(s""" - |CREATE TABLE $tableName ( - | c1 MAP, - | c2 STRING - |) USING $dataSource - |""".stripMargin) - - sql(s"INSERT INTO $tableName VALUES (map('a', 'b'), 'a')") - - Seq("c1['A']", - "c1['A' COLLATE UNICODE_CI]", - "c1[c2 COLLATE UNICODE_CI]").foreach { condition => - checkAnswer(selectQuery(condition), Seq(Row(Map("a" -> "b")))) - } - - Seq( - // different explicit collation - "c1['A' COLLATE UNICODE]", - // different implicit collation - "c1[c2]").foreach { condition => - assertThrowsError(selectQuery(condition), "DATATYPE_MISMATCH.UNEXPECTED_INPUT_TYPE") + withSQLConf(SQLConf.ALLOW_COLLATIONS_IN_MAP_KEYS.key -> "true") { + sql( + s""" + |CREATE TABLE $tableName ( + | c1 MAP, + | c2 STRING + |) USING $dataSource + |""".stripMargin) + + sql(s"INSERT INTO $tableName VALUES (map('a', 'b'), 'a')") + + Seq("c1['A']", + "c1['A' COLLATE UNICODE_CI]", + "c1[c2 COLLATE UNICODE_CI]").foreach { condition => + checkAnswer(selectQuery(condition), Seq(Row(Map("a" -> "b")))) + } + + Seq( + // different explicit collation + "c1['A' COLLATE UNICODE]", + // different implicit collation + "c1[c2]").foreach { condition => + assertThrowsError(selectQuery(condition), "DATATYPE_MISMATCH.UNEXPECTED_INPUT_TYPE") + } } } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/collation/DefaultCollationTestSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/collation/DefaultCollationTestSuite.scala new file mode 100644 index 0000000000000..69f1c6da65d12 --- /dev/null +++ 
b/sql/core/src/test/scala/org/apache/spark/sql/collation/DefaultCollationTestSuite.scala @@ -0,0 +1,502 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.collation + +import org.apache.spark.sql.{AnalysisException, DataFrame, QueryTest, Row} +import org.apache.spark.sql.catalyst.util.CollationFactory +import org.apache.spark.sql.connector.DatasourceV2SQLBase +import org.apache.spark.sql.internal.SqlApiConf +import org.apache.spark.sql.test.SharedSparkSession +import org.apache.spark.sql.types.StringType + +abstract class DefaultCollationTestSuite extends QueryTest with SharedSparkSession { + + def dataSource: String = "parquet" + def testTable: String = "test_tbl" + def testView: String = "test_view" + protected val fullyQualifiedPrefix = s"${CollationFactory.CATALOG}.${CollationFactory.SCHEMA}." 
+ + def withSessionCollationAndTable(collation: String, testTables: String*)(f: => Unit): Unit = { + withTable(testTables: _*) { + withSessionCollation(collation) { + f + } + } + } + + def withSessionCollationAndView(collation: String, viewNames: String*)(f: => Unit): Unit = { + withView(viewNames: _*) { + withSessionCollation(collation) { + f + } + } + } + + def withSessionCollation(collation: String)(f: => Unit): Unit = { + withSQLConf(SqlApiConf.DEFAULT_COLLATION -> collation) { + f + } + } + + def assertTableColumnCollation( + table: String, + column: String, + expectedCollation: String): Unit = { + val colType = spark.table(table).schema(column).dataType + assert(colType === StringType(expectedCollation)) + } + + def assertThrowsImplicitMismatch(f: => DataFrame): Unit = { + val exception = intercept[AnalysisException] { + f + } + assert(exception.getCondition === "COLLATION_MISMATCH.IMPLICIT") + } + + // region DDL tests + + test("create/alter table") { + withSessionCollationAndTable("UTF8_LCASE", testTable) { + // create table with implicit collation + sql(s"CREATE TABLE $testTable (c1 STRING) USING $dataSource") + assertTableColumnCollation(testTable, "c1", "UTF8_BINARY") + + // alter table add column with implicit collation + sql(s"ALTER TABLE $testTable ADD COLUMN c2 STRING") + assertTableColumnCollation(testTable, "c2", "UTF8_BINARY") + + sql(s"ALTER TABLE $testTable ALTER COLUMN c2 TYPE STRING COLLATE UNICODE") + assertTableColumnCollation(testTable, "c2", "UNICODE") + + sql(s"ALTER TABLE $testTable ALTER COLUMN c2 TYPE STRING") + assertTableColumnCollation(testTable, "c2", "UTF8_BINARY") + } + } + + test("create table with explicit collation") { + withSessionCollationAndTable("UTF8_LCASE", testTable) { + sql(s"CREATE TABLE $testTable (c1 STRING COLLATE UTF8_LCASE) USING $dataSource") + assertTableColumnCollation(testTable, "c1", "UTF8_LCASE") + } + + withSessionCollationAndTable("UTF8_LCASE", testTable) { + sql(s"CREATE TABLE $testTable (c1 STRING 
COLLATE UNICODE) USING $dataSource") + assertTableColumnCollation(testTable, "c1", "UNICODE") + } + } + + test("create table as select") { + // literals in select do not pick up session collation + withSessionCollationAndTable("UTF8_LCASE", testTable) { + sql(s""" + |CREATE TABLE $testTable USING $dataSource AS SELECT + | 'a' AS c1, + | 'a' || 'a' AS c2, + | SUBSTRING('a', 1, 1) AS c3, + | SUBSTRING(SUBSTRING('ab', 1, 1), 1, 1) AS c4, + | 'a' = 'A' AS truthy + |""".stripMargin) + assertTableColumnCollation(testTable, "c1", "UTF8_BINARY") + assertTableColumnCollation(testTable, "c2", "UTF8_BINARY") + assertTableColumnCollation(testTable, "c3", "UTF8_BINARY") + assertTableColumnCollation(testTable, "c4", "UTF8_BINARY") + + checkAnswer(sql(s"SELECT COUNT(*) FROM $testTable WHERE truthy"), Seq(Row(0))) + } + + // literals in inline table do not pick up session collation + withSessionCollationAndTable("UTF8_LCASE", testTable) { + sql(s""" + |CREATE TABLE $testTable USING $dataSource AS + |SELECT c1, c1 = 'A' as c2 FROM VALUES ('a'), ('A') AS vals(c1) + |""".stripMargin) + assertTableColumnCollation(testTable, "c1", "UTF8_BINARY") + checkAnswer(sql(s"SELECT COUNT(*) FROM $testTable WHERE c2"), Seq(Row(1))) + } + + // cast in select does not pick up session collation + withSessionCollationAndTable("UTF8_LCASE", testTable) { + sql(s"CREATE TABLE $testTable USING $dataSource AS SELECT cast('a' AS STRING) AS c1") + assertTableColumnCollation(testTable, "c1", "UTF8_BINARY") + } + } + + test("ctas with complex types") { + withSessionCollationAndTable("UTF8_LCASE", testTable) { + sql(s""" + |CREATE TABLE $testTable USING $dataSource AS + |SELECT + | struct('a') AS c1, + | map('a', 'b') AS c2, + | array('a') AS c3 + |""".stripMargin) + + checkAnswer(sql(s"SELECT COLLATION(c1.col1) FROM $testTable"), + Seq(Row(fullyQualifiedPrefix + "UTF8_BINARY"))) + checkAnswer(sql(s"SELECT COLLATION(c2['a']) FROM $testTable"), + Seq(Row(fullyQualifiedPrefix + "UTF8_BINARY"))) + 
checkAnswer(sql(s"SELECT COLLATION(c3[0]) FROM $testTable"), + Seq(Row(fullyQualifiedPrefix + "UTF8_BINARY"))) + } + } + + test("ctas with union") { + withSessionCollationAndTable("UTF8_LCASE", testTable) { + sql(s""" + |CREATE TABLE $testTable USING $dataSource AS + |SELECT 'a' = 'A' AS c1 + |UNION + |SELECT 'b' = 'B' AS c1 + |""".stripMargin) + + checkAnswer(sql(s"SELECT * FROM $testTable"), Seq(Row(false))) + } + + withSessionCollationAndTable("UTF8_LCASE", testTable) { + sql(s""" + |CREATE TABLE $testTable USING $dataSource AS + |SELECT 'a' = 'A' AS c1 + |UNION ALL + |SELECT 'b' = 'B' AS c1 + |""".stripMargin) + + checkAnswer(sql(s"SELECT * FROM $testTable"), Seq(Row(false), Row(false))) + } + } + + test("add column") { + withSessionCollationAndTable("UTF8_LCASE", testTable) { + sql(s"CREATE TABLE $testTable (c1 STRING COLLATE UTF8_LCASE) USING $dataSource") + assertTableColumnCollation(testTable, "c1", "UTF8_LCASE") + + sql(s"ALTER TABLE $testTable ADD COLUMN c2 STRING") + assertTableColumnCollation(testTable, "c2", "UTF8_BINARY") + + sql(s"ALTER TABLE $testTable ADD COLUMN c3 STRING COLLATE UNICODE") + assertTableColumnCollation(testTable, "c3", "UNICODE") + } + } + + test("inline table in CTAS") { + withSessionCollationAndTable("UTF8_LCASE", testTable) { + sql(s""" + |CREATE TABLE $testTable + |USING $dataSource + |AS SELECT * + |FROM (VALUES ('a', 'a' = 'A')) + |AS inline_table(c1, c2); + |""".stripMargin) + + assertTableColumnCollation(testTable, "c1", "UTF8_BINARY") + checkAnswer(sql(s"SELECT COUNT(*) FROM $testTable WHERE c2"), Seq(Row(0))) + } + } + + test("subsequent analyzer iterations correctly resolve default string types") { + // since concat coercion happens after resolving default types this test + // makes sure that we are correctly resolving the default string types + // in subsequent analyzer iterations + withSessionCollationAndTable("UTF8_LCASE", testTable) { + sql(s""" + |CREATE TABLE $testTable + |USING $dataSource AS + |SELECT 
CONCAT(X'68656C6C6F', 'world') AS c1 + |""".stripMargin) + + checkAnswer(sql(s"SELECT c1 FROM $testTable"), Seq(Row("helloworld"))) + } + + // ELT is similar + withSessionCollationAndTable("UTF8_LCASE", testTable) { + sql(s""" + |CREATE TABLE $testTable + |USING $dataSource AS + |SELECT ELT(1, X'68656C6C6F', 'world') AS c1 + |""".stripMargin) + + checkAnswer(sql(s"SELECT c1 FROM $testTable"), Seq(Row("hello"))) + } + } + + // endregion + + // region DML tests + + test("literals with default collation") { + val sessionCollation = "UTF8_LCASE" + val sessionCollationFullyQualified = fullyQualifiedPrefix + sessionCollation + withSessionCollation(sessionCollation) { + + // literal without collation + checkAnswer(sql("SELECT COLLATION('a')"), Seq(Row(sessionCollationFullyQualified))) + + checkAnswer(sql("SELECT COLLATION(map('a', 'b')['a'])"), + Seq(Row(sessionCollationFullyQualified))) + + checkAnswer(sql("SELECT COLLATION(array('a')[0])"), Seq(Row(sessionCollationFullyQualified))) + + checkAnswer(sql("SELECT COLLATION(struct('a' as c)['c'])"), + Seq(Row(sessionCollationFullyQualified))) + } + } + + test("literals with explicit collation") { + val unicodeCollation = fullyQualifiedPrefix + "UNICODE" + withSessionCollation("UTF8_LCASE") { + checkAnswer(sql("SELECT COLLATION('a' collate unicode)"), Seq(Row(unicodeCollation))) + + checkAnswer( + sql("SELECT COLLATION(map('a', 'b' collate unicode)['a'])"), + Seq(Row(unicodeCollation))) + + checkAnswer(sql("SELECT COLLATION(array('a' collate unicode)[0])"), + Seq(Row(unicodeCollation))) + + checkAnswer( + sql("SELECT COLLATION(struct('a' collate unicode as c)['c'])"), + Seq(Row(unicodeCollation))) + } + } + + test("cast is aware of session collation") { + val sessionCollation = "UTF8_LCASE" + val sessionCollationFullyQualified = fullyQualifiedPrefix + sessionCollation + withSessionCollation(sessionCollation) { + checkAnswer(sql("SELECT COLLATION(cast('a' as STRING))"), + Seq(Row(sessionCollationFullyQualified))) + + 
checkAnswer( + sql("SELECT COLLATION(cast(map('a', 'b') as MAP)['a'])"), + Seq(Row(sessionCollationFullyQualified))) + + checkAnswer( + sql("SELECT COLLATION(map_keys(cast(map('a', 'b') as MAP))[0])"), + Seq(Row(sessionCollationFullyQualified))) + + checkAnswer( + sql("SELECT COLLATION(cast(array('a') as ARRAY)[0])"), + Seq(Row(sessionCollationFullyQualified))) + + checkAnswer( + sql("SELECT COLLATION(cast(struct('a' as c) as STRUCT)['c'])"), + Seq(Row(sessionCollationFullyQualified))) + } + } + + test("expressions in where are aware of session collation") { + withSessionCollation("UTF8_LCASE") { + // expression in where is aware of session collation + checkAnswer(sql("SELECT 1 WHERE 'a' = 'A'"), Seq(Row(1))) + + checkAnswer(sql("SELECT 1 WHERE 'a' = cast('A' as STRING)"), Seq(Row(1))) + } + } + + test("having group by is aware of session collation") { + withSessionCollationAndTable("UTF8_LCASE", testTable) { + sql(s"CREATE TABLE $testTable (c1 STRING) USING $dataSource") + sql(s"INSERT INTO $testTable VALUES ('a'), ('A')") + + // having clause uses session (default) collation + checkAnswer( + sql(s"SELECT COUNT(*) FROM $testTable GROUP BY c1 HAVING 'a' = 'A'"), + Seq(Row(1), Row(1))) + + // having clause uses column (implicit) collation + checkAnswer( + sql(s"SELECT COUNT(*) FROM $testTable GROUP BY c1 HAVING c1 = 'A'"), + Seq(Row(1))) + } + } + + test("min/max are aware of session collation") { + // scalastyle:off nonascii + withSessionCollationAndTable("UNICODE", testTable) { + sql(s"CREATE TABLE $testTable (c1 STRING) USING $dataSource") + sql(s"INSERT INTO $testTable VALUES ('1'), ('½')") + + checkAnswer(sql(s"SELECT MIN(c1) FROM $testTable"), Seq(Row("1"))) + + checkAnswer(sql(s"SELECT MAX(c1) FROM $testTable"), Seq(Row("½"))) + } + // scalastyle:on nonascii + } + + test("union operation with subqueries") { + withSessionCollation("UTF8_LCASE") { + checkAnswer( + sql(s""" + |SELECT 'a' = 'A' + |UNION + |SELECT 'b' = 'B' + |""".stripMargin), + Seq(Row(true))) + 
+ checkAnswer( + sql(s""" + |SELECT 'a' = 'A' + |UNION ALL + |SELECT 'b' = 'B' + |""".stripMargin), + Seq(Row(true), Row(true))) + } + } + + test("inline table in SELECT") { + withSessionCollation("UTF8_LCASE") { + val df = s""" + |SELECT * + |FROM (VALUES ('a', 'a' = 'A')) + |""".stripMargin + + checkAnswer(sql(df), Seq(Row("a", true))) + } + } + + test("inline table in insert") { + withSessionCollationAndTable("UTF8_LCASE", testTable) { + sql(s"CREATE TABLE $testTable (c1 STRING, c2 BOOLEAN) USING $dataSource") + + sql(s"INSERT INTO $testTable VALUES ('a', 'a' = 'A')") + checkAnswer(sql(s"SELECT * FROM $testTable"), Seq(Row("a", true))) + } + } + + test("literals in insert inherit session level collation") { + withSessionCollationAndTable("UTF8_LCASE", testTable) { + sql(s"CREATE TABLE $testTable (c1 BOOLEAN) USING $dataSource") + + sql(s"INSERT INTO $testTable VALUES ('a' = 'A')") + sql(s"INSERT INTO $testTable VALUES (array_contains(array('a'), 'A'))") + sql(s"INSERT INTO $testTable VALUES (CONCAT(X'68656C6C6F', 'world') = 'HELLOWORLD')") + + checkAnswer(sql(s"SELECT COUNT(*) FROM $testTable WHERE c1"), Seq(Row(3))) + } + } + + // endregion +} + +class DefaultCollationTestSuiteV1 extends DefaultCollationTestSuite { + + test("create/alter view created from a table") { + val sessionCollation = "UTF8_LCASE" + withSessionCollationAndTable(sessionCollation, testTable) { + sql(s"CREATE TABLE $testTable (c1 STRING, c2 STRING COLLATE UNICODE_CI) USING $dataSource") + sql(s"INSERT INTO $testTable VALUES ('a', 'a'), ('A', 'A')") + + withView(testView) { + sql(s"CREATE VIEW $testView AS SELECT * FROM $testTable") + + assertTableColumnCollation(testView, "c1", "UTF8_BINARY") + assertTableColumnCollation(testView, "c2", "UNICODE_CI") + checkAnswer( + sql(s"SELECT DISTINCT COLLATION(c1), COLLATION('a') FROM $testView"), + Row(fullyQualifiedPrefix + "UTF8_BINARY", fullyQualifiedPrefix + sessionCollation)) + + // filter should use session collation + checkAnswer(sql(s"SELECT 
COUNT(*) FROM $testView WHERE 'a' = 'A'"), Row(2)) + + // filter should use column collation + checkAnswer(sql(s"SELECT COUNT(*) FROM $testView WHERE c1 = 'A'"), Row(1)) + + checkAnswer( + sql(s"SELECT COUNT(*) FROM $testView WHERE c1 = substring('A', 0, 1)"), + Row(1)) + + // literal with explicit collation wins + checkAnswer( + sql(s"SELECT COUNT(*) FROM $testView WHERE c1 = 'A' collate UNICODE_CI"), + Row(2)) + + // two implicit collations -> errors out + assertThrowsImplicitMismatch(sql(s"SELECT c1 = c2 FROM $testView")) + + sql(s"ALTER VIEW $testView AS SELECT c1 COLLATE UNICODE_CI AS c1, c2 FROM $testTable") + assertTableColumnCollation(testView, "c1", "UNICODE_CI") + assertTableColumnCollation(testView, "c2", "UNICODE_CI") + checkAnswer( + sql(s"SELECT DISTINCT COLLATION(c1), COLLATION('a') FROM $testView"), + Row(fullyQualifiedPrefix + "UNICODE_CI", fullyQualifiedPrefix + sessionCollation)) + + // after alter both rows should be returned + checkAnswer(sql(s"SELECT COUNT(*) FROM $testView WHERE c1 = 'A'"), Row(2)) + } + } + } + + test("join view with table") { + val viewTableName = "view_table" + val joinTableName = "join_table" + val sessionCollation = "sr" + + withSessionCollationAndTable(sessionCollation, viewTableName, joinTableName) { + sql(s"CREATE TABLE $viewTableName (c1 STRING COLLATE UNICODE_CI) USING $dataSource") + sql(s"CREATE TABLE $joinTableName (c1 STRING COLLATE UTF8_LCASE) USING $dataSource") + sql(s"INSERT INTO $viewTableName VALUES ('a')") + sql(s"INSERT INTO $joinTableName VALUES ('A')") + + withView(testView) { + sql(s"CREATE VIEW $testView AS SELECT * FROM $viewTableName") + + assertThrowsImplicitMismatch( + sql(s"SELECT * FROM $testView JOIN $joinTableName ON $testView.c1 = $joinTableName.c1")) + + checkAnswer( + sql(s""" + |SELECT COLLATION($testView.c1), COLLATION($joinTableName.c1) + |FROM $testView JOIN $joinTableName + |ON $testView.c1 = $joinTableName.c1 COLLATE UNICODE_CI + |""".stripMargin), + Row(fullyQualifiedPrefix + 
"UNICODE_CI", fullyQualifiedPrefix + "UTF8_LCASE")) + } + } + } +} + +class DefaultCollationTestSuiteV2 extends DefaultCollationTestSuite with DatasourceV2SQLBase { + override def testTable: String = s"testcat.${super.testTable}" + override def testView: String = s"testcat.${super.testView}" + + // delete only works on v2 + test("delete behavior") { + withSessionCollationAndTable("UTF8_LCASE", testTable) { + sql(s"CREATE TABLE $testTable (c1 STRING) USING $dataSource") + sql(s"INSERT INTO $testTable VALUES ('a'), ('A')") + + sql(s"DELETE FROM $testTable WHERE 'a' = 'A'") + checkAnswer(sql(s"SELECT COUNT(*) FROM $testTable"), Seq(Row(0))) + } + } + + test("inline table in RTAS") { + withSessionCollationAndTable("UTF8_LCASE", testTable) { + sql(s"CREATE TABLE $testTable (c1 STRING, c2 BOOLEAN) USING $dataSource") + sql(s""" + |REPLACE TABLE $testTable + |USING $dataSource + |AS SELECT * + |FROM (VALUES ('a', 'a' = 'A')) + |AS inline_table(c1, c2); + |""".stripMargin) + + assertTableColumnCollation(testTable, "c1", "UTF8_BINARY") + checkAnswer(sql(s"SELECT COUNT(*) FROM $testTable WHERE c2"), Seq(Row(0))) + } + } +} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/connector/AlterTableTests.scala b/sql/core/src/test/scala/org/apache/spark/sql/connector/AlterTableTests.scala index 21aa57cc1eace..00e1f2f93fdcb 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/connector/AlterTableTests.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/connector/AlterTableTests.scala @@ -1396,4 +1396,39 @@ trait AlterTableTests extends SharedSparkSession with QueryErrorsBase { parameters = Map("columnName" -> "`data`")) } } + + test("Alter column type between string and char/varchar") { + val types = Seq( + ("STRING", "\"STRING\""), + ("STRING COLLATE UTF8_LCASE", "\"STRING COLLATE UTF8_LCASE\""), + ("CHAR(5)", "\"CHAR\\(5\\)\""), + ("VARCHAR(5)", "\"VARCHAR\\(5\\)\"")) + types.flatMap { a => types.map { b => (a, b) } } + .filter { case (a, b) => a != b } + .filter 
{ case ((a, _), (b, _)) => !a.startsWith("STRING") || !b.startsWith("STRING") } + .foreach { case ((from, originType), (to, newType)) => + val t = "table_name" + withTable(t) { + sql(s"CREATE TABLE $t (id $from) USING PARQUET") + val sql1 = s"ALTER TABLE $t ALTER COLUMN id TYPE $to" + checkErrorMatchPVals( + exception = intercept[AnalysisException] { + sql(sql1) + }, + condition = "NOT_SUPPORTED_CHANGE_COLUMN", + sqlState = None, + parameters = Map( + "originType" -> originType, + "newType" -> newType, + "newName" -> "`id`", + "originName" -> "`id`", + "table" -> ".*table_name.*"), + context = ExpectedContext( + fragment = sql1, + start = 0, + stop = sql1.length - 1) + ) + } + } + } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/connector/DataSourceV2MetricsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/connector/DataSourceV2MetricsSuite.scala new file mode 100644 index 0000000000000..fe28b85528632 --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/connector/DataSourceV2MetricsSuite.scala @@ -0,0 +1,159 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.connector + +import java.util + +import org.apache.spark.sql.QueryTest.withQueryExecutionsCaptured +import org.apache.spark.sql.connector.catalog.{CatalogV2Util, Column, Identifier, InMemoryTable, InMemoryTableCatalog, StagedTable, StagingInMemoryTableCatalog} +import org.apache.spark.sql.connector.catalog.CatalogV2Implicits.IdentifierHelper +import org.apache.spark.sql.connector.expressions.Transform +import org.apache.spark.sql.connector.metric.{CustomMetric, CustomSumMetric, CustomTaskMetric} +import org.apache.spark.sql.execution.SparkPlan +import org.apache.spark.sql.execution.datasources.v2.{AtomicCreateTableAsSelectExec, AtomicReplaceTableAsSelectExec, AtomicReplaceTableExec, CreateTableAsSelectExec, ReplaceTableAsSelectExec, ReplaceTableExec} + +class StagingInMemoryTableCatalogWithMetrics extends StagingInMemoryTableCatalog { + + case class TestSupportedCommitMetric(name: String, description: String) extends CustomSumMetric + + override def supportedCustomMetrics(): Array[CustomMetric] = Array( + TestSupportedCommitMetric("numFiles", "number of written files"), + TestSupportedCommitMetric("numOutputRows", "number of output rows"), + TestSupportedCommitMetric("numOutputBytes", "written output")) + + private class TestStagedTableWithMetric( + ident: Identifier, + delegateTable: InMemoryTable + ) extends TestStagedTable(ident, delegateTable) with StagedTable { + + private var stagedChangesCommitted = false + + override def commitStagedChanges(): Unit = { + tables.put(ident, delegateTable) + stagedChangesCommitted = true + } + + override def reportDriverMetrics: Array[CustomTaskMetric] = { + assert(stagedChangesCommitted) + StagingInMemoryTableCatalogWithMetrics.testMetrics + } + } + + override def stageCreate( + ident: Identifier, + columns: Array[Column], + partitions: Array[Transform], + properties: util.Map[String, String]): StagedTable = { + new TestStagedTableWithMetric( + ident, + new 
InMemoryTable(s"$name.${ident.quoted}", + CatalogV2Util.v2ColumnsToStructType(columns), partitions, properties)) + } + + override def stageReplace( + ident: Identifier, + columns: Array[Column], + partitions: Array[Transform], + properties: util.Map[String, String]): StagedTable = + stageCreate(ident, columns, partitions, properties) + + override def stageCreateOrReplace( + ident: Identifier, + columns: Array[Column], + partitions: Array[Transform], + properties: util.Map[String, String]): StagedTable = + stageCreate(ident, columns, partitions, properties) +} + +object StagingInMemoryTableCatalogWithMetrics { + + case class TestCustomTaskMetric(name: String, value: Long) extends CustomTaskMetric + + val testMetrics: Array[CustomTaskMetric] = Array( + TestCustomTaskMetric("numFiles", 1337), + TestCustomTaskMetric("numOutputRows", 1338), + TestCustomTaskMetric("numOutputBytes", 1339)) +} + +class DataSourceV2MetricsSuite extends DatasourceV2SQLBase { + + private val testCatalog = "test_catalog" + private val atomicTestCatalog = "atomic_test_catalog" + private val nonExistingTable = "non_existing_table" + private val existingTable = "existing_table" + + private def captureStagedTableWrite(thunk: => Unit): SparkPlan = { + val physicalPlans = withQueryExecutionsCaptured(spark)(thunk).map(_.executedPlan) + val stagedTableWrites = physicalPlans.filter { + case _: AtomicCreateTableAsSelectExec | _: CreateTableAsSelectExec | + _: AtomicReplaceTableAsSelectExec | _: ReplaceTableAsSelectExec | + _: AtomicReplaceTableExec | _: ReplaceTableExec => true + case _ => false + } + assert(stagedTableWrites.size === 1) + stagedTableWrites.head + } + + private def commands: Seq[String => Unit] = Seq( + { catalogName => + sql(s"CREATE TABLE $catalogName.$nonExistingTable AS SELECT * FROM $existingTable") }, + { catalogName => + spark.table(existingTable).write.saveAsTable(s"$catalogName.$nonExistingTable") }, + { catalogName => + sql(s"CREATE OR REPLACE TABLE 
$catalogName.$nonExistingTable " + + s"AS SELECT * FROM $existingTable") }, + { catalogName => + sql(s"REPLACE TABLE $catalogName.$existingTable AS SELECT * FROM $existingTable") }, + { catalogName => + spark.table(existingTable) + .write.mode("overwrite").saveAsTable(s"$catalogName.$existingTable") }, + { catalogName => + sql(s"REPLACE TABLE $catalogName.$existingTable (id bigint, data string)") }) + + private def catalogCommitMetricsTest( + testName: String, catalogName: String)(testFunction: SparkPlan => Unit): Unit = { + commands.foreach { command => + test(s"$testName - $command") { + registerCatalog(testCatalog, classOf[InMemoryTableCatalog]) + registerCatalog(atomicTestCatalog, classOf[StagingInMemoryTableCatalogWithMetrics]) + withTable(existingTable, s"$catalogName.$existingTable") { + sql(s"CREATE TABLE $existingTable (id bigint, data string)") + sql(s"CREATE TABLE $catalogName.$existingTable (id bigint, data string)") + + testFunction(captureStagedTableWrite(command(catalogName))) + } + } + } + } + + catalogCommitMetricsTest( + "No metrics in the plan if the catalog does not support them", testCatalog) { sparkPlan => + val metrics = sparkPlan.metrics + + assert(metrics.isEmpty) + } + + catalogCommitMetricsTest( + "Plan metrics values are the values from the catalog", atomicTestCatalog) { sparkPlan => + val metrics = sparkPlan.metrics + + assert(metrics.size === StagingInMemoryTableCatalogWithMetrics.testMetrics.length) + StagingInMemoryTableCatalogWithMetrics.testMetrics.foreach(customTaskMetric => + assert(metrics(customTaskMetric.name()).value === customTaskMetric.value())) + } +} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/connector/DataSourceV2OptionSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/connector/DataSourceV2OptionSuite.scala new file mode 100644 index 0000000000000..70291336ba317 --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/connector/DataSourceV2OptionSuite.scala @@ -0,0 +1,327 @@ +/* + * 
Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.connector + +import org.apache.spark.sql.{AnalysisException, Row} +import org.apache.spark.sql.QueryTest.withQueryExecutionsCaptured +import org.apache.spark.sql.catalyst.plans.logical._ +import org.apache.spark.sql.connector.catalog.InMemoryBaseTable +import org.apache.spark.sql.execution.CommandResultExec +import org.apache.spark.sql.execution.datasources.v2._ +import org.apache.spark.sql.functions.lit + +class DataSourceV2OptionSuite extends DatasourceV2SQLBase { + import testImplicits._ + + private val catalogAndNamespace = "testcat.ns1.ns2." 
+ + test("SPARK-36680: Supports Dynamic Table Options for SQL Select") { + val t1 = s"${catalogAndNamespace}table" + withTable(t1) { + sql(s"CREATE TABLE $t1 (id bigint, data string)") + sql(s"INSERT INTO $t1 VALUES (1, 'a'), (2, 'b')") + + var df = sql(s"SELECT * FROM $t1") + var collected = df.queryExecution.optimizedPlan.collect { + case scan: DataSourceV2ScanRelation => + assert(scan.relation.options.isEmpty) + } + assert (collected.size == 1) + checkAnswer(df, Seq(Row(1, "a"), Row(2, "b"))) + + df = sql(s"SELECT * FROM $t1 WITH (`split-size` = 5)") + collected = df.queryExecution.optimizedPlan.collect { + case scan: DataSourceV2ScanRelation => + assert(scan.relation.options.get("split-size") == "5") + } + assert (collected.size == 1) + checkAnswer(df, Seq(Row(1, "a"), Row(2, "b"))) + + collected = df.queryExecution.executedPlan.collect { + case BatchScanExec(_, scan: InMemoryBaseTable#InMemoryBatchScan, _, _, _, _) => + assert(scan.options.get("split-size") === "5") + } + assert (collected.size == 1) + + val noValues = intercept[AnalysisException]( + sql(s"SELECT * FROM $t1 WITH (`split-size`)")) + assert(noValues.message.contains( + "Operation not allowed: Values must be specified for key(s): [split-size]")) + } + } + + test("SPARK-50286: Propagate options for DataFrameReader") { + val t1 = s"${catalogAndNamespace}table" + withTable(t1) { + sql(s"CREATE TABLE $t1 (id bigint, data string)") + sql(s"INSERT INTO $t1 VALUES (1, 'a'), (2, 'b')") + + var df = spark.table(t1) + var collected = df.queryExecution.optimizedPlan.collect { + case scan: DataSourceV2ScanRelation => + assert(scan.relation.options.isEmpty) + } + assert (collected.size == 1) + checkAnswer(df, Seq(Row(1, "a"), Row(2, "b"))) + + df = spark.read.option("split-size", "5").table(t1) + collected = df.queryExecution.optimizedPlan.collect { + case scan: DataSourceV2ScanRelation => + assert(scan.relation.options.get("split-size") == "5") + } + assert (collected.size == 1) + checkAnswer(df, Seq(Row(1, 
"a"), Row(2, "b"))) + + collected = df.queryExecution.executedPlan.collect { + case BatchScanExec(_, scan: InMemoryBaseTable#InMemoryBatchScan, _, _, _, _) => + assert(scan.options.get("split-size") === "5") + } + assert (collected.size == 1) + } + } + + test("SPARK-49098, SPARK-50286: Supports Dynamic Table Options for SQL Insert") { + val t1 = s"${catalogAndNamespace}table" + withTable(t1) { + sql(s"CREATE TABLE $t1 (id bigint, data string)") + val df = sql(s"INSERT INTO $t1 WITH (`write.split-size` = 10) VALUES (1, 'a'), (2, 'b')") + + var collected = df.queryExecution.optimizedPlan.collect { + case CommandResult(_, AppendData(relation: DataSourceV2Relation, _, _, _, _, _), _, _) => + assert(relation.options.get("write.split-size") == "10") + } + assert (collected.size == 1) + + collected = df.queryExecution.executedPlan.collect { + case CommandResultExec( + _, AppendDataExec(_, _, write), + _) => + val append = write.toBatch.asInstanceOf[InMemoryBaseTable#Append] + assert(append.info.options.get("write.split-size") === "10") + } + assert (collected.size == 1) + + val insertResult = sql(s"SELECT * FROM $t1") + checkAnswer(insertResult, Seq(Row(1, "a"), Row(2, "b"))) + } + } + + test("SPARK-50286: Propagate options for DataFrameWriter Append") { + val t1 = s"${catalogAndNamespace}table" + withTable(t1) { + sql(s"CREATE TABLE $t1 (id bigint, data string)") + val captured = withQueryExecutionsCaptured(spark) { + Seq(1 -> "a", 2 -> "b").toDF("id", "data") + .write + .option("write.split-size", "10") + .mode("append") + .insertInto(t1) + } + assert(captured.size === 1) + val qe = captured.head + var collected = qe.optimizedPlan.collect { + case AppendData(_: DataSourceV2Relation, _, writeOptions, _, _, _) => + assert(writeOptions("write.split-size") == "10") + } + assert (collected.size == 1) + + collected = qe.executedPlan.collect { + case AppendDataExec(_, _, write) => + val append = write.toBatch.asInstanceOf[InMemoryBaseTable#Append] + 
assert(append.info.options.get("write.split-size") === "10") + } + assert (collected.size == 1) + } + } + + test("SPARK-50286: Propagate options for DataFrameWriterV2 Append") { + val t1 = s"${catalogAndNamespace}table" + withTable(t1) { + sql(s"CREATE TABLE $t1 (id bigint, data string)") + val captured = withQueryExecutionsCaptured(spark) { + Seq(1 -> "a", 2 -> "b").toDF("id", "data") + .writeTo(t1) + .option("write.split-size", "10") + .append() + } + assert(captured.size === 1) + val qe = captured.head + var collected = qe.optimizedPlan.collect { + case AppendData(_: DataSourceV2Relation, _, writeOptions, _, _, _) => + assert(writeOptions("write.split-size") == "10") + } + assert (collected.size == 1) + + collected = qe.executedPlan.collect { + case AppendDataExec(_, _, write) => + val append = write.toBatch.asInstanceOf[InMemoryBaseTable#Append] + assert(append.info.options.get("write.split-size") === "10") + } + assert (collected.size == 1) + } + } + + test("SPARK-49098, SPARK-50286: Supports Dynamic Table Options for SQL Insert Overwrite") { + val t1 = s"${catalogAndNamespace}table" + withTable(t1) { + sql(s"CREATE TABLE $t1 (id bigint, data string)") + sql(s"INSERT INTO $t1 VALUES (1, 'a'), (2, 'b')") + + val df = sql(s"INSERT OVERWRITE $t1 WITH (`write.split-size` = 10) " + + s"VALUES (3, 'c'), (4, 'd')") + var collected = df.queryExecution.optimizedPlan.collect { + case CommandResult(_, + OverwriteByExpression(relation: DataSourceV2Relation, _, _, _, _, _, _), + _, _) => + assert(relation.options.get("write.split-size") === "10") + } + assert (collected.size == 1) + + collected = df.queryExecution.executedPlan.collect { + case CommandResultExec( + _, OverwriteByExpressionExec(_, _, write), + _) => + val append = write.toBatch.asInstanceOf[InMemoryBaseTable#TruncateAndAppend] + assert(append.info.options.get("write.split-size") === "10") + } + assert (collected.size == 1) + + val insertResult = sql(s"SELECT * FROM $t1") + checkAnswer(insertResult, 
Seq(Row(3, "c"), Row(4, "d"))) + } + } + + test("SPARK-50286: Propagate options for DataFrameWriterV2 OverwritePartitions") { + val t1 = s"${catalogAndNamespace}table" + withTable(t1) { + sql(s"CREATE TABLE $t1 (id bigint, data string)") + sql(s"INSERT INTO $t1 VALUES (1, 'a'), (2, 'b')") + + val captured = withQueryExecutionsCaptured(spark) { + Seq(3 -> "c", 4 -> "d").toDF("id", "data") + .writeTo(t1) + .option("write.split-size", "10") + .overwritePartitions() + } + assert(captured.size === 1) + val qe = captured.head + var collected = qe.optimizedPlan.collect { + case OverwritePartitionsDynamic(_: DataSourceV2Relation, _, writeOptions, _, _) => + assert(writeOptions("write.split-size") === "10") + } + assert (collected.size == 1) + + collected = qe.executedPlan.collect { + case OverwritePartitionsDynamicExec(_, _, write) => + val dynOverwrite = write.toBatch.asInstanceOf[InMemoryBaseTable#DynamicOverwrite] + assert(dynOverwrite.info.options.get("write.split-size") === "10") + } + assert (collected.size == 1) + } + } + + test("SPARK-49098, SPARK-50286: Supports Dynamic Table Options for SQL Insert Replace") { + val t1 = s"${catalogAndNamespace}table" + withTable(t1) { + sql(s"CREATE TABLE $t1 (id bigint, data string)") + sql(s"INSERT INTO $t1 VALUES (1, 'a'), (2, 'b')") + + val df = sql(s"INSERT INTO $t1 WITH (`write.split-size` = 10) " + + s"REPLACE WHERE TRUE " + + s"VALUES (3, 'c'), (4, 'd')") + var collected = df.queryExecution.optimizedPlan.collect { + case CommandResult(_, + OverwriteByExpression(relation: DataSourceV2Relation, _, _, _, _, _, _), + _, _) => + assert(relation.options.get("write.split-size") == "10") + } + assert (collected.size == 1) + + collected = df.queryExecution.executedPlan.collect { + case CommandResultExec( + _, OverwriteByExpressionExec(_, _, write), + _) => + val append = write.toBatch.asInstanceOf[InMemoryBaseTable#TruncateAndAppend] + assert(append.info.options.get("write.split-size") === "10") + } + assert (collected.size == 1) 
+ + val insertResult = sql(s"SELECT * FROM $t1") + checkAnswer(insertResult, Seq(Row(3, "c"), Row(4, "d"))) + } + } + + test("SPARK-50286: Propagate options for DataFrameWriter Overwrite") { + val t1 = s"${catalogAndNamespace}table" + withTable(t1) { + sql(s"CREATE TABLE $t1 (id bigint, data string)") + val captured = withQueryExecutionsCaptured(spark) { + Seq(1 -> "a", 2 -> "b").toDF("id", "data") + .write + .option("write.split-size", "10") + .mode("overwrite") + .insertInto(t1) + } + assert(captured.size === 1) + + val qe = captured.head + var collected = qe.optimizedPlan.collect { + case OverwriteByExpression(_: DataSourceV2Relation, _, _, writeOptions, _, _, _) => + assert(writeOptions("write.split-size") === "10") + } + assert (collected.size == 1) + + collected = qe.executedPlan.collect { + case OverwriteByExpressionExec(_, _, write) => + val append = write.toBatch.asInstanceOf[InMemoryBaseTable#TruncateAndAppend] + assert(append.info.options.get("write.split-size") === "10") + } + assert (collected.size == 1) + } + } + + test("SPARK-50286: Propagate options for DataFrameWriterV2 Overwrite") { + val t1 = s"${catalogAndNamespace}table" + withTable(t1) { + sql(s"CREATE TABLE $t1 (id bigint, data string)") + sql(s"INSERT INTO $t1 VALUES (1, 'a'), (2, 'b')") + + val captured = withQueryExecutionsCaptured(spark) { + Seq(3 -> "c", 4 -> "d").toDF("id", "data") + .writeTo(t1) + .option("write.split-size", "10") + .overwrite(lit(true)) + } + assert(captured.size === 1) + val qe = captured.head + + var collected = qe.optimizedPlan.collect { + case OverwriteByExpression(_: DataSourceV2Relation, _, _, writeOptions, _, _, _) => + assert(writeOptions("write.split-size") === "10") + } + assert (collected.size == 1) + + collected = qe.executedPlan.collect { + case OverwriteByExpressionExec(_, _, write) => + val append = write.toBatch.asInstanceOf[InMemoryBaseTable#TruncateAndAppend] + assert(append.info.options.get("write.split-size") === "10") + } + assert (collected.size 
== 1) + } + } +} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/connector/DataSourceV2SQLSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/connector/DataSourceV2SQLSuite.scala index 510ea49b58418..8d255e9efda54 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/connector/DataSourceV2SQLSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/connector/DataSourceV2SQLSuite.scala @@ -32,7 +32,7 @@ import org.apache.spark.sql.catalyst.CurrentUserContext.CURRENT_USER import org.apache.spark.sql.catalyst.analysis.{CannotReplaceMissingTableException, NoSuchNamespaceException, TableAlreadyExistsException} import org.apache.spark.sql.catalyst.catalog.{CatalogStorageFormat, CatalogTable, CatalogTableType, CatalogUtils} import org.apache.spark.sql.catalyst.parser.ParseException -import org.apache.spark.sql.catalyst.plans.logical.{AppendData, ColumnStat, CommandResult, OverwriteByExpression} +import org.apache.spark.sql.catalyst.plans.logical.ColumnStat import org.apache.spark.sql.catalyst.statsEstimation.StatsEstimationTestBase import org.apache.spark.sql.catalyst.util.DateTimeUtils import org.apache.spark.sql.connector.catalog.{Column => ColumnV2, _} @@ -44,7 +44,6 @@ import org.apache.spark.sql.execution.FilterExec import org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanHelper import org.apache.spark.sql.execution.columnar.InMemoryRelation import org.apache.spark.sql.execution.datasources.{HadoopFsRelation, LogicalRelationWithTable} -import org.apache.spark.sql.execution.datasources.v2.DataSourceV2Relation import org.apache.spark.sql.execution.datasources.v2.DataSourceV2ScanRelation import org.apache.spark.sql.execution.streaming.MemoryStream import org.apache.spark.sql.internal.{SQLConf, StaticSQLConf} @@ -279,8 +278,8 @@ class DataSourceV2SQLSuiteV1Filter test("CreateTable: without USING clause") { withSQLConf(SQLConf.LEGACY_CREATE_HIVE_TABLE_BY_DEFAULT.key -> "false") { - // unset this config to use the default v2 session 
catalog. - spark.conf.unset(V2_SESSION_CATALOG_IMPLEMENTATION.key) + // use the default v2 session catalog. + spark.conf.set(V2_SESSION_CATALOG_IMPLEMENTATION, "builtin") val testCatalog = catalog("testcat").asTableCatalog sql("CREATE TABLE testcat.t1 (id int)") @@ -786,8 +785,8 @@ class DataSourceV2SQLSuiteV1Filter } test("CreateTableAsSelect: v2 session catalog can load v1 source table") { - // unset this config to use the default v2 session catalog. - spark.conf.unset(V2_SESSION_CATALOG_IMPLEMENTATION.key) + // use the default v2 session catalog. + spark.conf.set(V2_SESSION_CATALOG_IMPLEMENTATION, "builtin") val df = spark.createDataFrame(Seq((1L, "a"), (2L, "b"), (3L, "c"))).toDF("id", "data") df.createOrReplaceTempView("source") @@ -847,8 +846,8 @@ class DataSourceV2SQLSuiteV1Filter // TODO: ignored by SPARK-31707, restore the test after create table syntax unification ignore("CreateTableAsSelect: without USING clause") { - // unset this config to use the default v2 session catalog. - spark.conf.unset(V2_SESSION_CATALOG_IMPLEMENTATION.key) + // use the default v2 session catalog. + spark.conf.set(V2_SESSION_CATALOG_IMPLEMENTATION, "builtin") val testCatalog = catalog("testcat").asTableCatalog sql("CREATE TABLE testcat.t1 AS SELECT 1 i") @@ -1087,11 +1086,11 @@ class DataSourceV2SQLSuiteV1Filter Seq(true, false).foreach { useV1Table => val format = if (useV1Table) "json" else v2Format if (useV1Table) { - // unset this config to use the default v2 session catalog. - spark.conf.unset(V2_SESSION_CATALOG_IMPLEMENTATION.key) + // use the default v2 session catalog. 
+ spark.conf.set(V2_SESSION_CATALOG_IMPLEMENTATION, "builtin") } else { spark.conf.set( - V2_SESSION_CATALOG_IMPLEMENTATION.key, classOf[InMemoryTableSessionCatalog].getName) + V2_SESSION_CATALOG_IMPLEMENTATION, classOf[InMemoryTableSessionCatalog].getName) } withTable("t") { @@ -1261,8 +1260,12 @@ class DataSourceV2SQLSuiteV1Filter PROP_OWNER -> "it will be set to the current user", PROP_EXTERNAL -> "please use CREATE EXTERNAL TABLE" ) + val excludedProperties = Set(TableCatalog.PROP_COMMENT, TableCatalog.PROP_COLLATION) + val tableLegacyProperties = CatalogV2Util.TABLE_RESERVED_PROPERTIES + .filterNot(excludedProperties.contains) + withSQLConf((SQLConf.LEGACY_PROPERTY_NON_RESERVED.key, "false")) { - CatalogV2Util.TABLE_RESERVED_PROPERTIES.filterNot(_ == PROP_COMMENT).foreach { key => + tableLegacyProperties.foreach { key => Seq("OPTIONS", "TBLPROPERTIES").foreach { clause => Seq("CREATE", "REPLACE").foreach { action => val sqlText = s"$action TABLE testcat.reservedTest (key int) " + @@ -1315,7 +1318,7 @@ class DataSourceV2SQLSuiteV1Filter } } withSQLConf((SQLConf.LEGACY_PROPERTY_NON_RESERVED.key, "true")) { - CatalogV2Util.TABLE_RESERVED_PROPERTIES.filterNot(_ == PROP_COMMENT).foreach { key => + tableLegacyProperties.foreach { key => Seq("OPTIONS", "TBLPROPERTIES").foreach { clause => withTable("testcat.reservedTest") { Seq("CREATE", "REPLACE").foreach { action => @@ -1812,8 +1815,8 @@ class DataSourceV2SQLSuiteV1Filter } test("SPARK-46972: asymmetrical replacement for char/varchar in V2SessionCatalog.createTable") { - // unset this config to use the default v2 session catalog. - spark.conf.unset(V2_SESSION_CATALOG_IMPLEMENTATION.key) + // use the default v2 session catalog. 
+ spark.conf.set(V2_SESSION_CATALOG_IMPLEMENTATION, "builtin") withTable("t") { sql(s"CREATE TABLE t(c char(1), v varchar(2)) USING $v2Source") } @@ -2530,8 +2533,8 @@ class DataSourceV2SQLSuiteV1Filter } test("SPARK-30001: session catalog name can be specified in SQL statements") { - // unset this config to use the default v2 session catalog. - spark.conf.unset(V2_SESSION_CATALOG_IMPLEMENTATION.key) + // use the default v2 session catalog. + spark.conf.set(V2_SESSION_CATALOG_IMPLEMENTATION, "builtin") withTable("t") { sql("CREATE TABLE t USING json AS SELECT 1 AS i") @@ -2595,8 +2598,8 @@ class DataSourceV2SQLSuiteV1Filter } test("SPARK-30094: current namespace is used during table resolution") { - // unset this config to use the default v2 session catalog. - spark.conf.unset(V2_SESSION_CATALOG_IMPLEMENTATION.key) + // use the default v2 session catalog. + spark.conf.set(V2_SESSION_CATALOG_IMPLEMENTATION, "builtin") withTable("spark_catalog.default.t", "testcat.ns.t") { sql("CREATE TABLE t USING parquet AS SELECT 1") @@ -2610,8 +2613,8 @@ class DataSourceV2SQLSuiteV1Filter } test("SPARK-30284: CREATE VIEW should track the current catalog and namespace") { - // unset this config to use the default v2 session catalog. - spark.conf.unset(V2_SESSION_CATALOG_IMPLEMENTATION.key) + // use the default v2 session catalog. + spark.conf.set(V2_SESSION_CATALOG_IMPLEMENTATION, "builtin") val sessionCatalogName = CatalogManager.SESSION_CATALOG_NAME sql("CREATE NAMESPACE testcat.ns1.ns2") @@ -2648,8 +2651,8 @@ class DataSourceV2SQLSuiteV1Filter } test("COMMENT ON NAMESPACE") { - // unset this config to use the default v2 session catalog. - spark.conf.unset(V2_SESSION_CATALOG_IMPLEMENTATION.key) + // use the default v2 session catalog. + spark.conf.set(V2_SESSION_CATALOG_IMPLEMENTATION, "builtin") // Session catalog is used. 
sql("CREATE NAMESPACE ns") checkNamespaceComment("ns", "minor revision") @@ -2682,8 +2685,8 @@ class DataSourceV2SQLSuiteV1Filter } test("COMMENT ON TABLE") { - // unset this config to use the default v2 session catalog. - spark.conf.unset(V2_SESSION_CATALOG_IMPLEMENTATION.key) + // use the default v2 session catalog. + spark.conf.set(V2_SESSION_CATALOG_IMPLEMENTATION, "builtin") // Session catalog is used. withTable("t") { sql("CREATE TABLE t(k int) USING json") @@ -3390,6 +3393,7 @@ class DataSourceV2SQLSuiteV1Filter |TBLPROPERTIES ('prop1' = '1', 'prop2' = '2') |PARTITIONED BY (a) |LOCATION '/tmp' + |DEFAULT COLLATION sr_CI_AI """.stripMargin) val table = spark.sessionState.catalogManager.v2SessionCatalog.asTableCatalog @@ -3397,6 +3401,7 @@ class DataSourceV2SQLSuiteV1Filter val properties = table.properties assert(properties.get(TableCatalog.PROP_PROVIDER) == "parquet") assert(properties.get(TableCatalog.PROP_COMMENT) == "This is a comment") + assert(properties.get(TableCatalog.PROP_COLLATION) == "sr_CI_AI") assert(properties.get(TableCatalog.PROP_LOCATION) == "file:/tmp") assert(properties.containsKey(TableCatalog.PROP_OWNER)) assert(properties.get(TableCatalog.PROP_EXTERNAL) == "true") @@ -3634,96 +3639,6 @@ class DataSourceV2SQLSuiteV1Filter } } - - test("SPARK-36680: Supports Dynamic Table Options for Spark SQL") { - val t1 = s"${catalogAndNamespace}table" - withTable(t1) { - sql(s"CREATE TABLE $t1 (id bigint, data string) USING $v2Format") - sql(s"INSERT INTO $t1 VALUES (1, 'a'), (2, 'b')") - - var df = sql(s"SELECT * FROM $t1") - var collected = df.queryExecution.optimizedPlan.collect { - case scan: DataSourceV2ScanRelation => - assert(scan.relation.options.isEmpty) - } - assert (collected.size == 1) - checkAnswer(df, Seq(Row(1, "a"), Row(2, "b"))) - - df = sql(s"SELECT * FROM $t1 WITH (`split-size` = 5)") - collected = df.queryExecution.optimizedPlan.collect { - case scan: DataSourceV2ScanRelation => - assert(scan.relation.options.get("split-size") == 
"5") - } - assert (collected.size == 1) - checkAnswer(df, Seq(Row(1, "a"), Row(2, "b"))) - - val noValues = intercept[AnalysisException]( - sql(s"SELECT * FROM $t1 WITH (`split-size`)")) - assert(noValues.message.contains( - "Operation not allowed: Values must be specified for key(s): [split-size]")) - } - } - - test("SPARK-36680: Supports Dynamic Table Options for Insert") { - val t1 = s"${catalogAndNamespace}table" - withTable(t1) { - sql(s"CREATE TABLE $t1 (id bigint, data string) USING $v2Format") - val df = sql(s"INSERT INTO $t1 WITH (`write.split-size` = 10) VALUES (1, 'a'), (2, 'b')") - - val collected = df.queryExecution.optimizedPlan.collect { - case CommandResult(_, AppendData(relation: DataSourceV2Relation, _, _, _, _, _), _, _) => - assert(relation.options.get("write.split-size") == "10") - } - assert (collected.size == 1) - - val insertResult = sql(s"SELECT * FROM $t1") - checkAnswer(insertResult, Seq(Row(1, "a"), Row(2, "b"))) - } - } - - test("SPARK-36680: Supports Dynamic Table Options for Insert Overwrite") { - val t1 = s"${catalogAndNamespace}table" - withTable(t1) { - sql(s"CREATE TABLE $t1 (id bigint, data string) USING $v2Format") - sql(s"INSERT INTO $t1 WITH (`write.split-size` = 10) VALUES (1, 'a'), (2, 'b')") - - val df = sql(s"INSERT OVERWRITE $t1 WITH (`write.split-size` = 10) " + - s"VALUES (3, 'c'), (4, 'd')") - val collected = df.queryExecution.optimizedPlan.collect { - case CommandResult(_, - OverwriteByExpression(relation: DataSourceV2Relation, _, _, _, _, _, _), - _, _) => - assert(relation.options.get("write.split-size") == "10") - } - assert (collected.size == 1) - - val insertResult = sql(s"SELECT * FROM $t1") - checkAnswer(insertResult, Seq(Row(3, "c"), Row(4, "d"))) - } - } - - test("SPARK-36680: Supports Dynamic Table Options for Insert Replace") { - val t1 = s"${catalogAndNamespace}table" - withTable(t1) { - sql(s"CREATE TABLE $t1 (id bigint, data string) USING $v2Format") - sql(s"INSERT INTO $t1 WITH (`write.split-size` = 10) 
VALUES (1, 'a'), (2, 'b')") - - val df = sql(s"INSERT INTO $t1 WITH (`write.split-size` = 10) " + - s"REPLACE WHERE TRUE " + - s"VALUES (3, 'c'), (4, 'd')") - val collected = df.queryExecution.optimizedPlan.collect { - case CommandResult(_, - OverwriteByExpression(relation: DataSourceV2Relation, _, _, _, _, _, _), - _, _) => - assert(relation.options.get("write.split-size") == "10") - } - assert (collected.size == 1) - - val insertResult = sql(s"SELECT * FROM $t1") - checkAnswer(insertResult, Seq(Row(3, "c"), Row(4, "d"))) - } - } - test("SPARK-49183: custom spark_catalog generates location for managed tables") { // Reset CatalogManager to clear the materialized `spark_catalog` instance, so that we can // configure a new implementation. diff --git a/sql/core/src/test/scala/org/apache/spark/sql/connector/DeleteFromTests.scala b/sql/core/src/test/scala/org/apache/spark/sql/connector/DeleteFromTests.scala index fd022580db42b..26f64ceb33fe3 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/connector/DeleteFromTests.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/connector/DeleteFromTests.scala @@ -100,8 +100,8 @@ trait DeleteFromTests extends DatasourceV2SQLBase { } test("DeleteFrom: DELETE is only supported with v2 tables") { - // unset this config to use the default v2 session catalog. - spark.conf.unset(V2_SESSION_CATALOG_IMPLEMENTATION.key) + // use the default v2 session catalog. 
+ spark.conf.set(V2_SESSION_CATALOG_IMPLEMENTATION, "builtin") val v1Table = "tbl" withTable(v1Table) { sql(s"CREATE TABLE $v1Table" + diff --git a/sql/core/src/test/scala/org/apache/spark/sql/connector/KeyGroupedPartitioningSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/connector/KeyGroupedPartitioningSuite.scala index 152896499010c..c24f52bd93070 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/connector/KeyGroupedPartitioningSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/connector/KeyGroupedPartitioningSuite.scala @@ -370,6 +370,62 @@ class KeyGroupedPartitioningSuite extends DistributionAndOrderingSuiteBase { checkAnswer(df.sort("res"), Seq(Row(10.0), Row(15.5), Row(41.0))) } + test("SPARK-48655: order by on partition keys should not introduce additional shuffle") { + val items_partitions = Array(identity("price"), identity("id")) + createTable(items, itemsColumns, items_partitions) + sql(s"INSERT INTO testcat.ns.$items VALUES " + + s"(1, 'aa', 40.0, cast('2020-01-01' as timestamp)), " + + s"(1, 'aa', 41.0, cast('2020-01-02' as timestamp)), " + + s"(2, 'bb', 10.0, cast('2020-01-01' as timestamp)), " + + s"(3, 'cc', 15.5, cast('2020-02-01' as timestamp)), " + + s"(null, 'cc', 15.5, cast('2020-02-01' as timestamp)), " + + s"(3, 'cc', null, cast('2020-02-01' as timestamp))") + + Seq(true, false).foreach { sortingEnabled => + withSQLConf(SQLConf.V2_BUCKETING_SORTING_ENABLED.key -> sortingEnabled.toString) { + + def verifyShuffle(cmd: String, answer: Seq[Row]): Unit = { + val df = sql(cmd) + if (sortingEnabled) { + assert(collectAllShuffles(df.queryExecution.executedPlan).isEmpty, + "should contain no shuffle when sorting by partition values") + } else { + assert(collectAllShuffles(df.queryExecution.executedPlan).size == 1, + "should contain one shuffle when optimization is disabled") + } + checkAnswer(df, answer) + }: Unit + + verifyShuffle( + s"SELECT price, id FROM testcat.ns.$items ORDER BY price ASC, id ASC", + Seq(Row(null, 
3), Row(10.0, 2), Row(15.5, null), + Row(15.5, 3), Row(40.0, 1), Row(41.0, 1))) + + verifyShuffle( + s"SELECT price, id FROM testcat.ns.$items " + + s"ORDER BY price ASC NULLS LAST, id ASC NULLS LAST", + Seq(Row(10.0, 2), Row(15.5, 3), Row(15.5, null), + Row(40.0, 1), Row(41.0, 1), Row(null, 3))) + + verifyShuffle( + s"SELECT price, id FROM testcat.ns.$items ORDER BY price DESC, id ASC", + Seq(Row(41.0, 1), Row(40.0, 1), Row(15.5, null), + Row(15.5, 3), Row(10.0, 2), Row(null, 3))) + + verifyShuffle( + s"SELECT price, id FROM testcat.ns.$items ORDER BY price DESC, id DESC", + Seq(Row(41.0, 1), Row(40.0, 1), Row(15.5, 3), + Row(15.5, null), Row(10.0, 2), Row(null, 3))) + + verifyShuffle( + s"SELECT price, id FROM testcat.ns.$items " + + s"ORDER BY price DESC NULLS FIRST, id DESC NULLS FIRST", + Seq(Row(null, 3), Row(41.0, 1), Row(40.0, 1), + Row(15.5, null), Row(15.5, 3), Row(10.0, 2))); + } + } + } + test("SPARK-49179: Fix v2 multi bucketed inner joins throw AssertionError") { val cols = Array( Column.create("id", LongType), diff --git a/sql/core/src/test/scala/org/apache/spark/sql/connector/V1WriteFallbackSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/connector/V1WriteFallbackSuite.scala index 04fc7e23ebb24..68c2a01c69aea 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/connector/V1WriteFallbackSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/connector/V1WriteFallbackSuite.scala @@ -24,7 +24,7 @@ import org.scalatest.BeforeAndAfter import org.apache.spark.rdd.RDD import org.apache.spark.sql.{AnalysisException, DataFrame, QueryTest, Row, SaveMode, SparkSession, SQLContext} -import org.apache.spark.sql.QueryTest.withPhysicalPlansCaptured +import org.apache.spark.sql.QueryTest.withQueryExecutionsCaptured import org.apache.spark.sql.catalyst.analysis.TableAlreadyExistsException import org.apache.spark.sql.catalyst.plans.logical.{LocalRelation, LogicalPlan} import org.apache.spark.sql.catalyst.rules.Rule @@ -213,8 +213,8 @@ class 
V1WriteFallbackSuite extends QueryTest with SharedSparkSession with Before .getOrCreate() def captureWrite(sparkSession: SparkSession)(thunk: => Unit): SparkPlan = { - val physicalPlans = withPhysicalPlansCaptured(sparkSession, thunk) - val v1FallbackWritePlans = physicalPlans.filter { + val queryExecutions = withQueryExecutionsCaptured(sparkSession)(thunk) + val v1FallbackWritePlans = queryExecutions.map(_.executedPlan).filter { case _: AppendDataExecV1 | _: OverwriteByExpressionExecV1 => true case _ => false } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/connector/V2CommandsCaseSensitivitySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/connector/V2CommandsCaseSensitivitySuite.scala index 5091c72ef96ac..67fca09802139 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/connector/V2CommandsCaseSensitivitySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/connector/V2CommandsCaseSensitivitySuite.scala @@ -53,7 +53,8 @@ class V2CommandsCaseSensitivitySuite withSQLConf(SQLConf.CASE_SENSITIVE.key -> caseSensitive.toString) { Seq("ID", "iD").foreach { ref => val tableSpec = - UnresolvedTableSpec(Map.empty, None, OptionList(Seq.empty), None, None, None, false) + UnresolvedTableSpec(Map.empty, None, OptionList(Seq.empty), + None, None, None, None, false) val plan = CreateTableAsSelect( UnresolvedIdentifier(Array("table_name").toImmutableArraySeq), Expressions.identity(ref) :: Nil, @@ -77,7 +78,8 @@ class V2CommandsCaseSensitivitySuite withSQLConf(SQLConf.CASE_SENSITIVE.key -> caseSensitive.toString) { Seq("POINT.X", "point.X", "poInt.x", "poInt.X").foreach { ref => val tableSpec = - UnresolvedTableSpec(Map.empty, None, OptionList(Seq.empty), None, None, None, false) + UnresolvedTableSpec(Map.empty, None, OptionList(Seq.empty), + None, None, None, None, false) val plan = CreateTableAsSelect( UnresolvedIdentifier(Array("table_name").toImmutableArraySeq), Expressions.bucket(4, ref) :: Nil, @@ -102,7 +104,8 @@ class 
V2CommandsCaseSensitivitySuite withSQLConf(SQLConf.CASE_SENSITIVE.key -> caseSensitive.toString) { Seq("ID", "iD").foreach { ref => val tableSpec = - UnresolvedTableSpec(Map.empty, None, OptionList(Seq.empty), None, None, None, false) + UnresolvedTableSpec(Map.empty, None, OptionList(Seq.empty), + None, None, None, None, false) val plan = ReplaceTableAsSelect( UnresolvedIdentifier(Array("table_name").toImmutableArraySeq), Expressions.identity(ref) :: Nil, @@ -126,7 +129,8 @@ class V2CommandsCaseSensitivitySuite withSQLConf(SQLConf.CASE_SENSITIVE.key -> caseSensitive.toString) { Seq("POINT.X", "point.X", "poInt.x", "poInt.X").foreach { ref => val tableSpec = - UnresolvedTableSpec(Map.empty, None, OptionList(Seq.empty), None, None, None, false) + UnresolvedTableSpec(Map.empty, None, OptionList(Seq.empty), + None, None, None, None, false) val plan = ReplaceTableAsSelect( UnresolvedIdentifier(Array("table_name").toImmutableArraySeq), Expressions.bucket(4, ref) :: Nil, diff --git a/sql/core/src/test/scala/org/apache/spark/sql/connector/functions/V2FunctionBenchmark.scala b/sql/core/src/test/scala/org/apache/spark/sql/connector/functions/V2FunctionBenchmark.scala index 1401048cf705d..a5f0285bf2eff 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/connector/functions/V2FunctionBenchmark.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/connector/functions/V2FunctionBenchmark.scala @@ -21,15 +21,16 @@ import test.org.apache.spark.sql.connector.catalog.functions.JavaLongAdd import test.org.apache.spark.sql.connector.catalog.functions.JavaLongAdd.{JavaLongAddDefault, JavaLongAddMagic, JavaLongAddStaticMagic} import org.apache.spark.benchmark.Benchmark +import org.apache.spark.sql.Column import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{BinaryArithmetic, EvalMode, Expression} import org.apache.spark.sql.catalyst.expressions.CodegenObjectFactoryMode._ import org.apache.spark.sql.catalyst.util.TypeUtils +import 
org.apache.spark.sql.classic.ClassicConversions._ import org.apache.spark.sql.connector.catalog.{Identifier, InMemoryCatalog} import org.apache.spark.sql.connector.catalog.functions.{BoundFunction, ScalarFunction, UnboundFunction} import org.apache.spark.sql.execution.benchmark.SqlBasedBenchmark import org.apache.spark.sql.functions.col -import org.apache.spark.sql.internal.ExpressionUtils.{column, expression} import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types.{AbstractDataType, DataType, LongType, NumericType, StructType} @@ -64,6 +65,7 @@ object V2FunctionBenchmark extends SqlBasedBenchmark { N: Long, codegenEnabled: Boolean, resultNullable: Boolean): Unit = { + import spark.toRichColumn withSQLConf(s"spark.sql.catalog.$catalogName" -> classOf[InMemoryCatalog].getName) { createFunction("java_long_add_default", new JavaLongAdd(new JavaLongAddDefault(resultNullable))) @@ -81,7 +83,9 @@ object V2FunctionBenchmark extends SqlBasedBenchmark { s"codegen = $codegenEnabled" val benchmark = new Benchmark(name, N, output = output) benchmark.addCase(s"native_long_add", numIters = 3) { _ => - spark.range(N).select(NativeAdd(col("id"), col("id"), resultNullable)).noop() + spark.range(N) + .select(Column(NativeAdd(col("id").expr, col("id").expr, resultNullable))) + .noop() } Seq("java_long_add_default", "java_long_add_magic", "java_long_add_static_magic", "scala_long_add_default", "scala_long_add_magic").foreach { functionName => diff --git a/sql/core/src/test/scala/org/apache/spark/sql/errors/QueryCompilationErrorsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/errors/QueryCompilationErrorsSuite.scala index 92c175fe2f94a..779b5ba530aa6 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/errors/QueryCompilationErrorsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/errors/QueryCompilationErrorsSuite.scala @@ -901,6 +901,23 @@ class QueryCompilationErrorsSuite } } + test("SPARK-50779: the object level collations feature 
is unsupported when flag is disabled") { + withSQLConf(SQLConf.OBJECT_LEVEL_COLLATIONS_ENABLED.key -> "false") { + Seq( + "CREATE TABLE t (c STRING) USING parquet DEFAULT COLLATION UNICODE", + "REPLACE TABLE t (c STRING) USING parquet DEFAULT COLLATION UNICODE_CI", + "ALTER TABLE t DEFAULT COLLATION sr_CI_AI", + "CREATE VIEW v DEFAULT COLLATION UNICODE as SELECT * FROM t", + "CREATE TEMPORARY VIEW v DEFAULT COLLATION UTF8_LCASE as SELECT * FROM t" + ).foreach { sqlText => + checkError( + exception = intercept[AnalysisException](sql(sqlText)), + condition = "UNSUPPORTED_FEATURE.OBJECT_LEVEL_COLLATIONS" + ) + } + } + } + test("UNSUPPORTED_CALL: call the unsupported method update()") { checkError( exception = intercept[SparkUnsupportedOperationException] { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/errors/QueryExecutionAnsiErrorsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/errors/QueryExecutionAnsiErrorsSuite.scala index f07d2d6620f72..fde5a32e722f4 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/errors/QueryExecutionAnsiErrorsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/errors/QueryExecutionAnsiErrorsSuite.scala @@ -145,7 +145,7 @@ class QueryExecutionAnsiErrorsSuite extends QueryTest sql("select array(1, 2, 3, 4, 5)[8]").collect() }, condition = "INVALID_ARRAY_INDEX", - parameters = Map("indexValue" -> "8", "arraySize" -> "5", "ansiConfig" -> ansiConf), + parameters = Map("indexValue" -> "8", "arraySize" -> "5"), context = ExpectedContext(fragment = "array(1, 2, 3, 4, 5)[8]", start = 7, stop = 29)) checkError( @@ -153,7 +153,7 @@ class QueryExecutionAnsiErrorsSuite extends QueryTest OneRowRelation().select(lit(Array(1, 2, 3, 4, 5))(8)).collect() }, condition = "INVALID_ARRAY_INDEX", - parameters = Map("indexValue" -> "8", "arraySize" -> "5", "ansiConfig" -> ansiConf), + parameters = Map("indexValue" -> "8", "arraySize" -> "5"), context = ExpectedContext( fragment = "apply", callSitePattern = 
getCurrentClassCallSitePattern)) @@ -165,7 +165,7 @@ class QueryExecutionAnsiErrorsSuite extends QueryTest sql("select element_at(array(1, 2, 3, 4, 5), 8)").collect() }, condition = "INVALID_ARRAY_INDEX_IN_ELEMENT_AT", - parameters = Map("indexValue" -> "8", "arraySize" -> "5", "ansiConfig" -> ansiConf), + parameters = Map("indexValue" -> "8", "arraySize" -> "5"), context = ExpectedContext( fragment = "element_at(array(1, 2, 3, 4, 5), 8)", start = 7, @@ -176,7 +176,7 @@ class QueryExecutionAnsiErrorsSuite extends QueryTest OneRowRelation().select(element_at(lit(Array(1, 2, 3, 4, 5)), 8)).collect() }, condition = "INVALID_ARRAY_INDEX_IN_ELEMENT_AT", - parameters = Map("indexValue" -> "8", "arraySize" -> "5", "ansiConfig" -> ansiConf), + parameters = Map("indexValue" -> "8", "arraySize" -> "5"), context = ExpectedContext(fragment = "element_at", callSitePattern = getCurrentClassCallSitePattern)) } @@ -240,8 +240,8 @@ class QueryExecutionAnsiErrorsSuite extends QueryTest }, condition = "CANNOT_PARSE_TIMESTAMP", parameters = Map( - "message" -> "Text 'abc' could not be parsed at index 0", - "ansiConfig" -> ansiConf) + "func" -> "`try_to_timestamp`", + "message" -> "Text 'abc' could not be parsed at index 0") ) } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/errors/QueryExecutionErrorsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/errors/QueryExecutionErrorsSuite.scala index 1adb1fdf05032..17c3c1e1e2a70 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/errors/QueryExecutionErrorsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/errors/QueryExecutionErrorsSuite.scala @@ -351,7 +351,7 @@ class QueryExecutionErrorsSuite sql("select timestampadd(YEAR, 1000000, timestamp'2022-03-09 01:02:03')").collect() }, condition = "DATETIME_OVERFLOW", - parameters = Map("operation" -> "add 1000000 YEAR to TIMESTAMP '2022-03-09 01:02:03'"), + parameters = Map("operation" -> "add 1000000L YEAR to TIMESTAMP '2022-03-09 01:02:03'"), sqlState = 
"22008") } @@ -1258,6 +1258,22 @@ class QueryExecutionErrorsSuite ) ) } + + test("SPARK-50485: Unwrap SparkThrowable in UEE thrown by tableRelationCache") { + withTable("t") { + sql("CREATE TABLE t (a INT)") + checkError( + exception = intercept[SparkUnsupportedOperationException] { + sql("ALTER TABLE t SET LOCATION 'https://mister/spark'") + }, + condition = "FAILED_READ_FILE.UNSUPPORTED_FILE_SYSTEM", + parameters = Map( + "path" -> "https://mister/spark", + "fileSystemClass" -> "org.apache.hadoop.fs.http.HttpsFileSystem", + "method" -> "listStatus")) + sql("ALTER TABLE t SET LOCATION '/mister/spark'") + } + } } class FakeFileSystemSetPermission extends LocalFileSystem { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/ExternalAppendOnlyUnsafeRowArrayBenchmark.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/ExternalAppendOnlyUnsafeRowArrayBenchmark.scala index 0078c3f9f65de..31b002a1e245d 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/ExternalAppendOnlyUnsafeRowArrayBenchmark.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/ExternalAppendOnlyUnsafeRowArrayBenchmark.scala @@ -32,10 +32,10 @@ import org.apache.spark.util.collection.unsafe.sort.UnsafeExternalSorter * {{{ * 1. without sbt: * bin/spark-submit --class --jars - * 2. build/sbt build/sbt ";project sql;set javaOptions - * in Test += \"-Dspark.memory.debugFill=false\";Test/runMain " - * 3. generate result: SPARK_GENERATE_BENCHMARK_FILES=1 build/sbt ";project sql;set javaOptions - * in Test += \"-Dspark.memory.debugFill=false\";Test/runMain " + * 2. build/sbt build/sbt ";project sql; + * set Test / javaOptions += \"-Dspark.memory.debugFill=false\";Test/runMain " + * 3. generate result: SPARK_GENERATE_BENCHMARK_FILES=1 build/sbt ";project sql; + * set Test / javaOptions += \"-Dspark.memory.debugFill=false\";Test/runMain " * Results will be written to * "benchmarks/ExternalAppendOnlyUnsafeRowArrayBenchmark-results.txt". 
* }}} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/InsertSortForLimitAndOffsetSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/InsertSortForLimitAndOffsetSuite.scala index 8d640a1840f4c..d1b11a74cf35f 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/InsertSortForLimitAndOffsetSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/InsertSortForLimitAndOffsetSuite.scala @@ -17,10 +17,13 @@ package org.apache.spark.sql.execution -import org.apache.spark.sql.QueryTest +import org.apache.spark.sql.{Dataset, QueryTest} +import org.apache.spark.sql.IntegratedUDFTestUtils._ import org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanHelper +import org.apache.spark.sql.functions.rand import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.test.SharedSparkSession +import org.apache.spark.sql.types.IntegerType class InsertSortForLimitAndOffsetSuite extends QueryTest with SharedSparkSession @@ -51,6 +54,7 @@ class InsertSortForLimitAndOffsetSuite extends QueryTest private def hasLocalSort(plan: SparkPlan): Boolean = { find(plan) { case GlobalLimitExec(_, s: SortExec, _) => !s.global + case GlobalLimitExec(_, ProjectExec(_, s: SortExec), _) => !s.global case _ => false }.isDefined } @@ -91,12 +95,16 @@ class InsertSortForLimitAndOffsetSuite extends QueryTest // one partition to read the range-partition shuffle and there is only one shuffle block for // the final single-partition shuffle, random fetch order is no longer an issue. 
SQLConf.COALESCE_PARTITIONS_ENABLED.key -> "false") { - val df = spark.range(10).orderBy($"id" % 8).limit(2).distinct() - df.collect() - val physicalPlan = df.queryExecution.executedPlan - assertHasGlobalLimitExec(physicalPlan) - // Extra local sort is needed for middle LIMIT - assert(hasLocalSort(physicalPlan)) + val df = 1.to(10).map(v => v -> v).toDF("c1", "c2").orderBy($"c1" % 8) + verifySortAdded(df.limit(2)) + verifySortAdded(df.filter($"c2" > rand()).limit(2)) + verifySortAdded(df.select($"c2").limit(2)) + verifySortAdded(df.filter($"c2" > rand()).select($"c2").limit(2)) + + assume(shouldTestPythonUDFs) + val pythonTestUDF = TestPythonUDF(name = "pyUDF", Some(IntegerType)) + verifySortAdded(df.filter(pythonTestUDF($"c2") > rand()).limit(2)) + verifySortAdded(df.select(pythonTestUDF($"c2")).limit(2)) } } @@ -110,11 +118,28 @@ class InsertSortForLimitAndOffsetSuite extends QueryTest } test("middle OFFSET preserves data ordering with the extra sort") { - val df = spark.range(10).orderBy($"id" % 8).offset(2).distinct() - df.collect() - val physicalPlan = df.queryExecution.executedPlan + val df = 1.to(10).map(v => v -> v).toDF("c1", "c2").orderBy($"c1" % 8) + verifySortAdded(df.offset(2)) + verifySortAdded(df.filter($"c2" > rand()).offset(2)) + verifySortAdded(df.select($"c2").offset(2)) + verifySortAdded(df.filter($"c2" > rand()).select($"c2").offset(2)) + + assume(shouldTestPythonUDFs) + val pythonTestUDF = TestPythonUDF(name = "pyUDF", Some(IntegerType)) + verifySortAdded(df.filter(pythonTestUDF($"c2") > rand()).offset(2)) + verifySortAdded(df.select(pythonTestUDF($"c2")).offset(2)) + } + + private def verifySortAdded(df: Dataset[_]): Unit = { + // Do distinct to trigger a shuffle, so that the LIMIT/OFFSET below won't be planned as + // `CollectLimitExec` + val shuffled = df.distinct() + shuffled.collect() + val physicalPlan = shuffled.queryExecution.executedPlan assertHasGlobalLimitExec(physicalPlan) - // Extra local sort is needed for middle OFFSET + // 
Extra local sort is needed for middle LIMIT/OFFSET assert(hasLocalSort(physicalPlan)) + // Make sure the schema does not change. + assert(physicalPlan.schema == shuffled.schema) } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/QueryExecutionSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/QueryExecutionSuite.scala index 974be2f627998..d670b3d8c77d3 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/QueryExecutionSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/QueryExecutionSuite.scala @@ -18,11 +18,12 @@ package org.apache.spark.sql.execution import scala.collection.mutable import scala.io.Source +import scala.util.Try import org.apache.spark.sql.{AnalysisException, Dataset, ExtendedExplainGenerator, FastOperator} -import org.apache.spark.sql.catalyst.{QueryPlanningTracker, QueryPlanningTrackerCallback} -import org.apache.spark.sql.catalyst.analysis.CurrentNamespace -import org.apache.spark.sql.catalyst.expressions.UnsafeRow +import org.apache.spark.sql.catalyst.{QueryPlanningTracker, QueryPlanningTrackerCallback, TableIdentifier} +import org.apache.spark.sql.catalyst.analysis.{CurrentNamespace, UnresolvedFunction, UnresolvedRelation} +import org.apache.spark.sql.catalyst.expressions.{Alias, UnsafeRow} import org.apache.spark.sql.catalyst.plans.QueryPlan import org.apache.spark.sql.catalyst.plans.logical.{CommandResult, LogicalPlan, OneRowRelation, Project, ShowTables, SubqueryAlias} import org.apache.spark.sql.catalyst.trees.TreeNodeTag @@ -405,6 +406,21 @@ class QueryExecutionSuite extends SharedSparkSession { } } + test("SPARK-50600: Failed analysis should send analyzed event") { + val mockCallback = MockCallback() + + def table(ref: String): LogicalPlan = UnresolvedRelation(TableIdentifier(ref)) + + val unresolvedUndefinedFunc = UnresolvedFunction("unknown", Seq.empty, isDistinct = false) + val plan = Project(Seq(Alias(unresolvedUndefinedFunc, "call1")()), table("table")) + val 
dataset = Try { + val df = Dataset.ofRows(spark, plan, new QueryPlanningTracker(Some(mockCallback))) + df.queryExecution.assertAnalyzed() + } + assert(dataset.failed.get.isInstanceOf[AnalysisException]) + mockCallback.assertAnalyzed() + } + case class MockCallbackEagerCommand( var trackerAnalyzed: QueryPlanningTracker = null, var trackerReadyForExecution: QueryPlanningTracker = null) @@ -447,6 +463,15 @@ class QueryExecutionSuite extends SharedSparkSession { var trackerAnalyzed: QueryPlanningTracker = null, var trackerReadyForExecution: QueryPlanningTracker = null) extends QueryPlanningTrackerCallback { + override def analysisFailed( + trackerFromCallback: QueryPlanningTracker, + analyzedPlan: LogicalPlan): Unit = { + trackerAnalyzed = trackerFromCallback + assert(!trackerAnalyzed.phases.keySet.contains(QueryPlanningTracker.ANALYSIS)) + assert(!trackerAnalyzed.phases.keySet.contains(QueryPlanningTracker.OPTIMIZATION)) + assert(!trackerAnalyzed.phases.keySet.contains(QueryPlanningTracker.PLANNING)) + assert(analyzedPlan != null) + } def analyzed(trackerFromCallback: QueryPlanningTracker, plan: LogicalPlan): Unit = { trackerAnalyzed = trackerFromCallback assert(trackerAnalyzed.phases.keySet.contains(QueryPlanningTracker.ANALYSIS)) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/SQLFunctionSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/SQLFunctionSuite.scala new file mode 100644 index 0000000000000..4da3b9ab1d06b --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/SQLFunctionSuite.scala @@ -0,0 +1,61 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.execution + +import org.apache.spark.sql.{QueryTest, Row} +import org.apache.spark.sql.test.SharedSparkSession + +/** + * Test suite for SQL user-defined functions (UDFs). + */ +class SQLFunctionSuite extends QueryTest with SharedSparkSession { + import testImplicits._ + + protected override def beforeAll(): Unit = { + super.beforeAll() + Seq((0, 1), (1, 2)).toDF("a", "b").createOrReplaceTempView("t") + } + + test("SQL scalar function") { + withUserDefinedFunction("area" -> false) { + sql( + """ + |CREATE FUNCTION area(width DOUBLE, height DOUBLE) + |RETURNS DOUBLE + |RETURN width * height + |""".stripMargin) + checkAnswer(sql("SELECT area(1, 2)"), Row(2)) + checkAnswer(sql("SELECT area(a, b) FROM t"), Seq(Row(0), Row(2))) + } + } + + test("SQL scalar function with subquery in the function body") { + withUserDefinedFunction("foo" -> false) { + withTable("tbl") { + sql("CREATE TABLE tbl AS SELECT * FROM VALUES (1, 2), (1, 3), (2, 3) t(a, b)") + sql( + """ + |CREATE FUNCTION foo(x INT) RETURNS INT + |RETURN SELECT SUM(b) FROM tbl WHERE x = a; + |""".stripMargin) + checkAnswer(sql("SELECT foo(1)"), Row(5)) + checkAnswer(sql("SELECT foo(a) FROM t"), Seq(Row(null), Row(5))) + } + } + } +} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/SparkSqlParserSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/SparkSqlParserSuite.scala index 03d6eb1a50209..acc3cdb01bf3f 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/SparkSqlParserSuite.scala +++ 
b/sql/core/src/test/scala/org/apache/spark/sql/execution/SparkSqlParserSuite.scala @@ -93,6 +93,15 @@ class SparkSqlParserSuite extends AnalysisTest with SharedSparkSession { parameters = Map.empty) } + test("SET with semi-colons") { + assertEqual(s"SET;", SetCommand(None)) + assertEqual(s"SET ;", SetCommand(None)) + assertEqual(s"SET -v;", SetCommand(Some("-v" -> None))) + assertEqual(s"SET -v ;", SetCommand(Some("-v" -> None))) + assertEqual(s"SET spark.sql.ansi.enabled;", SetCommand(Some("spark.sql.ansi.enabled" -> None))) + assertEqual(s"SET spark.sql.ansi.enabled ;", SetCommand(Some("spark.sql.ansi.enabled" -> None))) + } + test("Report Error for invalid usage of SET command") { assertEqual("SET", SetCommand(None)) assertEqual("SET -v", SetCommand(Some("-v", None))) @@ -885,105 +894,118 @@ class SparkSqlParserSuite extends AnalysisTest with SharedSparkSession { // scalastyle:on test("Operator pipe SQL syntax") { - withSQLConf(SQLConf.OPERATOR_PIPE_SYNTAX_ENABLED.key -> "true") { - // Basic selection. - // Here we check that every parsed plan contains a projection and a source relation or - // inline table. - def check(query: String, patterns: Seq[TreePattern]): Unit = { - val plan: LogicalPlan = parser.parsePlan(query) - assert(patterns.exists(plan.containsPattern), s"Failed to parse $query, plan: $plan") - assert(plan.containsAnyPattern(UNRESOLVED_RELATION, LOCAL_RELATION)) - } - def checkPipeSelect(query: String): Unit = check(query, Seq(PROJECT)) - checkPipeSelect("TABLE t |> SELECT 1 AS X") - checkPipeSelect("TABLE t |> SELECT 1 AS X, 2 AS Y |> SELECT X + Y AS Z") - checkPipeSelect("VALUES (0), (1) tab(col) |> SELECT col * 2 AS result") - checkPipeSelect("TABLE t |> EXTEND X + 1 AS Y") - checkPipeSelect("TABLE t |> EXTEND X + 1 AS Y, X + 2 Z") - // Basic WHERE operators. 
- def checkPipeWhere(query: String): Unit = check(query, Seq(FILTER)) - checkPipeWhere("TABLE t |> WHERE X = 1") - checkPipeWhere("TABLE t |> SELECT X, LENGTH(Y) AS Z |> WHERE X + LENGTH(Y) < 4") - checkPipeWhere("TABLE t |> WHERE X = 1 AND Y = 2 |> WHERE X + Y = 3") - checkPipeWhere("VALUES (0), (1) tab(col) |> WHERE col < 1") - // PIVOT and UNPIVOT operations - def checkPivotUnpivot(query: String): Unit = check(query, Seq(PIVOT, UNPIVOT)) - checkPivotUnpivot( - """ - |SELECT * FROM VALUES - | ("dotNET", 2012, 10000), - | ("Java", 2012, 20000), - | ("dotNET", 2012, 5000), - | ("dotNET", 2013, 48000), - | ("Java", 2013, 30000) - | AS courseSales(course, year, earnings) - ||> PIVOT ( - | SUM(earnings) - | FOR course IN ('dotNET', 'Java') - |) - |""".stripMargin) - checkPivotUnpivot( - """ - |SELECT * FROM VALUES - | ("dotNET", 15000, 48000, 22500), - | ("Java", 20000, 30000, NULL) - | AS courseEarnings(course, `2012`, `2013`, `2014`) - ||> UNPIVOT ( - | earningsYear FOR year IN (`2012`, `2013`, `2014`) - |) - |""".stripMargin) - // Sampling operations - def checkSample(query: String): Unit = { - val plan: LogicalPlan = parser.parsePlan(query) - assert(plan.collectFirst(_.isInstanceOf[Sample]).nonEmpty) - assert(plan.containsAnyPattern(UNRESOLVED_RELATION, LOCAL_RELATION)) - } - checkSample("TABLE t |> TABLESAMPLE (50 PERCENT)") - checkSample("TABLE t |> TABLESAMPLE (5 ROWS)") - checkSample("TABLE t |> TABLESAMPLE (BUCKET 4 OUT OF 10)") - // Joins. 
- def checkPipeJoin(query: String): Unit = check(query, Seq(JOIN)) - Seq("", "INNER", "LEFT", "LEFT OUTER", "SEMI", "LEFT SEMI", "RIGHT", "RIGHT OUTER", "FULL", - "FULL OUTER", "ANTI", "LEFT ANTI", "CROSS").foreach { joinType => - checkPipeJoin(s"TABLE t |> $joinType JOIN other ON (t.x = other.x)") - } - // Set operations - def checkDistinct(query: String): Unit = check(query, Seq(DISTINCT_LIKE)) - def checkExcept(query: String): Unit = check(query, Seq(EXCEPT)) - def checkIntersect(query: String): Unit = check(query, Seq(INTERSECT)) - def checkUnion(query: String): Unit = check(query, Seq(UNION)) - checkDistinct("TABLE t |> UNION DISTINCT TABLE t") - checkExcept("TABLE t |> EXCEPT ALL TABLE t") - checkExcept("TABLE t |> EXCEPT DISTINCT TABLE t") - checkExcept("TABLE t |> MINUS ALL TABLE t") - checkExcept("TABLE t |> MINUS DISTINCT TABLE t") - checkIntersect("TABLE t |> INTERSECT ALL TABLE t") - checkUnion("TABLE t |> UNION ALL TABLE t") - // Sorting and distributing operators. - def checkSort(query: String): Unit = check(query, Seq(SORT)) - def checkRepartition(query: String): Unit = check(query, Seq(REPARTITION_OPERATION)) - def checkLimit(query: String): Unit = check(query, Seq(LIMIT)) - checkSort("TABLE t |> ORDER BY x") - checkSort("TABLE t |> SELECT x |> SORT BY x") - checkLimit("TABLE t |> LIMIT 1") - checkLimit("TABLE t |> LIMIT 2 OFFSET 1") - checkRepartition("TABLE t |> DISTRIBUTE BY x |> WHERE x = 1") - checkRepartition("TABLE t |> CLUSTER BY x |> TABLESAMPLE (100 PERCENT)") - checkRepartition("TABLE t |> SORT BY x DISTRIBUTE BY x") - // Aggregation - def checkAggregate(query: String): Unit = check(query, Seq(AGGREGATE)) - checkAggregate("SELECT a, b FROM t |> AGGREGATE SUM(a)") - checkAggregate("SELECT a, b FROM t |> AGGREGATE SUM(a) AS result GROUP BY b") - checkAggregate("SELECT a, b FROM t |> AGGREGATE GROUP BY b") - checkAggregate("SELECT a, b FROM t |> AGGREGATE COUNT(*) AS result GROUP BY b") - // Window - def checkWindow(query: String): Unit = 
check(query, Seq(WITH_WINDOW_DEFINITION)) - checkWindow( - """ - |TABLE windowTestData - ||> SELECT cate, SUM(val) OVER w - | WINDOW w AS (PARTITION BY cate ORDER BY val) - |""".stripMargin) + // Basic selection. + // Here we check that every parsed plan contains a projection and a source relation or + // inline table. + def check(query: String, patterns: Seq[TreePattern]): Unit = { + val plan: LogicalPlan = parser.parsePlan(query) + assert(patterns.exists(plan.containsPattern), s"Failed to parse $query, plan: $plan") + assert(plan.containsAnyPattern(UNRESOLVED_RELATION, LOCAL_RELATION)) + } + def checkPipeSelect(query: String): Unit = check(query, Seq(PROJECT)) + checkPipeSelect("TABLE t |> SELECT 1 AS X") + checkPipeSelect("TABLE t |> SELECT 1 AS X, 2 AS Y |> SELECT X + Y AS Z") + checkPipeSelect("VALUES (0), (1) tab(col) |> SELECT col * 2 AS result") + checkPipeSelect("TABLE t |> EXTEND X + 1 AS Y") + checkPipeSelect("TABLE t |> EXTEND X + 1 AS Y, X + 2 Z") + checkPipeSelect("TABLE t |> EXTEND 1 AS z, 2 AS Z |> SET z = 1, Z = 2") + // FROM operators. + def checkPipeSelectFrom(query: String): Unit = check(query, Seq(PROJECT)) + checkPipeSelectFrom("FROM t |> SELECT 1 AS X") + // Basic WHERE operators. 
+ def checkPipeWhere(query: String): Unit = check(query, Seq(FILTER)) + checkPipeWhere("TABLE t |> WHERE X = 1") + checkPipeWhere("TABLE t |> SELECT X, LENGTH(Y) AS Z |> WHERE X + LENGTH(Y) < 4") + checkPipeWhere("TABLE t |> WHERE X = 1 AND Y = 2 |> WHERE X + Y = 3") + checkPipeWhere("VALUES (0), (1) tab(col) |> WHERE col < 1") + // PIVOT and UNPIVOT operations + def checkPivotUnpivot(query: String): Unit = check(query, Seq(PIVOT, UNPIVOT)) + checkPivotUnpivot( + """ + |SELECT * FROM VALUES + | ("dotNET", 2012, 10000), + | ("Java", 2012, 20000), + | ("dotNET", 2012, 5000), + | ("dotNET", 2013, 48000), + | ("Java", 2013, 30000) + | AS courseSales(course, year, earnings) + ||> PIVOT ( + | SUM(earnings) + | FOR course IN ('dotNET', 'Java') + |) + |""".stripMargin) + checkPivotUnpivot( + """ + |SELECT * FROM VALUES + | ("dotNET", 15000, 48000, 22500), + | ("Java", 20000, 30000, NULL) + | AS courseEarnings(course, `2012`, `2013`, `2014`) + ||> UNPIVOT ( + | earningsYear FOR year IN (`2012`, `2013`, `2014`) + |) + |""".stripMargin) + // Sampling operations + def checkSample(query: String): Unit = { + val plan: LogicalPlan = parser.parsePlan(query) + assert(plan.collectFirst(_.isInstanceOf[Sample]).nonEmpty) + assert(plan.containsAnyPattern(UNRESOLVED_RELATION, LOCAL_RELATION)) + } + checkSample("TABLE t |> TABLESAMPLE (50 PERCENT)") + checkSample("TABLE t |> TABLESAMPLE (5 ROWS)") + checkSample("TABLE t |> TABLESAMPLE (BUCKET 4 OUT OF 10)") + // Joins. 
+ def checkPipeJoin(query: String): Unit = check(query, Seq(JOIN)) + Seq("", "INNER", "LEFT", "LEFT OUTER", "SEMI", "LEFT SEMI", "RIGHT", "RIGHT OUTER", "FULL", + "FULL OUTER", "ANTI", "LEFT ANTI", "CROSS").foreach { joinType => + checkPipeJoin(s"TABLE t |> $joinType JOIN other ON (t.x = other.x)") + } + // Set operations + def checkDistinct(query: String): Unit = check(query, Seq(DISTINCT_LIKE)) + def checkExcept(query: String): Unit = check(query, Seq(EXCEPT)) + def checkIntersect(query: String): Unit = check(query, Seq(INTERSECT)) + def checkUnion(query: String): Unit = check(query, Seq(UNION)) + checkDistinct("TABLE t |> UNION DISTINCT TABLE t") + checkExcept("TABLE t |> EXCEPT ALL TABLE t") + checkExcept("TABLE t |> EXCEPT DISTINCT TABLE t") + checkExcept("TABLE t |> MINUS ALL TABLE t") + checkExcept("TABLE t |> MINUS DISTINCT TABLE t") + checkIntersect("TABLE t |> INTERSECT ALL TABLE t") + checkUnion("TABLE t |> UNION ALL TABLE t") + // Sorting and distributing operators. + def checkSort(query: String): Unit = check(query, Seq(SORT)) + def checkRepartition(query: String): Unit = check(query, Seq(REPARTITION_OPERATION)) + def checkLimit(query: String): Unit = check(query, Seq(LIMIT)) + checkSort("TABLE t |> ORDER BY x") + checkSort("TABLE t |> SELECT x |> SORT BY x") + checkLimit("TABLE t |> LIMIT 1") + checkLimit("TABLE t |> LIMIT 2 OFFSET 1") + checkRepartition("TABLE t |> DISTRIBUTE BY x |> WHERE x = 1") + checkRepartition("TABLE t |> CLUSTER BY x |> TABLESAMPLE (100 PERCENT)") + checkRepartition("TABLE t |> SORT BY x DISTRIBUTE BY x") + // Aggregation + def checkAggregate(query: String): Unit = check(query, Seq(AGGREGATE)) + checkAggregate("SELECT a, b FROM t |> AGGREGATE SUM(a)") + checkAggregate("SELECT a, b FROM t |> AGGREGATE SUM(a) AS result GROUP BY b") + checkAggregate("SELECT a, b FROM t |> AGGREGATE GROUP BY b") + checkAggregate("SELECT a, b FROM t |> AGGREGATE COUNT(*) AS result GROUP BY b") + // Window + def checkWindow(query: String): Unit = 
check(query, Seq(WITH_WINDOW_DEFINITION)) + checkWindow( + """ + |TABLE windowTestData + ||> SELECT cate, SUM(val) OVER w + | WINDOW w AS (PARTITION BY cate ORDER BY val) + |""".stripMargin) + withSQLConf(SQLConf.OPERATOR_PIPE_SYNTAX_ENABLED.key -> "false") { + val sql = s"TABLE t |> SELECT 1 AS X" + checkError( + exception = parseException(sql), + condition = "_LEGACY_ERROR_TEMP_0035", + parameters = Map("message" -> "Operator pipe SQL syntax using |>"), + context = ExpectedContext( + fragment = sql, + start = 0, + stop = sql.length - 1)) } } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/LargeRowBenchmark.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/LargeRowBenchmark.scala new file mode 100644 index 0000000000000..8b4f78e79913a --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/LargeRowBenchmark.scala @@ -0,0 +1,85 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.execution.benchmark + +import org.apache.spark.benchmark.Benchmark +import org.apache.spark.sql.functions.lit + +/** + * Benchmark to measure performance for large row table. + * {{{ + * To run this benchmark: + * 1. 
without sbt: bin/spark-submit --class + * --jars , + * 2. build/sbt "sql/Test/runMain " + * 3. generate result: SPARK_GENERATE_BENCHMARK_FILES=1 build/sbt "sql/Test/runMain " + * Results will be written to "benchmarks/LargeRowBenchmark-results.txt". + * }}} + */ +object LargeRowBenchmark extends SqlBasedBenchmark { + + /** + * Prepares a table with large row for benchmarking. The table will be written into + * the given path. + */ + private def writeLargeRow(path: String, rowsNum: Int, numCols: Int, cellSizeMb: Double): Unit = { + val stringLength = (cellSizeMb * 1024 * 1024).toInt + spark.range(rowsNum) + .select(Seq.tabulate(numCols)(i => lit("a" * stringLength).as(s"col$i")): _*) + .write.parquet(path) + } + + private def runLargeRowBenchmark(rowsNum: Int, numCols: Int, cellSizeMb: Double): Unit = { + withTempPath { path => + val benchmark = new Benchmark( + s"#rows: $rowsNum, #cols: $numCols, cell: $cellSizeMb MB", rowsNum, output = output) + writeLargeRow(path.getAbsolutePath, rowsNum, numCols, cellSizeMb) + val df = spark.read.parquet(path.getAbsolutePath) + df.createOrReplaceTempView("T") + benchmark.addCase("built-in UPPER") { _ => + val sqlSelect = df.columns.map(c => s"UPPER($c) as $c").mkString(", ") + spark.sql(s"SELECT $sqlSelect FROM T").noop() + } + benchmark.addCase("udf UPPER") { _ => + val sqlSelect = df.columns.map(c => s"udfUpper($c) as $c").mkString(", ") + spark.sql(s"SELECT $sqlSelect FROM T").noop() + } + benchmark.run() + } + } + + override def runBenchmarkSuite(mainArgs: Array[String]): Unit = { + runBenchmark("Large Row Benchmark") { + val udfUpper = (s: String) => s.toUpperCase() + spark.udf.register("udfUpper", udfUpper(_: String): String) + + val benchmarks = Array( + Map("rows" -> 100, "cols" -> 10, "cellSizeMb" -> 1.3), // OutOfMemory @ 100, 10, 1.4 + Map("rows" -> 1, "cols" -> 1, "cellSizeMb" -> 300.0), // OutOfMemory @ 1, 1, 400 + Map("rows" -> 1, "cols" -> 200, "cellSizeMb" -> 1.0) // OutOfMemory @ 1, 300, 1 + ) + + 
benchmarks.foreach { b => + val rows = b("rows").asInstanceOf[Int] + val cols = b("cols").asInstanceOf[Int] + val cellSizeMb = b("cellSizeMb").asInstanceOf[Double] + runLargeRowBenchmark(rows, cols, cellSizeMb) + } + } + } +} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/SetOperationsBenchmark.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/SetOperationsBenchmark.scala new file mode 100644 index 0000000000000..379e31ead2f31 --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/SetOperationsBenchmark.scala @@ -0,0 +1,90 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.execution.benchmark + +import org.apache.spark.benchmark.Benchmark + +/** + * Benchmark to measure performance for set operations. + * To run this benchmark: + * {{{ + * 1. without sbt: + * bin/spark-submit --class + * --jars , + * 2. build/sbt "sql/Test/runMain " + * 3. generate result: + * SPARK_GENERATE_BENCHMARK_FILES=1 build/sbt "sql/Test/runMain " + * Results will be written to "benchmarks/SetOperationsBenchmark-results.txt". 
+ * }}} + */ +object SetOperationsBenchmark extends SqlBasedBenchmark { + private val setOperations = Seq("UNION ALL", "EXCEPT ALL", "INTERSECT ALL") + + override def runBenchmarkSuite(mainArgs: Array[String]): Unit = { + runBenchmark("Set Operations Benchmark") { + val numOperations = 500 + val numValues = 30 + + val benchmark = + new Benchmark( + "Parsing + Analysis", + valuesPerIteration = numOperations * numValues, + output = output + ) + + for (operation <- setOperations) { + benchmark.addCase(operation) { _ => + spark + .sql( + generateQuery( + operation = operation, + numOperations = numOperations, + numValues = numValues + ) + ) + .queryExecution + .analyzed + () + } + } + + benchmark.run() + } + } + + private def generateQuery(operation: String, numOperations: Int, numValues: Int) = { + s""" + SELECT + * + FROM + ${generateOperations( + operation = operation, + numOperations = numOperations, + numValues = numValues + )} + """ + } + + private def generateOperations(operation: String, numOperations: Int, numValues: Int) = { + (0 until numOperations).map(_ => generateValues(numValues)).mkString(s" ${operation} ") + } + + private def generateValues(num: Int) = { + s"VALUES (${(0 until num).mkString(", ")})" + } +} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/AlterTableAddPartitionSuiteBase.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/AlterTableAddPartitionSuiteBase.scala index cb25942822f46..13ea6f5a30536 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/AlterTableAddPartitionSuiteBase.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/AlterTableAddPartitionSuiteBase.scala @@ -23,6 +23,7 @@ import org.apache.spark.SparkNumberFormatException import org.apache.spark.sql.{AnalysisException, QueryTest, Row} import org.apache.spark.sql.catalyst.parser.CatalystSqlParser import org.apache.spark.sql.catalyst.util.quoteIdentifier +import 
org.apache.spark.sql.connector.catalog.CatalogManager.SESSION_CATALOG_NAME import org.apache.spark.sql.internal.SQLConf /** @@ -97,10 +98,20 @@ trait AlterTableAddPartitionSuiteBase extends QueryTest with DDLCommandTestUtils withNamespaceAndTable("ns", "tbl") { t => spark.sql(s"CREATE TABLE $t (id bigint, data string) $defaultUsing PARTITIONED BY (id)") withSQLConf(SQLConf.CASE_SENSITIVE.key -> "true") { - val errMsg = intercept[AnalysisException] { - spark.sql(s"ALTER TABLE $t ADD PARTITION (ID=1) LOCATION 'loc1'") - }.getMessage - assert(errMsg.contains("ID is not a valid partition column")) + val expectedTableName = if (commandVersion == DDLCommandTestUtils.V1_COMMAND_VERSION) { + s"`$SESSION_CATALOG_NAME`.`ns`.`tbl`" + } else { + "`test_catalog`.`ns`.`tbl`" + } + checkError( + exception = intercept[AnalysisException] { + spark.sql(s"ALTER TABLE $t ADD PARTITION (ID=1) LOCATION 'loc1'") + }, + condition = "PARTITIONS_NOT_FOUND", + parameters = Map( + "partitionList" -> "`ID`", + "tableName" -> expectedTableName) + ) } withSQLConf(SQLConf.CASE_SENSITIVE.key -> "false") { spark.sql(s"ALTER TABLE $t ADD PARTITION (ID=1) LOCATION 'loc1'") diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/AlterTableDropPartitionSuiteBase.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/AlterTableDropPartitionSuiteBase.scala index 279042f675cd5..a49a94174195c 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/AlterTableDropPartitionSuiteBase.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/AlterTableDropPartitionSuiteBase.scala @@ -21,6 +21,7 @@ import org.apache.spark.sql.{AnalysisException, QueryTest, Row} import org.apache.spark.sql.catalyst.analysis.NoSuchPartitionsException import org.apache.spark.sql.catalyst.parser.CatalystSqlParser import org.apache.spark.sql.catalyst.util.quoteIdentifier +import org.apache.spark.sql.connector.catalog.CatalogManager.SESSION_CATALOG_NAME import 
org.apache.spark.sql.internal.SQLConf /** @@ -103,10 +104,20 @@ trait AlterTableDropPartitionSuiteBase extends QueryTest with DDLCommandTestUtil withNamespaceAndTable("ns", "tbl") { t => sql(s"CREATE TABLE $t (id bigint, data string) $defaultUsing PARTITIONED BY (id)") withSQLConf(SQLConf.CASE_SENSITIVE.key -> "true") { - val errMsg = intercept[AnalysisException] { - sql(s"ALTER TABLE $t DROP PARTITION (ID=1)") - }.getMessage - assert(errMsg.contains("ID is not a valid partition column")) + val expectedTableName = if (commandVersion == DDLCommandTestUtils.V1_COMMAND_VERSION) { + s"`$SESSION_CATALOG_NAME`.`ns`.`tbl`" + } else { + "`test_catalog`.`ns`.`tbl`" + } + checkError( + exception = intercept[AnalysisException] { + sql(s"ALTER TABLE $t DROP PARTITION (ID=1)") + }, + condition = "PARTITIONS_NOT_FOUND", + parameters = Map( + "partitionList" -> "`ID`", + "tableName" -> expectedTableName) + ) } withSQLConf(SQLConf.CASE_SENSITIVE.key -> "false") { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/AlterTableRenamePartitionSuiteBase.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/AlterTableRenamePartitionSuiteBase.scala index 905e6cfb9caaa..186f2b293ea81 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/AlterTableRenamePartitionSuiteBase.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/AlterTableRenamePartitionSuiteBase.scala @@ -21,6 +21,7 @@ import org.apache.spark.sql.{AnalysisException, QueryTest, Row} import org.apache.spark.sql.catalyst.analysis.{NoSuchPartitionException, PartitionsAlreadyExistException} import org.apache.spark.sql.catalyst.parser.CatalystSqlParser import org.apache.spark.sql.catalyst.util.quoteIdentifier +import org.apache.spark.sql.connector.catalog.CatalogManager.SESSION_CATALOG_NAME import org.apache.spark.sql.internal.SQLConf /** @@ -170,10 +171,20 @@ trait AlterTableRenamePartitionSuiteBase extends QueryTest with DDLCommandTestUt 
checkPartitions(t, Map("id" -> "1")) withSQLConf(SQLConf.CASE_SENSITIVE.key -> "true") { - val errMsg = intercept[AnalysisException] { - sql(s"ALTER TABLE $t PARTITION (ID = 1) RENAME TO PARTITION (id = 2)") - }.getMessage - assert(errMsg.contains("ID is not a valid partition column")) + val expectedTableName = if (commandVersion == DDLCommandTestUtils.V1_COMMAND_VERSION) { + s"`$SESSION_CATALOG_NAME`.`ns`.`tbl`" + } else { + "`test_catalog`.`ns`.`tbl`" + } + checkError( + exception = intercept[AnalysisException] { + sql(s"ALTER TABLE $t PARTITION (ID = 1) RENAME TO PARTITION (id = 2)") + }, + condition = "PARTITIONS_NOT_FOUND", + parameters = Map( + "partitionList" -> "`ID`", + "tableName" -> expectedTableName) + ) checkPartitions(t, Map("id" -> "1")) } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/AlterTableSetTblPropertiesSuiteBase.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/AlterTableSetTblPropertiesSuiteBase.scala index 52a90497fdd37..9ec63acb1d3a8 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/AlterTableSetTblPropertiesSuiteBase.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/AlterTableSetTblPropertiesSuiteBase.scala @@ -20,7 +20,7 @@ package org.apache.spark.sql.execution.command import org.apache.spark.sql.{AnalysisException, QueryTest} import org.apache.spark.sql.catalyst.TableIdentifier import org.apache.spark.sql.catalyst.parser.ParseException -import org.apache.spark.sql.connector.catalog.{CatalogV2Util, TableCatalog} +import org.apache.spark.sql.connector.catalog.TableCatalog import org.apache.spark.sql.errors.DataTypeErrors.toSQLId import org.apache.spark.sql.internal.SQLConf @@ -89,7 +89,7 @@ trait AlterTableSetTblPropertiesSuiteBase extends QueryTest with DDLCommandTestU PROP_EXTERNAL -> "please use CREATE EXTERNAL TABLE" ) withSQLConf((SQLConf.LEGACY_PROPERTY_NON_RESERVED.key, "false")) { - 
CatalogV2Util.TABLE_RESERVED_PROPERTIES.filterNot(_ == PROP_COMMENT).foreach { key => + tableLegacyProperties.foreach { key => withNamespaceAndTable("ns", "tbl") { t => val sqlText = s"ALTER TABLE $t SET TBLPROPERTIES ('$key'='bar')" checkError( @@ -109,7 +109,7 @@ trait AlterTableSetTblPropertiesSuiteBase extends QueryTest with DDLCommandTestU } } withSQLConf((SQLConf.LEGACY_PROPERTY_NON_RESERVED.key, "true")) { - CatalogV2Util.TABLE_RESERVED_PROPERTIES.filterNot(_ == PROP_COMMENT).foreach { key => + tableLegacyProperties.foreach { key => Seq("OPTIONS", "TBLPROPERTIES").foreach { clause => withNamespaceAndTable("ns", "tbl") { t => sql(s"CREATE TABLE $t (key int) USING parquet $clause ('$key'='bar')") diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/AlterTableUnsetTblPropertiesSuiteBase.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/AlterTableUnsetTblPropertiesSuiteBase.scala index 0013919fca08f..0e9e9d9c60815 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/AlterTableUnsetTblPropertiesSuiteBase.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/AlterTableUnsetTblPropertiesSuiteBase.scala @@ -20,7 +20,7 @@ package org.apache.spark.sql.execution.command import org.apache.spark.sql.{AnalysisException, QueryTest} import org.apache.spark.sql.catalyst.TableIdentifier import org.apache.spark.sql.catalyst.parser.ParseException -import org.apache.spark.sql.connector.catalog.{CatalogV2Util, TableCatalog} +import org.apache.spark.sql.connector.catalog.TableCatalog import org.apache.spark.sql.errors.DataTypeErrors.toSQLId import org.apache.spark.sql.internal.SQLConf @@ -109,7 +109,7 @@ trait AlterTableUnsetTblPropertiesSuiteBase extends QueryTest with DDLCommandTes PROP_EXTERNAL -> "please use CREATE EXTERNAL TABLE" ) withSQLConf((SQLConf.LEGACY_PROPERTY_NON_RESERVED.key, "false")) { - CatalogV2Util.TABLE_RESERVED_PROPERTIES.filterNot(_ == PROP_COMMENT).foreach { key => + 
tableLegacyProperties.foreach { key => withNamespaceAndTable("ns", "tbl") { t => val sqlText = s"ALTER TABLE $t UNSET TBLPROPERTIES ('$key')" checkError( @@ -129,7 +129,7 @@ trait AlterTableUnsetTblPropertiesSuiteBase extends QueryTest with DDLCommandTes } } withSQLConf((SQLConf.LEGACY_PROPERTY_NON_RESERVED.key, "true")) { - CatalogV2Util.TABLE_RESERVED_PROPERTIES.filterNot(_ == PROP_COMMENT).foreach { key => + tableLegacyProperties.foreach { key => Seq("OPTIONS", "TBLPROPERTIES").foreach { clause => withNamespaceAndTable("ns", "tbl") { t => sql(s"CREATE TABLE $t (key int) USING parquet $clause ('$key'='bar')") diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/CreateSQLFunctionParserSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/CreateSQLFunctionParserSuite.scala new file mode 100644 index 0000000000000..75b42c6440719 --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/CreateSQLFunctionParserSuite.scala @@ -0,0 +1,203 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.execution.command + +import org.apache.spark.sql.catalyst.FunctionIdentifier +import org.apache.spark.sql.catalyst.analysis.{AnalysisTest, UnresolvedIdentifier} +import org.apache.spark.sql.catalyst.catalog.LanguageSQL +import org.apache.spark.sql.catalyst.plans.logical.CreateUserDefinedFunction +import org.apache.spark.sql.execution.SparkSqlParser + +class CreateSQLFunctionParserSuite extends AnalysisTest { + private lazy val parser = new SparkSqlParser() + + private def intercept(sqlCommand: String, messages: String*): Unit = + interceptParseException(parser.parsePlan)(sqlCommand, messages: _*)() + + private def checkParseError( + sqlCommand: String, + errorClass: String, + parameters: Map[String, String], + queryContext: Array[ExpectedContext] = Array.empty): Unit = + assertParseErrorClass(parser.parsePlan, sqlCommand, errorClass, parameters, queryContext) + + // scalastyle:off argcount + private def createSQLFunction( + nameParts: Seq[String], + inputParamText: Option[String] = None, + returnTypeText: String = "INT", + exprText: Option[String] = None, + queryText: Option[String] = None, + comment: Option[String] = None, + isDeterministic: Option[Boolean] = None, + containsSQL: Option[Boolean] = None, + isTableFunc: Boolean = false, + ignoreIfExists: Boolean = false, + replace: Boolean = false): CreateUserDefinedFunction = { + // scalastyle:on argcount + CreateUserDefinedFunction( + UnresolvedIdentifier(nameParts), + inputParamText = inputParamText, + returnTypeText = returnTypeText, + exprText = exprText, + queryText = queryText, + comment = comment, + isDeterministic = isDeterministic, + containsSQL = containsSQL, + language = LanguageSQL, + isTableFunc = isTableFunc, + ignoreIfExists = ignoreIfExists, + replace = replace) + } + + // scalastyle:off argcount + private def createSQLFunctionCommand( + name: String, + inputParamText: Option[String] = None, + returnTypeText: String = "INT", + exprText: Option[String] = None, + 
queryText: Option[String] = None, + comment: Option[String] = None, + isDeterministic: Option[Boolean] = None, + containsSQL: Option[Boolean] = None, + isTableFunc: Boolean = false, + ignoreIfExists: Boolean = false, + replace: Boolean = false): CreateSQLFunctionCommand = { + // scalastyle:on argcount + CreateSQLFunctionCommand( + FunctionIdentifier(name), + inputParamText = inputParamText, + returnTypeText = returnTypeText, + exprText = exprText, + queryText = queryText, + comment = comment, + isDeterministic = isDeterministic, + containsSQL = containsSQL, + isTableFunc = isTableFunc, + isTemp = true, + ignoreIfExists = ignoreIfExists, + replace = replace) + } + + test("create temporary SQL functions") { + comparePlans( + parser.parsePlan("CREATE TEMPORARY FUNCTION a() RETURNS INT RETURN 1"), + createSQLFunctionCommand("a", exprText = Some("1"))) + + comparePlans( + parser.parsePlan( + "CREATE TEMPORARY FUNCTION a(x INT) RETURNS TABLE (a INT) RETURN SELECT x"), + createSQLFunctionCommand( + name = "a", + inputParamText = Some("x INT"), + returnTypeText = "a INT", + queryText = Some("SELECT x"), + isTableFunc = true)) + + comparePlans( + parser.parsePlan("CREATE OR REPLACE TEMPORARY FUNCTION a() RETURNS INT RETURN 1"), + createSQLFunctionCommand("a", exprText = Some("1"), replace = true)) + + checkParseError( + "CREATE TEMPORARY FUNCTION a.b() RETURNS INT RETURN 1", + errorClass = "INVALID_SQL_SYNTAX.CREATE_TEMP_FUNC_WITH_DATABASE", + parameters = Map("database" -> "`a`"), + queryContext = Array( + ExpectedContext("CREATE TEMPORARY FUNCTION a.b() RETURNS INT RETURN 1", 0, 51) + ) + ) + + checkParseError( + "CREATE TEMPORARY FUNCTION a.b.c() RETURNS INT RETURN 1", + errorClass = "INVALID_SQL_SYNTAX.MULTI_PART_NAME", + parameters = Map( + "statement" -> "CREATE TEMPORARY FUNCTION", + "name" -> "`a`.`b`.`c`"), + queryContext = Array( + ExpectedContext("CREATE TEMPORARY FUNCTION a.b.c() RETURNS INT RETURN 1", 0, 53) + ) + ) + + checkParseError( + "CREATE TEMPORARY 
FUNCTION IF NOT EXISTS a() RETURNS INT RETURN 1", + errorClass = "INVALID_SQL_SYNTAX.CREATE_TEMP_FUNC_WITH_IF_NOT_EXISTS", + parameters = Map.empty, + queryContext = Array( + ExpectedContext("CREATE TEMPORARY FUNCTION IF NOT EXISTS a() RETURNS INT RETURN 1", 0, 63) + ) + ) + } + + test("create persistent SQL functions") { + comparePlans( + parser.parsePlan("CREATE FUNCTION a() RETURNS INT RETURN 1"), + createSQLFunction(Seq("a"), exprText = Some("1"))) + + comparePlans( + parser.parsePlan("CREATE FUNCTION a.b(x INT) RETURNS INT RETURN x"), + createSQLFunction(Seq("a", "b"), Some("x INT"), exprText = Some("x"))) + + comparePlans(parser.parsePlan( + "CREATE FUNCTION a.b.c(x INT) RETURNS TABLE (a INT) RETURN SELECT x"), + createSQLFunction(Seq("a", "b", "c"), Some("x INT"), returnTypeText = "a INT", None, + Some("SELECT x"), isTableFunc = true)) + + comparePlans(parser.parsePlan("CREATE FUNCTION IF NOT EXISTS a() RETURNS INT RETURN 1"), + createSQLFunction(Seq("a"), exprText = Some("1"), ignoreIfExists = true) + ) + + comparePlans(parser.parsePlan("CREATE OR REPLACE FUNCTION a() RETURNS INT RETURN 1"), + createSQLFunction(Seq("a"), exprText = Some("1"), replace = true)) + + comparePlans( + parser.parsePlan( + """ + |CREATE FUNCTION a(x INT COMMENT 'x') RETURNS INT + |LANGUAGE SQL DETERMINISTIC CONTAINS SQL + |COMMENT 'function' + |RETURN x + |""".stripMargin), + createSQLFunction(Seq("a"), inputParamText = Some("x INT COMMENT 'x'"), + exprText = Some("x"), isDeterministic = Some(true), containsSQL = Some(true), + comment = Some("function")) + ) + + intercept("CREATE OR REPLACE FUNCTION IF NOT EXISTS a() RETURNS INT RETURN 1", + "Cannot create a routine with both IF NOT EXISTS and REPLACE specified") + } + + test("create SQL functions with unsupported routine characteristics") { + intercept("CREATE FUNCTION foo() RETURNS INT LANGUAGE blah RETURN 1", + "Operation not allowed: Unsupported language for user defined functions: blah") + + intercept("CREATE FUNCTION foo() 
RETURNS INT SPECIFIC foo1 RETURN 1", + "Operation not allowed: SQL function with SPECIFIC name is not supported") + + intercept("CREATE FUNCTION foo() RETURNS INT NO SQL RETURN 1", + "Operation not allowed: SQL function with NO SQL is not supported") + + intercept("CREATE FUNCTION foo() RETURNS INT NO SQL CONTAINS SQL RETURN 1", + "Found duplicate clauses: SQL DATA ACCESS") + + intercept("CREATE FUNCTION foo() RETURNS INT RETURNS NULL ON NULL INPUT RETURN 1", + "Operation not allowed: SQL function with RETURNS NULL ON NULL INPUT is not supported") + + intercept("CREATE FUNCTION foo() RETURNS INT SQL SECURITY INVOKER RETURN 1", + "Operation not allowed: SQL function with SQL SECURITY INVOKER is not supported") + } +} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLCommandTestUtils.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLCommandTestUtils.scala index 39f2abd35c2b5..39624a33d8614 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLCommandTestUtils.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLCommandTestUtils.scala @@ -26,6 +26,7 @@ import org.scalatest.Tag import org.apache.spark.sql.{QueryTest, Row} import org.apache.spark.sql.catalyst.catalog.CatalogTypes.TablePartitionSpec +import org.apache.spark.sql.connector.catalog.{CatalogV2Util, TableCatalog} import org.apache.spark.sql.execution.datasources.PartitioningUtils import org.apache.spark.sql.test.SQLTestUtils @@ -172,6 +173,11 @@ trait DDLCommandTestUtils extends SQLTestUtils { FileUtils.copyDirectory(new File(part0Loc), new File(part1Loc)) part1Loc } + + def tableLegacyProperties: Seq[String] = { + val excludedProperties = Set(TableCatalog.PROP_COMMENT, TableCatalog.PROP_COLLATION) + CatalogV2Util.TABLE_RESERVED_PROPERTIES.filterNot(excludedProperties.contains) + } } object DDLCommandTestUtils { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLParserSuite.scala 
b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLParserSuite.scala index 8b868c0e17230..3dea8593b428d 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLParserSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLParserSuite.scala @@ -18,7 +18,6 @@ package org.apache.spark.sql.execution.command import org.apache.spark.SparkThrowable -import org.apache.spark.sql.catalyst.FunctionIdentifier import org.apache.spark.sql.catalyst.analysis.{AnalysisTest, GlobalTempView, LocalTempView, SchemaCompensation, UnresolvedAttribute, UnresolvedFunctionName, UnresolvedIdentifier} import org.apache.spark.sql.catalyst.catalog.{ArchiveResource, FileResource, FunctionResource, JarResource} import org.apache.spark.sql.catalyst.dsl.expressions._ @@ -37,9 +36,6 @@ class DDLParserSuite extends AnalysisTest with SharedSparkSession { super.parseException(parser.parsePlan)(sqlText) } - private def intercept(sqlCommand: String, messages: String*): Unit = - interceptParseException(parser.parsePlan)(sqlCommand, messages: _*)() - private def compareTransformQuery(sql: String, expected: LogicalPlan): Unit = { val plan = parser.parsePlan(sql).asInstanceOf[ScriptTransformation].copy(ioschema = null) comparePlans(plan, expected, checkAnalysis = false) @@ -498,6 +494,7 @@ class DDLParserSuite extends AnalysisTest with SharedSparkSession { UnresolvedIdentifier(Seq("view1")), Seq.empty[(String, Option[String])], None, + None, Map.empty[String, String], Some("SELECT * FROM tab1"), parser.parsePlan("SELECT * FROM tab1"), @@ -513,6 +510,7 @@ class DDLParserSuite extends AnalysisTest with SharedSparkSession { Seq("a").asTableIdentifier, Seq.empty[(String, Option[String])], None, + None, Map.empty[String, String], Some("SELECT * FROM tab1"), parser.parsePlan("SELECT * FROM tab1"), @@ -539,6 +537,7 @@ class DDLParserSuite extends AnalysisTest with SharedSparkSession { |(col1, col3 COMMENT 'hello') 
|TBLPROPERTIES('prop1Key'="prop1Val") |COMMENT 'BLABLA' + |DEFAULT COLLATION uNiCodE |AS SELECT * FROM tab1 """.stripMargin val parsed1 = parser.parsePlan(v1) @@ -546,6 +545,7 @@ class DDLParserSuite extends AnalysisTest with SharedSparkSession { UnresolvedIdentifier(Seq("view1")), Seq("col1" -> None, "col3" -> Some("hello")), Some("BLABLA"), + Some("UNICODE"), Map("prop1Key" -> "prop1Val"), Some("SELECT * FROM tab1"), parser.parsePlan("SELECT * FROM tab1"), @@ -559,6 +559,7 @@ class DDLParserSuite extends AnalysisTest with SharedSparkSession { |CREATE OR REPLACE GLOBAL TEMPORARY VIEW a |(col1, col3 COMMENT 'hello') |COMMENT 'BLABLA' + |DEFAULT COLLATION uNiCoDe |AS SELECT * FROM tab1 """.stripMargin val parsed2 = parser.parsePlan(v2) @@ -566,6 +567,7 @@ class DDLParserSuite extends AnalysisTest with SharedSparkSession { Seq("a").asTableIdentifier, Seq("col1" -> None, "col3" -> Some("hello")), Some("BLABLA"), + Some("UNICODE"), Map(), Some("SELECT * FROM tab1"), parser.parsePlan("SELECT * FROM tab1"), @@ -821,44 +823,4 @@ class DDLParserSuite extends AnalysisTest with SharedSparkSession { parser.parsePlan("SHOW CATALOGS LIKE 'defau*'"), ShowCatalogsCommand(Some("defau*"))) } - - test("Create SQL functions") { - comparePlans( - parser.parsePlan("CREATE TEMP FUNCTION foo() RETURNS INT RETURN 1"), - CreateSQLFunctionCommand( - FunctionIdentifier("foo"), - inputParamText = None, - returnTypeText = "INT", - exprText = Some("1"), - queryText = None, - comment = None, - isDeterministic = None, - containsSQL = None, - isTableFunc = false, - isTemp = true, - ignoreIfExists = false, - replace = false)) - intercept("CREATE FUNCTION foo() RETURNS INT RETURN 1", - "Operation not allowed: creating persistent SQL functions is not supported") - } - - test("create SQL functions with unsupported routine characteristics") { - intercept("CREATE FUNCTION foo() RETURNS INT LANGUAGE blah RETURN 1", - "Operation not allowed: Unsupported language for user defined functions: blah") - - 
intercept("CREATE FUNCTION foo() RETURNS INT SPECIFIC foo1 RETURN 1", - "Operation not allowed: SQL function with SPECIFIC name is not supported") - - intercept("CREATE FUNCTION foo() RETURNS INT NO SQL RETURN 1", - "Operation not allowed: SQL function with NO SQL is not supported") - - intercept("CREATE FUNCTION foo() RETURNS INT NO SQL CONTAINS SQL RETURN 1", - "Found duplicate clauses: SQL DATA ACCESS") - - intercept("CREATE FUNCTION foo() RETURNS INT RETURNS NULL ON NULL INPUT RETURN 1", - "Operation not allowed: SQL function with RETURNS NULL ON NULL INPUT is not supported") - - intercept("CREATE FUNCTION foo() RETURNS INT SQL SECURITY INVOKER RETURN 1", - "Operation not allowed: SQL function with SQL SECURITY INVOKER is not supported") - } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala index 32a63f5c61976..d91d762048d29 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala @@ -2324,9 +2324,9 @@ abstract class DDLSuite extends QueryTest with DDLSuiteBase { // Plain `StringType`. sql("CREATE TABLE t1(col STRING) USING parquet") sql("INSERT INTO t1 VALUES ('a')") - checkAnswer(sql("SELECT COLLATION(col) FROM t1"), Row("UTF8_BINARY")) + checkAnswer(sql("SELECT COLLATION(col) FROM t1"), Row("SYSTEM.BUILTIN.UTF8_BINARY")) sql("ALTER TABLE t1 ALTER COLUMN col TYPE STRING COLLATE UTF8_LCASE") - checkAnswer(sql("SELECT COLLATION(col) FROM t1"), Row("UTF8_LCASE")) + checkAnswer(sql("SELECT COLLATION(col) FROM t1"), Row("SYSTEM.BUILTIN.UTF8_LCASE")) // Invalid "ALTER COLUMN" to Integer. val alterInt = "ALTER TABLE t1 ALTER COLUMN col TYPE INTEGER" @@ -2348,23 +2348,23 @@ abstract class DDLSuite extends QueryTest with DDLSuiteBase { // `ArrayType` with collation. 
sql("CREATE TABLE t2(col ARRAY) USING parquet") sql("INSERT INTO t2 VALUES (ARRAY('a'))") - checkAnswer(sql("SELECT COLLATION(col[0]) FROM t2"), Row("UTF8_BINARY")) + checkAnswer(sql("SELECT COLLATION(col[0]) FROM t2"), Row("SYSTEM.BUILTIN.UTF8_BINARY")) assertThrows[AnalysisException] { sql("ALTER TABLE t2 ALTER COLUMN col TYPE ARRAY") } - checkAnswer(sql("SELECT COLLATION(col[0]) FROM t2"), Row("UTF8_BINARY")) + checkAnswer(sql("SELECT COLLATION(col[0]) FROM t2"), Row("SYSTEM.BUILTIN.UTF8_BINARY")) // `MapType` with collation. sql("CREATE TABLE t3(col MAP) USING parquet") sql("INSERT INTO t3 VALUES (MAP('k', 'v'))") - checkAnswer(sql("SELECT COLLATION(col['k']) FROM t3"), Row("UTF8_BINARY")) + checkAnswer(sql("SELECT COLLATION(col['k']) FROM t3"), Row("SYSTEM.BUILTIN.UTF8_BINARY")) assertThrows[AnalysisException] { sql( """ |ALTER TABLE t3 ALTER COLUMN col TYPE |MAP""".stripMargin) } - checkAnswer(sql("SELECT COLLATION(col['k']) FROM t3"), Row("UTF8_BINARY")) + checkAnswer(sql("SELECT COLLATION(col['k']) FROM t3"), Row("SYSTEM.BUILTIN.UTF8_BINARY")) // Invalid change of map key collation. val alterMap = @@ -2388,11 +2388,11 @@ abstract class DDLSuite extends QueryTest with DDLSuiteBase { // `StructType` with collation. 
sql("CREATE TABLE t4(col STRUCT) USING parquet") sql("INSERT INTO t4 VALUES (NAMED_STRUCT('a', 'value'))") - checkAnswer(sql("SELECT COLLATION(col.a) FROM t4"), Row("UTF8_BINARY")) + checkAnswer(sql("SELECT COLLATION(col.a) FROM t4"), Row("SYSTEM.BUILTIN.UTF8_BINARY")) assertThrows[AnalysisException] { sql("ALTER TABLE t4 ALTER COLUMN col TYPE STRUCT") } - checkAnswer(sql("SELECT COLLATION(col.a) FROM t4"), Row("UTF8_BINARY")) + checkAnswer(sql("SELECT COLLATION(col.a) FROM t4"), Row("SYSTEM.BUILTIN.UTF8_BINARY")) } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DescribeTableParserSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DescribeTableParserSuite.scala index 944f20bf8e924..f8174d24c9499 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DescribeTableParserSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DescribeTableParserSuite.scala @@ -17,11 +17,14 @@ package org.apache.spark.sql.execution.command +import org.apache.spark.sql.AnalysisException import org.apache.spark.sql.catalyst.analysis.{AnalysisTest, UnresolvedAttribute, UnresolvedTableOrView} -import org.apache.spark.sql.catalyst.parser.CatalystSqlParser.parsePlan import org.apache.spark.sql.catalyst.plans.logical.{DescribeColumn, DescribeRelation} +import org.apache.spark.sql.test.SharedSparkSession + +class DescribeTableParserSuite extends SharedSparkSession with AnalysisTest { + private def parsePlan(statement: String) = spark.sessionState.sqlParser.parsePlan(statement) -class DescribeTableParserSuite extends AnalysisTest { test("SPARK-17328: Fix NPE with EXPLAIN DESCRIBE TABLE") { comparePlans(parsePlan("describe t"), DescribeRelation( @@ -75,6 +78,12 @@ class DescribeTableParserSuite extends AnalysisTest { UnresolvedAttribute(Seq("col")), isExtended = true)) + val error = intercept[AnalysisException](parsePlan("DESCRIBE EXTENDED t col AS JSON")) + + checkError( + exception = error, + 
condition = "UNSUPPORTED_FEATURE.DESC_TABLE_COLUMN_JSON") + val sql = "DESCRIBE TABLE t PARTITION (ds='1970-01-01') col" checkError( exception = parseException(parsePlan)(sql), @@ -85,4 +94,17 @@ class DescribeTableParserSuite extends AnalysisTest { start = 0, stop = 47)) } + + test("retain sql text position") { + val tbl = "unknown" + val sqlStatement = s"DESCRIBE TABLE $tbl" + val startPos = sqlStatement.indexOf(tbl) + assert(startPos != -1) + assertAnalysisErrorCondition( + parsePlan(sqlStatement), + "TABLE_OR_VIEW_NOT_FOUND", + Map("relationName" -> s"`$tbl`"), + Array(ExpectedContext(tbl, startPos, startPos + tbl.length - 1)) + ) + } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DescribeTableSuiteBase.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DescribeTableSuiteBase.scala index c4e9ff93ef85d..f8d2e9dd3a3cb 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DescribeTableSuiteBase.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DescribeTableSuiteBase.scala @@ -293,4 +293,29 @@ trait DescribeTableSuiteBase extends QueryTest with DDLCommandTestUtils { Row("col1", "string", null))) } } + + Seq(true, false).foreach { hasCollations => + test(s"DESCRIBE TABLE EXTENDED with collation specified = $hasCollations") { + + withNamespaceAndTable("ns", "tbl") { tbl => + val getCollationDescription = () => sql(s"DESCRIBE TABLE EXTENDED $tbl") + .where("col_name = 'Collation'") + + val defaultCollation = if (hasCollations) "DEFAULT COLLATION uNiCoDe" else "" + + sql(s"CREATE TABLE $tbl (id string) $defaultUsing $defaultCollation") + val descriptionDf = getCollationDescription() + + if (hasCollations) { + checkAnswer(descriptionDf, Seq(Row("Collation", "UNICODE", ""))) + } else { + assert(descriptionDf.isEmpty) + } + + sql(s"ALTER TABLE $tbl DEFAULT COLLATION UniCode_cI_rTrIm") + val newDescription = getCollationDescription() + checkAnswer(newDescription, 
Seq(Row("Collation", "UNICODE_CI_RTRIM", ""))) + } + } + } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/PlanResolutionSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/PlanResolutionSuite.scala index 92467cbcb6c05..2cc203129817b 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/PlanResolutionSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/PlanResolutionSuite.scala @@ -31,7 +31,7 @@ import org.apache.spark.sql.catalyst.analysis.{AnalysisContext, AnalysisTest, An import org.apache.spark.sql.catalyst.catalog.{BucketSpec, CatalogStorageFormat, CatalogTable, CatalogTableType, InMemoryCatalog, SessionCatalog, TempVariableManager} import org.apache.spark.sql.catalyst.expressions.{AttributeReference, Cast, EqualTo, Expression, InSubquery, IntegerLiteral, ListQuery, Literal, StringLiteral} import org.apache.spark.sql.catalyst.expressions.objects.StaticInvoke -import org.apache.spark.sql.catalyst.parser.{CatalystSqlParser, ParseException} +import org.apache.spark.sql.catalyst.parser.ParseException import org.apache.spark.sql.catalyst.plans.logical.{AlterColumn, AnalysisOnlyCommand, AppendData, Assignment, CreateTable, CreateTableAsSelect, DeleteAction, DeleteFromTable, DescribeRelation, DropTable, InsertAction, InsertIntoStatement, LocalRelation, LogicalPlan, MergeIntoTable, OneRowRelation, OverwriteByExpression, OverwritePartitionsDynamic, Project, SetTableLocation, SetTableProperties, ShowTableProperties, SubqueryAlias, UnsetTableProperties, UpdateAction, UpdateTable} import org.apache.spark.sql.catalyst.rules.Rule import org.apache.spark.sql.catalyst.util.TypeUtils.toSQLId @@ -45,11 +45,12 @@ import org.apache.spark.sql.execution.datasources.v2.DataSourceV2Relation import org.apache.spark.sql.internal.{HiveSerDe, SQLConf} import org.apache.spark.sql.internal.SQLConf.{PARTITION_OVERWRITE_MODE, PartitionOverwriteMode} import 
org.apache.spark.sql.sources.SimpleScanSource +import org.apache.spark.sql.test.SharedSparkSession import org.apache.spark.sql.types.{BooleanType, CharType, DoubleType, IntegerType, LongType, StringType, StructField, StructType, VarcharType} import org.apache.spark.unsafe.types.UTF8String -class PlanResolutionSuite extends AnalysisTest { - import CatalystSqlParser._ +class PlanResolutionSuite extends SharedSparkSession with AnalysisTest { + private def parsePlan(statement: String) = spark.sessionState.sqlParser.parsePlan(statement) private val v1Format = classOf[SimpleScanSource].getName private val v2Format = classOf[FakeV2Provider].getName @@ -240,7 +241,7 @@ class PlanResolutionSuite extends AnalysisTest { } // We don't check analysis here by default, as we expect the plan to be unresolved // such as `CreateTable`. - val analyzed = analyzer.execute(CatalystSqlParser.parsePlan(query)) + val analyzed = analyzer.execute(parsePlan(query)) if (checkAnalysis) { analyzer.checkAnalysis(analyzed) } @@ -2867,9 +2868,8 @@ class PlanResolutionSuite extends AnalysisTest { exception = intercept[ParseException] { parsePlan(query) }, - condition = "_LEGACY_ERROR_TEMP_0035", - parameters = Map( - "message" -> "CREATE TEMPORARY TABLE ..., use CREATE TEMPORARY VIEW instead"), + condition = "_LEGACY_ERROR_TEMP_0046", + parameters = Map(), context = ExpectedContext(fragment = query, start = 0, stop = 48)) } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/ShowPartitionsSuiteBase.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/ShowPartitionsSuiteBase.scala index 462b967a75900..f7d41556b4e6b 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/ShowPartitionsSuiteBase.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/ShowPartitionsSuiteBase.scala @@ -18,6 +18,7 @@ package org.apache.spark.sql.execution.command import org.apache.spark.sql.{AnalysisException, QueryTest, Row, SaveMode} 
+import org.apache.spark.sql.connector.catalog.CatalogManager.SESSION_CATALOG_NAME import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types.{StringType, StructType} @@ -66,10 +67,20 @@ trait ShowPartitionsSuiteBase extends QueryTest with DDLCommandTestUtils { test("non-partitioning columns") { withNamespaceAndTable("ns", "dateTable") { t => createDateTable(t) - val errMsg = intercept[AnalysisException] { - sql(s"SHOW PARTITIONS $t PARTITION(abcd=2015, xyz=1)") - }.getMessage - assert(errMsg.contains("abcd is not a valid partition column")) + val expectedTableName = if (commandVersion == DDLCommandTestUtils.V1_COMMAND_VERSION) { + s"`$SESSION_CATALOG_NAME`.`ns`.`datetable`" + } else { + "`test_catalog`.`ns`.`dateTable`" + } + checkError( + exception = intercept[AnalysisException] { + sql(s"SHOW PARTITIONS $t PARTITION(abcd=2015, xyz=1)") + }, + condition = "PARTITIONS_NOT_FOUND", + parameters = Map( + "partitionList" -> "`abcd`", + "tableName" -> expectedTableName) + ) } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/ShowTablesSuiteBase.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/ShowTablesSuiteBase.scala index f6a5f6a7da26a..dbeb67c253208 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/ShowTablesSuiteBase.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/ShowTablesSuiteBase.scala @@ -353,7 +353,7 @@ trait ShowTablesSuiteBase extends QueryTest with DDLCommandTestUtils { |View Text: SELECT id FROM $catalog.$namespace.$table |View Schema Mode: BINDING |View Catalog and Namespace: spark_catalog.default - |View Query Output Columns: [id] + |View Query Output Columns: [`id`] |Schema: root | |-- id: integer (nullable = true)""".stripMargin assert(actualLocalResult === expectedLocalResult) @@ -380,7 +380,7 @@ trait ShowTablesSuiteBase extends QueryTest with DDLCommandTestUtils { |View Text: SELECT id FROM $catalog.$namespace.$table |View 
Schema Mode: BINDING |View Catalog and Namespace: spark_catalog.default - |View Query Output Columns: [id] + |View Query Output Columns: [`id`] |Schema: root | |-- id: integer (nullable = true)""".stripMargin assert(actualGlobalResult1 === expectedGlobalResult1) @@ -398,7 +398,7 @@ trait ShowTablesSuiteBase extends QueryTest with DDLCommandTestUtils { |View Text: SELECT id FROM $catalog.$namespace.$table |View Schema Mode: BINDING |View Catalog and Namespace: spark_catalog.default - |View Query Output Columns: [id] + |View Query Output Columns: [`id`] |Schema: root | |-- id: integer (nullable = true)""".stripMargin assert(actualLocalResult2 === expectedLocalResult2) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/TruncateTableSuiteBase.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/TruncateTableSuiteBase.scala index 8c985ea1f0527..b61065f41c5e6 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/TruncateTableSuiteBase.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/TruncateTableSuiteBase.scala @@ -21,6 +21,7 @@ import org.apache.spark.sql.{AnalysisException, QueryTest, Row} import org.apache.spark.sql.catalyst.analysis.NoSuchPartitionException import org.apache.spark.sql.catalyst.parser.CatalystSqlParser import org.apache.spark.sql.catalyst.util.quoteIdentifier +import org.apache.spark.sql.connector.catalog.CatalogManager.SESSION_CATALOG_NAME import org.apache.spark.sql.internal.SQLConf /** @@ -103,10 +104,20 @@ trait TruncateTableSuiteBase extends QueryTest with DDLCommandTestUtils { } // throw exception if the column in partition spec is not a partition column. 
- val errMsg = intercept[AnalysisException] { - sql(s"TRUNCATE TABLE $t PARTITION (unknown = 1)") - }.getMessage - assert(errMsg.contains("unknown is not a valid partition column")) + val expectedTableName = if (commandVersion == DDLCommandTestUtils.V1_COMMAND_VERSION) { + s"`$SESSION_CATALOG_NAME`.`ns`.`parttable`" + } else { + "`test_catalog`.`ns`.`partTable`" + } + checkError( + exception = intercept[AnalysisException] { + sql(s"TRUNCATE TABLE $t PARTITION (unknown = 1)") + }, + condition = "PARTITIONS_NOT_FOUND", + parameters = Map( + "partitionList" -> "`unknown`", + "tableName" -> expectedTableName) + ) } } @@ -117,10 +128,28 @@ trait TruncateTableSuiteBase extends QueryTest with DDLCommandTestUtils { sql(s"CREATE TABLE $t (c0 INT) $defaultUsing") sql(s"INSERT INTO $t SELECT 0") - val errMsg = intercept[AnalysisException] { - sql(s"TRUNCATE TABLE $t PARTITION (c0=1)") - }.getMessage - assert(errMsg.contains(invalidPartColumnError)) + val expectedTableName = if (commandVersion == DDLCommandTestUtils.V1_COMMAND_VERSION) { + s"`$SESSION_CATALOG_NAME`.`ns`.`tbl`" + } else { + "`test_catalog`.`ns`.`tbl`" + } + val expectedCondition = if (commandVersion == DDLCommandTestUtils.V1_COMMAND_VERSION) { + "_LEGACY_ERROR_TEMP_1267" + } else { + "PARTITIONS_NOT_FOUND" + } + val expectedParameters = if (commandVersion == DDLCommandTestUtils.V1_COMMAND_VERSION) { + Map("tableIdentWithDB" -> expectedTableName) + } else { + Map("partitionList" -> "`c0`", "tableName" -> expectedTableName) + } + checkError( + exception = intercept[AnalysisException] { + sql(s"TRUNCATE TABLE $t PARTITION (c0=1)") + }, + condition = expectedCondition, + parameters = expectedParameters + ) } } @@ -145,10 +174,20 @@ trait TruncateTableSuiteBase extends QueryTest with DDLCommandTestUtils { sql(s"INSERT INTO $t PARTITION (id=0) SELECT 'abc'") sql(s"INSERT INTO $t PARTITION (id=1) SELECT 'def'") withSQLConf(SQLConf.CASE_SENSITIVE.key -> "true") { - val errMsg = intercept[AnalysisException] { - 
sql(s"TRUNCATE TABLE $t PARTITION (ID=1)") - }.getMessage - assert(errMsg.contains("ID is not a valid partition column")) + val expectedTableName = if (commandVersion == DDLCommandTestUtils.V1_COMMAND_VERSION) { + s"`$SESSION_CATALOG_NAME`.`ns`.`tbl`" + } else { + "`test_catalog`.`ns`.`tbl`" + } + checkError( + exception = intercept[AnalysisException] { + sql(s"TRUNCATE TABLE $t PARTITION (ID=1)") + }, + condition = "PARTITIONS_NOT_FOUND", + parameters = Map( + "partitionList" -> "`ID`", + "tableName" -> expectedTableName) + ) } withSQLConf(SQLConf.CASE_SENSITIVE.key -> "false") { sql(s"TRUNCATE TABLE $t PARTITION (ID=1)") diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/v1/AlterTableSetLocationSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/v1/AlterTableSetLocationSuite.scala index 8f5af2e1f2e76..343a591fb5585 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/v1/AlterTableSetLocationSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/v1/AlterTableSetLocationSuite.scala @@ -93,8 +93,8 @@ trait AlterTableSetLocationSuiteBase extends command.AlterTableSetLocationSuiteB exception = intercept[AnalysisException] { sql(s"ALTER TABLE $t PARTITION (A='1', B='2') SET LOCATION '/path/to/part/ways3'") }, - condition = "_LEGACY_ERROR_TEMP_1231", - parameters = Map("key" -> "A", "tblName" -> "`spark_catalog`.`ns`.`tbl`") + condition = "PARTITIONS_NOT_FOUND", + parameters = Map("partitionList" -> "`A`", "tableName" -> "`spark_catalog`.`ns`.`tbl`") ) } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/v1/DescribeTableSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/v1/DescribeTableSuite.scala index eaf016ac2fa9f..3602853e53aa8 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/v1/DescribeTableSuite.scala +++ 
b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/v1/DescribeTableSuite.scala @@ -19,6 +19,10 @@ package org.apache.spark.sql.execution.command.v1 import java.util.Locale +import org.json4s._ +import org.json4s.jackson.JsonMethods.parse + +import org.apache.spark.SPARK_VERSION import org.apache.spark.sql.{AnalysisException, QueryTest, Row} import org.apache.spark.sql.connector.catalog.CatalogManager.SESSION_CATALOG_NAME import org.apache.spark.sql.execution.command @@ -36,9 +40,12 @@ import org.apache.spark.sql.types.StringType */ trait DescribeTableSuiteBase extends command.DescribeTableSuiteBase with command.TestsV1AndV2Commands { + implicit val formats: org.json4s.DefaultFormats.type = org.json4s.DefaultFormats def getProvider(): String = defaultUsing.stripPrefix("USING").trim.toLowerCase(Locale.ROOT) + val iso8601Regex = raw"^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}(\.\d+)?Z$$".r + test("Describing of a non-existent partition") { withNamespaceAndTable("ns", "table") { tbl => spark.sql(s"CREATE TABLE $tbl (id bigint, data string) $defaultUsing " + @@ -203,6 +210,410 @@ trait DescribeTableSuiteBase extends command.DescribeTableSuiteBase Row("histogram", "NULL"))) } } + + test("DESCRIBE AS JSON partitions, clusters, buckets") { + withNamespaceAndTable("ns", "table") { t => + val tableCreationStr = + s""" + |CREATE TABLE $t ( + | employee_id INT, + | employee_name STRING, + | department STRING, + | hire_date DATE + |) USING parquet + |OPTIONS ('compression' = 'snappy', 'max_records' = '1000') + |PARTITIONED BY (department, hire_date) + |CLUSTERED BY (employee_id) SORTED BY (employee_name ASC) INTO 4 BUCKETS + |COMMENT 'Employee data table for testing partitions and buckets' + |TBLPROPERTIES ('version' = '1.0') + |""".stripMargin + spark.sql(tableCreationStr) + val descriptionDf = spark.sql(s"DESCRIBE EXTENDED $t AS JSON") + val firstRow = descriptionDf.select("json_metadata").head() + val jsonValue = firstRow.getString(0) + val parsedOutput = 
parse(jsonValue).extract[DescribeTableJson] + + val expectedOutput = DescribeTableJson( + table_name = Some("table"), + catalog_name = Some(SESSION_CATALOG_NAME), + namespace = Some(List("ns")), + schema_name = Some("ns"), + columns = Some(List( + TableColumn("employee_id", Type("int"), true), + TableColumn("employee_name", Type("string"), true), + TableColumn("department", Type("string"), true), + TableColumn("hire_date", Type("date"), true) + )), + last_access = Some("UNKNOWN"), + created_by = Some(s"Spark $SPARK_VERSION"), + `type` = Some("MANAGED"), + provider = Some("parquet"), + bucket_columns = Some(List("employee_id")), + sort_columns = Some(List("employee_name")), + comment = Some("Employee data table for testing partitions and buckets"), + table_properties = Some(Map( + "version" -> "1.0" + )), + serde_library = if (getProvider() == "hive") { + Some("org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe") + } else { + None + }, + storage_properties = Some(Map( + "compression" -> "snappy", + "max_records" -> "1000" + )), + partition_provider = Some("Catalog"), + partition_columns = Some(List("department", "hire_date")) + ) + + assert(parsedOutput.location.isDefined) + assert(iso8601Regex.matches(parsedOutput.created_time.get)) + assert(expectedOutput == parsedOutput.copy(location = None, created_time = None)) + } + } + + test("DESCRIBE AS JSON partition spec") { + withNamespaceAndTable("ns", "table") { t => + val tableCreationStr = + s""" + |CREATE TABLE $t ( + | id INT, + | name STRING, + | region STRING, + | category STRING + |) USING parquet + |PARTITIONED BY (region, category) + |COMMENT 'test partition spec' + |TBLPROPERTIES ('t' = 'test') + |""".stripMargin + spark.sql(tableCreationStr) + spark.sql(s"ALTER TABLE $t ADD PARTITION (region='USA', category='tech')") + + val descriptionDf = + spark.sql(s"DESCRIBE FORMATTED $t PARTITION (region='USA', category='tech') AS JSON") + val firstRow = descriptionDf.select("json_metadata").head() + val 
jsonValue = firstRow.getString(0) + val parsedOutput = parse(jsonValue).extract[DescribeTableJson] + + val expectedOutput = DescribeTableJson( + table_name = Some("table"), + catalog_name = Some("spark_catalog"), + namespace = Some(List("ns")), + schema_name = Some("ns"), + columns = Some(List( + TableColumn("id", Type("int"), true), + TableColumn("name", Type("string"), true), + TableColumn("region", Type("string"), true), + TableColumn("category", Type("string"), true) + )), + last_access = Some("UNKNOWN"), + created_by = Some(s"Spark $SPARK_VERSION"), + `type` = Some("MANAGED"), + provider = Some("parquet"), + bucket_columns = Some(Nil), + sort_columns = Some(Nil), + comment = Some("test partition spec"), + table_properties = Some(Map( + "t" -> "test" + )), + serde_library = if (getProvider() == "hive") { + Some("org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe") + } else { + None + }, + partition_provider = Some("Catalog"), + partition_columns = Some(List("region", "category")), + partition_values = Some(Map("region" -> "USA", "category" -> "tech")) + ) + + assert(parsedOutput.location.isDefined) + assert(iso8601Regex.matches(parsedOutput.created_time.get)) + assert(expectedOutput == parsedOutput.copy( + location = None, created_time = None, storage_properties = None)) + } + } + + test("DESCRIBE AS JSON default values") { + withNamespaceAndTable("ns", "table") { t => + val tableCreationStr = + s""" + |CREATE TABLE $t ( + | id INT DEFAULT 1, + | name STRING DEFAULT 'unknown', + | created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, + | is_active BOOLEAN DEFAULT true + |) + |USING parquet COMMENT 'table_comment' + |""".stripMargin + spark.sql(tableCreationStr) + + val descriptionDf = spark.sql(s"DESC EXTENDED $t AS JSON") + val firstRow = descriptionDf.select("json_metadata").head() + val jsonValue = firstRow.getString(0) + val parsedOutput = parse(jsonValue).extract[DescribeTableJson] + + val expectedOutput = DescribeTableJson( + table_name = 
Some("table"), + catalog_name = Some("spark_catalog"), + namespace = Some(List("ns")), + schema_name = Some("ns"), + columns = Some(List( + TableColumn("id", Type("int"), default = Some("1")), + TableColumn("name", Type("string"), default = Some("'unknown'")), + TableColumn("created_at", Type("timestamp_ltz"), default = Some("CURRENT_TIMESTAMP")), + TableColumn("is_active", Type("boolean"), default = Some("true")) + )), + last_access = Some("UNKNOWN"), + created_by = Some(s"Spark $SPARK_VERSION"), + `type` = Some("MANAGED"), + storage_properties = None, + provider = Some("parquet"), + bucket_columns = Some(Nil), + sort_columns = Some(Nil), + comment = Some("table_comment"), + serde_library = if (getProvider() == "hive") { + Some("org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe") + } else { + None + }, + table_properties = None + ) + assert(parsedOutput.location.isDefined) + assert(iso8601Regex.matches(parsedOutput.created_time.get)) + assert(expectedOutput == parsedOutput.copy(location = None, created_time = None)) + } + } + + test("DESCRIBE AS JSON view") { + Seq(true, false).foreach { isTemp => + withNamespaceAndTable("ns", "table") { t => + withView("view") { + val tableCreationStr = + s""" + |CREATE TABLE $t (id INT, name STRING, created_at TIMESTAMP) + | USING parquet + | OPTIONS ('compression' 'snappy') + | CLUSTERED BY (id, name) SORTED BY (created_at) INTO 4 BUCKETS + | COMMENT 'test temp view' + | TBLPROPERTIES ('parquet.encryption' = 'true') + |""".stripMargin + spark.sql(tableCreationStr) + val viewType = if (isTemp) "TEMP VIEW" else "VIEW" + spark.sql(s"CREATE $viewType view AS SELECT * FROM $t") + val descriptionDf = spark.sql(s"DESCRIBE EXTENDED view AS JSON") + val firstRow = descriptionDf.select("json_metadata").head() + val jsonValue = firstRow.getString(0) + val parsedOutput = parse(jsonValue).extract[DescribeTableJson] + + val expectedOutput = DescribeTableJson( + table_name = Some("view"), + catalog_name = if (isTemp) Some("system") 
else Some("spark_catalog"), + namespace = if (isTemp) Some(List("session")) else Some(List("default")), + schema_name = if (isTemp) Some("session") else Some("default"), + columns = Some(List( + TableColumn("id", Type("int")), + TableColumn("name", Type("string")), + TableColumn("created_at", Type("timestamp_ltz")) + )), + last_access = Some("UNKNOWN"), + created_by = Some(s"Spark $SPARK_VERSION"), + `type` = Some("VIEW"), + view_text = Some("SELECT * FROM spark_catalog.ns.table"), + view_original_text = if (isTemp) None else Some("SELECT * FROM spark_catalog.ns.table"), + // TODO: this is unexpected and temp view should also use COMPENSATION mode. + view_schema_mode = if (isTemp) Some("BINDING") else Some("COMPENSATION"), + view_catalog_and_namespace = Some("spark_catalog.default"), + view_query_output_columns = Some(List("id", "name", "created_at")) + ) + + assert(iso8601Regex.matches(parsedOutput.created_time.get)) + assert(expectedOutput == parsedOutput.copy( + created_time = None, + table_properties = None, + storage_properties = None, + serde_library = None)) + } + } + } + } + + test("DESCRIBE AS JSON for column throws Analysis Exception") { + withNamespaceAndTable("ns", "table") { t => + val tableCreationStr = + s""" + |CREATE TABLE ns.table( + | cust_id INT, + | state VARCHAR(20), + | name STRING COMMENT "Short name" + | ) + | USING parquet + | PARTITIONED BY (state) + |""".stripMargin + spark.sql(tableCreationStr) + spark.sql("INSERT INTO ns.table PARTITION (state = \"CA\") VALUES (100, \"Jane\")") + val error = intercept[AnalysisException] { + spark.sql("DESCRIBE FORMATTED ns.table ns.table.name AS JSON") + } + + checkError( + exception = error, + condition = "UNSUPPORTED_FEATURE.DESC_TABLE_COLUMN_JSON") + } + } + + test("DESCRIBE AS JSON complex types") { + withNamespaceAndTable("ns", "table") { t => + val tableCreationStr = + s""" + |CREATE TABLE $t ( + | id STRING, + | logs VARIANT, + | nested_struct STRUCT< + | name: STRING, + | age: INT, + | contact: 
STRUCT< + | email: STRING, + | phone_numbers: ARRAY, + | addresses: ARRAY> + | > + | >, + | preferences MAP> + |) USING parquet + | OPTIONS (option1 'value1', option2 'value2') + | PARTITIONED BY (id) + | COMMENT 'A table with nested complex types' + | TBLPROPERTIES ('property1' = 'value1', 'password' = 'password') + """.stripMargin + spark.sql(tableCreationStr) + val descriptionDf = spark.sql(s"DESCRIBE EXTENDED $t AS JSON") + val firstRow = descriptionDf.select("json_metadata").head() + val jsonValue = firstRow.getString(0) + val parsedOutput = parse(jsonValue).extract[DescribeTableJson] + + val expectedOutput = DescribeTableJson( + table_name = Some("table"), + catalog_name = Some("spark_catalog"), + namespace = Some(List("ns")), + schema_name = Some("ns"), + columns = Some(List( + TableColumn( + name = "logs", + `type` = Type("variant"), + default = None + ), + TableColumn( + name = "nested_struct", + `type` = Type( + name = "struct", + fields = Some(List( + Field( + name = "name", + `type` = Type("string") + ), + Field( + name = "age", + `type` = Type("int") + ), + Field( + name = "contact", + `type` = Type( + name = "struct", + fields = Some(List( + Field( + name = "email", + `type` = Type("string") + ), + Field( + name = "phone_numbers", + `type` = Type( + name = "array", + element_type = Some(Type("string")), + element_nullable = Some(true) + ) + ), + Field( + name = "addresses", + `type` = Type( + name = "array", + element_type = Some(Type( + name = "struct", + fields = Some(List( + Field( + name = "street", + `type` = Type("string") + ), + Field( + name = "city", + `type` = Type("string") + ), + Field( + name = "zip", + `type` = Type("int") + ) + )) + )), + element_nullable = Some(true) + ) + ) + )) + ) + ) + )) + ), + default = None + ), + TableColumn( + name = "preferences", + `type` = Type( + name = "map", + key_type = Some(Type("string")), + value_type = Some(Type( + name = "array", + element_type = Some(Type("string")), + element_nullable = 
Some(true) + )), + value_nullable = Some(true) + ), + default = None + ), + TableColumn( + name = "id", + `type` = Type("string"), + default = None + ) + )), + serde_library = if (getProvider() == "hive") { + Some("org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe") + } else { + None + }, + storage_properties = Some(Map( + "option1" -> "value1", + "option2" -> "value2" + )), + last_access = Some("UNKNOWN"), + created_by = Some(s"Spark $SPARK_VERSION"), + `type` = Some("MANAGED"), + provider = Some("parquet"), + comment = Some("A table with nested complex types"), + table_properties = Some(Map( + "password" -> "*********(redacted)", + "property1" -> "value1" + )), + partition_provider = Some("Catalog"), + partition_columns = Some(List("id")) + ) + + assert(parsedOutput.location.isDefined) + assert(iso8601Regex.matches(parsedOutput.created_time.get)) + assert(expectedOutput == parsedOutput.copy(location = None, created_time = None)) + } + } } /** @@ -218,6 +629,7 @@ class DescribeTableSuite extends DescribeTableSuiteBase with CommandSuiteBase { " PARTITIONED BY (id)" + " TBLPROPERTIES ('bar'='baz')" + " COMMENT 'this is a test table'" + + " DEFAULT COLLATION unicode" + " LOCATION 'file:/tmp/testcat/table_name'") val descriptionDf = spark.sql(s"DESCRIBE TABLE EXTENDED $tbl") assert(descriptionDf.schema.map(field => (field.name, field.dataType)) === Seq( @@ -241,6 +653,7 @@ class DescribeTableSuite extends DescribeTableSuiteBase with CommandSuiteBase { Row("Type", "EXTERNAL", ""), Row("Provider", getProvider(), ""), Row("Comment", "this is a test table", ""), + Row("Collation", "UNICODE", ""), Row("Table Properties", "[bar=baz]", ""), Row("Location", "file:/tmp/testcat/table_name", ""), Row("Partition Provider", "Catalog", ""))) @@ -275,3 +688,63 @@ class DescribeTableSuite extends DescribeTableSuiteBase with CommandSuiteBase { } } } + +/** Represents JSON output of DESCRIBE TABLE AS JSON */ +case class DescribeTableJson( + table_name: Option[String] = None, + 
catalog_name: Option[String] = None, + namespace: Option[List[String]] = Some(Nil), + schema_name: Option[String] = None, + columns: Option[List[TableColumn]] = Some(Nil), + created_time: Option[String] = None, + last_access: Option[String] = None, + created_by: Option[String] = None, + `type`: Option[String] = None, + provider: Option[String] = None, + bucket_columns: Option[List[String]] = Some(Nil), + sort_columns: Option[List[String]] = Some(Nil), + comment: Option[String] = None, + table_properties: Option[Map[String, String]] = None, + location: Option[String] = None, + serde_library: Option[String] = None, + storage_properties: Option[Map[String, String]] = None, + partition_provider: Option[String] = None, + partition_columns: Option[List[String]] = Some(Nil), + partition_values: Option[Map[String, String]] = None, + view_text: Option[String] = None, + view_original_text: Option[String] = None, + view_schema_mode: Option[String] = None, + view_catalog_and_namespace: Option[String] = None, + view_query_output_columns: Option[List[String]] = None + ) + +/** Used for columns field of DescribeTableJson */ +case class TableColumn( + name: String, + `type`: Type, + element_nullable: Boolean = true, + comment: Option[String] = None, + default: Option[String] = None +) + +case class Type( + name: String, + fields: Option[List[Field]] = None, + `type`: Option[Type] = None, + element_type: Option[Type] = None, + key_type: Option[Type] = None, + value_type: Option[Type] = None, + comment: Option[String] = None, + default: Option[String] = None, + element_nullable: Option[Boolean] = Some(true), + value_nullable: Option[Boolean] = Some(true), + nullable: Option[Boolean] = Some(true) +) + +case class Field( + name: String, + `type`: Type, + element_nullable: Boolean = true, + comment: Option[String] = None, + default: Option[String] = None +) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/v1/ShowTablesSuite.scala 
b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/v1/ShowTablesSuite.scala index 92bea4d8655c5..9d353fde898f2 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/v1/ShowTablesSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/v1/ShowTablesSuite.scala @@ -207,7 +207,7 @@ class ShowTablesSuite extends ShowTablesSuiteBase with CommandSuiteBase { |View Original Text: SELECT id FROM $catalog.$namespace.$table |View Schema Mode: COMPENSATION |View Catalog and Namespace: $catalog.$namespace - |View Query Output Columns: [id] + |View Query Output Columns: [`id`] |Schema: root | |-- id: integer (nullable = true)""".stripMargin assert(actualResult === expectedResult) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/v2/ShowTablesSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/v2/ShowTablesSuite.scala index d66dca20d77b8..5719fbee370a8 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/v2/ShowTablesSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/v2/ShowTablesSuite.scala @@ -53,8 +53,8 @@ class ShowTablesSuite extends command.ShowTablesSuiteBase with CommandSuiteBase catalog: String, namespace: String, table: String): (String, Map[String, String]) = { - ("_LEGACY_ERROR_TEMP_1231", - Map("key" -> "id", "tblName" -> s"`$catalog`.`$namespace`.`$table`")) + ("PARTITIONS_NOT_FOUND", + Map("partitionList" -> "`id`", "tableName" -> s"`$catalog`.`$namespace`.`$table`")) } protected override def namespaceKey: String = "Namespace" diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/DataSourceResolverSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/DataSourceResolverSuite.scala new file mode 100644 index 0000000000000..016c1e2f5457d --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/DataSourceResolverSuite.scala @@ 
-0,0 +1,119 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.execution.datasources + +import org.apache.spark.sql.QueryTest +import org.apache.spark.sql.catalyst.analysis.UnresolvedRelation +import org.apache.spark.sql.catalyst.analysis.resolver.{MetadataResolver, Resolver} +import org.apache.spark.sql.catalyst.catalog.UnresolvedCatalogRelation +import org.apache.spark.sql.catalyst.plans.logical.SubqueryAlias +import org.apache.spark.sql.test.SharedSparkSession +import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType} + +class DataSourceResolverSuite extends QueryTest with SharedSparkSession { + private val keyValueTableSchema = StructType( + Seq( + StructField("key", IntegerType, true), + StructField("value", StringType, true) + ) + ) + + test("CSV relation") { + withTable("src_csv") { + spark.sql("CREATE TABLE src_csv (key INT, value STRING) USING CSV;").collect() + + checkResolveOperator( + sqlText = "SELECT * FROM src_csv", + expectedTableName = "spark_catalog.default.src_csv", + expectedTableSchema = keyValueTableSchema + ) + } + } + + test("JSON relation") { + withTable("src_json") { + spark.sql("CREATE TABLE src_json (key INT, value STRING) USING 
JSON;").collect() + + checkResolveOperator( + sqlText = "SELECT * FROM src_json", + expectedTableName = "spark_catalog.default.src_json", + expectedTableSchema = keyValueTableSchema + ) + } + } + + test("PARQUET relation") { + withTable("src_parquet") { + spark.sql("CREATE TABLE src_parquet (key INT, value STRING) USING PARQUET;").collect() + + checkResolveOperator( + sqlText = "SELECT * FROM src_parquet", + expectedTableName = "spark_catalog.default.src_parquet", + expectedTableSchema = keyValueTableSchema + ) + } + } + + test("ORC relation") { + withTable("src_orc") { + spark.sql("CREATE TABLE src_orc (key INT, value STRING) USING ORC;").collect() + + checkResolveOperator( + sqlText = "SELECT * FROM src_orc", + expectedTableName = "spark_catalog.default.src_orc", + expectedTableSchema = keyValueTableSchema + ) + } + } + + private def checkResolveOperator( + sqlText: String, + expectedTableName: String, + expectedTableSchema: StructType) = { + val metadataResolver = new MetadataResolver( + spark.sessionState.catalogManager, + Resolver.createRelationResolution(spark.sessionState.catalogManager) + ) + val dataSourceResolver = new DataSourceResolver(spark) + + val unresolvedPlan = spark.sql(sqlText).queryExecution.logical + + metadataResolver.resolve(unresolvedPlan) + + val unresolvedRelations = unresolvedPlan.collect { + case unresolvedRelation: UnresolvedRelation => unresolvedRelation + } + assert(unresolvedRelations.size == 1) + + val partiallyResolvedRelation = metadataResolver + .getRelationWithResolvedMetadata(unresolvedRelations.head) + .get + .asInstanceOf[SubqueryAlias] + .child + assert(partiallyResolvedRelation.isInstanceOf[UnresolvedCatalogRelation]) + + val result = dataSourceResolver.resolveOperator(partiallyResolvedRelation) + + val logicalRelation = result.asInstanceOf[LogicalRelation] + assert( + logicalRelation.catalogTable.get.identifier.unquotedString + == expectedTableName + ) + assert(logicalRelation.relation.schema == expectedTableSchema) + } 
+} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/DataSourceSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/DataSourceSuite.scala index fd9d31e7a594d..d2acdcfc62053 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/DataSourceSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/DataSourceSuite.scala @@ -25,6 +25,7 @@ import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileStatus, Path, RawLocalFileSystem} import org.scalatest.PrivateMethodTester +import org.apache.spark.SparkUnsupportedOperationException import org.apache.spark.sql.AnalysisException import org.apache.spark.sql.test.SharedSparkSession import org.apache.spark.util.Utils @@ -207,6 +208,18 @@ class DataSourceSuite extends SharedSparkSession with PrivateMethodTester { Utils.deleteRecursively(baseDir) } } + + test("SPARK-50458: Proper error handling for unsupported file system") { + val loc = "https://raw.githubusercontent.com/apache/spark/refs/heads/master/examples/" + + "src/main/resources/employees.json" + checkError(exception = intercept[SparkUnsupportedOperationException]( + sql(s"CREATE TABLE HTTP USING JSON LOCATION '$loc'")), + condition = "FAILED_READ_FILE.UNSUPPORTED_FILE_SYSTEM", + parameters = Map( + "path" -> loc, + "fileSystemClass" -> "org.apache.hadoop.fs.http.HttpsFileSystem", + "method" -> "listStatus")) + } } object TestPaths { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/FileResolverSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/FileResolverSuite.scala new file mode 100644 index 0000000000000..1d1b228028bdb --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/FileResolverSuite.scala @@ -0,0 +1,97 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.execution.datasources + +import org.apache.spark.sql.QueryTest +import org.apache.spark.sql.catalyst.analysis.UnresolvedRelation +import org.apache.spark.sql.catalyst.plans.logical.Project +import org.apache.spark.sql.test.SharedSparkSession +import org.apache.spark.sql.types.{LongType, StringType, StructType} + +class FileResolverSuite extends QueryTest with SharedSparkSession { + private val tableSchema = new StructType().add("id", LongType) + private val csvTableSchema = new StructType().add("_c0", StringType) + + test("JSON file format") { + val df = spark.range(100).toDF() + withTempPath(f => { + df.write.json(f.getCanonicalPath) + checkResolveOperator( + sqlText = s"select id from json.`${f.getCanonicalPath}`", + expectedTablePath = s"file:${f.getCanonicalPath}", + expectedTableSchema = tableSchema + ) + }) + } + + test("PARQUET file format") { + val df = spark.range(100).toDF() + withTempPath(f => { + df.write.parquet(f.getCanonicalPath) + checkResolveOperator( + sqlText = s"select id from parquet.`${f.getCanonicalPath}`", + expectedTablePath = s"file:${f.getCanonicalPath}", + expectedTableSchema = tableSchema + ) + }) + } + + test("ORC file format") { + val df = spark.range(100).toDF() + withTempPath(f => { + 
df.write.orc(f.getCanonicalPath) + checkResolveOperator( + sqlText = s"select id from ORC.`${f.getCanonicalPath}`", + expectedTablePath = s"file:${f.getCanonicalPath}", + expectedTableSchema = tableSchema + ) + }) + } + + test("CSV file format") { + val df = spark.range(100).toDF() + withTempPath(f => { + df.write.csv(f.getCanonicalPath) + checkResolveOperator( + sqlText = s"select _c0 from csv.`${f.getCanonicalPath}`", + expectedTablePath = s"file:${f.getCanonicalPath}", + expectedTableSchema = csvTableSchema + ) + }) + } + + private def checkResolveOperator( + sqlText: String, + expectedTablePath: String, + expectedTableSchema: StructType) = { + val fileResolver = new FileResolver(spark) + + val unresolvedPlan = spark.sql(sqlText).queryExecution.logical + + val result = fileResolver.resolveOperator( + unresolvedPlan.asInstanceOf[Project].child.asInstanceOf[UnresolvedRelation] + ) + + val logicalRelation = result.asInstanceOf[LogicalRelation] + assert( + logicalRelation.relation.asInstanceOf[HadoopFsRelation].location.rootPaths.mkString(",") == + expectedTablePath + ) + assert(logicalRelation.relation.schema == expectedTableSchema) + } +} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/PushVariantIntoScanSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/PushVariantIntoScanSuite.scala new file mode 100644 index 0000000000000..2a866dcd66f06 --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/PushVariantIntoScanSuite.scala @@ -0,0 +1,178 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.execution.datasources + +import org.apache.spark.SparkConf +import org.apache.spark.sql.catalyst.expressions._ +import org.apache.spark.sql.catalyst.expressions.variant._ +import org.apache.spark.sql.catalyst.plans.logical._ +import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.test.SharedSparkSession +import org.apache.spark.sql.types._ + +class PushVariantIntoScanSuite extends SharedSparkSession { + override def sparkConf: SparkConf = + super.sparkConf.set(SQLConf.PUSH_VARIANT_INTO_SCAN.key, "true") + + private def localTimeZone = spark.sessionState.conf.sessionLocalTimeZone + + // Return a `StructField` with the expected `VariantMetadata`. + private def field(ordinal: Int, dataType: DataType, path: String, + failOnError: Boolean = true, timeZone: String = localTimeZone): StructField = + StructField(ordinal.toString, dataType, + metadata = VariantMetadata(path, failOnError, timeZone).toMetadata) + + // Validate an `Alias` expression has the expected name and child. 
+ private def checkAlias(expr: Expression, expectedName: String, expected: Expression): Unit = { + expr match { + case Alias(child, name) => + assert(name == expectedName) + assert(child == expected) + case _ => fail() + } + } + + private def testOnFormats(fn: String => Unit): Unit = { + for (format <- Seq("PARQUET")) { + test("test - " + format) { + withTable("T") { + fn(format) + } + } + } + } + + testOnFormats { format => + sql("create table T (v variant, vs struct, " + + "va array, vd variant default parse_json('1')) " + + s"using $format") + + sql("select variant_get(v, '$.a', 'int') as a, v, cast(v as struct) as v from T") + .queryExecution.optimizedPlan match { + case Project(projectList, l: LogicalRelation) => + val output = l.output + val v = output(0) + checkAlias(projectList(0), "a", GetStructField(v, 0)) + checkAlias(projectList(1), "v", GetStructField(v, 1)) + checkAlias(projectList(2), "v", GetStructField(v, 2)) + assert(v.dataType == StructType(Array( + field(0, IntegerType, "$.a"), + field(1, VariantType, "$", timeZone = "UTC"), + field(2, StructType(Array(StructField("b", FloatType))), "$")))) + case _ => fail() + } + + sql("select 1 from T where isnotnull(v)") + .queryExecution.optimizedPlan match { + case Project(projectList, Filter(condition, l: LogicalRelation)) => + val output = l.output + val v = output(0) + checkAlias(projectList(0), "1", Literal(1)) + assert(condition == IsNotNull(v)) + assert(v.dataType == StructType(Array( + field(0, BooleanType, "$.__placeholder_field__", failOnError = false, timeZone = "UTC")))) + case _ => fail() + } + + sql("select variant_get(v, '$.a', 'int') + 1 as a, try_variant_get(v, '$.b', 'string') as b " + + "from T where variant_get(v, '$.a', 'int') = 1").queryExecution.optimizedPlan match { + case Project(projectList, Filter(condition, l: LogicalRelation)) => + val output = l.output + val v = output(0) + checkAlias(projectList(0), "a", Add(GetStructField(v, 0), Literal(1))) + checkAlias(projectList(1), "b", 
GetStructField(v, 1)) + assert(condition == And(IsNotNull(v), EqualTo(GetStructField(v, 0), Literal(1)))) + assert(v.dataType == StructType(Array( + field(0, IntegerType, "$.a"), + field(1, StringType, "$.b", failOnError = false)))) + case _ => fail() + } + + sql("select variant_get(vs.v1, '$.a', 'int') as a, variant_get(vs.v1, '$.b', 'int') as b, " + + "variant_get(vs.v2, '$.a', 'int') as a, vs.i from T").queryExecution.optimizedPlan match { + case Project(projectList, l: LogicalRelation) => + val output = l.output + val vs = output(1) + val v1 = GetStructField(vs, 0, Some("v1")) + val v2 = GetStructField(vs, 1, Some("v2")) + checkAlias(projectList(0), "a", GetStructField(v1, 0)) + checkAlias(projectList(1), "b", GetStructField(v1, 1)) + checkAlias(projectList(2), "a", GetStructField(v2, 0)) + checkAlias(projectList(3), "i", GetStructField(vs, 2, Some("i"))) + assert(vs.dataType == StructType(Array( + StructField("v1", StructType(Array( + field(0, IntegerType, "$.a"), field(1, IntegerType, "$.b")))), + StructField("v2", StructType(Array(field(0, IntegerType, "$.a")))), + StructField("i", IntegerType)))) + case _ => fail() + } + + def variantGet(child: Expression): Expression = VariantGet( + child, + path = Literal("$.a"), + targetType = VariantType, + failOnError = true, + timeZoneId = Some(localTimeZone)) + + // No push down if the struct containing variant is used. + sql("select vs, variant_get(vs.v1, '$.a') as a from T").queryExecution.optimizedPlan match { + case Project(projectList, l: LogicalRelation) => + val output = l.output + val vs = output(1) + assert(projectList(0) == vs) + checkAlias(projectList(1), "a", variantGet(GetStructField(vs, 0, Some("v1")))) + assert(vs.dataType == StructType(Array( + StructField("v1", VariantType), + StructField("v2", VariantType), + StructField("i", IntegerType)))) + case _ => fail() + } + + // No push down for variant in array. 
+ sql("select variant_get(va[0], '$.a') as a from T").queryExecution.optimizedPlan match { + case Project(projectList, l: LogicalRelation) => + val output = l.output + val va = output(2) + checkAlias(projectList(0), "a", variantGet(GetArrayItem(va, Literal(0)))) + assert(va.dataType == ArrayType(VariantType)) + case _ => fail() + } + + // No push down if variant has default value. + sql("select variant_get(vd, '$.a') as a from T").queryExecution.optimizedPlan match { + case Project(projectList, l: LogicalRelation) => + val output = l.output + val vd = output(3) + checkAlias(projectList(0), "a", variantGet(vd)) + assert(vd.dataType == VariantType) + case _ => fail() + } + } + + test("No push down for JSON") { + withTable("T") { + sql("create table T (v variant) using JSON") + sql("select variant_get(v, '$.a') from T").queryExecution.optimizedPlan match { + case Project(_, l: LogicalRelation) => + val output = l.output + assert(output(0).dataType == VariantType) + case _ => fail() + } + } + } +} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala index 7cacd8ea2dc50..850e887ac8e75 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala @@ -3078,6 +3078,23 @@ abstract class CSVSuite } } + test("SPARK-50616: We can write with a tsv file extension") { + withTempPath { path => + val input = Seq( + "1423-11-12T23:41:00", + "1765-03-28", + "2016-01-28T20:00:00" + ).toDF().repartition(1) + input.write.option("extension", "tsv").csv(path.getAbsolutePath) + + val files = Files.list(path.toPath) + .iterator().asScala.map(x => x.getFileName.toString) + .toList.filter(x => x.takeRight(3).equals("tsv")) + + assert(files.size == 1) + } + } + test("SPARK-39904: Parse incorrect timestamp values") { withTempPath { path 
=> Seq( @@ -3308,7 +3325,7 @@ abstract class CSVSuite } test("SPARK-40667: validate CSV Options") { - assert(CSVOptions.getAllOptions.size == 39) + assert(CSVOptions.getAllOptions.size == 40) // Please add validation on any new CSV options here assert(CSVOptions.isValidOption("header")) assert(CSVOptions.isValidOption("inferSchema")) @@ -3347,6 +3364,7 @@ abstract class CSVSuite assert(CSVOptions.isValidOption("compression")) assert(CSVOptions.isValidOption("codec")) assert(CSVOptions.isValidOption("sep")) + assert(CSVOptions.isValidOption("extension")) assert(CSVOptions.isValidOption("delimiter")) assert(CSVOptions.isValidOption("columnPruning")) // Please add validation on any new parquet options with alternative here diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/orc/OrcFilterSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/orc/OrcFilterSuite.scala index 500c0647bcb2a..bf9740970a667 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/orc/OrcFilterSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/orc/OrcFilterSuite.scala @@ -28,14 +28,13 @@ import org.apache.hadoop.hive.ql.io.sarg.{PredicateLeaf, SearchArgument, SearchA import org.apache.hadoop.hive.ql.io.sarg.SearchArgumentFactory.newBuilder import org.apache.spark.{SparkConf, SparkException, SparkRuntimeException} -import org.apache.spark.sql.{AnalysisException, DataFrame, Row} +import org.apache.spark.sql.{AnalysisException, Column, DataFrame, Row} import org.apache.spark.sql.catalyst.dsl.expressions._ import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.planning.PhysicalOperation import org.apache.spark.sql.execution.datasources.v2.DataSourceV2ScanRelation import org.apache.spark.sql.execution.datasources.v2.orc.OrcScan import org.apache.spark.sql.functions.col -import org.apache.spark.sql.internal.ExpressionUtils import 
org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.test.SharedSparkSession import org.apache.spark.sql.types._ @@ -47,7 +46,7 @@ import org.apache.spark.util.ArrayImplicits._ */ @ExtendedSQLTest class OrcFilterSuite extends OrcTest with SharedSparkSession { - import testImplicits.toRichColumn + import testImplicits.{toRichColumn, ColumnConstructorExt} override protected def sparkConf: SparkConf = super @@ -60,8 +59,8 @@ class OrcFilterSuite extends OrcTest with SharedSparkSession { checker: (SearchArgument) => Unit): Unit = { val output = predicate.collect { case a: Attribute => a }.distinct val query = df - .select(output.map(e => ExpressionUtils.column(e)): _*) - .where(ExpressionUtils.column(predicate)) + .select(output.map(e => Column(e)): _*) + .where(Column(predicate)) query.queryExecution.optimizedPlan match { case PhysicalOperation(_, filters, DataSourceV2ScanRelation(_, o: OrcScan, _, _, _)) => diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/orc/OrcTest.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/orc/OrcTest.scala index b8669ee4d1ef1..9fbc872ad262b 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/orc/OrcTest.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/orc/OrcTest.scala @@ -28,10 +28,10 @@ import org.scalatest.BeforeAndAfterAll import org.apache.spark.sql._ import org.apache.spark.sql.catalyst.expressions.{Attribute, Predicate} import org.apache.spark.sql.catalyst.planning.PhysicalOperation +import org.apache.spark.sql.classic.ClassicConversions._ import org.apache.spark.sql.execution.datasources.FileBasedDataSourceTest import org.apache.spark.sql.execution.datasources.v2.DataSourceV2ScanRelation import org.apache.spark.sql.execution.datasources.v2.orc.OrcScan -import org.apache.spark.sql.internal.ExpressionUtils.column import org.apache.spark.sql.internal.SQLConf import 
org.apache.spark.sql.internal.SQLConf.ORC_IMPLEMENTATION import org.apache.spark.util.ArrayImplicits._ @@ -118,8 +118,8 @@ trait OrcTest extends QueryTest with FileBasedDataSourceTest with BeforeAndAfter (implicit df: DataFrame): Unit = { val output = predicate.collect { case a: Attribute => a }.distinct val query = df - .select(output.map(e => column(e)): _*) - .where(predicate) + .select(output.map(e => Column(e)): _*) + .where(Column(predicate)) query.queryExecution.optimizedPlan match { case PhysicalOperation(_, filters, DataSourceV2ScanRelation(_, o: OrcScan, _, _, _)) => diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/orc/OrcV1FilterSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/orc/OrcV1FilterSuite.scala index 5260ebf15e4f3..8018417f923af 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/orc/OrcV1FilterSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/orc/OrcV1FilterSuite.scala @@ -21,12 +21,12 @@ import scala.jdk.CollectionConverters._ import org.apache.hadoop.hive.ql.io.sarg.SearchArgumentImpl import org.apache.spark.SparkConf -import org.apache.spark.sql.DataFrame +import org.apache.spark.sql.{Column, DataFrame} import org.apache.spark.sql.catalyst.expressions.{And, Attribute, Predicate} import org.apache.spark.sql.catalyst.planning.PhysicalOperation +import org.apache.spark.sql.classic.ClassicConversions._ import org.apache.spark.sql.execution.datasources.{DataSourceStrategy, HadoopFsRelation, LogicalRelationWithTable} import org.apache.spark.sql.execution.datasources.orc.OrcShimUtils.{Operator, SearchArgument} -import org.apache.spark.sql.internal.ExpressionUtils.column import org.apache.spark.sql.internal.SQLConf import org.apache.spark.tags.ExtendedSQLTest @@ -44,8 +44,8 @@ class OrcV1FilterSuite extends OrcFilterSuite { checker: (SearchArgument) => Unit): Unit = { val output = predicate.collect { case a: Attribute => a 
}.distinct val query = df - .select(output.map(e => column(e)): _*) - .where(predicate) + .select(output.map(e => Column(e)): _*) + .where(Column(predicate)) var maybeRelation: Option[HadoopFsRelation] = None val maybeAnalyzedPredicate = query.queryExecution.optimizedPlan.collect { @@ -90,8 +90,8 @@ class OrcV1FilterSuite extends OrcFilterSuite { (implicit df: DataFrame): Unit = { val output = predicate.collect { case a: Attribute => a }.distinct val query = df - .select(output.map(e => column(e)): _*) - .where(predicate) + .select(output.map(e => Column(e)): _*) + .where(Column(predicate)) var maybeRelation: Option[HadoopFsRelation] = None val maybeAnalyzedPredicate = query.queryExecution.optimizedPlan.collect { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFilterSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFilterSuite.scala index 37edb9ea2315e..5f7a0c9e7e749 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFilterSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFilterSuite.scala @@ -49,7 +49,7 @@ import org.apache.spark.sql.execution.datasources.{DataSourceStrategy, HadoopFsR import org.apache.spark.sql.execution.datasources.v2.DataSourceV2ScanRelation import org.apache.spark.sql.execution.datasources.v2.parquet.ParquetScan import org.apache.spark.sql.functions._ -import org.apache.spark.sql.internal.{ExpressionUtils, LegacyBehaviorPolicy, SQLConf} +import org.apache.spark.sql.internal.{LegacyBehaviorPolicy, SQLConf} import org.apache.spark.sql.internal.LegacyBehaviorPolicy.{CORRECTED, LEGACY} import org.apache.spark.sql.internal.SQLConf.ParquetOutputTimestampType.{INT96, TIMESTAMP_MICROS, TIMESTAMP_MILLIS} import org.apache.spark.sql.test.SharedSparkSession @@ -2233,6 +2233,8 @@ abstract class ParquetFilterSuite extends QueryTest with ParquetTest with Shared 
@ExtendedSQLTest class ParquetV1FilterSuite extends ParquetFilterSuite { + import testImplicits.ColumnConstructorExt + override protected def sparkConf: SparkConf = super .sparkConf @@ -2260,8 +2262,8 @@ class ParquetV1FilterSuite extends ParquetFilterSuite { SQLConf.PARQUET_VECTORIZED_READER_ENABLED.key -> "false", SQLConf.NESTED_PREDICATE_PUSHDOWN_FILE_SOURCE_LIST.key -> pushdownDsList) { val query = df - .select(output.map(ExpressionUtils.column): _*) - .where(ExpressionUtils.column(predicate)) + .select(output.map(Column(_)): _*) + .where(Column(predicate)) val nestedOrAttributes = predicate.collectFirst { case g: GetStructField => g @@ -2313,6 +2315,8 @@ class ParquetV1FilterSuite extends ParquetFilterSuite { @ExtendedSQLTest class ParquetV2FilterSuite extends ParquetFilterSuite { + import testImplicits.ColumnConstructorExt + // TODO: enable Parquet V2 write path after file source V2 writers are workable. override protected def sparkConf: SparkConf = super @@ -2339,8 +2343,8 @@ class ParquetV2FilterSuite extends ParquetFilterSuite { SQLConf.OPTIMIZER_EXCLUDED_RULES.key -> InferFiltersFromConstraints.ruleName, SQLConf.PARQUET_VECTORIZED_READER_ENABLED.key -> "false") { val query = df - .select(output.map(ExpressionUtils.column): _*) - .where(ExpressionUtils.column(predicate)) + .select(output.map(Column(_)): _*) + .where(Column(predicate)) query.queryExecution.optimizedPlan.collectFirst { case PhysicalOperation(_, filters, diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetQuerySuite.scala index 22a02447e720f..bba71f1c48dec 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetQuerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetQuerySuite.scala @@ -473,6 +473,26 @@ abstract class ParquetQuerySuite extends QueryTest with 
ParquetTest with SharedS } } + test("SPARK-50463: Partition values can be read over multiple batches") { + withTempDir { dir => + withSQLConf(SQLConf.PARQUET_VECTORIZED_READER_BATCH_SIZE.key -> "1") { + val path = dir.getAbsolutePath + spark.range(0, 5) + .selectExpr("concat(cast(id % 2 as string), 'a') as partCol", "id") + .write + .format("parquet") + .mode("overwrite") + .partitionBy("partCol").save(path) + val df = spark.read.format("parquet").load(path).selectExpr("partCol") + val expected = spark.range(0, 5) + .selectExpr("concat(cast(id % 2 as string), 'a') as partCol") + .collect() + + checkAnswer(df, expected) + } + } + } + test("SPARK-10301 requested schema clipping - same schema") { withTempPath { dir => val path = dir.getCanonicalPath diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetVariantShreddingSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetVariantShreddingSuite.scala new file mode 100644 index 0000000000000..8bb5a4b1d0bc5 --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetVariantShreddingSuite.scala @@ -0,0 +1,229 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.execution.datasources.parquet + +import java.io.File + +import org.apache.spark.sql.{QueryTest, Row} +import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.test.SharedSparkSession +import org.apache.spark.unsafe.types.VariantVal + +/** + * Test shredding Variant values in the Parquet reader/writer. + */ +class ParquetVariantShreddingSuite extends QueryTest with ParquetTest with SharedSparkSession { + + private def testWithTempDir(name: String)(block: File => Unit): Unit = test(name) { + withTempDir { dir => + block(dir) + } + } + + testWithTempDir("write shredded variant basic") { dir => + val schema = "a int, b string, c decimal(15, 1)" + val df = spark.sql( + """ + | select case + | when id = 0 then parse_json('{"a": 1, "b": "2", "c": 3.3, "d": 4.4}') + | when id = 1 then parse_json('{"a": [1,2,3], "b": "hello", "c": {"x": 0}}') + | when id = 2 then parse_json('{"A": 1, "c": 1.23}') + | end v from range(3) + |""".stripMargin) + val fullSchema = "v struct, b struct," + + "c struct>>" + withSQLConf(SQLConf.VARIANT_WRITE_SHREDDING_ENABLED.key -> true.toString, + SQLConf.VARIANT_FORCE_SHREDDING_SCHEMA_FOR_TEST.key -> schema) { + df.write.mode("overwrite").parquet(dir.getAbsolutePath) + + + // Verify that we can read the full variant. The exact binary layout can change before and + // after shredding, so just check that the JSON representation matches. + checkAnswer( + spark.read.parquet(dir.getAbsolutePath).selectExpr("to_json(v)"), + df.selectExpr("to_json(v)").collect() + ) + + // Verify that it was shredded to the expected fields. + + val shreddedDf = spark.read.schema(fullSchema).parquet(dir.getAbsolutePath) + // Metadata should be unchanaged. + checkAnswer(shreddedDf.selectExpr("v.metadata"), + df.collect().map(v => Row(v.get(0).asInstanceOf[VariantVal].getMetadata)) + ) + + // Check typed values. 
+ // Second row is not an integer, and third is A, not a + checkAnswer( + shreddedDf.selectExpr("v.typed_value.a.typed_value"), + Seq(Row(1), Row(null), Row(null))) + // b is missing from third row. + checkAnswer( + shreddedDf.selectExpr("v.typed_value.b.typed_value"), + Seq(Row("2"), Row("hello"), Row(null))) + // Second row is an object, third is the wrong scale. (Note: we may eventually allow the + // latter, in which case this test should be updated.) + checkAnswer( + shreddedDf.selectExpr("v.typed_value.c.typed_value"), + Seq(Row(3.3), Row(null), Row(null))) + + // Untyped values are more awkward to check, so for now just check their nullness. We + // can do more thorough checking once the reader is ready. + checkAnswer( + shreddedDf.selectExpr("v.value is null"), + // First row has "d" and third has "A". + Seq(Row(false), Row(true), Row(false))) + checkAnswer( + shreddedDf.selectExpr("v.typed_value.a.value is null"), + // First row is fully shredded, third is missing. + Seq(Row(true), Row(false), Row(true))) + checkAnswer( + shreddedDf.selectExpr("v.typed_value.b.value is null"), + // b is always fully shredded or missing. + Seq(Row(true), Row(true), Row(true))) + checkAnswer( + shreddedDf.selectExpr("v.typed_value.c.value is null"), + Seq(Row(true), Row(false), Row(false))) + // The a/b/c levels are not null, even if the field is missing. 
+ checkAnswer( + shreddedDf.selectExpr( + "v.typed_value.a is null or v.typed_value.b is null or v.typed_value.c is null"), + Seq(Row(false), Row(false), Row(false))) + } + } + + testWithTempDir("write shredded variant array") { dir => + val schema = "array" + val df = spark.sql( + """ + | select case + | when id = 0 then parse_json('[1, "2", 3.5, null, 5]') + | when id = 1 then parse_json('{"a": [1, 2, 3]}') + | when id = 2 then parse_json('1') + | when id = 3 then parse_json('null') + | end v from range(4) + |""".stripMargin) + val fullSchema = "v struct>>" + withSQLConf(SQLConf.VARIANT_WRITE_SHREDDING_ENABLED.key -> true.toString, + SQLConf.VARIANT_FORCE_SHREDDING_SCHEMA_FOR_TEST.key -> schema) { + df.write.mode("overwrite").parquet(dir.getAbsolutePath) + + // Verify that we can read the full variant. + checkAnswer( + spark.read.parquet(dir.getAbsolutePath).selectExpr("to_json(v)"), + df.selectExpr("to_json(v)").collect() + ) + + // Verify that it was shredded to the expected fields. + + val shreddedDf = spark.read.schema(fullSchema).parquet(dir.getAbsolutePath) + // Metadata should be unchanaged. + checkAnswer(shreddedDf.selectExpr("v.metadata"), + df.collect().map(v => Row(v.get(0).asInstanceOf[VariantVal].getMetadata)) + ) + + // Check typed values. + checkAnswer( + shreddedDf.selectExpr("v.typed_value.typed_value"), + Seq(Row(Array(1, null, null, null, 5)), Row(null), Row(null), Row(null))) + + // All the other array elements should have non-null value. + checkAnswer( + shreddedDf.selectExpr("transform(v.typed_value.value, v -> v is null)"), + Seq(Row(Array(true, false, false, false, true)), Row(null), Row(null), Row(null))) + + // The non-arrays should have non-null top-level value. 
+ checkAnswer( + shreddedDf.selectExpr("v.value is null"), + Seq(Row(true), Row(false), Row(false), Row(false))) + } + } + + testWithTempDir("write no shredding schema") { dir => + // Check that we can write and read normally when shredding is enabled if + // we don't provide a shredding schema. + withSQLConf(SQLConf.VARIANT_WRITE_SHREDDING_ENABLED.key -> true.toString) { + val df = spark.sql( + """ + | select parse_json('{"a": ' || id || ', "b": 2}') as v, + | array(parse_json('{"c": 3}'), 123::variant) as a + | from range(1, 3, 1, 1) + |""".stripMargin) + df.write.mode("overwrite").parquet(dir.getAbsolutePath) + checkAnswer( + spark.read.parquet(dir.getAbsolutePath), df.collect() + ) + } + } + + testWithTempDir("arrays and maps ignore shredding schema") { dir => + // Check that we don't try to shred array or map elements, even if a shredding schema + // is specified. + val schema = "a int" + val df = spark.sql( + """ select v, array(v) as arr, map('myKey', v) as m from + | (select parse_json('{"a":' || id || '}') v from range(3)) + |""".stripMargin) + val fullSchema = "v struct>>, " + + "arr array>, " + + "m map>" + withSQLConf(SQLConf.VARIANT_WRITE_SHREDDING_ENABLED.key -> true.toString, + SQLConf.VARIANT_FORCE_SHREDDING_SCHEMA_FOR_TEST.key -> schema) { + df.write.mode("overwrite").parquet(dir.getAbsolutePath) + + // Verify that we can read the full variant. + checkAnswer( + spark.read.parquet(dir.getAbsolutePath).selectExpr("to_json(v)"), + df.selectExpr("to_json(v)").collect() + ) + + // Verify that it was shredded to the expected fields. + + val shreddedDf = spark.read.schema(fullSchema).parquet(dir.getAbsolutePath) + // Metadata should be unchanaged. 
+ checkAnswer(shreddedDf.selectExpr("v.metadata"), + df.selectExpr("v").collect().map(v => Row(v.get(0).asInstanceOf[VariantVal].getMetadata)) + ) + checkAnswer(shreddedDf.selectExpr("arr[0].metadata"), + df.selectExpr("arr[0]").collect().map(v => + Row(v.get(0).asInstanceOf[VariantVal].getMetadata)) + ) + checkAnswer(shreddedDf.selectExpr("m['myKey'].metadata"), + df.selectExpr("m['myKey']").collect().map( + v => Row(v.get(0).asInstanceOf[VariantVal].getMetadata)) + ) + + // v should be fully shredded, but the array and map should not be. + checkAnswer( + shreddedDf.selectExpr( + "v.value is null"), + Seq(Row(true), Row(true), Row(true))) + checkAnswer( + shreddedDf.selectExpr( + "arr[0].value is null"), + Seq(Row(false), Row(false), Row(false))) + checkAnswer( + shreddedDf.selectExpr( + "m['myKey'].value is null"), + Seq(Row(false), Row(false), Row(false))) + } + } +} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/v2/FileTableSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/v2/FileTableSuite.scala index 0316f09e42ce3..0d18e3bf809e0 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/v2/FileTableSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/v2/FileTableSuite.scala @@ -22,7 +22,7 @@ import org.apache.hadoop.fs.FileStatus import org.apache.spark.sql.{QueryTest, SparkSession} import org.apache.spark.sql.connector.read.ScanBuilder -import org.apache.spark.sql.connector.write.{LogicalWriteInfo, WriteBuilder} +import org.apache.spark.sql.connector.write.{LogicalWriteInfo, LogicalWriteInfoImpl, WriteBuilder} import org.apache.spark.sql.execution.datasources.DataSource import org.apache.spark.sql.execution.datasources.FileFormat import org.apache.spark.sql.execution.datasources.text.TextFileFormat @@ -96,8 +96,8 @@ class FileTableSuite extends QueryTest with SharedSparkSession { } allFileBasedDataSources.foreach { format => - 
test(s"SPARK-49519: Merge options of table and relation when constructing FileScanBuilder" + - s" - $format") { + test("SPARK-49519, SPARK-50287: Merge options of table and relation when " + + s"constructing ScanBuilder and WriteBuilder in FileFormat - $format") { withSQLConf(SQLConf.USE_V1_SOURCE_LIST.key -> "") { val userSpecifiedSchema = StructType(Seq(StructField("c1", StringType))) @@ -108,20 +108,29 @@ class FileTableSuite extends QueryTest with SharedSparkSession { val table = provider.getTable( userSpecifiedSchema, Array.empty, - dsOptions.asCaseSensitiveMap()) + dsOptions.asCaseSensitiveMap()).asInstanceOf[FileTable] val tableOptions = new CaseInsensitiveStringMap( Map("k2" -> "table_v2", "k3" -> "v3").asJava) - val mergedOptions = table.asInstanceOf[FileTable].newScanBuilder(tableOptions) match { + + val mergedReadOptions = table.newScanBuilder(tableOptions) match { case csv: CSVScanBuilder => csv.options case json: JsonScanBuilder => json.options case orc: OrcScanBuilder => orc.options case parquet: ParquetScanBuilder => parquet.options case text: TextScanBuilder => text.options } - assert(mergedOptions.size() == 3) - assert("v1".equals(mergedOptions.get("k1"))) - assert("table_v2".equals(mergedOptions.get("k2"))) - assert("v3".equals(mergedOptions.get("k3"))) + assert(mergedReadOptions.size === 3) + assert(mergedReadOptions.get("k1") === "v1") + assert(mergedReadOptions.get("k2") === "table_v2") + assert(mergedReadOptions.get("k3") === "v3") + + val writeInfo = LogicalWriteInfoImpl("query-id", userSpecifiedSchema, tableOptions) + val mergedWriteOptions = table.newWriteBuilder(writeInfo).build() + .asInstanceOf[FileWrite].options + assert(mergedWriteOptions.size === 3) + assert(mergedWriteOptions.get("k1") === "v1") + assert(mergedWriteOptions.get("k2") === "table_v2") + assert(mergedWriteOptions.get("k3") === "v3") case _ => throw new IllegalArgumentException(s"Failed to get table provider for $format") } diff --git 
a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/v2/state/StateDataSourceTransformWithStateSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/v2/state/StateDataSourceTransformWithStateSuite.scala index baab6327b35c1..fe224608fffd8 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/v2/state/StateDataSourceTransformWithStateSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/v2/state/StateDataSourceTransformWithStateSuite.scala @@ -24,11 +24,12 @@ import org.apache.hadoop.conf.Configuration import org.apache.spark.io.CompressionCodec import org.apache.spark.sql.{Encoders, Row} import org.apache.spark.sql.execution.streaming.MemoryStream -import org.apache.spark.sql.execution.streaming.state.{AlsoTestWithChangelogCheckpointingEnabled, RocksDBFileManager, RocksDBStateStoreProvider, TestClass} +import org.apache.spark.sql.execution.streaming.state.{AlsoTestWithEncodingTypes, AlsoTestWithRocksDBFeatures, RocksDBFileManager, RocksDBStateStoreProvider, TestClass} import org.apache.spark.sql.functions.{col, explode, timestamp_seconds} import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.streaming.{InputMapRow, ListState, MapInputEvent, MapOutputEvent, MapStateTTLProcessor, MaxEventTimeStatefulProcessor, OutputMode, RunningCountStatefulProcessor, RunningCountStatefulProcessorWithProcTimeTimerUpdates, StatefulProcessor, StateStoreMetricsTest, TestMapStateProcessor, TimeMode, TimerValues, TransformWithStateSuiteUtils, Trigger, TTLConfig, ValueState} import org.apache.spark.sql.streaming.util.StreamManualClock +import org.apache.spark.tags.SlowSQLTest import org.apache.spark.util.Utils /** Stateful processor of single value state var with non-primitive type */ @@ -125,8 +126,9 @@ class SessionGroupsStatefulProcessorWithTTL extends /** * Test suite to verify integration of state data source reader with the transformWithState operator */ 
+@SlowSQLTest class StateDataSourceTransformWithStateSuite extends StateStoreMetricsTest - with AlsoTestWithChangelogCheckpointingEnabled { + with AlsoTestWithRocksDBFeatures with AlsoTestWithEncodingTypes { import testImplicits._ @@ -1075,7 +1077,7 @@ class StateDataSourceTransformWithStateSuite extends StateStoreMetricsTest // Read the changelog for one of the partitions at version 3 and // ensure that we have two entries // For this test - keys 9 and 12 are written at version 3 for partition 4 - val changelogReader = fileManager.getChangelogReader(3, true) + val changelogReader = fileManager.getChangelogReader(3) val entries = changelogReader.toSeq assert(entries.size == 2) val retainEntry = entries.head @@ -1085,13 +1087,13 @@ class StateDataSourceTransformWithStateSuite extends StateStoreMetricsTest Utils.deleteRecursively(new File(changelogFilePath)) // Write the retained entry back to the changelog - val changelogWriter = fileManager.getChangeLogWriter(3, true) + val changelogWriter = fileManager.getChangeLogWriter(3) changelogWriter.put(retainEntry._2, retainEntry._3) changelogWriter.commit() // Ensure that we have only one entry in the changelog for version 3 // For this test - key 9 is retained and key 12 is deleted - val changelogReader1 = fileManager.getChangelogReader(3, true) + val changelogReader1 = fileManager.getChangelogReader(3) val entries1 = changelogReader1.toSeq assert(entries1.size == 1) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/xml/XmlSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/xml/XmlSuite.scala index fe910c21cb0c6..560292b263ba3 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/xml/XmlSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/xml/XmlSuite.scala @@ -1930,7 +1930,7 @@ class XmlSuite } checkXmlOptionErrorMessage(Map.empty, - "[XML_ROW_TAG_MISSING] `rowTag` option is required for reading files in 
XML format.", + "[XML_ROW_TAG_MISSING] `rowTag` option is required for reading/writing files in XML format.", QueryCompilationErrors.xmlRowTagRequiredError(XmlOptions.ROW_TAG).getCause) checkXmlOptionErrorMessage(Map("rowTag" -> ""), "'rowTag' option should not be an empty string.") @@ -1951,6 +1951,20 @@ class XmlSuite .xml(spark.createDataset(Seq(xmlString))) } + test("SPARK-50688: rowTag requirement for write") { + withTempDir { dir => + dir.delete() + val e = intercept[AnalysisException] { + spark.range(1).write.xml(dir.getCanonicalPath) + } + checkError( + exception = e, + condition = "XML_ROW_TAG_MISSING", + parameters = Map("rowTag" -> "`rowTag`") + ) + } + } + test("Primitive field casting") { val ts = Seq("2002-05-30 21:46:54", "2002-05-30T21:46:54", "2002-05-30T21:46:54.1234", "2002-05-30T21:46:54Z", "2002-05-30T21:46:54.1234Z", "2002-05-30T21:46:54-06:00", diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/python/PythonDataSourceSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/python/PythonDataSourceSuite.scala index 1f2be12058eb7..73c05ff0e0b58 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/python/PythonDataSourceSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/python/PythonDataSourceSuite.scala @@ -94,6 +94,27 @@ abstract class PythonDataSourceSuiteBase extends QueryTest with SharedSparkSessi class PythonDataSourceSuite extends PythonDataSourceSuiteBase { import IntegratedUDFTestUtils._ + test("SPARK-50426: should not trigger static Python data source lookup") { + assume(shouldTestPandasUDFs) + val testAppender = new LogAppender("Python data source lookup") + // Using builtin and Java data sources should not trigger a static + // Python data source lookup + withLogAppender(testAppender) { + spark.read.format("org.apache.spark.sql.test").load() + spark.range(3).write.mode("overwrite").format("noop").save() + } + assert(!testAppender.loggingEvents + .exists(msg => 
msg.getMessage.getFormattedMessage.contains( + "Loading static Python Data Sources."))) + // Now trigger a Python data source lookup + withLogAppender(testAppender) { + spark.read.format(staticSourceName).load() + } + assert(testAppender.loggingEvents + .exists(msg => msg.getMessage.getFormattedMessage.contains( + "Loading static Python Data Sources."))) + } + test("SPARK-45917: automatic registration of Python Data Source") { assume(shouldTestPandasUDFs) val df = spark.read.format(staticSourceName).load() diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/python/PythonUDFSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/python/PythonUDFSuite.scala index 4b46331be107a..2f44994c301b3 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/python/PythonUDFSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/python/PythonUDFSuite.scala @@ -91,7 +91,10 @@ class PythonUDFSuite extends QueryTest with SharedSparkSession { val pythonSQLMetrics = List( "data sent to Python workers", "data returned from Python workers", - "number of output rows") + "number of output rows", + "total time to initialize Python workers", + "total time to start Python workers", + "total time to run Python workers") val df = base.groupBy(pythonTestUDF(base("a") + 1)) .agg(pythonTestUDF(pythonTestUDF(base("a") + 1))) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/python/TransformWithStateInPandasStateServerSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/python/TransformWithStateInPandasStateServerSuite.scala index e05264825f773..c3d4541bac29c 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/python/TransformWithStateInPandasStateServerSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/python/TransformWithStateInPandasStateServerSuite.scala @@ -32,7 +32,7 @@ import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder import 
org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema import org.apache.spark.sql.execution.streaming.{StatefulProcessorHandleImpl, StatefulProcessorHandleState} import org.apache.spark.sql.execution.streaming.state.StateMessage -import org.apache.spark.sql.execution.streaming.state.StateMessage.{AppendList, AppendValue, Clear, ContainsKey, DeleteTimer, Exists, ExpiryTimerRequest, Get, GetProcessingTime, GetValue, GetWatermark, HandleState, Keys, ListStateCall, ListStateGet, ListStatePut, ListTimers, MapStateCall, RegisterTimer, RemoveKey, SetHandleState, StateCallCommand, StatefulProcessorCall, TimerRequest, TimerStateCallCommand, TimerValueRequest, UpdateValue, Values, ValueStateCall, ValueStateUpdate} +import org.apache.spark.sql.execution.streaming.state.StateMessage.{AppendList, AppendValue, Clear, ContainsKey, DeleteTimer, Exists, ExpiryTimerRequest, Get, GetProcessingTime, GetValue, GetWatermark, HandleState, Keys, ListStateCall, ListStateGet, ListStatePut, ListTimers, MapStateCall, ParseStringSchema, RegisterTimer, RemoveKey, SetHandleState, StateCallCommand, StatefulProcessorCall, TimerRequest, TimerStateCallCommand, TimerValueRequest, UpdateValue, UtilsRequest, Values, ValueStateCall, ValueStateUpdate} import org.apache.spark.sql.streaming.{ListState, MapState, TTLConfig, ValueState} import org.apache.spark.sql.types.{IntegerType, StructField, StructType} @@ -574,6 +574,16 @@ class TransformWithStateInPandasStateServerSuite extends SparkFunSuite with Befo verify(arrowStreamWriter).finalizeCurrentArrowBatch() } + test("utils request - parse string schema") { + val message = UtilsRequest.newBuilder().setParseStringSchema( + ParseStringSchema.newBuilder().setSchema( + "value int" + ).build() + ).build() + stateServer.handleUtilsRequest(message) + verify(outputStream).writeInt(argThat((x: Int) => x > 0)) + } + private def getIntegerRow(value: Int): Row = { new GenericRowWithSchema(Array(value), stateSchema) } diff --git 
a/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/OffsetSeqLogSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/OffsetSeqLogSuite.scala index e5dfa33164903..5f7205eaf4bbe 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/OffsetSeqLogSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/OffsetSeqLogSuite.scala @@ -19,6 +19,8 @@ package org.apache.spark.sql.execution.streaming import java.io.File +import org.apache.commons.io.FileUtils + import org.apache.spark.sql.catalyst.util.stringToFile import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.test.SharedSparkSession @@ -129,4 +131,69 @@ class OffsetSeqLogSuite extends SharedSparkSession { val log = new OffsetSeqLog(spark, input.toString) log.getLatest().get } + + // SPARK-50526 - sanity tests to ensure that values are set correctly for state store + // encoding format within OffsetSeqMetadata + test("offset log records defaults to unsafeRow for store encoding format") { + val offsetSeqMetadata = OffsetSeqMetadata.apply(batchWatermarkMs = 0, batchTimestampMs = 0, + spark.conf) + assert(offsetSeqMetadata.conf.get(SQLConf.STREAMING_STATE_STORE_ENCODING_FORMAT.key) === + Some("unsaferow")) + } + + test("offset log uses the store encoding format set in the conf") { + val offsetSeqMetadata = OffsetSeqMetadata.apply(batchWatermarkMs = 0, batchTimestampMs = 0, + Map(SQLConf.STREAMING_STATE_STORE_ENCODING_FORMAT.key -> "avro")) + assert(offsetSeqMetadata.conf.get(SQLConf.STREAMING_STATE_STORE_ENCODING_FORMAT.key) === + Some("avro")) + } + + // Verify whether entry exists within the offset log and has the right value or that we pick up + // the correct default values when populating the session conf. 
+ private def verifyOffsetLogEntry( + checkpointDir: String, + entryExists: Boolean, + encodingFormat: String): Unit = { + val log = new OffsetSeqLog(spark, s"$checkpointDir/offsets") + val latestBatchId = log.getLatestBatchId() + assert(latestBatchId.isDefined, "No offset log entries found in the checkpoint location") + + // Read the latest offset log + val offsetSeq = log.get(latestBatchId.get).get + val offsetSeqMetadata = offsetSeq.metadata.get + + if (entryExists) { + val encodingFormatOpt = offsetSeqMetadata.conf.get( + SQLConf.STREAMING_STATE_STORE_ENCODING_FORMAT.key) + assert(encodingFormatOpt.isDefined, "No store encoding format found in the offset log entry") + assert(encodingFormatOpt.get == encodingFormat) + } + + val clonedSqlConf = spark.sessionState.conf.clone() + OffsetSeqMetadata.setSessionConf(offsetSeqMetadata, clonedSqlConf) + assert(clonedSqlConf.stateStoreEncodingFormat == encodingFormat) + } + + // verify that checkpoint created with different store encoding formats are read correctly + Seq("unsaferow", "avro").foreach { storeEncodingFormat => + test(s"verify format values from checkpoint loc - $storeEncodingFormat") { + withTempDir { checkpointDir => + val resourceUri = this.getClass.getResource( + "/structured-streaming/checkpoint-version-4.0.0-tws-" + storeEncodingFormat + "/").toURI + FileUtils.copyDirectory(new File(resourceUri), checkpointDir.getCanonicalFile) + verifyOffsetLogEntry(checkpointDir.getAbsolutePath, entryExists = true, + storeEncodingFormat) + } + } + } + + test("verify format values from old checkpoint with Spark version 3.5.1") { + withTempDir { checkpointDir => + val resourceUri = this.getClass.getResource( + "/structured-streaming/checkpoint-version-3.5.1-streaming-deduplication/").toURI + FileUtils.copyDirectory(new File(resourceUri), checkpointDir.getCanonicalFile) + verifyOffsetLogEntry(checkpointDir.getAbsolutePath, entryExists = false, + "unsaferow") + } + } } diff --git 
a/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/state/ListStateSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/state/ListStateSuite.scala index 22876831c00d1..bb4343bf32159 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/state/ListStateSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/state/ListStateSuite.scala @@ -190,8 +190,8 @@ class ListStateSuite extends StateVariableSuiteBase { var ttlValues = testState.getTTLValues() assert(ttlValues.nonEmpty) assert(ttlValues.forall(_._2 === ttlExpirationMs)) - var ttlStateValueIterator = testState.getValuesInTTLState() - assert(ttlStateValueIterator.hasNext) + var ttlStateValue = testState.getValueInTTLState() + assert(ttlStateValue.isDefined) // increment batchProcessingTime, or watermark and ensure expired value is not returned val nextBatchHandle = new StatefulProcessorHandleImpl(store, UUID.randomUUID(), @@ -212,10 +212,9 @@ class ListStateSuite extends StateVariableSuiteBase { ttlValues = nextBatchTestState.getTTLValues() assert(ttlValues.nonEmpty) assert(ttlValues.forall(_._2 === ttlExpirationMs)) - ttlStateValueIterator = nextBatchTestState.getValuesInTTLState() - assert(ttlStateValueIterator.hasNext) - assert(ttlStateValueIterator.next() === ttlExpirationMs) - assert(ttlStateValueIterator.isEmpty) + ttlStateValue = nextBatchTestState.getValueInTTLState() + assert(ttlStateValue.isDefined) + assert(ttlStateValue.get === ttlExpirationMs) // getWithoutTTL should still return the expired value assert(nextBatchTestState.getWithoutEnforcingTTL().toSeq === Seq("v1", "v2", "v3")) @@ -276,8 +275,8 @@ class ListStateSuite extends StateVariableSuiteBase { val ttlValues = testState.getTTLValues() assert(ttlValues.nonEmpty) assert(ttlValues.forall(_._2 === ttlExpirationMs)) - val ttlStateValueIterator = testState.getValuesInTTLState() - assert(ttlStateValueIterator.hasNext) + val ttlStateValue = 
testState.getValueInTTLState() + assert(ttlStateValue.isDefined) } } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/state/RocksDBStateStoreCheckpointFormatV2Suite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/state/RocksDBStateStoreCheckpointFormatV2Suite.scala index 9ac74eb5b9e8f..5725ebaf727bd 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/state/RocksDBStateStoreCheckpointFormatV2Suite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/state/RocksDBStateStoreCheckpointFormatV2Suite.scala @@ -182,7 +182,7 @@ class CkptIdCollectingStateStoreProviderWrapper extends StateStoreProvider { // return their own state store checkpointID. This can happen because of task retry or // speculative execution. class RocksDBStateStoreCheckpointFormatV2Suite extends StreamTest - with AlsoTestWithChangelogCheckpointingEnabled { + with AlsoTestWithRocksDBFeatures { import testImplicits._ val providerClassName = classOf[CkptIdCollectingStateStoreProviderWrapper].getCanonicalName @@ -445,11 +445,12 @@ class RocksDBStateStoreCheckpointFormatV2Suite extends StreamTest val numBatches = checkpointInfoList.size / 8 // We don't pass batch versions that would need base checkpoint IDs because we don't know - // batchIDs for that. We only know that there are 3 batches without it. + // batchIDs for that. We only know that there are 1 batches without it. + // The two checkpoint IDs in between are stored in the commit log. 
validateCheckpointInfo(numBatches, 4, Set()) assert(CkptIdCollectingStateStoreWrapper .getStateStoreCheckpointInfos - .count(_.baseStateStoreCkptId.isDefined) == (numBatches - 3) * 8) + .count(_.baseStateStoreCkptId.isDefined) == (numBatches - 1) * 8) } testWithCheckpointInfoTracked(s"checkpointFormatVersion2 validate DropDuplicates") { @@ -541,4 +542,35 @@ class RocksDBStateStoreCheckpointFormatV2Suite extends StreamTest } validateCheckpointInfo(6, 1, Set(2, 4, 6)) } + + test("checkpointFormatVersion2 validate transformWithState") { + withTempDir { checkpointDir => + val inputData = MemoryStream[String] + val result = inputData.toDS() + .groupByKey(x => x) + .transformWithState(new RunningCountStatefulProcessor(), + TimeMode.None(), + OutputMode.Update()) + + testStream(result, Update())( + StartStream(checkpointLocation = checkpointDir.getAbsolutePath), + AddData(inputData, "a"), + CheckNewAnswer(("a", "1")), + Execute { q => + assert(q.lastProgress.stateOperators(0).customMetrics.get("numValueStateVars") > 0) + assert(q.lastProgress.stateOperators(0).customMetrics.get("numRegisteredTimers") == 0) + }, + AddData(inputData, "a", "b"), + CheckNewAnswer(("a", "2"), ("b", "1")), + StopStream, + StartStream(checkpointLocation = checkpointDir.getAbsolutePath), + AddData(inputData, "a", "b"), // should remove state for "a" and not return anything for a + CheckNewAnswer(("b", "2")), + StopStream, + StartStream(checkpointLocation = checkpointDir.getAbsolutePath), + AddData(inputData, "a", "c"), // should recreate state for "a" and return count as 1 and + CheckNewAnswer(("a", "1"), ("c", "1")) + ) + } + } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/state/RocksDBStateStoreIntegrationSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/state/RocksDBStateStoreIntegrationSuite.scala index d20cfb04f8e81..f170de66ee9df 100644 --- 
a/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/state/RocksDBStateStoreIntegrationSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/state/RocksDBStateStoreIntegrationSuite.scala @@ -31,7 +31,7 @@ import org.apache.spark.sql.streaming.OutputMode.Update import org.apache.spark.util.Utils class RocksDBStateStoreIntegrationSuite extends StreamTest - with AlsoTestWithChangelogCheckpointingEnabled { + with AlsoTestWithRocksDBFeatures { import testImplicits._ testWithColumnFamilies("RocksDBStateStore", diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/state/RocksDBStateStoreSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/state/RocksDBStateStoreSuite.scala index e1bd9dd38066b..72035ee268cbb 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/state/RocksDBStateStoreSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/state/RocksDBStateStoreSuite.scala @@ -42,7 +42,8 @@ import org.apache.spark.util.Utils @ExtendedSQLTest class RocksDBStateStoreSuite extends StateStoreSuiteBase[RocksDBStateStoreProvider] - with AlsoTestWithChangelogCheckpointingEnabled + with AlsoTestWithRocksDBFeatures + with AlsoTestWithEncodingTypes with SharedSparkSession with BeforeAndAfter { @@ -58,7 +59,7 @@ class RocksDBStateStoreSuite extends StateStoreSuiteBase[RocksDBStateStoreProvid import StateStoreTestsHelper._ - testWithColumnFamilies(s"version encoding", + testWithColumnFamiliesAndEncodingTypes(s"version encoding", TestWithBothChangelogCheckpointingEnabledAndDisabled) { colFamiliesEnabled => import RocksDBStateStoreProvider._ @@ -127,7 +128,7 @@ class RocksDBStateStoreSuite extends StateStoreSuiteBase[RocksDBStateStoreProvid } } - testWithColumnFamilies("rocksdb file manager metrics exposed", + testWithColumnFamiliesAndEncodingTypes("rocksdb file manager metrics exposed", TestWithBothChangelogCheckpointingEnabledAndDisabled) { 
colFamiliesEnabled => import RocksDBStateStoreProvider._ def getCustomMetric(metrics: StateStoreMetrics, @@ -162,7 +163,7 @@ class RocksDBStateStoreSuite extends StateStoreSuiteBase[RocksDBStateStoreProvid } } - testWithColumnFamilies("rocksdb range scan validation - invalid num columns", + testWithColumnFamiliesAndEncodingTypes("rocksdb range scan validation - invalid num columns", TestWithBothChangelogCheckpointingEnabledAndDisabled) { colFamiliesEnabled => // zero ordering cols val ex1 = intercept[SparkUnsupportedOperationException] { @@ -201,7 +202,7 @@ class RocksDBStateStoreSuite extends StateStoreSuiteBase[RocksDBStateStoreProvid ) } - testWithColumnFamilies("rocksdb range scan validation - variable sized columns", + testWithColumnFamiliesAndEncodingTypes("rocksdb range scan validation - variable sized columns", TestWithBothChangelogCheckpointingEnabledAndDisabled) { colFamiliesEnabled => val keySchemaWithVariableSizeCols: StructType = StructType( Seq(StructField("key1", StringType, false), StructField("key2", StringType, false))) @@ -224,7 +225,8 @@ class RocksDBStateStoreSuite extends StateStoreSuiteBase[RocksDBStateStoreProvid ) } - testWithColumnFamilies("rocksdb range scan validation - variable size data types unsupported", + testWithColumnFamiliesAndEncodingTypes( + "rocksdb range scan validation - variable size data types unsupported", TestWithBothChangelogCheckpointingEnabledAndDisabled) { colFamiliesEnabled => val keySchemaWithSomeUnsupportedTypeCols: StructType = StructType(Seq( StructField("key1", StringType, false), @@ -264,7 +266,7 @@ class RocksDBStateStoreSuite extends StateStoreSuiteBase[RocksDBStateStoreProvid } } - testWithColumnFamilies("rocksdb range scan validation - null type columns", + testWithColumnFamiliesAndEncodingTypes("rocksdb range scan validation - null type columns", TestWithBothChangelogCheckpointingEnabledAndDisabled) { colFamiliesEnabled => val keySchemaWithNullTypeCols: StructType = StructType( Seq(StructField("key1", 
NullType, false), StructField("key2", StringType, false))) @@ -287,7 +289,7 @@ class RocksDBStateStoreSuite extends StateStoreSuiteBase[RocksDBStateStoreProvid ) } - testWithColumnFamilies("rocksdb range scan - fixed size non-ordering columns", + testWithColumnFamiliesAndEncodingTypes("rocksdb range scan - fixed size non-ordering columns", TestWithBothChangelogCheckpointingEnabledAndDisabled) { colFamiliesEnabled => tryWithProviderResource(newStoreProvider(keySchemaWithRangeScan, @@ -339,7 +341,8 @@ class RocksDBStateStoreSuite extends StateStoreSuiteBase[RocksDBStateStoreProvid } } - testWithColumnFamilies("rocksdb range scan - variable size non-ordering columns with " + + testWithColumnFamiliesAndEncodingTypes( + "rocksdb range scan - variable size non-ordering columns with " + "double type values are supported", TestWithBothChangelogCheckpointingEnabledAndDisabled) { colFamiliesEnabled => @@ -395,7 +398,7 @@ class RocksDBStateStoreSuite extends StateStoreSuiteBase[RocksDBStateStoreProvid } } - testWithColumnFamilies("rocksdb range scan - variable size non-ordering columns", + testWithColumnFamiliesAndEncodingTypes("rocksdb range scan - variable size non-ordering columns", TestWithBothChangelogCheckpointingEnabledAndDisabled) { colFamiliesEnabled => tryWithProviderResource(newStoreProvider(keySchemaWithRangeScan, @@ -448,7 +451,8 @@ class RocksDBStateStoreSuite extends StateStoreSuiteBase[RocksDBStateStoreProvid } } - testWithColumnFamilies("rocksdb range scan multiple ordering columns - variable size " + + testWithColumnFamiliesAndEncodingTypes( + "rocksdb range scan multiple ordering columns - variable size " + s"non-ordering columns", TestWithBothChangelogCheckpointingEnabledAndDisabled) { colFamiliesEnabled => @@ -492,15 +496,16 @@ class RocksDBStateStoreSuite extends StateStoreSuiteBase[RocksDBStateStoreProvid } } - testWithColumnFamilies("rocksdb range scan multiple non-contiguous ordering columns", + testWithColumnFamiliesAndEncodingTypes( + "rocksdb range 
scan multiple non-contiguous ordering columns", TestWithBothChangelogCheckpointingEnabledAndDisabled ) { colFamiliesEnabled => val testSchema: StructType = StructType( Seq( - StructField("ordering-1", LongType, false), + StructField("ordering1", LongType, false), StructField("key2", StringType, false), - StructField("ordering-2", IntegerType, false), - StructField("string-2", StringType, false), - StructField("ordering-3", DoubleType, false) + StructField("ordering2", IntegerType, false), + StructField("string2", StringType, false), + StructField("ordering3", DoubleType, false) ) ) @@ -582,7 +587,8 @@ class RocksDBStateStoreSuite extends StateStoreSuiteBase[RocksDBStateStoreProvid } - testWithColumnFamilies("rocksdb range scan multiple ordering columns - variable size " + + testWithColumnFamiliesAndEncodingTypes( + "rocksdb range scan multiple ordering columns - variable size " + s"non-ordering columns with null values in first ordering column", TestWithBothChangelogCheckpointingEnabledAndDisabled) { colFamiliesEnabled => @@ -682,7 +688,8 @@ class RocksDBStateStoreSuite extends StateStoreSuiteBase[RocksDBStateStoreProvid } } - testWithColumnFamilies("rocksdb range scan multiple ordering columns - variable size " + + testWithColumnFamiliesAndEncodingTypes( + "rocksdb range scan multiple ordering columns - variable size " + s"non-ordering columns with null values in second ordering column", TestWithBothChangelogCheckpointingEnabledAndDisabled) { colFamiliesEnabled => @@ -735,7 +742,8 @@ class RocksDBStateStoreSuite extends StateStoreSuiteBase[RocksDBStateStoreProvid } } - testWithColumnFamilies("rocksdb range scan byte ordering column - variable size " + + testWithColumnFamiliesAndEncodingTypes( + "rocksdb range scan byte ordering column - variable size " + s"non-ordering columns", TestWithBothChangelogCheckpointingEnabledAndDisabled) { colFamiliesEnabled => @@ -779,7 +787,8 @@ class RocksDBStateStoreSuite extends StateStoreSuiteBase[RocksDBStateStoreProvid } } - 
testWithColumnFamilies("rocksdb range scan - ordering cols and key schema cols are same", + testWithColumnFamiliesAndEncodingTypes( + "rocksdb range scan - ordering cols and key schema cols are same", TestWithBothChangelogCheckpointingEnabledAndDisabled) { colFamiliesEnabled => // use the same schema as value schema for single col key schema @@ -821,7 +830,7 @@ class RocksDBStateStoreSuite extends StateStoreSuiteBase[RocksDBStateStoreProvid } } - testWithColumnFamilies("rocksdb range scan - with prefix scan", + testWithColumnFamiliesAndEncodingTypes("rocksdb range scan - with prefix scan", TestWithBothChangelogCheckpointingEnabledAndDisabled) { colFamiliesEnabled => tryWithProviderResource(newStoreProvider(keySchemaWithRangeScan, @@ -858,7 +867,8 @@ class RocksDBStateStoreSuite extends StateStoreSuiteBase[RocksDBStateStoreProvid } } - testWithColumnFamilies("rocksdb key and value schema encoders for column families", + testWithColumnFamiliesAndEncodingTypes( + "rocksdb key and value schema encoders for column families", TestWithBothChangelogCheckpointingEnabledAndDisabled) { colFamiliesEnabled => val testColFamily = "testState" @@ -919,7 +929,7 @@ class RocksDBStateStoreSuite extends StateStoreSuiteBase[RocksDBStateStoreProvid } /* Column family related tests */ - testWithColumnFamilies("column family creation with invalid names", + testWithColumnFamiliesAndEncodingTypes("column family creation with invalid names", TestWithBothChangelogCheckpointingEnabledAndDisabled) { colFamiliesEnabled => tryWithProviderResource( newStoreProvider(useColumnFamilies = colFamiliesEnabled)) { provider => @@ -956,7 +966,7 @@ class RocksDBStateStoreSuite extends StateStoreSuiteBase[RocksDBStateStoreProvid } } - testWithColumnFamilies(s"column family creation with reserved chars", + testWithColumnFamiliesAndEncodingTypes(s"column family creation with reserved chars", TestWithBothChangelogCheckpointingEnabledAndDisabled) { colFamiliesEnabled => tryWithProviderResource( 
newStoreProvider(useColumnFamilies = colFamiliesEnabled)) { provider => @@ -992,7 +1002,7 @@ class RocksDBStateStoreSuite extends StateStoreSuiteBase[RocksDBStateStoreProvid } } - testWithColumnFamilies(s"operations on absent column family", + testWithColumnFamiliesAndEncodingTypes(s"operations on absent column family", TestWithBothChangelogCheckpointingEnabledAndDisabled) { colFamiliesEnabled => tryWithProviderResource( newStoreProvider(useColumnFamilies = colFamiliesEnabled)) { provider => @@ -1145,7 +1155,7 @@ class RocksDBStateStoreSuite extends StateStoreSuiteBase[RocksDBStateStoreProvid Seq( NoPrefixKeyStateEncoderSpec(keySchema), PrefixKeyScanStateEncoderSpec(keySchema, 1) ).foreach { keyEncoder => - testWithColumnFamilies(s"validate rocksdb " + + testWithColumnFamiliesAndEncodingTypes(s"validate rocksdb " + s"${keyEncoder.getClass.toString.split('.').last} correctness", TestWithBothChangelogCheckpointingEnabledAndDisabled) { colFamiliesEnabled => tryWithProviderResource(newStoreProvider(keySchema, keyEncoder, diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/state/RocksDBSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/state/RocksDBSuite.scala index 637eb49130305..634a3c9de9011 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/state/RocksDBSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/state/RocksDBSuite.scala @@ -19,6 +19,7 @@ package org.apache.spark.sql.execution.streaming.state import java.io._ import java.nio.charset.Charset +import java.util.UUID import java.util.concurrent.Executors import scala.collection.mutable @@ -29,19 +30,27 @@ import scala.util.Random import org.apache.commons.io.FileUtils import org.apache.hadoop.conf.Configuration -import org.apache.hadoop.fs.Path +import org.apache.hadoop.fs.{FSDataInputStream, Path} import org.rocksdb.CompressionType import org.scalactic.source.Position +import 
org.scalatest.PrivateMethodTester import org.scalatest.Tag -import org.apache.spark.{SparkConf, SparkException, TaskContext} +import org.apache.spark.{SparkConf, SparkException, SparkFunSuite, TaskContext} +import org.apache.spark.internal.Logging +import org.apache.spark.io.CompressionCodec +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.catalyst.expressions.UnsafeProjection import org.apache.spark.sql.catalyst.util.quietly -import org.apache.spark.sql.execution.streaming.{CreateAtomicTestManager, FileSystemBasedCheckpointFileManager} +import org.apache.spark.sql.execution.streaming.{CheckpointFileManager, CreateAtomicTestManager, FileContextBasedCheckpointFileManager, FileSystemBasedCheckpointFileManager} import org.apache.spark.sql.execution.streaming.CheckpointFileManager.{CancellableFSDataOutputStream, RenameBasedFSDataOutputStream} import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.internal.SQLConf.STREAMING_CHECKPOINT_FILE_MANAGER_CLASS import org.apache.spark.sql.test.{SharedSparkSession, SQLTestUtils} +import org.apache.spark.sql.types._ import org.apache.spark.tags.SlowSQLTest +import org.apache.spark.unsafe.Platform +import org.apache.spark.unsafe.types.UTF8String import org.apache.spark.util.{ThreadUtils, Utils} import org.apache.spark.util.ArrayImplicits._ @@ -62,6 +71,17 @@ class NoOverwriteFileSystemBasedCheckpointFileManager(path: Path, hadoopConf: Co } } +class TestStateStoreChangelogWriterV101( + fm: CheckpointFileManager, + file: Path, + compressionCodec: CompressionCodec) + extends StateStoreChangelogWriterV1(fm, file, compressionCodec) { + + override def version: Short = 101 + + writeVersion() +} + trait RocksDBStateStoreChangelogCheckpointingTestUtil { val rocksdbChangelogCheckpointingConfKey: String = RocksDBConf.ROCKSDB_SQL_CONF_NAME_PREFIX + ".changelogCheckpointing.enabled" @@ -71,22 +91,41 @@ trait RocksDBStateStoreChangelogCheckpointingTestUtil { def snapshotVersionsPresent(dir: File): 
Seq[Long] = { dir.listFiles.filter(_.getName.endsWith(".zip")) - .map(_.getName.stripSuffix(".zip")) - .map(_.toLong) + .map(_.getName.stripSuffix(".zip").split("_")) + .map { + case Array(version, _) => version.toLong + case Array(version) => version.toLong + } .sorted .toImmutableArraySeq } def changelogVersionsPresent(dir: File): Seq[Long] = { dir.listFiles.filter(_.getName.endsWith(".changelog")) - .map(_.getName.stripSuffix(".changelog")) - .map(_.toLong) + .map(_.getName.stripSuffix(".changelog").split("_")) + .map { + case Array(version, _) => version.toLong + case Array(version) => version.toLong + } .sorted .toImmutableArraySeq } } -trait AlsoTestWithChangelogCheckpointingEnabled +trait AlsoTestWithEncodingTypes extends SQLTestUtils { + override protected def test(testName: String, testTags: Tag*)(testBody: => Any) + (implicit pos: Position): Unit = { + Seq("unsaferow", "avro").foreach { encoding => + super.test(s"$testName (encoding = $encoding)", testTags: _*) { + withSQLConf(SQLConf.STREAMING_STATE_STORE_ENCODING_FORMAT.key -> encoding) { + testBody + } + } + } + } +} + +trait AlsoTestWithRocksDBFeatures extends SQLTestUtils with RocksDBStateStoreChangelogCheckpointingTestUtil { sealed trait TestMode @@ -128,6 +167,35 @@ trait AlsoTestWithChangelogCheckpointingEnabled } } + def testWithRocksDBStateStore( + testName: String, testTags: Tag*)(testBody: => Any): Unit = { + super.test(testName, testTags: _*) { + // in case tests have any code that needs to execute before every test + super.beforeEach() + withSQLConf( + SQLConf.STATE_STORE_PROVIDER_CLASS.key -> classOf[RocksDBStateStoreProvider].getName) { + testBody + } + // in case tests have any code that needs to execute after every test + super.afterEach() + } + } + + def testWithColumnFamiliesAndEncodingTypes( + testName: String, + testMode: TestMode = TestWithBothChangelogCheckpointingEnabledAndDisabled) + (testBody: Boolean => Any): Unit = { + // For each encoding type + Seq("unsaferow", 
"avro").foreach { encoding => + // Call testWithColumnFamilies for each encoding + testWithColumnFamilies(s"$testName (encoding = $encoding)", testMode) { colFamiliesEnabled => + withSQLConf(SQLConf.STREAMING_STATE_STORE_ENCODING_FORMAT.key -> encoding) { + testBody(colFamiliesEnabled) + } + } + } + } + def testWithColumnFamilies( testName: String, testMode: TestMode, @@ -162,258 +230,585 @@ trait AlsoTestWithChangelogCheckpointingEnabled } } } -} -@SlowSQLTest -class RocksDBSuite extends AlsoTestWithChangelogCheckpointingEnabled with SharedSparkSession { + def testWithStateStoreCheckpointIdsAndColumnFamilies( + testName: String, + testMode: TestMode, + testTags: Tag*) + (testBody: (Boolean, Boolean) => Any): Unit = { + Seq(true, false).foreach { enableStateStoreCheckpointIds => + val newTestName = s"$testName - with enableStateStoreCheckpointIds = " + + s"$enableStateStoreCheckpointIds" + testWithColumnFamilies(newTestName, testMode, testTags: _*) { colFamiliesEnabled => + testBody(enableStateStoreCheckpointIds, colFamiliesEnabled) + } + } + } - override protected def sparkConf: SparkConf = { - super.sparkConf - .set(SQLConf.STATE_STORE_PROVIDER_CLASS, classOf[RocksDBStateStoreProvider].getName) + def testWithStateStoreCheckpointIds( + testName: String, + testTags: Tag*) + (testBody: Boolean => Any): Unit = { + Seq(true, false).foreach { enableStateStoreCheckpointIds => + val newTestName = s"$testName - with enableStateStoreCheckpointIds = " + + s"$enableStateStoreCheckpointIds" + test(newTestName, testTags: _*) { enableStateStoreCheckpointIds => + testBody(enableStateStoreCheckpointIds) + } + } } - testWithColumnFamilies( - "RocksDB: check changelog and snapshot version", - TestWithChangelogCheckpointingEnabled) { colFamiliesEnabled => - val remoteDir = Utils.createTempDir().toString - val conf = dbConf.copy(minDeltasForSnapshot = 1) - new File(remoteDir).delete() // to make sure that the directory gets created - for (version <- 0 to 49) { - withDB(remoteDir, 
version = version, conf = conf, - useColumnFamilies = colFamiliesEnabled) { db => - db.put(version.toString, version.toString) - db.commit() - if ((version + 1) % 5 == 0) db.doMaintenance() + def testWithStateStoreCheckpointIdsAndChangelogEnabled( + testName: String, + testTags: Tag*) + (testBody: Boolean => Any): Unit = { + Seq(true, false).foreach { enableStateStoreCheckpointIds => + val newTestName = s"$testName - with enableStateStoreCheckpointIds = " + + s"$enableStateStoreCheckpointIds" + testWithChangelogCheckpointingDisabled(newTestName, testTags: _*) { + enableStateStoreCheckpointIds => testBody(enableStateStoreCheckpointIds) } } + } +} - if (isChangelogCheckpointingEnabled) { - assert(changelogVersionsPresent(remoteDir) === (1 to 50)) - assert(snapshotVersionsPresent(remoteDir) === Range.inclusive(5, 50, 5)) - } else { - assert(changelogVersionsPresent(remoteDir) === Seq.empty) - assert(snapshotVersionsPresent(remoteDir) === (1 to 50)) +class OpenNumCountedTestInputStream(in: InputStream) extends FSDataInputStream(in) { + import OpenNumCountedTestInputStream._ + + addOpenStreams(this) + + override def close(): Unit = { + removeOpenStream(this) + super.close() + } +} + +class OpenStreamCountedTestFileManager(path: Path, hadoopConf: Configuration) + extends FileContextBasedCheckpointFileManager(path, hadoopConf) { + + override def open(path: Path): FSDataInputStream = { + val stream = new OpenNumCountedTestInputStream(super.open(path)) + stream + } +} + +object OpenNumCountedTestInputStream extends Logging { + private val openStreams = mutable.Map.empty[FSDataInputStream, Throwable] + + def addOpenStreams(stream: FSDataInputStream): Unit = openStreams.synchronized { + openStreams.put(stream, new Throwable()) + } + + def removeOpenStream(stream: FSDataInputStream): Unit = openStreams.synchronized { + openStreams.remove(stream) + } + + def clearOpenStreams(): Unit = openStreams.synchronized { + openStreams.clear() + } + + def assertNoOpenStreams(): Unit = 
openStreams.synchronized { + val numOpen = openStreams.values.size + if (numOpen > 0) { + for (exc <- openStreams.values) { + logWarning("Leaked filesystem connection created at:") + exc.printStackTrace() + } + throw new IllegalStateException(s"There are $numOpen possibly leaked file streams.", + openStreams.values.head) } } +} - testWithColumnFamilies(s"RocksDB: load version that doesn't exist", - TestWithBothChangelogCheckpointingEnabledAndDisabled) { - colFamiliesEnabled => - val provider = new RocksDBStateStoreProvider() - var ex = intercept[SparkException] { - provider.getStore(-1) +class RocksDBStateEncoderSuite extends SparkFunSuite { + + // Helper method to create test schemas + private def createTestSchemas() = { + val keySchema = StructType(Seq( + StructField("k1", IntegerType), + StructField("k2", LongType), + StructField("k3", DoubleType) + )) + val valueSchema = StructType(Seq( + StructField("v1", StringType), + StructField("v2", BooleanType) + )) + (keySchema, valueSchema) + } + + // Create encoders for different key encoding strategies + private def createTestEncoder(keyStateEncoderSpec: KeyStateEncoderSpec): RocksDBDataEncoder = { + val (keySchema, valueSchema) = createTestSchemas() + val stateSchemaInfo = Some(StateSchemaInfo( + keySchemaId = 0, + valueSchemaId = 0 + )) + new AvroStateEncoder(keyStateEncoderSpec, valueSchema, stateSchemaInfo) + } + + private def createNoPrefixKeyEncoder(): RocksDBDataEncoder = { + val (keySchema, _) = createTestSchemas() + createTestEncoder(NoPrefixKeyStateEncoderSpec(keySchema)) + } + + private def createPrefixKeyScanEncoder(): RocksDBDataEncoder = { + val (keySchema, _) = createTestSchemas() + createTestEncoder(PrefixKeyScanStateEncoderSpec(keySchema, numColsPrefixKey = 2)) + } + + private def createRangeKeyScanEncoder(): RocksDBDataEncoder = { + val (keySchema, _) = createTestSchemas() + createTestEncoder(RangeKeyScanStateEncoderSpec(keySchema, orderingOrdinals = Seq(0, 1))) + } + + test("verify schema ID 
handling in prefix and range scan key encoding") { + val keySchema = StructType(Seq( + StructField("k1", IntegerType), + StructField("k2", LongType), + StructField("k3", DoubleType) + )) + val valueSchema = StructType(Seq( + StructField("v1", StringType) + )) + + // Create test row with some data + val keyProj = UnsafeProjection.create(keySchema) + val fullKeyRow = keyProj.apply(InternalRow(42, 123L, 3.14)) + + // Test prefix scan encoding with schema evolution + withClue("Testing prefix scan encoding: ") { + val prefixKeySpec = PrefixKeyScanStateEncoderSpec(keySchema, numColsPrefixKey = 2) + val stateSchemaInfo = Some(StateSchemaInfo(keySchemaId = 42, valueSchemaId = 0)) + val encoder = new AvroStateEncoder(prefixKeySpec, valueSchema, stateSchemaInfo) + + // Then encode just the remaining key portion (which should include schema ID) + val remainingKeyRow = keyProj.apply(InternalRow(null, null, 3.14)) + val encodedRemainingKey = encoder.encodeRemainingKey(remainingKeyRow) + + // Verify schema ID in remaining key bytes + val decodedSchemaIdRow = encoder.decodeStateSchemaIdRow(encodedRemainingKey) + assert(decodedSchemaIdRow.schemaId === 42, + "Schema ID not preserved in prefix scan remaining key encoding") + } + + // Test range scan encoding with schema evolution + withClue("Testing range scan encoding: ") { + val rangeScanSpec = RangeKeyScanStateEncoderSpec(keySchema, orderingOrdinals = Seq(0, 1)) + val stateSchemaInfo = Some(StateSchemaInfo(keySchemaId = 24, valueSchemaId = 0)) + val encoder = new AvroStateEncoder(rangeScanSpec, valueSchema, stateSchemaInfo) + + // Encode remaining key (non-ordering columns) + // For range scan, the remaining key schema only contains columns NOT in orderingOrdinals + val remainingKeySchema = StructType(Seq( + StructField("k3", DoubleType) // Only the non-ordering column + )) + val remainingKeyProj = UnsafeProjection.create(remainingKeySchema) + val remainingKeyRow = remainingKeyProj.apply(InternalRow(3.14)) + val 
encodedRemainingKey = encoder.encodeRemainingKey(remainingKeyRow) + + // Verify schema ID in remaining key bytes + val decodedSchemaIdRow = encoder.decodeStateSchemaIdRow(encodedRemainingKey) + assert(decodedSchemaIdRow.schemaId === 24, + "Schema ID not preserved in range scan remaining key encoding") + + // Verify we can decode the remaining key correctly + // The decoded row should only have the non-ordering column (k3) + val decodedRemainingKey = encoder.decodeRemainingKey(encodedRemainingKey) + assert(decodedRemainingKey.getDouble(0) === 3.14, + "Data not preserved in range scan remaining key encoding") + + // Test the range scan key portion (ordering columns) + val rangeScanKeySchema = StructType(Seq( + StructField("k1", IntegerType), + StructField("k2", LongType) + )) + val rangeScanProj = UnsafeProjection.create(rangeScanKeySchema) + val rangeScanRow = rangeScanProj.apply(InternalRow(42, 123L)) + val encodedRangeScan = encoder.encodePrefixKeyForRangeScan(rangeScanRow) + + // Range scan portion should not have schema ID since it uses special encoding + val decodedRangeScan = encoder.decodePrefixKeyForRangeScan(encodedRangeScan) + assert(decodedRangeScan.getInt(0) === 42) + assert(decodedRangeScan.getLong(1) === 123L) } - checkError( - ex, - condition = "CANNOT_LOAD_STATE_STORE.UNEXPECTED_VERSION", - parameters = Map("version" -> "-1") + } + + test("verify schema ID preservation through encode/decode cycle") { + val encoders = Seq( + ("NoPrefixKey", createNoPrefixKeyEncoder()), + ("PrefixKeyScan", createPrefixKeyScanEncoder()), + ("RangeKeyScan", createRangeKeyScanEncoder()) ) - ex = intercept[SparkException] { - provider.getReadStore(-1) - } - checkError( - ex, - condition = "CANNOT_LOAD_STATE_STORE.UNEXPECTED_VERSION", - parameters = Map("version" -> "-1") + + // Test a range of schema IDs including edge cases + val testSchemaIds = Seq[Short]( + 0, // Min value + 1, // Common case + 42, // Arbitrary value + -1, // Negative value + Short.MaxValue, // Max 
positive + Short.MinValue // Max negative ) - val remoteDir = Utils.createTempDir().toString - new File(remoteDir).delete() // to make sure that the directory gets created - withDB(remoteDir, useColumnFamilies = colFamiliesEnabled) { db => - ex = intercept[SparkException] { - db.load(1) + encoders.foreach { case (encoderType, encoder) => + testSchemaIds.foreach { schemaId => + withClue(s"Testing $encoderType encoder with schema ID $schemaId: ") { + val testData = Array[Byte](1, 2, 3, 4) + val schemaIdRow = StateSchemaIdRow(schemaId, testData) + + // Encode the row + val encoded = encoder.encodeWithStateSchemaId(schemaIdRow) + + // Verify schema ID directly in encoded bytes + val encodedSchemaId = Platform.getShort(encoded, Platform.BYTE_ARRAY_OFFSET) + assert(encodedSchemaId === schemaId, + s"Schema ID mismatch in encoded bytes: expected $schemaId but got $encodedSchemaId") + + // Decode and verify + val decoded = encoder.decodeStateSchemaIdRow(encoded) + assert(decoded.schemaId === schemaId, + s"Schema ID mismatch after decode: expected $schemaId but got ${decoded.schemaId}") + + // Also verify data wasn't corrupted + assert(decoded.bytes === testData, + "Data corruption detected in encode/decode cycle") + } } - checkError( - ex, - condition = "CANNOT_LOAD_STATE_STORE.CANNOT_READ_STREAMING_STATE_FILE", - parameters = Map( - "fileToRead" -> s"$remoteDir/1.changelog" - ) - ) } } - testWithColumnFamilies( - "RocksDB: purge changelog and snapshots with minVersionsToDelete = 0", - TestWithChangelogCheckpointingEnabled) { colFamiliesEnabled => - val remoteDir = Utils.createTempDir().toString - new File(remoteDir).delete() // to make sure that the directory gets created - val conf = dbConf.copy(enableChangelogCheckpointing = true, - minVersionsToRetain = 3, minDeltasForSnapshot = 1, minVersionsToDelete = 0) - withDB(remoteDir, conf = conf, useColumnFamilies = colFamiliesEnabled) { db => - db.load(0) - db.commit() - for (version <- 1 to 2) { - db.load(version) - 
db.commit() - db.doMaintenance() - } - assert(snapshotVersionsPresent(remoteDir) === Seq(2, 3)) - assert(changelogVersionsPresent(remoteDir) == Seq(1, 2, 3)) + test("verify schema ID handling in single value encoder") { + val keySchema = StructType(Seq( + StructField("k1", IntegerType) + )) + val valueSchema = StructType(Seq( + StructField("v1", StringType), + StructField("v2", IntegerType), + StructField("v3", BooleanType) + )) - for (version <- 3 to 4) { - db.load(version) - db.commit() - } - assert(snapshotVersionsPresent(remoteDir) === Seq(2, 3)) - assert(changelogVersionsPresent(remoteDir) == (1 to 5)) - db.doMaintenance() - // 3 is the latest snapshot <= maxSnapshotVersionPresent - minVersionsToRetain + 1 - assert(snapshotVersionsPresent(remoteDir) === Seq(3, 5)) - assert(changelogVersionsPresent(remoteDir) == (3 to 5)) + val valueProj = UnsafeProjection.create(valueSchema) + val value = valueProj.apply(InternalRow(UTF8String.fromString("hello"), 42, true)) - for (version <- 5 to 7) { - db.load(version) - db.commit() - } - assert(snapshotVersionsPresent(remoteDir) === Seq(3, 5)) - assert(changelogVersionsPresent(remoteDir) == (3 to 8)) - db.doMaintenance() - // 5 is the latest snapshot <= maxSnapshotVersionPresent - minVersionsToRetain + 1 - assert(snapshotVersionsPresent(remoteDir) === Seq(5, 8)) - assert(changelogVersionsPresent(remoteDir) == (5 to 8)) + withClue("Testing single value encoder: ") { + val keySpec = NoPrefixKeyStateEncoderSpec(keySchema) + val stateSchemaInfo = Some(StateSchemaInfo(keySchemaId = 0, valueSchemaId = 42)) + val avroEncoder = new AvroStateEncoder(keySpec, valueSchema, stateSchemaInfo) + val valueEncoder = new SingleValueStateEncoder(avroEncoder, valueSchema) + + // Encode value + val encodedValue = valueEncoder.encodeValue(value) + + // Verify schema ID was included and preserved + val decodedSchemaIdRow = avroEncoder.decodeStateSchemaIdRow(encodedValue) + assert(decodedSchemaIdRow.schemaId === 42, + "Schema ID not preserved in 
single value encoding") + + // Verify value was preserved + val decodedValue = valueEncoder.decodeValue(encodedValue) + assert(decodedValue.getString(0) === "hello") + assert(decodedValue.getInt(1) === 42) + assert(decodedValue.getBoolean(2) === true) } } +} - testWithColumnFamilies( - "RocksDB: purge version files with minVersionsToDelete > 0", - TestWithBothChangelogCheckpointingEnabledAndDisabled) { colFamiliesEnabled => - val remoteDir = Utils.createTempDir().toString - new File(remoteDir).delete() // to make sure that the directory gets created - val conf = dbConf.copy( - minVersionsToRetain = 3, minDeltasForSnapshot = 1, minVersionsToDelete = 3) - withDB(remoteDir, conf = conf, useColumnFamilies = colFamiliesEnabled) { db => - // Commit 5 versions - // stale versions: (1, 2) - // keep versions: (3, 4, 5) - for (version <- 0 to 4) { - // Should upload latest snapshot but not delete any files - // since number of stale versions < minVersionsToDelete - db.load(version) - db.commit() - db.doMaintenance() - } +@SlowSQLTest +class RocksDBSuite extends AlsoTestWithRocksDBFeatures with SharedSparkSession + with PrivateMethodTester { - // Commit 1 more version - // stale versions: (1, 2, 3) - // keep versions: (4, 5, 6) - db.load(5) - db.commit() + override protected def sparkConf: SparkConf = { + super.sparkConf + .set(SQLConf.STATE_STORE_PROVIDER_CLASS, classOf[RocksDBStateStoreProvider].getName) + } + + // In each test we verify opened streams are all closed + private def hadoopConf: Configuration = { + val fmClass = "org.apache.spark.sql.execution.streaming.state." 
+ + "OpenStreamCountedTestFileManager" + val hadoopConf = new Configuration() + hadoopConf.set(STREAMING_CHECKPOINT_FILE_MANAGER_CLASS.parent.key, fmClass) + hadoopConf + } + + override def beforeEach(): Unit = { + OpenNumCountedTestInputStream.clearOpenStreams() + } + + override def afterEach(): Unit = { + eventually(timeout(10.seconds), interval(2.seconds)) { + OpenNumCountedTestInputStream.assertNoOpenStreams() + } + } + + testWithStateStoreCheckpointIdsAndColumnFamilies("RocksDB: check changelog and snapshot version", + TestWithChangelogCheckpointingEnabled) { + case (enableStateStoreCheckpointIds, colFamiliesEnabled) => + val remoteDir = Utils.createTempDir().toString + val conf = dbConf.copy(minDeltasForSnapshot = 1) + new File(remoteDir).delete() // to make sure that the directory gets created + val versionToUniqueId = new mutable.HashMap[Long, String]() + withDB(remoteDir, conf = conf, + useColumnFamilies = colFamiliesEnabled, + enableStateStoreCheckpointIds = enableStateStoreCheckpointIds, + versionToUniqueId = versionToUniqueId) { db => + for (version <- 0 to 49) { + db.load(version, versionToUniqueId.get(version)) + db.put(version.toString, version.toString) + db.commit() + if ((version + 1) % 5 == 0) db.doMaintenance() + } + } - // Checkpoint directory before maintenance if (isChangelogCheckpointingEnabled) { - assert(snapshotVersionsPresent(remoteDir) == (1 to 5)) - assert(changelogVersionsPresent(remoteDir) == (1 to 6)) + assert(changelogVersionsPresent(remoteDir) === (1 to 50)) + assert(snapshotVersionsPresent(remoteDir) === Range.inclusive(5, 50, 5)) } else { - assert(snapshotVersionsPresent(remoteDir) == (1 to 6)) + assert(changelogVersionsPresent(remoteDir) === Seq.empty) + assert(snapshotVersionsPresent(remoteDir) === (1 to 50)) } + } - // Should delete stale versions for zip files and change log files - // since number of stale versions >= minVersionsToDelete - db.doMaintenance() + testWithStateStoreCheckpointIdsAndColumnFamilies(s"RocksDB: load 
version that doesn't exist", + TestWithBothChangelogCheckpointingEnabledAndDisabled) { + case (enableStateStoreCheckpointIds, colFamiliesEnabled) => + val provider = new RocksDBStateStoreProvider() + var ex = intercept[SparkException] { + provider.getStore(-1) + } + checkError( + ex, + condition = "CANNOT_LOAD_STATE_STORE.UNEXPECTED_VERSION", + parameters = Map("version" -> "-1") + ) + ex = intercept[SparkException] { + provider.getReadStore(-1) + } + checkError( + ex, + condition = "CANNOT_LOAD_STATE_STORE.UNEXPECTED_VERSION", + parameters = Map("version" -> "-1") + ) - // Checkpoint directory after maintenance - assert(snapshotVersionsPresent(remoteDir) == Seq(4, 5, 6)) - if (isChangelogCheckpointingEnabled) { - assert(changelogVersionsPresent(remoteDir) == Seq(4, 5, 6)) + val remoteDir = Utils.createTempDir().toString + new File(remoteDir).delete() // to make sure that the directory gets created + val versionToUniqueId = new mutable.HashMap[Long, String]() + withDB(remoteDir, useColumnFamilies = colFamiliesEnabled, + enableStateStoreCheckpointIds = enableStateStoreCheckpointIds, + versionToUniqueId = versionToUniqueId) { db => + ex = intercept[SparkException] { + db.load(1, versionToUniqueId.get(1)) + } + checkError( + ex, + condition = "CANNOT_LOAD_STATE_STORE.CANNOT_READ_STREAMING_STATE_FILE", + parameters = Map( + "fileToRead" -> s"$remoteDir/1.changelog" + ) + ) } - } } - testWithColumnFamilies( - "RocksDB: minDeltasForSnapshot", - TestWithChangelogCheckpointingEnabled) { colFamiliesEnabled => - val remoteDir = Utils.createTempDir().toString - new File(remoteDir).delete() // to make sure that the directory gets created - val conf = dbConf.copy(enableChangelogCheckpointing = true, minDeltasForSnapshot = 3) - withDB(remoteDir, conf = conf, useColumnFamilies = colFamiliesEnabled) { db => - for (version <- 0 to 1) { - db.load(version) + testWithStateStoreCheckpointIdsAndColumnFamilies( + "RocksDB: purge changelog and snapshots with minVersionsToDelete = 0", + 
TestWithChangelogCheckpointingEnabled) { + case (enableStateStoreCheckpointIds, colFamiliesEnabled) => + val remoteDir = Utils.createTempDir().toString + new File(remoteDir).delete() // to make sure that the directory gets created + val conf = dbConf.copy(enableChangelogCheckpointing = true, + minVersionsToRetain = 3, minDeltasForSnapshot = 1, minVersionsToDelete = 0) + withDB(remoteDir, conf = conf, useColumnFamilies = colFamiliesEnabled, + enableStateStoreCheckpointIds = enableStateStoreCheckpointIds) { db => + db.load(0) db.commit() + for (version <- 1 to 2) { + db.load(version) + db.commit() + db.doMaintenance() + } + assert(snapshotVersionsPresent(remoteDir) === Seq(2, 3)) + assert(changelogVersionsPresent(remoteDir) == Seq(1, 2, 3)) + + for (version <- 3 to 4) { + db.load(version) + db.commit() + } + assert(snapshotVersionsPresent(remoteDir) === Seq(2, 3)) + assert(changelogVersionsPresent(remoteDir) == (1 to 5)) db.doMaintenance() + // 3 is the latest snapshot <= maxSnapshotVersionPresent - minVersionsToRetain + 1 + assert(snapshotVersionsPresent(remoteDir) === Seq(3, 5)) + assert(changelogVersionsPresent(remoteDir) == (3 to 5)) + + for (version <- 5 to 7) { + db.load(version) + db.commit() + } + assert(snapshotVersionsPresent(remoteDir) === Seq(3, 5)) + assert(changelogVersionsPresent(remoteDir) == (3 to 8)) + db.doMaintenance() + // 5 is the latest snapshot <= maxSnapshotVersionPresent - minVersionsToRetain + 1 + assert(snapshotVersionsPresent(remoteDir) === Seq(5, 8)) + assert(changelogVersionsPresent(remoteDir) == (5 to 8)) } - // Snapshot should not be created because minDeltasForSnapshot = 3 - assert(snapshotVersionsPresent(remoteDir) === Seq.empty) - assert(changelogVersionsPresent(remoteDir) == Seq(1, 2)) - db.load(2) - db.commit() - db.doMaintenance() - assert(snapshotVersionsPresent(remoteDir) === Seq(3)) - db.load(3) + } + + testWithStateStoreCheckpointIdsAndColumnFamilies( + "RocksDB: purge version files with minVersionsToDelete > 0", + 
TestWithBothChangelogCheckpointingEnabledAndDisabled) { + case (enableStateStoreCheckpointIds, colFamiliesEnabled) => + val remoteDir = Utils.createTempDir().toString + new File(remoteDir).delete() // to make sure that the directory gets created + val conf = dbConf.copy( + minVersionsToRetain = 3, minDeltasForSnapshot = 1, minVersionsToDelete = 3) + withDB(remoteDir, conf = conf, useColumnFamilies = colFamiliesEnabled, + enableStateStoreCheckpointIds = enableStateStoreCheckpointIds) { db => + // Commit 5 versions + // stale versions: (1, 2) + // keep versions: (3, 4, 5) + for (version <- 0 to 4) { + // Should upload latest snapshot but not delete any files + // since number of stale versions < minVersionsToDelete + db.load(version) + db.commit() + db.doMaintenance() + } - for (version <- 3 to 7) { - db.load(version) + // Commit 1 more version + // stale versions: (1, 2, 3) + // keep versions: (4, 5, 6) + db.load(5) db.commit() + + // Checkpoint directory before maintenance + if (isChangelogCheckpointingEnabled) { + assert(snapshotVersionsPresent(remoteDir) == (1 to 5)) + assert(changelogVersionsPresent(remoteDir) == (1 to 6)) + } else { + assert(snapshotVersionsPresent(remoteDir) == (1 to 6)) + } + + // Should delete stale versions for zip files and change log files + // since number of stale versions >= minVersionsToDelete db.doMaintenance() + + // Checkpoint directory after maintenance + assert(snapshotVersionsPresent(remoteDir) == Seq(4, 5, 6)) + if (isChangelogCheckpointingEnabled) { + assert(changelogVersionsPresent(remoteDir) == Seq(4, 5, 6)) + } } - assert(snapshotVersionsPresent(remoteDir) === Seq(3, 6)) - for (version <- 8 to 17) { - db.load(version) + } + + testWithStateStoreCheckpointIdsAndColumnFamilies( + "RocksDB: minDeltasForSnapshot", + TestWithChangelogCheckpointingEnabled) { + case (enableStateStoreCheckpointIds, colFamiliesEnabled) => + val remoteDir = Utils.createTempDir().toString + new File(remoteDir).delete() // to make sure that the 
directory gets created + val conf = dbConf.copy(enableChangelogCheckpointing = true, minDeltasForSnapshot = 3) + val versionToUniqueId = new mutable.HashMap[Long, String]() + withDB(remoteDir, conf = conf, useColumnFamilies = colFamiliesEnabled, + enableStateStoreCheckpointIds = enableStateStoreCheckpointIds, + versionToUniqueId = versionToUniqueId) { db => + for (version <- 0 to 1) { + db.load(version, versionToUniqueId.get(version)) + db.commit() + db.doMaintenance() + } + // Snapshot should not be created because minDeltasForSnapshot = 3 + assert(snapshotVersionsPresent(remoteDir) === Seq.empty) + assert(changelogVersionsPresent(remoteDir) == Seq(1, 2)) + db.load(2, versionToUniqueId.get(2)) db.commit() - } - db.doMaintenance() - assert(snapshotVersionsPresent(remoteDir) === Seq(3, 6, 18)) - } + db.doMaintenance() + assert(snapshotVersionsPresent(remoteDir) === Seq(3)) + db.load(3, versionToUniqueId.get(3)) - // pick up from the last snapshot and the next upload will be for version 21 - withDB(remoteDir, conf = conf) { db => - db.load(18) - db.commit() - db.doMaintenance() - assert(snapshotVersionsPresent(remoteDir) === Seq(3, 6, 18)) + for (version <- 3 to 7) { + db.load(version, versionToUniqueId.get(version)) + db.commit() + db.doMaintenance() + } + assert(snapshotVersionsPresent(remoteDir) === Seq(3, 6)) + for (version <- 8 to 17) { + db.load(version, versionToUniqueId.get(version)) + db.commit() + } + db.doMaintenance() + assert(snapshotVersionsPresent(remoteDir) === Seq(3, 6, 18)) + } - for (version <- 19 to 20) { - db.load(version) + // pick up from the last snapshot and the next upload will be for version 21 + withDB(remoteDir, conf = conf, useColumnFamilies = colFamiliesEnabled, + enableStateStoreCheckpointIds = enableStateStoreCheckpointIds, + versionToUniqueId = versionToUniqueId) { db => + db.load(18, versionToUniqueId.get(18)) db.commit() + db.doMaintenance() + assert(snapshotVersionsPresent(remoteDir) === Seq(3, 6, 18)) + + for (version <- 19 to 
20) { + db.load(version, versionToUniqueId.get(version)) + db.commit() + } + db.doMaintenance() + assert(snapshotVersionsPresent(remoteDir) === Seq(3, 6, 18, 21)) } - db.doMaintenance() - assert(snapshotVersionsPresent(remoteDir) === Seq(3, 6, 18, 21)) - } } - testWithColumnFamilies("SPARK-45419: Do not reuse SST files" + + testWithStateStoreCheckpointIdsAndColumnFamilies("SPARK-45419: Do not reuse SST files" + " in different RocksDB instances", - TestWithChangelogCheckpointingEnabled) { colFamiliesEnabled => - val remoteDir = Utils.createTempDir().toString - val conf = dbConf.copy(minDeltasForSnapshot = 0, compactOnCommit = false) - new File(remoteDir).delete() // to make sure that the directory gets created - withDB(remoteDir, conf = conf, useColumnFamilies = colFamiliesEnabled) { db => - for (version <- 0 to 2) { - db.load(version) - db.put(version.toString, version.toString) - db.commit() - } - // upload snapshot 3.zip - db.doMaintenance() - // Roll back to version 1 and start to process data. - for (version <- 1 to 3) { - db.load(version) - db.put(version.toString, version.toString) - db.commit() + TestWithChangelogCheckpointingEnabled) { + case (enableStateStoreCheckpointIds, colFamiliesEnabled) => + val remoteDir = Utils.createTempDir().toString + val conf = dbConf.copy(minDeltasForSnapshot = 0, compactOnCommit = false) + new File(remoteDir).delete() // to make sure that the directory gets created + val versionToUniqueId = new mutable.HashMap[Long, String]() + withDB(remoteDir, conf = conf, useColumnFamilies = colFamiliesEnabled, + enableStateStoreCheckpointIds = enableStateStoreCheckpointIds, + versionToUniqueId = versionToUniqueId) { db => + for (version <- 0 to 2) { + db.load(version, versionToUniqueId.get(version)) + db.put(version.toString, version.toString) + db.commit() + } + // upload snapshot 3.zip + db.doMaintenance() + // Roll back to version 1 and start to process data. 
+ for (version <- 1 to 3) { + db.load(version, versionToUniqueId.get(version)) + db.put(version.toString, version.toString) + db.commit() + } + // Upload snapshot 4.zip, should not reuse the SST files in 3.zip + db.doMaintenance() } - withDB(remoteDir, conf = conf, useColumnFamilies = colFamiliesEnabled) { db => - // Open the db to verify that the state in 4.zip is no corrupted. - db.load(4) - } + withDB(remoteDir, conf = conf, useColumnFamilies = colFamiliesEnabled, + enableStateStoreCheckpointIds = enableStateStoreCheckpointIds, + versionToUniqueId = versionToUniqueId) { db => + // Open the db to verify that the state in 4.zip is not corrupted. + db.load(4, versionToUniqueId.get(4)) + } } // A rocksdb instance with changelog checkpointing enabled should be able to load // an existing checkpoint without changelog. - testWithColumnFamilies( + testWithStateStoreCheckpointIdsAndColumnFamilies( "RocksDB: changelog checkpointing backward compatibility", - TestWithChangelogCheckpointingEnabled) { colFamiliesEnabled => + TestWithChangelogCheckpointingEnabled) { (enableStateStoreCheckpointIds, colFamiliesEnabled) => val remoteDir = Utils.createTempDir().toString new File(remoteDir).delete() // to make sure that the directory gets created val disableChangelogCheckpointingConf = dbConf.copy(enableChangelogCheckpointing = false, minVersionsToRetain = 30) + val versionToUniqueId = new mutable.HashMap[Long, String]() withDB(remoteDir, conf = disableChangelogCheckpointingConf, - useColumnFamilies = colFamiliesEnabled) { db => + useColumnFamilies = colFamiliesEnabled, + enableStateStoreCheckpointIds = enableStateStoreCheckpointIds, + versionToUniqueId = versionToUniqueId) { db => for (version <- 1 to 30) { - db.load(version - 1) + db.load(version - 1, versionToUniqueId.get(version - 1)) db.put(version.toString, version.toString) db.remove((version - 1).toString) db.commit() @@ -427,13 
+822,15 @@ class RocksDBSuite extends AlsoTestWithChangelogCheckpointingEnabled with Shared dbConf.copy(enableChangelogCheckpointing = true, minVersionsToRetain = 30, minDeltasForSnapshot = 1) withDB(remoteDir, conf = enableChangelogCheckpointingConf, - useColumnFamilies = colFamiliesEnabled) { db => + useColumnFamilies = colFamiliesEnabled, + enableStateStoreCheckpointIds = enableStateStoreCheckpointIds, + versionToUniqueId = versionToUniqueId) { db => for (version <- 1 to 30) { - db.load(version) + db.load(version, versionToUniqueId.get(version)) assert(db.iterator().map(toStr).toSet === Set((version.toString, version.toString))) } for (version <- 30 to 60) { - db.load(version - 1) + db.load(version - 1, versionToUniqueId.get(version - 1)) db.put(version.toString, version.toString) db.remove((version - 1).toString) db.commit() @@ -441,13 +838,13 @@ class RocksDBSuite extends AlsoTestWithChangelogCheckpointingEnabled with Shared assert(snapshotVersionsPresent(remoteDir) === (1 to 30)) assert(changelogVersionsPresent(remoteDir) === (30 to 60)) for (version <- 1 to 60) { - db.load(version, readOnly = true) + db.load(version, versionToUniqueId.get(version), readOnly = true) assert(db.iterator().map(toStr).toSet === Set((version.toString, version.toString))) } // recommit 60 to ensure that acquireLock is released for maintenance for (version <- 60 to 60) { - db.load(version - 1) + db.load(version - 1, versionToUniqueId.get(version - 1)) db.put(version.toString, version.toString) db.remove((version - 1).toString) db.commit() @@ -455,29 +852,142 @@ class RocksDBSuite extends AlsoTestWithChangelogCheckpointingEnabled with Shared // Check that snapshots and changelogs get purged correctly. 
db.doMaintenance() assert(snapshotVersionsPresent(remoteDir) === Seq(30, 60)) - assert(changelogVersionsPresent(remoteDir) === (30 to 60)) + if (enableStateStoreCheckpointIds) { + // recommit version 60 creates another changelog file with different unique id + assert(changelogVersionsPresent(remoteDir) === (30 to 60) :+ 60) + } else { + assert(changelogVersionsPresent(remoteDir) === (30 to 60)) + } + // Verify the content of retained versions. for (version <- 30 to 60) { - db.load(version, readOnly = true) + db.load(version, versionToUniqueId.get(version), readOnly = true) assert(db.iterator().map(toStr).toSet === Set((version.toString, version.toString))) } } } + testWithChangelogCheckpointingEnabled("RocksDB Fault Tolerance: correctly handle when there " + + "are multiple snapshot files for the same version") { + val enableStateStoreCheckpointIds = true + val useColumnFamily = true + val remoteDir = Utils.createTempDir().toString + new File(remoteDir).delete() // to make sure that the directory gets created + val enableChangelogCheckpointingConf = + dbConf.copy(enableChangelogCheckpointing = true, minVersionsToRetain = 20, + minDeltasForSnapshot = 3) + + // Simulate when there are multiple snapshot files for the same version + // The first DB writes to version 0 with uniqueId + val versionToUniqueId1 = new mutable.HashMap[Long, String]() + withDB(remoteDir, conf = enableChangelogCheckpointingConf, + useColumnFamilies = useColumnFamily, + enableStateStoreCheckpointIds = enableStateStoreCheckpointIds, + versionToUniqueId = versionToUniqueId1) { db => + db.load(0, versionToUniqueId1.get(0)) + db.put("a", "1") // write key a here + db.commit() + + // Add some change log files after the snapshot + for (version <- 2 to 5) { + db.load(version - 1, versionToUniqueId1.get(version - 1)) + db.put(version.toString, version.toString) // update "1" -> "1", "2" -> "2", ... 
+ db.commit() + } + + // doMaintenance uploads the snapshot + db.doMaintenance() + + for (version <- 6 to 10) { + db.load(version - 1, versionToUniqueId1.get(version - 1)) + db.put(version.toString, version.toString) + db.commit() + } + } + + // versionToUniqueId1 should be non-empty, meaning the id is updated from rocksDB to the map + assert(versionToUniqueId1.nonEmpty) + + // The second DB writes to version 0 with another uniqueId + val versionToUniqueId2 = new mutable.HashMap[Long, String]() + withDB(remoteDir, conf = enableChangelogCheckpointingConf, + useColumnFamilies = useColumnFamily, + enableStateStoreCheckpointIds = enableStateStoreCheckpointIds, + versionToUniqueId = versionToUniqueId2) { db => + db.load(0, versionToUniqueId2.get(0)) + db.put("b", "2") // write key b here + db.commit() + // Add some change log files after the snapshot + for (version <- 2 to 5) { + db.load(version - 1, versionToUniqueId2.get(version - 1)) + db.put(version.toString, (version + 1).toString) // update "1" -> "2", "2" -> "3", ... 
+ db.commit() + } + + // doMaintenance uploads the snapshot + db.doMaintenance() + + for (version <- 6 to 10) { + db.load(version - 1, versionToUniqueId2.get(version - 1)) + db.put(version.toString, (version + 1).toString) + db.commit() + } + } + + // versionToUniqueId2 should be non-empty, meaning the id is updated from rocksDB to the map + assert(versionToUniqueId2.nonEmpty) + + // During a load() with linage from the first rocksDB, + // the DB should load with data in the first db + withDB(remoteDir, conf = enableChangelogCheckpointingConf, + useColumnFamilies = useColumnFamily, + enableStateStoreCheckpointIds = enableStateStoreCheckpointIds, + versionToUniqueId = versionToUniqueId1) { db => + db.load(10, versionToUniqueId1.get(10)) + assert(toStr(db.get("a")) === "1") + for (version <- 2 to 10) { + // first time we write version -> version + // second time we write version -> version + 1 + // here since we are loading from the first db lineage, we should see version -> version + assert(toStr(db.get(version.toString)) === version.toString) + } + } + + // During a load() with linage from the second rocksDB, + // the DB should load with data in the second db + withDB(remoteDir, conf = enableChangelogCheckpointingConf, + useColumnFamilies = useColumnFamily, + enableStateStoreCheckpointIds = enableStateStoreCheckpointIds, + versionToUniqueId = versionToUniqueId2) { db => + db.load(10, versionToUniqueId2.get(10)) + assert(toStr(db.get("b")) === "2") + for (version <- 2 to 10) { + // first time we write version -> version + // second time we write version -> version + 1 + // here since we are loading from the second db lineage, + // we should see version -> version + 1 + assert(toStr(db.get(version.toString)) === (version + 1).toString) + } + } + } + // A rocksdb instance with changelog checkpointing disabled should be able to load // an existing checkpoint with changelog. 
- testWithColumnFamilies( + testWithStateStoreCheckpointIdsAndColumnFamilies( "RocksDB: changelog checkpointing forward compatibility", - TestWithChangelogCheckpointingEnabled) { colFamiliesEnabled => + TestWithChangelogCheckpointingEnabled) { (enableStateStoreCheckpointIds, colFamiliesEnabled) => val remoteDir = Utils.createTempDir().toString new File(remoteDir).delete() // to make sure that the directory gets created val enableChangelogCheckpointingConf = dbConf.copy(enableChangelogCheckpointing = true, minVersionsToRetain = 20, minDeltasForSnapshot = 3) + val versionToUniqueId = new mutable.HashMap[Long, String]() withDB(remoteDir, conf = enableChangelogCheckpointingConf, - useColumnFamilies = colFamiliesEnabled) { db => + useColumnFamilies = colFamiliesEnabled, + enableStateStoreCheckpointIds = enableStateStoreCheckpointIds, + versionToUniqueId = versionToUniqueId) { db => for (version <- 1 to 30) { - db.load(version - 1) + db.load(version - 1, versionToUniqueId.get(version - 1)) db.put(version.toString, version.toString) db.remove((version - 1).toString) db.commit() @@ -487,16 +997,18 @@ class RocksDBSuite extends AlsoTestWithChangelogCheckpointingEnabled with Shared // Now disable changelog checkpointing in a checkpoint created by a state store // that enable changelog checkpointing. 
val disableChangelogCheckpointingConf = - dbConf.copy(enableChangelogCheckpointing = false, minVersionsToRetain = 20, - minDeltasForSnapshot = 1) + dbConf.copy(enableChangelogCheckpointing = false, minVersionsToRetain = 20, + minDeltasForSnapshot = 1) withDB(remoteDir, conf = disableChangelogCheckpointingConf, - useColumnFamilies = colFamiliesEnabled) { db => + useColumnFamilies = colFamiliesEnabled, + enableStateStoreCheckpointIds = enableStateStoreCheckpointIds, + versionToUniqueId = versionToUniqueId) { db => for (version <- 1 to 30) { - db.load(version) + db.load(version, versionToUniqueId.get(version)) assert(db.iterator().map(toStr).toSet === Set((version.toString, version.toString))) } for (version <- 31 to 60) { - db.load(version - 1) + db.load(version - 1, versionToUniqueId.get(version - 1)) db.put(version.toString, version.toString) db.remove((version - 1).toString) db.commit() @@ -504,7 +1016,7 @@ class RocksDBSuite extends AlsoTestWithChangelogCheckpointingEnabled with Shared assert(changelogVersionsPresent(remoteDir) === (1 to 30)) assert(snapshotVersionsPresent(remoteDir) === (31 to 60)) for (version <- 1 to 60) { - db.load(version, readOnly = true) + db.load(version, versionToUniqueId.get(version), readOnly = true) assert(db.iterator().map(toStr).toSet === Set((version.toString, version.toString))) } // Check that snapshots and changelogs get purged correctly. @@ -513,7 +1025,7 @@ class RocksDBSuite extends AlsoTestWithChangelogCheckpointingEnabled with Shared assert(changelogVersionsPresent(remoteDir) === Seq.empty) // Verify the content of retained versions. 
for (version <- 41 to 60) { - db.load(version, readOnly = true) + db.load(version, versionToUniqueId.get(version), readOnly = true) assert(db.iterator().map(toStr).toSet === Set((version.toString, version.toString))) } } @@ -535,152 +1047,172 @@ class RocksDBSuite extends AlsoTestWithChangelogCheckpointingEnabled with Shared } } - testWithColumnFamilies(s"RocksDB: get, put, iterator, commit, load", - TestWithBothChangelogCheckpointingEnabledAndDisabled) { colFamiliesEnabled => - def testOps(compactOnCommit: Boolean): Unit = { - val remoteDir = Utils.createTempDir().toString - new File(remoteDir).delete() // to make sure that the directory gets created + testWithStateStoreCheckpointIdsAndColumnFamilies(s"RocksDB: get, put, iterator, commit, load", + TestWithBothChangelogCheckpointingEnabledAndDisabled) { + case (enableStateStoreCheckpointIds, colFamiliesEnabled) => + def testOps(compactOnCommit: Boolean): Unit = { + val remoteDir = Utils.createTempDir().toString + new File(remoteDir).delete() // to make sure that the directory gets created - val conf = RocksDBConf().copy(compactOnCommit = compactOnCommit) - withDB(remoteDir, conf = conf, useColumnFamilies = colFamiliesEnabled) { db => - assert(db.get("a") === null) - assert(iterator(db).isEmpty) + val conf = RocksDBConf().copy(compactOnCommit = compactOnCommit) + val versionToUniqueId = new mutable.HashMap[Long, String]() + withDB(remoteDir, conf = conf, useColumnFamilies = colFamiliesEnabled, + enableStateStoreCheckpointIds = enableStateStoreCheckpointIds, + versionToUniqueId = versionToUniqueId) { db => + assert(db.get("a") === null) + assert(iterator(db).isEmpty) - db.put("a", "1") - assert(toStr(db.get("a")) === "1") - db.commit() - } + db.put("a", "1") + assert(toStr(db.get("a")) === "1") + db.commit() + } - withDB(remoteDir, conf = conf, version = 0, useColumnFamilies = colFamiliesEnabled) { db => - // version 0 can be loaded again - assert(toStr(db.get("a")) === null) - assert(iterator(db).isEmpty) - } + 
withDB(remoteDir, conf = conf, version = 0, useColumnFamilies = colFamiliesEnabled, + enableStateStoreCheckpointIds = enableStateStoreCheckpointIds, + versionToUniqueId = versionToUniqueId) { db => + // version 0 can be loaded again + assert(toStr(db.get("a")) === null) + assert(iterator(db).isEmpty) + } - withDB(remoteDir, conf = conf, version = 1, useColumnFamilies = colFamiliesEnabled) { db => - // version 1 data recovered correctly - assert(toStr(db.get("a")) === "1") - assert(db.iterator().map(toStr).toSet === Set(("a", "1"))) + withDB(remoteDir, conf = conf, version = 1, useColumnFamilies = colFamiliesEnabled, + enableStateStoreCheckpointIds = enableStateStoreCheckpointIds, + versionToUniqueId = versionToUniqueId) { db => + // version 1 data recovered correctly + assert(toStr(db.get("a")) === "1") + assert(db.iterator().map(toStr).toSet === Set(("a", "1"))) - // make changes but do not commit version 2 - db.put("b", "2") - assert(toStr(db.get("b")) === "2") - assert(db.iterator().map(toStr).toSet === Set(("a", "1"), ("b", "2"))) - } + // make changes but do not commit version 2 + db.put("b", "2") + assert(toStr(db.get("b")) === "2") + assert(db.iterator().map(toStr).toSet === Set(("a", "1"), ("b", "2"))) + } - withDB(remoteDir, conf = conf, version = 1, useColumnFamilies = colFamiliesEnabled) { db => - // version 1 data not changed - assert(toStr(db.get("a")) === "1") - assert(db.get("b") === null) - assert(db.iterator().map(toStr).toSet === Set(("a", "1"))) + withDB(remoteDir, conf = conf, version = 1, useColumnFamilies = colFamiliesEnabled, + enableStateStoreCheckpointIds = enableStateStoreCheckpointIds, + versionToUniqueId = versionToUniqueId) { db => + // version 1 data not changed + assert(toStr(db.get("a")) === "1") + assert(db.get("b") === null) + assert(db.iterator().map(toStr).toSet === Set(("a", "1"))) - // commit version 2 - db.put("b", "2") - assert(toStr(db.get("b")) === "2") - db.commit() - assert(db.iterator().map(toStr).toSet === Set(("a", 
"1"), ("b", "2"))) - } + // commit version 2 + db.put("b", "2") + assert(toStr(db.get("b")) === "2") + db.commit() + assert(db.iterator().map(toStr).toSet === Set(("a", "1"), ("b", "2"))) + } - withDB(remoteDir, conf = conf, version = 1, useColumnFamilies = colFamiliesEnabled) { db => - // version 1 data not changed - assert(toStr(db.get("a")) === "1") - assert(db.get("b") === null) - } + withDB(remoteDir, conf = conf, version = 1, useColumnFamilies = colFamiliesEnabled, + enableStateStoreCheckpointIds = enableStateStoreCheckpointIds, + versionToUniqueId = versionToUniqueId) { db => + // version 1 data not changed + assert(toStr(db.get("a")) === "1") + assert(db.get("b") === null) + } - withDB(remoteDir, conf = conf, version = 2, useColumnFamilies = colFamiliesEnabled) { db => - // version 2 can be loaded again - assert(toStr(db.get("b")) === "2") - assert(db.iterator().map(toStr).toSet === Set(("a", "1"), ("b", "2"))) + withDB(remoteDir, conf = conf, version = 2, useColumnFamilies = colFamiliesEnabled, + enableStateStoreCheckpointIds = enableStateStoreCheckpointIds, + versionToUniqueId = versionToUniqueId) { db => + // version 2 can be loaded again + assert(toStr(db.get("b")) === "2") + assert(db.iterator().map(toStr).toSet === Set(("a", "1"), ("b", "2"))) - db.load(1) - assert(toStr(db.get("b")) === null) - assert(db.iterator().map(toStr).toSet === Set(("a", "1"))) + db.load(1, versionToUniqueId.get(1)) + assert(toStr(db.get("b")) === null) + assert(db.iterator().map(toStr).toSet === Set(("a", "1"))) + } } - } - for (compactOnCommit <- Seq(false, true)) { - withClue(s"compactOnCommit = $compactOnCommit") { - testOps(compactOnCommit) + for (compactOnCommit <- Seq(false, true)) { + withClue(s"compactOnCommit = $compactOnCommit") { + testOps(compactOnCommit) + } } - } } - testWithColumnFamilies(s"RocksDB: handle commit failures and aborts", - TestWithBothChangelogCheckpointingEnabledAndDisabled) { colFamiliesEnabled => - val hadoopConf = new Configuration() - 
hadoopConf.set( - SQLConf.STREAMING_CHECKPOINT_FILE_MANAGER_CLASS.parent.key, - classOf[CreateAtomicTestManager].getName) - val remoteDir = Utils.createTempDir().getAbsolutePath - withDB(remoteDir, hadoopConf = hadoopConf, useColumnFamilies = colFamiliesEnabled) { db => - // Disable failure of output stream and generate versions - CreateAtomicTestManager.shouldFailInCreateAtomic = false - for (version <- 1 to 10) { - db.load(version - 1) - db.put(version.toString, version.toString) // update "1" -> "1", "2" -> "2", ... - db.commit() - } - val version10Data = (1L to 10).map(_.toString).map(x => x -> x).toSet - - // Fail commit for next version and verify that reloading resets the files - CreateAtomicTestManager.shouldFailInCreateAtomic = true - db.load(10) - db.put("11", "11") - intercept[IOException] { - quietly { + testWithStateStoreCheckpointIdsAndColumnFamilies(s"RocksDB: handle commit failures and aborts", + TestWithBothChangelogCheckpointingEnabledAndDisabled) { + case (enableStateStoreCheckpointIds, colFamiliesEnabled) => + val hadoopConf = new Configuration() + hadoopConf.set( + SQLConf.STREAMING_CHECKPOINT_FILE_MANAGER_CLASS.parent.key, + classOf[CreateAtomicTestManager].getName) + val remoteDir = Utils.createTempDir().getAbsolutePath + withDB(remoteDir, hadoopConf = hadoopConf, useColumnFamilies = colFamiliesEnabled, + enableStateStoreCheckpointIds = enableStateStoreCheckpointIds) { db => + // Disable failure of output stream and generate versions + CreateAtomicTestManager.shouldFailInCreateAtomic = false + for (version <- 1 to 10) { + db.load(version - 1) + db.put(version.toString, version.toString) // update "1" -> "1", "2" -> "2", ... 
db.commit() } - } - assert(db.load(10, readOnly = true).iterator().map(toStr).toSet === version10Data) - CreateAtomicTestManager.shouldFailInCreateAtomic = false + val version10Data = (1L to 10).map(_.toString).map(x => x -> x).toSet + + // Fail commit for next version and verify that reloading resets the files + CreateAtomicTestManager.shouldFailInCreateAtomic = true + db.load(10) + db.put("11", "11") + intercept[IOException] { + quietly { + db.commit() + } + } + assert(db.load(10, readOnly = true).iterator().map(toStr).toSet === version10Data) + CreateAtomicTestManager.shouldFailInCreateAtomic = false - // Abort commit for next version and verify that reloading resets the files - db.load(10) - db.put("11", "11") - db.rollback() - assert(db.load(10, readOnly = true).iterator().map(toStr).toSet === version10Data) - } + // Abort commit for next version and verify that reloading resets the files + db.load(10) + db.put("11", "11") + db.rollback() + assert(db.load(10, readOnly = true).iterator().map(toStr).toSet === version10Data) + } } - testWithColumnFamilies("RocksDB close tests - close before doMaintenance", - TestWithBothChangelogCheckpointingEnabledAndDisabled) { colFamiliesEnabled => - val remoteDir = Utils.createTempDir().toString - val conf = dbConf.copy(minDeltasForSnapshot = 1, compactOnCommit = false) - new File(remoteDir).delete() // to make sure that the directory gets created - withDB(remoteDir, conf = conf, - useColumnFamilies = colFamiliesEnabled) { db => - db.load(0) - db.put("foo", "bar") - db.commit() - // call close first and maintenance can be still be invoked in the context of the - // maintenance task's thread pool - db.close() - db.doMaintenance() - } + testWithStateStoreCheckpointIdsAndColumnFamilies("RocksDB close tests - " + + "close before doMaintenance", + TestWithBothChangelogCheckpointingEnabledAndDisabled) { + case (enableStateStoreCheckpointIds, colFamiliesEnabled) => + val remoteDir = Utils.createTempDir().toString + val conf = 
dbConf.copy(minDeltasForSnapshot = 1, compactOnCommit = false) + new File(remoteDir).delete() // to make sure that the directory gets created + withDB(remoteDir, conf = conf, useColumnFamilies = colFamiliesEnabled, + enableStateStoreCheckpointIds = enableStateStoreCheckpointIds) { db => + db.load(0) + db.put("foo", "bar") + db.commit() + // call close first and maintenance can be still be invoked in the context of the + // maintenance task's thread pool + db.close() + db.doMaintenance() + } } - testWithColumnFamilies("RocksDB close tests - close after doMaintenance", - TestWithBothChangelogCheckpointingEnabledAndDisabled) { colFamiliesEnabled => - val remoteDir = Utils.createTempDir().toString - val conf = dbConf.copy(minDeltasForSnapshot = 1, compactOnCommit = false) - new File(remoteDir).delete() // to make sure that the directory gets created - withDB(remoteDir, conf = conf, - useColumnFamilies = colFamiliesEnabled) { db => - db.load(0) - db.put("foo", "bar") - db.commit() - // maintenance can be invoked in the context of the maintenance task's thread pool - // and close is invoked after that - db.doMaintenance() - db.close() - } + testWithStateStoreCheckpointIdsAndColumnFamilies("RocksDB close tests - " + + "close after doMaintenance", + TestWithBothChangelogCheckpointingEnabledAndDisabled) { + case (enableStateStoreCheckpointIds, colFamiliesEnabled) => + val remoteDir = Utils.createTempDir().toString + val conf = dbConf.copy(minDeltasForSnapshot = 1, compactOnCommit = false) + new File(remoteDir).delete() // to make sure that the directory gets created + withDB(remoteDir, conf = conf, useColumnFamilies = colFamiliesEnabled, + enableStateStoreCheckpointIds = enableStateStoreCheckpointIds) { db => + db.load(0) + db.put("foo", "bar") + db.commit() + // maintenance can be invoked in the context of the maintenance task's thread pool + // and close is invoked after that + db.doMaintenance() + db.close() + } } testWithChangelogCheckpointingEnabled("RocksDB: 
Unsupported Operations" + " with Changelog Checkpointing") { val dfsRootDir = new File(Utils.createTempDir().getAbsolutePath + "/state/1/1") val fileManager = new RocksDBFileManager( - dfsRootDir.getAbsolutePath, Utils.createTempDir(), new Configuration) + dfsRootDir.getAbsolutePath, Utils.createTempDir(), hadoopConf) val changelogWriter = fileManager.getChangeLogWriter(1) assert(changelogWriter.version === 1) @@ -729,7 +1261,7 @@ class RocksDBSuite extends AlsoTestWithChangelogCheckpointingEnabled with Shared testWithChangelogCheckpointingEnabled("RocksDBFileManager: read and write changelog") { val dfsRootDir = new File(Utils.createTempDir().getAbsolutePath + "/state/1/1") val fileManager = new RocksDBFileManager( - dfsRootDir.getAbsolutePath, Utils.createTempDir(), new Configuration) + dfsRootDir.getAbsolutePath, Utils.createTempDir(), hadoopConf) val changelogWriter = fileManager.getChangeLogWriter(1) assert(changelogWriter.version === 1) @@ -752,14 +1284,162 @@ class RocksDBSuite extends AlsoTestWithChangelogCheckpointingEnabled with Shared entries.zip(expectedEntries).map{ case (e1, e2) => assert(e1._1 === e2._1 && e1._2 === e2._2 && e1._3 === e2._3) } + + changelogReader.closeIfNeeded() + } + + testWithChangelogCheckpointingEnabled("RocksDBFileManager: StateStoreChangelogReaderFactory " + + "edge case") { + val dfsRootDir = new File(Utils.createTempDir().getAbsolutePath + "/state/1/1") + val fileManager = new RocksDBFileManager( + dfsRootDir.getAbsolutePath, Utils.createTempDir(), hadoopConf) + + val checkpointUniqueId = Some(java.util.UUID.randomUUID.toString) + val lineage: Array[LineageItem] = Array( + LineageItem(1, java.util.UUID.randomUUID.toString), + LineageItem(2, java.util.UUID.randomUUID.toString), + LineageItem(3, java.util.UUID.randomUUID.toString) + ) + + // Create a v1 writer + val changelogWriterV1 = fileManager.getChangeLogWriter(101) + assert(changelogWriterV1.version === 1) + changelogWriterV1.commit() // v1 with empty content + + val 
changelogReaderV1 = fileManager.getChangelogReader(101) + assert(changelogReaderV1.version === 1) // getChangelogReader should return a v1 reader + changelogReaderV1.closeIfNeeded() + + // Create a v2 writer + val changelogWriterV2 = fileManager.getChangeLogWriter(102, useColumnFamilies = true) + assert(changelogWriterV2.version === 2) + changelogWriterV2.commit() // v2 with empty content + + val changelogReaderV2 = fileManager.getChangelogReader(102) + assert(changelogReaderV2.version === 2) // getChangelogReader should return a v2 reader + changelogReaderV2.closeIfNeeded() + + // Create a v3 writer + val changelogWriterV3 = fileManager.getChangeLogWriter( + 103, useColumnFamilies = false, checkpointUniqueId, Some(lineage)) + assert(changelogWriterV3.version === 3) + changelogWriterV3.commit() // v1 with empty content + + val changelogReaderV3 = fileManager.getChangelogReader( + 103, checkpointUniqueId = checkpointUniqueId) + assert(changelogReaderV3.version === 3) // getChangelogReader should return a v3 reader + assert(changelogReaderV3.lineage sameElements lineage) + changelogReaderV3.closeIfNeeded() + + // Create a v4 writer + val changelogWriterV4 = fileManager.getChangeLogWriter( + 104, useColumnFamilies = true, checkpointUniqueId, Some(lineage)) + assert(changelogWriterV4.version === 4) + changelogWriterV4.commit() // v1 with empty content + + val changelogReaderV4 = fileManager.getChangelogReader( + 104, checkpointUniqueId = checkpointUniqueId) + assert(changelogReaderV4.version === 4) // getChangelogReader should return a v4 reader + assert(changelogReaderV4.lineage sameElements lineage) + changelogReaderV4.closeIfNeeded() + } + + testWithChangelogCheckpointingEnabled("RocksDBFileManager: changelog reader / writer " + + "failure cases") { + val dfsRootDir = new File(Utils.createTempDir().getAbsolutePath + "/state/1/1") + val fileManager = new RocksDBFileManager( + dfsRootDir.getAbsolutePath, Utils.createTempDir(), hadoopConf) + // Failure case 1: reader 
writer version mismatch + // Create a v1 writer + val changelogWriterV1 = fileManager.getChangeLogWriter(101) + assert(changelogWriterV1.version === 1) + + (1 to 5).foreach(i => changelogWriterV1.put(i.toString, i.toString)) + (2 to 4).foreach(j => changelogWriterV1.delete(j.toString)) + + changelogWriterV1.commit() + // Success case, when reading from the same file, a V1 reader should be constructed. + val changelogReaderV1 = fileManager.getChangelogReader(101) + assert(changelogReaderV1.version === 1) + changelogReaderV1.closeIfNeeded() + + // Failure case, force creating a V3 reader. + val dfsChangelogFile = PrivateMethod[Path](Symbol("dfsChangelogFile")) + val codec = PrivateMethod[CompressionCodec](Symbol("codec")) + var changelogFile = fileManager invokePrivate dfsChangelogFile(101L, None) + val compressionCodec = fileManager invokePrivate codec() + val fm = CheckpointFileManager.create(new Path(dfsRootDir.getAbsolutePath), new Configuration) + val e = intercept[AssertionError] { + new StateStoreChangelogReaderV3(fm, changelogFile, compressionCodec) + } + assert(e.getMessage.contains("Changelog version mismatch")) + + changelogFile = fileManager invokePrivate dfsChangelogFile(1L, None) + // Failure case 2: readerFactory throw when reading from ckpt built in future Spark version + // Create a v101 writer + val changelogWriter = new TestStateStoreChangelogWriterV101( + fm, changelogFile, compressionCodec) + assert(changelogWriter.version === 101) + + changelogWriter.commit() + + // Failure case, force creating a V3 reader. 
+ val ex = intercept[SparkException] { + fileManager.getChangelogReader(1) + } + checkError( + ex, + condition = "CANNOT_LOAD_STATE_STORE.INVALID_CHANGE_LOG_READER_VERSION", + parameters = Map("version" -> 101.toString) + ) + assert(ex.getMessage.contains("please upgrade your Spark")) + } + + testWithChangelogCheckpointingEnabled("RocksDBFileManager: read and write changelog " + + "with state checkpoint id enabled") { + val dfsRootDir = new File(Utils.createTempDir().getAbsolutePath + "/state/1/1") + val fileManager = new RocksDBFileManager( + dfsRootDir.getAbsolutePath, Utils.createTempDir(), hadoopConf) + val checkpointUniqueId = Some(java.util.UUID.randomUUID.toString) + val lineage: Array[LineageItem] = Array( + LineageItem(1, java.util.UUID.randomUUID.toString), + LineageItem(2, java.util.UUID.randomUUID.toString), + LineageItem(3, java.util.UUID.randomUUID.toString) + ) + val changelogWriter = fileManager.getChangeLogWriter( + 3, useColumnFamilies = false, checkpointUniqueId, Some(lineage)) + assert(changelogWriter.version === 3) + + (1 to 5).foreach(i => changelogWriter.put(i.toString, i.toString)) + (2 to 4).foreach(j => changelogWriter.delete(j.toString)) + + changelogWriter.commit() + val changelogReader = fileManager.getChangelogReader(3, checkpointUniqueId) + assert(changelogReader.version === 3) + assert(changelogReader.lineage sameElements lineage) + val entries = changelogReader.toSeq + val expectedEntries = (1 to 5).map { i => + (RecordType.PUT_RECORD, i.toString.getBytes, + i.toString.getBytes, StateStore.DEFAULT_COL_FAMILY_NAME) + } ++ (2 to 4).map { j => + (RecordType.DELETE_RECORD, j.toString.getBytes, + null, StateStore.DEFAULT_COL_FAMILY_NAME) + } + + assert(entries.size == expectedEntries.size) + entries.zip(expectedEntries).map{ + case (e1, e2) => assert(e1._1 === e2._1 && e1._2 === e2._2 && e1._3 === e2._3) + } + + changelogReader.closeIfNeeded() } testWithChangelogCheckpointingEnabled( "RocksDBFileManager: read and write v2 changelog with 
default col family") { val dfsRootDir = new File(Utils.createTempDir().getAbsolutePath + "/state/1/1") val fileManager = new RocksDBFileManager( - dfsRootDir.getAbsolutePath, Utils.createTempDir(), new Configuration) - val changelogWriter = fileManager.getChangeLogWriter(1, true) + dfsRootDir.getAbsolutePath, Utils.createTempDir(), hadoopConf) + val changelogWriter = fileManager.getChangeLogWriter(1, useColumnFamilies = true) assert(changelogWriter.version === 2) (1 to 5).foreach { i => changelogWriter.put(i.toString, i.toString) @@ -773,7 +1453,7 @@ class RocksDBSuite extends AlsoTestWithChangelogCheckpointingEnabled with Shared } changelogWriter.commit() - val changelogReader = fileManager.getChangelogReader(1, true) + val changelogReader = fileManager.getChangelogReader(1) assert(changelogReader.version === 2) val entries = changelogReader.toSeq val expectedEntries = (1 to 5).map { i => @@ -788,6 +1468,54 @@ class RocksDBSuite extends AlsoTestWithChangelogCheckpointingEnabled with Shared entries.zip(expectedEntries).map{ case (e1, e2) => assert(e1._1 === e2._1 && e1._2 === e2._2 && e1._3 === e2._3) } + + changelogReader.closeIfNeeded() + } + + testWithChangelogCheckpointingEnabled("RocksDBFileManager: read and write v2 changelog with " + + "default col family and state checkpoint id enabled") { + val dfsRootDir = new File(Utils.createTempDir().getAbsolutePath + "/state/1/1") + val fileManager = new RocksDBFileManager( + dfsRootDir.getAbsolutePath, Utils.createTempDir(), hadoopConf) + val checkpointUniqueId = Some(java.util.UUID.randomUUID.toString) + val lineage: Array[LineageItem] = Array( + LineageItem(1, java.util.UUID.randomUUID.toString), + LineageItem(2, java.util.UUID.randomUUID.toString), + LineageItem(3, java.util.UUID.randomUUID.toString) + ) + val changelogWriter = fileManager.getChangeLogWriter( + 1, useColumnFamilies = true, checkpointUniqueId, Some(lineage)) + assert(changelogWriter.version === 4) + (1 to 5).foreach { i => + 
changelogWriter.put(i.toString, i.toString) + } + (1 to 5).foreach { i => + changelogWriter.merge(i.toString, i.toString) + } + + (2 to 4).foreach { j => + changelogWriter.delete(j.toString) + } + + changelogWriter.commit() + val changelogReader = fileManager.getChangelogReader(1, checkpointUniqueId) + assert(changelogReader.version === 4) + assert(changelogReader.lineage sameElements lineage) + val entries = changelogReader.toSeq + val expectedEntries = (1 to 5).map { i => + (RecordType.PUT_RECORD, i.toString.getBytes, i.toString.getBytes) + } ++ (1 to 5).map { i => + (RecordType.MERGE_RECORD, i.toString.getBytes, i.toString.getBytes) + } ++ (2 to 4).map { j => + (RecordType.DELETE_RECORD, j.toString.getBytes, null) + } + + assert(entries.size == expectedEntries.size) + entries.zip(expectedEntries).map{ + case (e1, e2) => assert(e1._1 === e2._1 && e1._2 === e2._2 && e1._3 === e2._3) + } + + changelogReader.closeIfNeeded() } testWithColumnFamilies("RocksDBFileManager: create init dfs directory with " + @@ -797,7 +1525,7 @@ class RocksDBSuite extends AlsoTestWithChangelogCheckpointingEnabled with Shared try { val verificationDir = Utils.createTempDir().getAbsolutePath val fileManager = new RocksDBFileManager( - dfsRootDir.getAbsolutePath, Utils.createTempDir(), new Configuration) + dfsRootDir.getAbsolutePath, Utils.createTempDir(), hadoopConf) // Save a version of empty checkpoint files val cpFiles = Seq() generateFiles(verificationDir, cpFiles) @@ -890,17 +1618,18 @@ class RocksDBSuite extends AlsoTestWithChangelogCheckpointingEnabled with Shared } } - testWithColumnFamilies("RocksDBFileManager: delete orphan files", - TestWithBothChangelogCheckpointingEnabledAndDisabled) { colFamiliesEnabled => + testWithStateStoreCheckpointIdsAndColumnFamilies("RocksDBFileManager: delete orphan files", + TestWithBothChangelogCheckpointingEnabledAndDisabled) { + case (enableStateStoreCheckpointIds, colFamiliesEnabled) => withTempDir { dir => val dfsRootDir = dir.getAbsolutePath // 
Use 2 file managers here to emulate concurrent execution // that checkpoint the same version of state val fileManager = new RocksDBFileManager( - dfsRootDir, Utils.createTempDir(), new Configuration) + dfsRootDir, Utils.createTempDir(), hadoopConf) val rocksDBFileMapping = new RocksDBFileMapping() val fileManager_ = new RocksDBFileManager( - dfsRootDir, Utils.createTempDir(), new Configuration) + dfsRootDir, Utils.createTempDir(), hadoopConf) val sstDir = s"$dfsRootDir/SSTs" def numRemoteSSTFiles: Int = listFiles(sstDir).length val logDir = s"$dfsRootDir/logs" @@ -915,9 +1644,12 @@ class RocksDBSuite extends AlsoTestWithChangelogCheckpointingEnabled with Shared "archive/00001.log" -> 1000, "archive/00002.log" -> 2000 ) - + val uuid = enableStateStoreCheckpointIds match { + case false => None + case true => Some(UUID.randomUUID().toString) + } saveCheckpointFiles(fileManager, cpFiles1, version = 1, - numKeys = 101, rocksDBFileMapping) + numKeys = 101, rocksDBFileMapping, uuid) assert(fileManager.getLatestVersion() === 1) assert(numRemoteSSTFiles == 2) // 2 sst files copied assert(numRemoteLogFiles == 2) @@ -932,7 +1664,7 @@ class RocksDBSuite extends AlsoTestWithChangelogCheckpointingEnabled with Shared "archive/00003.log" -> 2000 ) saveCheckpointFiles(fileManager_, cpFiles1_, version = 1, - numKeys = 101, new RocksDBFileMapping()) + numKeys = 101, new RocksDBFileMapping(), uuid) assert(fileManager_.getLatestVersion() === 1) assert(numRemoteSSTFiles == 4) assert(numRemoteLogFiles == 4) @@ -952,7 +1684,7 @@ class RocksDBSuite extends AlsoTestWithChangelogCheckpointingEnabled with Shared "archive/00005.log" -> 2000 ) saveCheckpointFiles(fileManager_, cpFiles2, - version = 2, numKeys = 121, new RocksDBFileMapping()) + version = 2, numKeys = 121, new RocksDBFileMapping(), uuid) fileManager_.deleteOldVersions(1) assert(numRemoteSSTFiles <= 4) // delete files recorded in 1.zip assert(numRemoteLogFiles <= 5) // delete files recorded in 1.zip and orphan 00001.log @@ -967,7 
+1699,7 @@ class RocksDBSuite extends AlsoTestWithChangelogCheckpointingEnabled with Shared "archive/00007.log" -> 2000 ) saveCheckpointFiles(fileManager_, cpFiles3, - version = 3, numKeys = 131, new RocksDBFileMapping()) + version = 3, numKeys = 131, new RocksDBFileMapping(), uuid) assert(fileManager_.getLatestVersion() === 3) fileManager_.deleteOldVersions(1) assert(numRemoteSSTFiles == 1) @@ -975,13 +1707,14 @@ class RocksDBSuite extends AlsoTestWithChangelogCheckpointingEnabled with Shared } } - testWithColumnFamilies("RocksDBFileManager: don't delete orphan files " + - s"when there is only 1 version", - TestWithBothChangelogCheckpointingEnabledAndDisabled) { colFamiliesEnabled => + testWithStateStoreCheckpointIdsAndColumnFamilies("RocksDBFileManager: don't delete " + + s"orphan files when there is only 1 version", + TestWithBothChangelogCheckpointingEnabledAndDisabled) { + case (enableStateStoreCheckpointIds, colFamiliesEnabled) => withTempDir { dir => val dfsRootDir = dir.getAbsolutePath val fileManager = new RocksDBFileManager( - dfsRootDir, Utils.createTempDir(), new Configuration) + dfsRootDir, Utils.createTempDir(), hadoopConf) (new File(dfsRootDir, "SSTs")).mkdir() (new File(dfsRootDir, "logs")).mkdir() @@ -1005,8 +1738,14 @@ class RocksDBSuite extends AlsoTestWithChangelogCheckpointingEnabled with Shared "archive/00002.log" -> 2000 ) val rocksDBFileMapping = new RocksDBFileMapping() - saveCheckpointFiles(fileManager, cpFiles1, - version = 1, numKeys = 101, rocksDBFileMapping) + val uuid = if (enableStateStoreCheckpointIds) { + Some(UUID.randomUUID().toString) + } else { + None + } + + saveCheckpointFiles( + fileManager, cpFiles1, version = 1, numKeys = 101, rocksDBFileMapping, uuid) fileManager.deleteOldVersions(1) // Should not delete orphan files even when they are older than all existing files // when there is only 1 version. 
@@ -1023,8 +1762,8 @@ class RocksDBSuite extends AlsoTestWithChangelogCheckpointingEnabled with Shared "archive/00003.log" -> 1000, "archive/00004.log" -> 2000 ) - saveCheckpointFiles(fileManager, cpFiles2, - version = 2, numKeys = 101, rocksDBFileMapping) + saveCheckpointFiles( + fileManager, cpFiles2, version = 2, numKeys = 101, rocksDBFileMapping, uuid) assert(numRemoteSSTFiles == 5) assert(numRemoteLogFiles == 5) fileManager.deleteOldVersions(1) @@ -1034,122 +1773,131 @@ class RocksDBSuite extends AlsoTestWithChangelogCheckpointingEnabled with Shared } } - testWithColumnFamilies("RocksDBFileManager: upload only new immutable files", - TestWithBothChangelogCheckpointingEnabledAndDisabled) { colFamiliesEnabled => - withTempDir { dir => - val dfsRootDir = dir.getAbsolutePath - val verificationDir = Utils.createTempDir().getAbsolutePath // local dir to load checkpoints - val fileManager = new RocksDBFileManager( - dfsRootDir, Utils.createTempDir(), new Configuration) - val sstDir = s"$dfsRootDir/SSTs" - def numRemoteSSTFiles: Int = listFiles(sstDir).length - val logDir = s"$dfsRootDir/logs" - def numRemoteLogFiles: Int = listFiles(logDir).length - val fileMapping = new RocksDBFileMapping - - // Verify behavior before any saved checkpoints - assert(fileManager.getLatestVersion() === 0) - - // Try to load incorrect versions - intercept[FileNotFoundException] { - fileManager.loadCheckpointFromDfs(1, Utils.createTempDir(), fileMapping) - } - - // Save a version of checkpoint files - val cpFiles1 = Seq( - "sst-file1.sst" -> 10, - "sst-file2.sst" -> 20, - "other-file1" -> 100, - "other-file2" -> 200, - "archive/00001.log" -> 1000, - "archive/00002.log" -> 2000 - ) - saveCheckpointFiles(fileManager, cpFiles1, - version = 1, numKeys = 101, fileMapping) - assert(fileManager.getLatestVersion() === 1) - assert(numRemoteSSTFiles == 2) // 2 sst files copied - assert(numRemoteLogFiles == 2) // 2 log files copied - - // Load back the checkpoint files into another local dir with 
existing files and verify - generateFiles(verificationDir, Seq( - "sst-file1.sst" -> 11, // files with same name but different sizes, should get overwritten - "other-file1" -> 101, - "archive/00001.log" -> 1001, - "random-sst-file.sst" -> 100, // unnecessary files, should get deleted - "random-other-file" -> 9, - "00005.log" -> 101, - "archive/00007.log" -> 101 - )) + testWithStateStoreCheckpointIdsAndColumnFamilies("RocksDBFileManager: upload only " + + "new immutable files", + TestWithBothChangelogCheckpointingEnabledAndDisabled) { + case (enableStateStoreCheckpointIds, colFamiliesEnabled) => + withTempDir { dir => + val dfsRootDir = dir.getAbsolutePath + val verificationDir = Utils.createTempDir().getAbsolutePath // local dir to load checkpoints + val fileManager = new RocksDBFileManager( + dfsRootDir, Utils.createTempDir(), hadoopConf) + val sstDir = s"$dfsRootDir/SSTs" + def numRemoteSSTFiles: Int = listFiles(sstDir).length + val logDir = s"$dfsRootDir/logs" + def numRemoteLogFiles: Int = listFiles(logDir).length + val fileMapping = new RocksDBFileMapping + + // Verify behavior before any saved checkpoints + assert(fileManager.getLatestVersion() === 0) + + // Try to load incorrect versions + intercept[FileNotFoundException] { + fileManager.loadCheckpointFromDfs(1, Utils.createTempDir(), fileMapping) + } - // as we are loading version 1 again, the previously committed 1,zip and - // SST files would not be reused. - loadAndVerifyCheckpointFiles(fileManager, verificationDir, - version = 1, cpFiles1, 101, fileMapping) + // Save a version of checkpoint files + val cpFiles1 = Seq( + "sst-file1.sst" -> 10, + "sst-file2.sst" -> 20, + "other-file1" -> 100, + "other-file2" -> 200, + "archive/00001.log" -> 1000, + "archive/00002.log" -> 2000 + ) - // Save SAME version again with different checkpoint files and load back again to verify - // whether files were overwritten. 
- val cpFiles1_ = Seq( - "sst-file1.sst" -> 10, // same SST file as before, but will be uploaded again - "sst-file2.sst" -> 25, // new SST file with same name as before, but different length - "sst-file3.sst" -> 30, // new SST file - "other-file1" -> 100, // same non-SST file as before, should not get copied - "other-file2" -> 210, // new non-SST file with same name as before, but different length - "other-file3" -> 300, // new non-SST file - "archive/00001.log" -> 1000, // same log file as before, this should get reused - "archive/00002.log" -> 2500, // new log file with same name as before, but different length - "archive/00003.log" -> 3000 // new log file - ) + val uuid = if (enableStateStoreCheckpointIds) { + Some(UUID.randomUUID().toString) + } else { + None + } - // upload version 1 again, new checkpoint will be created and SST files from - // previously committed version 1 will not be reused. - saveCheckpointFiles(fileManager, cpFiles1_, - version = 1, numKeys = 1001, fileMapping) - assert(numRemoteSSTFiles === 5, "shouldn't reuse old version 1 SST files" + - " while uploading version 1 again") // 2 old + 3 new SST files - assert(numRemoteLogFiles === 5, "shouldn't reuse old version 1 log files" + - " while uploading version 1 again") // 2 old + 3 new log files + saveCheckpointFiles( + fileManager, cpFiles1, version = 1, numKeys = 101, fileMapping, uuid) + assert(fileManager.getLatestVersion() === 1) + assert(numRemoteSSTFiles == 2) // 2 sst files copied + assert(numRemoteLogFiles == 2) // 2 log files copied + + // Load back the checkpoint files into another local dir with existing files and verify + generateFiles(verificationDir, Seq( + "sst-file1.sst" -> 11, // files with same name but different sizes, should get overwritten + "other-file1" -> 101, + "archive/00001.log" -> 1001, + "random-sst-file.sst" -> 100, // unnecessary files, should get deleted + "random-other-file" -> 9, + "00005.log" -> 101, + "archive/00007.log" -> 101 + )) + + // as we are 
loading version 1 again, the previously committed 1.zip and + // SST files would not be reused. + loadAndVerifyCheckpointFiles( + fileManager, verificationDir, version = 1, cpFiles1, 101, fileMapping, uuid) + + // Save SAME version again with different checkpoint files and load back again to verify + // whether files were overwritten. + val cpFiles1_ = Seq( + "sst-file1.sst" -> 10, // same SST file as before, but will be uploaded again + "sst-file2.sst" -> 25, // new SST file with same name as before, but different length + "sst-file3.sst" -> 30, // new SST file + "other-file1" -> 100, // same non-SST file as before, should not get copied + "other-file2" -> 210, // new non-SST file with same name as before, but different length + "other-file3" -> 300, // new non-SST file + "archive/00001.log" -> 1000, // same log file as before, this should get reused + "archive/00002.log" -> 2500, // new log file with same name but different length + "archive/00003.log" -> 3000 // new log file + ) - // verify checkpoint state is correct - loadAndVerifyCheckpointFiles(fileManager, verificationDir, - version = 1, cpFiles1_, 1001, fileMapping) + // upload version 1 again, new checkpoint will be created and SST files from + // previously committed version 1 will not be reused. 
+ saveCheckpointFiles(fileManager, cpFiles1_, + version = 1, numKeys = 1001, fileMapping, uuid) + assert(numRemoteSSTFiles === 5, "shouldn't reuse old version 1 SST files" + + " while uploading version 1 again") // 2 old + 3 new SST files + assert(numRemoteLogFiles === 5, "shouldn't reuse old version 1 log files" + + " while uploading version 1 again") // 2 old + 3 new log files - // Save another version and verify - val cpFiles2 = Seq( - "sst-file1.sst" -> 10, // same SST file as version 1, should be reused - "sst-file2.sst" -> 25, // same SST file as version 1, should be reused - "sst-file3.sst" -> 30, // same SST file as version 1, should be reused - "sst-file4.sst" -> 40, // new sst file, should be uploaded - "other-file4" -> 400, - "archive/00004.log" -> 4000 - ) + // verify checkpoint state is correct + loadAndVerifyCheckpointFiles(fileManager, verificationDir, + version = 1, cpFiles1_, 1001, fileMapping, uuid) + + // Save another version and verify + val cpFiles2 = Seq( + "sst-file1.sst" -> 10, // same SST file as version 1, should be reused + "sst-file2.sst" -> 25, // same SST file as version 1, should be reused + "sst-file3.sst" -> 30, // same SST file as version 1, should be reused + "sst-file4.sst" -> 40, // new sst file, should be uploaded + "other-file4" -> 400, + "archive/00004.log" -> 4000 + ) + saveCheckpointFiles(fileManager, cpFiles2, + version = 2, numKeys = 1501, fileMapping, uuid) + assert(numRemoteSSTFiles === 6) // 1 new file over earlier 5 files + assert(numRemoteLogFiles === 6) // 1 new file over earlier 6 files + loadAndVerifyCheckpointFiles(fileManager, verificationDir, + version = 2, cpFiles2, 1501, fileMapping, uuid) - saveCheckpointFiles(fileManager, cpFiles2, - version = 2, numKeys = 1501, fileMapping) - assert(numRemoteSSTFiles === 6) // 1 new file over earlier 5 files - assert(numRemoteLogFiles === 6) // 1 new file over earlier 6 files - loadAndVerifyCheckpointFiles(fileManager, verificationDir, - version = 2, cpFiles2, 1501, 
fileMapping) + // Loading an older version should work + loadAndVerifyCheckpointFiles( + fileManager, verificationDir, version = 1, cpFiles1_, 1001, fileMapping, uuid) - // Loading an older version should work - loadAndVerifyCheckpointFiles(fileManager, verificationDir, - version = 1, cpFiles1_, 1001, fileMapping) + // Loading incorrect version should fail + intercept[FileNotFoundException] { + loadAndVerifyCheckpointFiles( + fileManager, verificationDir, version = 3, Nil, 1001, fileMapping, uuid) + } - // Loading incorrect version should fail - intercept[FileNotFoundException] { - loadAndVerifyCheckpointFiles(fileManager, verificationDir, - version = 3, Nil, 1001, fileMapping) + // Loading 0 should delete all files + require(verificationDir.list().length > 0) + loadAndVerifyCheckpointFiles( + fileManager, verificationDir, version = 0, Nil, 0, fileMapping, uuid) } - - // Loading 0 should delete all files - require(verificationDir.list().length > 0) - loadAndVerifyCheckpointFiles(fileManager, verificationDir, - version = 0, Nil, 0, fileMapping) - } } - testWithColumnFamilies("RocksDBFileManager: error writing [version].zip " + - s"cancels the output stream", - TestWithBothChangelogCheckpointingEnabledAndDisabled) { colFamiliesEnabled => + testWithStateStoreCheckpointIdsAndColumnFamilies("RocksDBFileManager: error writing " + + s"[version].zip cancels the output stream", + TestWithBothChangelogCheckpointingEnabledAndDisabled) { + case (enableStateStoreCheckpointIds, colFamiliesEnabled) => quietly { val hadoopConf = new Configuration() hadoopConf.set( @@ -1159,30 +1907,40 @@ class RocksDBSuite extends AlsoTestWithChangelogCheckpointingEnabled with Shared val fileManager = new RocksDBFileManager(dfsRootDir, Utils.createTempDir(), hadoopConf) val cpFiles = Seq("sst-file1.sst" -> 10, "sst-file2.sst" -> 20, "other-file1" -> 100) CreateAtomicTestManager.shouldFailInCreateAtomic = true + val uuid = if (enableStateStoreCheckpointIds) { + Some(UUID.randomUUID().toString) + } 
else { + None + } intercept[IOException] { - saveCheckpointFiles(fileManager, cpFiles, - version = 1, numKeys = 101, new RocksDBFileMapping()) + saveCheckpointFiles( + fileManager, cpFiles, version = 1, numKeys = 101, new RocksDBFileMapping(), uuid) } assert(CreateAtomicTestManager.cancelCalledInCreateAtomic) } } - testWithColumnFamilies("disallow concurrent updates to the same RocksDB instance", - TestWithBothChangelogCheckpointingEnabledAndDisabled) { colFamiliesEnabled => + testWithStateStoreCheckpointIdsAndColumnFamilies("disallow concurrent updates to the same " + + "RocksDB instance", + TestWithBothChangelogCheckpointingEnabledAndDisabled) { + case (enableStateStoreCheckpointIds, colFamiliesEnabled) => quietly { + val versionToUniqueId = new mutable.HashMap[Long, String]() withDB( Utils.createTempDir().toString, conf = dbConf.copy(lockAcquireTimeoutMs = 20), - useColumnFamilies = colFamiliesEnabled) { db => + useColumnFamilies = colFamiliesEnabled, + enableStateStoreCheckpointIds = enableStateStoreCheckpointIds, + versionToUniqueId = versionToUniqueId) { db => // DB has been loaded so current thread has already // acquired the lock on the RocksDB instance - db.load(0) // Current thread should be able to load again + db.load(0, versionToUniqueId.get(0)) // Current thread should be able to load again // Another thread should not be able to load while current thread is using it var ex = intercept[SparkException] { ThreadUtils.runInNewThread("concurrent-test-thread-1") { - db.load(0) + db.load(0, versionToUniqueId.get(0)) } } checkError( @@ -1202,15 +1960,15 @@ class RocksDBSuite extends AlsoTestWithChangelogCheckpointingEnabled with Shared // Commit should release the instance allowing other threads to load new version db.commit() ThreadUtils.runInNewThread("concurrent-test-thread-2") { - db.load(1) + db.load(1, versionToUniqueId.get(1)) db.commit() } // Another thread should not be able to load while current thread is using it - db.load(2) + db.load(2, 
versionToUniqueId.get(2)) ex = intercept[SparkException] { ThreadUtils.runInNewThread("concurrent-test-thread-2") { - db.load(2) + db.load(2, versionToUniqueId.get(2)) } } checkError( @@ -1230,7 +1988,7 @@ class RocksDBSuite extends AlsoTestWithChangelogCheckpointingEnabled with Shared // Rollback should release the instance allowing other threads to load new version db.rollback() ThreadUtils.runInNewThread("concurrent-test-thread-3") { - db.load(1) + db.load(1, versionToUniqueId.get(1)) db.commit() } } @@ -1675,27 +2433,33 @@ class RocksDBSuite extends AlsoTestWithChangelogCheckpointingEnabled with Shared } } - test("time travel - validate successful RocksDB load") { + testWithStateStoreCheckpointIds("time travel - " + + "validate successful RocksDB load") { enableStateStoreCheckpointIds => val remoteDir = Utils.createTempDir().toString val conf = dbConf.copy(minDeltasForSnapshot = 1, compactOnCommit = false) new File(remoteDir).delete() // to make sure that the directory gets created - withDB(remoteDir, conf = conf) { db => + val versionToUniqueId = new mutable.HashMap[Long, String]() + withDB(remoteDir, conf = conf, enableStateStoreCheckpointIds = enableStateStoreCheckpointIds, + versionToUniqueId = versionToUniqueId) { db => for (version <- 0 to 1) { - db.load(version) + db.load(version, versionToUniqueId.get(version)) db.put(version.toString, version.toString) db.commit() } // upload snapshot 2.zip db.doMaintenance() for (version <- Seq(2)) { - db.load(version) + db.load(version, versionToUniqueId.get(version)) db.put(version.toString, version.toString) db.commit() } // upload snapshot 3.zip db.doMaintenance() // simulate db in another executor that override the zip file - withDB(remoteDir, conf = conf) { db1 => + // In checkpoint V2, reusing the same versionToUniqueId to simulate when two executors + // are scheduled with the same uniqueId in the same microbatch + withDB(remoteDir, conf = conf, enableStateStoreCheckpointIds = enableStateStoreCheckpointIds, + 
versionToUniqueId = versionToUniqueId) { db1 => for (version <- 0 to 1) { db1.load(version) db1.put(version.toString, version.toString) @@ -1703,41 +2467,48 @@ class RocksDBSuite extends AlsoTestWithChangelogCheckpointingEnabled with Shared } db1.doMaintenance() } - db.load(2) + db.load(2, versionToUniqueId.get(2)) for (version <- Seq(2)) { - db.load(version) + db.load(version, versionToUniqueId.get(version)) db.put(version.toString, version.toString) db.commit() } // upload snapshot 3.zip db.doMaintenance() // rollback to version 2 - db.load(2) + db.load(2, versionToUniqueId.get(2)) } } - test("time travel 2 - validate successful RocksDB load") { + testWithStateStoreCheckpointIds("time travel 2 - " + + "validate successful RocksDB load") { enableStateStoreCheckpointIds => Seq(1, 2).map(minDeltasForSnapshot => { val remoteDir = Utils.createTempDir().toString val conf = dbConf.copy(minDeltasForSnapshot = minDeltasForSnapshot, compactOnCommit = false) new File(remoteDir).delete() // to make sure that the directory gets created - withDB(remoteDir, conf = conf) { db => + val versionToUniqueId = new mutable.HashMap[Long, String]() + withDB(remoteDir, conf = conf, enableStateStoreCheckpointIds = enableStateStoreCheckpointIds, + versionToUniqueId = versionToUniqueId) { db => for (version <- 0 to 1) { - db.load(version) + db.load(version, versionToUniqueId.get(version)) db.put(version.toString, version.toString) db.commit() } // upload snapshot 2.zip db.doMaintenance() for (version <- 2 to 3) { - db.load(version) + db.load(version, versionToUniqueId.get(version)) db.put(version.toString, version.toString) db.commit() } - db.load(0) + db.load(0, versionToUniqueId.get(0)) // simulate db in another executor that override the zip file - withDB(remoteDir, conf = conf) { db1 => + // In checkpoint V2, reusing the same versionToUniqueId to simulate when two executors + // are scheduled with the same uniqueId in the same microbatch + withDB(remoteDir, conf = conf, + 
enableStateStoreCheckpointIds = enableStateStoreCheckpointIds, + versionToUniqueId = versionToUniqueId) { db1 => for (version <- 0 to 1) { db1.load(version) db1.put(version.toString, version.toString) @@ -1746,7 +2517,7 @@ class RocksDBSuite extends AlsoTestWithChangelogCheckpointingEnabled with Shared db1.doMaintenance() } for (version <- 2 to 3) { - db.load(version) + db.load(version, versionToUniqueId.get(version)) db.put(version.toString, version.toString) db.commit() } @@ -1758,20 +2529,23 @@ class RocksDBSuite extends AlsoTestWithChangelogCheckpointingEnabled with Shared }) } - test("time travel 3 - validate successful RocksDB load") { + testWithStateStoreCheckpointIds("time travel 3 - validate" + + " successful RocksDB load") { enableStateStoreCheckpointIds => val remoteDir = Utils.createTempDir().toString val conf = dbConf.copy(minDeltasForSnapshot = 0, compactOnCommit = false) new File(remoteDir).delete() // to make sure that the directory gets created - withDB(remoteDir, conf = conf) { db => + val versionToUniqueId = new mutable.HashMap[Long, String]() + withDB(remoteDir, conf = conf, enableStateStoreCheckpointIds = enableStateStoreCheckpointIds, + versionToUniqueId = versionToUniqueId) { db => for (version <- 0 to 2) { - db.load(version) + db.load(version, versionToUniqueId.get(version)) db.put(version.toString, version.toString) db.commit() } // upload snapshot 2.zip db.doMaintenance() for (version <- 1 to 3) { - db.load(version) + db.load(version, versionToUniqueId.get(version)) db.put(version.toString, version.toString) db.commit() } @@ -1783,20 +2557,22 @@ class RocksDBSuite extends AlsoTestWithChangelogCheckpointingEnabled with Shared } } - testWithChangelogCheckpointingEnabled("time travel 4 -" + - " validate successful RocksDB load when metadata file is overwritten") { + testWithStateStoreCheckpointIdsAndChangelogEnabled("time travel 4 - validate successful" + + " RocksDB load when metadata file is overwritten") { enableStateStoreCheckpointIds => 
val remoteDir = Utils.createTempDir().toString val conf = dbConf.copy(minDeltasForSnapshot = 2, compactOnCommit = false) new File(remoteDir).delete() // to make sure that the directory gets created - withDB(remoteDir, conf = conf) { db => + val versionToUniqueId = new mutable.HashMap[Long, String]() + withDB(remoteDir, conf = conf, enableStateStoreCheckpointIds = enableStateStoreCheckpointIds, + versionToUniqueId = versionToUniqueId) { db => for (version <- 0 to 1) { - db.load(version) + db.load(version, versionToUniqueId.get(version)) db.put(version.toString, version.toString) db.commit() } // load previous version, and recreate the snapshot - db.load(1) + db.load(1, versionToUniqueId.get(1)) db.put("3", "3") // upload any latest snapshots so far @@ -1811,8 +2587,8 @@ class RocksDBSuite extends AlsoTestWithChangelogCheckpointingEnabled with Shared } } - testWithChangelogCheckpointingEnabled("time travel 5 -" + - "validate successful RocksDB load when metadata file is not overwritten") { + testWithStateStoreCheckpointIdsAndChangelogEnabled("time travel 5 - validate successful " + + "RocksDB load when metadata file is not overwritten") { enableStateStoreCheckpointIds => val fmClass = "org.apache.spark.sql.execution.streaming.state." 
+ "NoOverwriteFileSystemBasedCheckpointFileManager" Seq(Some(fmClass), None).foreach { fm => @@ -1822,13 +2598,16 @@ class RocksDBSuite extends AlsoTestWithChangelogCheckpointingEnabled with Shared fm.foreach(value => hadoopConf.set(STREAMING_CHECKPOINT_FILE_MANAGER_CLASS.parent.key, value)) val remoteDir = dir.getCanonicalPath - withDB(remoteDir, conf = conf, hadoopConf = hadoopConf) { db => - db.load(0) + val versionToUniqueId = new mutable.HashMap[Long, String]() + withDB(remoteDir, conf = conf, hadoopConf = hadoopConf, + enableStateStoreCheckpointIds = enableStateStoreCheckpointIds, + versionToUniqueId = versionToUniqueId) { db => + db.load(0, versionToUniqueId.get(0)) db.put("a", "1") db.commit() // load previous version, will recreate snapshot on commit - db.load(0) + db.load(0, versionToUniqueId.get(0)) db.put("a", "1") // upload version 1 snapshot created previously @@ -1853,14 +2632,17 @@ class RocksDBSuite extends AlsoTestWithChangelogCheckpointingEnabled with Shared } } - testWithChangelogCheckpointingEnabled("reloading the same version") { + testWithStateStoreCheckpointIdsAndChangelogEnabled("reloading the " + + "same version") { enableStateStoreCheckpointIds => // Keep executing the same batch for two or more times. Some queries with ForEachBatch // will cause this behavior. 
// The test was accidentally fixed by SPARK-48586 (https://github.com/apache/spark/pull/47130) val remoteDir = Utils.createTempDir().toString val conf = dbConf.copy(minDeltasForSnapshot = 2, compactOnCommit = false) new File(remoteDir).delete() // to make sure that the directory gets created - withDB(remoteDir, conf = conf) { db => + val versionToUniqueId = new mutable.HashMap[Long, String]() + withDB(remoteDir, conf = conf, enableStateStoreCheckpointIds = enableStateStoreCheckpointIds, + versionToUniqueId = versionToUniqueId) { db => // load the same version of pending snapshot uploading // This is possible because after committing version x, we can continue to x+1, and replay // x+1. The replay will load a checkpoint by version x. At this moment, the snapshot @@ -1871,13 +2653,13 @@ class RocksDBSuite extends AlsoTestWithChangelogCheckpointingEnabled with Shared // This test was accidentally fixed by // SPARK-48931 (https://github.com/apache/spark/pull/47393) - db.load(0) + db.load(0, versionToUniqueId.get(0)) db.put("foo", "bar") // Snapshot checkpoint not needed db.commit() // Continue using local DB - db.load(1) + db.load(1, versionToUniqueId.get(1)) db.put("foo", "bar") // Should create a local RocksDB snapshot db.commit() @@ -1885,19 +2667,19 @@ class RocksDBSuite extends AlsoTestWithChangelogCheckpointingEnabled with Shared db.doMaintenance() // This will reload Db from the cloud. 
- db.load(1) + db.load(1, versionToUniqueId.get(1)) db.put("foo", "bar") // Should create another local snapshot db.commit() // Continue using local DB - db.load(2) + db.load(2, versionToUniqueId.get(2)) db.put("foo", "bar") // Snapshot checkpoint not needed db.commit() // Reload DB from the cloud, loading from 2.zip - db.load(2) + db.load(2, versionToUniqueId.get(2)) db.put("foo", "bar") // Snapshot checkpoint not needed db.commit() @@ -1906,14 +2688,14 @@ class RocksDBSuite extends AlsoTestWithChangelogCheckpointingEnabled with Shared db.doMaintenance() // Reload new 2.zip just uploaded to validate it is not corrupted. - db.load(2) + db.load(2, versionToUniqueId.get(2)) db.put("foo", "bar") db.commit() // Test the maintenance thread is delayed even after the next snapshot is created. // There will be two outstanding snapshots. for (batchVersion <- 3 to 6) { - db.load(batchVersion) + db.load(batchVersion, versionToUniqueId.get(batchVersion)) db.put("foo", "bar") // In batchVersion 3 and 5, it will generate a local snapshot but won't be uploaded. db.commit() @@ -1924,7 +2706,7 @@ class RocksDBSuite extends AlsoTestWithChangelogCheckpointingEnabled with Shared // maintenance tasks finish quickly. 
for (batchVersion <- 7 to 10) { for (j <- 0 to 1) { - db.load(batchVersion) + db.load(batchVersion, versionToUniqueId.get(batchVersion)) db.put("foo", "bar") db.commit() db.doMaintenance() @@ -1935,22 +2717,27 @@ class RocksDBSuite extends AlsoTestWithChangelogCheckpointingEnabled with Shared for (randomSeed <- 1 to 8) { for (ifTestSkipBatch <- 0 to 1) { - testWithChangelogCheckpointingEnabled( - s"randomized snapshotting $randomSeed ifTestSkipBatch $ifTestSkipBatch") { - // The unit test simulates the case where batches can be reloaded and maintenance tasks + testWithStateStoreCheckpointIdsAndChangelogEnabled("randomized snapshotting " + + s"$randomSeed ifTestSkipBatch $ifTestSkipBatch") { enableStateStoreCheckpointIds => + // The unit test simulates the case where batches can be reloaded and maintenance tasks // can be delayed. After each batch, we randomly decide whether we would move onto the - // next batch, and whetehr maintenance task is executed. + // next batch, and whether maintenance task is executed. val remoteDir = Utils.createTempDir().toString val conf = dbConf.copy(minDeltasForSnapshot = 3, compactOnCommit = false) new File(remoteDir).delete() // to make sure that the directory gets created - withDB(remoteDir, conf = conf) { db => + val versionToUniqueId = new mutable.HashMap[Long, String]() + withDB(remoteDir, conf = dbConf, + enableStateStoreCheckpointIds = enableStateStoreCheckpointIds, + versionToUniqueId = versionToUniqueId) { db => // A second DB is opened to simulate another executor that runs some batches that // skipped in the current DB. 
- withDB(remoteDir, conf = conf) { db2 => + withDB(remoteDir, conf = dbConf, + enableStateStoreCheckpointIds = enableStateStoreCheckpointIds, + versionToUniqueId = versionToUniqueId) { db2 => val random = new Random(randomSeed) var curVer: Int = 0 for (i <- 1 to 100) { - db.load(curVer) + db.load(curVer, versionToUniqueId.get(curVer)) db.put("foo", "bar") db.commit() // For a one in five chance, maintenance task is executed. The chance is created to @@ -1985,8 +2772,8 @@ class RocksDBSuite extends AlsoTestWithChangelogCheckpointingEnabled with Shared } } - test("validate Rocks DB SST files do not have a VersionIdMismatch" + - " when metadata file is not overwritten - scenario 1") { + testWithStateStoreCheckpointIds("validate Rocks DB SST files do not have a VersionIdMismatch" + + " when metadata file is not overwritten - scenario 1") { enableStateStoreCheckpointIds => val fmClass = "org.apache.spark.sql.execution.streaming.state." + "NoOverwriteFileSystemBasedCheckpointFileManager" withTempDir { dir => @@ -1995,84 +2782,94 @@ class RocksDBSuite extends AlsoTestWithChangelogCheckpointingEnabled with Shared hadoopConf.set(STREAMING_CHECKPOINT_FILE_MANAGER_CLASS.parent.key, fmClass) val remoteDir = dir.getCanonicalPath - withDB(remoteDir, conf = dbConf, hadoopConf = hadoopConf) { db1 => - withDB(remoteDir, conf = dbConf, hadoopConf = hadoopConf) { db2 => + val versionToUniqueId = new mutable.HashMap[Long, String]() + withDB(remoteDir, conf = dbConf, hadoopConf = hadoopConf, + enableStateStoreCheckpointIds = enableStateStoreCheckpointIds, + versionToUniqueId = versionToUniqueId) { db1 => + withDB(remoteDir, conf = dbConf, hadoopConf = hadoopConf, + enableStateStoreCheckpointIds = enableStateStoreCheckpointIds, + versionToUniqueId = versionToUniqueId) { db2 => // commit version 1 via db1 - db1.load(0) + db1.load(0, versionToUniqueId.get(0)) db1.put("a", "1") db1.put("b", "1") db1.commit() // commit version 1 via db2 - db2.load(0) + db2.load(0, versionToUniqueId.get(0)) 
db2.put("a", "1") db2.put("b", "1") db2.commit() // commit version 2 via db2 - db2.load(1) + db2.load(1, versionToUniqueId.get(1)) db2.put("a", "2") db2.put("b", "2") db2.commit() // reload version 1, this should succeed - db2.load(1) - db1.load(1) + db2.load(1, versionToUniqueId.get(1)) + db1.load(1, versionToUniqueId.get(1)) // reload version 2, this should succeed - db2.load(2) - db1.load(2) + db2.load(2, versionToUniqueId.get(2)) + db1.load(2, versionToUniqueId.get(2)) } } } } - test("validate Rocks DB SST files do not have a VersionIdMismatch" + - " when metadata file is overwritten - scenario 1") { + testWithStateStoreCheckpointIds("validate Rocks DB SST files do not have a VersionIdMismatch" + + " when metadata file is overwritten - scenario 1") { enableStateStoreCheckpointIds => withTempDir { dir => val dbConf = RocksDBConf(StateStoreConf(new SQLConf())) val hadoopConf = new Configuration() val remoteDir = dir.getCanonicalPath - withDB(remoteDir, conf = dbConf, hadoopConf = hadoopConf) { db1 => - withDB(remoteDir, conf = dbConf, hadoopConf = hadoopConf) { db2 => + val versionToUniqueId = new mutable.HashMap[Long, String]() + withDB(remoteDir, conf = dbConf, hadoopConf = hadoopConf, + enableStateStoreCheckpointIds = enableStateStoreCheckpointIds, + versionToUniqueId = versionToUniqueId) { db1 => + withDB(remoteDir, conf = dbConf, hadoopConf = hadoopConf, + enableStateStoreCheckpointIds = enableStateStoreCheckpointIds, + versionToUniqueId = versionToUniqueId) { db2 => // commit version 1 via db1 - db1.load(0) + db1.load(0, versionToUniqueId.get(0)) db1.put("a", "1") db1.put("b", "1") db1.commit() // commit version 1 via db2 - db2.load(0) + db2.load(0, versionToUniqueId.get(0)) db2.put("a", "1") db2.put("b", "1") db2.commit() // commit version 2 via db2 - db2.load(1) + db2.load(1, versionToUniqueId.get(1)) db2.put("a", "2") db2.put("b", "2") db2.commit() // reload version 1, this should succeed - db2.load(1) - db1.load(1) + db2.load(1, 
versionToUniqueId.get(1)) + db1.load(1, versionToUniqueId.get(1)) // reload version 2, this should succeed - db2.load(2) - db1.load(2) + db2.load(2, versionToUniqueId.get(2)) + db1.load(2, versionToUniqueId.get(2)) } } } } - test("validate Rocks DB SST files do not have a VersionIdMismatch" + - " when metadata file is not overwritten - scenario 2") { + testWithStateStoreCheckpointIds("validate Rocks DB SST files do not have a VersionIdMismatch" + + " when metadata file is not overwritten - scenario 2") { enableStateStoreCheckpointIds => val fmClass = "org.apache.spark.sql.execution.streaming.state." + "NoOverwriteFileSystemBasedCheckpointFileManager" withTempDir { dir => @@ -2081,77 +2878,87 @@ class RocksDBSuite extends AlsoTestWithChangelogCheckpointingEnabled with Shared hadoopConf.set(STREAMING_CHECKPOINT_FILE_MANAGER_CLASS.parent.key, fmClass) val remoteDir = dir.getCanonicalPath - withDB(remoteDir, conf = dbConf, hadoopConf = hadoopConf) { db1 => - withDB(remoteDir, conf = dbConf, hadoopConf = hadoopConf) { db2 => + val versionToUniqueId = new mutable.HashMap[Long, String]() + withDB(remoteDir, conf = dbConf, hadoopConf = hadoopConf, + enableStateStoreCheckpointIds = enableStateStoreCheckpointIds, + versionToUniqueId = versionToUniqueId) { db1 => + withDB(remoteDir, conf = dbConf, hadoopConf = hadoopConf, + enableStateStoreCheckpointIds = enableStateStoreCheckpointIds, + versionToUniqueId = versionToUniqueId) { db2 => // commit version 1 via db2 - db2.load(0) + db2.load(0, versionToUniqueId.get(0)) db2.put("a", "1") db2.put("b", "1") db2.commit() // commit version 1 via db1 - db1.load(0) + db1.load(0, versionToUniqueId.get(0)) db1.put("a", "1") db1.put("b", "1") db1.commit() // commit version 2 via db2 - db2.load(1) + db2.load(1, versionToUniqueId.get(1)) db2.put("a", "2") db2.put("b", "2") db2.commit() // reload version 1, this should succeed - db2.load(1) - db1.load(1) + db2.load(1, versionToUniqueId.get(1)) + db1.load(1, versionToUniqueId.get(1)) // reload 
version 2, this should succeed - db2.load(2) - db1.load(2) + db2.load(2, versionToUniqueId.get(2)) + db1.load(2, versionToUniqueId.get(2)) } } } } - test("validate Rocks DB SST files do not have a VersionIdMismatch" + - " when metadata file is overwritten - scenario 2") { + testWithStateStoreCheckpointIds("validate Rocks DB SST files do not have a VersionIdMismatch" + + " when metadata file is overwritten - scenario 2") { enableStateStoreCheckpointIds => withTempDir { dir => val dbConf = RocksDBConf(StateStoreConf(new SQLConf())) val hadoopConf = new Configuration() val remoteDir = dir.getCanonicalPath - withDB(remoteDir, conf = dbConf, hadoopConf = hadoopConf) { db1 => - withDB(remoteDir, conf = dbConf, hadoopConf = hadoopConf) { db2 => + val versionToUniqueId = new mutable.HashMap[Long, String]() + withDB(remoteDir, conf = dbConf, hadoopConf = hadoopConf, + enableStateStoreCheckpointIds = enableStateStoreCheckpointIds, + versionToUniqueId = versionToUniqueId) { db1 => + withDB(remoteDir, conf = dbConf, hadoopConf = hadoopConf, + enableStateStoreCheckpointIds = enableStateStoreCheckpointIds, + versionToUniqueId = versionToUniqueId) { db2 => // commit version 1 via db2 - db2.load(0) + db2.load(0, versionToUniqueId.get(0)) db2.put("a", "1") db2.put("b", "1") db2.commit() // commit version 1 via db1 - db1.load(0) + db1.load(0, versionToUniqueId.get(0)) db1.put("a", "1") db1.put("b", "1") db1.commit() // commit version 2 via db2 - db2.load(1) + db2.load(1, versionToUniqueId.get(1)) db2.put("a", "2") db2.put("b", "2") db2.commit() // reload version 1, this should succeed - db2.load(1) - db1.load(1) + db2.load(1, versionToUniqueId.get(1)) + db1.load(1, versionToUniqueId.get(1)) // reload version 2, this should succeed - db2.load(2) - db1.load(2) + db2.load(2, versionToUniqueId.get(2)) + db1.load(2, versionToUniqueId.get(2)) } } } @@ -2429,25 +3236,77 @@ class RocksDBSuite extends AlsoTestWithChangelogCheckpointingEnabled with Shared private def dbConf = 
RocksDBConf(StateStoreConf(SQLConf.get.clone())) + class RocksDBCheckpointFormatV2( + dfsRootDir: String, + conf: RocksDBConf, + localRootDir: File = Utils.createTempDir(), + hadoopConf: Configuration = new Configuration, + loggingId: String = "", + useColumnFamilies: Boolean = false, + val versionToUniqueId : mutable.Map[Long, String] = mutable.Map[Long, String]()) + extends RocksDB(dfsRootDir, conf, localRootDir, hadoopConf, loggingId, + useColumnFamilies, enableStateStoreCheckpointIds = true) { + + override def load( + version: Long, + ckptId: Option[String] = None, + readOnly: Boolean = false): RocksDB = { + // When a ckptId is defined, it means the test is explicitly using v2 semantic + // When it is not, it is possible that implicitly uses it. + // So still do a versionToUniqueId.get + ckptId match { + case Some(_) => super.load(version, ckptId, readOnly) + case None => super.load(version, versionToUniqueId.get(version), readOnly) + } + } + + override def commit(): Long = { + val ret = super.commit() + // update versionToUniqueId from lineageManager + lineageManager.getLineageForCurrVersion().foreach { + case LineageItem(version, id) => versionToUniqueId.getOrElseUpdate(version, id) + } + ret + } + } + + // withDB override with checkpoint format v2 def withDB[T]( remoteDir: String, version: Int = 0, conf: RocksDBConf = dbConf, - hadoopConf: Configuration = new Configuration(), + hadoopConf: Configuration = hadoopConf, useColumnFamilies: Boolean = false, + enableStateStoreCheckpointIds: Boolean = false, + // versionToUniqueId is used in checkpoint format v2, it simulates the lineage + // stored in the commit log. The lineage will be automatically updated in db.commit() + // When testing V2, please create a versionToUniqueId map + // and call versionToUniqueId.get(version) in the db.load() function. + // In V1, versionToUniqueId is not used and versionToUniqueId.get(version) returns None. 
+ versionToUniqueId : mutable.Map[Long, String] = mutable.Map[Long, String](), localDir: File = Utils.createTempDir())( func: RocksDB => T): T = { var db: RocksDB = null try { - db = new RocksDB( - remoteDir, - conf = conf, - localRootDir = localDir, - hadoopConf = hadoopConf, - loggingId = s"[Thread-${Thread.currentThread.getId}]", - useColumnFamilies = useColumnFamilies - ) - db.load(version) + db = if (enableStateStoreCheckpointIds) { + new RocksDBCheckpointFormatV2( + remoteDir, + conf = conf, + localRootDir = localDir, + hadoopConf = hadoopConf, + loggingId = s"[Thread-${Thread.currentThread.getId}]", + useColumnFamilies = useColumnFamilies, + versionToUniqueId = versionToUniqueId) + } else { + new RocksDB( + remoteDir, + conf = conf, + localRootDir = localDir, + hadoopConf = hadoopConf, + loggingId = s"[Thread-${Thread.currentThread.getId}]", + useColumnFamilies = useColumnFamilies) + } + db.load(version, versionToUniqueId.get(version)) func(db) } finally { if (db != null) { @@ -2468,7 +3327,8 @@ class RocksDBSuite extends AlsoTestWithChangelogCheckpointingEnabled with Shared fileToLengths: Seq[(String, Int)], version: Int, numKeys: Int, - fileMapping: RocksDBFileMapping): Unit = { + fileMapping: RocksDBFileMapping, + checkpointUniqueId: Option[String] = None): Unit = { val checkpointDir = Utils.createTempDir().getAbsolutePath // local dir to create checkpoints generateFiles(checkpointDir, fileToLengths) val (dfsFileSuffix, immutableFileMapping) = fileMapping.createSnapshotFileMapping( @@ -2477,7 +3337,9 @@ class RocksDBSuite extends AlsoTestWithChangelogCheckpointingEnabled with Shared checkpointDir, version, numKeys, - immutableFileMapping) + immutableFileMapping, + checkpointUniqueId = checkpointUniqueId) + val snapshotInfo = RocksDBVersionSnapshotInfo(version, dfsFileSuffix) fileMapping.snapshotsPendingUpload.remove(snapshotInfo) } @@ -2488,9 +3350,10 @@ class RocksDBSuite extends AlsoTestWithChangelogCheckpointingEnabled with Shared version: Int, 
expectedFiles: Seq[(String, Int)], expectedNumKeys: Int, - fileMapping: RocksDBFileMapping): Unit = { - val metadata = fileManager.loadCheckpointFromDfs(version, - verificationDir, fileMapping) + fileMapping: RocksDBFileMapping, + checkpointUniqueId: Option[String] = None): Unit = { + val metadata = fileManager.loadCheckpointFromDfs( + version, verificationDir, fileMapping, checkpointUniqueId) val filesAndLengths = listFiles(verificationDir).map(f => f.getName -> f.length).toSet ++ listFiles(verificationDir + "/archive").map(f => s"archive/${f.getName}" -> f.length()).toSet diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/state/TimerSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/state/TimerSuite.scala index 24a120be9d9af..428845d5ebcbb 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/state/TimerSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/state/TimerSuite.scala @@ -72,8 +72,9 @@ class TimerSuite extends StateVariableSuiteBase { assert(timerState1.listTimers().toSet === Set(15000L, 1000L)) assert(timerState1.getExpiredTimers(Long.MaxValue).toSeq === Seq(("test_key", 1000L), ("test_key", 15000L))) - // if timestamp equals to expiryTimestampsMs, will not considered expired - assert(timerState1.getExpiredTimers(15000L).toSeq === Seq(("test_key", 1000L))) + // if timestamp equals to expiryTimestampsMs, it will be considered expired + assert(timerState1.getExpiredTimers(15000L).toSeq === + Seq(("test_key", 1000L), ("test_key", 15000L))) assert(timerState1.listTimers().toSet === Set(15000L, 1000L)) timerState1.registerTimer(20L * 1000) @@ -128,7 +129,7 @@ class TimerSuite extends StateVariableSuiteBase { timerTimerstamps.foreach(timerState.registerTimer) assert(timerState.getExpiredTimers(Long.MaxValue).toSeq.map(_._2) === timerTimerstamps.sorted) assert(timerState.getExpiredTimers(4200L).toSeq.map(_._2) === - timerTimerstamps.sorted.takeWhile(_ 
< 4200L)) + timerTimerstamps.sorted.takeWhile(_ <= 4200L)) assert(timerState.getExpiredTimers(Long.MinValue).toSeq === Seq.empty) ImplicitGroupingKeyTracker.removeImplicitKey() } @@ -162,7 +163,7 @@ class TimerSuite extends StateVariableSuiteBase { (timerTimestamps1 ++ timerTimestamps2 ++ timerTimerStamps3).sorted) assert(timerState1.getExpiredTimers(Long.MinValue).toSeq === Seq.empty) assert(timerState1.getExpiredTimers(8000L).toSeq.map(_._2) === - (timerTimestamps1 ++ timerTimestamps2 ++ timerTimerStamps3).sorted.takeWhile(_ < 8000L)) + (timerTimestamps1 ++ timerTimestamps2 ++ timerTimerStamps3).sorted.takeWhile(_ <= 8000L)) } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/state/ValueStateSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/state/ValueStateSuite.scala index 55d08cd8f12a7..037fed045e8ca 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/state/ValueStateSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/state/ValueStateSuite.scala @@ -327,8 +327,8 @@ class ValueStateSuite extends StateVariableSuiteBase { var ttlValue = testState.getTTLValue() assert(ttlValue.isDefined) assert(ttlValue.get._2 === ttlExpirationMs) - var ttlStateValueIterator = testState.getValuesInTTLState() - assert(ttlStateValueIterator.hasNext) + var ttlStateValueIterator = testState.getValueInTTLState() + assert(ttlStateValueIterator.isDefined) // increment batchProcessingTime, or watermark and ensure expired value is not returned val nextBatchHandle = new StatefulProcessorHandleImpl(store, UUID.randomUUID(), @@ -349,10 +349,9 @@ class ValueStateSuite extends StateVariableSuiteBase { ttlValue = nextBatchTestState.getTTLValue() assert(ttlValue.isDefined) assert(ttlValue.get._2 === ttlExpirationMs) - ttlStateValueIterator = nextBatchTestState.getValuesInTTLState() - assert(ttlStateValueIterator.hasNext) - assert(ttlStateValueIterator.next() === ttlExpirationMs) - 
assert(ttlStateValueIterator.isEmpty) + ttlStateValueIterator = nextBatchTestState.getValueInTTLState() + assert(ttlStateValueIterator.isDefined) + assert(ttlStateValueIterator.get === ttlExpirationMs) // getWithoutTTL should still return the expired value assert(nextBatchTestState.getWithoutEnforcingTTL().get === "v1") @@ -412,8 +411,8 @@ class ValueStateSuite extends StateVariableSuiteBase { val ttlValue = testState.getTTLValue() assert(ttlValue.isDefined) assert(ttlValue.get._2 === ttlExpirationMs) - val ttlStateValueIterator = testState.getValuesInTTLState() - assert(ttlStateValueIterator.hasNext) + val ttlStateValueIterator = testState.getValueInTTLState() + assert(ttlStateValueIterator.isDefined) } } } @@ -423,7 +422,7 @@ class ValueStateSuite extends StateVariableSuiteBase { * types (ValueState, ListState, MapState) used in arbitrary stateful operators. */ abstract class StateVariableSuiteBase extends SharedSparkSession - with BeforeAndAfter { + with BeforeAndAfter with AlsoTestWithEncodingTypes { before { StateStore.stop() diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/vectorized/ColumnVectorSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/vectorized/ColumnVectorSuite.scala index 0cc4f7bf2548e..0edbfd10d8cde 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/vectorized/ColumnVectorSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/vectorized/ColumnVectorSuite.scala @@ -274,6 +274,19 @@ class ColumnVectorSuite extends SparkFunSuite with SQLHelper { } } + testVectors("mutable ColumnarRow with TimestampNTZType", 10, TimestampNTZType) { testVector => + val mutableRow = new MutableColumnarRow(Array(testVector)) + (0 until 10).foreach { i => + mutableRow.rowId = i + mutableRow.setLong(0, 10 - i) + } + (0 until 10).foreach { i => + mutableRow.rowId = i + assert(mutableRow.get(0, TimestampNTZType) === (10 - i)) + assert(mutableRow.copy().get(0, TimestampNTZType) === (10 - i)) + } + } + 
val arrayType: ArrayType = ArrayType(IntegerType, containsNull = true) testVectors("array", 10, arrayType) { testVector => @@ -384,18 +397,24 @@ class ColumnVectorSuite extends SparkFunSuite with SQLHelper { } val structType: StructType = new StructType().add("int", IntegerType).add("double", DoubleType) + .add("ts", TimestampNTZType) testVectors("struct", 10, structType) { testVector => val c1 = testVector.getChild(0) val c2 = testVector.getChild(1) + val c3 = testVector.getChild(2) c1.putInt(0, 123) c2.putDouble(0, 3.45) + c3.putLong(0, 1000L) c1.putInt(1, 456) c2.putDouble(1, 5.67) + c3.putLong(1, 2000L) assert(testVector.getStruct(0).get(0, IntegerType) === 123) assert(testVector.getStruct(0).get(1, DoubleType) === 3.45) + assert(testVector.getStruct(0).get(2, TimestampNTZType) === 1000L) assert(testVector.getStruct(1).get(0, IntegerType) === 456) assert(testVector.getStruct(1).get(1, DoubleType) === 5.67) + assert(testVector.getStruct(1).get(2, TimestampNTZType) === 2000L) } testVectors("SPARK-44805: getInts with dictionary", 3, IntegerType) { testVector => diff --git a/sql/core/src/test/scala/org/apache/spark/sql/expressions/ExpressionInfoSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/expressions/ExpressionInfoSuite.scala index a6fc43aa087da..a7af22a0554e9 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/expressions/ExpressionInfoSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/expressions/ExpressionInfoSuite.scala @@ -79,7 +79,8 @@ class ExpressionInfoSuite extends SparkFunSuite with SharedSparkSession { assert(info.getSource === "built-in") val validSources = Seq( - "built-in", "hive", "python_udf", "scala_udf", "java_udf", "python_udtf", "internal") + "built-in", "hive", "python_udf", "scala_udf", "java_udf", "python_udtf", "internal", + "sql_udf") validSources.foreach { source => val info = new ExpressionInfo( "testClass", null, "testName", null, "", "", "", "", "", "", source) @@ -229,6 +230,7 @@ class 
ExpressionInfoSuite extends SparkFunSuite with SharedSparkSession { // Requires dynamic class loading not available in this test suite. "org.apache.spark.sql.catalyst.expressions.FromAvro", "org.apache.spark.sql.catalyst.expressions.ToAvro", + "org.apache.spark.sql.catalyst.expressions.SchemaOfAvro", "org.apache.spark.sql.catalyst.expressions.FromProtobuf", "org.apache.spark.sql.catalyst.expressions.ToProtobuf", classOf[CurrentUser].getName, diff --git a/sql/core/src/test/scala/org/apache/spark/sql/internal/ColumnNodeToExpressionConverterSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/internal/ColumnNodeToExpressionConverterSuite.scala index 76fcdfc380950..d72e86450de22 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/internal/ColumnNodeToExpressionConverterSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/internal/ColumnNodeToExpressionConverterSuite.scala @@ -405,4 +405,5 @@ private[internal] case class Nope(override val origin: Origin = CurrentOrigin.ge extends ColumnNode { override private[internal] def normalize(): Nope = this override def sql: String = "nope" + override private[internal] def children: Seq[ColumnNodeLike] = Seq.empty } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/scripting/SqlScriptingE2eSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/scripting/SqlScriptingE2eSuite.scala new file mode 100644 index 0000000000000..afcdfd343e33b --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/scripting/SqlScriptingE2eSuite.scala @@ -0,0 +1,188 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.scripting + +import org.apache.spark.SparkConf +import org.apache.spark.sql.{AnalysisException, QueryTest, Row} +import org.apache.spark.sql.catalyst.plans.logical.CompoundBody +import org.apache.spark.sql.catalyst.util.QuotingUtils.toSQLConf +import org.apache.spark.sql.exceptions.SqlScriptingException +import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.test.SharedSparkSession + + +/** + * End-to-end tests for SQL Scripting. + * This suite is not intended to heavily test the SQL scripting (parser & interpreter) logic. + * It is rather focused on testing the sql() API - whether it can handle SQL scripts correctly, + * results are returned in expected manner, config flags are applied properly, etc. + * For full functionality tests, see SqlScriptingParserSuite and SqlScriptingInterpreterSuite. 
+ */ +class SqlScriptingE2eSuite extends QueryTest with SharedSparkSession { + // Helpers + private def verifySqlScriptResult(sqlText: String, expected: Seq[Row]): Unit = { + val df = spark.sql(sqlText) + checkAnswer(df, expected) + } + + private def verifySqlScriptResultWithNamedParams( + sqlText: String, + expected: Seq[Row], + args: Map[String, Any]): Unit = { + val df = spark.sql(sqlText, args) + checkAnswer(df, expected) + } + + // Tests setup + override protected def sparkConf: SparkConf = { + super.sparkConf.set(SQLConf.SQL_SCRIPTING_ENABLED.key, "true") + } + + // Tests + test("SQL Scripting not enabled") { + withSQLConf(SQLConf.SQL_SCRIPTING_ENABLED.key -> "false") { + val sqlScriptText = + """ + |BEGIN + | SELECT 1; + |END""".stripMargin + checkError( + exception = intercept[SqlScriptingException] { + spark.sql(sqlScriptText).asInstanceOf[CompoundBody] + }, + condition = "UNSUPPORTED_FEATURE.SQL_SCRIPTING", + parameters = Map("sqlScriptingEnabled" -> toSQLConf(SQLConf.SQL_SCRIPTING_ENABLED.key))) + } + } + + test("single select") { + val sqlText = "SELECT 1;" + verifySqlScriptResult(sqlText, Seq(Row(1))) + } + + test("multiple selects") { + val sqlText = + """ + |BEGIN + | SELECT 1; + | SELECT 2; + |END""".stripMargin + verifySqlScriptResult(sqlText, Seq(Row(2))) + } + + test("multi statement - simple") { + withTable("t") { + val sqlScript = + """ + |BEGIN + | CREATE TABLE t (a INT, b STRING, c DOUBLE) USING parquet; + | INSERT INTO t VALUES (1, 'a', 1.0); + | SELECT a FROM t; + |END + |""".stripMargin + verifySqlScriptResult(sqlScript, Seq(Row(1))) + } + } + + test("script without result statement") { + val sqlScript = + """ + |BEGIN + | DECLARE x INT; + | SET x = 1; + | DROP TEMPORARY VARIABLE x; + |END + |""".stripMargin + verifySqlScriptResult(sqlScript, Seq.empty) + } + + test("empty script") { + val sqlScript = + """ + |BEGIN + |END + |""".stripMargin + verifySqlScriptResult(sqlScript, Seq.empty) + } + + test("named params") { + val sqlScriptText = 
+ """ + |BEGIN + | SELECT 1; + | IF :param_1 > 10 THEN + | SELECT :param_2; + | ELSE + | SELECT :param_3; + | END IF; + |END""".stripMargin + // Define a map with SQL parameters + val args: Map[String, Any] = Map( + "param_1" -> 5, + "param_2" -> "greater", + "param_3" -> "smaller" + ) + verifySqlScriptResultWithNamedParams(sqlScriptText, Seq(Row("smaller")), args) + } + + test("positional params") { + val sqlScriptText = + """ + |BEGIN + | SELECT 1; + | IF ? > 10 THEN + | SELECT ?; + | ELSE + | SELECT ?; + | END IF; + |END""".stripMargin + // Define an array with SQL parameters in the correct order. + val args: Array[Any] = Array(5, "greater", "smaller") + checkError( + exception = intercept[SqlScriptingException] { + spark.sql(sqlScriptText, args).asInstanceOf[CompoundBody] + }, + condition = "UNSUPPORTED_FEATURE.SQL_SCRIPTING_WITH_POSITIONAL_PARAMETERS", + parameters = Map.empty) + } + + test("named params with positional params - should fail") { + val sqlScriptText = + """ + |BEGIN + | SELECT ?; + | IF :param > 10 THEN + | SELECT 1; + | ELSE + | SELECT 2; + | END IF; + |END""".stripMargin + // Define a map with SQL parameters. 
+ val args: Map[String, Any] = Map("param" -> 5) + checkError( + exception = intercept[AnalysisException] { + spark.sql(sqlScriptText, args).asInstanceOf[CompoundBody] + }, + condition = "UNBOUND_SQL_PARAMETER", + parameters = Map("name" -> "_16"), + context = ExpectedContext( + fragment = "?", + start = 16, + stop = 16)) + } +} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/scripting/SqlScriptingExecutionNodeSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/scripting/SqlScriptingExecutionNodeSuite.scala index baad5702f4f22..325c8ce380c63 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/scripting/SqlScriptingExecutionNodeSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/scripting/SqlScriptingExecutionNodeSuite.scala @@ -18,11 +18,12 @@ package org.apache.spark.sql.scripting import org.apache.spark.SparkFunSuite -import org.apache.spark.sql.SparkSession +import org.apache.spark.sql.{DataFrame, Row, SparkSession} import org.apache.spark.sql.catalyst.expressions.{Alias, Attribute, Literal} -import org.apache.spark.sql.catalyst.plans.logical.{LeafNode, OneRowRelation, Project} +import org.apache.spark.sql.catalyst.plans.logical.{DropVariable, LeafNode, OneRowRelation, Project} import org.apache.spark.sql.catalyst.trees.Origin import org.apache.spark.sql.test.SharedSparkSession +import org.apache.spark.sql.types.{IntegerType, StructField, StructType} /** * Unit tests for execution nodes from SqlScriptingExecutionNode.scala. @@ -31,6 +32,35 @@ import org.apache.spark.sql.test.SharedSparkSession */ class SqlScriptingExecutionNodeSuite extends SparkFunSuite with SharedSparkSession { // Helpers + case class TestCompoundBody( + statements: Seq[CompoundStatementExec], + label: Option[String] = None, + isScope: Boolean = false, + context: SqlScriptingExecutionContext = null) + extends CompoundBodyExec(statements, label, isScope, context) { + + // No-op to remove unnecessary logic for these tests. 
+ override def enterScope(): Unit = () + + // No-op to remove unnecessary logic for these tests. + override def exitScope(): Unit = () + } + + case class TestForStatement( + query: SingleStatementExec, + variableName: Option[String], + body: CompoundBodyExec, + override val label: Option[String], + session: SparkSession, + context: SqlScriptingExecutionContext = null) + extends ForStatementExec( + query, + variableName, + body, + label, + session, + context) + case class TestLeafStatement(testVal: String) extends LeafStatementExec { override def reset(): Unit = () } @@ -39,7 +69,10 @@ class SqlScriptingExecutionNodeSuite extends SparkFunSuite with SharedSparkSessi extends SingleStatementExec( parsedPlan = Project(Seq(Alias(Literal(condVal), description)()), OneRowRelation()), Origin(startIndex = Some(0), stopIndex = Some(description.length)), - isInternal = false) + Map.empty, + isInternal = false, + null + ) case class DummyLogicalPlan() extends LeafNode { override def output: Seq[Attribute] = Seq.empty @@ -50,7 +83,10 @@ class SqlScriptingExecutionNodeSuite extends SparkFunSuite with SharedSparkSessi extends SingleStatementExec( parsedPlan = DummyLogicalPlan(), Origin(startIndex = Some(0), stopIndex = Some(description.length)), - isInternal = false) + Map.empty, + isInternal = false, + null + ) class LoopBooleanConditionEvaluator(condition: TestLoopCondition) { private var callCount: Int = 0 @@ -68,7 +104,7 @@ class SqlScriptingExecutionNodeSuite extends SparkFunSuite with SharedSparkSessi case class TestWhile( condition: TestLoopCondition, - body: CompoundBodyExec, + body: TestCompoundBody, label: Option[String] = None) extends WhileStatementExec(condition, body, label, spark) { @@ -80,9 +116,9 @@ class SqlScriptingExecutionNodeSuite extends SparkFunSuite with SharedSparkSessi } case class TestRepeat( - condition: TestLoopCondition, - body: CompoundBodyExec, - label: Option[String] = None) + condition: TestLoopCondition, + body: TestCompoundBody, + label: 
Option[String] = None) extends RepeatStatementExec(condition, body, label, spark) { private val evaluator = new LoopBooleanConditionEvaluator(condition) @@ -92,6 +128,24 @@ class SqlScriptingExecutionNodeSuite extends SparkFunSuite with SharedSparkSessi statement: LeafStatementExec): Boolean = evaluator.evaluateLoopBooleanCondition() } + case class MockQuery(numberOfRows: Int, columnName: String, description: String) + extends SingleStatementExec( + DummyLogicalPlan(), + Origin(startIndex = Some(0), stopIndex = Some(description.length)), + Map.empty, + isInternal = false, + null) { + override def buildDataFrame(session: SparkSession): DataFrame = { + val data = Seq.range(0, numberOfRows).map(Row(_)) + val schema = List(StructField(columnName, IntegerType)) + + spark.createDataFrame( + spark.sparkContext.parallelize(data), + StructType(schema) + ) + } + } + private def extractStatementValue(statement: CompoundStatementExec): String = statement match { case TestLeafStatement(testVal) => testVal @@ -100,18 +154,21 @@ class SqlScriptingExecutionNodeSuite extends SparkFunSuite with SharedSparkSessi case loopStmt: LoopStatementExec => loopStmt.label.get case leaveStmt: LeaveStatementExec => leaveStmt.label case iterateStmt: IterateStatementExec => iterateStmt.label + case forStmt: TestForStatement => forStmt.label.get + case dropStmt: SingleStatementExec if dropStmt.parsedPlan.isInstanceOf[DropVariable] + => "DropVariable" case _ => fail("Unexpected statement type") } // Tests test("test body - single statement") { - val iter = new CompoundBodyExec(Seq(TestLeafStatement("one"))).getTreeIterator + val iter = TestCompoundBody(Seq(TestLeafStatement("one"))).getTreeIterator val statements = iter.map(extractStatementValue).toSeq assert(statements === Seq("one")) } test("test body - no nesting") { - val iter = new CompoundBodyExec( + val iter = TestCompoundBody( Seq( TestLeafStatement("one"), TestLeafStatement("two"), @@ -122,26 +179,26 @@ class SqlScriptingExecutionNodeSuite 
extends SparkFunSuite with SharedSparkSessi } test("test body - nesting") { - val iter = new CompoundBodyExec( + val iter = TestCompoundBody( Seq( - new CompoundBodyExec(Seq(TestLeafStatement("one"), TestLeafStatement("two"))), + TestCompoundBody(Seq(TestLeafStatement("one"), TestLeafStatement("two"))), TestLeafStatement("three"), - new CompoundBodyExec(Seq(TestLeafStatement("four"), TestLeafStatement("five"))))) + TestCompoundBody(Seq(TestLeafStatement("four"), TestLeafStatement("five"))))) .getTreeIterator val statements = iter.map(extractStatementValue).toSeq assert(statements === Seq("one", "two", "three", "four", "five")) } test("if else - enter body of the IF clause") { - val iter = new CompoundBodyExec(Seq( + val iter = TestCompoundBody(Seq( new IfElseStatementExec( conditions = Seq( TestIfElseCondition(condVal = true, description = "con1") ), conditionalBodies = Seq( - new CompoundBodyExec(Seq(TestLeafStatement("body1"))) + TestCompoundBody(Seq(TestLeafStatement("body1"))) ), - elseBody = Some(new CompoundBodyExec(Seq(TestLeafStatement("body2")))), + elseBody = Some(TestCompoundBody(Seq(TestLeafStatement("body2")))), session = spark ) )).getTreeIterator @@ -150,15 +207,15 @@ class SqlScriptingExecutionNodeSuite extends SparkFunSuite with SharedSparkSessi } test("if else - enter body of the ELSE clause") { - val iter = new CompoundBodyExec(Seq( + val iter = TestCompoundBody(Seq( new IfElseStatementExec( conditions = Seq( TestIfElseCondition(condVal = false, description = "con1") ), conditionalBodies = Seq( - new CompoundBodyExec(Seq(TestLeafStatement("body1"))) + TestCompoundBody(Seq(TestLeafStatement("body1"))) ), - elseBody = Some(new CompoundBodyExec(Seq(TestLeafStatement("body2")))), + elseBody = Some(TestCompoundBody(Seq(TestLeafStatement("body2")))), session = spark ) )).getTreeIterator @@ -167,17 +224,17 @@ class SqlScriptingExecutionNodeSuite extends SparkFunSuite with SharedSparkSessi } test("if else if - enter body of the IF clause") { - val iter = 
new CompoundBodyExec(Seq( + val iter = TestCompoundBody(Seq( new IfElseStatementExec( conditions = Seq( TestIfElseCondition(condVal = true, description = "con1"), TestIfElseCondition(condVal = false, description = "con2") ), conditionalBodies = Seq( - new CompoundBodyExec(Seq(TestLeafStatement("body1"))), - new CompoundBodyExec(Seq(TestLeafStatement("body2"))) + TestCompoundBody(Seq(TestLeafStatement("body1"))), + TestCompoundBody(Seq(TestLeafStatement("body2"))) ), - elseBody = Some(new CompoundBodyExec(Seq(TestLeafStatement("body3")))), + elseBody = Some(TestCompoundBody(Seq(TestLeafStatement("body3")))), session = spark ) )).getTreeIterator @@ -186,17 +243,17 @@ class SqlScriptingExecutionNodeSuite extends SparkFunSuite with SharedSparkSessi } test("if else if - enter body of the ELSE IF clause") { - val iter = new CompoundBodyExec(Seq( + val iter = TestCompoundBody(Seq( new IfElseStatementExec( conditions = Seq( TestIfElseCondition(condVal = false, description = "con1"), TestIfElseCondition(condVal = true, description = "con2") ), conditionalBodies = Seq( - new CompoundBodyExec(Seq(TestLeafStatement("body1"))), - new CompoundBodyExec(Seq(TestLeafStatement("body2"))) + TestCompoundBody(Seq(TestLeafStatement("body1"))), + TestCompoundBody(Seq(TestLeafStatement("body2"))) ), - elseBody = Some(new CompoundBodyExec(Seq(TestLeafStatement("body3")))), + elseBody = Some(TestCompoundBody(Seq(TestLeafStatement("body3")))), session = spark ) )).getTreeIterator @@ -205,7 +262,7 @@ class SqlScriptingExecutionNodeSuite extends SparkFunSuite with SharedSparkSessi } test("if else if - enter body of the second ELSE IF clause") { - val iter = new CompoundBodyExec(Seq( + val iter = TestCompoundBody(Seq( new IfElseStatementExec( conditions = Seq( TestIfElseCondition(condVal = false, description = "con1"), @@ -213,11 +270,11 @@ class SqlScriptingExecutionNodeSuite extends SparkFunSuite with SharedSparkSessi TestIfElseCondition(condVal = true, description = "con3") ), 
conditionalBodies = Seq( - new CompoundBodyExec(Seq(TestLeafStatement("body1"))), - new CompoundBodyExec(Seq(TestLeafStatement("body2"))), - new CompoundBodyExec(Seq(TestLeafStatement("body3"))) + TestCompoundBody(Seq(TestLeafStatement("body1"))), + TestCompoundBody(Seq(TestLeafStatement("body2"))), + TestCompoundBody(Seq(TestLeafStatement("body3"))) ), - elseBody = Some(new CompoundBodyExec(Seq(TestLeafStatement("body4")))), + elseBody = Some(TestCompoundBody(Seq(TestLeafStatement("body4")))), session = spark ) )).getTreeIterator @@ -226,17 +283,17 @@ class SqlScriptingExecutionNodeSuite extends SparkFunSuite with SharedSparkSessi } test("if else if - enter body of the ELSE clause") { - val iter = new CompoundBodyExec(Seq( + val iter = TestCompoundBody(Seq( new IfElseStatementExec( conditions = Seq( TestIfElseCondition(condVal = false, description = "con1"), TestIfElseCondition(condVal = false, description = "con2") ), conditionalBodies = Seq( - new CompoundBodyExec(Seq(TestLeafStatement("body1"))), - new CompoundBodyExec(Seq(TestLeafStatement("body2"))) + TestCompoundBody(Seq(TestLeafStatement("body1"))), + TestCompoundBody(Seq(TestLeafStatement("body2"))) ), - elseBody = Some(new CompoundBodyExec(Seq(TestLeafStatement("body3")))), + elseBody = Some(TestCompoundBody(Seq(TestLeafStatement("body3")))), session = spark ) )).getTreeIterator @@ -245,15 +302,15 @@ class SqlScriptingExecutionNodeSuite extends SparkFunSuite with SharedSparkSessi } test("if else if - without else (successful check)") { - val iter = new CompoundBodyExec(Seq( + val iter = TestCompoundBody(Seq( new IfElseStatementExec( conditions = Seq( TestIfElseCondition(condVal = false, description = "con1"), TestIfElseCondition(condVal = true, description = "con2") ), conditionalBodies = Seq( - new CompoundBodyExec(Seq(TestLeafStatement("body1"))), - new CompoundBodyExec(Seq(TestLeafStatement("body2"))) + TestCompoundBody(Seq(TestLeafStatement("body1"))), + 
TestCompoundBody(Seq(TestLeafStatement("body2"))) ), elseBody = None, session = spark @@ -264,15 +321,15 @@ class SqlScriptingExecutionNodeSuite extends SparkFunSuite with SharedSparkSessi } test("if else if - without else (unsuccessful checks)") { - val iter = new CompoundBodyExec(Seq( + val iter = TestCompoundBody(Seq( new IfElseStatementExec( conditions = Seq( TestIfElseCondition(condVal = false, description = "con1"), TestIfElseCondition(condVal = false, description = "con2") ), conditionalBodies = Seq( - new CompoundBodyExec(Seq(TestLeafStatement("body1"))), - new CompoundBodyExec(Seq(TestLeafStatement("body2"))) + TestCompoundBody(Seq(TestLeafStatement("body1"))), + TestCompoundBody(Seq(TestLeafStatement("body2"))) ), elseBody = None, session = spark @@ -283,10 +340,10 @@ class SqlScriptingExecutionNodeSuite extends SparkFunSuite with SharedSparkSessi } test("while - doesn't enter body") { - val iter = new CompoundBodyExec(Seq( + val iter = TestCompoundBody(Seq( TestWhile( condition = TestLoopCondition(condVal = true, reps = 0, description = "con1"), - body = new CompoundBodyExec(Seq(TestLeafStatement("body1"))) + body = TestCompoundBody(Seq(TestLeafStatement("body1"))) ) )).getTreeIterator val statements = iter.map(extractStatementValue).toSeq @@ -294,10 +351,10 @@ class SqlScriptingExecutionNodeSuite extends SparkFunSuite with SharedSparkSessi } test("while - enters body once") { - val iter = new CompoundBodyExec(Seq( + val iter = TestCompoundBody(Seq( TestWhile( condition = TestLoopCondition(condVal = true, reps = 1, description = "con1"), - body = new CompoundBodyExec(Seq(TestLeafStatement("body1"))) + body = TestCompoundBody(Seq(TestLeafStatement("body1"))) ) )).getTreeIterator val statements = iter.map(extractStatementValue).toSeq @@ -305,10 +362,10 @@ class SqlScriptingExecutionNodeSuite extends SparkFunSuite with SharedSparkSessi } test("while - enters body with multiple statements multiple times") { - val iter = new CompoundBodyExec(Seq( + val iter = 
TestCompoundBody(Seq( TestWhile( condition = TestLoopCondition(condVal = true, reps = 2, description = "con1"), - body = new CompoundBodyExec(Seq( + body = TestCompoundBody(Seq( TestLeafStatement("statement1"), TestLeafStatement("statement2"))) ) @@ -319,13 +376,13 @@ class SqlScriptingExecutionNodeSuite extends SparkFunSuite with SharedSparkSessi } test("nested while - 2 times outer 2 times inner") { - val iter = new CompoundBodyExec(Seq( + val iter = TestCompoundBody(Seq( TestWhile( condition = TestLoopCondition(condVal = true, reps = 2, description = "con1"), - body = new CompoundBodyExec(Seq( + body = TestCompoundBody(Seq( TestWhile( condition = TestLoopCondition(condVal = true, reps = 2, description = "con2"), - body = new CompoundBodyExec(Seq(TestLeafStatement("body1"))) + body = TestCompoundBody(Seq(TestLeafStatement("body1"))) )) ) ) @@ -338,10 +395,10 @@ class SqlScriptingExecutionNodeSuite extends SparkFunSuite with SharedSparkSessi } test("repeat - true condition") { - val iter = new CompoundBodyExec(Seq( + val iter = TestCompoundBody(Seq( TestRepeat( condition = TestLoopCondition(condVal = false, reps = 0, description = "con1"), - body = new CompoundBodyExec(Seq(TestLeafStatement("body1"))) + body = TestCompoundBody(Seq(TestLeafStatement("body1"))) ) )).getTreeIterator val statements = iter.map(extractStatementValue).toSeq @@ -349,10 +406,10 @@ class SqlScriptingExecutionNodeSuite extends SparkFunSuite with SharedSparkSessi } test("repeat - condition false once") { - val iter = new CompoundBodyExec(Seq( + val iter = TestCompoundBody(Seq( TestRepeat( condition = TestLoopCondition(condVal = false, reps = 1, description = "con1"), - body = new CompoundBodyExec(Seq(TestLeafStatement("body1"))) + body = TestCompoundBody(Seq(TestLeafStatement("body1"))) ) )).getTreeIterator val statements = iter.map(extractStatementValue).toSeq @@ -360,10 +417,10 @@ class SqlScriptingExecutionNodeSuite extends SparkFunSuite with SharedSparkSessi } test("repeat - enters body 
with multiple statements multiple times") { - val iter = new CompoundBodyExec(Seq( + val iter = TestCompoundBody(Seq( TestRepeat( condition = TestLoopCondition(condVal = false, reps = 2, description = "con1"), - body = new CompoundBodyExec(Seq( + body = TestCompoundBody(Seq( TestLeafStatement("statement1"), TestLeafStatement("statement2"))) ) @@ -374,13 +431,13 @@ class SqlScriptingExecutionNodeSuite extends SparkFunSuite with SharedSparkSessi } test("nested repeat") { - val iter = new CompoundBodyExec(Seq( + val iter = TestCompoundBody(Seq( TestRepeat( condition = TestLoopCondition(condVal = false, reps = 2, description = "con1"), - body = new CompoundBodyExec(Seq( + body = TestCompoundBody(Seq( TestRepeat( condition = TestLoopCondition(condVal = false, reps = 2, description = "con2"), - body = new CompoundBodyExec(Seq(TestLeafStatement("body1"))) + body = TestCompoundBody(Seq(TestLeafStatement("body1"))) )) ) ) @@ -396,7 +453,7 @@ class SqlScriptingExecutionNodeSuite extends SparkFunSuite with SharedSparkSessi } test("leave compound block") { - val iter = new CompoundBodyExec( + val iter = TestCompoundBody( statements = Seq( TestLeafStatement("one"), new LeaveStatementExec("lbl") @@ -408,11 +465,11 @@ class SqlScriptingExecutionNodeSuite extends SparkFunSuite with SharedSparkSessi } test("leave while loop") { - val iter = new CompoundBodyExec( + val iter = TestCompoundBody( statements = Seq( TestWhile( condition = TestLoopCondition(condVal = true, reps = 2, description = "con1"), - body = new CompoundBodyExec(Seq( + body = TestCompoundBody(Seq( TestLeafStatement("body1"), new LeaveStatementExec("lbl")) ), @@ -425,11 +482,11 @@ class SqlScriptingExecutionNodeSuite extends SparkFunSuite with SharedSparkSessi } test("leave repeat loop") { - val iter = new CompoundBodyExec( + val iter = TestCompoundBody( statements = Seq( TestRepeat( condition = TestLoopCondition(condVal = false, reps = 2, description = "con1"), - body = new CompoundBodyExec(Seq( + body = 
TestCompoundBody(Seq( TestLeafStatement("body1"), new LeaveStatementExec("lbl")) ), @@ -442,11 +499,11 @@ class SqlScriptingExecutionNodeSuite extends SparkFunSuite with SharedSparkSessi } test("iterate while loop") { - val iter = new CompoundBodyExec( + val iter = TestCompoundBody( statements = Seq( TestWhile( condition = TestLoopCondition(condVal = true, reps = 2, description = "con1"), - body = new CompoundBodyExec(Seq( + body = TestCompoundBody(Seq( TestLeafStatement("body1"), new IterateStatementExec("lbl"), TestLeafStatement("body2")) @@ -460,11 +517,11 @@ class SqlScriptingExecutionNodeSuite extends SparkFunSuite with SharedSparkSessi } test("iterate repeat loop") { - val iter = new CompoundBodyExec( + val iter = TestCompoundBody( statements = Seq( TestRepeat( condition = TestLoopCondition(condVal = false, reps = 2, description = "con1"), - body = new CompoundBodyExec(Seq( + body = TestCompoundBody(Seq( TestLeafStatement("body1"), new IterateStatementExec("lbl"), TestLeafStatement("body2")) @@ -479,14 +536,14 @@ class SqlScriptingExecutionNodeSuite extends SparkFunSuite with SharedSparkSessi } test("leave outer loop from nested while loop") { - val iter = new CompoundBodyExec( + val iter = TestCompoundBody( statements = Seq( TestWhile( condition = TestLoopCondition(condVal = true, reps = 2, description = "con1"), - body = new CompoundBodyExec(Seq( + body = TestCompoundBody(Seq( TestWhile( condition = TestLoopCondition(condVal = true, reps = 2, description = "con2"), - body = new CompoundBodyExec(Seq( + body = TestCompoundBody(Seq( TestLeafStatement("body1"), new LeaveStatementExec("lbl")) ), @@ -502,14 +559,14 @@ class SqlScriptingExecutionNodeSuite extends SparkFunSuite with SharedSparkSessi } test("leave outer loop from nested repeat loop") { - val iter = new CompoundBodyExec( + val iter = TestCompoundBody( statements = Seq( TestRepeat( condition = TestLoopCondition(condVal = false, reps = 2, description = "con1"), - body = new CompoundBodyExec(Seq( + body 
= TestCompoundBody(Seq( TestRepeat( condition = TestLoopCondition(condVal = false, reps = 2, description = "con2"), - body = new CompoundBodyExec(Seq( + body = TestCompoundBody(Seq( TestLeafStatement("body1"), new LeaveStatementExec("lbl")) ), @@ -525,14 +582,14 @@ class SqlScriptingExecutionNodeSuite extends SparkFunSuite with SharedSparkSessi } test("iterate outer loop from nested while loop") { - val iter = new CompoundBodyExec( + val iter = TestCompoundBody( statements = Seq( TestWhile( condition = TestLoopCondition(condVal = true, reps = 2, description = "con1"), - body = new CompoundBodyExec(Seq( + body = TestCompoundBody(Seq( TestWhile( condition = TestLoopCondition(condVal = true, reps = 2, description = "con2"), - body = new CompoundBodyExec(Seq( + body = TestCompoundBody(Seq( TestLeafStatement("body1"), new IterateStatementExec("lbl"), TestLeafStatement("body2")) @@ -552,14 +609,14 @@ class SqlScriptingExecutionNodeSuite extends SparkFunSuite with SharedSparkSessi } test("iterate outer loop from nested repeat loop") { - val iter = new CompoundBodyExec( + val iter = TestCompoundBody( statements = Seq( TestRepeat( condition = TestLoopCondition(condVal = false, reps = 2, description = "con1"), - body = new CompoundBodyExec(Seq( + body = TestCompoundBody(Seq( TestRepeat( condition = TestLoopCondition(condVal = false, reps = 2, description = "con2"), - body = new CompoundBodyExec(Seq( + body = TestCompoundBody(Seq( TestLeafStatement("body1"), new IterateStatementExec("lbl"), TestLeafStatement("body2")) @@ -579,17 +636,17 @@ class SqlScriptingExecutionNodeSuite extends SparkFunSuite with SharedSparkSessi } test("searched case - enter first WHEN clause") { - val iter = new CompoundBodyExec(Seq( + val iter = TestCompoundBody(Seq( new CaseStatementExec( conditions = Seq( TestIfElseCondition(condVal = true, description = "con1"), TestIfElseCondition(condVal = false, description = "con2") ), conditionalBodies = Seq( - new 
CompoundBodyExec(Seq(TestLeafStatement("body1"))), - new CompoundBodyExec(Seq(TestLeafStatement("body2"))) + TestCompoundBody(Seq(TestLeafStatement("body1"))), + TestCompoundBody(Seq(TestLeafStatement("body2"))) ), - elseBody = Some(new CompoundBodyExec(Seq(TestLeafStatement("body3")))), + elseBody = Some(TestCompoundBody(Seq(TestLeafStatement("body3")))), session = spark ) )).getTreeIterator @@ -598,15 +655,15 @@ class SqlScriptingExecutionNodeSuite extends SparkFunSuite with SharedSparkSessi } test("searched case - enter body of the ELSE clause") { - val iter = new CompoundBodyExec(Seq( + val iter = TestCompoundBody(Seq( new CaseStatementExec( conditions = Seq( TestIfElseCondition(condVal = false, description = "con1") ), conditionalBodies = Seq( - new CompoundBodyExec(Seq(TestLeafStatement("body1"))) + TestCompoundBody(Seq(TestLeafStatement("body1"))) ), - elseBody = Some(new CompoundBodyExec(Seq(TestLeafStatement("body2")))), + elseBody = Some(TestCompoundBody(Seq(TestLeafStatement("body2")))), session = spark ) )).getTreeIterator @@ -615,17 +672,17 @@ class SqlScriptingExecutionNodeSuite extends SparkFunSuite with SharedSparkSessi } test("searched case - enter second WHEN clause") { - val iter = new CompoundBodyExec(Seq( + val iter = TestCompoundBody(Seq( new CaseStatementExec( conditions = Seq( TestIfElseCondition(condVal = false, description = "con1"), TestIfElseCondition(condVal = true, description = "con2") ), conditionalBodies = Seq( - new CompoundBodyExec(Seq(TestLeafStatement("body1"))), - new CompoundBodyExec(Seq(TestLeafStatement("body2"))) + TestCompoundBody(Seq(TestLeafStatement("body1"))), + TestCompoundBody(Seq(TestLeafStatement("body2"))) ), - elseBody = Some(new CompoundBodyExec(Seq(TestLeafStatement("body3")))), + elseBody = Some(TestCompoundBody(Seq(TestLeafStatement("body3")))), session = spark ) )).getTreeIterator @@ -634,15 +691,15 @@ class SqlScriptingExecutionNodeSuite extends SparkFunSuite with SharedSparkSessi } test("searched case - 
without else (successful check)") { - val iter = new CompoundBodyExec(Seq( + val iter = TestCompoundBody(Seq( new CaseStatementExec( conditions = Seq( TestIfElseCondition(condVal = false, description = "con1"), TestIfElseCondition(condVal = true, description = "con2") ), conditionalBodies = Seq( - new CompoundBodyExec(Seq(TestLeafStatement("body1"))), - new CompoundBodyExec(Seq(TestLeafStatement("body2"))) + TestCompoundBody(Seq(TestLeafStatement("body1"))), + TestCompoundBody(Seq(TestLeafStatement("body2"))) ), elseBody = None, session = spark @@ -653,15 +710,15 @@ class SqlScriptingExecutionNodeSuite extends SparkFunSuite with SharedSparkSessi } test("searched case - without else (unsuccessful checks)") { - val iter = new CompoundBodyExec(Seq( + val iter = TestCompoundBody(Seq( new CaseStatementExec( conditions = Seq( TestIfElseCondition(condVal = false, description = "con1"), TestIfElseCondition(condVal = false, description = "con2") ), conditionalBodies = Seq( - new CompoundBodyExec(Seq(TestLeafStatement("body1"))), - new CompoundBodyExec(Seq(TestLeafStatement("body2"))) + TestCompoundBody(Seq(TestLeafStatement("body1"))), + TestCompoundBody(Seq(TestLeafStatement("body2"))) ), elseBody = None, session = spark @@ -672,10 +729,10 @@ class SqlScriptingExecutionNodeSuite extends SparkFunSuite with SharedSparkSessi } test("loop statement with leave") { - val iter = new CompoundBodyExec( + val iter = TestCompoundBody( statements = Seq( new LoopStatementExec( - body = new CompoundBodyExec(Seq( + body = TestCompoundBody(Seq( TestLeafStatement("body1"), new LeaveStatementExec("lbl")) ), @@ -686,4 +743,363 @@ class SqlScriptingExecutionNodeSuite extends SparkFunSuite with SharedSparkSessi val statements = iter.map(extractStatementValue).toSeq assert(statements === Seq("body1", "lbl")) } + + test("for statement - enters body once") { + val iter = TestCompoundBody(Seq( + TestForStatement( + query = MockQuery(1, "intCol", "query1"), + variableName = Some("x"), + label = 
Some("for1"), + session = spark, + body = TestCompoundBody(Seq(TestLeafStatement("body"))) + ) + )).getTreeIterator + val statements = iter.map(extractStatementValue).toSeq + assert(statements === Seq( + "body", + "DropVariable", // drop for query var intCol + "DropVariable" // drop for loop var x + )) + } + + test("for statement - enters body with multiple statements multiple times") { + val iter = TestCompoundBody(Seq( + TestForStatement( + query = MockQuery(2, "intCol", "query1"), + variableName = Some("x"), + label = Some("for1"), + session = spark, + body = TestCompoundBody( + Seq(TestLeafStatement("statement1"), TestLeafStatement("statement2")) + ) + ) + )).getTreeIterator + val statements = iter.map(extractStatementValue).toSeq + assert(statements === Seq( + "statement1", + "statement2", + "statement1", + "statement2", + "DropVariable", // drop for query var intCol + "DropVariable" // drop for loop var x + )) + } + + test("for statement - empty result") { + val iter = TestCompoundBody(Seq( + TestForStatement( + query = MockQuery(0, "intCol", "query1"), + variableName = Some("x"), + label = Some("for1"), + session = spark, + body = TestCompoundBody(Seq(TestLeafStatement("body1"))) + ) + )).getTreeIterator + val statements = iter.map(extractStatementValue).toSeq + assert(statements === Seq.empty[String]) + } + + test("for statement - nested") { + val iter = TestCompoundBody(Seq( + TestForStatement( + query = MockQuery(2, "intCol", "query1"), + variableName = Some("x"), + label = Some("for1"), + session = spark, + body = TestCompoundBody(Seq( + TestForStatement( + query = MockQuery(2, "intCol1", "query2"), + variableName = Some("y"), + label = Some("for2"), + session = spark, + body = TestCompoundBody(Seq(TestLeafStatement("body"))) + ) + )) + )), + label = Some("lbl") + ).getTreeIterator + val statements = iter.map(extractStatementValue).toSeq + assert(statements === Seq( + "body", + "body", + "DropVariable", // drop for query var intCol1 + "DropVariable", // 
drop for loop var y + "body", + "body", + "DropVariable", // drop for query var intCol1 + "DropVariable", // drop for loop var y + "DropVariable", // drop for query var intCol + "DropVariable" // drop for loop var x + )) + } + + test("for statement no variable - enters body once") { + val iter = TestCompoundBody(Seq( + TestForStatement( + query = MockQuery(1, "intCol", "query1"), + variableName = None, + label = Some("for1"), + session = spark, + body = TestCompoundBody(Seq(TestLeafStatement("body"))) + ) + )).getTreeIterator + val statements = iter.map(extractStatementValue).toSeq + assert(statements === Seq( + "body", + "DropVariable" // drop for query var intCol + )) + } + + test("for statement no variable - enters body with multiple statements multiple times") { + val iter = TestCompoundBody(Seq( + TestForStatement( + query = MockQuery(2, "intCol", "query1"), + variableName = None, + label = Some("for1"), + session = spark, + body = TestCompoundBody(Seq( + TestLeafStatement("statement1"), + TestLeafStatement("statement2"))) + ) + )).getTreeIterator + val statements = iter.map(extractStatementValue).toSeq + assert(statements === Seq( + "statement1", "statement2", "statement1", "statement2", + "DropVariable" // drop for query var intCol + )) + } + + test("for statement no variable - empty result") { + val iter = TestCompoundBody(Seq( + TestForStatement( + query = MockQuery(0, "intCol", "query1"), + variableName = None, + label = Some("for1"), + session = spark, + body = TestCompoundBody(Seq(TestLeafStatement("body1"))) + ) + )).getTreeIterator + val statements = iter.map(extractStatementValue).toSeq + assert(statements === Seq.empty[String]) + } + + test("for statement no variable - nested") { + val iter = TestCompoundBody(Seq( + TestForStatement( + query = MockQuery(2, "intCol", "query1"), + variableName = None, + label = Some("for1"), + session = spark, + body = TestCompoundBody(Seq( + TestForStatement( + query = MockQuery(2, "intCol1", "query2"), + 
variableName = None, + label = Some("for2"), + session = spark, + body = TestCompoundBody(Seq(TestLeafStatement("body"))) + ) + )) + ) + )).getTreeIterator + val statements = iter.map(extractStatementValue).toSeq + assert(statements === Seq( + "body", "body", + "DropVariable", // drop for query var intCol1 + "body", "body", + "DropVariable", // drop for query var intCol1 + "DropVariable" // drop for query var intCol + )) + } + + test("for statement - iterate") { + val iter = TestCompoundBody(Seq( + TestForStatement( + query = MockQuery(2, "intCol", "query1"), + variableName = Some("x"), + label = Some("lbl1"), + session = spark, + body = TestCompoundBody(Seq( + TestLeafStatement("statement1"), + new IterateStatementExec("lbl1"), + TestLeafStatement("statement2"))) + ) + )).getTreeIterator + val statements = iter.map(extractStatementValue).toSeq + assert(statements === Seq( + "statement1", + "lbl1", + "statement1", + "lbl1", + "DropVariable", // drop for query var intCol + "DropVariable" // drop for loop var x + )) + } + + test("for statement - leave") { + val iter = TestCompoundBody(Seq( + TestForStatement( + query = MockQuery(2, "intCol", "query1"), + variableName = Some("x"), + label = Some("lbl1"), + session = spark, + body = TestCompoundBody(Seq( + TestLeafStatement("statement1"), + new LeaveStatementExec("lbl1"), + TestLeafStatement("statement2"))) + ) + )).getTreeIterator + val statements = iter.map(extractStatementValue).toSeq + assert(statements === Seq("statement1", "lbl1")) + } + + test("for statement - nested - iterate outer loop") { + val iter = TestCompoundBody(Seq( + TestForStatement( + query = MockQuery(2, "intCol", "query1"), + variableName = Some("x"), + label = Some("lbl1"), + session = spark, + body = TestCompoundBody(Seq( + TestLeafStatement("outer_body"), + TestForStatement( + query = MockQuery(2, "intCol1", "query2"), + variableName = Some("y"), + label = Some("lbl2"), + session = spark, + body = TestCompoundBody(Seq( + 
TestLeafStatement("body1"), + new IterateStatementExec("lbl1"), + TestLeafStatement("body2"))) + ) + )) + ) + )).getTreeIterator + val statements = iter.map(extractStatementValue).toSeq + assert(statements === Seq( + "outer_body", + "body1", + "lbl1", + "outer_body", + "body1", + "lbl1", + "DropVariable", // drop for query var intCol + "DropVariable" // drop for loop var x + )) + } + + test("for statement - nested - leave outer loop") { + val iter = TestCompoundBody(Seq( + TestForStatement( + query = MockQuery(2, "intCol", "query1"), + variableName = Some("x"), + label = Some("lbl1"), + session = spark, + body = TestCompoundBody(Seq( + TestForStatement( + query = MockQuery(2, "intCol", "query2"), + variableName = Some("y"), + label = Some("lbl2"), + session = spark, + body = TestCompoundBody(Seq( + TestLeafStatement("body1"), + new LeaveStatementExec("lbl1"), + TestLeafStatement("body2"))) + ) + )) + ) + )).getTreeIterator + val statements = iter.map(extractStatementValue).toSeq + assert(statements === Seq("body1", "lbl1")) + } + + test("for statement no variable - iterate") { + val iter = TestCompoundBody(Seq( + TestForStatement( + query = MockQuery(2, "intCol", "query1"), + variableName = None, + label = Some("lbl1"), + session = spark, + body = TestCompoundBody(Seq( + TestLeafStatement("statement1"), + new IterateStatementExec("lbl1"), + TestLeafStatement("statement2"))) + ) + )).getTreeIterator + val statements = iter.map(extractStatementValue).toSeq + assert(statements === Seq( + "statement1", "lbl1", "statement1", "lbl1", + "DropVariable" // drop for query var intCol + )) + } + + test("for statement no variable - leave") { + val iter = TestCompoundBody(Seq( + TestForStatement( + query = MockQuery(2, "intCol", "query1"), + variableName = None, + label = Some("lbl1"), + session = spark, + body = TestCompoundBody(Seq( + TestLeafStatement("statement1"), + new LeaveStatementExec("lbl1"), + TestLeafStatement("statement2"))) + ) + )).getTreeIterator + val statements 
= iter.map(extractStatementValue).toSeq + assert(statements === Seq("statement1", "lbl1")) + } + + test("for statement no variable - nested - iterate outer loop") { + val iter = TestCompoundBody(Seq( + TestForStatement( + query = MockQuery(2, "intCol", "query1"), + variableName = None, + label = Some("lbl1"), + session = spark, + body = TestCompoundBody(Seq( + TestLeafStatement("outer_body"), + TestForStatement( + query = MockQuery(2, "intCol1", "query2"), + variableName = None, + label = Some("lbl2"), + session = spark, + body = TestCompoundBody(Seq( + TestLeafStatement("body1"), + new IterateStatementExec("lbl1"), + TestLeafStatement("body2"))) + ) + )) + ) + )).getTreeIterator + val statements = iter.map(extractStatementValue).toSeq + assert(statements === Seq( + "outer_body", "body1", "lbl1", "outer_body", "body1", "lbl1", + "DropVariable" // drop for query var intCol + )) + } + + test("for statement no variable - nested - leave outer loop") { + val iter = TestCompoundBody(Seq( + TestForStatement( + query = MockQuery(2, "intCol", "query1"), + variableName = None, + label = Some("lbl1"), + session = spark, + body = TestCompoundBody(Seq( + TestForStatement( + query = MockQuery(2, "intCol1", "query2"), + variableName = None, + label = Some("lbl2"), + session = spark, + body = TestCompoundBody(Seq( + TestLeafStatement("body1"), + new LeaveStatementExec("lbl1"), + TestLeafStatement("body2"))) + ) + )) + ) + )).getTreeIterator + val statements = iter.map(extractStatementValue).toSeq + assert(statements === Seq("body1", "lbl1")) + } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/scripting/SqlScriptingExecutionSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/scripting/SqlScriptingExecutionSuite.scala new file mode 100644 index 0000000000000..5b5285ea13275 --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/scripting/SqlScriptingExecutionSuite.scala @@ -0,0 +1,1069 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one 
or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.scripting + +import scala.collection.mutable.ListBuffer + +import org.apache.spark.SparkConf +import org.apache.spark.sql.{QueryTest, Row} +import org.apache.spark.sql.catalyst.expressions.Expression +import org.apache.spark.sql.catalyst.plans.logical.CompoundBody +import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.test.SharedSparkSession + +/** + * SQL Scripting interpreter tests. + * Output from the parser is provided to the interpreter. + * Output from the interpreter (iterator over executable statements) is then checked - statements + * are executed and output DataFrames are compared with expected outputs. 
+ */ +class SqlScriptingExecutionSuite extends QueryTest with SharedSparkSession { + + // Tests setup + override protected def sparkConf: SparkConf = { + super.sparkConf.set(SQLConf.SQL_SCRIPTING_ENABLED.key, "true") + } + + // Helpers + private def runSqlScript( + sqlText: String, + args: Map[String, Expression] = Map.empty): Seq[Array[Row]] = { + val compoundBody = spark.sessionState.sqlParser.parsePlan(sqlText).asInstanceOf[CompoundBody] + val sse = new SqlScriptingExecution(compoundBody, spark, args) + val result: ListBuffer[Array[Row]] = ListBuffer.empty + + var df = sse.getNextResult + while (df.isDefined) { + // Collect results from the current DataFrame. + result.append(df.get.collect()) + df = sse.getNextResult + } + result.toSeq + } + + private def verifySqlScriptResult(sqlText: String, expected: Seq[Seq[Row]]): Unit = { + val result = runSqlScript(sqlText) + assert(result.length == expected.length) + result.zip(expected).foreach { + case (actualAnswer, expectedAnswer) => + assert(actualAnswer.sameElements(expectedAnswer)) + } + } + + // Tests + test("multi statement - simple") { + withTable("t") { + val sqlScript = + """ + |BEGIN + |CREATE TABLE t (a INT, b STRING, c DOUBLE) USING parquet; + |INSERT INTO t VALUES (1, 'a', 1.0); + |SELECT a, b FROM t WHERE a = 12; + |SELECT a FROM t; + |END + |""".stripMargin + val expected = Seq( + Seq.empty[Row], // select + Seq(Row(1)) // select + ) + verifySqlScriptResult(sqlScript, expected) + } + } + + test("multi statement - count") { + withTable("t") { + val sqlScript = + """ + |BEGIN + |CREATE TABLE t (a INT, b STRING, c DOUBLE) USING parquet; + |INSERT INTO t VALUES (1, 'a', 1.0); + |INSERT INTO t VALUES (1, 'a', 1.0); + |SELECT + | CASE WHEN COUNT(*) > 10 THEN true + | ELSE false + | END AS MoreThanTen + |FROM t; + |END + |""".stripMargin + val expected = Seq(Seq(Row(false))) + verifySqlScriptResult(sqlScript, expected) + } + } + + test("session vars - set and read (SET VAR)") { + val sqlScript = + """ + |BEGIN 
+ |DECLARE var = 1; + |SET VAR var = var + 1; + |SELECT var; + |END + |""".stripMargin + val expected = Seq(Seq(Row(2))) + verifySqlScriptResult(sqlScript, expected) + } + + test("session vars - set and read (SET)") { + val sqlScript = + """ + |BEGIN + |DECLARE var = 1; + |SET var = var + 1; + |SELECT var; + |END + |""".stripMargin + val expected = Seq(Seq(Row(2))) + verifySqlScriptResult(sqlScript, expected) + } + + test("session vars - set and read scoped") { + val sqlScript = + """ + |BEGIN + | BEGIN + | DECLARE var = 1; + | SELECT var; + | END; + | BEGIN + | DECLARE var = 2; + | SELECT var; + | END; + | BEGIN + | DECLARE var = 3; + | SET VAR var = var + 1; + | SELECT var; + | END; + |END + |""".stripMargin + val expected = Seq( + Seq(Row(1)), // select + Seq(Row(2)), // select + Seq(Row(4)) // select + ) + verifySqlScriptResult(sqlScript, expected) + } + + test("session vars - drop var statement") { + val sqlScript = + """ + |BEGIN + |DECLARE var = 1; + |SET VAR var = var + 1; + |SELECT var; + |DROP TEMPORARY VARIABLE var; + |END + |""".stripMargin + val expected = Seq(Seq(Row(2))) + verifySqlScriptResult(sqlScript, expected) + } + + test("if") { + val commands = + """ + |BEGIN + | IF 1=1 THEN + | SELECT 42; + | END IF; + |END + |""".stripMargin + val expected = Seq(Seq(Row(42))) + verifySqlScriptResult(commands, expected) + } + + test("if nested") { + val commands = + """ + |BEGIN + | IF 1=1 THEN + | IF 2=1 THEN + | SELECT 41; + | ELSE + | SELECT 42; + | END IF; + | END IF; + |END + |""".stripMargin + val expected = Seq(Seq(Row(42))) + verifySqlScriptResult(commands, expected) + } + + test("if else going in if") { + val commands = + """ + |BEGIN + | IF 1=1 + | THEN + | SELECT 42; + | ELSE + | SELECT 43; + | END IF; + |END + |""".stripMargin + val expected = Seq(Seq(Row(42))) + verifySqlScriptResult(commands, expected) + } + + test("if else if going in else if") { + val commands = + """ + |BEGIN + | IF 1=2 + | THEN + | SELECT 42; + | ELSE IF 1=1 + | THEN + | 
SELECT 43; + | ELSE + | SELECT 44; + | END IF; + |END + |""".stripMargin + val expected = Seq(Seq(Row(43))) + verifySqlScriptResult(commands, expected) + } + + test("if else going in else") { + val commands = + """ + |BEGIN + | IF 1=2 + | THEN + | SELECT 42; + | ELSE + | SELECT 43; + | END IF; + |END + |""".stripMargin + val expected = Seq(Seq(Row(43))) + verifySqlScriptResult(commands, expected) + } + + test("if else if going in else") { + val commands = + """ + |BEGIN + | IF 1=2 + | THEN + | SELECT 42; + | ELSE IF 1=3 + | THEN + | SELECT 43; + | ELSE + | SELECT 44; + | END IF; + |END + |""".stripMargin + val expected = Seq(Seq(Row(44))) + verifySqlScriptResult(commands, expected) + } + + test("if with count") { + withTable("t") { + val commands = + """ + |BEGIN + |CREATE TABLE t (a INT, b STRING, c DOUBLE) USING parquet; + |INSERT INTO t VALUES (1, 'a', 1.0); + |INSERT INTO t VALUES (1, 'a', 1.0); + |IF (SELECT COUNT(*) > 2 FROM t) THEN + | SELECT 42; + | ELSE + | SELECT 43; + | END IF; + |END + |""".stripMargin + val expected = Seq(Seq(Row(43))) + verifySqlScriptResult(commands, expected) + } + } + + test("if else if with count") { + withTable("t") { + val commands = + """ + |BEGIN + | CREATE TABLE t (a INT, b STRING, c DOUBLE) USING parquet; + | INSERT INTO t VALUES (1, 'a', 1.0); + | INSERT INTO t VALUES (1, 'a', 1.0); + | IF (SELECT COUNT(*) > 2 FROM t) THEN + | SELECT 42; + | ELSE IF (SELECT COUNT(*) > 1 FROM t) THEN + | SELECT 43; + | ELSE + | SELECT 44; + | END IF; + |END + |""".stripMargin + val expected = Seq(Seq(Row(43))) + verifySqlScriptResult(commands, expected) + } + } + + test("searched case") { + val commands = + """ + |BEGIN + | CASE + | WHEN 1 = 1 THEN + | SELECT 42; + | END CASE; + |END + |""".stripMargin + val expected = Seq(Seq(Row(42))) + verifySqlScriptResult(commands, expected) + } + + test("searched case nested") { + val commands = + """ + |BEGIN + | CASE + | WHEN 1=1 THEN + | CASE + | WHEN 2=1 THEN + | SELECT 41; + | ELSE + | SELECT 42; 
+ | END CASE; + | END CASE; + |END + |""".stripMargin + val expected = Seq(Seq(Row(42))) + verifySqlScriptResult(commands, expected) + } + + test("searched case second case") { + val commands = + """ + |BEGIN + | CASE + | WHEN 1 = (SELECT 2) THEN + | SELECT 1; + | WHEN 2 = 2 THEN + | SELECT 42; + | WHEN (SELECT * FROM t) THEN + | SELECT * FROM b; + | END CASE; + |END + |""".stripMargin + val expected = Seq(Seq(Row(42))) + verifySqlScriptResult(commands, expected) + } + + test("searched case going in else") { + val commands = + """ + |BEGIN + | CASE + | WHEN 2 = 1 THEN + | SELECT 1; + | WHEN 3 IN (1,2) THEN + | SELECT 2; + | ELSE + | SELECT 43; + | END CASE; + |END + |""".stripMargin + val expected = Seq(Seq(Row(43))) + verifySqlScriptResult(commands, expected) + } + + test("searched case with count") { + withTable("t") { + val commands = + """ + |BEGIN + |CREATE TABLE t (a INT, b STRING, c DOUBLE) USING parquet; + |INSERT INTO t VALUES (1, 'a', 1.0); + |INSERT INTO t VALUES (1, 'a', 1.0); + |CASE + | WHEN (SELECT COUNT(*) > 2 FROM t) THEN + | SELECT 42; + | ELSE + | SELECT 43; + | END CASE; + |END + |""".stripMargin + val expected = Seq(Seq(Row(43))) + verifySqlScriptResult(commands, expected) + } + } + + test("searched case else with count") { + withTable("t") { + val commands = + """ + |BEGIN + | CREATE TABLE t (a INT, b STRING, c DOUBLE) USING parquet; + | INSERT INTO t VALUES (1, 'a', 1.0); + | INSERT INTO t VALUES (1, 'a', 1.0); + | CASE + | WHEN (SELECT COUNT(*) > 2 FROM t) THEN + | SELECT 42; + | WHEN (SELECT COUNT(*) > 1 FROM t) THEN + | SELECT 43; + | ELSE + | SELECT 44; + | END CASE; + |END + |""".stripMargin + val expected = Seq(Seq(Row(43))) + verifySqlScriptResult(commands, expected) + } + } + + test("searched case no cases matched no else") { + val commands = + """ + |BEGIN + | CASE + | WHEN 1 = 2 THEN + | SELECT 42; + | WHEN 1 = 3 THEN + | SELECT 43; + | END CASE; + |END + |""".stripMargin + val expected = Seq.empty + verifySqlScriptResult(commands, 
expected) + } + + test("simple case") { + val commands = + """ + |BEGIN + | CASE 1 + | WHEN 1 THEN + | SELECT 42; + | END CASE; + |END + |""".stripMargin + val expected = Seq(Seq(Row(42))) + verifySqlScriptResult(commands, expected) + } + + test("simple case nested") { + val commands = + """ + |BEGIN + | CASE 1 + | WHEN 1 THEN + | CASE 2 + | WHEN (SELECT 3) THEN + | SELECT 41; + | ELSE + | SELECT 42; + | END CASE; + | END CASE; + |END + |""".stripMargin + val expected = Seq(Seq(Row(42))) + verifySqlScriptResult(commands, expected) + } + + test("simple case second case") { + val commands = + """ + |BEGIN + | CASE (SELECT 2) + | WHEN 1 THEN + | SELECT 1; + | WHEN 2 THEN + | SELECT 42; + | WHEN (SELECT * FROM t) THEN + | SELECT * FROM b; + | END CASE; + |END + |""".stripMargin + val expected = Seq(Seq(Row(42))) + verifySqlScriptResult(commands, expected) + } + + test("simple case going in else") { + val commands = + """ + |BEGIN + | CASE 1 + | WHEN 2 THEN + | SELECT 1; + | WHEN 3 THEN + | SELECT 2; + | ELSE + | SELECT 43; + | END CASE; + |END + |""".stripMargin + val expected = Seq(Seq(Row(43))) + verifySqlScriptResult(commands, expected) + } + + test("simple case with count") { + withTable("t") { + val commands = + """ + |BEGIN + |CREATE TABLE t (a INT, b STRING, c DOUBLE) USING parquet; + |INSERT INTO t VALUES (1, 'a', 1.0); + |INSERT INTO t VALUES (1, 'a', 1.0); + |CASE (SELECT COUNT(*) FROM t) + | WHEN 1 THEN + | SELECT 41; + | WHEN 2 THEN + | SELECT 42; + | ELSE + | SELECT 43; + | END CASE; + |END + |""".stripMargin + val expected = Seq(Seq(Row(42))) + verifySqlScriptResult(commands, expected) + } + } + + test("simple case else with count") { + withTable("t") { + val commands = + """ + |BEGIN + | CREATE TABLE t (a INT, b STRING, c DOUBLE) USING parquet; + | INSERT INTO t VALUES (1, 'a', 1.0); + | INSERT INTO t VALUES (2, 'b', 2.0); + | CASE (SELECT COUNT(*) FROM t) + | WHEN 1 THEN + | SELECT 42; + | WHEN 3 THEN + | SELECT 43; + | ELSE + | SELECT 44; + | END CASE; 
+ |END + |""".stripMargin + val expected = Seq(Seq(Row(44))) + verifySqlScriptResult(commands, expected) + } + } + + test("simple case no cases matched no else") { + val commands = + """ + |BEGIN + | CASE 1 + | WHEN 2 THEN + | SELECT 42; + | WHEN 3 THEN + | SELECT 43; + | END CASE; + |END + |""".stripMargin + val expected = Seq.empty + verifySqlScriptResult(commands, expected) + } + + test("simple case compare with null") { + withTable("t") { + val commands = + """ + |BEGIN + | CREATE TABLE t (a INT) USING parquet; + | CASE (SELECT COUNT(*) FROM t) + | WHEN 1 THEN + | SELECT 42; + | ELSE + | SELECT 43; + | END CASE; + |END + |""".stripMargin + val expected = Seq(Seq(Row(43))) + verifySqlScriptResult(commands, expected) + } + } + + test("while") { + val commands = + """ + |BEGIN + | DECLARE i = 0; + | WHILE i < 3 DO + | SELECT i; + | SET VAR i = i + 1; + | END WHILE; + |END + |""".stripMargin + val expected = Seq( + Seq(Row(0)), // select i + Seq(Row(1)), // select i + Seq(Row(2)) // select i + ) + verifySqlScriptResult(commands, expected) + } + + test("while: not entering body") { + val commands = + """ + |BEGIN + | DECLARE i = 3; + | WHILE i < 3 DO + | SELECT i; + | SET VAR i = i + 1; + | END WHILE; + |END + |""".stripMargin + val expected = Seq.empty + verifySqlScriptResult(commands, expected) + } + + test("nested while") { + val commands = + """ + |BEGIN + | DECLARE i = 0; + | DECLARE j = 0; + | WHILE i < 2 DO + | SET VAR j = 0; + | WHILE j < 2 DO + | SELECT i, j; + | SET VAR j = j + 1; + | END WHILE; + | SET VAR i = i + 1; + | END WHILE; + |END + |""".stripMargin + val expected = Seq( + Seq(Row(0, 0)), // select i, j + Seq(Row(0, 1)), // select i, j + Seq(Row(1, 0)), // select i, j + Seq(Row(1, 1)) // select i, j + ) + verifySqlScriptResult(commands, expected) + } + + test("while with count") { + withTable("t") { + val commands = + """ + |BEGIN + |CREATE TABLE t (a INT, b STRING, c DOUBLE) USING parquet; + |WHILE (SELECT COUNT(*) < 2 FROM t) DO + | SELECT 42; + 
| INSERT INTO t VALUES (1, 'a', 1.0); + |END WHILE; + |END + |""".stripMargin + val expected = Seq( + Seq(Row(42)), // select + Seq(Row(42)) // select + ) + verifySqlScriptResult(commands, expected) + } + } + + test("repeat") { + val commands = + """ + |BEGIN + | DECLARE i = 0; + | REPEAT + | SELECT i; + | SET VAR i = i + 1; + | UNTIL + | i = 3 + | END REPEAT; + |END + |""".stripMargin + val expected = Seq( + Seq(Row(0)), // select i + Seq(Row(1)), // select i + Seq(Row(2)) // select i + ) + verifySqlScriptResult(commands, expected) + } + + test("repeat: enters body only once") { + val commands = + """ + |BEGIN + | DECLARE i = 3; + | REPEAT + | SELECT i; + | SET VAR i = i + 1; + | UNTIL + | 1 = 1 + | END REPEAT; + |END + |""".stripMargin + + val expected = Seq(Seq(Row(3))) + verifySqlScriptResult(commands, expected) + } + + test("nested repeat") { + val commands = + """ + |BEGIN + | DECLARE i = 0; + | DECLARE j = 0; + | REPEAT + | SET VAR j = 0; + | REPEAT + | SELECT i, j; + | SET VAR j = j + 1; + | UNTIL j >= 2 + | END REPEAT; + | SET VAR i = i + 1; + | UNTIL i >= 2 + | END REPEAT; + |END + |""".stripMargin + + val expected = Seq( + Seq(Row(0, 0)), // select i, j + Seq(Row(0, 1)), // select i, j + Seq(Row(1, 0)), // select i, j + Seq(Row(1, 1)) // select i, j + ) + verifySqlScriptResult(commands, expected) + } + + test("repeat with count") { + withTable("t") { + val commands = + """ + |BEGIN + |CREATE TABLE t (a INT, b STRING, c DOUBLE) USING parquet; + |REPEAT + | SELECT 42; + | INSERT INTO t VALUES (1, 'a', 1.0); + |UNTIL (SELECT COUNT(*) >= 2 FROM t) + |END REPEAT; + |END + |""".stripMargin + + val expected = Seq( + Seq(Row(42)), // select + Seq(Row(42)) // select + ) + verifySqlScriptResult(commands, expected) + } + } + + test("leave compound block") { + val sqlScriptText = + """ + |BEGIN + | lbl: BEGIN + | SELECT 1; + | LEAVE lbl; + | SELECT 2; + | END; + |END""".stripMargin + val expected = Seq(Seq(Row(1))) + verifySqlScriptResult(sqlScriptText, expected) + 
} + + test("leave while loop") { + val sqlScriptText = + """ + |BEGIN + | lbl: WHILE 1 = 1 DO + | SELECT 1; + | LEAVE lbl; + | END WHILE; + |END""".stripMargin + val expected = Seq(Seq(Row(1))) + verifySqlScriptResult(sqlScriptText, expected) + } + + test("leave repeat loop") { + val sqlScriptText = + """ + |BEGIN + | lbl: REPEAT + | SELECT 1; + | LEAVE lbl; + | UNTIL 1 = 2 + | END REPEAT; + |END""".stripMargin + val expected = Seq(Seq(Row(1))) + verifySqlScriptResult(sqlScriptText, expected) + } + + test("iterate while loop") { + val sqlScriptText = + """ + |BEGIN + | DECLARE x INT; + | SET x = 0; + | lbl: WHILE x < 2 DO + | SET x = x + 1; + | ITERATE lbl; + | SET x = x + 2; + | END WHILE; + | SELECT x; + |END""".stripMargin + val expected = Seq(Seq(Row(2))) + verifySqlScriptResult(sqlScriptText, expected) + } + + test("iterate repeat loop") { + val sqlScriptText = + """ + |BEGIN + | DECLARE x INT; + | SET x = 0; + | lbl: REPEAT + | SET x = x + 1; + | ITERATE lbl; + | SET x = x + 2; + | UNTIL x > 1 + | END REPEAT; + | SELECT x; + |END""".stripMargin + val expected = Seq(Seq(Row(2))) + verifySqlScriptResult(sqlScriptText, expected) + } + + test("leave outer loop from nested repeat loop") { + val sqlScriptText = + """ + |BEGIN + | lbl: REPEAT + | lbl2: REPEAT + | SELECT 1; + | LEAVE lbl; + | UNTIL 1 = 2 + | END REPEAT; + | UNTIL 1 = 2 + | END REPEAT; + |END""".stripMargin + val expected = Seq(Seq(Row(1))) + verifySqlScriptResult(sqlScriptText, expected) + } + + test("leave outer loop from nested while loop") { + val sqlScriptText = + """ + |BEGIN + | lbl: WHILE 1 = 1 DO + | lbl2: WHILE 2 = 2 DO + | SELECT 1; + | LEAVE lbl; + | END WHILE; + | END WHILE; + |END""".stripMargin + val expected = Seq(Seq(Row(1))) + verifySqlScriptResult(sqlScriptText, expected) + } + + test("iterate outer loop from nested while loop") { + val sqlScriptText = + """ + |BEGIN + | DECLARE x INT; + | SET x = 0; + | lbl: WHILE x < 2 DO + | SET x = x + 1; + | lbl2: WHILE 2 = 2 DO + | SELECT 1; + 
| ITERATE lbl; + | END WHILE; + | END WHILE; + | SELECT x; + |END""".stripMargin + val expected = Seq( + Seq(Row(1)), // select 1 + Seq(Row(1)), // select 1 + Seq(Row(2)) // select x + ) + verifySqlScriptResult(sqlScriptText, expected) + } + + test("nested compounds in loop - leave in inner compound") { + val sqlScriptText = + """ + |BEGIN + | DECLARE x INT; + | SET x = 0; + | lbl: WHILE x < 2 DO + | SET x = x + 1; + | BEGIN + | SELECT 1; + | lbl2: BEGIN + | SELECT 2; + | LEAVE lbl2; + | SELECT 3; + | END; + | END; + | END WHILE; + | SELECT x; + |END""".stripMargin + val expected = Seq( + Seq(Row(1)), // select 1 + Seq(Row(2)), // select 2 + Seq(Row(1)), // select 1 + Seq(Row(2)), // select 2 + Seq(Row(2)) // select x + ) + verifySqlScriptResult(sqlScriptText, expected) + } + + test("iterate outer loop from nested repeat loop") { + val sqlScriptText = + """ + |BEGIN + | DECLARE x INT; + | SET x = 0; + | lbl: REPEAT + | SET x = x + 1; + | lbl2: REPEAT + | SELECT 1; + | ITERATE lbl; + | UNTIL 1 = 2 + | END REPEAT; + | UNTIL x > 1 + | END REPEAT; + | SELECT x; + |END""".stripMargin + val expected = Seq( + Seq(Row(1)), // select 1 + Seq(Row(1)), // select 1 + Seq(Row(2)) // select x + ) + verifySqlScriptResult(sqlScriptText, expected) + } + + test("loop statement with leave") { + val sqlScriptText = + """ + |BEGIN + | DECLARE x INT; + | SET x = 0; + | lbl: LOOP + | SET x = x + 1; + | SELECT x; + | IF x > 2 + | THEN + | LEAVE lbl; + | END IF; + | END LOOP; + | SELECT x; + |END""".stripMargin + val expected = Seq( + Seq(Row(1)), // select x + Seq(Row(2)), // select x + Seq(Row(3)), // select x + Seq(Row(3)) // select x + ) + verifySqlScriptResult(sqlScriptText, expected) + } + + test("nested loop statement with leave") { + val commands = + """ + |BEGIN + | DECLARE x = 0; + | DECLARE y = 0; + | lbl1: LOOP + | SET VAR y = 0; + | lbl2: LOOP + | SELECT x, y; + | SET VAR y = y + 1; + | IF y >= 2 THEN + | LEAVE lbl2; + | END IF; + | END LOOP; + | SET VAR x = x + 1; + | IF x >= 
2 THEN + | LEAVE lbl1; + | END IF; + | END LOOP; + |END + |""".stripMargin + + val expected = Seq( + Seq(Row(0, 0)), // select x, y + Seq(Row(0, 1)), // select x, y + Seq(Row(1, 0)), // select x, y + Seq(Row(1, 1)) // select x, y + ) + verifySqlScriptResult(commands, expected) + } + + test("iterate loop statement") { + val sqlScriptText = + """ + |BEGIN + | DECLARE x INT; + | SET x = 0; + | lbl: LOOP + | SET x = x + 1; + | IF x > 1 THEN + | LEAVE lbl; + | END IF; + | ITERATE lbl; + | SET x = x + 2; + | END LOOP; + | SELECT x; + |END""".stripMargin + val expected = Seq(Seq(Row(2))) + verifySqlScriptResult(sqlScriptText, expected) + } + + test("leave outer loop from nested loop statement") { + val sqlScriptText = + """ + |BEGIN + | lbl: LOOP + | lbl2: LOOP + | SELECT 1; + | LEAVE lbl; + | END LOOP; + | END LOOP; + |END""".stripMargin + // Execution immediately leaves the outer loop after SELECT, + // so we expect only a single row in the result set. + val expected = Seq(Seq(Row(1))) + verifySqlScriptResult(sqlScriptText, expected) + } + + test("iterate outer loop from nested loop statement") { + val sqlScriptText = + """ + |BEGIN + | DECLARE x INT; + | SET x = 0; + | lbl: LOOP + | SET x = x + 1; + | IF x > 2 THEN + | LEAVE lbl; + | END IF; + | lbl2: LOOP + | SELECT 1; + | ITERATE lbl; + | SET x = 10; + | END LOOP; + | END LOOP; + | SELECT x; + |END""".stripMargin + val expected = Seq( + Seq(Row(1)), // select 1 + Seq(Row(1)), // select 1 + Seq(Row(3)) // select x + ) + verifySqlScriptResult(sqlScriptText, expected) + } +} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/scripting/SqlScriptingInterpreterSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/scripting/SqlScriptingInterpreterSuite.scala index b0b844d2b52ca..c7439a8934d73 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/scripting/SqlScriptingInterpreterSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/scripting/SqlScriptingInterpreterSuite.scala @@ -20,6 +20,7 @@ 
package org.apache.spark.sql.scripting import org.apache.spark.{SparkConf, SparkException, SparkNumberFormatException} import org.apache.spark.sql.{AnalysisException, DataFrame, Dataset, QueryTest, Row} import org.apache.spark.sql.catalyst.QueryPlanningTracker +import org.apache.spark.sql.catalyst.expressions.Expression import org.apache.spark.sql.catalyst.plans.logical.CompoundBody import org.apache.spark.sql.exceptions.SqlScriptingException import org.apache.spark.sql.internal.SQLConf @@ -39,11 +40,19 @@ class SqlScriptingInterpreterSuite extends QueryTest with SharedSparkSession { } // Helpers - private def runSqlScript(sqlText: String): Array[DataFrame] = { - val interpreter = SqlScriptingInterpreter() + private def runSqlScript( + sqlText: String, + args: Map[String, Expression] = Map.empty): Array[DataFrame] = { + val interpreter = SqlScriptingInterpreter(spark) val compoundBody = spark.sessionState.sqlParser.parsePlan(sqlText).asInstanceOf[CompoundBody] - val executionPlan = interpreter.buildExecutionPlan(compoundBody, spark) - executionPlan.flatMap { + + // Initialize context so scopes can be entered correctly. 
+ val context = new SqlScriptingExecutionContext() + val executionPlan = interpreter.buildExecutionPlan(compoundBody, args, context) + context.frames.append(new SqlScriptingExecutionFrame(executionPlan.getTreeIterator)) + executionPlan.enterScope() + + executionPlan.getTreeIterator.flatMap { case statement: SingleStatementExec => if (statement.isExecuted) { None @@ -107,6 +116,61 @@ class SqlScriptingInterpreterSuite extends QueryTest with SharedSparkSession { } } + test("empty begin end block") { + val sqlScript = + """ + |BEGIN + |END + |""".stripMargin + val expected = Seq.empty[Seq[Row]] + verifySqlScriptResult(sqlScript, expected) + } + + test("empty begin end blocks") { + val sqlScript = + """ + |BEGIN + | BEGIN + | END; + | BEGIN + | END; + |END + |""".stripMargin + val expected = Seq.empty[Seq[Row]] + verifySqlScriptResult(sqlScript, expected) + } + + test("empty begin end blocks with single statement") { + val sqlScript = + """ + |BEGIN + | BEGIN + | END; + | SELECT 1; + | BEGIN + | END; + |END + |""".stripMargin + val expected = Seq(Seq(Row(1))) + verifySqlScriptResult(sqlScript, expected) + } + + test("empty begin end blocks - nested") { + val sqlScript = + """ + |BEGIN + | BEGIN + | BEGIN + | END; + | BEGIN + | END; + | END; + |END + |""".stripMargin + val expected = Seq.empty[Seq[Row]] + verifySqlScriptResult(sqlScript, expected) + } + test("session vars - set and read (SET VAR)") { val sqlScript = """ @@ -237,6 +301,40 @@ class SqlScriptingInterpreterSuite extends QueryTest with SharedSparkSession { verifySqlScriptResult(commands, expected) } + test("if - empty body") { + val commands = + """ + |BEGIN + | IF 1=1 THEN + | BEGIN + | END; + | END IF; + |END + |""".stripMargin + val expected = Seq.empty[Seq[Row]] + verifySqlScriptResult(commands, expected) + } + + test("if - nested empty body") { + val commands = + """ + |BEGIN + | IF 1=1 THEN + | BEGIN + | BEGIN + | END; + | END; + | BEGIN + | BEGIN + | END; + | END; + | END IF; + |END + |""".stripMargin 
+ val expected = Seq.empty[Seq[Row]] + verifySqlScriptResult(commands, expected) + } + test("if nested") { val commands = """ @@ -386,6 +484,42 @@ class SqlScriptingInterpreterSuite extends QueryTest with SharedSparkSession { verifySqlScriptResult(commands, expected) } + test("searched case - empty body") { + val commands = + """ + |BEGIN + | CASE + | WHEN 1 = 1 THEN + | BEGIN + | END; + | END CASE; + |END + |""".stripMargin + val expected = Seq.empty[Seq[Row]] + verifySqlScriptResult(commands, expected) + } + + test("searched case - nested empty body") { + val commands = + """ + |BEGIN + | CASE + | WHEN 1 = 1 THEN + | BEGIN + | BEGIN + | END; + | END; + | BEGIN + | BEGIN + | END; + | END; + | END CASE; + |END + |""".stripMargin + val expected = Seq.empty[Seq[Row]] + verifySqlScriptResult(commands, expected) + } + test("searched case nested") { val commands = """ @@ -586,6 +720,42 @@ class SqlScriptingInterpreterSuite extends QueryTest with SharedSparkSession { verifySqlScriptResult(commands, expected) } + test("simple case - empty body") { + val commands = + """ + |BEGIN + | CASE 1 + | WHEN 1 THEN + | BEGIN + | END; + | END CASE; + |END + |""".stripMargin + val expected = Seq.empty[Seq[Row]] + verifySqlScriptResult(commands, expected) + } + + test("simple case - nested empty body") { + val commands = + """ + |BEGIN + | CASE 1 + | WHEN 1 THEN + | BEGIN + | BEGIN + | END; + | END; + | BEGIN + | BEGIN + | END; + | END; + | END CASE; + |END + |""".stripMargin + val expected = Seq.empty[Seq[Row]] + verifySqlScriptResult(commands, expected) + } + test("simple case nested") { val commands = """ @@ -982,6 +1152,42 @@ class SqlScriptingInterpreterSuite extends QueryTest with SharedSparkSession { verifySqlScriptResult(commands, expected) } + test("repeat - empty body") { + val commands = + """ + |BEGIN + | REPEAT + | BEGIN + | END; + | UNTIL 1 = 1 + | END REPEAT; + |END + |""".stripMargin + + val expected = Seq.empty[Seq[Row]] + verifySqlScriptResult(commands, expected) + } 
+ + test("repeat - nested empty body") { + val commands = + """ + |BEGIN + | REPEAT + | BEGIN + | BEGIN + | END; + | END; + | BEGIN + | END; + | UNTIL 1 = 1 + | END REPEAT; + |END + |""".stripMargin + + val expected = Seq.empty[Seq[Row]] + verifySqlScriptResult(commands, expected) + } + test("nested repeat") { val commands = """ @@ -1547,4 +1753,1116 @@ class SqlScriptingInterpreterSuite extends QueryTest with SharedSparkSession { ) verifySqlScriptResult(sqlScriptText, expected) } + + test("for statement - enters body once") { + withTable("t") { + val sqlScript = + """ + |BEGIN + | CREATE TABLE t (intCol INT, stringCol STRING, doubleCol DOUBLE) using parquet; + | INSERT INTO t VALUES (1, 'first', 1.0); + | FOR row AS SELECT * FROM t DO + | SELECT row.intCol; + | END FOR; + |END + |""".stripMargin + + val expected = Seq( + Seq.empty[Row], // create table + Seq.empty[Row], // insert + Seq(Row(1)), // select row.intCol + Seq.empty[Row], // drop local var + Seq.empty[Row], // drop local var + Seq.empty[Row], // drop local var + Seq.empty[Row] // drop local var + ) + verifySqlScriptResult(sqlScript, expected) + } + } + + test("for statement - enters body with multiple statements multiple times") { + withTable("t") { + val sqlScript = + """ + |BEGIN + | CREATE TABLE t (intCol INT, stringCol STRING, doubleCol DOUBLE) using parquet; + | INSERT INTO t VALUES (1, 'first', 1.0); + | INSERT INTO t VALUES (2, 'second', 2.0); + | FOR row AS SELECT * FROM t ORDER BY intCol DO + | SELECT row.intCol; + | SELECT intCol; + | SELECT row.stringCol; + | SELECT stringCol; + | SELECT row.doubleCol; + | SELECT doubleCol; + | END FOR; + |END + |""".stripMargin + + val expected = Seq( + Seq.empty[Row], // create table + Seq.empty[Row], // insert + Seq.empty[Row], // insert + Seq(Row(1)), // select row.intCol + Seq(Row(1)), // select intCol + Seq(Row("first")), // select row.stringCol + Seq(Row("first")), // select stringCol + Seq(Row(1.0)), // select row.doubleCol + Seq(Row(1.0)), // select 
doubleCol + Seq(Row(2)), // select row.intCol + Seq(Row(2)), // select intCol + Seq(Row("second")), // select row.stringCol + Seq(Row("second")), // select stringCol + Seq(Row(2.0)), // select row.doubleCol + Seq(Row(2.0)), // select doubleCol + Seq.empty[Row], // drop local var + Seq.empty[Row], // drop local var + Seq.empty[Row], // drop local var + Seq.empty[Row] // drop local var + ) + verifySqlScriptResult(sqlScript, expected) + } + } + + test("for statement - sum of column from table") { + withTable("t") { + val sqlScript = + """ + |BEGIN + | DECLARE sumOfCols = 0; + | CREATE TABLE t (intCol INT) using parquet; + | INSERT INTO t VALUES (1), (2), (3), (4); + | FOR row AS SELECT * FROM t DO + | SET sumOfCols = sumOfCols + row.intCol; + | END FOR; + | SELECT sumOfCols; + |END + |""".stripMargin + + val expected = Seq( + Seq.empty[Row], // declare sumOfCols + Seq.empty[Row], // create table + Seq.empty[Row], // insert + Seq.empty[Row], // set sumOfCols + Seq.empty[Row], // set sumOfCols + Seq.empty[Row], // set sumOfCols + Seq.empty[Row], // set sumOfCols + Seq.empty[Row], // drop local var + Seq.empty[Row], // drop local var + Seq(Row(10)), // select sumOfCols + Seq.empty[Row] // drop sumOfCols + ) + verifySqlScriptResult(sqlScript, expected) + } + } + + test("for statement - map, struct, array") { + withTable("t") { + val sqlScript = + """ + |BEGIN + | CREATE TABLE t (int_column INT, map_column MAP, + | struct_column STRUCT, array_column ARRAY); + | INSERT INTO t VALUES + | (1, MAP('a', 1), STRUCT('John', 25), ARRAY('apricot', 'quince')), + | (2, MAP('b', 2), STRUCT('Jane', 30), ARRAY('plum', 'pear')); + | FOR row AS SELECT * FROM t ORDER BY int_column DO + | SELECT row.map_column; + | SELECT map_column; + | SELECT row.struct_column; + | SELECT struct_column; + | SELECT row.array_column; + | SELECT array_column; + | END FOR; + |END + |""".stripMargin + + val expected = Seq( + Seq.empty[Row], // create table + Seq.empty[Row], // insert + Seq(Row(Map("a" -> 1))), 
// select row.map_column + Seq(Row(Map("a" -> 1))), // select map_column + Seq(Row(Row("John", 25))), // select row.struct_column + Seq(Row(Row("John", 25))), // select struct_column + Seq(Row(Array("apricot", "quince"))), // select row.array_column + Seq(Row(Array("apricot", "quince"))), // select array_column + Seq(Row(Map("b" -> 2))), // select row.map_column + Seq(Row(Map("b" -> 2))), // select map_column + Seq(Row(Row("Jane", 30))), // select row.struct_column + Seq(Row(Row("Jane", 30))), // select struct_column + Seq(Row(Array("plum", "pear"))), // select row.array_column + Seq(Row(Array("plum", "pear"))), // select array_column + Seq.empty[Row], // drop local var + Seq.empty[Row], // drop local var + Seq.empty[Row], // drop local var + Seq.empty[Row], // drop local var + Seq.empty[Row] // drop local var + ) + verifySqlScriptResult(sqlScript, expected) + } + } + + test("for statement - nested struct") { + withTable("t") { + val sqlScript = + """ + |BEGIN + | CREATE TABLE t + | (int_column INT, + | struct_column STRUCT>>); + | INSERT INTO t VALUES + | (1, STRUCT(1, STRUCT(STRUCT("one")))), + | (2, STRUCT(2, STRUCT(STRUCT("two")))); + | FOR row AS SELECT * FROM t ORDER BY int_column DO + | SELECT row.struct_column; + | SELECT struct_column; + | END FOR; + |END + |""".stripMargin + + val expected = Seq( + Seq.empty[Row], // create table + Seq.empty[Row], // insert + Seq(Row(Row(1, Row(Row("one"))))), // select row.struct_column + Seq(Row(Row(1, Row(Row("one"))))), // select struct_column + Seq(Row(Row(2, Row(Row("two"))))), // select row.struct_column + Seq(Row(Row(2, Row(Row("two"))))), // select struct_column + Seq.empty[Row], // drop local var + Seq.empty[Row], // drop local var + Seq.empty[Row] // drop local var + ) + verifySqlScriptResult(sqlScript, expected) + } + } + + test("for statement - nested map") { + withTable("t") { + val sqlScript = + """ + |BEGIN + | CREATE TABLE t (int_column INT, map_column MAP>>); + | INSERT INTO t VALUES + | (1, MAP('a', 
MAP(1, MAP(false, 10)))), + | (2, MAP('b', MAP(2, MAP(true, 20)))); + | FOR row AS SELECT * FROM t ORDER BY int_column DO + | SELECT row.map_column; + | SELECT map_column; + | END FOR; + |END + |""".stripMargin + + val expected = Seq( + Seq.empty[Row], // create table + Seq.empty[Row], // insert + Seq(Row(Map("a" -> Map(1 -> Map(false -> 10))))), // select row.map_column + Seq(Row(Map("a" -> Map(1 -> Map(false -> 10))))), // select map_column + Seq(Row(Map("b" -> Map(2 -> Map(true -> 20))))), // select row.map_column + Seq(Row(Map("b" -> Map(2 -> Map(true -> 20))))), // select map_column + Seq.empty[Row], // drop local var + Seq.empty[Row], // drop local var + Seq.empty[Row] // drop local var + ) + verifySqlScriptResult(sqlScript, expected) + } + } + + test("for statement - nested array") { + withTable("t") { + val sqlScript = + """ + |BEGIN + | CREATE TABLE t + | (int_column INT, array_column ARRAY>>); + | INSERT INTO t VALUES + | (1, ARRAY(ARRAY(ARRAY(1, 2), ARRAY(3, 4)), ARRAY(ARRAY(5, 6)))), + | (2, ARRAY(ARRAY(ARRAY(7, 8), ARRAY(9, 10)), ARRAY(ARRAY(11, 12)))); + | FOR row AS SELECT * FROM t ORDER BY int_column DO + | SELECT row.array_column; + | SELECT array_column; + | END FOR; + |END + |""".stripMargin + + val expected = Seq( + Seq.empty[Row], // create table + Seq.empty[Row], // insert + Seq(Row(Seq(Seq(Seq(1, 2), Seq(3, 4)), Seq(Seq(5, 6))))), // row.array_column + Seq(Row(Seq(Seq(Seq(1, 2), Seq(3, 4)), Seq(Seq(5, 6))))), // array_column + Seq(Row(Array(Seq(Seq(7, 8), Seq(9, 10)), Seq(Seq(11, 12))))), // row.array_column + Seq(Row(Array(Seq(Seq(7, 8), Seq(9, 10)), Seq(Seq(11, 12))))), // array_column + Seq.empty[Row], // drop local var + Seq.empty[Row], // drop local var + Seq.empty[Row] // drop local var + ) + verifySqlScriptResult(sqlScript, expected) + } + } + + test("for statement - empty result") { + withTable("t") { + val sqlScript = + """ + |BEGIN + | CREATE TABLE t (intCol INT) using parquet; + | FOR row AS SELECT * FROM t ORDER BY intCol DO + | 
SELECT row.intCol; + | END FOR; + |END + |""".stripMargin + + val expected = Seq( + Seq.empty[Row] // create table + ) + verifySqlScriptResult(sqlScript, expected) + } + } + + test("for statement - empty body") { + withTable("t") { + val sqlScript = + """ + |BEGIN + | CREATE TABLE t (intCol INT, stringCol STRING, doubleCol DOUBLE) using parquet; + | INSERT INTO t VALUES (1, 'first', 1.0); + | FOR row AS SELECT * FROM t DO + | BEGIN + | END; + | END FOR; + |END + |""".stripMargin + + val expected = Seq( + Seq.empty[Row], // create table + Seq.empty[Row], // insert + Seq.empty[Row], // drop local var + Seq.empty[Row], // drop local var + Seq.empty[Row], // drop local var + Seq.empty[Row] // drop local var + ) + verifySqlScriptResult(sqlScript, expected) + } + } + + test("for statement - nested empty body") { + withTable("t") { + val sqlScript = + """ + |BEGIN + | CREATE TABLE t (intCol INT, stringCol STRING, doubleCol DOUBLE) using parquet; + | INSERT INTO t VALUES (1, 'first', 1.0); + | FOR row AS SELECT * FROM t DO + | BEGIN + | BEGIN + | END; + | END; + | BEGIN + | BEGIN + | END; + | END; + | END FOR; + |END + |""".stripMargin + + val expected = Seq( + Seq.empty[Row], // create table + Seq.empty[Row], // insert + Seq.empty[Row], // drop local var + Seq.empty[Row], // drop local var + Seq.empty[Row], // drop local var + Seq.empty[Row] // drop local var + ) + verifySqlScriptResult(sqlScript, expected) + } + } + + test("for statement iterate") { + withTable("t") { + val sqlScript = + """ + |BEGIN + | CREATE TABLE t (intCol INT, stringCol STRING) using parquet; + | INSERT INTO t VALUES (1, 'first'), (2, 'second'), (3, 'third'), (4, 'fourth'); + | + | lbl: FOR x AS SELECT * FROM t ORDER BY intCol DO + | IF x.intCol = 2 THEN + | ITERATE lbl; + | END IF; + | SELECT stringCol; + | SELECT x.stringCol; + | END FOR; + |END + |""".stripMargin + + val expected = Seq( + Seq.empty[Row], // create table + Seq.empty[Row], // insert + Seq(Row("first")), // select stringCol + 
Seq(Row("first")), // select x.stringCol + Seq(Row("third")), // select stringCol + Seq(Row("third")), // select x.stringCol + Seq(Row("fourth")), // select stringCol + Seq(Row("fourth")), // select x.stringCol + Seq.empty[Row], // drop local var + Seq.empty[Row], // drop local var + Seq.empty[Row] // drop local var + ) + verifySqlScriptResult(sqlScript, expected) + } + } + + test("for statement leave") { + withTable("t") { + val sqlScript = + """ + |BEGIN + | CREATE TABLE t (intCol INT, stringCol STRING) using parquet; + | INSERT INTO t VALUES (1, 'first'), (2, 'second'), (3, 'third'), (4, 'fourth'); + | + | lbl: FOR x AS SELECT * FROM t ORDER BY intCol DO + | IF x.intCol = 3 THEN + | LEAVE lbl; + | END IF; + | SELECT stringCol; + | SELECT x.stringCol; + | END FOR; + |END + |""".stripMargin + + val expected = Seq( + Seq.empty[Row], // create table + Seq.empty[Row], // insert + Seq(Row("first")), // select stringCol + Seq(Row("first")), // select x.stringCol + Seq(Row("second")), // select stringCol + Seq(Row("second")) // select x.stringCol + ) + verifySqlScriptResult(sqlScript, expected) + } + } + + test("for statement - nested - in while") { + withTable("t") { + val sqlScript = + """ + |BEGIN + | DECLARE cnt = 0; + | CREATE TABLE t (intCol INT) using parquet; + | INSERT INTO t VALUES (0); + | WHILE cnt < 2 DO + | SET cnt = cnt + 1; + | FOR x AS SELECT * FROM t ORDER BY intCol DO + | SELECT x.intCol; + | END FOR; + | INSERT INTO t VALUES (cnt); + | END WHILE; + |END + |""".stripMargin + + val expected = Seq( + Seq.empty[Row], // declare cnt + Seq.empty[Row], // create table + Seq.empty[Row], // insert + Seq.empty[Row], // set cnt + Seq(Row(0)), // select intCol + Seq.empty[Row], // insert + Seq.empty[Row], // drop local var + Seq.empty[Row], // drop local var + Seq.empty[Row], // set cnt + Seq(Row(0)), // select intCol + Seq(Row(1)), // select intCol + Seq.empty[Row], // insert + Seq.empty[Row], // drop local var + Seq.empty[Row], // drop local var + 
Seq.empty[Row] // drop cnt + ) + verifySqlScriptResult(sqlScript, expected) + } + } + + test("for statement - nested - in other for") { + withTable("t", "t2") { + val sqlScript = + """ + |BEGIN + | CREATE TABLE t (intCol INT) using parquet; + | CREATE TABLE t2 (intCol2 INT) using parquet; + | INSERT INTO t VALUES (0), (1); + | INSERT INTO t2 VALUES (2), (3); + | FOR x as SELECT * FROM t ORDER BY intCol DO + | FOR y AS SELECT * FROM t2 ORDER BY intCol2 DESC DO + | SELECT x.intCol; + | SELECT intCol; + | SELECT y.intCol2; + | SELECT intCol2; + | END FOR; + | END FOR; + |END + |""".stripMargin + + val expected = Seq( + Seq.empty[Row], // create table + Seq.empty[Row], // create table + Seq.empty[Row], // insert + Seq.empty[Row], // insert + Seq(Row(0)), // select x.intCol + Seq(Row(0)), // select intCol + Seq(Row(3)), // select y.intCol2 + Seq(Row(3)), // select intCol2 + Seq(Row(0)), // select x.intCol + Seq(Row(0)), // select intCol + Seq(Row(2)), // select y.intCol2 + Seq(Row(2)), // select intCol2 + Seq.empty[Row], // drop local var + Seq.empty[Row], // drop local var + Seq(Row(1)), // select x.intCol + Seq(Row(1)), // select intCol + Seq(Row(3)), // select y.intCol2 + Seq(Row(3)), // select intCol2 + Seq(Row(1)), // select x.intCol + Seq(Row(1)), // select intCol + Seq(Row(2)), // select y.intCol2 + Seq(Row(2)), // select intCol2 + Seq.empty[Row], // drop local var + Seq.empty[Row], // drop local var + Seq.empty[Row], // drop outer var + Seq.empty[Row] // drop outer var + ) + verifySqlScriptResult(sqlScript, expected) + } + } + + // ignored until loops are fixed to support empty bodies + ignore("for statement - nested - empty result set") { + withTable("t") { + val sqlScript = + """ + |BEGIN + | CREATE TABLE t (intCol INT) using parquet; + | REPEAT + | FOR x AS SELECT * FROM t ORDER BY intCol DO + | SELECT x.intCol; + | END FOR; + | UNTIL 1 = 1 + | END REPEAT; + |END + |""".stripMargin + + val expected = Seq( + Seq.empty[Row], // declare cnt + Seq.empty[Row], // 
create table + Seq.empty[Row], // insert + Seq.empty[Row], // set cnt + Seq(Row(0)), // select intCol + Seq.empty[Row], // insert + Seq.empty[Row], // drop local var + Seq.empty[Row], // drop local var + Seq.empty[Row], // set cnt + Seq(Row(0)), // select intCol + Seq(Row(1)), // select intCol + Seq.empty[Row], // insert + Seq.empty[Row], // drop local var + Seq.empty[Row], // drop local var + Seq.empty[Row] // drop cnt + ) + verifySqlScriptResult(sqlScript, expected) + } + } + + test("for statement - nested - iterate outer loop") { + withTable("t", "t2") { + val sqlScript = + """ + |BEGIN + | CREATE TABLE t (intCol INT) using parquet; + | CREATE TABLE t2 (intCol2 INT) using parquet; + | INSERT INTO t VALUES (0), (1); + | INSERT INTO t2 VALUES (2), (3); + | lbl1: FOR x as SELECT * FROM t ORDER BY intCol DO + | lbl2: FOR y AS SELECT * FROM t2 ORDER BY intCol2 DESC DO + | SELECT y.intCol2; + | SELECT intCol2; + | ITERATE lbl1; + | SELECT 1; + | END FOR; + | END FOR; + |END + |""".stripMargin + + val expected = Seq( + Seq.empty[Row], // create table + Seq.empty[Row], // create table + Seq.empty[Row], // insert + Seq.empty[Row], // insert + Seq(Row(3)), // select y.intCol2 + Seq(Row(3)), // select intCol2 + Seq(Row(3)), // select y.intCol2 + Seq(Row(3)), // select intCol2 + Seq.empty[Row], // drop outer var + Seq.empty[Row] // drop outer var + ) + verifySqlScriptResult(sqlScript, expected) + } + } + + test("for statement - nested - leave outer loop") { + withTable("t", "t2") { + val sqlScript = + """ + |BEGIN + | CREATE TABLE t (intCol INT) using parquet; + | CREATE TABLE t2 (intCol2 INT) using parquet; + | INSERT INTO t VALUES (0), (1); + | INSERT INTO t2 VALUES (2), (3); + | lbl1: FOR x as SELECT * FROM t ORDER BY intCol DO + | lbl2: FOR y AS SELECT * FROM t2 ORDER BY intCol2 DESC DO + | SELECT y.intCol2; + | SELECT intCol2; + | LEAVE lbl1; + | SELECT 1; + | END FOR; + | END FOR; + |END + |""".stripMargin + + val expected = Seq( + Seq.empty[Row], // create table + 
Seq.empty[Row], // create table + Seq.empty[Row], // insert + Seq.empty[Row], // insert + Seq(Row(3)), // select y.intCol2 + Seq(Row(3)) // select intCol2 + ) + verifySqlScriptResult(sqlScript, expected) + } + } + + test("for statement - nested - leave inner loop") { + withTable("t", "t2") { + val sqlScript = + """ + |BEGIN + | CREATE TABLE t (intCol INT) using parquet; + | CREATE TABLE t2 (intCol2 INT) using parquet; + | INSERT INTO t VALUES (0), (1); + | INSERT INTO t2 VALUES (2), (3); + | lbl1: FOR x as SELECT * FROM t ORDER BY intCol DO + | lbl2: FOR y AS SELECT * FROM t2 ORDER BY intCol2 DESC DO + | SELECT y.intCol2; + | SELECT intCol2; + | LEAVE lbl2; + | SELECT 1; + | END FOR; + | END FOR; + |END + |""".stripMargin + + val expected = Seq( + Seq.empty[Row], // create table + Seq.empty[Row], // create table + Seq.empty[Row], // insert + Seq.empty[Row], // insert + Seq(Row(3)), // select y.intCol2 + Seq(Row(3)), // select intCol2 + Seq(Row(3)), // select y.intCol2 + Seq(Row(3)), // select intCol2 + Seq.empty[Row], // drop outer var + Seq.empty[Row] // drop outer var + ) + verifySqlScriptResult(sqlScript, expected) + } + } + + test("for statement - no variable - enters body once") { + withTable("t") { + val sqlScript = + """ + |BEGIN + | CREATE TABLE t (intCol INT, stringCol STRING, doubleCol DOUBLE) using parquet; + | INSERT INTO t VALUES (1, 'first', 1.0); + | FOR SELECT * FROM t DO + | SELECT intCol; + | SELECT stringCol; + | SELECT doubleCol; + | END FOR; + |END + |""".stripMargin + + val expected = Seq( + Seq.empty[Row], // create table + Seq.empty[Row], // insert + Seq(Row(1)), // select intCol + Seq(Row("first")), // select stringCol + Seq(Row(1.0)), // select doubleCol + Seq.empty[Row], // drop local var + Seq.empty[Row], // drop local var + Seq.empty[Row] // drop local var + ) + verifySqlScriptResult(sqlScript, expected) + } + } + + test("for statement - no variable - enters body with multiple statements multiple times") { + withTable("t") { + val 
sqlScript = + """ + |BEGIN + | CREATE TABLE t (intCol INT, stringCol STRING, doubleCol DOUBLE) using parquet; + | INSERT INTO t VALUES (1, 'first', 1.0); + | INSERT INTO t VALUES (2, 'second', 2.0); + | FOR SELECT * FROM t ORDER BY intCol DO + | SELECT intCol; + | SELECT stringCol; + | SELECT doubleCol; + | END FOR; + |END + |""".stripMargin + + val expected = Seq( + Seq.empty[Row], // create table + Seq.empty[Row], // insert + Seq.empty[Row], // insert + Seq(Row(1)), // select intCol + Seq(Row("first")), // select stringCol + Seq(Row(1.0)), // select doubleCol + Seq(Row(2)), // select intCol + Seq(Row("second")), // select stringCol + Seq(Row(2.0)), // select doubleCol + Seq.empty[Row], // drop local var + Seq.empty[Row], // drop local var + Seq.empty[Row] // drop local var + ) + verifySqlScriptResult(sqlScript, expected) + } + } + + test("for statement - no variable - sum of column from table") { + withTable("t") { + val sqlScript = + """ + |BEGIN + | DECLARE sumOfCols = 0; + | CREATE TABLE t (intCol INT) using parquet; + | INSERT INTO t VALUES (1), (2), (3), (4); + | FOR SELECT * FROM t DO + | SET sumOfCols = sumOfCols + intCol; + | END FOR; + | SELECT sumOfCols; + |END + |""".stripMargin + + val expected = Seq( + Seq.empty[Row], // declare sumOfCols + Seq.empty[Row], // create table + Seq.empty[Row], // insert + Seq.empty[Row], // set sumOfCols + Seq.empty[Row], // set sumOfCols + Seq.empty[Row], // set sumOfCols + Seq.empty[Row], // set sumOfCols + Seq.empty[Row], // drop local var + Seq(Row(10)), // select sumOfCols + Seq.empty[Row] // drop sumOfCols + ) + verifySqlScriptResult(sqlScript, expected) + } + } + + test("for statement - no variable - map, struct, array") { + withTable("t") { + val sqlScript = + """ + |BEGIN + | CREATE TABLE t (int_column INT, map_column MAP, + | struct_column STRUCT, array_column ARRAY); + | INSERT INTO t VALUES + | (1, MAP('a', 1), STRUCT('John', 25), ARRAY('apricot', 'quince')), + | (2, MAP('b', 2), STRUCT('Jane', 30), 
ARRAY('plum', 'pear')); + | FOR SELECT * FROM t ORDER BY int_column DO + | SELECT map_column; + | SELECT struct_column; + | SELECT array_column; + | END FOR; + |END + |""".stripMargin + + val expected = Seq( + Seq.empty[Row], // create table + Seq.empty[Row], // insert + Seq(Row(Map("a" -> 1))), // select map_column + Seq(Row(Row("John", 25))), // select struct_column + Seq(Row(Array("apricot", "quince"))), // select array_column + Seq(Row(Map("b" -> 2))), // select map_column + Seq(Row(Row("Jane", 30))), // select struct_column + Seq(Row(Array("plum", "pear"))), // select array_column + Seq.empty[Row], // drop local var + Seq.empty[Row], // drop local var + Seq.empty[Row], // drop local var + Seq.empty[Row] // drop local var + ) + verifySqlScriptResult(sqlScript, expected) + } + } + + test("for statement - no variable - nested struct") { + withTable("t") { + val sqlScript = + """ + |BEGIN + | CREATE TABLE t (int_column INT, + | struct_column STRUCT>>); + | INSERT INTO t VALUES + | (1, STRUCT(1, STRUCT(STRUCT("one")))), + | (2, STRUCT(2, STRUCT(STRUCT("two")))); + | FOR SELECT * FROM t ORDER BY int_column DO + | SELECT struct_column; + | END FOR; + |END + |""".stripMargin + + val expected = Seq( + Seq.empty[Row], // create table + Seq.empty[Row], // insert + Seq(Row(Row(1, Row(Row("one"))))), // select struct_column + Seq(Row(Row(2, Row(Row("two"))))), // select struct_column + Seq.empty[Row], // drop local var + Seq.empty[Row] // drop local var + ) + verifySqlScriptResult(sqlScript, expected) + } + } + + test("for statement - no variable - nested map") { + withTable("t") { + val sqlScript = + """ + |BEGIN + | CREATE TABLE t (int_column INT, map_column MAP>>); + | INSERT INTO t VALUES + | (1, MAP('a', MAP(1, MAP(false, 10)))), + | (2, MAP('b', MAP(2, MAP(true, 20)))); + | FOR SELECT * FROM t ORDER BY int_column DO + | SELECT map_column; + | END FOR; + |END + |""".stripMargin + + val expected = Seq( + Seq.empty[Row], // create table + Seq.empty[Row], // insert + 
Seq(Row(Map("a" -> Map(1 -> Map(false -> 10))))), // select map_column + Seq(Row(Map("b" -> Map(2 -> Map(true -> 20))))), // select map_column + Seq.empty[Row], // drop local var + Seq.empty[Row] // drop local var + ) + verifySqlScriptResult(sqlScript, expected) + } + } + + test("for statement - no variable - nested array") { + withTable("t") { + val sqlScript = + """ + |BEGIN + | CREATE TABLE t + | (int_column INT, array_column ARRAY>>); + | INSERT INTO t VALUES + | (1, ARRAY(ARRAY(ARRAY(1, 2), ARRAY(3, 4)), ARRAY(ARRAY(5, 6)))), + | (2, ARRAY(ARRAY(ARRAY(7, 8), ARRAY(9, 10)), ARRAY(ARRAY(11, 12)))); + | FOR SELECT * FROM t ORDER BY int_column DO + | SELECT array_column; + | END FOR; + |END + |""".stripMargin + + val expected = Seq( + Seq.empty[Row], // create table + Seq.empty[Row], // insert + Seq(Row(Seq(Seq(Seq(1, 2), Seq(3, 4)), Seq(Seq(5, 6))))), // array_column + Seq(Row(Array(Seq(Seq(7, 8), Seq(9, 10)), Seq(Seq(11, 12))))), // array_column + Seq.empty[Row], // drop local var + Seq.empty[Row] // drop local var + ) + verifySqlScriptResult(sqlScript, expected) + } + } + + test("for statement - no variable - empty result") { + withTable("t") { + val sqlScript = + """ + |BEGIN + | CREATE TABLE t (intCol INT) using parquet; + | FOR SELECT * FROM t ORDER BY intCol DO + | SELECT intCol; + | END FOR; + |END + |""".stripMargin + + val expected = Seq( + Seq.empty[Row] // create table + ) + verifySqlScriptResult(sqlScript, expected) + } + } + + test("for statement - no variable - iterate") { + withTable("t") { + val sqlScript = + """ + |BEGIN + | CREATE TABLE t (intCol INT, stringCol STRING) using parquet; + | INSERT INTO t VALUES (1, 'first'), (2, 'second'), (3, 'third'), (4, 'fourth'); + | + | lbl: FOR SELECT * FROM t ORDER BY intCol DO + | IF intCol = 2 THEN + | ITERATE lbl; + | END IF; + | SELECT stringCol; + | END FOR; + |END + |""".stripMargin + + val expected = Seq( + Seq.empty[Row], // create table + Seq.empty[Row], // insert + Seq(Row("first")), // select 
stringCol + Seq(Row("third")), // select stringCol + Seq(Row("fourth")), // select stringCol + Seq.empty[Row], // drop local var + Seq.empty[Row] // drop local var + ) + verifySqlScriptResult(sqlScript, expected) + } + } + + test("for statement - no variable - leave") { + withTable("t") { + val sqlScript = + """ + |BEGIN + | CREATE TABLE t (intCol INT, stringCol STRING) using parquet; + | INSERT INTO t VALUES (1, 'first'), (2, 'second'), (3, 'third'), (4, 'fourth'); + | + | lbl: FOR SELECT * FROM t ORDER BY intCol DO + | IF intCol = 3 THEN + | LEAVE lbl; + | END IF; + | SELECT stringCol; + | END FOR; + |END + |""".stripMargin + + val expected = Seq( + Seq.empty[Row], // create table + Seq.empty[Row], // insert + Seq(Row("first")), // select stringCol + Seq(Row("second")) // select stringCol + ) + verifySqlScriptResult(sqlScript, expected) + } + } + + test("for statement - no variable - nested - in while") { + withTable("t") { + val sqlScript = + """ + |BEGIN + | DECLARE cnt = 0; + | CREATE TABLE t (intCol INT) using parquet; + | INSERT INTO t VALUES (0); + | WHILE cnt < 2 DO + | SET cnt = cnt + 1; + | FOR SELECT * FROM t ORDER BY intCol DO + | SELECT intCol; + | END FOR; + | INSERT INTO t VALUES (cnt); + | END WHILE; + |END + |""".stripMargin + + val expected = Seq( + Seq.empty[Row], // declare cnt + Seq.empty[Row], // create table + Seq.empty[Row], // insert + Seq.empty[Row], // set cnt + Seq(Row(0)), // select intCol + Seq.empty[Row], // insert + Seq.empty[Row], // drop local var + Seq.empty[Row], // set cnt + Seq(Row(0)), // select intCol + Seq(Row(1)), // select intCol + Seq.empty[Row], // insert + Seq.empty[Row], // drop local var + Seq.empty[Row] // drop cnt + ) + verifySqlScriptResult(sqlScript, expected) + } + } + + test("for statement - no variable - nested - in other for") { + withTable("t", "t2") { + val sqlScript = + """ + |BEGIN + | CREATE TABLE t (intCol INT) using parquet; + | CREATE TABLE t2 (intCol2 INT) using parquet; + | INSERT INTO t VALUES (0), 
(1); + | INSERT INTO t2 VALUES (2), (3); + | FOR SELECT * FROM t ORDER BY intCol DO + | FOR SELECT * FROM t2 ORDER BY intCol2 DESC DO + | SELECT intCol; + | SELECT intCol2; + | END FOR; + | END FOR; + |END + |""".stripMargin + + val expected = Seq( + Seq.empty[Row], // create table + Seq.empty[Row], // create table + Seq.empty[Row], // insert + Seq.empty[Row], // insert + Seq(Row(0)), // select intCol + Seq(Row(3)), // select intCol2 + Seq(Row(0)), // select intCol + Seq(Row(2)), // select intCol2 + Seq.empty[Row], // drop local var + Seq(Row(1)), // select intCol + Seq(Row(3)), // select intCol2 + Seq(Row(1)), // select intCol + Seq(Row(2)), // select intCol2 + Seq.empty[Row], // drop local var + Seq.empty[Row] // drop outer var + ) + verifySqlScriptResult(sqlScript, expected) + } + } + + // ignored until loops are fixed to support empty bodies + ignore("for statement - no variable - nested - empty result set") { + withTable("t") { + val sqlScript = + """ + |BEGIN + | CREATE TABLE t (intCol INT) using parquet; + | REPEAT + | FOR SELECT * FROM t ORDER BY intCol DO + | SELECT intCol; + | END FOR; + | UNTIL 1 = 1 + | END REPEAT; + |END + |""".stripMargin + + val expected = Seq( + Seq.empty[Row], // declare cnt + Seq.empty[Row], // create table + Seq.empty[Row], // insert + Seq.empty[Row], // set cnt + Seq(Row(0)), // select intCol + Seq.empty[Row], // insert + Seq.empty[Row], // drop local var + Seq.empty[Row], // set cnt + Seq(Row(0)), // select intCol + Seq(Row(1)), // select intCol + Seq.empty[Row], // insert + Seq.empty[Row], // drop local var + Seq.empty[Row] // drop cnt + ) + verifySqlScriptResult(sqlScript, expected) + } + } + + test("for statement - no variable - nested - iterate outer loop") { + withTable("t", "t2") { + val sqlScript = + """ + |BEGIN + | CREATE TABLE t (intCol INT) using parquet; + | CREATE TABLE t2 (intCol2 INT) using parquet; + | INSERT INTO t VALUES (0), (1); + | INSERT INTO t2 VALUES (2), (3); + | lbl1: FOR SELECT * FROM t ORDER BY 
intCol DO + | lbl2: FOR SELECT * FROM t2 ORDER BY intCol2 DESC DO + | SELECT intCol2; + | ITERATE lbl1; + | SELECT 1; + | END FOR; + | END FOR; + |END + |""".stripMargin + + val expected = Seq( + Seq.empty[Row], // create table + Seq.empty[Row], // create table + Seq.empty[Row], // insert + Seq.empty[Row], // insert + Seq(Row(3)), // select intCol2 + Seq(Row(3)), // select intCol2 + Seq.empty[Row] // drop outer var + ) + verifySqlScriptResult(sqlScript, expected) + } + } + + test("for statement - no variable - nested - leave outer loop") { + withTable("t", "t2") { + val sqlScript = + """ + |BEGIN + | CREATE TABLE t (intCol INT) using parquet; + | CREATE TABLE t2 (intCol2 INT) using parquet; + | INSERT INTO t VALUES (0), (1); + | INSERT INTO t2 VALUES (2), (3); + | lbl1: FOR SELECT * FROM t ORDER BY intCol DO + | lbl2: FOR SELECT * FROM t2 ORDER BY intCol2 DESC DO + | SELECT intCol2; + | LEAVE lbl1; + | SELECT 1; + | END FOR; + | END FOR; + |END + |""".stripMargin + + val expected = Seq( + Seq.empty[Row], // create table + Seq.empty[Row], // create table + Seq.empty[Row], // insert + Seq.empty[Row], // insert + Seq(Row(3)) // select intCol2 + ) + verifySqlScriptResult(sqlScript, expected) + } + } + + test("for statement - no variable - nested - leave inner loop") { + withTable("t", "t2") { + val sqlScript = + """ + |BEGIN + | CREATE TABLE t (intCol INT) using parquet; + | CREATE TABLE t2 (intCol2 INT) using parquet; + | INSERT INTO t VALUES (0), (1); + | INSERT INTO t2 VALUES (2), (3); + | lbl1: FOR SELECT * FROM t ORDER BY intCol DO + | lbl2: FOR SELECT * FROM t2 ORDER BY intCol2 DESC DO + | SELECT intCol2; + | LEAVE lbl2; + | SELECT 1; + | END FOR; + | END FOR; + |END + |""".stripMargin + + val expected = Seq( + Seq.empty[Row], // create table + Seq.empty[Row], // create table + Seq.empty[Row], // insert + Seq.empty[Row], // insert + Seq(Row(3)), // select intCol2 + Seq(Row(3)), // select intCol2 + Seq.empty[Row] // drop outer var + ) + 
verifySqlScriptResult(sqlScript, expected) + } + } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/BucketedReadSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/sources/BucketedReadSuite.scala index 24732223c6698..c4b09c4b289e9 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/sources/BucketedReadSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/sources/BucketedReadSuite.scala @@ -32,7 +32,6 @@ import org.apache.spark.sql.execution.datasources.BucketingUtils import org.apache.spark.sql.execution.exchange.ShuffleExchangeExec import org.apache.spark.sql.execution.joins.SortMergeJoinExec import org.apache.spark.sql.functions._ -import org.apache.spark.sql.internal.ExpressionUtils.column import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.internal.StaticSQLConf.CATALOG_IMPLEMENTATION import org.apache.spark.sql.test.{SharedSparkSession, SQLTestUtils} @@ -229,7 +228,7 @@ abstract class BucketedReadSuite extends QueryTest with SQLTestUtils with Adapti checkPrunedAnswers( bucketSpec, bucketValues = Seq(bucketValue, bucketValue + 1, bucketValue + 2, bucketValue + 3), - filterCondition = column(inSetExpr), + filterCondition = Column(inSetExpr), df) } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/CommitLogSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/CommitLogSuite.scala index 92bea82b35874..068f56839e6e1 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/CommitLogSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/CommitLogSuite.scala @@ -17,11 +17,18 @@ package org.apache.spark.sql.streaming -import java.io.{ByteArrayInputStream, FileInputStream, FileOutputStream} +import java.io.{ByteArrayInputStream, FileInputStream, FileOutputStream, InputStream, OutputStream} +import java.nio.charset.StandardCharsets.UTF_8 import java.nio.file.Path +import scala.io.{Source => IOSource} + +import org.json4s.{Formats, NoTypeHints} 
+import org.json4s.jackson.Serialization + import org.apache.spark.SparkFunSuite -import org.apache.spark.sql.execution.streaming.{CommitLog, CommitMetadata} +import org.apache.spark.sql.SparkSession +import org.apache.spark.sql.execution.streaming.{CommitLog, CommitMetadata, HDFSMetadataLog} import org.apache.spark.sql.test.SharedSparkSession class CommitLogSuite extends SparkFunSuite with SharedSparkSession { @@ -32,13 +39,8 @@ class CommitLogSuite extends SparkFunSuite with SharedSparkSession { "core", "src", "test", - "scala", - "org", - "apache", - "spark", - "sql", - "streaming", "resources", + "structured-streaming", "testCommitLogV2" ) } @@ -49,13 +51,8 @@ class CommitLogSuite extends SparkFunSuite with SharedSparkSession { "core", "src", "test", - "scala", - "org", - "apache", - "spark", - "sql", - "streaming", "resources", + "structured-streaming", "testCommitLogV1" ) } @@ -108,6 +105,57 @@ class CommitLogSuite extends SparkFunSuite with SharedSparkSession { assert(commitMetadata.nextBatchWatermarkMs === 233) assert(commitMetadata.stateUniqueIds === Map.empty) } + + // Test an old version of Spark can ser-de the new version of commit log, + // but running under V1 (i.e. no stateUniqueIds) + test("v1 Serde backward compatibility") { + // This is the json created by a V1 commit log + val commitLogV1WithStateUniqueId = """v1 + |{"nextBatchWatermarkMs":1,"stateUniqueIds":{}}""".stripMargin + val inputStream: ByteArrayInputStream = + new ByteArrayInputStream(commitLogV1WithStateUniqueId.getBytes("UTF-8")) + val commitMetadata: CommitMetadataLegacy = new CommitLogLegacy( + spark, testCommitLogV1FilePath.toString).deserialize(inputStream) + assert(commitMetadata.nextBatchWatermarkMs === 1) + } +} + +// DO-NOT-MODIFY-THE-CODE-BELOW +// Below are the legacy commit log code carbon copied from Spark branch-3.5, except +// adding a "Legacy" to the class names. 
+case class CommitMetadataLegacy(nextBatchWatermarkMs: Long = 0) { + def json: String = Serialization.write(this)(CommitMetadataLegacy.format) } +object CommitMetadataLegacy { + implicit val format: Formats = Serialization.formats(NoTypeHints) + def apply(json: String): CommitMetadataLegacy = Serialization.read[CommitMetadataLegacy](json) +} + +class CommitLogLegacy(sparkSession: SparkSession, path: String) + extends HDFSMetadataLog[CommitMetadataLegacy](sparkSession, path) { + + private val VERSION = 1 + private val EMPTY_JSON = "{}" + + override def deserialize(in: InputStream): CommitMetadataLegacy = { + // called inside a try-finally where the underlying stream is closed in the caller + val lines = IOSource.fromInputStream(in, UTF_8.name()).getLines() + if (!lines.hasNext) { + throw new IllegalStateException("Incomplete log file in the offset commit log") + } + validateVersion(lines.next().trim, VERSION) + val metadataJson = if (lines.hasNext) lines.next() else EMPTY_JSON + CommitMetadataLegacy(metadataJson) + } + + override def serialize(metadata: CommitMetadataLegacy, out: OutputStream): Unit = { + // called inside a try-finally where the underlying stream is closed in the caller + out.write(s"v${VERSION}".getBytes(UTF_8)) + out.write('\n') + + // write metadata + out.write(metadata.json.getBytes(UTF_8)) + } +} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingAggregationSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingAggregationSuite.scala index bcf0d4ac46655..0f382f4ed77de 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingAggregationSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingAggregationSuite.scala @@ -874,6 +874,26 @@ class StreamingAggregationSuite extends StateStoreMetricsTest with Assertions { ) } + testWithAllStateVersions("test that avro encoding is not supported") { + val inputData = MemoryStream[Int] + + val aggregated = + 
inputData.toDF() + .groupBy($"value") + .agg(count("*")) + .as[(Int, Long)] + + val ex = intercept[Exception] { + withSQLConf(SQLConf.STREAMING_STATE_STORE_ENCODING_FORMAT.key -> "avro") { + testStream(aggregated, Update)( + AddData(inputData, 3), + ProcessAllAvailable() + ) + } + } + assert(ex.getMessage.contains("State store encoding format as avro is not supported")) + } + private def prepareTestForChangingSchemaOfState( tempDir: File): (MemoryStream[Int], DataFrame) = { val inputData = MemoryStream[Int] diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingDeduplicationSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingDeduplicationSuite.scala index ab9df9a1e5a6f..040b99e55cb01 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingDeduplicationSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingDeduplicationSuite.scala @@ -574,6 +574,21 @@ class StreamingDeduplicationSuite extends StateStoreMetricsTest { matchPVals = true ) } + + test("test that avro encoding is not supported") { + val inputData = MemoryStream[String] + val result = inputData.toDS().dropDuplicates() + + val ex = intercept[Exception] { + withSQLConf(SQLConf.STREAMING_STATE_STORE_ENCODING_FORMAT.key -> "avro") { + testStream(result, Append)( + AddData(inputData, "a"), + ProcessAllAvailable() + ) + } + } + assert(ex.getMessage.contains("State store encoding format as avro is not supported")) + } } @SlowSQLTest diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingDeduplicationWithinWatermarkSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingDeduplicationWithinWatermarkSuite.scala index 9a02ab3df7dd4..af86e6ec88996 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingDeduplicationWithinWatermarkSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingDeduplicationWithinWatermarkSuite.scala @@ -220,4 
+220,18 @@ class StreamingDeduplicationWithinWatermarkSuite extends StateStoreMetricsTest { ) } } + + test("SPARK-50492: drop event time column after dropDuplicatesWithinWatermark") { + val inputData = MemoryStream[(Int, Int)] + val result = inputData.toDS() + .withColumn("first", timestamp_seconds($"_1")) + .withWatermark("first", "10 seconds") + .dropDuplicatesWithinWatermark("_2") + .select("_2") + + testStream(result, Append)( + AddData(inputData, (1, 2)), + CheckAnswer(2) + ) + } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/TransformWithListStateSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/TransformWithListStateSuite.scala index 88862e2ad0791..f7606cd45949f 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/TransformWithListStateSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/TransformWithListStateSuite.scala @@ -20,7 +20,7 @@ package org.apache.spark.sql.streaming import org.apache.spark.SparkIllegalArgumentException import org.apache.spark.sql.Encoders import org.apache.spark.sql.execution.streaming.MemoryStream -import org.apache.spark.sql.execution.streaming.state.{AlsoTestWithChangelogCheckpointingEnabled, RocksDBStateStoreProvider} +import org.apache.spark.sql.execution.streaming.state.{AlsoTestWithEncodingTypes, AlsoTestWithRocksDBFeatures, RocksDBStateStoreProvider} import org.apache.spark.sql.internal.SQLConf case class InputRow(key: String, action: String, value: String) @@ -127,7 +127,7 @@ class ToggleSaveAndEmitProcessor } class TransformWithListStateSuite extends StreamTest - with AlsoTestWithChangelogCheckpointingEnabled { + with AlsoTestWithRocksDBFeatures with AlsoTestWithEncodingTypes { import testImplicits._ test("test appending null value in list state throw exception") { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/TransformWithListStateTTLSuite.scala 
b/sql/core/src/test/scala/org/apache/spark/sql/streaming/TransformWithListStateTTLSuite.scala index 409a255ae3e64..b188b92bdbb7c 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/TransformWithListStateTTLSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/TransformWithListStateTTLSuite.scala @@ -20,11 +20,77 @@ package org.apache.spark.sql.streaming import java.time.Duration import org.apache.spark.sql.Encoders -import org.apache.spark.sql.execution.streaming.{ListStateImplWithTTL, MemoryStream} +import org.apache.spark.sql.execution.streaming.{ListStateImplWithTTL, MapStateImplWithTTL, MemoryStream, ValueStateImplWithTTL} import org.apache.spark.sql.execution.streaming.state.RocksDBStateStoreProvider import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.streaming.util.StreamManualClock +// MultiStatefulVariableTTLProcessor is a StatefulProcessor that consumes a stream of +// strings and returns a stream of pairs. +// +// Internally, it uses several stateful variables to store the count of each string, for +// the sole purpose of verifying that these stateful variables all stay in sync and do not +// interfere with each other. +// +// The pattern of calling appendValue is to simulate the old behavior of appendValue, which +// used to add a record into the secondary index for every appendList call. +class MultiStatefulVariableTTLProcessor(ttlConfig: TTLConfig) + extends StatefulProcessor[String, String, (String, Long)]{ + @transient private var _listState: ListStateImplWithTTL[String] = _ + // Map from index to count + @transient private var _mapState: MapStateImplWithTTL[Long, Long] = _ + // Counts the number of times the string has occurred. It should always be + // equal to the size of the list state at the start and end of handleInputRows. 
+ @transient private var _valueState: ValueStateImplWithTTL[Long] = _ + + override def init( + outputMode: OutputMode, + timeMode: TimeMode): Unit = { + _listState = getHandle + .getListState("listState", Encoders.STRING, ttlConfig) + .asInstanceOf[ListStateImplWithTTL[String]] + _mapState = getHandle + .getMapState("mapState", Encoders.scalaLong, Encoders.scalaLong, ttlConfig) + .asInstanceOf[MapStateImplWithTTL[Long, Long]] + _valueState = getHandle + .getValueState("valueState", Encoders.scalaLong, ttlConfig) + .asInstanceOf[ValueStateImplWithTTL[Long]] + } + override def handleInputRows( + key: String, + inputRows: Iterator[String], + timerValues: TimerValues): Iterator[(String, Long)] = { + assertSanity() + val iter = inputRows.map { row => + // Update the list state + _listState.appendValue(row) + + // Update the map state + val mapStateCurrentSize = _mapState.iterator().size + _mapState.updateValue(mapStateCurrentSize + 1, mapStateCurrentSize + 1) + + // Update the value state + val currentCountFromValueState = _valueState.get() + _valueState.update(currentCountFromValueState + 1) + + assertSanity() + + (key, _listState.get().size.toLong) + } + + iter + } + + // Asserts that the list state, map state, and value state are all in sync. 
+ private def assertSanity(): Unit = { + val listSize = _listState.get().size + val mapSize = _mapState.iterator().size + val valueState = _valueState.get() + assert(listSize == mapSize) + assert(listSize == valueState) + } +} + class ListStateTTLProcessor(ttlConfig: TTLConfig) extends StatefulProcessor[String, InputEvent, OutputEvent] { @@ -80,10 +146,17 @@ class ListStateTTLProcessor(ttlConfig: TTLConfig) } else if (row.action == "append") { listState.appendValue(row.value) } else if (row.action == "get_values_in_ttl_state") { - val ttlValues = listState.getValuesInTTLState() + val ttlValues = listState.getValueInTTLState() ttlValues.foreach { v => results = OutputEvent(key, -1, isTTLValue = true, ttlValue = v) :: results } + } else if (row.action == "get_values_in_min_state") { + val minValues = listState.getMinValues() + minValues.foreach { minExpirationMs => + results = OutputEvent(key, -1, isTTLValue = true, ttlValue = minExpirationMs) :: results + } + } else if (row.action == "clear") { + listState.clear() } results.iterator @@ -94,7 +167,8 @@ class ListStateTTLProcessor(ttlConfig: TTLConfig) * Test suite for testing list state with TTL. * We use the base TTL suite with a list state processor. 
*/ -class TransformWithListStateTTLSuite extends TransformWithStateTTLTest { +class TransformWithListStateTTLSuite extends TransformWithStateTTLTest + with StateStoreMetricsTest { import testImplicits._ @@ -105,6 +179,68 @@ class TransformWithListStateTTLSuite extends TransformWithStateTTLTest { override def getStateTTLMetricName: String = "numListStateWithTTLVars" + test("verify the list state secondary index has at most one record per key") { + withSQLConf(SQLConf.STATE_STORE_PROVIDER_CLASS.key -> + classOf[RocksDBStateStoreProvider].getName, + SQLConf.SHUFFLE_PARTITIONS.key -> "1") { + val ttlConfig = TTLConfig(ttlDuration = Duration.ofMinutes(10)) + val inputStream = MemoryStream[String] + val result = inputStream.toDS() + .groupByKey(x => x) + .transformWithState( + new MultiStatefulVariableTTLProcessor(ttlConfig), + TimeMode.ProcessingTime(), + OutputMode.Append()) + val clock = new StreamManualClock + + testStream(result)( + StartStream(Trigger.ProcessingTime("1 second"), triggerClock = clock), + + // We want all of the inputs to have different timestamps, so that each record + // gets its own unique TTL, and thus, its own unique secondary index record. Each + // is also processed in its own microbatch to ensure a unique batchTimestampMs. 
+ AddData(inputStream, "k1"), + AdvanceManualClock(1 * 1000), + CheckNewAnswer(("k1", 1)), + + AddData(inputStream, "k2"), + AdvanceManualClock(1 * 1000), + CheckNewAnswer(("k2", 1)), + + AddData(inputStream, "k1"), + AdvanceManualClock(1 * 1000), + CheckNewAnswer(("k1", 2)), + + AddData(inputStream, "k2"), + AdvanceManualClock(1 * 1000), + CheckNewAnswer(("k2", 2)), + + AddData(inputStream, "k1"), + AdvanceManualClock(1 * 1000), + CheckNewAnswer(("k1", 3)), + + AddData(inputStream, "k2"), + AdvanceManualClock(1 * 1000), + CheckNewAnswer(("k2", 3)), + + // For each unique key that occurs t times, the MultiStatefulVariableTTLProcessor maintains: + // - Map state: t records in the primary, and t records in the TTL index + // - List state: 1 record in the primary, TTL, min, and count indexes + // - Value state: 1 record in the primary, and 1 record in the TTL index + // + // So in total, that amounts to 2t + 4 + 2 = 2t + 6 records. + // + // In this test, we have 2 unique keys, and each key occurs 3 times. Thus, the total number + // of keys in state is 2 * (2t + 6) where t = 3, which is 24. + // + // The number of updated rows is the total across the last time assertNumStateRows + // was called, and we only update numRowsUpdated for primary key updates. We ran 6 batches + // and each wrote 3 primary keys, so the total number of updated rows is 6 * 3 = 18. 
+ assertNumStateRows(total = 24, updated = 18) + ) + } + } + test("verify iterator works with expired values in beginning of list") { withSQLConf(SQLConf.STATE_STORE_PROVIDER_CLASS.key -> classOf[RocksDBStateStoreProvider].getName, @@ -223,6 +359,7 @@ class TransformWithListStateTTLSuite extends TransformWithStateTTLTest { // advance clock to trigger processing AdvanceManualClock(1 * 1000), CheckNewAnswer(), + // get ttl values AddData(inputStream, InputEvent("k1", "get_ttl_value_from_state", -1, null)), AdvanceManualClock(1 * 1000), @@ -231,6 +368,7 @@ class TransformWithListStateTTLSuite extends TransformWithStateTTLTest { OutputEvent("k1", 2, isTTLValue = true, 182000), OutputEvent("k1", 3, isTTLValue = true, 182000) ), + AddData(inputStream, InputEvent("k1", "get", -1, null)), AdvanceManualClock(1 * 1000), CheckNewAnswer( @@ -262,6 +400,7 @@ class TransformWithListStateTTLSuite extends TransformWithStateTTLTest { // advance clock to trigger processing AdvanceManualClock(1 * 1000), CheckNewAnswer(), + // get all elements without enforcing ttl AddData(inputStream, InputEvent("k1", "get_without_enforcing_ttl", -1, null)), AdvanceManualClock(1 * 1000), @@ -273,6 +412,7 @@ class TransformWithListStateTTLSuite extends TransformWithStateTTLTest { OutputEvent("k1", 5, isTTLValue = false, -1), OutputEvent("k1", 6, isTTLValue = false, -1) ), + AddData(inputStream, InputEvent("k1", "get_ttl_value_from_state", -1, null)), AdvanceManualClock(1 * 1000), CheckNewAnswer( @@ -297,15 +437,14 @@ class TransformWithListStateTTLSuite extends TransformWithStateTTLTest { // advance clock to trigger processing AdvanceManualClock(1 * 1000), CheckNewAnswer(), + // advance clock to expire the middle three elements AddData(inputStream, InputEvent("k1", "get_values_in_ttl_state", -1, null)), AdvanceManualClock(1 * 1000), CheckNewAnswer( - OutputEvent("k1", -1, isTTLValue = true, 20000), - OutputEvent("k1", -1, isTTLValue = true, 181000), - OutputEvent("k1", -1, isTTLValue = true, 182000), 
- OutputEvent("k1", -1, isTTLValue = true, 188000) + OutputEvent("k1", -1, isTTLValue = true, 20000) ), + // progress batch timestamp from 9000 to 54000, expiring the middle // three elements. AdvanceManualClock(45 * 1000), @@ -320,6 +459,7 @@ class TransformWithListStateTTLSuite extends TransformWithStateTTLTest { OutputEvent("k1", 8, isTTLValue = false, -1), OutputEvent("k1", 9, isTTLValue = false, -1) ), + AddData(inputStream, InputEvent("k1", "get_without_enforcing_ttl", -1, null)), AdvanceManualClock(1 * 1000), CheckNewAnswer( @@ -330,12 +470,11 @@ class TransformWithListStateTTLSuite extends TransformWithStateTTLTest { OutputEvent("k1", 8, isTTLValue = false, -1), OutputEvent("k1", 9, isTTLValue = false, -1) ), + AddData(inputStream, InputEvent("k1", "get_values_in_ttl_state", -1, null)), AdvanceManualClock(1 * 1000), CheckNewAnswer( - OutputEvent("k1", -1, isTTLValue = true, 181000), - OutputEvent("k1", -1, isTTLValue = true, 182000), - OutputEvent("k1", -1, isTTLValue = true, 188000) + OutputEvent("k1", -1, isTTLValue = true, 181000) ), StopStream ) @@ -343,6 +482,104 @@ class TransformWithListStateTTLSuite extends TransformWithStateTTLTest { } } + // If we have a list for a key k1 -> [(v1, t1), (v2, t2), (v3, t3)] and they _all_ expire, + // then there should be no remaining records in any primary (or secondary index) for that key. + // However, if we have a separate key k2 -> [(v1, t4)] and the time is less than t4, then it + // should still be present after the clearing for k1. 
+ test("verify min-expiry index doesn't insert when the new minimum is None") { + withSQLConf(SQLConf.STATE_STORE_PROVIDER_CLASS.key -> + classOf[RocksDBStateStoreProvider].getName, + SQLConf.SHUFFLE_PARTITIONS.key -> "1") { + withTempDir { checkpointLocation => + val inputStream = MemoryStream[InputEvent] + val ttlConfig1 = TTLConfig(ttlDuration = Duration.ofMinutes(1)) + val result1 = inputStream + .toDS() + .groupByKey(x => x.key) + .transformWithState( + getProcessor(ttlConfig1), + TimeMode.ProcessingTime(), + OutputMode.Append() + ) + + val clock = new StreamManualClock + testStream(result1)( + StartStream( + Trigger.ProcessingTime("1 second"), + triggerClock = clock, + checkpointLocation = checkpointLocation.getAbsolutePath + ), + + // Add 3 elements all with different eviction timestamps. + AddData(inputStream, InputEvent("k1", "append", 1)), + AdvanceManualClock(1 * 1000), + CheckNewAnswer(), + + AddData(inputStream, InputEvent("k1", "append", 2)), + AdvanceManualClock(1 * 1000), + CheckNewAnswer(), + + AddData(inputStream, InputEvent("k1", "append", 3)), + AdvanceManualClock(1 * 1000), // Time is 3000 + CheckNewAnswer(), + + // Add a separate key; this should not be affected by k1 expiring. + // It will have an expiration of 64000. 
+ AddData(inputStream, InputEvent("k2", "put", 1)), + + // Now, we should have: k1 -> [1, 2, 3] with TTLs [61000, 62000, 63000] respectively + AddData(inputStream, InputEvent("k1", "get_ttl_value_from_state", -1, null)), + AdvanceManualClock(1 * 1000), + CheckNewAnswer( // Time is 4000 for this micro-batch + OutputEvent("k1", 1, isTTLValue = true, 61000), + OutputEvent("k1", 2, isTTLValue = true, 62000), + OutputEvent("k1", 3, isTTLValue = true, 63000) + ), + + AddData(inputStream, InputEvent("k1", "get_values_in_min_state", -1, null)), + AdvanceManualClock(1 * 1000), + CheckNewAnswer( // Time is 5000 for this micro-batch + OutputEvent("k1", -1, isTTLValue = true, 61000) + ), + + // The k1 records expire at 63000, and the current time is 5000. So, we advance the + // clock by 63 - 5 = 58 seconds to expire those. + AdvanceManualClock((63 - 5) * 1000), + CheckNewAnswer(), + + // There should be 4 state rows left over: the primary, TTL, min-expiry, and count + // indexes for k2. + // + // It's important to check with assertNumStateRows, since the InputEvents + // only return values for the current grouping key, not the entirety of RocksDB. + assertNumStateRows(total = 4, updated = 4), + + // The k1 calls should both return no values. However, the k2 calls should return + // one record each. We put these into one AddData call since we want them all to + // run when the batchTimestampMs is 65000. + AddData(inputStream, + // These should both return no values, since all of k1 has been expired. + InputEvent("k1", "get_values_in_ttl_state", -1, null), + InputEvent("k1", "get_values_in_min_state", -1, null), + + // However, k2 still has a record. 
+ InputEvent("k2", "get_values_in_ttl_state", -1, null), + InputEvent("k2", "get_values_in_min_state", -1, null) + ), + AdvanceManualClock(1 * 1000), + CheckNewAnswer( // Time is 65000 for this micro-batch + OutputEvent("k2", -1, isTTLValue = true, 64000), + OutputEvent("k2", -1, isTTLValue = true, 64000) + ), + + assertNumStateRows(total = 0, updated = 0), + + StopStream + ) + } + } + } + test("verify iterator works with expired values in end of list") { withSQLConf(SQLConf.STATE_STORE_PROVIDER_CLASS.key -> classOf[RocksDBStateStoreProvider].getName, @@ -380,14 +617,23 @@ class TransformWithListStateTTLSuite extends TransformWithStateTTLTest { // advance clock to trigger processing AdvanceManualClock(1 * 1000), CheckNewAnswer(), + // get ttl values - AddData(inputStream, InputEvent("k1", "get_ttl_value_from_state", -1, null)), + AddData(inputStream, + InputEvent("k1", "get_ttl_value_from_state", -1, null), + InputEvent("k1", "get_values_in_min_state", -1) + ), AdvanceManualClock(1 * 1000), CheckNewAnswer( + // From the get_ttl_value_from_state call OutputEvent("k1", 1, isTTLValue = true, 121000), OutputEvent("k1", 2, isTTLValue = true, 122000), - OutputEvent("k1", 3, isTTLValue = true, 122000) + OutputEvent("k1", 3, isTTLValue = true, 122000), + + // From the get_values_in_min_state call + OutputEvent("k1", -1, isTTLValue = true, 121000) ), + AddData(inputStream, InputEvent("k1", "get", -1, null)), AdvanceManualClock(1 * 1000), CheckNewAnswer( @@ -410,6 +656,7 @@ class TransformWithListStateTTLSuite extends TransformWithStateTTLTest { // advance clock to trigger processing AdvanceManualClock(1 * 1000), CheckNewAnswer(), + // get ttl values AddData(inputStream, InputEvent("k1", "get_ttl_value_from_state", -1, null)), AdvanceManualClock(1 * 1000), @@ -423,9 +670,8 @@ class TransformWithListStateTTLSuite extends TransformWithStateTTLTest { ), AddData(inputStream, InputEvent("k1", "get_values_in_ttl_state", -1, null)), AdvanceManualClock(1 * 1000), + CheckNewAnswer( - 
OutputEvent("k1", -1, isTTLValue = true, 121000), - OutputEvent("k1", -1, isTTLValue = true, 122000), OutputEvent("k1", -1, isTTLValue = true, 65000) ), // expire end values, batch timestamp from 7000 to 67000 @@ -447,8 +693,7 @@ class TransformWithListStateTTLSuite extends TransformWithStateTTLTest { AddData(inputStream, InputEvent("k1", "get_values_in_ttl_state", -1, null)), AdvanceManualClock(1 * 1000), CheckNewAnswer( - OutputEvent("k1", -1, isTTLValue = true, 121000), - OutputEvent("k1", -1, isTTLValue = true, 122000) + OutputEvent("k1", -1, isTTLValue = true, 121000) ), StopStream ) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/TransformWithMapStateSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/TransformWithMapStateSuite.scala index 76c5cbeee424b..6884ef577f8ef 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/TransformWithMapStateSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/TransformWithMapStateSuite.scala @@ -20,7 +20,7 @@ package org.apache.spark.sql.streaming import org.apache.spark.SparkIllegalArgumentException import org.apache.spark.sql.Encoders import org.apache.spark.sql.execution.streaming.MemoryStream -import org.apache.spark.sql.execution.streaming.state.{AlsoTestWithChangelogCheckpointingEnabled, RocksDBStateStoreProvider} +import org.apache.spark.sql.execution.streaming.state.{AlsoTestWithEncodingTypes, AlsoTestWithRocksDBFeatures, RocksDBStateStoreProvider} import org.apache.spark.sql.internal.SQLConf case class InputMapRow(key: String, action: String, value: (String, String)) @@ -81,7 +81,7 @@ class TestMapStateProcessor * operators such as transformWithState. 
*/ class TransformWithMapStateSuite extends StreamTest - with AlsoTestWithChangelogCheckpointingEnabled { + with AlsoTestWithRocksDBFeatures with AlsoTestWithEncodingTypes { import testImplicits._ private def testMapStateWithNullUserKey(inputMapRow: InputMapRow): Unit = { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/TransformWithMapStateTTLSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/TransformWithMapStateTTLSuite.scala index 022280eb3bcef..2cb15263459ea 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/TransformWithMapStateTTLSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/TransformWithMapStateTTLSuite.scala @@ -83,6 +83,8 @@ class MapStateSingleKeyTTLProcessor(ttlConfig: TTLConfig) ttlValues.foreach { v => results = OutputEvent(key, -1, isTTLValue = true, ttlValue = v._2) :: results } + } else if (row.action == "clear") { + mapState.clear() } results.iterator @@ -308,7 +310,6 @@ class TransformWithMapStateTTLSuite extends TransformWithStateTTLTest { AddData(inputStream, MapInputEvent("k1", "", "get_values_in_ttl_state", -1)), AdvanceManualClock(1 * 1000), CheckNewAnswer( - MapOutputEvent("k1", "key3", -1, isTTLValue = true, 123000), MapOutputEvent("k1", "key3", -1, isTTLValue = true, 126000), MapOutputEvent("k1", "key4", -1, isTTLValue = true, 123000), MapOutputEvent("k1", "key5", -1, isTTLValue = true, 123000) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/TransformWithStateChainingSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/TransformWithStateChainingSuite.scala index 6888fcba45f3e..0e963bec41b4d 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/TransformWithStateChainingSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/TransformWithStateChainingSuite.scala @@ -24,7 +24,7 @@ import org.apache.spark.{SparkRuntimeException, SparkThrowable} import org.apache.spark.sql.AnalysisException import 
org.apache.spark.sql.catalyst.ExtendedAnalysisException import org.apache.spark.sql.execution.streaming.{MemoryStream, StreamExecution} -import org.apache.spark.sql.execution.streaming.state.RocksDBStateStoreProvider +import org.apache.spark.sql.execution.streaming.state.{AlsoTestWithEncodingTypes, AlsoTestWithRocksDBFeatures, RocksDBStateStoreProvider} import org.apache.spark.sql.functions.window import org.apache.spark.sql.internal.SQLConf @@ -103,47 +103,53 @@ case class AggEventRow( window: Window, count: Long) -class TransformWithStateChainingSuite extends StreamTest { +class TransformWithStateChainingSuite extends StreamTest + with AlsoTestWithRocksDBFeatures + with AlsoTestWithEncodingTypes { import testImplicits._ + private def isAvroEnabled: Boolean = SQLConf.get.stateStoreEncodingFormat == "avro" + test("watermark is propagated correctly for next stateful operator" + " after transformWithState") { - withSQLConf(SQLConf.STATE_STORE_PROVIDER_CLASS.key -> - classOf[RocksDBStateStoreProvider].getName) { - val inputData = MemoryStream[InputEventRow] + if (!isAvroEnabled) { + withSQLConf(SQLConf.STATE_STORE_PROVIDER_CLASS.key -> + classOf[RocksDBStateStoreProvider].getName) { + val inputData = MemoryStream[InputEventRow] - val result = inputData.toDS() - .withWatermark("eventTime", "1 minute") - .groupByKey(x => x.key) - .transformWithState[OutputRow]( - new TestStatefulProcessor(), - "outputEventTime", - OutputMode.Append()) - .groupBy(window($"outputEventTime", "1 minute")) - .count() - .as[AggEventRow] + val result = inputData.toDS() + .withWatermark("eventTime", "1 minute") + .groupByKey(x => x.key) + .transformWithState[OutputRow]( + new TestStatefulProcessor(), + "outputEventTime", + OutputMode.Append()) + .groupBy(window($"outputEventTime", "1 minute")) + .count() + .as[AggEventRow] - testStream(result, OutputMode.Append())( - AddData(inputData, InputEventRow("k1", timestamp("2024-01-01 00:00:00"), "e1")), - // watermark should be 1 minute behind 
`2024-01-01 00:00:00`, nothing is - // emitted as all records have timestamp > epoch - CheckNewAnswer(), - Execute("assertWatermarkEquals") { q => - assertWatermarkEquals(q, timestamp("2023-12-31 23:59:00")) - }, - AddData(inputData, InputEventRow("k1", timestamp("2024-02-01 00:00:00"), "e1")), - // global watermark should now be 1 minute behind `2024-02-01 00:00:00`. - CheckNewAnswer(AggEventRow( - Window(timestamp("2024-01-01 00:00:00"), timestamp("2024-01-01 00:01:00")), 1) - ), - Execute("assertWatermarkEquals") { q => - assertWatermarkEquals(q, timestamp("2024-01-31 23:59:00")) - }, - AddData(inputData, InputEventRow("k1", timestamp("2024-02-02 00:00:00"), "e1")), - CheckNewAnswer(AggEventRow( - Window(timestamp("2024-02-01 00:00:00"), timestamp("2024-02-01 00:01:00")), 1) + testStream(result, OutputMode.Append())( + AddData(inputData, InputEventRow("k1", timestamp("2024-01-01 00:00:00"), "e1")), + // watermark should be 1 minute behind `2024-01-01 00:00:00`, nothing is + // emitted as all records have timestamp > epoch + CheckNewAnswer(), + Execute("assertWatermarkEquals") { q => + assertWatermarkEquals(q, timestamp("2023-12-31 23:59:00")) + }, + AddData(inputData, InputEventRow("k1", timestamp("2024-02-01 00:00:00"), "e1")), + // global watermark should now be 1 minute behind `2024-02-01 00:00:00`. 
+ CheckNewAnswer(AggEventRow( + Window(timestamp("2024-01-01 00:00:00"), timestamp("2024-01-01 00:01:00")), 1) + ), + Execute("assertWatermarkEquals") { q => + assertWatermarkEquals(q, timestamp("2024-01-31 23:59:00")) + }, + AddData(inputData, InputEventRow("k1", timestamp("2024-02-02 00:00:00"), "e1")), + CheckNewAnswer(AggEventRow( + Window(timestamp("2024-02-01 00:00:00"), timestamp("2024-02-01 00:01:00")), 1) + ) ) - ) + } } } @@ -166,33 +172,35 @@ class TransformWithStateChainingSuite extends StreamTest { } } - test("missing eventTime column to transformWithState fails the query if" + - " another stateful operator is added") { - withSQLConf(SQLConf.STATE_STORE_PROVIDER_CLASS.key -> - classOf[RocksDBStateStoreProvider].getName) { - val inputData = MemoryStream[InputEventRow] + test("missing eventTime column to transformWithState fails the query if " + + "another stateful operator is added") { + if (!isAvroEnabled) { + withSQLConf(SQLConf.STATE_STORE_PROVIDER_CLASS.key -> + classOf[RocksDBStateStoreProvider].getName) { + val inputData = MemoryStream[InputEventRow] - val result = inputData.toDS() - .withWatermark("eventTime", "1 minute") - .groupByKey(x => x.key) - .transformWithState[OutputRow]( - new TestStatefulProcessor(), - TimeMode.None(), - OutputMode.Append()) - .groupBy(window($"outputEventTime", "1 minute")) - .count() - - checkError( - exception = intercept[AnalysisException] { - testStream(result, OutputMode.Append())( - StartStream() - ) - }, - condition = "STREAMING_OUTPUT_MODE.UNSUPPORTED_OPERATION", - sqlState = "42KDE", - parameters = Map( - "outputMode" -> "append", - "operation" -> "streaming aggregations without watermark")) + val result = inputData.toDS() + .withWatermark("eventTime", "1 minute") + .groupByKey(x => x.key) + .transformWithState[OutputRow]( + new TestStatefulProcessor(), + TimeMode.None(), + OutputMode.Append()) + .groupBy(window($"outputEventTime", "1 minute")) + .count() + + checkError( + exception = 
intercept[AnalysisException] { + testStream(result, OutputMode.Append())( + StartStream() + ) + }, + condition = "STREAMING_OUTPUT_MODE.UNSUPPORTED_OPERATION", + sqlState = "42KDE", + parameters = Map( + "outputMode" -> "append", + "operation" -> "streaming aggregations without watermark")) + } } } @@ -234,25 +242,27 @@ class TransformWithStateChainingSuite extends StreamTest { test("dropDuplicateWithWatermark after transformWithState operator" + " fails if watermark column is not provided") { - withSQLConf(SQLConf.STATE_STORE_PROVIDER_CLASS.key -> - classOf[RocksDBStateStoreProvider].getName) { - val inputData = MemoryStream[InputEventRow] - val result = inputData.toDS() - .withWatermark("eventTime", "1 minute") - .groupByKey(x => x.key) - .transformWithState[OutputRow]( - new TestStatefulProcessor(), - TimeMode.None(), - OutputMode.Append()) - .dropDuplicatesWithinWatermark() + if (!isAvroEnabled) { + withSQLConf(SQLConf.STATE_STORE_PROVIDER_CLASS.key -> + classOf[RocksDBStateStoreProvider].getName) { + val inputData = MemoryStream[InputEventRow] + val result = inputData.toDS() + .withWatermark("eventTime", "1 minute") + .groupByKey(x => x.key) + .transformWithState[OutputRow]( + new TestStatefulProcessor(), + TimeMode.None(), + OutputMode.Append()) + .dropDuplicatesWithinWatermark() - val ex = intercept[ExtendedAnalysisException] { - testStream(result, OutputMode.Append())( - StartStream() - ) + val ex = intercept[ExtendedAnalysisException] { + testStream(result, OutputMode.Append())( + StartStream() + ) + } + assert(ex.getMessage.contains("dropDuplicatesWithinWatermark is not supported on" + + " streaming DataFrames/DataSets without watermark")) } - assert(ex.getMessage.contains("dropDuplicatesWithinWatermark is not supported on" + - " streaming DataFrames/DataSets without watermark")) } } @@ -269,14 +279,25 @@ class TransformWithStateChainingSuite extends StreamTest { OutputMode.Append()) .dropDuplicatesWithinWatermark() - testStream(result, 
OutputMode.Append())( - AddData(inputData, InputEventRow("k1", timestamp("2024-02-01 00:00:00"), "e1"), - InputEventRow("k1", timestamp("2024-02-01 00:00:00"), "e1")), - CheckNewAnswer(OutputRow("k1", timestamp("2024-02-01 00:00:00"), 2)), - Execute("assertWatermarkEquals") { q => - assertWatermarkEquals(q, timestamp("2024-01-31 23:59:00")) + if (!isAvroEnabled) { + testStream(result, OutputMode.Append())( + AddData(inputData, InputEventRow("k1", timestamp("2024-02-01 00:00:00"), "e1"), + InputEventRow("k1", timestamp("2024-02-01 00:00:00"), "e1")), + CheckNewAnswer(OutputRow("k1", timestamp("2024-02-01 00:00:00"), 2)), + Execute("assertWatermarkEquals") { q => + assertWatermarkEquals(q, timestamp("2024-01-31 23:59:00")) + } + ) + } else { + val ex = intercept[Exception] { + testStream(result, OutputMode.Append())( + AddData(inputData, InputEventRow("k1", timestamp("2024-02-01 00:00:00"), "e1"), + InputEventRow("k1", timestamp("2024-02-01 00:00:00"), "e1")), + ProcessAllAvailable() + ) } - ) + assert(ex.getMessage.contains("State store encoding format as avro is not supported")) + } } } @@ -340,7 +361,7 @@ class TransformWithStateChainingSuite extends StreamTest { val inputData = MemoryStream[InputEventRow] inputData.toDS() .withWatermark("eventTime", "1 minute") - .createTempView("tempViewWithWatermark") + .createOrReplaceTempView("tempViewWithWatermark") val result = spark.readStream.table("tempViewWithWatermark") .as[InputEventRow] @@ -365,7 +386,7 @@ class TransformWithStateChainingSuite extends StreamTest { classOf[RocksDBStateStoreProvider].getName) { val inputData = MemoryStream[InputEventRow] inputData.toDS() - .createTempView("tempViewWithoutWatermark") + .createOrReplaceTempView("tempViewWithoutWatermark") val ex = intercept[AnalysisException] { val result = spark.readStream.table("tempViewWithoutWatermark") diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/TransformWithStateClusterSuite.scala 
b/sql/core/src/test/scala/org/apache/spark/sql/streaming/TransformWithStateClusterSuite.scala new file mode 100644 index 0000000000000..3e2899f7c6ee7 --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/TransformWithStateClusterSuite.scala @@ -0,0 +1,211 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.streaming + +import org.apache.spark.{SparkConf, SparkFunSuite} +import org.apache.spark.sql.{Dataset, Encoders, Row, SparkSession} +import org.apache.spark.sql.LocalSparkSession.withSparkSession +import org.apache.spark.sql.execution.streaming.MemoryStream +import org.apache.spark.sql.execution.streaming.state.RocksDBStateStoreProvider +import org.apache.spark.sql.internal.SQLConf + +case class FruitState( + name: String, + count: Long, + family: String +) + +class FruitCountStatefulProcessor(useImplicits: Boolean) + extends StatefulProcessor[String, String, (String, Long, String)] { + import implicits._ + + @transient protected var _fruitState: ValueState[FruitState] = _ + + override def init(outputMode: OutputMode, timeMode: TimeMode): Unit = { + if (useImplicits) { + _fruitState = getHandle.getValueState[FruitState]("fruitState", TTLConfig.NONE) + } else { + _fruitState = getHandle.getValueState("fruitState", Encoders.product[FruitState], + TTLConfig.NONE) + } + } + + private def getFamily(fruitName: String): String = { + if (fruitName == "orange" || fruitName == "lemon" || fruitName == "lime") { + "citrus" + } else { + "non-citrus" + } + } + + override def handleInputRows(key: String, inputRows: Iterator[String], timerValues: TimerValues): + Iterator[(String, Long, String)] = { + val new_cnt = _fruitState.getOption().map(x => x.count).getOrElse(0L) + inputRows.size + val family = getFamily(key) + _fruitState.update(FruitState(key, new_cnt, family)) + Iterator.single((key, new_cnt, family)) + } +} + +class FruitCountStatefulProcessorWithInitialState(useImplicits: Boolean) + extends StatefulProcessorWithInitialState[String, String, (String, Long, String), String] { + import implicits._ + + @transient protected var _fruitState: ValueState[FruitState] = _ + + override def init(outputMode: OutputMode, timeMode: TimeMode): Unit = { + if (useImplicits) { + _fruitState = getHandle.getValueState[FruitState]("fruitState", 
TTLConfig.NONE) + } else { + _fruitState = getHandle.getValueState("fruitState", Encoders.product[FruitState], + TTLConfig.NONE) + } + } + + private def getFamily(fruitName: String): String = { + if (fruitName == "orange" || fruitName == "lemon" || fruitName == "lime") { + "citrus" + } else { + "non-citrus" + } + } + + override def handleInitialState(key: String, initialState: String, + timerValues: TimerValues): Unit = { + val new_cnt = _fruitState.getOption().map(x => x.count).getOrElse(0L) + 1 + val family = getFamily(key) + _fruitState.update(FruitState(key, new_cnt, family)) + } + + override def handleInputRows(key: String, inputRows: Iterator[String], timerValues: TimerValues): + Iterator[(String, Long, String)] = { + val new_cnt = _fruitState.getOption().map(x => x.count).getOrElse(0L) + inputRows.size + val family = getFamily(key) + _fruitState.update(FruitState(key, new_cnt, family)) + Iterator.single((key, new_cnt, family)) + } +} + +trait TransformWithStateClusterSuiteBase extends SparkFunSuite { + def getSparkConf(): SparkConf = { + val conf = new SparkConf() + .setMaster("local-cluster[2, 2, 1024]") + .set(SQLConf.STATE_STORE_PROVIDER_CLASS.key, + classOf[RocksDBStateStoreProvider].getCanonicalName) + .set(SQLConf.SHUFFLE_PARTITIONS.key, + TransformWithStateSuiteUtils.NUM_SHUFFLE_PARTITIONS.toString) + .set(SQLConf.STREAMING_STOP_TIMEOUT, 5000L) + conf + } + + // Start a new test with cluster containing two executors and streaming stop timeout set to 5s + val testSparkConf = getSparkConf() + + protected def testWithAndWithoutImplicitEncoders(name: String) + (func: (SparkSession, Boolean) => Any): Unit = { + Seq(false, true).foreach { useImplicits => + test(s"$name - useImplicits = $useImplicits") { + withSparkSession(SparkSession.builder().config(testSparkConf).getOrCreate()) { spark => + func(spark, useImplicits) + } + } + } + } +} + +/** + * Test suite spawning local cluster with multiple executors to test serde of stateful + * processors along with 
use of implicit encoders, if applicable in transformWithState operator. + */ +class TransformWithStateClusterSuite extends StreamTest with TransformWithStateClusterSuiteBase { + testWithAndWithoutImplicitEncoders("streaming with transformWithState - " + + "without initial state") { (spark, useImplicits) => + import spark.implicits._ + val input = MemoryStream(Encoders.STRING, spark.sqlContext) + val agg = input.toDS() + .groupByKey(x => x) + .transformWithState(new FruitCountStatefulProcessor(useImplicits), + TimeMode.None(), + OutputMode.Update() + ) + + val query = agg.writeStream + .format("memory") + .outputMode("update") + .queryName("output") + .start() + + input.addData("apple", "apple", "orange", "orange", "orange") + query.processAllAvailable() + + checkAnswer(spark.sql("select * from output"), + Seq(Row("apple", 2, "non-citrus"), + Row("orange", 3, "citrus"))) + + input.addData("lemon", "lime") + query.processAllAvailable() + checkAnswer(spark.sql("select * from output"), + Seq(Row("apple", 2, "non-citrus"), + Row("orange", 3, "citrus"), + Row("lemon", 1, "citrus"), + Row("lime", 1, "citrus"))) + + query.stop() + } + + testWithAndWithoutImplicitEncoders("streaming with transformWithState - " + + "with initial state") { (spark, useImplicits) => + import spark.implicits._ + + val fruitCountInitialDS: Dataset[String] = Seq( + "apple", "apple", "orange", "orange", "orange").toDS() + + val fruitCountInitial = fruitCountInitialDS + .groupByKey(x => x) + + val input = MemoryStream(Encoders.STRING, spark.sqlContext) + val agg = input.toDS() + .groupByKey(x => x) + .transformWithState(new FruitCountStatefulProcessorWithInitialState(useImplicits), + TimeMode.None(), + OutputMode.Update(), fruitCountInitial) + + val query = agg.writeStream + .format("memory") + .outputMode("update") + .queryName("output") + .start() + + input.addData("apple", "apple", "orange", "orange", "orange") + query.processAllAvailable() + + checkAnswer(spark.sql("select * from output"), + 
Seq(Row("apple", 4, "non-citrus"), + Row("orange", 6, "citrus"))) + + input.addData("lemon", "lime") + query.processAllAvailable() + checkAnswer(spark.sql("select * from output"), + Seq(Row("apple", 4, "non-citrus"), + Row("orange", 6, "citrus"), + Row("lemon", 1, "citrus"), + Row("lime", 1, "citrus"))) + + query.stop() + } +} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/TransformWithStateInitialStateSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/TransformWithStateInitialStateSuite.scala index 806d2f19f6f5c..cf304301565ba 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/TransformWithStateInitialStateSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/TransformWithStateInitialStateSuite.scala @@ -20,7 +20,7 @@ package org.apache.spark.sql.streaming import org.apache.spark.sql.{DataFrame, Dataset, Encoders, KeyValueGroupedDataset} import org.apache.spark.sql.execution.datasources.v2.state.StateSourceOptions import org.apache.spark.sql.execution.streaming.MemoryStream -import org.apache.spark.sql.execution.streaming.state.{AlsoTestWithChangelogCheckpointingEnabled, RocksDBStateStoreProvider} +import org.apache.spark.sql.execution.streaming.state.{AlsoTestWithRocksDBFeatures, RocksDBStateStoreProvider} import org.apache.spark.sql.functions.{col, timestamp_seconds} import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.streaming.util.StreamManualClock @@ -47,6 +47,8 @@ case class UnionUnflattenInitialStateRow( abstract class StatefulProcessorWithInitialStateTestClass[V] extends StatefulProcessorWithInitialState[ String, InitInputRow, (String, String, Double), V] { + import implicits._ + @transient var _valState: ValueState[Double] = _ @transient var _listState: ListState[Double] = _ @transient var _mapState: MapState[Double, Int] = _ @@ -54,13 +56,9 @@ abstract class StatefulProcessorWithInitialStateTestClass[V] override def init( outputMode: OutputMode, timeMode: 
TimeMode): Unit = { - _valState = getHandle.getValueState[Double]("testValueInit", Encoders.scalaDouble, - TTLConfig.NONE) - _listState = getHandle.getListState[Double]("testListInit", Encoders.scalaDouble, - TTLConfig.NONE) - _mapState = getHandle.getMapState[Double, Int]( - "testMapInit", Encoders.scalaDouble, Encoders.scalaInt, - TTLConfig.NONE) + _valState = getHandle.getValueState[Double]("testValueInit", TTLConfig.NONE) + _listState = getHandle.getListState[Double]("testListInit", TTLConfig.NONE) + _mapState = getHandle.getMapState[Double, Int]("testMapInit", TTLConfig.NONE) } override def handleInputRows( @@ -363,7 +361,7 @@ class StatefulProcessorWithInitialStateEventTimerClass * streaming operator with user-defined initial state */ class TransformWithStateInitialStateSuite extends StateStoreMetricsTest - with AlsoTestWithChangelogCheckpointingEnabled { + with AlsoTestWithRocksDBFeatures { import testImplicits._ @@ -379,6 +377,8 @@ class TransformWithStateInitialStateSuite extends StateStoreMetricsTest withSQLConf(SQLConf.STATE_STORE_PROVIDER_CLASS.key -> classOf[RocksDBStateStoreProvider].getName) { + val clock = new StreamManualClock + val inputData = MemoryStream[InitInputRow] val kvDataSet = inputData.toDS() .groupByKey(x => x.key) @@ -390,10 +390,12 @@ class TransformWithStateInitialStateSuite extends StateStoreMetricsTest TimeMode.None(), OutputMode.Append(), initStateDf) testStream(query, OutputMode.Update())( + StartStream(Trigger.ProcessingTime("1 second"), triggerClock = clock), // non-exist key test AddData(inputData, InitInputRow("k1", "update", 37.0)), AddData(inputData, InitInputRow("k2", "update", 40.0)), AddData(inputData, InitInputRow("non-exist", "getOption", -1.0)), + AdvanceManualClock(1 * 1000), CheckNewAnswer(("non-exist", "getOption", -1.0)), Execute { q => assert(q.lastProgress @@ -402,59 +404,80 @@ class TransformWithStateInitialStateSuite extends StateStoreMetricsTest AddData(inputData, InitInputRow("k1", "appendList", 37.0)), 
AddData(inputData, InitInputRow("k2", "appendList", 40.0)), AddData(inputData, InitInputRow("non-exist", "getList", -1.0)), + AdvanceManualClock(1 * 1000), CheckNewAnswer(), AddData(inputData, InitInputRow("k1", "incCount", 37.0)), AddData(inputData, InitInputRow("k2", "incCount", 40.0)), AddData(inputData, InitInputRow("non-exist", "getCount", -1.0)), + AdvanceManualClock(1 * 1000), CheckNewAnswer(("non-exist", "getCount", 0.0)), + AddData(inputData, InitInputRow("k2", "incCount", 40.0)), AddData(inputData, InitInputRow("k2", "getCount", 40.0)), + AdvanceManualClock(1 * 1000), CheckNewAnswer(("k2", "getCount", 2.0)), // test every row in initial State is processed AddData(inputData, InitInputRow("init_1", "getOption", -1.0)), + AdvanceManualClock(1 * 1000), CheckNewAnswer(("init_1", "getOption", 40.0)), + AddData(inputData, InitInputRow("init_2", "getOption", -1.0)), + AdvanceManualClock(1 * 1000), CheckNewAnswer(("init_2", "getOption", 100.0)), AddData(inputData, InitInputRow("init_1", "getList", -1.0)), + AdvanceManualClock(1 * 1000), CheckNewAnswer(("init_1", "getList", 40.0)), + AddData(inputData, InitInputRow("init_2", "getList", -1.0)), + AdvanceManualClock(1 * 1000), CheckNewAnswer(("init_2", "getList", 100.0)), AddData(inputData, InitInputRow("init_1", "getCount", 40.0)), + AdvanceManualClock(1 * 1000), CheckNewAnswer(("init_1", "getCount", 1.0)), + AddData(inputData, InitInputRow("init_2", "getCount", 100.0)), + AdvanceManualClock(1 * 1000), CheckNewAnswer(("init_2", "getCount", 1.0)), // Update row with key in initial row will work AddData(inputData, InitInputRow("init_1", "update", 50.0)), AddData(inputData, InitInputRow("init_1", "getOption", -1.0)), + AdvanceManualClock(1 * 1000), CheckNewAnswer(("init_1", "getOption", 50.0)), + AddData(inputData, InitInputRow("init_1", "remove", -1.0)), AddData(inputData, InitInputRow("init_1", "getOption", -1.0)), + AdvanceManualClock(1 * 1000), CheckNewAnswer(("init_1", "getOption", -1.0)), AddData(inputData, 
InitInputRow("init_1", "appendList", 50.0)), AddData(inputData, InitInputRow("init_1", "getList", -1.0)), + AdvanceManualClock(1 * 1000), CheckNewAnswer(("init_1", "getList", 50.0), ("init_1", "getList", 40.0)), AddData(inputData, InitInputRow("init_1", "incCount", 40.0)), AddData(inputData, InitInputRow("init_1", "getCount", 40.0)), + AdvanceManualClock(1 * 1000), CheckNewAnswer(("init_1", "getCount", 2.0)), // test remove AddData(inputData, InitInputRow("k1", "remove", -1.0)), AddData(inputData, InitInputRow("k1", "getOption", -1.0)), + AdvanceManualClock(1 * 1000), CheckNewAnswer(("k1", "getOption", -1.0)), AddData(inputData, InitInputRow("init_1", "clearCount", -1.0)), AddData(inputData, InitInputRow("init_1", "getCount", -1.0)), + AdvanceManualClock(1 * 1000), CheckNewAnswer(("init_1", "getCount", 0.0)), AddData(inputData, InitInputRow("init_1", "clearList", -1.0)), AddData(inputData, InitInputRow("init_1", "getList", -1.0)), + AdvanceManualClock(1 * 1000), CheckNewAnswer() ) } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/TransformWithStateSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/TransformWithStateSuite.scala index 505775d4f6a9b..97dad5fe78a19 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/TransformWithStateSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/TransformWithStateSuite.scala @@ -38,6 +38,7 @@ import org.apache.spark.sql.functions.timestamp_seconds import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.streaming.util.StreamManualClock import org.apache.spark.sql.types.{IntegerType, LongType, StringType, StructType} +import org.apache.spark.tags.SlowSQLTest object TransformWithStateSuiteUtils { val NUM_SHUFFLE_PARTITIONS = 5 @@ -45,13 +46,13 @@ object TransformWithStateSuiteUtils { class RunningCountStatefulProcessor extends StatefulProcessor[String, String, (String, String)] with Logging { + import implicits._ @transient protected var 
_countState: ValueState[Long] = _ override def init( outputMode: OutputMode, timeMode: TimeMode): Unit = { - _countState = getHandle.getValueState[Long]("countState", - Encoders.scalaLong, TTLConfig.NONE) + _countState = getHandle.getValueState[Long]("countState", TTLConfig.NONE) } override def handleInputRows( @@ -72,12 +73,13 @@ class RunningCountStatefulProcessor extends StatefulProcessor[String, String, (S class RunningCountStatefulProcessorWithTTL extends StatefulProcessor[String, String, (String, String)] with Logging { + import implicits._ @transient protected var _countState: ValueState[Long] = _ override def init( outputMode: OutputMode, timeMode: TimeMode): Unit = { - _countState = getHandle.getValueState[Long]("countState", Encoders.scalaLong, + _countState = getHandle.getValueState[Long]("countState", TTLConfig(Duration.ofMillis(1000))) } @@ -384,20 +386,32 @@ class RunningCountStatefulProcessorWithError extends RunningCountStatefulProcess } // class for verify state schema is correctly written for all state var types -class StatefulProcessorWithCompositeTypes extends RunningCountStatefulProcessor { +class StatefulProcessorWithCompositeTypes(useImplicits: Boolean) + extends RunningCountStatefulProcessor { + import implicits._ @transient private var _listState: ListState[TestClass] = _ @transient private var _mapState: MapState[POJOTestClass, String] = _ override def init( outputMode: OutputMode, timeMode: TimeMode): Unit = { - _countState = getHandle.getValueState[Long]("countState", Encoders.scalaLong, - TTLConfig.NONE) - _listState = getHandle.getListState[TestClass]( - "listState", Encoders.product[TestClass], TTLConfig.NONE) - _mapState = getHandle.getMapState[POJOTestClass, String]( - "mapState", Encoders.bean(classOf[POJOTestClass]), Encoders.STRING, - TTLConfig.NONE) + + if (useImplicits) { + _countState = getHandle.getValueState[Long]("countState", TTLConfig.NONE) + _listState = getHandle.getListState[TestClass]( + "listState", TTLConfig.NONE) + 
_mapState = getHandle.getMapState[POJOTestClass, String]( + "mapState", Encoders.bean(classOf[POJOTestClass]), Encoders.STRING, + TTLConfig.NONE) + } else { + _countState = getHandle.getValueState[Long]("countState", Encoders.scalaLong, + TTLConfig.NONE) + _listState = getHandle.getListState[TestClass]( + "listState", Encoders.product[TestClass], TTLConfig.NONE) + _mapState = getHandle.getMapState[POJOTestClass, String]( + "mapState", Encoders.bean(classOf[POJOTestClass]), Encoders.STRING, + TTLConfig.NONE) + } } } @@ -428,12 +442,14 @@ class SleepingTimerProcessor extends StatefulProcessor[String, String, String] { /** * Class that adds tests for transformWithState stateful streaming operator */ +@SlowSQLTest class TransformWithStateSuite extends StateStoreMetricsTest - with AlsoTestWithChangelogCheckpointingEnabled { + with AlsoTestWithRocksDBFeatures with AlsoTestWithEncodingTypes { import testImplicits._ - test("transformWithState - streaming with rocksdb and invalid processor should fail") { + test("transformWithState - streaming with rocksdb and" + + " invalid processor should fail") { withSQLConf(SQLConf.STATE_STORE_PROVIDER_CLASS.key -> classOf[RocksDBStateStoreProvider].getName, SQLConf.SHUFFLE_PARTITIONS.key -> @@ -688,7 +704,8 @@ class TransformWithStateSuite extends StateStoreMetricsTest } } - test("transformWithState - streaming with rocksdb and event time based timer") { + test("transformWithState - streaming with rocksdb and event " + + "time based timer") { val inputData = MemoryStream[(String, Int)] val result = inputData.toDS() @@ -778,7 +795,8 @@ class TransformWithStateSuite extends StateStoreMetricsTest ) } - test("Use statefulProcessor without transformWithState - handle should be absent") { + test("Use statefulProcessor without transformWithState -" + + " handle should be absent") { val processor = new RunningCountStatefulProcessor() val ex = intercept[Exception] { processor.getHandle @@ -1034,84 +1052,87 @@ class TransformWithStateSuite 
extends StateStoreMetricsTest } } - test("transformWithState - verify StateSchemaV3 writes correct SQL schema of key/value") { - withSQLConf(SQLConf.STATE_STORE_PROVIDER_CLASS.key -> - classOf[RocksDBStateStoreProvider].getName, - SQLConf.SHUFFLE_PARTITIONS.key -> - TransformWithStateSuiteUtils.NUM_SHUFFLE_PARTITIONS.toString) { - withTempDir { checkpointDir => - val metadataPathPostfix = "state/0/_stateSchema/default" - val stateSchemaPath = new Path(checkpointDir.toString, - s"$metadataPathPostfix") - val hadoopConf = spark.sessionState.newHadoopConf() - val fm = CheckpointFileManager.create(stateSchemaPath, hadoopConf) - - val keySchema = new StructType().add("value", StringType) - val schema0 = StateStoreColFamilySchema( - "countState", - keySchema, - new StructType().add("value", LongType, false), - Some(NoPrefixKeyStateEncoderSpec(keySchema)), - None - ) - val schema1 = StateStoreColFamilySchema( - "listState", - keySchema, - new StructType() + Seq(false, true).foreach { useImplicits => + test("transformWithState - verify StateSchemaV3 writes " + + s"correct SQL schema of key/value with useImplicits=$useImplicits") { + withSQLConf(SQLConf.STATE_STORE_PROVIDER_CLASS.key -> + classOf[RocksDBStateStoreProvider].getName, + SQLConf.SHUFFLE_PARTITIONS.key -> + TransformWithStateSuiteUtils.NUM_SHUFFLE_PARTITIONS.toString) { + withTempDir { checkpointDir => + val metadataPathPostfix = "state/0/_stateSchema/default" + val stateSchemaPath = new Path(checkpointDir.toString, + s"$metadataPathPostfix") + val hadoopConf = spark.sessionState.newHadoopConf() + val fm = CheckpointFileManager.create(stateSchemaPath, hadoopConf) + + val keySchema = new StructType().add("value", StringType) + val schema0 = StateStoreColFamilySchema( + "countState", + keySchema, + new StructType().add("value", LongType, false), + Some(NoPrefixKeyStateEncoderSpec(keySchema)), + None + ) + val schema1 = StateStoreColFamilySchema( + "listState", + keySchema, + new StructType() .add("id", LongType, 
false) .add("name", StringType), - Some(NoPrefixKeyStateEncoderSpec(keySchema)), - None - ) - - val userKeySchema = new StructType() - .add("id", IntegerType, false) - .add("name", StringType) - val compositeKeySchema = new StructType() - .add("key", new StructType().add("value", StringType)) - .add("userKey", userKeySchema) - val schema2 = StateStoreColFamilySchema( - "mapState", - compositeKeySchema, - new StructType().add("value", StringType), - Some(PrefixKeyScanStateEncoderSpec(compositeKeySchema, 1)), - Option(userKeySchema) - ) - - val inputData = MemoryStream[String] - val result = inputData.toDS() - .groupByKey(x => x) - .transformWithState(new StatefulProcessorWithCompositeTypes(), - TimeMode.None(), - OutputMode.Update()) - - testStream(result, OutputMode.Update())( - StartStream(checkpointLocation = checkpointDir.getCanonicalPath), - AddData(inputData, "a", "b"), - CheckNewAnswer(("a", "1"), ("b", "1")), - Execute { q => - q.lastProgress.runId - val schemaFilePath = fm.list(stateSchemaPath).toSeq.head.getPath - val providerId = StateStoreProviderId(StateStoreId( - checkpointDir.getCanonicalPath, 0, 0), q.lastProgress.runId) - val checker = new StateSchemaCompatibilityChecker(providerId, - hadoopConf, Some(schemaFilePath)) - val colFamilySeq = checker.readSchemaFile() - - assert(TransformWithStateSuiteUtils.NUM_SHUFFLE_PARTITIONS == - q.lastProgress.stateOperators.head.customMetrics.get("numValueStateVars").toInt) - assert(TransformWithStateSuiteUtils.NUM_SHUFFLE_PARTITIONS == - q.lastProgress.stateOperators.head.customMetrics.get("numListStateVars").toInt) - assert(TransformWithStateSuiteUtils.NUM_SHUFFLE_PARTITIONS == - q.lastProgress.stateOperators.head.customMetrics.get("numMapStateVars").toInt) - - assert(colFamilySeq.length == 3) - assert(colFamilySeq.map(_.toString).toSet == Set( - schema0, schema1, schema2 - ).map(_.toString)) - }, - StopStream - ) + Some(NoPrefixKeyStateEncoderSpec(keySchema)), + None + ) + + val userKeySchema = new StructType() 
+ .add("id", IntegerType, false) + .add("name", StringType) + val compositeKeySchema = new StructType() + .add("key", new StructType().add("value", StringType)) + .add("userKey", userKeySchema) + val schema2 = StateStoreColFamilySchema( + "mapState", + compositeKeySchema, + new StructType().add("value", StringType), + Some(PrefixKeyScanStateEncoderSpec(compositeKeySchema, 1)), + Option(userKeySchema) + ) + + val inputData = MemoryStream[String] + val result = inputData.toDS() + .groupByKey(x => x) + .transformWithState(new StatefulProcessorWithCompositeTypes(useImplicits), + TimeMode.None(), + OutputMode.Update()) + + testStream(result, OutputMode.Update())( + StartStream(checkpointLocation = checkpointDir.getCanonicalPath), + AddData(inputData, "a", "b"), + CheckNewAnswer(("a", "1"), ("b", "1")), + Execute { q => + q.lastProgress.runId + val schemaFilePath = fm.list(stateSchemaPath).toSeq.head.getPath + val providerId = StateStoreProviderId(StateStoreId( + checkpointDir.getCanonicalPath, 0, 0), q.lastProgress.runId) + val checker = new StateSchemaCompatibilityChecker(providerId, + hadoopConf, Some(schemaFilePath)) + val colFamilySeq = checker.readSchemaFile() + + assert(TransformWithStateSuiteUtils.NUM_SHUFFLE_PARTITIONS == + q.lastProgress.stateOperators.head.customMetrics.get("numValueStateVars").toInt) + assert(TransformWithStateSuiteUtils.NUM_SHUFFLE_PARTITIONS == + q.lastProgress.stateOperators.head.customMetrics.get("numListStateVars").toInt) + assert(TransformWithStateSuiteUtils.NUM_SHUFFLE_PARTITIONS == + q.lastProgress.stateOperators.head.customMetrics.get("numMapStateVars").toInt) + + assert(colFamilySeq.length == 3) + assert(colFamilySeq.map(_.toString).toSet == Set( + schema0, schema1, schema2 + ).map(_.toString)) + }, + StopStream + ) + } } } } @@ -1605,7 +1626,8 @@ class TransformWithStateSuite extends StateStoreMetricsTest } } - test("transformWithState - verify that schema file is kept after metadata is purged") { + test("transformWithState - 
verify that schema file " + + "is kept after metadata is purged") { withSQLConf(SQLConf.STATE_STORE_PROVIDER_CLASS.key -> classOf[RocksDBStateStoreProvider].getName, SQLConf.SHUFFLE_PARTITIONS.key -> diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/TransformWithStateTTLTest.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/TransformWithStateTTLTest.scala index 2ddf69aa49e04..e1df2d640f1fc 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/TransformWithStateTTLTest.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/TransformWithStateTTLTest.scala @@ -21,7 +21,7 @@ import java.sql.Timestamp import java.time.Duration import org.apache.spark.sql.execution.streaming.MemoryStream -import org.apache.spark.sql.execution.streaming.state.RocksDBStateStoreProvider +import org.apache.spark.sql.execution.streaming.state.{AlsoTestWithEncodingTypes, AlsoTestWithRocksDBFeatures, RocksDBStateStoreProvider} import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.streaming.util.StreamManualClock @@ -41,7 +41,8 @@ case class OutputEvent( * Test suite base for TransformWithState with TTL support. */ abstract class TransformWithStateTTLTest - extends StreamTest { + extends StreamTest with AlsoTestWithRocksDBFeatures + with AlsoTestWithEncodingTypes { import testImplicits._ def getProcessor(ttlConfig: TTLConfig): StatefulProcessor[String, InputEvent, OutputEvent] @@ -143,18 +144,24 @@ abstract class TransformWithStateTTLTest AddData(inputStream, InputEvent("k1", "put", 1)), // advance clock to trigger processing AdvanceManualClock(1 * 1000), + // In the primary index, we should have that k1 -> [(1, 61000)]. + // The TTL index has (61000, k1) -> empty. The min-expiry index has k1 -> 61000. 
CheckNewAnswer(), + // get this state, and make sure we get unexpired value AddData(inputStream, InputEvent("k1", "get", -1)), AdvanceManualClock(1 * 1000), CheckNewAnswer(OutputEvent("k1", 1, isTTLValue = false, -1)), + // ensure ttl values were added correctly AddData(inputStream, InputEvent("k1", "get_ttl_value_from_state", -1)), AdvanceManualClock(1 * 1000), CheckNewAnswer(OutputEvent("k1", 1, isTTLValue = true, 61000)), + AddData(inputStream, InputEvent("k1", "get_values_in_ttl_state", -1)), AdvanceManualClock(1 * 1000), CheckNewAnswer(OutputEvent("k1", -1, isTTLValue = true, 61000)), + // advance clock and update expiration time AdvanceManualClock(30 * 1000), AddData(inputStream, InputEvent("k1", "put", 1)), @@ -162,24 +169,30 @@ abstract class TransformWithStateTTLTest // advance clock to trigger processing AdvanceManualClock(1 * 1000), // validate value is not expired + // + // In the primary index, we still get that k1 -> [(1, 95000)]. + // The TTL index should now have (95000, k1) -> empty, and the min-expiry index + // should have k1 -> 95000. 
CheckNewAnswer(OutputEvent("k1", 1, isTTLValue = false, -1)), + // validate ttl value is updated in the state AddData(inputStream, InputEvent("k1", "get_ttl_value_from_state", -1)), AdvanceManualClock(1 * 1000), CheckNewAnswer(OutputEvent("k1", 1, isTTLValue = true, 95000)), - // validate ttl state has both ttl values present + + // validate ttl state has only the newer ttl value present AddData(inputStream, InputEvent("k1", "get_values_in_ttl_state", -1)), AdvanceManualClock(1 * 1000), - CheckNewAnswer(OutputEvent("k1", -1, isTTLValue = true, 61000), - OutputEvent("k1", -1, isTTLValue = true, 95000) - ), - // advance clock after older expiration value + CheckNewAnswer( OutputEvent("k1", -1, isTTLValue = true, 95000)), + + // advance clock after original expiration value; this shouldn't do anything AdvanceManualClock(30 * 1000), // ensure unexpired value is still present in the state AddData(inputStream, InputEvent("k1", "get", -1)), AdvanceManualClock(1 * 1000), CheckNewAnswer(OutputEvent("k1", 1, isTTLValue = false, -1)), - // validate that the older expiration value is removed from ttl state + + // validate that the ttl index still has the newer value AddData(inputStream, InputEvent("k1", "get_values_in_ttl_state", -1)), AdvanceManualClock(1 * 1000), CheckNewAnswer(OutputEvent("k1", -1, isTTLValue = true, 95000)) @@ -285,4 +298,59 @@ abstract class TransformWithStateTTLTest ) } } + + test("validate that clear only clears the current grouping key") { + withSQLConf(SQLConf.STATE_STORE_PROVIDER_CLASS.key -> + classOf[RocksDBStateStoreProvider].getName, + SQLConf.SHUFFLE_PARTITIONS.key -> "1") { + val inputStream = MemoryStream[InputEvent] + val ttlConfig = TTLConfig(ttlDuration = Duration.ofMinutes(1)) + val result = inputStream.toDS() + .groupByKey(x => x.key) + .transformWithState( + getProcessor(ttlConfig), + TimeMode.ProcessingTime(), + OutputMode.Append()) + + val clock = new StreamManualClock + testStream(result)( + StartStream(Trigger.ProcessingTime("1 
second"), triggerClock = clock), + AddData(inputStream, + InputEvent("k1", "put", 1), + InputEvent("k2", "put", 2), + InputEvent("k3", "put", 3) + ), + // advance clock to trigger processing + AdvanceManualClock(1 * 1000), + CheckNewAnswer(), + + AddData( + inputStream, + InputEvent("k1", "clear", -1), + InputEvent("k1", "get_ttl_value_from_state", -1), + InputEvent("k1", "get_values_in_ttl_state", -1) + ), + // advance clock to trigger processing + AdvanceManualClock(1 * 1000), + CheckNewAnswer(), + + AddData(inputStream, + InputEvent("k2", "get_ttl_value_from_state", -1), + InputEvent("k2", "get_values_in_ttl_state", -1), + + InputEvent("k3", "get_ttl_value_from_state", -1), + InputEvent("k3", "get_values_in_ttl_state", -1) + ), + // advance clock to trigger processing + AdvanceManualClock(1 * 1000), + CheckNewAnswer( + OutputEvent("k2", 2, isTTLValue = true, 61000), + OutputEvent("k2", -1, isTTLValue = true, 61000), + + OutputEvent("k3", 3, isTTLValue = true, 61000), + OutputEvent("k3", -1, isTTLValue = true, 61000) + ) + ) + } + } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/TransformWithValueStateTTLSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/TransformWithValueStateTTLSuite.scala index 21c3beb79314c..4c7f3a06ea7b9 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/TransformWithValueStateTTLSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/TransformWithValueStateTTLSuite.scala @@ -55,10 +55,12 @@ object TTLInputProcessFunction { } else if (row.action == "put") { valueState.update(row.value) } else if (row.action == "get_values_in_ttl_state") { - val ttlValues = valueState.getValuesInTTLState() + val ttlValues = valueState.getValueInTTLState() ttlValues.foreach { v => results = OutputEvent(key, -1, isTTLValue = true, ttlValue = v) :: results } + } else if (row.action == "clear") { + valueState.clear() } results.iterator @@ -76,6 +78,8 @@ object TTLInputProcessFunction { } } 
else if (row.action == "put") { valueState.update(row.value) + } else if (row.action == "clear") { + valueState.clear() } results.iterator @@ -262,7 +266,8 @@ class TransformWithValueStateTTLSuite extends TransformWithStateTTLTest { } } - test("verify StateSchemaV3 writes correct SQL schema of key/value and with TTL") { + test("verify StateSchemaV3 writes correct SQL " + + "schema of key/value and with TTL") { withSQLConf(SQLConf.STATE_STORE_PROVIDER_CLASS.key -> classOf[RocksDBStateStoreProvider].getName, SQLConf.SHUFFLE_PARTITIONS.key -> diff --git a/sql/core/src/test/scala/org/apache/spark/sql/test/SQLTestUtils.scala b/sql/core/src/test/scala/org/apache/spark/sql/test/SQLTestUtils.scala index fe5a0f8ee257a..c93f17701c620 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/test/SQLTestUtils.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/test/SQLTestUtils.scala @@ -41,10 +41,11 @@ import org.apache.spark.sql.catalyst.plans.PlanTest import org.apache.spark.sql.catalyst.plans.PlanTestBase import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.catalyst.util._ +import org.apache.spark.sql.classic.{ClassicConversions, ColumnConversions} import org.apache.spark.sql.execution.FilterExec import org.apache.spark.sql.execution.adaptive.DisableAdaptiveExecution import org.apache.spark.sql.execution.datasources.DataSourceUtils -import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.internal.{ColumnNodeToExpressionConverter, SQLConf} import org.apache.spark.util.ArrayImplicits._ import org.apache.spark.util.UninterruptibleThread import org.apache.spark.util.Utils @@ -239,9 +240,12 @@ private[sql] trait SQLTestUtilsBase * This is because we create the `SparkSession` immediately before the first test is run, * but the implicits import is needed in the constructor. 
*/ - protected object testImplicits extends SQLImplicits { + protected object testImplicits + extends SQLImplicits + with ClassicConversions + with ColumnConversions { override protected def session: SparkSession = self.spark - implicit def toRichColumn(c: Column): SparkSession#RichColumn = session.RichColumn(c) + override protected def converter: ColumnNodeToExpressionConverter = self.spark.converter } protected override def withSQLConf[T](pairs: (String, String)*)(f: => T): T = { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/util/DataFrameCallbackSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/util/DataFrameCallbackSuite.scala index be91f5e789e2c..7e6f10bcc46f0 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/util/DataFrameCallbackSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/util/DataFrameCallbackSuite.scala @@ -22,7 +22,7 @@ import java.lang.{Long => JLong} import scala.collection.mutable.ArrayBuffer import org.apache.spark._ -import org.apache.spark.sql.{functions, Dataset, QueryTest, Row, SparkSession} +import org.apache.spark.sql.{functions, Dataset, Encoder, Encoders, QueryTest, Row, SparkSession} import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference} import org.apache.spark.sql.catalyst.plans.logical.{Aggregate, LogicalPlan, Project} import org.apache.spark.sql.execution.{QueryExecution, WholeStageCodegenExec} @@ -30,6 +30,7 @@ import org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanHelper import org.apache.spark.sql.execution.command.{CreateDataSourceTableAsSelectCommand, LeafRunnableCommand} import org.apache.spark.sql.execution.datasources.InsertIntoHadoopFsRelationCommand import org.apache.spark.sql.execution.datasources.json.JsonFileFormat +import org.apache.spark.sql.expressions.Aggregator import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.test.SharedSparkSession import org.apache.spark.sql.types.StringType @@ -339,6 +340,51 @@ class 
DataFrameCallbackSuite extends QueryTest } } + test("SPARK-50581: support observe with udaf") { + withUserDefinedFunction(("someUdaf", true)) { + spark.udf.register("someUdaf", functions.udaf(new Aggregator[JLong, JLong, JLong] { + def zero: JLong = 0L + def reduce(b: JLong, a: JLong): JLong = a + b + def merge(b1: JLong, b2: JLong): JLong = b1 + b2 + def finish(r: JLong): JLong = r + def bufferEncoder: Encoder[JLong] = Encoders.LONG + def outputEncoder: Encoder[JLong] = Encoders.LONG + })) + + val df = spark.range(100) + + val metricMaps = ArrayBuffer.empty[Map[String, Row]] + val listener = new QueryExecutionListener { + override def onSuccess(funcName: String, qe: QueryExecution, duration: Long): Unit = { + if (qe.observedMetrics.nonEmpty) { + metricMaps += qe.observedMetrics + } + } + + override def onFailure(funcName: String, qe: QueryExecution, exception: Exception): Unit = { + // No-op + } + } + try { + spark.listenerManager.register(listener) + + // udaf usage in observe is not working (serialization exception) + df.observe( + name = "my_metrics", + expr("someUdaf(id)").as("agg") + ) + .collect() + + sparkContext.listenerBus.waitUntilEmpty() + assert(metricMaps.size === 1) + assert(metricMaps.head("my_metrics") === Row(4950L)) + + } finally { + spark.listenerManager.unregister(listener) + } + } + } + private def validateObservedMetrics(df: Dataset[JLong]): Unit = { val metricMaps = ArrayBuffer.empty[Map[String, Row]] val listener = new QueryExecutionListener { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/vectorized/ArrowColumnVectorSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/vectorized/ArrowColumnVectorSuite.scala index 436cea50ad972..9180ce1aee198 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/vectorized/ArrowColumnVectorSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/vectorized/ArrowColumnVectorSuite.scala @@ -515,4 +515,28 @@ class ArrowColumnVectorSuite extends SparkFunSuite { columnVector.close() 
allocator.close() } + + test("struct with TimestampNTZType") { + val allocator = ArrowUtils.rootAllocator.newChildAllocator("struct", 0, Long.MaxValue) + val schema = new StructType().add("ts", TimestampNTZType) + val vector = ArrowUtils.toArrowField("struct", schema, nullable = true, null) + .createVector(allocator).asInstanceOf[StructVector] + vector.allocateNew() + val timestampVector = vector.getChildByOrdinal(0).asInstanceOf[TimeStampMicroVector] + + vector.setIndexDefined(0) + timestampVector.setSafe(0, 1000L) + + timestampVector.setValueCount(1) + vector.setValueCount(1) + + val columnVector = new ArrowColumnVector(vector) + assert(columnVector.dataType === schema) + + val row0 = columnVector.getStruct(0) + assert(row0.get(0, TimestampNTZType) === 1000L) + + columnVector.close() + allocator.close() + } } diff --git a/sql/gen-sql-config-docs.py b/sql/gen-sql-config-docs.py index b69a903b44f90..4db22ff3b8e46 100644 --- a/sql/gen-sql-config-docs.py +++ b/sql/gen-sql-config-docs.py @@ -103,6 +103,14 @@ def generate_sql_configs_table_html(sql_configs, path): ) ) + if config.name == "spark.sql.files.ignoreInvalidPartitionPaths": + description = config.description.replace("<", "<").replace(">", ">") + elif config.name == "spark.sql.hive.quoteHiveStructFieldName": + description = config.description.replace( + "<", "<").replace(">", ">").replace("`", "`") + else: + description = config.description + f.write(dedent( """
@@ -115,7 +123,7 @@ def generate_sql_configs_table_html(sql_configs, path): .format( name=config.name, default=default, - description=markdown.markdown(config.description), + description=markdown.markdown(description), version=config.version ) )) diff --git a/sql/hive-thriftserver/src/test/resources/log4j2.properties b/sql/hive-thriftserver/src/test/resources/log4j2.properties index ebb3a6ccb2fca..e6753047c9055 100644 --- a/sql/hive-thriftserver/src/test/resources/log4j2.properties +++ b/sql/hive-thriftserver/src/test/resources/log4j2.properties @@ -32,12 +32,6 @@ appender.console.filter.1.type = Filters appender.console.filter.1.a.type = ThresholdFilter appender.console.filter.1.a.level = warn -# SPARK-34128: Suppress undesirable TTransportException warnings, due to THRIFT-4805 -appender.console.filter.1.b.type = RegexFilter -appender.console.filter.1.b.regex = .*Thrift error occurred during processing of message.* -appender.console.filter.1.b.onMatch = deny -appender.console.filter.1.b.onMismatch = neutral - #File Appender appender.file.type = File appender.file.name = File @@ -47,14 +41,9 @@ appender.file.layout.pattern = %d{HH:mm:ss.SSS} %t %p %c{1}: %m%n%ex appender.file.filter.1.type = Filters -appender.file.filter.1.a.type = RegexFilter -appender.file.filter.1.a.regex = .*Thrift error occurred during processing of message.* -appender.file.filter.1.a.onMatch = deny -appender.file.filter.1.a.onMismatch = neutral - # Set the logger level of File Appender to WARN -appender.file.filter.1.b.type = ThresholdFilter -appender.file.filter.1.b.level = debug +appender.file.filter.1.a.type = ThresholdFilter +appender.file.filter.1.a.level = debug # Some packages are noisy for no good reason. 
logger.parquet_recordreader.name = org.apache.parquet.hadoop.ParquetRecordReader diff --git a/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/ThriftServerQueryTestSuite.scala b/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/ThriftServerQueryTestSuite.scala index 662f43fc00399..bc367d0cc856b 100644 --- a/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/ThriftServerQueryTestSuite.scala +++ b/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/ThriftServerQueryTestSuite.scala @@ -94,6 +94,7 @@ class ThriftServerQueryTestSuite extends SQLQueryTestSuite with SharedThriftServ // SPARK-28636 "decimalArithmeticOperations.sql", "literals.sql", + "random.sql", "subquery/scalar-subquery/scalar-subquery-predicate.sql", "subquery/in-subquery/in-limit.sql", "subquery/in-subquery/in-group-by.sql", @@ -104,6 +105,7 @@ class ThriftServerQueryTestSuite extends SQLQueryTestSuite with SharedThriftServ "timestampNTZ/datetime-special-ansi.sql", // SPARK-47264 "collations.sql", + "listagg-collations.sql", "pipe-operators.sql", // VARIANT type "variant/named-function-arguments.sql" diff --git a/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/ThriftServerWithSparkContextSuite.scala b/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/ThriftServerWithSparkContextSuite.scala index d7645a3c84692..5152c2193499a 100644 --- a/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/ThriftServerWithSparkContextSuite.scala +++ b/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/ThriftServerWithSparkContextSuite.scala @@ -214,7 +214,7 @@ trait ThriftServerWithSparkContextSuite extends SharedThriftServer { val sessionHandle = client.openSession(user, "") val infoValue = client.getInfo(sessionHandle, GetInfoType.CLI_ODBC_KEYWORDS) // scalastyle:off line.size.limit - assert(infoValue.getStringValue 
== "ADD,AFTER,AGGREGATE,ALL,ALTER,ALWAYS,ANALYZE,AND,ANTI,ANY,ANY_VALUE,ARCHIVE,ARRAY,AS,ASC,AT,AUTHORIZATION,BEGIN,BETWEEN,BIGINT,BINARY,BINDING,BOOLEAN,BOTH,BUCKET,BUCKETS,BY,BYTE,CACHE,CALL,CALLED,CASCADE,CASE,CAST,CATALOG,CATALOGS,CHANGE,CHAR,CHARACTER,CHECK,CLEAR,CLUSTER,CLUSTERED,CODEGEN,COLLATE,COLLATION,COLLECTION,COLUMN,COLUMNS,COMMENT,COMMIT,COMPACT,COMPACTIONS,COMPENSATION,COMPUTE,CONCATENATE,CONSTRAINT,CONTAINS,COST,CREATE,CROSS,CUBE,CURRENT,CURRENT_DATE,CURRENT_TIME,CURRENT_TIMESTAMP,CURRENT_USER,DATA,DATABASE,DATABASES,DATE,DATEADD,DATEDIFF,DATE_ADD,DATE_DIFF,DAY,DAYOFYEAR,DAYS,DBPROPERTIES,DEC,DECIMAL,DECLARE,DEFAULT,DEFINED,DEFINER,DELETE,DELIMITED,DESC,DESCRIBE,DETERMINISTIC,DFS,DIRECTORIES,DIRECTORY,DISTINCT,DISTRIBUTE,DIV,DO,DOUBLE,DROP,ELSE,END,ESCAPE,ESCAPED,EVOLUTION,EXCEPT,EXCHANGE,EXCLUDE,EXECUTE,EXISTS,EXPLAIN,EXPORT,EXTEND,EXTENDED,EXTERNAL,EXTRACT,FALSE,FETCH,FIELDS,FILEFORMAT,FILTER,FIRST,FLOAT,FOLLOWING,FOR,FOREIGN,FORMAT,FORMATTED,FROM,FULL,FUNCTION,FUNCTIONS,GENERATED,GLOBAL,GRANT,GROUP,GROUPING,HAVING,HOUR,HOURS,IDENTIFIER,IDENTITY,IF,IGNORE,ILIKE,IMMEDIATE,IMPORT,IN,INCLUDE,INCREMENT,INDEX,INDEXES,INNER,INPATH,INPUT,INPUTFORMAT,INSERT,INT,INTEGER,INTERSECT,INTERVAL,INTO,INVOKER,IS,ITEMS,ITERATE,JOIN,KEYS,LANGUAGE,LAST,LATERAL,LAZY,LEADING,LEAVE,LEFT,LIKE,LIMIT,LINES,LIST,LOAD,LOCAL,LOCATION,LOCK,LOCKS,LOGICAL,LONG,LOOP,MACRO,MAP,MATCHED,MERGE,MICROSECOND,MICROSECONDS,MILLISECOND,MILLISECONDS,MINUS,MINUTE,MINUTES,MODIFIES,MONTH,MONTHS,MSCK,NAME,NAMESPACE,NAMESPACES,NANOSECOND,NANOSECONDS,NATURAL,NO,NONE,NOT,NULL,NULLS,NUMERIC,OF,OFFSET,ON,ONLY,OPTION,OPTIONS,OR,ORDER,OUT,OUTER,OUTPUTFORMAT,OVER,OVERLAPS,OVERLAY,OVERWRITE,PARTITION,PARTITIONED,PARTITIONS,PERCENT,PIVOT,PLACING,POSITION,PRECEDING,PRIMARY,PRINCIPALS,PROPERTIES,PURGE,QUARTER,QUERY,RANGE,READS,REAL,RECORDREADER,RECORDWRITER,RECOVER,REDUCE,REFERENCES,REFRESH,RENAME,REPAIR,REPEAT,REPEATABLE,REPLACE,RESET,RESPECT,RESTRICT,RETURN,RETURNS,REVOKE,RIGHT,ROLE,ROLES,ROLLBACK,ROLLUP,
ROW,ROWS,SCHEMA,SCHEMAS,SECOND,SECONDS,SECURITY,SELECT,SEMI,SEPARATED,SERDE,SERDEPROPERTIES,SESSION_USER,SET,SETS,SHORT,SHOW,SINGLE,SKEWED,SMALLINT,SOME,SORT,SORTED,SOURCE,SPECIFIC,SQL,START,STATISTICS,STORED,STRATIFY,STRING,STRUCT,SUBSTR,SUBSTRING,SYNC,SYSTEM_TIME,SYSTEM_VERSION,TABLE,TABLES,TABLESAMPLE,TARGET,TBLPROPERTIES,TERMINATED,THEN,TIME,TIMEDIFF,TIMESTAMP,TIMESTAMPADD,TIMESTAMPDIFF,TIMESTAMP_LTZ,TIMESTAMP_NTZ,TINYINT,TO,TOUCH,TRAILING,TRANSACTION,TRANSACTIONS,TRANSFORM,TRIM,TRUE,TRUNCATE,TRY_CAST,TYPE,UNARCHIVE,UNBOUNDED,UNCACHE,UNION,UNIQUE,UNKNOWN,UNLOCK,UNPIVOT,UNSET,UNTIL,UPDATE,USE,USER,USING,VALUES,VAR,VARCHAR,VARIABLE,VARIANT,VERSION,VIEW,VIEWS,VOID,WEEK,WEEKS,WHEN,WHERE,WHILE,WINDOW,WITH,WITHIN,X,YEAR,YEARS,ZONE") + assert(infoValue.getStringValue == "ADD,AFTER,AGGREGATE,ALL,ALTER,ALWAYS,ANALYZE,AND,ANTI,ANY,ANY_VALUE,ARCHIVE,ARRAY,AS,ASC,AT,AUTHORIZATION,BEGIN,BETWEEN,BIGINT,BINARY,BINDING,BOOLEAN,BOTH,BUCKET,BUCKETS,BY,BYTE,CACHE,CALL,CALLED,CASCADE,CASE,CAST,CATALOG,CATALOGS,CHANGE,CHAR,CHARACTER,CHECK,CLEAR,CLUSTER,CLUSTERED,CODEGEN,COLLATE,COLLATION,COLLECTION,COLUMN,COLUMNS,COMMENT,COMMIT,COMPACT,COMPACTIONS,COMPENSATION,COMPUTE,CONCATENATE,CONSTRAINT,CONTAINS,COST,CREATE,CROSS,CUBE,CURRENT,CURRENT_DATE,CURRENT_TIME,CURRENT_TIMESTAMP,CURRENT_USER,DATA,DATABASE,DATABASES,DATE,DATEADD,DATEDIFF,DATE_ADD,DATE_DIFF,DAY,DAYOFYEAR,DAYS,DBPROPERTIES,DEC,DECIMAL,DECLARE,DEFAULT,DEFINED,DEFINER,DELETE,DELIMITED,DESC,DESCRIBE,DETERMINISTIC,DFS,DIRECTORIES,DIRECTORY,DISTINCT,DISTRIBUTE,DIV,DO,DOUBLE,DROP,ELSE,END,ESCAPE,ESCAPED,EVOLUTION,EXCEPT,EXCHANGE,EXCLUDE,EXECUTE,EXISTS,EXPLAIN,EXPORT,EXTEND,EXTENDED,EXTERNAL,EXTRACT,FALSE,FETCH,FIELDS,FILEFORMAT,FILTER,FIRST,FLOAT,FOLLOWING,FOR,FOREIGN,FORMAT,FORMATTED,FROM,FULL,FUNCTION,FUNCTIONS,GENERATED,GLOBAL,GRANT,GROUP,GROUPING,HAVING,HOUR,HOURS,IDENTIFIER,IDENTITY,IF,IGNORE,ILIKE,IMMEDIATE,IMPORT,IN,INCLUDE,INCREMENT,INDEX,INDEXES,INNER,INPATH,INPUT,INPUTFORMAT,INSERT,INT,INTEGER,INTERSECT,INTERVAL,INTO,INV
OKER,IS,ITEMS,ITERATE,JOIN,JSON,KEYS,LANGUAGE,LAST,LATERAL,LAZY,LEADING,LEAVE,LEFT,LIKE,LIMIT,LINES,LIST,LOAD,LOCAL,LOCATION,LOCK,LOCKS,LOGICAL,LONG,LOOP,MACRO,MAP,MATCHED,MERGE,MICROSECOND,MICROSECONDS,MILLISECOND,MILLISECONDS,MINUS,MINUTE,MINUTES,MODIFIES,MONTH,MONTHS,MSCK,NAME,NAMESPACE,NAMESPACES,NANOSECOND,NANOSECONDS,NATURAL,NO,NONE,NOT,NULL,NULLS,NUMERIC,OF,OFFSET,ON,ONLY,OPTION,OPTIONS,OR,ORDER,OUT,OUTER,OUTPUTFORMAT,OVER,OVERLAPS,OVERLAY,OVERWRITE,PARTITION,PARTITIONED,PARTITIONS,PERCENT,PIVOT,PLACING,POSITION,PRECEDING,PRIMARY,PRINCIPALS,PROPERTIES,PURGE,QUARTER,QUERY,RANGE,READS,REAL,RECORDREADER,RECORDWRITER,RECOVER,RECURSIVE,REDUCE,REFERENCES,REFRESH,RENAME,REPAIR,REPEAT,REPEATABLE,REPLACE,RESET,RESPECT,RESTRICT,RETURN,RETURNS,REVOKE,RIGHT,ROLE,ROLES,ROLLBACK,ROLLUP,ROW,ROWS,SCHEMA,SCHEMAS,SECOND,SECONDS,SECURITY,SELECT,SEMI,SEPARATED,SERDE,SERDEPROPERTIES,SESSION_USER,SET,SETS,SHORT,SHOW,SINGLE,SKEWED,SMALLINT,SOME,SORT,SORTED,SOURCE,SPECIFIC,SQL,START,STATISTICS,STORED,STRATIFY,STRING,STRUCT,SUBSTR,SUBSTRING,SYNC,SYSTEM_TIME,SYSTEM_VERSION,TABLE,TABLES,TABLESAMPLE,TARGET,TBLPROPERTIES,TERMINATED,THEN,TIME,TIMEDIFF,TIMESTAMP,TIMESTAMPADD,TIMESTAMPDIFF,TIMESTAMP_LTZ,TIMESTAMP_NTZ,TINYINT,TO,TOUCH,TRAILING,TRANSACTION,TRANSACTIONS,TRANSFORM,TRIM,TRUE,TRUNCATE,TRY_CAST,TYPE,UNARCHIVE,UNBOUNDED,UNCACHE,UNION,UNIQUE,UNKNOWN,UNLOCK,UNPIVOT,UNSET,UNTIL,UPDATE,USE,USER,USING,VALUES,VAR,VARCHAR,VARIABLE,VARIANT,VERSION,VIEW,VIEWS,VOID,WEEK,WEEKS,WHEN,WHERE,WHILE,WINDOW,WITH,WITHIN,X,YEAR,YEARS,ZONE") // scalastyle:on line.size.limit } } diff --git a/sql/hive/benchmarks/InsertIntoHiveTableBenchmark-hive2.3-results.txt b/sql/hive/benchmarks/InsertIntoHiveTableBenchmark-hive2.3-results.txt index 6e8c140c72dcc..4c44860c4618a 100644 --- a/sql/hive/benchmarks/InsertIntoHiveTableBenchmark-hive2.3-results.txt +++ b/sql/hive/benchmarks/InsertIntoHiveTableBenchmark-hive2.3-results.txt @@ -1,11 +1,11 @@ -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure 
+OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor insert hive table benchmark: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -INSERT INTO DYNAMIC 3695 3849 218 0.0 360836.1 1.0X -INSERT INTO HYBRID 536 551 17 0.0 52374.2 6.9X -INSERT INTO STATIC 151 177 15 0.1 14737.4 24.5X -INSERT OVERWRITE DYNAMIC 3057 3228 241 0.0 298536.0 1.2X -INSERT OVERWRITE HYBRID 455 467 15 0.0 44443.5 8.1X -INSERT OVERWRITE STATIC 173 180 4 0.1 16911.3 21.3X +INSERT INTO DYNAMIC 3480 3775 417 0.0 339817.0 1.0X +INSERT INTO HYBRID 562 581 13 0.0 54901.2 6.2X +INSERT INTO STATIC 157 174 14 0.1 15316.1 22.2X +INSERT OVERWRITE DYNAMIC 2961 3195 331 0.0 289121.3 1.2X +INSERT OVERWRITE HYBRID 426 431 6 0.0 41557.2 8.2X +INSERT OVERWRITE STATIC 161 168 5 0.1 15682.4 21.7X diff --git a/sql/hive/benchmarks/InsertIntoHiveTableBenchmark-jdk21-hive2.3-results.txt b/sql/hive/benchmarks/InsertIntoHiveTableBenchmark-jdk21-hive2.3-results.txt index 7a901f75ddb35..38e3b10eb5d00 100644 --- a/sql/hive/benchmarks/InsertIntoHiveTableBenchmark-jdk21-hive2.3-results.txt +++ b/sql/hive/benchmarks/InsertIntoHiveTableBenchmark-jdk21-hive2.3-results.txt @@ -1,11 +1,11 @@ -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor insert hive table benchmark: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -INSERT INTO DYNAMIC 3762 3968 292 0.0 367406.9 1.0X -INSERT INTO HYBRID 516 591 80 0.0 50355.2 7.3X -INSERT INTO STATIC 168 192 24 0.1 16403.7 22.4X -INSERT OVERWRITE DYNAMIC 3524 3643 169 0.0 344143.1 1.1X -INSERT OVERWRITE HYBRID 493 510 13 0.0 48137.8 7.6X -INSERT OVERWRITE STATIC 
178 190 14 0.1 17346.8 21.2X +INSERT INTO DYNAMIC 3406 3754 493 0.0 332568.8 1.0X +INSERT INTO HYBRID 496 523 22 0.0 48481.9 6.9X +INSERT INTO STATIC 146 168 20 0.1 14228.9 23.4X +INSERT OVERWRITE DYNAMIC 3031 3148 166 0.0 295998.0 1.1X +INSERT OVERWRITE HYBRID 419 444 26 0.0 40901.7 8.1X +INSERT OVERWRITE STATIC 158 169 12 0.1 15420.2 21.6X diff --git a/sql/hive/benchmarks/ObjectHashAggregateExecBenchmark-jdk21-results.txt b/sql/hive/benchmarks/ObjectHashAggregateExecBenchmark-jdk21-results.txt index f185c50f929bf..9f7cd0bfd8762 100644 --- a/sql/hive/benchmarks/ObjectHashAggregateExecBenchmark-jdk21-results.txt +++ b/sql/hive/benchmarks/ObjectHashAggregateExecBenchmark-jdk21-results.txt @@ -2,44 +2,44 @@ Hive UDAF vs Spark AF ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor hive udaf vs spark af: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -hive udaf w/o group by 3232 3292 46 0.0 49313.1 1.0X -spark af w/o group by 20 26 4 3.3 303.1 162.7X -hive udaf w/ group by 2002 2055 40 0.0 30540.8 1.6X -spark af w/ group by w/o fallback 22 25 3 3.0 334.3 147.5X -spark af w/ group by w/ fallback 25 27 3 2.7 376.5 131.0X +hive udaf w/o group by 3071 3289 131 0.0 46855.4 1.0X +spark af w/o group by 21 27 5 3.2 315.4 148.6X +hive udaf w/ group by 2138 2161 26 0.0 32618.6 1.4X +spark af w/ group by w/o fallback 22 26 5 3.0 338.3 138.5X +spark af w/ group by w/ fallback 26 30 7 2.5 395.2 118.6X ================================================================================================ ObjectHashAggregateExec vs SortAggregateExec - typed_count 
================================================================================================ -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor object agg v.s. sort agg: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -sort agg w/ group by 23962 24276 445 4.4 228.5 1.0X -object agg w/ group by w/o fallback 7346 7389 41 14.3 70.1 3.3X -object agg w/ group by w/ fallback 15904 16415 443 6.6 151.7 1.5X -sort agg w/o group by 4041 4060 17 26.0 38.5 5.9X -object agg w/o group by w/o fallback 3872 3914 42 27.1 36.9 6.2X +sort agg w/ group by 23012 23051 55 4.6 219.5 1.0X +object agg w/ group by w/o fallback 6670 7292 278 15.7 63.6 3.5X +object agg w/ group by w/ fallback 15467 15512 48 6.8 147.5 1.5X +sort agg w/o group by 4075 4142 34 25.7 38.9 5.6X +object agg w/o group by w/o fallback 3715 3810 67 28.2 35.4 6.2X ================================================================================================ ObjectHashAggregateExec vs SortAggregateExec - percentile_approx ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor object agg v.s. 
sort agg: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -sort agg w/ group by 417 449 16 5.0 198.9 1.0X -object agg w/ group by w/o fallback 328 339 5 6.4 156.5 1.3X -object agg w/ group by w/ fallback 467 501 15 4.5 222.4 0.9X -sort agg w/o group by 274 283 6 7.6 130.8 1.5X -object agg w/o group by w/o fallback 271 277 3 7.7 129.3 1.5X +sort agg w/ group by 412 436 14 5.1 196.4 1.0X +object agg w/ group by w/o fallback 324 333 6 6.5 154.7 1.3X +object agg w/ group by w/ fallback 414 421 7 5.1 197.3 1.0X +sort agg w/o group by 238 242 3 8.8 113.5 1.7X +object agg w/o group by w/o fallback 227 234 6 9.2 108.4 1.8X diff --git a/sql/hive/benchmarks/ObjectHashAggregateExecBenchmark-results.txt b/sql/hive/benchmarks/ObjectHashAggregateExecBenchmark-results.txt index fb426c84414ba..1e143f39fbf91 100644 --- a/sql/hive/benchmarks/ObjectHashAggregateExecBenchmark-results.txt +++ b/sql/hive/benchmarks/ObjectHashAggregateExecBenchmark-results.txt @@ -2,44 +2,44 @@ Hive UDAF vs Spark AF ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor hive udaf vs spark af: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -hive udaf w/o group by 3271 3305 29 0.0 49904.4 1.0X -spark af w/o group by 21 26 4 3.2 316.4 157.7X -hive udaf w/ group by 2070 2109 30 0.0 31591.0 1.6X -spark af w/ group by w/o fallback 22 26 3 3.0 335.8 148.6X -spark af w/ group by w/ fallback 25 27 3 2.6 379.4 131.5X +hive udaf w/o group by 3797 3861 45 0.0 57941.7 1.0X +spark af w/o group by 21 27 5 3.2 314.9 184.0X +hive udaf w/ 
group by 2569 2575 11 0.0 39194.8 1.5X +spark af w/ group by w/o fallback 22 26 3 3.0 333.7 173.6X +spark af w/ group by w/ fallback 25 28 3 2.6 388.1 149.3X ================================================================================================ ObjectHashAggregateExec vs SortAggregateExec - typed_count ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor object agg v.s. sort agg: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -sort agg w/ group by 24310 24337 39 4.3 231.8 1.0X -object agg w/ group by w/o fallback 6916 7223 137 15.2 66.0 3.5X -object agg w/ group by w/ fallback 14558 14693 128 7.2 138.8 1.7X -sort agg w/o group by 4079 4125 48 25.7 38.9 6.0X -object agg w/o group by w/o fallback 3577 3608 22 29.3 34.1 6.8X +sort agg w/ group by 24523 24678 220 4.3 233.9 1.0X +object agg w/ group by w/o fallback 6979 7355 177 15.0 66.6 3.5X +object agg w/ group by w/ fallback 14572 14619 33 7.2 139.0 1.7X +sort agg w/o group by 4265 4283 19 24.6 40.7 5.7X +object agg w/o group by w/o fallback 3614 3660 29 29.0 34.5 6.8X ================================================================================================ ObjectHashAggregateExec vs SortAggregateExec - percentile_approx ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor object agg v.s. 
sort agg: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -sort agg w/ group by 403 412 6 5.2 192.3 1.0X -object agg w/ group by w/o fallback 341 347 5 6.1 162.7 1.2X -object agg w/ group by w/ fallback 469 473 4 4.5 223.6 0.9X -sort agg w/o group by 304 310 4 6.9 144.9 1.3X -object agg w/o group by w/o fallback 297 305 3 7.1 141.4 1.4X +sort agg w/ group by 413 422 7 5.1 196.8 1.0X +object agg w/ group by w/o fallback 335 343 4 6.3 159.6 1.2X +object agg w/ group by w/ fallback 446 453 5 4.7 212.7 0.9X +sort agg w/o group by 274 280 4 7.7 130.6 1.5X +object agg w/o group by w/o fallback 266 273 4 7.9 126.9 1.6X diff --git a/sql/hive/benchmarks/OrcReadBenchmark-jdk21-results.txt b/sql/hive/benchmarks/OrcReadBenchmark-jdk21-results.txt index b941571563401..25ba0a0602b47 100644 --- a/sql/hive/benchmarks/OrcReadBenchmark-jdk21-results.txt +++ b/sql/hive/benchmarks/OrcReadBenchmark-jdk21-results.txt @@ -2,221 +2,221 @@ SQL Single Numeric Column Scan ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor SQL Single TINYINT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Hive built-in ORC 675 696 17 23.3 42.9 1.0X -Native ORC MR 745 759 24 21.1 47.3 0.9X -Native ORC Vectorized 91 118 9 172.4 5.8 7.4X +Hive built-in ORC 711 756 43 22.1 45.2 1.0X +Native ORC MR 762 842 92 20.7 48.4 0.9X +Native ORC Vectorized 94 115 17 167.8 6.0 7.6X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 
64-Core Processor SQL Single SMALLINT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Hive built-in ORC 680 728 47 23.1 43.3 1.0X -Native ORC MR 726 755 25 21.7 46.1 0.9X -Native ORC Vectorized 83 99 11 190.0 5.3 8.2X +Hive built-in ORC 693 722 32 22.7 44.0 1.0X +Native ORC MR 738 767 35 21.3 46.9 0.9X +Native ORC Vectorized 81 100 15 193.2 5.2 8.5X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor SQL Single INT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Hive built-in ORC 696 716 28 22.6 44.3 1.0X -Native ORC MR 741 766 32 21.2 47.1 0.9X -Native ORC Vectorized 86 98 12 181.9 5.5 8.0X +Hive built-in ORC 776 792 27 20.3 49.3 1.0X +Native ORC MR 895 907 18 17.6 56.9 0.9X +Native ORC Vectorized 102 120 14 154.7 6.5 7.6X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor SQL Single BIGINT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Hive built-in ORC 720 729 14 21.9 45.8 1.0X -Native ORC MR 766 783 16 20.5 48.7 0.9X -Native ORC Vectorized 92 108 11 171.7 5.8 7.9X +Hive built-in ORC 831 857 34 18.9 52.9 1.0X +Native ORC MR 938 996 55 16.8 59.6 0.9X +Native ORC Vectorized 100 116 22 157.1 6.4 8.3X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor SQL Single FLOAT Column Scan: 
Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Hive built-in ORC 754 792 65 20.9 47.9 1.0X -Native ORC MR 861 879 27 18.3 54.7 0.9X -Native ORC Vectorized 147 164 13 107.3 9.3 5.1X +Hive built-in ORC 768 806 36 20.5 48.8 1.0X +Native ORC MR 950 972 25 16.6 60.4 0.8X +Native ORC Vectorized 139 160 34 113.4 8.8 5.5X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor SQL Single DOUBLE Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Hive built-in ORC 826 833 6 19.0 52.5 1.0X -Native ORC MR 947 975 43 16.6 60.2 0.9X -Native ORC Vectorized 218 234 24 72.0 13.9 3.8X +Hive built-in ORC 914 959 38 17.2 58.1 1.0X +Native ORC MR 994 1007 18 15.8 63.2 0.9X +Native ORC Vectorized 223 240 31 70.6 14.2 4.1X ================================================================================================ Int and String Scan ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Int and String Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Hive built-in ORC 1632 1653 30 6.4 155.6 1.0X -Native ORC MR 1523 1528 8 6.9 145.2 1.1X -Native ORC Vectorized 610 643 24 17.2 58.2 2.7X +Hive built-in ORC 1770 1819 69 5.9 168.8 1.0X +Native ORC MR 1606 1611 6 6.5 153.2 1.1X +Native ORC Vectorized 606 646 44 17.3 57.8 2.9X 
================================================================================================ Partitioned Table Scan ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Partitioned Table: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Data column - Hive built-in ORC 937 953 14 16.8 59.6 1.0X -Data column - Native ORC MR 988 1040 73 15.9 62.8 0.9X -Data column - Native ORC Vectorized 89 107 13 177.2 5.6 10.6X -Partition column - Hive built-in ORC 640 690 55 24.6 40.7 1.5X -Partition column - Native ORC MR 695 708 16 22.6 44.2 1.3X -Partition column - Native ORC Vectorized 38 49 9 416.8 2.4 24.8X -Both columns - Hive built-in ORC 978 1015 42 16.1 62.2 1.0X -Both columns - Native ORC MR 1055 1076 29 14.9 67.1 0.9X -Both columns - Native ORC Vectorized 102 125 24 153.8 6.5 9.2X +Data column - Hive built-in ORC 989 1049 85 15.9 62.8 1.0X +Data column - Native ORC MR 1076 1078 2 14.6 68.4 0.9X +Data column - Native ORC Vectorized 103 143 29 152.9 6.5 9.6X +Partition column - Hive built-in ORC 648 687 35 24.3 41.2 1.5X +Partition column - Native ORC MR 680 716 32 23.1 43.2 1.5X +Partition column - Native ORC Vectorized 36 55 17 431.5 2.3 27.1X +Both columns - Hive built-in ORC 993 1019 38 15.8 63.1 1.0X +Both columns - Native ORC MR 1137 1173 51 13.8 72.3 0.9X +Both columns - Native ORC Vectorized 138 188 25 114.1 8.8 7.2X ================================================================================================ Repeated String Scan ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 
21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Repeated String: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Hive built-in ORC 928 944 14 11.3 88.5 1.0X -Native ORC MR 711 733 25 14.8 67.8 1.3X -Native ORC Vectorized 127 139 19 82.9 12.1 7.3X +Hive built-in ORC 947 974 24 11.1 90.3 1.0X +Native ORC MR 934 950 18 11.2 89.0 1.0X +Native ORC Vectorized 127 132 6 82.8 12.1 7.5X ================================================================================================ String with Nulls Scan ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor String with Nulls Scan (0.0%): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Hive built-in ORC 1539 1597 83 6.8 146.7 1.0X -Native ORC MR 1223 1232 12 8.6 116.7 1.3X -Native ORC Vectorized 286 320 27 36.6 27.3 5.4X +Hive built-in ORC 1476 1489 17 7.1 140.8 1.0X +Native ORC MR 1310 1328 25 8.0 125.0 1.1X +Native ORC Vectorized 308 350 29 34.1 29.3 4.8X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor String with Nulls Scan (50.0%): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Hive built-in ORC 1381 1397 22 7.6 131.7 1.0X -Native ORC MR 1112 1124 17 9.4 106.0 1.2X -Native ORC Vectorized 363 394 30 28.9 34.6 3.8X +Hive built-in ORC 1258 1259 2 8.3 119.9 1.0X 
+Native ORC MR 1168 1173 7 9.0 111.4 1.1X +Native ORC Vectorized 362 408 50 29.0 34.5 3.5X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor String with Nulls Scan (95.0%): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Hive built-in ORC 733 751 24 14.3 69.9 1.0X -Native ORC MR 742 771 48 14.1 70.8 1.0X -Native ORC Vectorized 148 171 26 70.8 14.1 5.0X +Hive built-in ORC 749 774 43 14.0 71.4 1.0X +Native ORC MR 797 830 51 13.2 76.0 0.9X +Native ORC Vectorized 148 168 22 71.1 14.1 5.1X ================================================================================================ Single Column Scan From Wide Columns ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Single Column Scan from 100 columns: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Hive built-in ORC 562 588 25 1.9 536.0 1.0X -Native ORC MR 87 109 15 12.0 83.3 6.4X -Native ORC Vectorized 30 37 6 34.9 28.7 18.7X +Hive built-in ORC 558 611 65 1.9 532.2 1.0X +Native ORC MR 90 110 25 11.7 85.5 6.2X +Native ORC Vectorized 32 40 10 33.1 30.2 17.6X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Single Column Scan from 200 columns: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
------------------------------------------------------------------------------------------------------------------------ -Hive built-in ORC 1022 1040 26 1.0 974.3 1.0X -Native ORC MR 100 114 11 10.5 95.2 10.2X -Native ORC Vectorized 37 44 7 28.6 35.0 27.8X +Hive built-in ORC 1029 1033 5 1.0 981.4 1.0X +Native ORC MR 98 117 20 10.7 93.2 10.5X +Native ORC Vectorized 39 50 9 26.7 37.5 26.2X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Single Column Scan from 300 columns: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Hive built-in ORC 1522 1617 134 0.7 1451.1 1.0X -Native ORC MR 104 114 9 10.1 99.4 14.6X -Native ORC Vectorized 49 65 12 21.4 46.7 31.1X +Hive built-in ORC 1512 1536 35 0.7 1441.8 1.0X +Native ORC MR 106 128 17 9.9 101.1 14.3X +Native ORC Vectorized 46 67 17 22.9 43.7 33.0X ================================================================================================ Struct scan ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Single Struct Column Scan with 10 Fields: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Hive built-in ORC 285 321 35 3.7 272.0 1.0X -Native ORC MR 208 274 55 5.1 198.0 1.4X -Native ORC Vectorized 97 119 25 10.8 92.8 2.9X +Hive built-in ORC 324 369 49 3.2 309.0 1.0X +Native ORC MR 213 245 34 4.9 203.5 1.5X +Native ORC Vectorized 99 118 19 10.6 94.1 3.3X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 
64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Single Struct Column Scan with 100 Fields: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------- -Hive built-in ORC 1963 2005 59 0.5 1871.9 1.0X -Native ORC MR 1612 1677 92 0.7 1537.5 1.2X -Native ORC Vectorized 859 944 92 1.2 819.4 2.3X +Hive built-in ORC 2169 2204 48 0.5 2069.0 1.0X +Native ORC MR 1765 1841 107 0.6 1683.6 1.2X +Native ORC Vectorized 858 940 100 1.2 818.1 2.5X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Single Struct Column Scan with 300 Fields: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------- -Hive built-in ORC 5793 5868 107 0.2 5524.2 1.0X -Native ORC MR 5247 5321 105 0.2 5003.5 1.1X -Native ORC Vectorized 5404 5425 30 0.2 5153.5 1.1X +Hive built-in ORC 6111 6228 166 0.2 5828.0 1.0X +Native ORC MR 5474 5540 93 0.2 5220.3 1.1X +Native ORC Vectorized 5605 5658 74 0.2 5345.5 1.1X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Single Struct Column Scan with 600 Fields: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------- -Hive built-in ORC 12664 12690 37 0.1 12077.5 1.0X -Native ORC MR 12398 12513 162 0.1 11823.9 1.0X -Native ORC Vectorized 12552 12553 1 0.1 11970.4 1.0X +Hive built-in ORC 13063 13093 42 0.1 12458.1 1.0X +Native ORC MR 12754 12782 39 0.1 12163.1 1.0X +Native ORC Vectorized 13004 13082 111 0.1 12401.2 1.0X 
================================================================================================ Nested Struct scan ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Nested Struct Scan with 10 Elements, 10 Fields: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------ -Hive built-in ORC 1981 2003 30 0.5 1889.3 1.0X -Native ORC MR 2095 2133 54 0.5 1997.9 0.9X -Native ORC Vectorized 564 605 45 1.9 537.6 3.5X +Hive built-in ORC 2130 2182 73 0.5 2031.7 1.0X +Native ORC MR 2179 2290 156 0.5 2078.2 1.0X +Native ORC Vectorized 568 575 7 1.8 541.9 3.7X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Nested Struct Scan with 30 Elements, 10 Fields: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------ -Hive built-in ORC 5412 5426 21 0.2 5161.0 1.0X -Native ORC MR 4556 4639 117 0.2 4345.2 1.2X -Native ORC Vectorized 1478 1506 39 0.7 1409.7 3.7X +Hive built-in ORC 5890 5894 5 0.2 5617.5 1.0X +Native ORC MR 5089 5121 45 0.2 4853.2 1.2X +Native ORC Vectorized 1512 1550 53 0.7 1442.1 3.9X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Nested Struct Scan with 10 Elements, 30 Fields: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
------------------------------------------------------------------------------------------------------------------------------ -Hive built-in ORC 5018 5079 87 0.2 4785.1 1.0X -Native ORC MR 5380 5388 11 0.2 5130.5 0.9X -Native ORC Vectorized 1975 2012 52 0.5 1883.8 2.5X +Hive built-in ORC 5276 5277 2 0.2 5031.7 1.0X +Native ORC MR 5272 5293 29 0.2 5027.8 1.0X +Native ORC Vectorized 1906 1913 9 0.6 1818.0 2.8X diff --git a/sql/hive/benchmarks/OrcReadBenchmark-results.txt b/sql/hive/benchmarks/OrcReadBenchmark-results.txt index 64d738858b1a2..7eca721b2d23d 100644 --- a/sql/hive/benchmarks/OrcReadBenchmark-results.txt +++ b/sql/hive/benchmarks/OrcReadBenchmark-results.txt @@ -2,221 +2,221 @@ SQL Single Numeric Column Scan ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor SQL Single TINYINT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Hive built-in ORC 724 754 46 21.7 46.0 1.0X -Native ORC MR 838 865 38 18.8 53.3 0.9X -Native ORC Vectorized 83 104 10 188.5 5.3 8.7X +Hive built-in ORC 738 797 51 21.3 46.9 1.0X +Native ORC MR 814 860 41 19.3 51.8 0.9X +Native ORC Vectorized 112 127 13 140.1 7.1 6.6X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor SQL Single SMALLINT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Hive built-in ORC 709 746 43 22.2 45.1 1.0X -Native ORC MR 791 822 28 19.9 50.3 0.9X -Native ORC Vectorized 85 101 9 184.6 5.4 
8.3X +Hive built-in ORC 640 730 82 24.6 40.7 1.0X +Native ORC MR 713 744 35 22.0 45.4 0.9X +Native ORC Vectorized 91 110 15 173.3 5.8 7.1X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor SQL Single INT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Hive built-in ORC 766 777 17 20.5 48.7 1.0X -Native ORC MR 772 801 25 20.4 49.1 1.0X -Native ORC Vectorized 89 98 6 177.0 5.7 8.6X +Hive built-in ORC 673 679 10 23.4 42.8 1.0X +Native ORC MR 787 816 29 20.0 50.0 0.9X +Native ORC Vectorized 91 103 9 172.3 5.8 7.4X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor SQL Single BIGINT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Hive built-in ORC 755 762 6 20.8 48.0 1.0X -Native ORC MR 811 818 10 19.4 51.6 0.9X -Native ORC Vectorized 87 101 11 181.7 5.5 8.7X +Hive built-in ORC 648 662 11 24.3 41.2 1.0X +Native ORC MR 749 768 20 21.0 47.6 0.9X +Native ORC Vectorized 88 103 11 178.9 5.6 7.4X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor SQL Single FLOAT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Hive built-in ORC 775 794 18 20.3 49.3 1.0X -Native ORC MR 847 857 9 18.6 53.9 0.9X -Native ORC Vectorized 141 157 17 111.6 9.0 5.5X +Hive built-in ORC 715 728 23 22.0 45.4 
1.0X +Native ORC MR 785 801 14 20.0 49.9 0.9X +Native ORC Vectorized 140 149 8 112.4 8.9 5.1X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor SQL Single DOUBLE Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Hive built-in ORC 867 875 7 18.1 55.1 1.0X -Native ORC MR 914 940 22 17.2 58.1 0.9X -Native ORC Vectorized 219 232 15 71.8 13.9 4.0X +Hive built-in ORC 785 794 9 20.0 49.9 1.0X +Native ORC MR 871 899 43 18.1 55.4 0.9X +Native ORC Vectorized 221 239 24 71.2 14.0 3.6X ================================================================================================ Int and String Scan ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Int and String Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Hive built-in ORC 1588 1596 12 6.6 151.4 1.0X -Native ORC MR 1563 1567 6 6.7 149.1 1.0X -Native ORC Vectorized 628 676 63 16.7 59.8 2.5X +Hive built-in ORC 1494 1514 28 7.0 142.5 1.0X +Native ORC MR 1427 1433 8 7.3 136.1 1.0X +Native ORC Vectorized 602 610 12 17.4 57.4 2.5X ================================================================================================ Partitioned Table Scan ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor 
Partitioned Table: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Data column - Hive built-in ORC 1117 1162 63 14.1 71.0 1.0X -Data column - Native ORC MR 1293 1306 17 12.2 82.2 0.9X -Data column - Native ORC Vectorized 91 103 10 173.7 5.8 12.3X -Partition column - Hive built-in ORC 717 722 7 21.9 45.6 1.6X -Partition column - Native ORC MR 633 673 42 24.8 40.3 1.8X -Partition column - Native ORC Vectorized 37 50 7 419.5 2.4 29.8X -Both columns - Hive built-in ORC 948 1010 69 16.6 60.3 1.2X -Both columns - Native ORC MR 1102 1109 10 14.3 70.1 1.0X -Both columns - Native ORC Vectorized 105 121 13 149.7 6.7 10.6X +Data column - Hive built-in ORC 803 826 24 19.6 51.0 1.0X +Data column - Native ORC MR 910 941 39 17.3 57.9 0.9X +Data column - Native ORC Vectorized 91 105 11 172.8 5.8 8.8X +Partition column - Hive built-in ORC 589 612 19 26.7 37.5 1.4X +Partition column - Native ORC MR 616 640 20 25.5 39.2 1.3X +Partition column - Native ORC Vectorized 37 49 8 422.4 2.4 21.6X +Both columns - Hive built-in ORC 935 945 12 16.8 59.4 0.9X +Both columns - Native ORC MR 994 1000 9 15.8 63.2 0.8X +Both columns - Native ORC Vectorized 103 123 15 153.0 6.5 7.8X ================================================================================================ Repeated String Scan ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Repeated String: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Hive built-in ORC 904 909 5 11.6 86.2 1.0X -Native ORC MR 804 812 7 13.0 76.7 1.1X -Native ORC 
Vectorized 128 148 19 82.0 12.2 7.1X +Hive built-in ORC 830 846 27 12.6 79.1 1.0X +Native ORC MR 747 751 3 14.0 71.3 1.1X +Native ORC Vectorized 131 147 15 80.0 12.5 6.3X ================================================================================================ String with Nulls Scan ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor String with Nulls Scan (0.0%): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Hive built-in ORC 1429 1453 33 7.3 136.3 1.0X -Native ORC MR 1288 1291 4 8.1 122.9 1.1X -Native ORC Vectorized 294 300 6 35.7 28.0 4.9X +Hive built-in ORC 1475 1479 5 7.1 140.7 1.0X +Native ORC MR 1230 1236 7 8.5 117.3 1.2X +Native ORC Vectorized 294 311 21 35.6 28.1 5.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor String with Nulls Scan (50.0%): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Hive built-in ORC 1282 1290 13 8.2 122.2 1.0X -Native ORC MR 1195 1199 7 8.8 113.9 1.1X -Native ORC Vectorized 346 382 45 30.3 33.0 3.7X +Hive built-in ORC 1243 1244 2 8.4 118.5 1.0X +Native ORC MR 1157 1166 13 9.1 110.3 1.1X +Native ORC Vectorized 349 377 20 30.0 33.3 3.6X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor String with Nulls Scan (95.0%): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
------------------------------------------------------------------------------------------------------------------------ -Hive built-in ORC 771 803 29 13.6 73.5 1.0X -Native ORC MR 776 784 12 13.5 74.0 1.0X -Native ORC Vectorized 149 166 13 70.4 14.2 5.2X +Hive built-in ORC 689 715 35 15.2 65.7 1.0X +Native ORC MR 762 767 5 13.8 72.6 0.9X +Native ORC Vectorized 149 174 20 70.5 14.2 4.6X ================================================================================================ Single Column Scan From Wide Columns ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Single Column Scan from 100 columns: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Hive built-in ORC 400 431 29 2.6 381.3 1.0X -Native ORC MR 89 102 11 11.8 84.9 4.5X -Native ORC Vectorized 32 38 6 33.3 30.1 12.7X +Hive built-in ORC 447 510 71 2.3 426.4 1.0X +Native ORC MR 86 101 11 12.1 82.4 5.2X +Native ORC Vectorized 32 38 6 33.0 30.3 14.1X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Single Column Scan from 200 columns: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Hive built-in ORC 742 748 8 1.4 707.9 1.0X -Native ORC MR 95 108 12 11.0 90.6 7.8X -Native ORC Vectorized 38 44 5 27.8 36.0 19.7X +Hive built-in ORC 733 744 19 1.4 698.9 1.0X +Native ORC MR 94 109 10 11.1 89.8 7.8X +Native ORC Vectorized 38 46 7 27.7 36.1 19.3X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure 
+OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Single Column Scan from 300 columns: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Hive built-in ORC 1056 1100 62 1.0 1007.5 1.0X -Native ORC MR 104 114 8 10.1 99.4 10.1X -Native ORC Vectorized 47 54 5 22.5 44.5 22.7X +Hive built-in ORC 1079 1079 0 1.0 1028.8 1.0X +Native ORC MR 103 118 14 10.2 98.2 10.5X +Native ORC Vectorized 47 55 9 22.5 44.4 23.2X ================================================================================================ Struct scan ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Single Struct Column Scan with 10 Fields: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Hive built-in ORC 434 451 12 2.4 413.9 1.0X -Native ORC MR 273 294 14 3.8 260.8 1.6X -Native ORC Vectorized 104 139 22 10.0 99.6 4.2X +Hive built-in ORC 366 439 60 2.9 348.8 1.0X +Native ORC MR 268 290 23 3.9 255.2 1.4X +Native ORC Vectorized 104 116 15 10.1 99.5 3.5X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Single Struct Column Scan with 100 Fields: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------- -Hive built-in ORC 2530 2556 38 0.4 2412.6 1.0X -Native ORC MR 1530 1598 97 0.7 1458.7 1.7X -Native ORC Vectorized 802 891 89 1.3 
764.7 3.2X +Hive built-in ORC 2452 2519 94 0.4 2338.9 1.0X +Native ORC MR 1620 1739 169 0.6 1544.6 1.5X +Native ORC Vectorized 940 1015 106 1.1 896.2 2.6X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Single Struct Column Scan with 300 Fields: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------- -Hive built-in ORC 7576 7591 20 0.1 7225.4 1.0X -Native ORC MR 5344 5377 47 0.2 5096.4 1.4X -Native ORC Vectorized 5351 5375 35 0.2 5102.9 1.4X +Hive built-in ORC 7321 7391 100 0.1 6981.8 1.0X +Native ORC MR 5367 5395 40 0.2 5118.2 1.4X +Native ORC Vectorized 5121 5225 148 0.2 4883.4 1.4X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Single Struct Column Scan with 600 Fields: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------- -Hive built-in ORC 16196 16239 60 0.1 15446.2 1.0X -Native ORC MR 12920 12974 76 0.1 12321.6 1.3X -Native ORC Vectorized 12604 12735 185 0.1 12019.9 1.3X +Hive built-in ORC 15706 15710 6 0.1 14978.4 1.0X +Native ORC MR 12801 12832 45 0.1 12208.0 1.2X +Native ORC Vectorized 12607 12815 294 0.1 12023.4 1.2X ================================================================================================ Nested Struct scan ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Nested Struct Scan with 10 Elements, 10 Fields: Best Time(ms) Avg 
Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------ -Hive built-in ORC 2614 2637 32 0.4 2493.1 1.0X -Native ORC MR 2025 2027 2 0.5 1931.2 1.3X -Native ORC Vectorized 629 638 10 1.7 599.7 4.2X +Hive built-in ORC 2792 2816 34 0.4 2662.9 1.0X +Native ORC MR 2210 2291 114 0.5 2107.9 1.3X +Native ORC Vectorized 657 684 24 1.6 626.2 4.3X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Nested Struct Scan with 30 Elements, 10 Fields: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------ -Hive built-in ORC 7193 7232 55 0.1 6860.0 1.0X -Native ORC MR 4480 4694 302 0.2 4272.6 1.6X -Native ORC Vectorized 1453 1458 6 0.7 1386.2 4.9X +Hive built-in ORC 7307 7509 286 0.1 6968.8 1.0X +Native ORC MR 4974 5189 305 0.2 4743.4 1.5X +Native ORC Vectorized 1578 1604 37 0.7 1504.9 4.6X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure AMD EPYC 7763 64-Core Processor Nested Struct Scan with 10 Elements, 30 Fields: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------ -Hive built-in ORC 6660 6679 27 0.2 6351.1 1.0X -Native ORC MR 5078 5085 9 0.2 4842.7 1.3X -Native ORC Vectorized 1762 1793 43 0.6 1680.6 3.8X +Hive built-in ORC 6461 6466 7 0.2 6161.5 1.0X +Native ORC MR 5289 5352 89 0.2 5043.9 1.2X +Native ORC Vectorized 2077 2086 13 0.5 1980.8 3.1X diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/DataSourceWithHiveResolver.scala 
b/sql/hive/src/main/scala/org/apache/spark/sql/hive/DataSourceWithHiveResolver.scala new file mode 100644 index 0000000000000..842faba66cc30 --- /dev/null +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/DataSourceWithHiveResolver.scala @@ -0,0 +1,57 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.hive + +import org.apache.spark.sql.SparkSession +import org.apache.spark.sql.catalyst.catalog.HiveTableRelation +import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan +import org.apache.spark.sql.execution.datasources.{DataSourceResolver, LogicalRelation} + +/** + * [[DataSourceWithHiveResolver]] is a [[DataSourceResolver]] that additionally handles + * [[HiveTableRelation]] conversion using [[RelationConversions]]. + */ +class DataSourceWithHiveResolver(sparkSession: SparkSession, hiveCatalog: HiveSessionCatalog) + extends DataSourceResolver(sparkSession) { + private val relationConversions = RelationConversions(hiveCatalog) + + /** + * Invoke [[DataSourceResolver]] to resolve the input operator. If [[DataSourceResolver]] produces + * [[HiveTableRelation]], convert it to [[LogicalRelation]] if possible. 
+ */ + override def resolveOperator: PartialFunction[LogicalPlan, LogicalPlan] = { + case operator: LogicalPlan if super.resolveOperator.isDefinedAt(operator) => + val relationAfterDataSourceResolver = super.resolveOperator(operator) + + relationAfterDataSourceResolver match { + case hiveTableRelation: HiveTableRelation => + resolveHiveTableRelation(hiveTableRelation) + case other => other + } + } + + private def resolveHiveTableRelation(hiveTableRelation: HiveTableRelation): LogicalPlan = { + if (relationConversions.doConvertHiveTableRelationForRead(hiveTableRelation)) { + val logicalRelation: LogicalRelation = + relationConversions.convertHiveTableRelationForRead(hiveTableRelation) + logicalRelation.newInstance() + } else { + hiveTableRelation.newInstance() + } + } +} diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveSessionStateBuilder.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveSessionStateBuilder.scala index dbeb8607facc2..a1cf27510838f 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveSessionStateBuilder.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveSessionStateBuilder.scala @@ -26,6 +26,7 @@ import org.apache.hadoop.hive.ql.udf.generic.{AbstractGenericUDAFResolver, Gener import org.apache.spark.sql._ import org.apache.spark.sql.catalyst.analysis.{Analyzer, EvalSubqueriesForTimeTravel, InvokeProcedures, ReplaceCharWithVarchar, ResolveSessionCatalog, ResolveTranspose} +import org.apache.spark.sql.catalyst.analysis.resolver.ResolverExtension import org.apache.spark.sql.catalyst.catalog.{ExternalCatalogWithListener, InvalidUDFClassException} import org.apache.spark.sql.catalyst.expressions.Expression import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan @@ -84,6 +85,14 @@ class HiveSessionStateBuilder( * A logical query plan `Analyzer` with rules specific to Hive. 
*/ override protected def analyzer: Analyzer = new Analyzer(catalogManager) { + override val singlePassResolverExtensions: Seq[ResolverExtension] = Seq( + new DataSourceWithHiveResolver(session, catalog) + ) + + override val singlePassMetadataResolverExtensions: Seq[ResolverExtension] = Seq( + new FileResolver(session) + ) + override val extendedResolutionRules: Seq[Rule[LogicalPlan]] = new ResolveHiveSerdeTable(session) +: new FindDataSourceTable(session) +: diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveStrategies.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveStrategies.scala index 87ce809914e10..73d0327e2bcad 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveStrategies.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveStrategies.scala @@ -35,7 +35,7 @@ import org.apache.spark.sql.execution.datasources.{CreateTable, DataSourceStrate import org.apache.spark.sql.hive.execution._ import org.apache.spark.sql.hive.execution.HiveScriptTransformationExec import org.apache.spark.sql.hive.execution.InsertIntoHiveTable.BY_CTAS -import org.apache.spark.sql.internal.HiveSerDe +import org.apache.spark.sql.internal.{HiveSerDe, SQLConf} /** @@ -117,6 +117,9 @@ class ResolveHiveSerdeTable(session: SparkSession) extends Rule[LogicalPlan] { } class DetermineTableStats(session: SparkSession) extends Rule[LogicalPlan] { + + override def conf: SQLConf = session.sessionState.conf + private def hiveTableWithStats(relation: HiveTableRelation): HiveTableRelation = { val table = relation.tableMeta val partitionCols = relation.partitionCols diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala index ba03b7fe3cee1..00407f0ecc178 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala @@ 
-517,6 +517,8 @@ private[hive] class HiveClientImpl( val excludedTableProperties = HiveStatisticsProperties ++ Set( // The property value of "comment" is moved to the dedicated field "comment" "comment", + // The property value of "collation" is moved to the dedicated field "collation" + "collation", // For EXTERNAL_TABLE, the table properties has a particular field "EXTERNAL". This is added // in the function toHiveTable. "EXTERNAL" @@ -526,6 +528,7 @@ private[hive] class HiveClientImpl( case (key, _) => excludedTableProperties.contains(key) } val comment = properties.get("comment") + val collation = properties.get("collation") CatalogTable( identifier = TableIdentifier(h.getTableName, Option(h.getDbName)), @@ -568,6 +571,7 @@ private[hive] class HiveClientImpl( properties = filteredProperties, stats = readHiveStats(properties), comment = comment, + collation = collation, // In older versions of Spark(before 2.2.0), we expand the view original text and // store that into `viewExpandedText`, that should be used in view resolution. // We get `viewExpandedText` as viewText, and also get `viewOriginalText` in order to @@ -1181,6 +1185,7 @@ private[hive] object HiveClientImpl extends Logging { table.storage.properties.foreach { case (k, v) => hiveTable.setSerdeParam(k, v) } table.properties.foreach { case (k, v) => hiveTable.setProperty(k, v) } table.comment.foreach { c => hiveTable.setProperty("comment", c) } + table.collation.foreach { c => hiveTable.setProperty("collation", c) } // Hive will expand the view text, so it needs 2 fields: viewOriginalText and viewExpandedText. // Since we don't expand the view text, but only add table properties, we map the `viewText` to // the both fields in hive table. 
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/HiveFileFormat.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/HiveFileFormat.scala index cabdddd4c475d..0d4efd9e77742 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/HiveFileFormat.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/HiveFileFormat.scala @@ -55,6 +55,8 @@ class HiveFileFormat(fileSinkConf: FileSinkDesc) override def shortName(): String = "hive" + override def toString: String = "Hive" + override def inferSchema( sparkSession: SparkSession, options: Map[String, String], diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/HiveScriptTransformationExec.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/HiveScriptTransformationExec.scala index 0fcc43e5c3919..de2d15415837a 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/HiveScriptTransformationExec.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/HiveScriptTransformationExec.scala @@ -283,9 +283,7 @@ object HiveScriptIOSchema extends HiveInspectors { propsMap = propsMap + (serdeConstants.LIST_COLUMN_TYPES -> columnTypesNames) val properties = new Properties() - // Can not use properties.putAll(propsMap.asJava) in scala-2.12 - // See https://github.com/scala/bug/issues/10418 - propsMap.foreach { case (k, v) => properties.put(k, v) } + properties.putAll(propsMap.asJava) serde.initialize(null, properties) serde @@ -299,9 +297,7 @@ object HiveScriptIOSchema extends HiveInspectors { val instance = Utils.classForName[RecordReader](klass).getConstructor(). 
newInstance() val props = new Properties() - // Can not use props.putAll(outputSerdeProps.toMap.asJava) in scala-2.12 - // See https://github.com/scala/bug/issues/10418 - ioschema.outputSerdeProps.toMap.foreach { case (k, v) => props.put(k, v) } + props.putAll(ioschema.outputSerdeProps.toMap.asJava) instance.initialize(inputStream, conf, props) instance } diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/HiveTempPath.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/HiveTempPath.scala index 16edfea67e38e..d97d3cd6dd4a9 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/HiveTempPath.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/HiveTempPath.scala @@ -165,4 +165,6 @@ class HiveTempPath(session: SparkSession, val hadoopConf: Configuration, path: P def deleteIfNotStagingDir(path: Path, fs: FileSystem): Unit = { if (Option(path) != stagingDirForCreating) fs.delete(path, true) } + + override def toString: String = s"HiveTempPath($path)" } diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/PruneHiveTablePartitions.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/PruneHiveTablePartitions.scala index 779562bed5b0f..6486904fe65af 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/PruneHiveTablePartitions.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/PruneHiveTablePartitions.scala @@ -28,6 +28,7 @@ import org.apache.spark.sql.catalyst.plans.logical.{Filter, LogicalPlan, Project import org.apache.spark.sql.catalyst.plans.logical.statsEstimation.FilterEstimation import org.apache.spark.sql.catalyst.rules.Rule import org.apache.spark.sql.execution.datasources.DataSourceStrategy +import org.apache.spark.sql.internal.SQLConf /** * Prune hive table partitions using partition filters on [[HiveTableRelation]]. 
The pruned @@ -43,6 +44,8 @@ import org.apache.spark.sql.execution.datasources.DataSourceStrategy private[sql] class PruneHiveTablePartitions(session: SparkSession) extends Rule[LogicalPlan] with CastSupport with PredicateHelper { + override def conf: SQLConf = session.sessionState.conf + /** * Extract the partition filters from the filters on the table. */ diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/execution/benchmark/ObjectHashAggregateExecBenchmark.scala b/sql/hive/src/test/scala/org/apache/spark/sql/execution/benchmark/ObjectHashAggregateExecBenchmark.scala index 700a4984a4e39..f5bf49439d3f9 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/execution/benchmark/ObjectHashAggregateExecBenchmark.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/execution/benchmark/ObjectHashAggregateExecBenchmark.scala @@ -23,10 +23,11 @@ import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFPercentileApprox import org.apache.spark.benchmark.Benchmark import org.apache.spark.sql.{Column, DataFrame, SparkSession} +import org.apache.spark.sql.classic.ClassicConversions._ import org.apache.spark.sql.functions.{lit, percentile_approx => pa} import org.apache.spark.sql.hive.execution.TestingTypedCount import org.apache.spark.sql.hive.test.TestHive -import org.apache.spark.sql.internal.ExpressionUtils.{column => toCol, expression} +import org.apache.spark.sql.internal.ExpressionUtils.expression import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types.LongType @@ -117,7 +118,7 @@ object ObjectHashAggregateExecBenchmark extends SqlBasedBenchmark { output = output ) - def typed_count(column: Column): Column = TestingTypedCount(column) + def typed_count(column: Column): Column = Column(TestingTypedCount(expression(column))) val df = spark.range(N) diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/DataSourceWithHiveResolverSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/DataSourceWithHiveResolverSuite.scala 
new file mode 100644 index 0000000000000..cb26354521b02 --- /dev/null +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/DataSourceWithHiveResolverSuite.scala @@ -0,0 +1,107 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.hive + +import org.apache.spark.sql.catalyst.analysis.UnresolvedRelation +import org.apache.spark.sql.catalyst.analysis.resolver.{MetadataResolver, Resolver} +import org.apache.spark.sql.catalyst.catalog.{HiveTableRelation, UnresolvedCatalogRelation} +import org.apache.spark.sql.catalyst.plans.logical.SubqueryAlias +import org.apache.spark.sql.execution.datasources.LogicalRelation +import org.apache.spark.sql.hive.HiveUtils +import org.apache.spark.sql.hive.test.TestHiveSingleton +import org.apache.spark.sql.test.SQLTestUtils +import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType} + +class DataSourceWithHiveResolverSuite extends TestHiveSingleton with SQLTestUtils { + private val keyValueTableSchema = StructType( + Seq( + StructField("key", IntegerType, true), + StructField("value", StringType, true) + ) + ) + + test("ORC table resolution") { + withTable("src_orc") { + spark.sql("CREATE TABLE src_orc (key INT, value STRING) 
STORED AS ORC") + + checkResolveOperator( + sqlText = "SELECT * FROM src_orc", + expectedTableName = "spark_catalog.default.src_orc", + expectedTableSchema = keyValueTableSchema, + convertedToLogicalRelation = true + ) + } + } + + test("ORC table resolution without conversion") { + withSQLConf(HiveUtils.CONVERT_METASTORE_ORC.key -> "false") { + withTable("src_orc_no_conversion") { + spark.sql("CREATE TABLE src_orc_no_conversion (key INT, value STRING) STORED AS ORC") + + checkResolveOperator( + sqlText = "SELECT * FROM src_orc_no_conversion", + expectedTableName = "spark_catalog.default.src_orc_no_conversion", + expectedTableSchema = keyValueTableSchema, + convertedToLogicalRelation = false + ) + } + } + } + + private def checkResolveOperator( + sqlText: String, + expectedTableName: String, + expectedTableSchema: StructType, + convertedToLogicalRelation: Boolean) = { + val metadataResolver = new MetadataResolver( + spark.sessionState.catalogManager, + Resolver.createRelationResolution(spark.sessionState.catalogManager) + ) + val dataSourceWithHiveResolver = new DataSourceWithHiveResolver( + spark, + spark.sessionState.catalog.asInstanceOf[HiveSessionCatalog] + ) + + val unresolvedPlan = spark.sql(sqlText).queryExecution.logical + + metadataResolver.resolve(unresolvedPlan) + + val unresolvedRelations = unresolvedPlan.collect { + case unresolvedRelation: UnresolvedRelation => unresolvedRelation + } + assert(unresolvedRelations.size == 1) + + val partiallyResolvedRelation = metadataResolver + .getRelationWithResolvedMetadata(unresolvedRelations.head) + .get + .asInstanceOf[SubqueryAlias] + .child + assert(partiallyResolvedRelation.isInstanceOf[UnresolvedCatalogRelation]) + + dataSourceWithHiveResolver.resolveOperator(partiallyResolvedRelation) match { + case logicalRelation: LogicalRelation => + assert(convertedToLogicalRelation) + assert(logicalRelation.catalogTable.get.identifier.unquotedString == expectedTableName) + assert(logicalRelation.relation.schema == 
expectedTableSchema) + case hiveTableRelation: HiveTableRelation => + assert(!convertedToLogicalRelation) + assert(hiveTableRelation.tableMeta.identifier.unquotedString == expectedTableName) + assert(hiveTableRelation.tableMeta.schema == expectedTableSchema) + } + } +} diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/HiveCharVarcharTestSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveCharVarcharTestSuite.scala similarity index 98% rename from sql/hive/src/test/scala/org/apache/spark/sql/HiveCharVarcharTestSuite.scala rename to sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveCharVarcharTestSuite.scala index c12d727e59740..90cb5501ee6f6 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/HiveCharVarcharTestSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveCharVarcharTestSuite.scala @@ -15,8 +15,9 @@ * limitations under the License. */ -package org.apache.spark.sql +package org.apache.spark.sql.hive +import org.apache.spark.sql.{CharVarcharTestSuite, Row} import org.apache.spark.sql.execution.command.CharVarcharDDLTestBase import org.apache.spark.sql.hive.test.TestHiveSingleton diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/OptimizeHiveMetadataOnlyQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/OptimizeHiveMetadataOnlyQuerySuite.scala index 2152a29b17ff4..6709a139dcf96 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/OptimizeHiveMetadataOnlyQuerySuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/OptimizeHiveMetadataOnlyQuerySuite.scala @@ -32,7 +32,7 @@ class OptimizeHiveMetadataOnlyQuerySuite extends QueryTest with TestHiveSingleto with BeforeAndAfter with SQLTestUtils { import spark.implicits._ - import spark.RichColumn + import spark.toRichColumn override def beforeAll(): Unit = { super.beforeAll() diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/PartitionedTablePerfStatsSuite.scala 
b/sql/hive/src/test/scala/org/apache/spark/sql/hive/PartitionedTablePerfStatsSuite.scala index f7e453a1dbdec..b67370f6eb9f6 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/PartitionedTablePerfStatsSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/PartitionedTablePerfStatsSuite.scala @@ -34,6 +34,9 @@ class PartitionedTablePerfStatsSuite override def beforeEach(): Unit = { super.beforeEach() + // Hive operation counters are doubled in dual-analyzer mode. + hiveContext.sparkSession.conf.set( + SQLConf.ANALYZER_DUAL_RUN_LEGACY_AND_SINGLE_PASS_RESOLVER.key, "false") FileStatusCache.resetForTesting() } diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala index 9c2f4461ff263..e2f0040afe57c 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala @@ -609,12 +609,15 @@ class StatisticsSuite extends StatisticsCollectionTestBase with TestHiveSingleto } withSQLConf(SQLConf.CASE_SENSITIVE.key -> "true") { - val message = intercept[AnalysisException] { - sql(s"ANALYZE TABLE $tableName PARTITION (DS='2010-01-01') COMPUTE STATISTICS") - }.getMessage - assert(message.contains( - "DS is not a valid partition column in table " + - s"`$SESSION_CATALOG_NAME`.`default`.`$tableName`")) + checkError( + exception = intercept[AnalysisException] { + sql(s"ANALYZE TABLE $tableName PARTITION (DS='2010-01-01') COMPUTE STATISTICS") + }, + condition = "PARTITIONS_NOT_FOUND", + parameters = Map( + "partitionList" -> "`DS`", + "tableName" -> s"`$SESSION_CATALOG_NAME`.`default`.`$tableName`") + ) } } } @@ -692,16 +695,26 @@ class StatisticsSuite extends StatisticsCollectionTestBase with TestHiveSingleto sql(s"INSERT INTO TABLE $tableName PARTITION (ds='2010-01-01') SELECT * FROM src") - assertAnalysisException( - s"ANALYZE TABLE $tableName PARTITION 
(hour=20) COMPUTE STATISTICS", - "hour is not a valid partition column in table " + - s"`$SESSION_CATALOG_NAME`.`default`.`${tableName.toLowerCase(Locale.ROOT)}`" + checkError( + exception = intercept[AnalysisException] { + sql(s"ANALYZE TABLE $tableName PARTITION (hour=20) COMPUTE STATISTICS") + }, + condition = "PARTITIONS_NOT_FOUND", + parameters = Map( + "partitionList" -> "`hour`", + "tableName" -> + s"`$SESSION_CATALOG_NAME`.`default`.`${tableName.toLowerCase(Locale.ROOT)}`") ) - assertAnalysisException( - s"ANALYZE TABLE $tableName PARTITION (hour) COMPUTE STATISTICS", - "hour is not a valid partition column in table " + - s"`$SESSION_CATALOG_NAME`.`default`.`${tableName.toLowerCase(Locale.ROOT)}`" + checkError( + exception = intercept[AnalysisException] { + sql(s"ANALYZE TABLE $tableName PARTITION (hour) COMPUTE STATISTICS") + }, + condition = "PARTITIONS_NOT_FOUND", + parameters = Map( + "partitionList" -> "`hour`", + "tableName" -> + s"`$SESSION_CATALOG_NAME`.`default`.`${tableName.toLowerCase(Locale.ROOT)}`") ) intercept[NoSuchPartitionException] { diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/HiveClientSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/HiveClientSuite.scala index 5c65eb8b12bac..27dc80fbfc173 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/HiveClientSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/HiveClientSuite.scala @@ -33,6 +33,7 @@ import org.apache.spark.sql.catalyst.{FunctionIdentifier, TableIdentifier} import org.apache.spark.sql.catalyst.analysis.{DatabaseAlreadyExistsException, NoSuchDatabaseException, PartitionsAlreadyExistException} import org.apache.spark.sql.catalyst.catalog._ import org.apache.spark.sql.catalyst.expressions.{AttributeReference, EqualTo, Literal} +import org.apache.spark.sql.connector.catalog.TableCatalog import org.apache.spark.sql.hive.HiveExternalCatalog import org.apache.spark.sql.hive.test.TestHiveVersion import 
org.apache.spark.sql.types.{IntegerType, StructType} @@ -68,11 +69,13 @@ class HiveClientSuite(version: String) extends HiveVersionSuite(version) { } def table(database: String, tableName: String, + collation: Option[String] = None, tableType: CatalogTableType = CatalogTableType.MANAGED): CatalogTable = { CatalogTable( identifier = TableIdentifier(tableName, Some(database)), tableType = tableType, schema = new StructType().add("key", "int"), + collation = collation, storage = CatalogStorageFormat( locationUri = None, inputFormat = Some(classOf[TextInputFormat].getName), @@ -204,6 +207,22 @@ class HiveClientSuite(version: String) extends HiveVersionSuite(version) { ignoreIfExists = false) } + test("create/alter table with collations") { + client.createTable(table("default", tableName = "collation_table", + collation = Some("UNICODE")), ignoreIfExists = false) + + val readBack = client.getTable("default", "collation_table") + assert(!readBack.properties.contains(TableCatalog.PROP_COLLATION)) + assert(readBack.collation === Some("UNICODE")) + + client.alterTable("default", "collation_table", + readBack.copy(collation = Some("UNICODE_CI"))) + val alteredTbl = client.getTable("default", "collation_table") + assert(alteredTbl.collation === Some("UNICODE_CI")) + + client.dropTable("default", "collation_table", ignoreIfNotExists = true, purge = true) + } + test("loadTable") { client.loadTable( emptyDir, diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveDDLSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveDDLSuite.scala index 53a65e195e3f0..a58adbce7ec52 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveDDLSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveDDLSuite.scala @@ -657,10 +657,10 @@ class HiveDDLSuite exception = intercept[AnalysisException] { sql(s"ALTER TABLE $externalTab DROP PARTITION (ds='2008-04-09', unknownCol='12')") }, - condition = 
"_LEGACY_ERROR_TEMP_1231", + condition = "PARTITIONS_NOT_FOUND", parameters = Map( - "key" -> "unknownCol", - "tblName" -> s"`$SESSION_CATALOG_NAME`.`default`.`exttable_with_partitions`") + "partitionList" -> "`unknownCol`", + "tableName" -> s"`$SESSION_CATALOG_NAME`.`default`.`exttable_with_partitions`") ) sql( diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala index c41370c96241a..5431066c30a9f 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala @@ -72,7 +72,7 @@ class HiveQuerySuite extends HiveComparisonTest with SQLTestUtils with BeforeAnd override def afterEach(): Unit = { try { - spark.artifactManager.cleanUpResources() + spark.artifactManager.cleanUpResourcesForTesting() } finally { super.afterEach() } diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveResolutionSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveResolutionSuite.scala index df6ef57a581d0..ecf89e59c501c 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveResolutionSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveResolutionSuite.scala @@ -117,7 +117,6 @@ class HiveResolutionSuite extends HiveComparisonTest { /** * Negative examples. Currently only left here for documentation purposes. - * TODO(marmbrus): Test that catalyst fails on these queries. 
*/ /* SemanticException [Error 10009]: Line 1:7 Invalid table alias 'src' diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/ObjectHashAggregateSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/ObjectHashAggregateSuite.scala index bcd0644af0782..008a324f73dac 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/ObjectHashAggregateSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/ObjectHashAggregateSuite.scala @@ -23,12 +23,11 @@ import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFMax import org.scalatest.matchers.must.Matchers._ import org.apache.spark.sql._ -import org.apache.spark.sql.catalyst.expressions.{ExpressionEvalHelper} +import org.apache.spark.sql.catalyst.expressions.ExpressionEvalHelper import org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanHelper import org.apache.spark.sql.execution.aggregate.{HashAggregateExec, ObjectHashAggregateExec, SortAggregateExec} import org.apache.spark.sql.functions.{col, count_distinct, first, lit, max, percentile_approx => pa} import org.apache.spark.sql.hive.test.TestHiveSingleton -import org.apache.spark.sql.internal.ExpressionUtils.{column => toCol, expression} import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.test.SQLTestUtils import org.apache.spark.sql.types._ @@ -181,7 +180,7 @@ class ObjectHashAggregateSuite pa(column, lit(percentage), lit(10000)) } - private def typed_count(column: Column): Column = TestingTypedCount(column) + private def typed_count(column: Column): Column = Column(TestingTypedCount(column.expr)) // Generates 50 random rows for a given schema. 
private def generateRandomRows(schemaForGenerator: StructType): Seq[Row] = { diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/command/ShowTablesSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/command/ShowTablesSuite.scala index 9ee3a0277c9a1..de6af30e663d2 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/command/ShowTablesSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/command/ShowTablesSuite.scala @@ -39,8 +39,8 @@ class ShowTablesSuite extends v1.ShowTablesSuiteBase with CommandSuiteBase { catalog: String, namespace: String, table: String): (String, Map[String, String]) = { - ("_LEGACY_ERROR_TEMP_1231", - Map("key" -> "id", "tblName" -> s"`$catalog`.`$namespace`.`$table`")) + ("PARTITIONS_NOT_FOUND", + Map("partitionList" -> "`id`", "tableName" -> s"`$catalog`.`$namespace`.`$table`")) } protected override def extendedPartExpectedResult: String = @@ -99,7 +99,7 @@ class ShowTablesSuite extends v1.ShowTablesSuiteBase with CommandSuiteBase { |View Original Text: SELECT id FROM $catalog.$namespace.$table |View Schema Mode: COMPENSATION |View Catalog and Namespace: $catalog.$namespace - |View Query Output Columns: [id] + |View Query Output Columns: [`id`] |Table Properties:
|Serde Library: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe |InputFormat: org.apache.hadoop.mapred.SequenceFileInputFormat diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/test/TestHiveSingleton.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/test/TestHiveSingleton.scala index 770e1da94a1c7..7a0599cda2fe7 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/test/TestHiveSingleton.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/test/TestHiveSingleton.scala @@ -42,7 +42,7 @@ trait TestHiveSingleton extends SparkFunSuite with BeforeAndAfterAll { protected override def afterEach(): Unit = { try { - spark.artifactManager.cleanUpResources() + spark.artifactManager.cleanUpResourcesForTesting() } finally { super.afterEach() } diff --git a/streaming/src/main/scala/org/apache/spark/streaming/dstream/DStream.scala b/streaming/src/main/scala/org/apache/spark/streaming/dstream/DStream.scala index 87d6a4909fdd4..f0f9046c6b623 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/dstream/DStream.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/dstream/DStream.scala @@ -467,7 +467,7 @@ abstract class DStream[T: ClassTag] ( // Explicitly remove blocks of BlockRDD rdd match { case b: BlockRDD[_] => - logInfo(log"Removing blocks of RDD ${MDC(LogKeys.RDD_ID, b)} " + + logInfo(log"Removing blocks of RDD ${MDC(LogKeys.RDD, b)} " + log"of time ${MDC(LogKeys.TIME, time)}") b.removeBlocks() case _ => diff --git a/streaming/src/main/scala/org/apache/spark/streaming/receiver/BlockGenerator.scala b/streaming/src/main/scala/org/apache/spark/streaming/receiver/BlockGenerator.scala index e0e85712a2301..fae68123773dd 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/receiver/BlockGenerator.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/receiver/BlockGenerator.scala @@ -142,7 +142,7 @@ private[streaming] class BlockGenerator( state = StoppedAddingData } else { logWarning(log"Cannot stop 
BlockGenerator as its not in the Active state " + - log"[state = ${MDC(STATUS, state)}]") + log"[state = ${MDC(BLOCK_GENERATOR_STATUS, state)}]") return } }