
Commit a8acad8

Merge remote-tracking branch 'origin/master' into json-error-msg-condition

MaxGekk committed Jan 15, 2025
2 parents b6f0c59 + 39bb2d8 commit a8acad8
Showing 2,550 changed files with 75,481 additions and 24,496 deletions.
21 changes: 9 additions & 12 deletions .github/labeler.yml
@@ -93,9 +93,9 @@ SQL:
 - changed-files:
 - all-globs-to-any-file: [
 '**/sql/**/*',
-'!python/pyspark/sql/avro/**/*',
-'!python/pyspark/sql/streaming/**/*',
-'!python/pyspark/sql/tests/streaming/test_streaming*.py'
+'!python/**/avro/**/*',
+'!python/**/protobuf/**/*',
+'!python/**/streaming/**/*'
 ]
 - any-glob-to-any-file: [
 'common/unsafe/**/*',
@@ -119,7 +119,7 @@ AVRO:
 - changed-files:
 - any-glob-to-any-file: [
 'connector/avro/**/*',
-'python/pyspark/sql/avro/**/*'
+'python/**/avro/**/*'
 ]

DSTREAM:
@@ -152,18 +152,16 @@ ML:
 MLLIB:
 - changed-files:
 - any-glob-to-any-file: [
-'**/spark/mllib/**/*',
-'mllib-local/**/*',
-'python/pyspark/mllib/**/*'
+'**/mllib/**/*',
+'mllib-local/**/*'
 ]

 STRUCTURED STREAMING:
 - changed-files:
 - any-glob-to-any-file: [
 '**/sql/**/streaming/**/*',
 'connector/kafka-0-10-sql/**/*',
-'python/pyspark/sql/streaming/**/*',
-'python/pyspark/sql/tests/streaming/test_streaming*.py',
+'python/pyspark/sql/**/streaming/**/*',
 '**/*streaming.R'
 ]

@@ -226,13 +224,12 @@ CONNECT:
 - any-glob-to-any-file: [
 'sql/connect/**/*',
 'connector/connect/**/*',
-'python/pyspark/sql/**/connect/**/*',
-'python/pyspark/ml/**/connect/**/*'
+'python/**/connect/**/*'
 ]

 PROTOBUF:
 - changed-files:
 - any-glob-to-any-file: [
 'connector/protobuf/**/*',
-'python/pyspark/sql/protobuf/**/*'
+'python/**/protobuf/**/*'
 ]
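
For context, .github/labeler.yml drives the actions/labeler bot that tags pull requests based on the paths they touch. The change above collapses PySpark-specific globs (for example '!python/pyspark/sql/avro/**/*') into broader 'python/**/...' patterns, presumably so that the Spark Connect counterparts of those modules fall under the same rules. A rough sketch of the consolidated SQL entry, reassembled from the hunk above (indentation is assumed, and the reading of the matcher semantics is mine, not taken from this commit):

# Sketch of the consolidated SQL rule in .github/labeler.yml (reconstruction, not verbatim).
SQL:
  - changed-files:
      # As I read actions/labeler v5, every glob below, including the negations,
      # must hold for at least one changed file, i.e. the PR touches a file
      # under **/sql/** that is not an Avro, Protobuf, or streaming source.
      - all-globs-to-any-file: [
          '**/sql/**/*',
          '!python/**/avro/**/*',
          '!python/**/protobuf/**/*',
          '!python/**/streaming/**/*'
        ]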
107 changes: 87 additions & 20 deletions .github/workflows/build_and_test.yml

Large diffs are not rendered by default.

2 changes: 2 additions & 0 deletions .github/workflows/build_branch35.yml
@@ -22,6 +22,7 @@ name: "Build (branch-3.5, Scala 2.13, Hadoop 3, JDK 8)"
on:
schedule:
- cron: '0 11 * * *'
workflow_dispatch:

jobs:
run-build:
@@ -37,6 +38,7 @@ jobs:
envs: >-
{
"SCALA_PROFILE": "scala2.13",
"PYSPARK_IMAGE_TO_TEST": "",
"PYTHON_TO_TEST": "",
"ORACLE_DOCKER_IMAGE_NAME": "gvenzl/oracle-xe:21.3.0"
}
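Two additions recur across the workflow files in this commit. The new workflow_dispatch: trigger, added next to each existing cron schedule, lets the nightly builds also be started manually from the Actions tab. The new "PYSPARK_IMAGE_TO_TEST" entry appears to tell the reusable build workflow which prebuilt PySpark test image to run against; it is left empty for branch-3.5 and set to a concrete image such as python-311 in the master workflows further down. A minimal sketch of the trigger block after this kind of change (cron value copied from this file):

# Sketch of the resulting trigger block; not the full workflow.
on:
  schedule:
    - cron: '0 11 * * *'    # existing nightly schedule
  workflow_dispatch:        # new: allows manual runs from the GitHub Actions UI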
2 changes: 2 additions & 0 deletions .github/workflows/build_branch35_python.yml
@@ -22,6 +22,7 @@ name: "Build / Python-only (branch-3.5)"
on:
schedule:
- cron: '0 11 * * *'
workflow_dispatch:

jobs:
run-build:
@@ -36,6 +37,7 @@ jobs:
hadoop: hadoop3
envs: >-
{
"PYSPARK_IMAGE_TO_TEST": "",
"PYTHON_TO_TEST": ""
}
jobs: >-
2 changes: 2 additions & 0 deletions .github/workflows/build_coverage.yml
@@ -22,6 +22,7 @@ name: "Build / Coverage (master, Scala 2.13, Hadoop 3, JDK 17)"
on:
schedule:
- cron: '0 10 * * *'
workflow_dispatch:

jobs:
run-build:
@@ -36,6 +37,7 @@ jobs:
hadoop: hadoop3
envs: >-
{
"PYSPARK_IMAGE_TO_TEST": "python-311",
"PYTHON_TO_TEST": "python3.11",
"PYSPARK_CODECOV": "true"
}
111 changes: 111 additions & 0 deletions .github/workflows/build_infra_images_cache.yml
@@ -30,9 +30,16 @@ on:
- 'dev/spark-test-image/docs/Dockerfile'
- 'dev/spark-test-image/lint/Dockerfile'
- 'dev/spark-test-image/sparkr/Dockerfile'
- 'dev/spark-test-image/pypy-310/Dockerfile'
- 'dev/spark-test-image/python-309/Dockerfile'
- 'dev/spark-test-image/python-310/Dockerfile'
- 'dev/spark-test-image/python-311/Dockerfile'
- 'dev/spark-test-image/python-312/Dockerfile'
- 'dev/spark-test-image/python-313/Dockerfile'
- '.github/workflows/build_infra_images_cache.yml'
# Create infra image when cutting down branches/tags
create:
workflow_dispatch:
jobs:
main:
if: github.repository == 'apache/spark'
@@ -102,3 +109,107 @@ jobs:
- name: Image digest (SparkR)
if: hashFiles('dev/spark-test-image/sparkr/Dockerfile') != ''
run: echo ${{ steps.docker_build_sparkr.outputs.digest }}
- name: Build and push (PySpark with old dependencies)
if: hashFiles('dev/spark-test-image/python-minimum/Dockerfile') != ''
id: docker_build_pyspark_python_minimum
uses: docker/build-push-action@v6
with:
context: ./dev/spark-test-image/python-minimum/
push: true
tags: ghcr.io/apache/spark/apache-spark-github-action-image-pyspark-python-minimum-cache:${{ github.ref_name }}-static
cache-from: type=registry,ref=ghcr.io/apache/spark/apache-spark-github-action-image-pyspark-python-minimum-cache:${{ github.ref_name }}
cache-to: type=registry,ref=ghcr.io/apache/spark/apache-spark-github-action-image-pyspark-python-minimum-cache:${{ github.ref_name }},mode=max
- name: Image digest (PySpark with old dependencies)
if: hashFiles('dev/spark-test-image/python-minimum/Dockerfile') != ''
run: echo ${{ steps.docker_build_pyspark_python_minimum.outputs.digest }}
- name: Build and push (PySpark PS with old dependencies)
if: hashFiles('dev/spark-test-image/python-ps-minimum/Dockerfile') != ''
id: docker_build_pyspark_python_ps_minimum
uses: docker/build-push-action@v6
with:
context: ./dev/spark-test-image/python-ps-minimum/
push: true
tags: ghcr.io/apache/spark/apache-spark-github-action-image-pyspark-python-ps-minimum-cache:${{ github.ref_name }}-static
cache-from: type=registry,ref=ghcr.io/apache/spark/apache-spark-github-action-image-pyspark-python-ps-minimum-cache:${{ github.ref_name }}
cache-to: type=registry,ref=ghcr.io/apache/spark/apache-spark-github-action-image-pyspark-python-ps-minimum-cache:${{ github.ref_name }},mode=max
- name: Image digest (PySpark PS with old dependencies)
if: hashFiles('dev/spark-test-image/python-ps-minimum/Dockerfile') != ''
run: echo ${{ steps.docker_build_pyspark_python_ps_minimum.outputs.digest }}
- name: Build and push (PySpark with PyPy 3.10)
if: hashFiles('dev/spark-test-image/pypy-310/Dockerfile') != ''
id: docker_build_pyspark_pypy_310
uses: docker/build-push-action@v6
with:
context: ./dev/spark-test-image/pypy-310/
push: true
tags: ghcr.io/apache/spark/apache-spark-github-action-image-pyspark-pypy-310-cache:${{ github.ref_name }}-static
cache-from: type=registry,ref=ghcr.io/apache/spark/apache-spark-github-action-image-pyspark-pypy-310-cache:${{ github.ref_name }}
cache-to: type=registry,ref=ghcr.io/apache/spark/apache-spark-github-action-image-pyspark-pypy-310-cache:${{ github.ref_name }},mode=max
- name: Image digest (PySpark with PyPy 3.10)
if: hashFiles('dev/spark-test-image/pypy-310/Dockerfile') != ''
run: echo ${{ steps.docker_build_pyspark_pypy_310.outputs.digest }}
- name: Build and push (PySpark with Python 3.9)
if: hashFiles('dev/spark-test-image/python-309/Dockerfile') != ''
id: docker_build_pyspark_python_309
uses: docker/build-push-action@v6
with:
context: ./dev/spark-test-image/python-309/
push: true
tags: ghcr.io/apache/spark/apache-spark-github-action-image-pyspark-python-309-cache:${{ github.ref_name }}-static
cache-from: type=registry,ref=ghcr.io/apache/spark/apache-spark-github-action-image-pyspark-python-309-cache:${{ github.ref_name }}
cache-to: type=registry,ref=ghcr.io/apache/spark/apache-spark-github-action-image-pyspark-python-309-cache:${{ github.ref_name }},mode=max
- name: Image digest (PySpark with Python 3.9)
if: hashFiles('dev/spark-test-image/python-309/Dockerfile') != ''
run: echo ${{ steps.docker_build_pyspark_python_309.outputs.digest }}
- name: Build and push (PySpark with Python 3.10)
if: hashFiles('dev/spark-test-image/python-310/Dockerfile') != ''
id: docker_build_pyspark_python_310
uses: docker/build-push-action@v6
with:
context: ./dev/spark-test-image/python-310/
push: true
tags: ghcr.io/apache/spark/apache-spark-github-action-image-pyspark-python-310-cache:${{ github.ref_name }}-static
cache-from: type=registry,ref=ghcr.io/apache/spark/apache-spark-github-action-image-pyspark-python-310-cache:${{ github.ref_name }}
cache-to: type=registry,ref=ghcr.io/apache/spark/apache-spark-github-action-image-pyspark-python-310-cache:${{ github.ref_name }},mode=max
- name: Image digest (PySpark with Python 3.10)
if: hashFiles('dev/spark-test-image/python-310/Dockerfile') != ''
run: echo ${{ steps.docker_build_pyspark_python_310.outputs.digest }}
- name: Build and push (PySpark with Python 3.11)
if: hashFiles('dev/spark-test-image/python-311/Dockerfile') != ''
id: docker_build_pyspark_python_311
uses: docker/build-push-action@v6
with:
context: ./dev/spark-test-image/python-311/
push: true
tags: ghcr.io/apache/spark/apache-spark-github-action-image-pyspark-python-311-cache:${{ github.ref_name }}-static
cache-from: type=registry,ref=ghcr.io/apache/spark/apache-spark-github-action-image-pyspark-python-311-cache:${{ github.ref_name }}
cache-to: type=registry,ref=ghcr.io/apache/spark/apache-spark-github-action-image-pyspark-python-311-cache:${{ github.ref_name }},mode=max
- name: Image digest (PySpark with Python 3.11)
if: hashFiles('dev/spark-test-image/python-311/Dockerfile') != ''
run: echo ${{ steps.docker_build_pyspark_python_311.outputs.digest }}
- name: Build and push (PySpark with Python 3.12)
if: hashFiles('dev/spark-test-image/python-312/Dockerfile') != ''
id: docker_build_pyspark_python_312
uses: docker/build-push-action@v6
with:
context: ./dev/spark-test-image/python-312/
push: true
tags: ghcr.io/apache/spark/apache-spark-github-action-image-pyspark-python-312-cache:${{ github.ref_name }}-static
cache-from: type=registry,ref=ghcr.io/apache/spark/apache-spark-github-action-image-pyspark-python-312-cache:${{ github.ref_name }}
cache-to: type=registry,ref=ghcr.io/apache/spark/apache-spark-github-action-image-pyspark-python-312-cache:${{ github.ref_name }},mode=max
- name: Image digest (PySpark with Python 3.12)
if: hashFiles('dev/spark-test-image/python-312/Dockerfile') != ''
run: echo ${{ steps.docker_build_pyspark_python_312.outputs.digest }}
- name: Build and push (PySpark with Python 3.13)
if: hashFiles('dev/spark-test-image/python-313/Dockerfile') != ''
id: docker_build_pyspark_python_313
uses: docker/build-push-action@v6
with:
context: ./dev/spark-test-image/python-313/
push: true
tags: ghcr.io/apache/spark/apache-spark-github-action-image-pyspark-python-313-cache:${{ github.ref_name }}-static
cache-from: type=registry,ref=ghcr.io/apache/spark/apache-spark-github-action-image-pyspark-python-313-cache:${{ github.ref_name }}
cache-to: type=registry,ref=ghcr.io/apache/spark/apache-spark-github-action-image-pyspark-python-313-cache:${{ github.ref_name }},mode=max
- name: Image digest (PySpark with Python 3.13)
if: hashFiles('dev/spark-test-image/python-313/Dockerfile') != ''
run: echo ${{ steps.docker_build_pyspark_python_313.outputs.digest }}
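
The eight new steps above all share the same shape; only the Python flavour changes. Below is a condensed restatement of one of them, with comments on what the repeated settings do (image names and paths are copied from the hunk above; the comments are my reading of the configuration):

# Representative cache-image step, condensed from the workflow above.
- name: Build and push (PySpark with Python 3.11)
  # Skip the step entirely when the corresponding Dockerfile does not exist.
  if: hashFiles('dev/spark-test-image/python-311/Dockerfile') != ''
  id: docker_build_pyspark_python_311
  uses: docker/build-push-action@v6
  with:
    context: ./dev/spark-test-image/python-311/
    push: true
    # The pushed image carries a "-static" tag, while the un-suffixed tag serves as a
    # registry-side layer cache; mode=max exports all intermediate layers to that cache.
    tags: ghcr.io/apache/spark/apache-spark-github-action-image-pyspark-python-311-cache:${{ github.ref_name }}-static
    cache-from: type=registry,ref=ghcr.io/apache/spark/apache-spark-github-action-image-pyspark-python-311-cache:${{ github.ref_name }}
    cache-to: type=registry,ref=ghcr.io/apache/spark/apache-spark-github-action-image-pyspark-python-311-cache:${{ github.ref_name }},mode=max
# Each build step is paired with an "Image digest" step that echoes the pushed digest.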
3 changes: 3 additions & 0 deletions .github/workflows/build_java21.yml
@@ -22,6 +22,7 @@ name: "Build (master, Scala 2.13, Hadoop 3, JDK 21)"
on:
schedule:
- cron: '0 4 * * *'
workflow_dispatch:

jobs:
run-build:
@@ -36,6 +37,8 @@ jobs:
hadoop: hadoop3
envs: >-
{
"PYSPARK_IMAGE_TO_TEST": "python-311",
"PYTHON_TO_TEST": "python3.11",
"SKIP_MIMA": "true",
"SKIP_UNIDOC": "true",
"DEDICATED_JVM_SBT_TESTS": "org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormatV1Suite,org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormatV2Suite,org.apache.spark.sql.execution.datasources.orc.OrcSourceV1Suite,org.apache.spark.sql.execution.datasources.orc.OrcSourceV2Suite"
1 change: 1 addition & 0 deletions .github/workflows/build_maven.yml
@@ -22,6 +22,7 @@ name: "Build / Maven (master, Scala 2.13, Hadoop 3, JDK 17)"
on:
schedule:
- cron: '0 13 * * *'
workflow_dispatch:

jobs:
run-build:
1 change: 1 addition & 0 deletions .github/workflows/build_maven_java21.yml
@@ -22,6 +22,7 @@ name: "Build / Maven (master, Scala 2.13, Hadoop 3, JDK 21)"
on:
schedule:
- cron: '0 14 * * *'
workflow_dispatch:

jobs:
run-build:
1 change: 1 addition & 0 deletions .github/workflows/build_maven_java21_macos15.yml
@@ -22,6 +22,7 @@ name: "Build / Maven (master, Scala 2.13, Hadoop 3, JDK 21, MacOS-15)"
on:
schedule:
- cron: '0 20 */2 * *'
workflow_dispatch:

jobs:
run-build:
3 changes: 3 additions & 0 deletions .github/workflows/build_non_ansi.yml
@@ -22,6 +22,7 @@ name: "Build / Non-ANSI (master, Hadoop 3, JDK 17, Scala 2.13)"
on:
schedule:
- cron: '0 1 * * *'
workflow_dispatch:

jobs:
run-build:
@@ -36,6 +37,8 @@ jobs:
hadoop: hadoop3
envs: >-
{
"PYSPARK_IMAGE_TO_TEST": "python-311",
"PYTHON_TO_TEST": "python3.11",
"SPARK_ANSI_SQL_MODE": "false",
}
jobs: >-
2 changes: 2 additions & 0 deletions .github/workflows/build_python_3.10.yml
@@ -22,6 +22,7 @@ name: "Build / Python-only (master, Python 3.10)"
on:
schedule:
- cron: '0 17 * * *'
workflow_dispatch:

jobs:
run-build:
@@ -36,6 +37,7 @@ jobs:
hadoop: hadoop3
envs: >-
{
"PYSPARK_IMAGE_TO_TEST": "python-310",
"PYTHON_TO_TEST": "python3.10"
}
jobs: >-
1 change: 1 addition & 0 deletions .github/workflows/build_python_3.11_macos.yml
@@ -22,6 +22,7 @@ name: "Build / Python-only (master, Python 3.11, MacOS)"
on:
schedule:
- cron: '0 21 * * *'
workflow_dispatch:

jobs:
run-build:
2 changes: 2 additions & 0 deletions .github/workflows/build_python_3.12.yml
@@ -22,6 +22,7 @@ name: "Build / Python-only (master, Python 3.12)"
on:
schedule:
- cron: '0 19 * * *'
workflow_dispatch:

jobs:
run-build:
@@ -36,6 +37,7 @@ jobs:
hadoop: hadoop3
envs: >-
{
"PYSPARK_IMAGE_TO_TEST": "python-312",
"PYTHON_TO_TEST": "python3.12"
}
jobs: >-
2 changes: 2 additions & 0 deletions .github/workflows/build_python_3.13.yml
@@ -22,6 +22,7 @@ name: "Build / Python-only (master, Python 3.13)"
on:
schedule:
- cron: '0 20 * * *'
workflow_dispatch:

jobs:
run-build:
@@ -36,6 +37,7 @@ jobs:
hadoop: hadoop3
envs: >-
{
"PYSPARK_IMAGE_TO_TEST": "python-313",
"PYTHON_TO_TEST": "python3.13"
}
jobs: >-
2 changes: 2 additions & 0 deletions .github/workflows/build_python_3.9.yml
@@ -22,6 +22,7 @@ name: "Build / Python-only (master, Python 3.9)"
on:
schedule:
- cron: '0 21 * * *'
workflow_dispatch:

jobs:
run-build:
@@ -36,6 +37,7 @@ jobs:
hadoop: hadoop3
envs: >-
{
"PYSPARK_IMAGE_TO_TEST": "python-309",
"PYTHON_TO_TEST": "python3.9"
}
jobs: >-
7 changes: 4 additions & 3 deletions .github/workflows/build_python_connect.yml
@@ -22,6 +22,7 @@ name: Build / Spark Connect Python-only (master, Python 3.11)
on:
schedule:
- cron: '0 19 * * *'
workflow_dispatch:

jobs:
# Build: build Spark and run the tests for specified modules using SBT
@@ -82,7 +83,7 @@ jobs:
 sed -i 's/rootLogger.level = info/rootLogger.level = warn/g' conf/log4j2.properties
 # Start a Spark Connect server for local
-PYTHONPATH="python/lib/pyspark.zip:python/lib/py4j-0.10.9.7-src.zip:$PYTHONPATH" ./sbin/start-connect-server.sh \
+PYTHONPATH="python/lib/pyspark.zip:python/lib/py4j-0.10.9.9-src.zip:$PYTHONPATH" ./sbin/start-connect-server.sh \
 --driver-java-options "-Dlog4j.configurationFile=file:$GITHUB_WORKSPACE/conf/log4j2.properties" \
 --jars "`find connector/protobuf/target -name spark-protobuf-*SNAPSHOT.jar`,`find connector/avro/target -name spark-avro*SNAPSHOT.jar`"
@@ -93,15 +94,15 @@ jobs:
 # Several tests related to catalog requires to run them sequencially, e.g., writing a table in a listener.
 ./python/run-tests --parallelism=1 --python-executables=python3 --modules pyspark-connect,pyspark-ml-connect
 # None of tests are dependent on each other in Pandas API on Spark so run them in parallel
-./python/run-tests --parallelism=2 --python-executables=python3 --modules pyspark-pandas-connect-part0,pyspark-pandas-connect-part1,pyspark-pandas-connect-part2,pyspark-pandas-connect-part3
+./python/run-tests --parallelism=1 --python-executables=python3 --modules pyspark-pandas-connect-part0,pyspark-pandas-connect-part1,pyspark-pandas-connect-part2,pyspark-pandas-connect-part3
 # Stop Spark Connect server.
 ./sbin/stop-connect-server.sh
 mv lib.back python/lib
 mv pyspark.back python/pyspark
 # Start a Spark Connect server for local-cluster
-PYTHONPATH="python/lib/pyspark.zip:python/lib/py4j-0.10.9.7-src.zip:$PYTHONPATH" ./sbin/start-connect-server.sh \
+PYTHONPATH="python/lib/pyspark.zip:python/lib/py4j-0.10.9.9-src.zip:$PYTHONPATH" ./sbin/start-connect-server.sh \
 --master "local-cluster[2, 4, 1024]" \
 --driver-java-options "-Dlog4j.configurationFile=file:$GITHUB_WORKSPACE/conf/log4j2.properties" \
 --jars "`find connector/protobuf/target -name spark-protobuf-*SNAPSHOT.jar`,`find connector/avro/target -name spark-avro*SNAPSHOT.jar`"