
Commit a8acad8

Merge remote-tracking branch 'origin/master' into json-error-msg-condition

MaxGekk committed Jan 15, 2025
2 parents b6f0c59 + 39bb2d8 commit a8acad8
Showing 2,550 changed files with 75,481 additions and 24,496 deletions.
21 changes: 9 additions & 12 deletions .github/labeler.yml
@@ -93,9 +93,9 @@ SQL:
 - changed-files:
 - all-globs-to-any-file: [
 '**/sql/**/*',
-'!python/pyspark/sql/avro/**/*',
-'!python/pyspark/sql/streaming/**/*',
-'!python/pyspark/sql/tests/streaming/test_streaming*.py'
+'!python/**/avro/**/*',
+'!python/**/protobuf/**/*',
+'!python/**/streaming/**/*'
 ]
 - any-glob-to-any-file: [
 'common/unsafe/**/*',
@@ -119,7 +119,7 @@ AVRO:
 - changed-files:
 - any-glob-to-any-file: [
 'connector/avro/**/*',
-'python/pyspark/sql/avro/**/*'
+'python/**/avro/**/*'
 ]

DSTREAM:
@@ -152,18 +152,16 @@ ML:
 MLLIB:
 - changed-files:
 - any-glob-to-any-file: [
-'**/spark/mllib/**/*',
-'mllib-local/**/*',
-'python/pyspark/mllib/**/*'
+'**/mllib/**/*',
+'mllib-local/**/*'
 ]

 STRUCTURED STREAMING:
 - changed-files:
 - any-glob-to-any-file: [
 '**/sql/**/streaming/**/*',
 'connector/kafka-0-10-sql/**/*',
-'python/pyspark/sql/streaming/**/*',
-'python/pyspark/sql/tests/streaming/test_streaming*.py',
+'python/pyspark/sql/**/streaming/**/*',
 '**/*streaming.R'
 ]

@@ -226,13 +224,12 @@ CONNECT:
 - any-glob-to-any-file: [
 'sql/connect/**/*',
 'connector/connect/**/*',
-'python/pyspark/sql/**/connect/**/*',
-'python/pyspark/ml/**/connect/**/*'
+'python/**/connect/**/*'
 ]

 PROTOBUF:
 - changed-files:
 - any-glob-to-any-file: [
 'connector/protobuf/**/*',
-'python/pyspark/sql/protobuf/**/*'
+'python/**/protobuf/**/*'
 ]
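
For context, .github/labeler.yml drives the actions/labeler bot that tags pull requests based on the paths they touch. The change above collapses PySpark-specific globs (for example '!python/pyspark/sql/avro/**/*') into broader 'python/**/...' patterns, presumably so that the Spark Connect counterparts of those modules fall under the same rules. A rough sketch of the consolidated SQL entry, reassembled from the hunk above (indentation is assumed, and the reading of the matcher semantics is mine, not taken from this commit):

# Sketch of the consolidated SQL rule in .github/labeler.yml (reconstruction, not verbatim).
SQL:
  - changed-files:
      # As I read actions/labeler v5, every glob below, including the negations,
      # must hold for at least one changed file, i.e. the PR touches a file
      # under **/sql/** that is not an Avro, Protobuf, or streaming source.
      - all-globs-to-any-file: [
          '**/sql/**/*',
          '!python/**/avro/**/*',
          '!python/**/protobuf/**/*',
          '!python/**/streaming/**/*'
        ]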
107 changes: 87 additions & 20 deletions .github/workflows/build_and_test.yml

Large diffs are not rendered by default.

2 changes: 2 additions & 0 deletions .github/workflows/build_branch35.yml
@@ -22,6 +22,7 @@ name: "Build (branch-3.5, Scala 2.13, Hadoop 3, JDK 8)"
on:
schedule:
- cron: '0 11 * * *'
workflow_dispatch:

jobs:
run-build:
@@ -37,6 +38,7 @@ jobs:
envs: >-
{
"SCALA_PROFILE": "scala2.13",
"PYSPARK_IMAGE_TO_TEST": "",
"PYTHON_TO_TEST": "",
"ORACLE_DOCKER_IMAGE_NAME": "gvenzl/oracle-xe:21.3.0"
}
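Two additions recur across the workflow files in this commit. The new workflow_dispatch: trigger, added next to each existing cron schedule, lets the nightly builds also be started manually from the Actions tab. The new "PYSPARK_IMAGE_TO_TEST" entry appears to tell the reusable build workflow which prebuilt PySpark test image to run against; it is left empty for branch-3.5 and set to a concrete image such as python-311 in the master workflows further down. A minimal sketch of the trigger block after this kind of change (cron value copied from this file):

# Sketch of the resulting trigger block; not the full workflow.
on:
  schedule:
    - cron: '0 11 * * *'    # existing nightly schedule
  workflow_dispatch:        # new: allows manual runs from the GitHub Actions UI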
2 changes: 2 additions & 0 deletions .github/workflows/build_branch35_python.yml
@@ -22,6 +22,7 @@ name: "Build / Python-only (branch-3.5)"
on:
schedule:
- cron: '0 11 * * *'
workflow_dispatch:

jobs:
run-build:
@@ -36,6 +37,7 @@ jobs:
hadoop: hadoop3
envs: >-
{
"PYSPARK_IMAGE_TO_TEST": "",
"PYTHON_TO_TEST": ""
}
jobs: >-
2 changes: 2 additions & 0 deletions .github/workflows/build_coverage.yml
@@ -22,6 +22,7 @@ name: "Build / Coverage (master, Scala 2.13, Hadoop 3, JDK 17)"
on:
schedule:
- cron: '0 10 * * *'
workflow_dispatch:

jobs:
run-build:
@@ -36,6 +37,7 @@ jobs:
hadoop: hadoop3
envs: >-
{
"PYSPARK_IMAGE_TO_TEST": "python-311",
"PYTHON_TO_TEST": "python3.11",
"PYSPARK_CODECOV": "true"
}
111 changes: 111 additions & 0 deletions .github/workflows/build_infra_images_cache.yml
@@ -30,9 +30,16 @@ on:
- 'dev/spark-test-image/docs/Dockerfile'
- 'dev/spark-test-image/lint/Dockerfile'
- 'dev/spark-test-image/sparkr/Dockerfile'
- 'dev/spark-test-image/pypy-310/Dockerfile'
- 'dev/spark-test-image/python-309/Dockerfile'
- 'dev/spark-test-image/python-310/Dockerfile'
- 'dev/spark-test-image/python-311/Dockerfile'
- 'dev/spark-test-image/python-312/Dockerfile'
- 'dev/spark-test-image/python-313/Dockerfile'
- '.github/workflows/build_infra_images_cache.yml'
# Create infra image when cutting down branches/tags
create:
workflow_dispatch:
jobs:
main:
if: github.repository == 'apache/spark'
@@ -102,3 +109,107 @@ jobs:
- name: Image digest (SparkR)
if: hashFiles('dev/spark-test-image/sparkr/Dockerfile') != ''
run: echo ${{ steps.docker_build_sparkr.outputs.digest }}
- name: Build and push (PySpark with old dependencies)
if: hashFiles('dev/spark-test-image/python-minimum/Dockerfile') != ''
id: docker_build_pyspark_python_minimum
uses: docker/build-push-action@v6
with:
context: ./dev/spark-test-image/python-minimum/
push: true
tags: ghcr.io/apache/spark/apache-spark-github-action-image-pyspark-python-minimum-cache:${{ github.ref_name }}-static
cache-from: type=registry,ref=ghcr.io/apache/spark/apache-spark-github-action-image-pyspark-python-minimum-cache:${{ github.ref_name }}
cache-to: type=registry,ref=ghcr.io/apache/spark/apache-spark-github-action-image-pyspark-python-minimum-cache:${{ github.ref_name }},mode=max
- name: Image digest (PySpark with old dependencies)
if: hashFiles('dev/spark-test-image/python-minimum/Dockerfile') != ''
run: echo ${{ steps.docker_build_pyspark_python_minimum.outputs.digest }}
- name: Build and push (PySpark PS with old dependencies)
if: hashFiles('dev/spark-test-image/python-ps-minimum/Dockerfile') != ''
id: docker_build_pyspark_python_ps_minimum
uses: docker/build-push-action@v6
with:
context: ./dev/spark-test-image/python-ps-minimum/
push: true
tags: ghcr.io/apache/spark/apache-spark-github-action-image-pyspark-python-ps-minimum-cache:${{ github.ref_name }}-static
cache-from: type=registry,ref=ghcr.io/apache/spark/apache-spark-github-action-image-pyspark-python-ps-minimum-cache:${{ github.ref_name }}
cache-to: type=registry,ref=ghcr.io/apache/spark/apache-spark-github-action-image-pyspark-python-ps-minimum-cache:${{ github.ref_name }},mode=max
- name: Image digest (PySpark PS with old dependencies)
if: hashFiles('dev/spark-test-image/python-ps-minimum/Dockerfile') != ''
run: echo ${{ steps.docker_build_pyspark_python_ps_minimum.outputs.digest }}
- name: Build and push (PySpark with PyPy 3.10)
if: hashFiles('dev/spark-test-image/pypy-310/Dockerfile') != ''
id: docker_build_pyspark_pypy_310
uses: docker/build-push-action@v6
with:
context: ./dev/spark-test-image/pypy-310/
push: true
tags: ghcr.io/apache/spark/apache-spark-github-action-image-pyspark-pypy-310-cache:${{ github.ref_name }}-static
cache-from: type=registry,ref=ghcr.io/apache/spark/apache-spark-github-action-image-pyspark-pypy-310-cache:${{ github.ref_name }}
cache-to: type=registry,ref=ghcr.io/apache/spark/apache-spark-github-action-image-pyspark-pypy-310-cache:${{ github.ref_name }},mode=max
- name: Image digest (PySpark with PyPy 3.10)
if: hashFiles('dev/spark-test-image/pypy-310/Dockerfile') != ''
run: echo ${{ steps.docker_build_pyspark_pypy_310.outputs.digest }}
- name: Build and push (PySpark with Python 3.9)
if: hashFiles('dev/spark-test-image/python-309/Dockerfile') != ''
id: docker_build_pyspark_python_309
uses: docker/build-push-action@v6
with:
context: ./dev/spark-test-image/python-309/
push: true
tags: ghcr.io/apache/spark/apache-spark-github-action-image-pyspark-python-309-cache:${{ github.ref_name }}-static
cache-from: type=registry,ref=ghcr.io/apache/spark/apache-spark-github-action-image-pyspark-python-309-cache:${{ github.ref_name }}
cache-to: type=registry,ref=ghcr.io/apache/spark/apache-spark-github-action-image-pyspark-python-309-cache:${{ github.ref_name }},mode=max
- name: Image digest (PySpark with Python 3.9)
if: hashFiles('dev/spark-test-image/python-309/Dockerfile') != ''
run: echo ${{ steps.docker_build_pyspark_python_309.outputs.digest }}
- name: Build and push (PySpark with Python 3.10)
if: hashFiles('dev/spark-test-image/python-310/Dockerfile') != ''
id: docker_build_pyspark_python_310
uses: docker/build-push-action@v6
with:
context: ./dev/spark-test-image/python-310/
push: true
tags: ghcr.io/apache/spark/apache-spark-github-action-image-pyspark-python-310-cache:${{ github.ref_name }}-static
cache-from: type=registry,ref=ghcr.io/apache/spark/apache-spark-github-action-image-pyspark-python-310-cache:${{ github.ref_name }}
cache-to: type=registry,ref=ghcr.io/apache/spark/apache-spark-github-action-image-pyspark-python-310-cache:${{ github.ref_name }},mode=max
- name: Image digest (PySpark with Python 3.10)
if: hashFiles('dev/spark-test-image/python-310/Dockerfile') != ''
run: echo ${{ steps.docker_build_pyspark_python_310.outputs.digest }}
- name: Build and push (PySpark with Python 3.11)
if: hashFiles('dev/spark-test-image/python-311/Dockerfile') != ''
id: docker_build_pyspark_python_311
uses: docker/build-push-action@v6
with:
context: ./dev/spark-test-image/python-311/
push: true
tags: ghcr.io/apache/spark/apache-spark-github-action-image-pyspark-python-311-cache:${{ github.ref_name }}-static
cache-from: type=registry,ref=ghcr.io/apache/spark/apache-spark-github-action-image-pyspark-python-311-cache:${{ github.ref_name }}
cache-to: type=registry,ref=ghcr.io/apache/spark/apache-spark-github-action-image-pyspark-python-311-cache:${{ github.ref_name }},mode=max
- name: Image digest (PySpark with Python 3.11)
if: hashFiles('dev/spark-test-image/python-311/Dockerfile') != ''
run: echo ${{ steps.docker_build_pyspark_python_311.outputs.digest }}
- name: Build and push (PySpark with Python 3.12)
if: hashFiles('dev/spark-test-image/python-312/Dockerfile') != ''
id: docker_build_pyspark_python_312
uses: docker/build-push-action@v6
with:
context: ./dev/spark-test-image/python-312/
push: true
tags: ghcr.io/apache/spark/apache-spark-github-action-image-pyspark-python-312-cache:${{ github.ref_name }}-static
cache-from: type=registry,ref=ghcr.io/apache/spark/apache-spark-github-action-image-pyspark-python-312-cache:${{ github.ref_name }}
cache-to: type=registry,ref=ghcr.io/apache/spark/apache-spark-github-action-image-pyspark-python-312-cache:${{ github.ref_name }},mode=max
- name: Image digest (PySpark with Python 3.12)
if: hashFiles('dev/spark-test-image/python-312/Dockerfile') != ''
run: echo ${{ steps.docker_build_pyspark_python_312.outputs.digest }}
- name: Build and push (PySpark with Python 3.13)
if: hashFiles('dev/spark-test-image/python-313/Dockerfile') != ''
id: docker_build_pyspark_python_313
uses: docker/build-push-action@v6
with:
context: ./dev/spark-test-image/python-313/
push: true
tags: ghcr.io/apache/spark/apache-spark-github-action-image-pyspark-python-313-cache:${{ github.ref_name }}-static
cache-from: type=registry,ref=ghcr.io/apache/spark/apache-spark-github-action-image-pyspark-python-313-cache:${{ github.ref_name }}
cache-to: type=registry,ref=ghcr.io/apache/spark/apache-spark-github-action-image-pyspark-python-313-cache:${{ github.ref_name }},mode=max
- name: Image digest (PySpark with Python 3.13)
if: hashFiles('dev/spark-test-image/python-313/Dockerfile') != ''
run: echo ${{ steps.docker_build_pyspark_python_313.outputs.digest }}
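
The eight new steps above all share the same shape; only the Python flavour changes. Below is a condensed restatement of one of them, with comments on what the repeated settings do (image names and paths are copied from the hunk above; the comments are my reading of the configuration):

# Representative cache-image step, condensed from the workflow above.
- name: Build and push (PySpark with Python 3.11)
  # Skip the step entirely when the corresponding Dockerfile does not exist.
  if: hashFiles('dev/spark-test-image/python-311/Dockerfile') != ''
  id: docker_build_pyspark_python_311
  uses: docker/build-push-action@v6
  with:
    context: ./dev/spark-test-image/python-311/
    push: true
    # The pushed image carries a "-static" tag, while the un-suffixed tag serves as a
    # registry-side layer cache; mode=max exports all intermediate layers to that cache.
    tags: ghcr.io/apache/spark/apache-spark-github-action-image-pyspark-python-311-cache:${{ github.ref_name }}-static
    cache-from: type=registry,ref=ghcr.io/apache/spark/apache-spark-github-action-image-pyspark-python-311-cache:${{ github.ref_name }}
    cache-to: type=registry,ref=ghcr.io/apache/spark/apache-spark-github-action-image-pyspark-python-311-cache:${{ github.ref_name }},mode=max
# Each build step is paired with an "Image digest" step that echoes the pushed digest.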
3 changes: 3 additions & 0 deletions .github/workflows/build_java21.yml
@@ -22,6 +22,7 @@ name: "Build (master, Scala 2.13, Hadoop 3, JDK 21)"
on:
schedule:
- cron: '0 4 * * *'
workflow_dispatch:

jobs:
run-build:
@@ -36,6 +37,8 @@ jobs:
hadoop: hadoop3
envs: >-
{
"PYSPARK_IMAGE_TO_TEST": "python-311",
"PYTHON_TO_TEST": "python3.11",
"SKIP_MIMA": "true",
"SKIP_UNIDOC": "true",
"DEDICATED_JVM_SBT_TESTS": "org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormatV1Suite,org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormatV2Suite,org.apache.spark.sql.execution.datasources.orc.OrcSourceV1Suite,org.apache.spark.sql.execution.datasources.orc.OrcSourceV2Suite"
1 change: 1 addition & 0 deletions .github/workflows/build_maven.yml
@@ -22,6 +22,7 @@ name: "Build / Maven (master, Scala 2.13, Hadoop 3, JDK 17)"
on:
schedule:
- cron: '0 13 * * *'
workflow_dispatch:

jobs:
run-build:
1 change: 1 addition & 0 deletions .github/workflows/build_maven_java21.yml
@@ -22,6 +22,7 @@ name: "Build / Maven (master, Scala 2.13, Hadoop 3, JDK 21)"
on:
schedule:
- cron: '0 14 * * *'
workflow_dispatch:

jobs:
run-build:
1 change: 1 addition & 0 deletions .github/workflows/build_maven_java21_macos15.yml
@@ -22,6 +22,7 @@ name: "Build / Maven (master, Scala 2.13, Hadoop 3, JDK 21, MacOS-15)"
on:
schedule:
- cron: '0 20 */2 * *'
workflow_dispatch:

jobs:
run-build:
3 changes: 3 additions & 0 deletions .github/workflows/build_non_ansi.yml
@@ -22,6 +22,7 @@ name: "Build / Non-ANSI (master, Hadoop 3, JDK 17, Scala 2.13)"
on:
schedule:
- cron: '0 1 * * *'
workflow_dispatch:

jobs:
run-build:
@@ -36,6 +37,8 @@ jobs:
hadoop: hadoop3
envs: >-
{
"PYSPARK_IMAGE_TO_TEST": "python-311",
"PYTHON_TO_TEST": "python3.11",
"SPARK_ANSI_SQL_MODE": "false",
}
jobs: >-
2 changes: 2 additions & 0 deletions .github/workflows/build_python_3.10.yml
@@ -22,6 +22,7 @@ name: "Build / Python-only (master, Python 3.10)"
on:
schedule:
- cron: '0 17 * * *'
workflow_dispatch:

jobs:
run-build:
@@ -36,6 +37,7 @@ jobs:
hadoop: hadoop3
envs: >-
{
"PYSPARK_IMAGE_TO_TEST": "python-310",
"PYTHON_TO_TEST": "python3.10"
}
jobs: >-
1 change: 1 addition & 0 deletions .github/workflows/build_python_3.11_macos.yml
@@ -22,6 +22,7 @@ name: "Build / Python-only (master, Python 3.11, MacOS)"
on:
schedule:
- cron: '0 21 * * *'
workflow_dispatch:

jobs:
run-build:
2 changes: 2 additions & 0 deletions .github/workflows/build_python_3.12.yml
@@ -22,6 +22,7 @@ name: "Build / Python-only (master, Python 3.12)"
on:
schedule:
- cron: '0 19 * * *'
workflow_dispatch:

jobs:
run-build:
@@ -36,6 +37,7 @@ jobs:
hadoop: hadoop3
envs: >-
{
"PYSPARK_IMAGE_TO_TEST": "python-312",
"PYTHON_TO_TEST": "python3.12"
}
jobs: >-
2 changes: 2 additions & 0 deletions .github/workflows/build_python_3.13.yml
@@ -22,6 +22,7 @@ name: "Build / Python-only (master, Python 3.13)"
on:
schedule:
- cron: '0 20 * * *'
workflow_dispatch:

jobs:
run-build:
@@ -36,6 +37,7 @@ jobs:
hadoop: hadoop3
envs: >-
{
"PYSPARK_IMAGE_TO_TEST": "python-313",
"PYTHON_TO_TEST": "python3.13"
}
jobs: >-
2 changes: 2 additions & 0 deletions .github/workflows/build_python_3.9.yml
@@ -22,6 +22,7 @@ name: "Build / Python-only (master, Python 3.9)"
on:
schedule:
- cron: '0 21 * * *'
workflow_dispatch:

jobs:
run-build:
@@ -36,6 +37,7 @@ jobs:
hadoop: hadoop3
envs: >-
{
"PYSPARK_IMAGE_TO_TEST": "python-309",
"PYTHON_TO_TEST": "python3.9"
}
jobs: >-
7 changes: 4 additions & 3 deletions .github/workflows/build_python_connect.yml
@@ -22,6 +22,7 @@ name: Build / Spark Connect Python-only (master, Python 3.11)
on:
schedule:
- cron: '0 19 * * *'
workflow_dispatch:

jobs:
# Build: build Spark and run the tests for specified modules using SBT
@@ -82,7 +83,7 @@ jobs:
 sed -i 's/rootLogger.level = info/rootLogger.level = warn/g' conf/log4j2.properties
 # Start a Spark Connect server for local
-PYTHONPATH="python/lib/pyspark.zip:python/lib/py4j-0.10.9.7-src.zip:$PYTHONPATH" ./sbin/start-connect-server.sh \
+PYTHONPATH="python/lib/pyspark.zip:python/lib/py4j-0.10.9.9-src.zip:$PYTHONPATH" ./sbin/start-connect-server.sh \
 --driver-java-options "-Dlog4j.configurationFile=file:$GITHUB_WORKSPACE/conf/log4j2.properties" \
 --jars "`find connector/protobuf/target -name spark-protobuf-*SNAPSHOT.jar`,`find connector/avro/target -name spark-avro*SNAPSHOT.jar`"
@@ -93,15 +94,15 @@ jobs:
 # Several tests related to catalog requires to run them sequencially, e.g., writing a table in a listener.
 ./python/run-tests --parallelism=1 --python-executables=python3 --modules pyspark-connect,pyspark-ml-connect
 # None of tests are dependent on each other in Pandas API on Spark so run them in parallel
-./python/run-tests --parallelism=2 --python-executables=python3 --modules pyspark-pandas-connect-part0,pyspark-pandas-connect-part1,pyspark-pandas-connect-part2,pyspark-pandas-connect-part3
+./python/run-tests --parallelism=1 --python-executables=python3 --modules pyspark-pandas-connect-part0,pyspark-pandas-connect-part1,pyspark-pandas-connect-part2,pyspark-pandas-connect-part3
 # Stop Spark Connect server.
 ./sbin/stop-connect-server.sh
 mv lib.back python/lib
 mv pyspark.back python/pyspark
 # Start a Spark Connect server for local-cluster
-PYTHONPATH="python/lib/pyspark.zip:python/lib/py4j-0.10.9.7-src.zip:$PYTHONPATH" ./sbin/start-connect-server.sh \
+PYTHONPATH="python/lib/pyspark.zip:python/lib/py4j-0.10.9.9-src.zip:$PYTHONPATH" ./sbin/start-connect-server.sh \
 --master "local-cluster[2, 4, 1024]" \
 --driver-java-options "-Dlog4j.configurationFile=file:$GITHUB_WORKSPACE/conf/log4j2.properties" \
 --jars "`find connector/protobuf/target -name spark-protobuf-*SNAPSHOT.jar`,`find connector/avro/target -name spark-avro*SNAPSHOT.jar`"