diff --git a/.github/workflows/docker-publish.yml b/.github/workflows/docker-publish.yml index 63b9d0cd37..e3bae668c6 100644 --- a/.github/workflows/docker-publish.yml +++ b/.github/workflows/docker-publish.yml @@ -74,6 +74,6 @@ jobs: with: platforms: linux/amd64,linux/arm64 push: true - tags: ghcr.io/apache/datafusion-comet:spark-3.5-scala-2.12-${{ env.COMET_VERSION }} + tags: ghcr.io/apache/datafusion-comet:spark-4.1-scala-2.13-${{ env.COMET_VERSION }} file: kube/Dockerfile no-cache: true diff --git a/.github/workflows/pr_build_linux.yml b/.github/workflows/pr_build_linux.yml index 86d63a75c2..2826aeeecc 100644 --- a/.github/workflows/pr_build_linux.yml +++ b/.github/workflows/pr_build_linux.yml @@ -451,6 +451,8 @@ jobs: runs-on: ${{ github.repository_owner == 'apache' && format('runs-on={0},family=m8a+m7a+c8a,cpu=16,image=ubuntu24-full-x64,extras=s3-cache,disk=large,tag=datafusion-comet', github.run_id) || 'ubuntu-latest' }} container: image: amd64/rust + env: + JAVA_TOOL_OPTIONS: --add-exports=java.base/sun.nio.ch=ALL-UNNAMED --add-exports=java.base/sun.util.calendar=ALL-UNNAMED --add-opens=java.base/java.lang=ALL-UNNAMED --add-opens=java.base/java.lang.invoke=ALL-UNNAMED --add-opens=java.base/java.lang.reflect=ALL-UNNAMED --add-opens=java.base/java.io=ALL-UNNAMED --add-opens=java.base/java.net=ALL-UNNAMED --add-opens=java.base/java.nio=ALL-UNNAMED --add-opens=java.base/java.util=ALL-UNNAMED --add-opens=java.base/java.util.concurrent=ALL-UNNAMED --add-opens=java.base/java.util.concurrent.atomic=ALL-UNNAMED --add-opens=java.base/jdk.internal.ref=ALL-UNNAMED --add-opens=java.base/sun.nio.ch=ALL-UNNAMED --add-opens=java.base/sun.nio.cs=ALL-UNNAMED --add-opens=java.base/sun.security.action=ALL-UNNAMED --add-opens=java.base/sun.util.calendar=ALL-UNNAMED steps: - uses: runs-on/action@742bf56072eb4845a0f94b3394673e4903c90ff0 # v2.1.0 @@ -460,7 +462,7 @@ jobs: uses: ./.github/actions/setup-builder with: rust-version: ${{ env.RUST_VERSION }} - jdk-version: 11 + 
jdk-version: 17 - name: Download native library uses: actions/download-artifact@v8 @@ -505,6 +507,8 @@ jobs: runs-on: ${{ github.repository_owner == 'apache' && format('runs-on={0},family=m8a+m7a+c8a,cpu=16,image=ubuntu24-full-x64,extras=s3-cache,disk=large,tag=datafusion-comet', github.run_id) || 'ubuntu-latest' }} container: image: amd64/rust + env: + JAVA_TOOL_OPTIONS: --add-exports=java.base/sun.nio.ch=ALL-UNNAMED --add-exports=java.base/sun.util.calendar=ALL-UNNAMED --add-opens=java.base/java.lang=ALL-UNNAMED --add-opens=java.base/java.lang.invoke=ALL-UNNAMED --add-opens=java.base/java.lang.reflect=ALL-UNNAMED --add-opens=java.base/java.io=ALL-UNNAMED --add-opens=java.base/java.net=ALL-UNNAMED --add-opens=java.base/java.nio=ALL-UNNAMED --add-opens=java.base/java.util=ALL-UNNAMED --add-opens=java.base/java.util.concurrent=ALL-UNNAMED --add-opens=java.base/java.util.concurrent.atomic=ALL-UNNAMED --add-opens=java.base/jdk.internal.ref=ALL-UNNAMED --add-opens=java.base/sun.nio.ch=ALL-UNNAMED --add-opens=java.base/sun.nio.cs=ALL-UNNAMED --add-opens=java.base/sun.security.action=ALL-UNNAMED --add-opens=java.base/sun.util.calendar=ALL-UNNAMED strategy: matrix: join: [sort_merge, broadcast, hash] @@ -518,7 +522,7 @@ jobs: uses: ./.github/actions/setup-builder with: rust-version: ${{ env.RUST_VERSION }} - jdk-version: 11 + jdk-version: 17 - name: Download native library uses: actions/download-artifact@v8 diff --git a/docs/source/contributor-guide/benchmarking_aws_ec2.md b/docs/source/contributor-guide/benchmarking_aws_ec2.md index 81f15d64ea..bb92a8958f 100644 --- a/docs/source/contributor-guide/benchmarking_aws_ec2.md +++ b/docs/source/contributor-guide/benchmarking_aws_ec2.md @@ -104,7 +104,7 @@ make release Set `COMET_JAR` environment variable. 
```shell -export COMET_JAR=/home/ec2-user/datafusion-comet/spark/target/comet-spark-spark3.5_2.12-$COMET_VERSION.jar +export COMET_JAR=/home/ec2-user/datafusion-comet/spark/target/comet-spark-spark4.1_2.13-$COMET_VERSION.jar ``` ## Run Benchmarks diff --git a/docs/source/contributor-guide/benchmarking_macos.md b/docs/source/contributor-guide/benchmarking_macos.md index e75261e8d5..4c37a32a26 100644 --- a/docs/source/contributor-guide/benchmarking_macos.md +++ b/docs/source/contributor-guide/benchmarking_macos.md @@ -55,13 +55,13 @@ export DF_BENCH=`pwd` ## Install Spark -Install Apache Spark. This example refers to 3.5.4 version. +Install Apache Spark. This example uses version 4.1.1. ```shell -wget https://archive.apache.org/dist/spark/spark-3.5.4/spark-3.5.4-bin-hadoop3.tgz -tar xzf spark-3.5.4-bin-hadoop3.tgz -sudo mv spark-3.5.4-bin-hadoop3 /opt -export SPARK_HOME=/opt/spark-3.5.4-bin-hadoop3/ +wget https://archive.apache.org/dist/spark/spark-4.1.1/spark-4.1.1-bin-hadoop3.tgz +tar xzf spark-4.1.1-bin-hadoop3.tgz +sudo mv spark-4.1.1-bin-hadoop3 /opt +export SPARK_HOME=/opt/spark-4.1.1-bin-hadoop3/ ``` Start Spark in standalone mode: @@ -129,7 +129,7 @@ make release COMET_FEATURES=mimalloc Set `COMET_JAR` to point to the location of the Comet jar file. Example for Comet 0.8 ```shell -export COMET_JAR=`pwd`/spark/target/comet-spark-spark3.5_2.12-0.8.0-SNAPSHOT.jar +export COMET_JAR=`pwd`/spark/target/comet-spark-spark4.1_2.13-0.8.0-SNAPSHOT.jar ``` Run the following command (the `--data` parameter will need to be updated to point to your S3 bucket): diff --git a/docs/source/contributor-guide/benchmarking_spark_sql_perf.md b/docs/source/contributor-guide/benchmarking_spark_sql_perf.md index 538539759c..d893319443 100644 --- a/docs/source/contributor-guide/benchmarking_spark_sql_perf.md +++ b/docs/source/contributor-guide/benchmarking_spark_sql_perf.md @@ -34,8 +34,8 @@ partitioning and writing to Parquet format automatically.
## Prerequisites -- Java 17 (for Spark 3.5+) -- Apache Spark 3.5.x +- Java 17 +- Apache Spark 4.1.x - SBT (Scala Build Tool) - C compiler toolchain (`gcc`, `make`, `flex`, `bison`, `byacc`) @@ -225,7 +225,7 @@ Build Comet from source and launch `spark-shell` with both the Comet and spark-s ```shell make release -export COMET_JAR=$(pwd)/spark/target/comet-spark-spark3.5_2.12-*.jar +export COMET_JAR=$(pwd)/spark/target/comet-spark-spark4.1_2.13-*.jar $SPARK_HOME/bin/spark-shell \ --master $SPARK_MASTER \ diff --git a/docs/source/contributor-guide/debugging.md b/docs/source/contributor-guide/debugging.md index 3356a83893..e5372d922d 100644 --- a/docs/source/contributor-guide/debugging.md +++ b/docs/source/contributor-guide/debugging.md @@ -136,7 +136,7 @@ make release COMET_FEATURES=backtrace Set `RUST_BACKTRACE=1` for the Spark worker/executor process, or for `spark-submit` if running in local mode. ```console -RUST_BACKTRACE=1 $SPARK_HOME/spark-shell --jars spark/target/comet-spark-spark3.5_2.12-$COMET_VERSION.jar --conf spark.plugins=org.apache.spark.CometPlugin --conf spark.comet.enabled=true --conf spark.comet.exec.enabled=true +RUST_BACKTRACE=1 $SPARK_HOME/spark-shell --jars spark/target/comet-spark-spark4.1_2.13-$COMET_VERSION.jar --conf spark.plugins=org.apache.spark.CometPlugin --conf spark.comet.enabled=true --conf spark.comet.exec.enabled=true ``` Get the expanded exception details diff --git a/docs/source/contributor-guide/iceberg-spark-tests.md b/docs/source/contributor-guide/iceberg-spark-tests.md index 38becc0208..f37ee4f1f3 100644 --- a/docs/source/contributor-guide/iceberg-spark-tests.md +++ b/docs/source/contributor-guide/iceberg-spark-tests.md @@ -40,7 +40,7 @@ Here is an overview of the changes that the diffs make to Iceberg: Run `make release` in Comet to install the Comet JAR into the local Maven repository, specifying the Spark version. ```shell -PROFILES="-Pspark-3.5" make release +PROFILES="-Pspark-4.1" make release ``` ## 2. 
Clone Iceberg and Apply Diff diff --git a/docs/source/user-guide/latest/datasources.md b/docs/source/user-guide/latest/datasources.md index 572beb1e02..bbf45dba96 100644 --- a/docs/source/user-guide/latest/datasources.md +++ b/docs/source/user-guide/latest/datasources.md @@ -69,12 +69,12 @@ Unlike to native Comet reader the Datafusion reader fully supports nested types To build Comet with native DataFusion reader and remote HDFS support it is required to have a JDK installed Example: -Build a Comet for `spark-3.5` provide a JDK path in `JAVA_HOME` +To build Comet for `spark-4.1`, provide a JDK path in `JAVA_HOME`. Provide the JRE linker path in `RUSTFLAGS`, the path can vary depending on the system. Typically JRE linker is a part of installed JDK ```shell -export JAVA_HOME="/opt/homebrew/opt/openjdk@11" -make release PROFILES="-Pspark-3.5" COMET_FEATURES=hdfs RUSTFLAGS="-L $JAVA_HOME/libexec/openjdk.jdk/Contents/Home/lib/server" +export JAVA_HOME="/opt/homebrew/opt/openjdk@17" +make release PROFILES="-Pspark-4.1" COMET_FEATURES=hdfs RUSTFLAGS="-L $JAVA_HOME/libexec/openjdk.jdk/Contents/Home/lib/server" ``` Start Comet with experimental reader and HDFS support as [described](installation.md/#run-spark-shell-with-comet-enabled) @@ -149,7 +149,7 @@ docker compose -f kube/local/hdfs-docker-compose.yml up - Build a project with HDFS support ```shell -JAVA_HOME="/opt/homebrew/opt/openjdk@11" make release PROFILES="-Pspark-3.5" COMET_FEATURES=hdfs RUSTFLAGS="-L /opt/homebrew/opt/openjdk@11/libexec/openjdk.jdk/Contents/Home/lib/server" +JAVA_HOME="/opt/homebrew/opt/openjdk@17" make release PROFILES="-Pspark-4.1" COMET_FEATURES=hdfs RUSTFLAGS="-L /opt/homebrew/opt/openjdk@17/libexec/openjdk.jdk/Contents/Home/lib/server" ``` - Run local test diff --git a/docs/source/user-guide/latest/iceberg.md b/docs/source/user-guide/latest/iceberg.md index 24a4bda057..cb6fdab2c9 100644 --- a/docs/source/user-guide/latest/iceberg.md +++ b/docs/source/user-guide/latest/iceberg.md @@ -31,7
+31,7 @@ reader is enabled by default. To disable it, set `spark.comet.scan.icebergNative ```shell $SPARK_HOME/bin/spark-shell \ - --packages org.apache.datafusion:comet-spark-spark3.5_2.12:0.14.0,org.apache.iceberg:iceberg-spark-runtime-3.5_2.12:1.8.1,org.apache.iceberg:iceberg-core:1.8.1 \ + --packages org.apache.datafusion:comet-spark-spark3.5_2.12:0.14.0,org.apache.iceberg:iceberg-spark-runtime-3.5_2.12:1.8.1,org.apache.iceberg:iceberg-core:1.8.1 \ --repositories https://repo1.maven.org/maven2/ \ --conf spark.sql.extensions=org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions \ --conf spark.sql.catalog.spark_catalog=org.apache.iceberg.spark.SparkCatalog \ @@ -106,7 +106,7 @@ configure Spark to use a REST catalog with Comet's native Iceberg scan: ```shell $SPARK_HOME/bin/spark-shell \ - --packages org.apache.datafusion:comet-spark-spark3.5_2.12:0.14.0,org.apache.iceberg:iceberg-spark-runtime-3.5_2.12:1.8.1,org.apache.iceberg:iceberg-core:1.8.1 \ + --packages org.apache.datafusion:comet-spark-spark3.5_2.12:0.14.0,org.apache.iceberg:iceberg-spark-runtime-3.5_2.12:1.8.1,org.apache.iceberg:iceberg-core:1.8.1 \ --repositories https://repo1.maven.org/maven2/ \ --conf spark.sql.extensions=org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions \ --conf spark.sql.catalog.rest_cat=org.apache.iceberg.spark.SparkCatalog \ diff --git a/docs/source/user-guide/latest/installation.md b/docs/source/user-guide/latest/installation.md index 71f5a3d0ce..3da84e210d 100644 --- a/docs/source/user-guide/latest/installation.md +++ b/docs/source/user-guide/latest/installation.md @@ -85,7 +85,7 @@ Here are the direct links for downloading the Comet $COMET_VERSION jar file.
- [Comet plugin for Spark 3.5 / Scala 2.12](https://repo1.maven.org/maven2/org/apache/datafusion/comet-spark-spark3.5_2.12/$COMET_VERSION/comet-spark-spark3.5_2.12-$COMET_VERSION.jar) - [Comet plugin for Spark 3.5 / Scala 2.13](https://repo1.maven.org/maven2/org/apache/datafusion/comet-spark-spark3.5_2.13/$COMET_VERSION/comet-spark-spark3.5_2.13-$COMET_VERSION.jar) - [Comet plugin for Spark 4.0 / Scala 2.13](https://repo1.maven.org/maven2/org/apache/datafusion/comet-spark-spark4.0_2.13/$COMET_VERSION/comet-spark-spark4.0_2.13-$COMET_VERSION.jar) -- [Comet plugin for Spark 4.1 / Scala 2.13 (Experimental)](https://repo1.maven.org/maven2/org/apache/datafusion/comet-spark-spark4.1_2.13/$COMET_VERSION/comet-spark-spark4.1_2.13-$COMET_VERSION.jar) +- [Comet plugin for Spark 4.1 / Scala 2.13](https://repo1.maven.org/maven2/org/apache/datafusion/comet-spark-spark4.1_2.13/$COMET_VERSION/comet-spark-spark4.1_2.13-$COMET_VERSION.jar) - [Comet plugin for Spark 4.2 / Scala 2.13 (Experimental)](https://repo1.maven.org/maven2/org/apache/datafusion/comet-spark-spark4.2_2.13/$COMET_VERSION/comet-spark-spark4.2_2.13-$COMET_VERSION.jar) @@ -105,7 +105,7 @@ See the [Comet Kubernetes Guide](kubernetes.md) guide. Make sure `SPARK_HOME` points to the same Spark version as Comet was built for. ```shell -export COMET_JAR=spark/target/comet-spark-spark3.5_2.12-$COMET_VERSION.jar +export COMET_JAR=spark/target/comet-spark-spark4.1_2.13-$COMET_VERSION.jar $SPARK_HOME/bin/spark-shell \ --jars $COMET_JAR \ @@ -161,7 +161,7 @@ explicitly contain Comet otherwise Spark may use a different class-loader for th components which will then fail at runtime. 
For example: ``` ---driver-class-path spark/target/comet-spark-spark3.5_2.12-$COMET_VERSION.jar +--driver-class-path spark/target/comet-spark-spark4.1_2.13-$COMET_VERSION.jar ``` Some cluster managers may require additional configuration, see diff --git a/docs/source/user-guide/latest/kubernetes.md b/docs/source/user-guide/latest/kubernetes.md index 2fb037d630..fd84b7ad9b 100644 --- a/docs/source/user-guide/latest/kubernetes.md +++ b/docs/source/user-guide/latest/kubernetes.md @@ -69,30 +69,30 @@ metadata: spec: type: Scala mode: cluster - image: apache/datafusion-comet:0.7.0-spark3.5.5-scala2.12-java11 + image: apache/datafusion-comet:0.7.0-spark4.1.1-scala2.13-java17 imagePullPolicy: IfNotPresent mainClass: org.apache.spark.examples.SparkPi - mainApplicationFile: local:///opt/spark/examples/jars/spark-examples_2.12-3.5.5.jar + mainApplicationFile: local:///opt/spark/examples/jars/spark-examples_2.13-4.1.1.jar sparkConf: - "spark.executor.extraClassPath": "/opt/spark/jars/comet-spark-spark3.5_2.12-0.7.0.jar" - "spark.driver.extraClassPath": "/opt/spark/jars/comet-spark-spark3.5_2.12-0.7.0.jar" + "spark.executor.extraClassPath": "/opt/spark/jars/comet-spark-spark4.1_2.13-0.7.0.jar" + "spark.driver.extraClassPath": "/opt/spark/jars/comet-spark-spark4.1_2.13-0.7.0.jar" "spark.plugins": "org.apache.spark.CometPlugin" "spark.comet.enabled": "true" "spark.comet.exec.enabled": "true" "spark.comet.exec.shuffle.enabled": "true" "spark.comet.exec.shuffle.mode": "auto" "spark.shuffle.manager": "org.apache.spark.sql.comet.execution.shuffle.CometShuffleManager" - sparkVersion: 3.5.6 + sparkVersion: 4.1.1 driver: labels: - version: 3.5.6 + version: 4.1.1 cores: 1 coreLimit: 1200m memory: 512m serviceAccount: spark-operator-spark executor: labels: - version: 3.5.6 + version: 4.1.1 instances: 1 cores: 1 coreLimit: 1200m diff --git a/docs/source/user-guide/latest/source.md b/docs/source/user-guide/latest/source.md index eb56e1f21b..6ae43be56a 100644 --- 
a/docs/source/user-guide/latest/source.md +++ b/docs/source/user-guide/latest/source.md @@ -38,7 +38,7 @@ cd apache-datafusion-comet-$COMET_VERSION Build ```console -make release-nogit PROFILES="-Pspark-3.5" +make release-nogit PROFILES="-Pspark-4.1" ``` ## Building from the GitHub repository @@ -53,17 +53,17 @@ Build Comet for a specific Spark version: ```console cd datafusion-comet -make release PROFILES="-Pspark-3.5" +make release PROFILES="-Pspark-4.1" ``` -Note that the project builds for Scala 2.12 by default but can be built for Scala 2.13 using an additional profile: +Note that the project builds for Scala 2.13 by default but can be built for Scala 2.12 using an additional profile: ```console -make release PROFILES="-Pspark-3.5 -Pscala-2.13" +make release PROFILES="-Pspark-3.5 -Pscala-2.12" ``` To build Comet from the source distribution on an isolated environment without an access to `github.com` it is necessary to disable `git-commit-id-maven-plugin`, otherwise you will face errors that there is no access to the git during the build process. In that case you may use: ```console -make release-nogit PROFILES="-Pspark-3.5" +make release-nogit PROFILES="-Pspark-4.1" ``` diff --git a/kube/Dockerfile b/kube/Dockerfile index 699aeeb210..d13b401a01 100644 --- a/kube/Dockerfile +++ b/kube/Dockerfile @@ -15,14 +15,14 @@ # limitations under the License. 
# -FROM apache/spark:3.5.8 AS builder +FROM apache/spark:4.1.1 AS builder USER root -# Installing JDK11 as the image comes with JRE +# Installing JDK17 as the image comes with JRE RUN apt update \ && apt install -y curl \ - && apt install -y openjdk-11-jdk \ + && apt install -y openjdk-17-jdk \ && apt clean RUN apt install -y gcc-10 g++-10 cpp-10 unzip @@ -37,8 +37,8 @@ ENV PATH="$PATH:/root/.local/bin" RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y ENV PATH="/root/.cargo/bin:${PATH}" ENV RUSTFLAGS="-C debuginfo=line-tables-only -C incremental=false" -ENV SPARK_VERSION=3.5 -ENV SCALA_VERSION=2.12 +ENV SPARK_VERSION=4.1 +ENV SCALA_VERSION=2.13 # copy source files to Docker image RUN mkdir /comet @@ -70,9 +70,9 @@ RUN mkdir -p /root/.m2 && \ RUN cd /comet \ && JAVA_HOME=$(readlink -f $(which javac) | sed "s/\/bin\/javac//") make release-nogit PROFILES="-Pspark-$SPARK_VERSION -Pscala-$SCALA_VERSION" -FROM apache/spark:3.5.8 -ENV SPARK_VERSION=3.5 -ENV SCALA_VERSION=2.12 +FROM apache/spark:4.1.1 +ENV SPARK_VERSION=4.1 +ENV SCALA_VERSION=2.13 USER root # note the use of a wildcard in the file name so that this works with both snapshot and final release versions diff --git a/pom.xml b/pom.xml index b83a6fd45b..f378bee7a8 100644 --- a/pom.xml +++ b/pom.xml @@ -65,24 +65,24 @@ under the License. 1.7.0 3.6.1 0.16.1 - 2.12.18 - 2.12 + 2.13.17 + 2.13 4.9.6 3.2.16 2.2.0 - 3.5.8 - 3.5 + 4.1.1 + 4.1 provided 3.25.5 - 1.13.1 + 1.16.0 provided 3.3.4 18.3.0 1.9.13 2.43.0 0.8.11 - 4.8.8 - 2.0.7 + 4.13.6 + 2.0.17 33.2.1-jre 1.21.0 2.31.51 @@ -116,8 +116,8 @@ under the License. -Djdk.reflect.useDirectMethodHandle=false -ea -Xmx4g -Xss4m ${extraJavaTestArgs} - spark-3.x - spark-3.5 + spark-4.x + spark-4.1 @@ -635,10 +635,13 @@ under the License. spark-3.4 2.12.17 + 2.12 3.4.3 3.4 1.13.1 + 4.8.8 2.0.6 + spark-3.x spark-3.4 11 ${java.version} @@ -650,10 +653,13 @@ under the License. 
spark-3.5 2.12.18 + 2.12 3.5.8 3.5 1.13.1 + 4.8.8 2.0.7 + spark-3.x spark-3.5 11 ${java.version} @@ -662,10 +668,8 @@ under the License. - spark-4.0 - 2.13.16 2.13 4.0.2 @@ -675,7 +679,6 @@ under the License. 2.0.16 spark-4.x spark-4.0 - 17 ${java.version} ${java.version} @@ -683,7 +686,6 @@ under the License. - spark-4.1 17 ${java.version} ${java.version} @@ -729,6 +730,11 @@ under the License. scala-2.12 + + 2.12.18 + 2.12 + 4.8.8 + diff --git a/spark/pom.xml b/spark/pom.xml index d6d990c906..1b2179c80e 100644 --- a/spark/pom.xml +++ b/spark/pom.xml @@ -223,9 +223,6 @@ under the License. spark-3.5 - - true - org.apache.iceberg @@ -275,6 +272,9 @@ under the License. spark-4.1 + + true +