From 42d965b3049fea61cda13fd0cd2a059b71a84bf2 Mon Sep 17 00:00:00 2001 From: gnehil Date: Fri, 19 Apr 2024 10:20:56 +0800 Subject: [PATCH 01/45] init --- .../spark/load/RecordBatchInputStream.java | 2 +- .../doris/spark/load/StreamLoader.scala | 6 +- spark-load/build.sh | 0 spark-load/pom.xml | 370 ++++++ spark-load/spark-dpp/.flattened-pom.xml | 409 ++++++ .../spark-dpp/dependency-reduced-pom.xml | 237 ++++ spark-load/spark-dpp/pom.xml | 273 ++++ .../doris/common/SparkDppException.java | 39 + .../doris/load/loadv2/dpp/ColumnParser.java | 297 +++++ .../load/loadv2/dpp/DorisKryoRegistrator.java | 36 + .../loadv2/dpp/DorisRangePartitioner.java | 89 ++ .../doris/load/loadv2/dpp/DppColumns.java | 108 ++ .../doris/load/loadv2/dpp/DppUtils.java | 299 +++++ .../load/loadv2/dpp/GlobalDictBuilder.java | 432 ++++++ .../dpp/MinimumCoverageRollupTreeBuilder.java | 127 ++ .../load/loadv2/dpp/RollupTreeBuilder.java | 25 + .../doris/load/loadv2/dpp/RollupTreeNode.java | 53 + .../doris/load/loadv2/dpp/SparkDpp.java | 1167 +++++++++++++++++ .../load/loadv2/dpp/SparkRDDAggregator.java | 607 +++++++++ .../load/loadv2/dpp/StringAccumulator.java | 65 + .../doris/load/loadv2/etl/SparkEtlJob.java | 288 ++++ .../load/loadv2/dpp/ColumnParserTest.java | 136 ++ .../loadv2/dpp/DorisRangePartitionerTest.java | 135 ++ .../doris/load/loadv2/dpp/DppUtilsTest.java | 238 ++++ .../MinimumCoverageRollupTreeBuilderTest.java | 109 ++ .../doris/load/loadv2/dpp/SparkDppTest.java | 67 + .../load/loadv2/etl/SparkEtlJobTest.java | 194 +++ spark-load/spark-load-core/pom.xml | 94 ++ .../org/apache/doris/SparkLoadRunner.java | 129 ++ .../org/apache/doris/client/DorisClient.java | 185 +++ .../doris/common/CommandLineOptions.java | 16 + .../org/apache/doris/common/Constants.java | 7 + .../org/apache/doris/common/DppResult.java | 54 + .../org/apache/doris/common/JobStatus.java | 9 + .../org/apache/doris/common/LoadInfo.java | 18 + .../org/apache/doris/common/LoadMode.java | 5 + .../apache/doris/common/ResponseEntity.java | 14 + .../doris/common/SparkLoadException.java | 12 + .../doris/common/meta/LoadInfoResponse.java | 14 + .../apache/doris/common/meta/LoadMeta.java | 72 + .../apache/doris/common/meta/TableMeta.java | 67 + .../org/apache/doris/config/JobConfig.java | 188 +++ .../org/apache/doris/config/TaskType.java | 8 + .../org/apache/doris/load/JobMonitor.java | 13 + .../org/apache/doris/load/LoadManager.java | 37 + .../apache/doris/load/TransactionManager.java | 17 + .../org/apache/doris/load/job/Loader.java | 92 ++ .../org/apache/doris/load/job/PullLoader.java | 350 +++++ .../apache/doris/load/job/Recoverable.java | 11 + .../java/org/apache/doris/util/DateUtils.java | 21 + .../org/apache/doris/util/HadoopUtils.java | 99 ++ .../java/org/apache/doris/util/HttpUtils.java | 42 + .../java/org/apache/doris/util/JsonUtils.java | 49 + .../src/main/resources/log4j.properties | 8 + 54 files changed, 7434 insertions(+), 5 deletions(-) create mode 100644 spark-load/build.sh create mode 100644 spark-load/pom.xml create mode 100644 spark-load/spark-dpp/.flattened-pom.xml create mode 100644 spark-load/spark-dpp/dependency-reduced-pom.xml create mode 100644 spark-load/spark-dpp/pom.xml create mode 100644 spark-load/spark-dpp/src/main/java/org/apache/doris/common/SparkDppException.java create mode 100644 spark-load/spark-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/ColumnParser.java create mode 100644 spark-load/spark-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/DorisKryoRegistrator.java create mode 100644 
spark-load/spark-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/DorisRangePartitioner.java create mode 100644 spark-load/spark-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/DppColumns.java create mode 100644 spark-load/spark-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/DppUtils.java create mode 100644 spark-load/spark-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/GlobalDictBuilder.java create mode 100644 spark-load/spark-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/MinimumCoverageRollupTreeBuilder.java create mode 100644 spark-load/spark-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/RollupTreeBuilder.java create mode 100644 spark-load/spark-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/RollupTreeNode.java create mode 100644 spark-load/spark-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/SparkDpp.java create mode 100644 spark-load/spark-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/SparkRDDAggregator.java create mode 100644 spark-load/spark-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/StringAccumulator.java create mode 100644 spark-load/spark-dpp/src/main/java/org/apache/doris/load/loadv2/etl/SparkEtlJob.java create mode 100644 spark-load/spark-dpp/src/test/java/org/apache/doris/load/loadv2/dpp/ColumnParserTest.java create mode 100644 spark-load/spark-dpp/src/test/java/org/apache/doris/load/loadv2/dpp/DorisRangePartitionerTest.java create mode 100644 spark-load/spark-dpp/src/test/java/org/apache/doris/load/loadv2/dpp/DppUtilsTest.java create mode 100644 spark-load/spark-dpp/src/test/java/org/apache/doris/load/loadv2/dpp/MinimumCoverageRollupTreeBuilderTest.java create mode 100644 spark-load/spark-dpp/src/test/java/org/apache/doris/load/loadv2/dpp/SparkDppTest.java create mode 100644 spark-load/spark-dpp/src/test/java/org/apache/doris/load/loadv2/etl/SparkEtlJobTest.java create mode 100644 spark-load/spark-load-core/pom.xml create mode 100644 spark-load/spark-load-core/src/main/java/org/apache/doris/SparkLoadRunner.java create mode 100644 spark-load/spark-load-core/src/main/java/org/apache/doris/client/DorisClient.java create mode 100644 spark-load/spark-load-core/src/main/java/org/apache/doris/common/CommandLineOptions.java create mode 100644 spark-load/spark-load-core/src/main/java/org/apache/doris/common/Constants.java create mode 100644 spark-load/spark-load-core/src/main/java/org/apache/doris/common/DppResult.java create mode 100644 spark-load/spark-load-core/src/main/java/org/apache/doris/common/JobStatus.java create mode 100644 spark-load/spark-load-core/src/main/java/org/apache/doris/common/LoadInfo.java create mode 100644 spark-load/spark-load-core/src/main/java/org/apache/doris/common/LoadMode.java create mode 100644 spark-load/spark-load-core/src/main/java/org/apache/doris/common/ResponseEntity.java create mode 100644 spark-load/spark-load-core/src/main/java/org/apache/doris/common/SparkLoadException.java create mode 100644 spark-load/spark-load-core/src/main/java/org/apache/doris/common/meta/LoadInfoResponse.java create mode 100644 spark-load/spark-load-core/src/main/java/org/apache/doris/common/meta/LoadMeta.java create mode 100644 spark-load/spark-load-core/src/main/java/org/apache/doris/common/meta/TableMeta.java create mode 100644 spark-load/spark-load-core/src/main/java/org/apache/doris/config/JobConfig.java create mode 100644 spark-load/spark-load-core/src/main/java/org/apache/doris/config/TaskType.java create mode 100644 spark-load/spark-load-core/src/main/java/org/apache/doris/load/JobMonitor.java create mode 100644 
spark-load/spark-load-core/src/main/java/org/apache/doris/load/LoadManager.java create mode 100644 spark-load/spark-load-core/src/main/java/org/apache/doris/load/TransactionManager.java create mode 100644 spark-load/spark-load-core/src/main/java/org/apache/doris/load/job/Loader.java create mode 100644 spark-load/spark-load-core/src/main/java/org/apache/doris/load/job/PullLoader.java create mode 100644 spark-load/spark-load-core/src/main/java/org/apache/doris/load/job/Recoverable.java create mode 100644 spark-load/spark-load-core/src/main/java/org/apache/doris/util/DateUtils.java create mode 100644 spark-load/spark-load-core/src/main/java/org/apache/doris/util/HadoopUtils.java create mode 100644 spark-load/spark-load-core/src/main/java/org/apache/doris/util/HttpUtils.java create mode 100644 spark-load/spark-load-core/src/main/java/org/apache/doris/util/JsonUtils.java create mode 100644 spark-load/spark-load-core/src/main/resources/log4j.properties diff --git a/spark-doris-connector/src/main/java/org/apache/doris/spark/load/RecordBatchInputStream.java b/spark-doris-connector/src/main/java/org/apache/doris/spark/load/RecordBatchInputStream.java index 3b6be497..b6264edc 100644 --- a/spark-doris-connector/src/main/java/org/apache/doris/spark/load/RecordBatchInputStream.java +++ b/spark-doris-connector/src/main/java/org/apache/doris/spark/load/RecordBatchInputStream.java @@ -87,7 +87,7 @@ public int read() throws IOException { if (read < 0) { return -1; } else { - return bytes[0]; + return bytes[0] & 0xFF; } } diff --git a/spark-doris-connector/src/main/scala/org/apache/doris/spark/load/StreamLoader.scala b/spark-doris-connector/src/main/scala/org/apache/doris/spark/load/StreamLoader.scala index 06bb56ff..faa08d7d 100644 --- a/spark-doris-connector/src/main/scala/org/apache/doris/spark/load/StreamLoader.scala +++ b/spark-doris-connector/src/main/scala/org/apache/doris/spark/load/StreamLoader.scala @@ -498,14 +498,12 @@ class StreamLoader(settings: SparkSettings, isStreaming: Boolean) extends Loader val loadResponse: StreamLoadResponse = StreamLoadResponse(code, msg, content) if (loadResponse.code != HttpStatus.SC_OK) { - LOG.error(s"Stream load http status is not OK, status: ${loadResponse.code}, response: $loadResponse") - throw new StreamLoadException(String.format("stream load error, http status:%d, response:%s", - new Integer(loadResponse.code), loadResponse)) + throw new StreamLoadException(String.format("stream load error, http status:%d, msg:%s", + new Integer(loadResponse.code), loadResponse.msg)) } else { try { val respContent = MAPPER.readValue(loadResponse.content, classOf[RespContent]) if (!respContent.isSuccess) { - LOG.error(s"Stream load status is not success, status:${respContent.getStatus}, response:$loadResponse") throw new StreamLoadException(String.format("stream load error, load status:%s, response:%s", respContent.getStatus, loadResponse)) } LOG.info("Stream load Response:{}", loadResponse) diff --git a/spark-load/build.sh b/spark-load/build.sh new file mode 100644 index 00000000..e69de29b diff --git a/spark-load/pom.xml b/spark-load/pom.xml new file mode 100644 index 00000000..1ae40197 --- /dev/null +++ b/spark-load/pom.xml @@ -0,0 +1,370 @@ + + + 4.0.0 + + org.apache.doris + spark-load + ${revision} + pom + + spark-load-core + spark-dpp + + + + 8 + 8 + UTF-8 + 1.0-SNAPSHOT + 1.2-SNAPSHOT + 1.13 + 3.9 + 3.3.6 + 4.1.104.Final + 1.13.1 + 3.2.2 + 4.0.2 + 32.1.2-jre + 2.16.1 + 1.18.30 + 1.4 + 5.2.1 + 5.8.2 + 1.49 + 2.17.1 + 2.0.7 + 1.2 + + + + + + ${project.groupId} + 
fe-common + ${doris.fe.version} + + + org.apache.logging.log4j + log4j-1.2-api + + + org.apache.logging.log4j + log4j-api + + + org.apache.logging.log4j + log4j-core + + + commons-logging + commons-logging + + + org.slf4j + slf4j-api + + + + + + commons-codec + commons-codec + ${commons-codec.version} + + + + org.apache.commons + commons-lang3 + ${commons-lang3.version} + + + + + + + + + + + org.apache.spark + spark-core_${scala.major.version} + ${spark.version} + provided + + + org.apache.logging.log4j + log4j-1.2-api + + + org.apache.logging.log4j + log4j-api + + + org.apache.logging.log4j + log4j-core + + + commons-logging + commons-logging + + + org.slf4j + slf4j-api + + + + + + io.netty + netty-all + ${netty-all.version} + + + + + org.apache.spark + spark-sql_${scala.major.version} + ${spark.version} + provided + + + org.apache.hadoop + hadoop-common + ${hadoop.version} + + + org.slf4j + slf4j-api + + + + + org.apache.hadoop + hadoop-client + ${hadoop.version} + + + org.slf4j + slf4j-api + + + + + org.apache.parquet + parquet-column + ${parquet.version} + + + org.apache.parquet + parquet-hadoop + ${parquet.version} + + + org.apache.parquet + parquet-common + ${parquet.version} + + + commons-collections + commons-collections + ${commons-collections.version} + + + org.scala-lang + scala-library + ${scala.version} + provided + + + com.esotericsoftware + kryo-shaded + ${kryo.version} + + + org.apache.spark + spark-catalyst_${scala.major.version} + ${spark.version} + + + org.slf4j + slf4j-api + + + provided + + + com.google.guava + guava + ${guava.version} + + + + com.fasterxml.jackson.core + jackson-databind + ${jackson.version} + + + + org.projectlombok + lombok + ${lombok.veresion} + provided + + + + commons-cli + commons-cli + ${commons-cli.version} + + + org.apache.spark + spark-launcher_${scala.major.version} + ${spark.version} + + + + org.apache.httpcomponents.client5 + httpclient5 + ${httpclient5.version} + + + + org.junit.jupiter + junit-jupiter-engine + ${junit.version} + test + + + + org.junit.vintage + junit-vintage-engine + ${junit.version} + test + + + + org.junit.jupiter + junit-jupiter-params + ${junit.version} + test + + + + org.jmockit + jmockit + ${jmockit.version} + test + + + + + org.apache.logging.log4j + log4j-core + ${log4j.version} + + + + org.apache.logging.log4j + log4j-api + ${log4j.version} + + + + org.apache.logging.log4j + log4j-slf4j-impl + ${log4j.version} + + + + + org.slf4j + slf4j-api + ${slf4j.version} + + + + commons-logging + commons-logging + ${commons-logging.version} + + + + + + + + spark2 + + false + + + 2.4.8 + + + + spark3 + + true + + + 3.4.1 + + + + scala_2.11 + + false + + + 2.11.8 + 2.11 + + + + scala_2.12 + + true + + + 2.12.10 + 2.12 + + + + + + + + org.apache.maven.plugins + maven-shade-plugin + 3.2.1 + + + + com.google.code.findbugs:* + org.slf4j:* + org.scala-lang:* + + + + + org.apache.hadoop + org.apache.doris.shaded.org.apache.hadoop + + + + + + package + + shade + + + + + + + + \ No newline at end of file diff --git a/spark-load/spark-dpp/.flattened-pom.xml b/spark-load/spark-dpp/.flattened-pom.xml new file mode 100644 index 00000000..0e8b1d66 --- /dev/null +++ b/spark-load/spark-dpp/.flattened-pom.xml @@ -0,0 +1,409 @@ + + + + 4.0.0 + org.apache.doris + spark-dpp + 1.2-SNAPSHOT + The Apache Software Foundation provides support for the Apache community of open-source software projects. 
+ The Apache projects are characterized by a collaborative, consensus based development process, an open and + pragmatic software license, and a desire to create high quality software that leads the way in its field. + We consider ourselves not simply a group of projects sharing a server, but rather a community of developers + and users. + https://doris.apache.org/spark-dpp/ + + The Apache Software Foundation + https://www.apache.org/ + + + + Apache 2.0 License + https://www.apache.org/licenses/LICENSE-2.0.html + repo + + + + + Dev Mailing List + dev-subscribe@doris.apache.org + dev-unsubscribe@doris.apache.org + dev@doris.apache.org + + + Commits Mailing List + commits-subscribe@doris.apache.org + commits-unsubscribe@doris.apache.org + commits@doris.apache.org + + + + scm:git:https://git@github.com/apache/doris.git/spark-dpp + scm:git:https://git@github.com/apache/doris.git/spark-dpp + scm:git:https://git@github.com/apache/doris.git/spark-dpp + + + GitHub + https://github.com/apache/doris/issues + + + + apache.releases.https + Apache Release Distribution Repository + https://repository.apache.org/service/local/staging/deploy/maven2 + + + apache.snapshots.https + Apache Development Snapshot Repository + https://repository.apache.org/content/repositories/snapshots + + + + 4.7.2 + https://repository.apache.org/service/local/staging/deploy/maven2 + https://repository.apache.org/content/repositories/snapshots + 1.3.2 + https://sonarcloud.io + 2.15.2 + 2.0.3 + https://www.apache.org/images/asf_logo_wide_2016.png + 4.9.3 + 2.0.6 + 1.13 + 2.3.0 + Apache Release Distribution Repository + 1.0.1 + 1.5 + 0.4.6 + 1 + false + 1.22 + 9.35 + 6.5.1 + 1.2.0 + 3.4.1 + 2.18.0 + 1.12.669 + 3.4.0 + 2.1.1 + 3.1.5 + 6.4.5 + 4.0.2 + 0.8.13 + 1.22.0 + 3.0.9 + 1.70 + com.google.protobuf:protoc:3.24.3 + ${fe.dir}/../thirdparty + 2.9.3 + 2.22.2 + 6.7.2 + 4.5.13 + 1.1.1 + 1.33.0 + 2.10.1 + 9.4 + 2.7.4-11 + 15.0.2 + 3.4.0 + 3.42.0 + posix + 1.9.7 + 2.3 + 0.2.14 + 1.60.1 + 2.7 + 12.2.0.1 + 4.0.2 + 2.1 + 2.0 + 3.1.3 + 3.8.9.Final + 1.13.1 + apache + 1.2 + 4.4.15 + UTF-8 + 1.18.24 + 1.8 + 1.8.4 + 1.10.0 + 2022-12-11T19:18:10Z + 1.0.4 + 435 + shade-format-flatbuffers + 2.2 + 2.3.9 + 3.9.1 + source-release + 3.3.6 + 2.10.1 + 3.0.0 + 3.10.6.Final + 3.24.3 + 3.9 + 2.0.1.Final + 1.11.3 + 5.8.2 + Apache Development Snapshot Repository + 3.18.2-GA + 3.2.2 + 1.8 + 3.7.0 + 0.2.3 + 0.45.2-public + 1.8 + 1.4 + 2.7.13 + 4.1.104.Final + 0.8.10 + 2.8.1 + 2.6 + 1.2-SNAPSHOT + 1.4.3 + 8.5.86 + true + 2.3.2 + 2.12.10 + 9.4.53.v20231009 + 0.6.0-incubating + 32.1.2-jre + 2.1 + 1.5.1 + 1.12.0 + 3.0.0-8 + github + 2.2 + UTF-8 + 3.4.4 + 1.34.0 + 1.11-8 + 2.4.0 + 202 + 0.14.1 + 2.18.0 + 1.49 + 3.2.5 + 1.1.10.5 + 0.16.0 + 3.1.0 + 1.4.3 + io.grpc:protoc-gen-grpc-java:1.34.0 + 18.3.14-doris-SNAPSHOT + 1.5.4 + 1.2.5 + /Users/gnehil/doris/fe/spark-dpp/../../ + 0.11-a-czt02-cdh + 3.24.3 + 1.7 + + + + org.apache.doris + fe-common + 1.2-SNAPSHOT + compile + + + commons-codec + commons-codec + 1.13 + compile + + + org.apache.commons + commons-lang3 + 3.9 + compile + + + com.google.code.gson + gson + 2.10.1 + compile + + + org.apache.spark + spark-core_2.12 + 3.4.1 + provided + + + log4j + log4j + + + org.slf4j + slf4j-log4j12 + + + org.eclipse.jetty + jetty-util + + + io.netty + netty + + + + + io.netty + netty-all + 4.1.104.Final + compile + + + org.apache.spark + spark-sql_2.12 + 3.4.1 + provided + + + org.apache.arrow + arrow-vector + + + + + org.apache.hadoop + hadoop-common + 3.3.6 + compile + + + jdk.tools + jdk.tools + + + org.eclipse.jetty + 
jetty-util + + + org.eclipse.jetty + jetty-servlet + + + io.netty + netty-all + + + log4j + log4j + + + + + org.apache.parquet + parquet-column + 1.13.1 + compile + + + org.apache.parquet + parquet-hadoop + 1.13.1 + compile + + + org.apache.parquet + parquet-common + 1.13.1 + compile + + + commons-collections + commons-collections + 3.2.2 + compile + + + org.scala-lang + scala-library + 2.12.10 + provided + + + com.esotericsoftware + kryo-shaded + 4.0.2 + compile + + + org.apache.spark + spark-catalyst_2.12 + 3.4.1 + provided + + + com.google.guava + guava + 32.1.2-jre + compile + + + org.apache.logging.log4j + log4j-core + 2.18.0 + compile + + + org.apache.logging.log4j + log4j-slf4j-impl + 2.18.0 + compile + + + org.apache.logging.log4j + log4j-core + + + + + org.apache.logging.log4j + log4j-1.2-api + 2.18.0 + compile + + + org.awaitility + awaitility + 4.2.0 + compile + + + + + + always + + snapshots + apache snapshots maven repo https + https://repository.apache.org/content/repositories/snapshots/ + + + + false + + apache.snapshots + Apache Snapshot Repository + https://repository.apache.org/snapshots + + + diff --git a/spark-load/spark-dpp/dependency-reduced-pom.xml b/spark-load/spark-dpp/dependency-reduced-pom.xml new file mode 100644 index 00000000..7bf4da08 --- /dev/null +++ b/spark-load/spark-dpp/dependency-reduced-pom.xml @@ -0,0 +1,237 @@ + + + + fe + org.apache.doris + ${revision} + + 4.0.0 + spark-dpp + + spark-dpp-${project.version} + + + maven-surefire-plugin + + ${fe_ut_parallel} + false + -javaagent:${settings.localRepository}/org/jmockit/jmockit/${jmockit.version}/jmockit-${jmockit.version}.jar @{argLine} + + + + maven-dependency-plugin + + + copy-dependencies + package + + copy-dependencies + + + ${project.build.directory}/lib + false + false + true + runtime + ${skip.plugin} + + + + + + maven-assembly-plugin + + + make-assembly + package + + single + + + + + + + org.apache.doris.load.loadv2.etl.SparkEtlJob + + + + jar-with-dependencies + + + + + org.codehaus.mojo + cobertura-maven-plugin + 2.7 + + + 1024m + + + + + maven-clean-plugin + 3.1.0 + + + auto-clean + initialize + + clean + + + + + + maven-shade-plugin + + + package + + shade + + + + + + + com.google.code.findbugs:* + org.slf4j:* + + + + + org.roaringbitmap + org.apache.doris.shaded.org.roaringbitmap + com.google.guava + org.apache.doris.shaded.com.google.guava + + + + + + maven-javadoc-plugin + + true + + + + + + + org.apache.spark + spark-core_2.12 + 3.4.1 + provided + + + log4j + log4j + + + slf4j-log4j12 + org.slf4j + + + jetty-util + org.eclipse.jetty + + + netty + io.netty + + + + + org.apache.spark + spark-sql_2.12 + 3.4.1 + provided + + + arrow-vector + org.apache.arrow + + + + + org.scala-lang + scala-library + 2.12.10 + provided + + + org.apache.spark + spark-catalyst_2.12 + 3.4.1 + provided + + + org.junit.jupiter + junit-jupiter-engine + 5.8.2 + test + + + junit-platform-engine + org.junit.platform + + + junit-jupiter-api + org.junit.jupiter + + + apiguardian-api + org.apiguardian + + + + + org.junit.vintage + junit-vintage-engine + 5.8.2 + test + + + junit + junit + + + junit-platform-engine + org.junit.platform + + + apiguardian-api + org.apiguardian + + + + + org.junit.jupiter + junit-jupiter-params + 5.8.2 + test + + + junit-jupiter-api + org.junit.jupiter + + + apiguardian-api + org.apiguardian + + + + + org.jmockit + jmockit + 1.49 + test + + + + 1 + ${basedir}/../../ + + diff --git a/spark-load/spark-dpp/pom.xml b/spark-load/spark-dpp/pom.xml new file mode 100644 index 00000000..cc4516f1 --- 
/dev/null +++ b/spark-load/spark-dpp/pom.xml @@ -0,0 +1,273 @@ + + + + 4.0.0 + + org.apache.doris + ${revision} + spark-load + + spark-dpp + jar + + + ${project.groupId} + fe-common + + + + commons-codec + commons-codec + + + + org.apache.commons + commons-lang3 + + + + + + + + + + org.apache.spark + spark-core_${scala.major.version} + + + + io.netty + netty-all + + + + + org.apache.spark + spark-sql_${scala.major.version} + + + org.apache.hadoop + hadoop-common + + + org.apache.parquet + parquet-column + + + org.apache.parquet + parquet-hadoop + + + org.apache.parquet + parquet-common + + + commons-collections + commons-collections + + + org.scala-lang + scala-library + + + com.esotericsoftware + kryo-shaded + + + org.apache.spark + spark-catalyst_${scala.major.version} + + + com.google.guava + guava + + + org.junit.jupiter + junit-jupiter-engine + test + + + + org.junit.vintage + junit-vintage-engine + test + + + + org.junit.jupiter + junit-jupiter-params + test + + + org.jmockit + jmockit + test + + + + org.apache.logging.log4j + log4j-core + + + + org.apache.logging.log4j + log4j-api + + + + org.apache.logging.log4j + log4j-slf4j-impl + + + + org.slf4j + slf4j-api + + + + + spark-dpp-${project.version} + + + + org.apache.maven.plugins + maven-surefire-plugin + + set larger, eg, 3, to reduce the time or running FE unit tests<--> + + not reuse forked jvm, so that each unit test will run in separate jvm. to avoid singleton confict<--> + + + -javaagent:${settings.localRepository}/org/jmockit/jmockit/${jmockit.version}/jmockit-${jmockit.version}.jar @{argLine} + + + + + + org.apache.maven.plugins + maven-dependency-plugin + + + copy-dependencies + package + + copy-dependencies + + + ${project.build.directory}/lib + false + false + true + runtime + ${skip.plugin} + + + + + + org.apache.maven.plugins + maven-assembly-plugin + + + + org.apache.doris.load.loadv2.etl.SparkEtlJob + + + + jar-with-dependencies + + + + + make-assembly + + package + + + single + + + + + + org.codehaus.mojo + cobertura-maven-plugin + 2.7 + + + 1024m + + + + + + org.apache.maven.plugins + maven-clean-plugin + 3.1.0 + + + auto-clean + initialize + + clean + + + + + + org.apache.maven.plugins + maven-shade-plugin + + + + com.google.code.findbugs:* + org.slf4j:* + + + + + org.roaringbitmap + org.apache.doris.shaded.org.roaringbitmap + com.google.guava + org.apache.doris.shaded.com.google.guava + + + + + + package + + shade + + + + + + org.apache.maven.plugins + maven-javadoc-plugin + + true + + + + + diff --git a/spark-load/spark-dpp/src/main/java/org/apache/doris/common/SparkDppException.java b/spark-load/spark-dpp/src/main/java/org/apache/doris/common/SparkDppException.java new file mode 100644 index 00000000..66547461 --- /dev/null +++ b/spark-load/spark-dpp/src/main/java/org/apache/doris/common/SparkDppException.java @@ -0,0 +1,39 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +package org.apache.doris.common; + +import com.google.common.base.Strings; + +// Exception for Spark DPP process +public class SparkDppException extends Exception { + public SparkDppException(String msg, Throwable cause) { + super(Strings.nullToEmpty(msg), cause); + } + + public SparkDppException(Throwable cause) { + super(cause); + } + + public SparkDppException(String msg, Throwable cause, boolean enableSuppression, boolean writableStackTrace) { + super(Strings.nullToEmpty(msg), cause, enableSuppression, writableStackTrace); + } + + public SparkDppException(String msg) { + super(Strings.nullToEmpty(msg)); + } +} diff --git a/spark-load/spark-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/ColumnParser.java b/spark-load/spark-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/ColumnParser.java new file mode 100644 index 00000000..84ef9ba8 --- /dev/null +++ b/spark-load/spark-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/ColumnParser.java @@ -0,0 +1,297 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
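A short usage sketch for the ColumnParser factory defined in this file: callers obtain a per-type validator from create() and feed it raw string values, getting back true when the value fits the declared Doris type. The helper below is hypothetical and shown for illustration only; construction of the EtlJobConfig.EtlColumn is elided because this excerpt only reads its public fields.

    // Hypothetical helper, not taken from the Doris sources.
    static boolean isValidForColumn(EtlJobConfig.EtlColumn column, String rawValue) throws SparkDppException {
        ColumnParser parser = ColumnParser.create(column); // dispatches on column.columnType, e.g. "INT"
        return parser.parse(rawValue);                     // "123" -> true, "12.5" -> false for an INT column
    }
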
+ +package org.apache.doris.load.loadv2.dpp; + +import org.apache.doris.common.SparkDppException; +import org.apache.doris.sparkdpp.EtlJobConfig; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.Serializable; +import java.math.BigDecimal; +import java.math.BigInteger; +import java.time.format.DateTimeFormatter; +import java.time.format.DateTimeFormatterBuilder; + + +// Parser to validate value for different type +public abstract class ColumnParser implements Serializable { + + protected static final Logger LOG = LoggerFactory.getLogger(ColumnParser.class); + + // thread safe formatter + public static final DateTimeFormatter DATE_FORMATTER = new DateTimeFormatterBuilder() + .appendPattern("uuuu-MM-dd") + .toFormatter(); + public static final DateTimeFormatter DATE_TIME_FORMATTER = new DateTimeFormatterBuilder() + .appendPattern("uuuu-MM-dd HH:mm:ss") + .toFormatter(); + + public static ColumnParser create(EtlJobConfig.EtlColumn etlColumn) throws SparkDppException { + String columnType = etlColumn.columnType; + if (columnType.equalsIgnoreCase("TINYINT")) { + return new TinyIntParser(); + } else if (columnType.equalsIgnoreCase("SMALLINT")) { + return new SmallIntParser(); + } else if (columnType.equalsIgnoreCase("INT")) { + return new IntParser(); + } else if (columnType.equalsIgnoreCase("BIGINT")) { + return new BigIntParser(); + } else if (columnType.equalsIgnoreCase("FLOAT")) { + return new FloatParser(); + } else if (columnType.equalsIgnoreCase("DOUBLE")) { + return new DoubleParser(); + } else if (columnType.equalsIgnoreCase("BOOLEAN")) { + return new BooleanParser(); + } else if (columnType.equalsIgnoreCase("DATE") + || columnType.equalsIgnoreCase("DATEV2")) { + return new DateParser(); + } else if (columnType.equalsIgnoreCase("DATETIME") + || columnType.equalsIgnoreCase("DATETIMEV2")) { + return new DatetimeParser(); + } else if (columnType.equalsIgnoreCase("STRING") + || columnType.equalsIgnoreCase("TEXT")) { + return new StringTypeParser(etlColumn); + } else if (columnType.equalsIgnoreCase("VARCHAR") + || columnType.equalsIgnoreCase("CHAR") + || columnType.equalsIgnoreCase("BITMAP") + || columnType.equalsIgnoreCase("HLL")) { + return new StringParser(etlColumn); + } else if (columnType.equalsIgnoreCase("DECIMALV2") + || columnType.equalsIgnoreCase("DECIMAL32") + || columnType.equalsIgnoreCase("DECIMAL64") + || columnType.equalsIgnoreCase("DECIMAL128")) { + return new DecimalParser(etlColumn); + } else if (columnType.equalsIgnoreCase("LARGEINT")) { + return new LargeIntParser(); + } else { + throw new SparkDppException("unsupported type:" + columnType); + } + } + + public abstract boolean parse(String value); +} + +class TinyIntParser extends ColumnParser { + @Override + public boolean parse(String value) { + try { + Byte.parseByte(value); + } catch (NumberFormatException e) { + return false; + } + return true; + } +} + +class SmallIntParser extends ColumnParser { + @Override + public boolean parse(String value) { + try { + Short.parseShort(value); + } catch (NumberFormatException e) { + return false; + } + return true; + } +} + +class IntParser extends ColumnParser { + @Override + public boolean parse(String value) { + try { + Integer.parseInt(value); + } catch (NumberFormatException e) { + return false; + } + return true; + } +} + +class BigIntParser extends ColumnParser { + @Override + public boolean parse(String value) { + try { + Long.parseLong(value); + } catch (NumberFormatException e) { + return false; + } + return true; + } +} + +class 
FloatParser extends ColumnParser { + @Override + public boolean parse(String value) { + try { + Float ret = Float.parseFloat(value); + return !ret.isNaN() && !ret.isInfinite(); + } catch (NumberFormatException e) { + return false; + } + } +} + +class DoubleParser extends ColumnParser { + @Override + public boolean parse(String value) { + try { + Double ret = Double.parseDouble(value); + return !ret.isInfinite() && !ret.isNaN(); + } catch (NumberFormatException e) { + return false; + } + } +} + +class BooleanParser extends ColumnParser { + @Override + public boolean parse(String value) { + if (value.equalsIgnoreCase("true") + || value.equalsIgnoreCase("false") + || value.equals("0") || value.equals("1")) { + return true; + } + return false; + } +} + +class DateParser extends ColumnParser { + @Override + public boolean parse(String value) { + try { + DATE_FORMATTER.parse(value); + } catch (Exception e) { + return false; + } + return true; + } +} + +class DatetimeParser extends ColumnParser { + @Override + public boolean parse(String value) { + try { + DATE_TIME_FORMATTER.parse(value); + } catch (Exception e) { + return false; + } + return true; + } +} + +class StringParser extends ColumnParser { + + private EtlJobConfig.EtlColumn etlColumn; + + public StringParser(EtlJobConfig.EtlColumn etlColumn) { + this.etlColumn = etlColumn; + } + + @Override + public boolean parse(String value) { + try { + return value.getBytes("UTF-8").length <= etlColumn.stringLength; + } catch (Exception e) { + throw new RuntimeException("string check failed ", e); + } + } +} + +class StringTypeParser extends ColumnParser { + + private EtlJobConfig.EtlColumn etlColumn; + + public StringTypeParser(EtlJobConfig.EtlColumn etlColumn) { + this.etlColumn = etlColumn; + } + + @Override + public boolean parse(String value) { + try { + return value.getBytes("UTF-8").length <= DppUtils.STRING_LENGTH_LIMIT; + } catch (Exception e) { + throw new RuntimeException("string check failed ", e); + } + } +} + + +class DecimalParser extends ColumnParser { + + public static int PRECISION = 27; + public static int SCALE = 9; + + private BigDecimal maxValue; + private BigDecimal minValue; + + public DecimalParser(EtlJobConfig.EtlColumn etlColumn) { + StringBuilder precisionStr = new StringBuilder(); + for (int i = 0; i < etlColumn.precision - etlColumn.scale; i++) { + precisionStr.append("9"); + } + StringBuilder scaleStr = new StringBuilder(); + for (int i = 0; i < etlColumn.scale; i++) { + scaleStr.append("9"); + } + maxValue = new BigDecimal(precisionStr.toString() + "." + scaleStr.toString()); + minValue = new BigDecimal("-" + precisionStr.toString() + "." 
+ scaleStr.toString()); + } + + @Override + public boolean parse(String value) { + try { + BigDecimal bigDecimal = new BigDecimal(value); + return bigDecimal.precision() - bigDecimal.scale() <= PRECISION - SCALE && bigDecimal.scale() <= SCALE; + } catch (NumberFormatException e) { + return false; + } catch (Exception e) { + throw new RuntimeException("decimal parse failed ", e); + } + } + + public BigDecimal getMaxValue() { + return maxValue; + } + + public BigDecimal getMinValue() { + return minValue; + } +} + +class LargeIntParser extends ColumnParser { + + private BigInteger maxValue = new BigInteger("170141183460469231731687303715884105727"); + private BigInteger minValue = new BigInteger("-170141183460469231731687303715884105728"); + + @Override + public boolean parse(String value) { + try { + BigInteger inputValue = new BigInteger(value); + return inputValue.compareTo(maxValue) < 0 && inputValue.compareTo(minValue) > 0; + } catch (NumberFormatException e) { + return false; + } catch (ArithmeticException e) { + LOG.warn("int value is too big even for java BigInteger, value={}", value); + return false; + } catch (Exception e) { + throw new RuntimeException("large int parse failed:" + value, e); + } + } +} diff --git a/spark-load/spark-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/DorisKryoRegistrator.java b/spark-load/spark-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/DorisKryoRegistrator.java new file mode 100644 index 00000000..c873f5af --- /dev/null +++ b/spark-load/spark-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/DorisKryoRegistrator.java @@ -0,0 +1,36 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +package org.apache.doris.load.loadv2.dpp; + +import org.apache.doris.common.io.BitmapValue; +import org.apache.doris.common.io.Roaring64Map; + +import com.esotericsoftware.kryo.Kryo; +import org.apache.spark.serializer.KryoRegistrator; + +/** + * register etl classes with Kryo when using Kryo serialization. + */ +public class DorisKryoRegistrator implements KryoRegistrator { + + @Override + public void registerClasses(Kryo kryo) { + kryo.register(Roaring64Map.class); + kryo.register(BitmapValue.class); + } +} diff --git a/spark-load/spark-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/DorisRangePartitioner.java b/spark-load/spark-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/DorisRangePartitioner.java new file mode 100644 index 00000000..05f2bdcc --- /dev/null +++ b/spark-load/spark-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/DorisRangePartitioner.java @@ -0,0 +1,89 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +package org.apache.doris.load.loadv2.dpp; + +import org.apache.doris.sparkdpp.EtlJobConfig; + +import org.apache.spark.Partitioner; + +import java.io.Serializable; +import java.util.List; + +public class DorisRangePartitioner extends Partitioner { + private static final String UNPARTITIONED_TYPE = "UNPARTITIONED"; + private EtlJobConfig.EtlPartitionInfo partitionInfo; + private List partitionRangeKeys; + List partitionKeyIndexes; + + public DorisRangePartitioner(EtlJobConfig.EtlPartitionInfo partitionInfo, + List partitionKeyIndexes, + List partitionRangeKeys) { + this.partitionInfo = partitionInfo; + this.partitionKeyIndexes = partitionKeyIndexes; + this.partitionRangeKeys = partitionRangeKeys; + } + + public int numPartitions() { + if (partitionInfo == null) { + return 0; + } + if (partitionInfo.partitionType.equalsIgnoreCase(UNPARTITIONED_TYPE)) { + return 1; + } + return partitionInfo.partitions.size(); + } + + public int getPartition(Object var1) { + if (partitionInfo.partitionType != null + && partitionInfo.partitionType.equalsIgnoreCase(UNPARTITIONED_TYPE)) { + return 0; + } + DppColumns key = (DppColumns) var1; + // get the partition columns from key as partition key + DppColumns partitionKey = new DppColumns(key, partitionKeyIndexes); + // TODO: optimize this by use binary search + for (int i = 0; i < partitionRangeKeys.size(); ++i) { + if (partitionRangeKeys.get(i).isRowContained(partitionKey)) { + return i; + } + } + return -1; + } + + public static class PartitionRangeKey implements Serializable { + public boolean isMaxPartition; + public DppColumns startKeys; + public DppColumns endKeys; + + public boolean isRowContained(DppColumns row) { + if (isMaxPartition) { + return startKeys.compareTo(row) <= 0; + } else { + return startKeys.compareTo(row) <= 0 && endKeys.compareTo(row) > 0; + } + } + + public String toString() { + return "PartitionRangeKey{" + + "isMaxPartition=" + isMaxPartition + + ", startKeys=" + startKeys + + ", endKeys=" + endKeys + + '}'; + } + } +} diff --git a/spark-load/spark-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/DppColumns.java b/spark-load/spark-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/DppColumns.java new file mode 100644 index 00000000..5b5e3f5d --- /dev/null +++ b/spark-load/spark-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/DppColumns.java @@ -0,0 +1,108 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +package org.apache.doris.load.loadv2.dpp; + +import com.google.common.base.Preconditions; + +import java.io.Serializable; +import java.util.ArrayList; +import java.util.Date; +import java.util.List; +import java.util.Objects; + +// DppColumns is used to store the +class DppColumns implements Comparable, Serializable { + public List columns = new ArrayList(); + + public DppColumns(List keys) { + this.columns = keys; + } + + public DppColumns(DppColumns key, List indexes) { + for (int i = 0; i < indexes.size(); ++i) { + columns.add(key.columns.get(indexes.get(i))); + } + } + + @Override + public int compareTo(DppColumns other) { + Preconditions.checkState(columns.size() == other.columns.size()); + + int cmp = 0; + for (int i = 0; i < columns.size(); i++) { + Object columnObj = columns.get(i); + Object otherColumn = other.columns.get(i); + if (columnObj == null && otherColumn == null) { + return 0; + } else if (columnObj == null || otherColumn == null) { + if (columnObj == null) { + return -1; + } else { + return 1; + } + } + if (columns.get(i) instanceof Integer) { + cmp = ((Integer) (columns.get(i))).compareTo((Integer) (other.columns.get(i))); + } else if (columns.get(i) instanceof Long) { + cmp = ((Long) (columns.get(i))).compareTo((Long) (other.columns.get(i))); + } else if (columns.get(i) instanceof Boolean) { + cmp = ((Boolean) (columns.get(i))).compareTo((Boolean) (other.columns.get(i))); + } else if (columns.get(i) instanceof Short) { + cmp = ((Short) (columns.get(i))).compareTo((Short) (other.columns.get(i))); + } else if (columns.get(i) instanceof Float) { + cmp = ((Float) (columns.get(i))).compareTo((Float) (other.columns.get(i))); + } else if (columns.get(i) instanceof Double) { + cmp = ((Double) (columns.get(i))).compareTo((Double) (other.columns.get(i))); + } else if (columns.get(i) instanceof Date) { + cmp = ((Date) (columns.get(i))).compareTo((Date) (other.columns.get(i))); + } else if (columns.get(i) instanceof java.sql.Timestamp) { + cmp = ((java.sql.Timestamp) columns.get(i)).compareTo((java.sql.Timestamp) other.columns.get(i)); + } else { + cmp = ((String) (columns.get(i))).compareTo((String) (other.columns.get(i))); + } + if (cmp != 0) { + return cmp; + } + } + return cmp; + } + + @Override + public boolean equals(Object o) { + if (this == o) { + return true; + } + if (o == null || getClass() != o.getClass()) { + return false; + } + DppColumns dppColumns = (DppColumns) o; + return Objects.equals(columns, dppColumns.columns); + } + + @Override + public int hashCode() { + return Objects.hash(columns); + } + + @Override + public String toString() { + return "dppColumns{" + + "columns=" + columns + + '}'; + } +} diff --git a/spark-load/spark-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/DppUtils.java b/spark-load/spark-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/DppUtils.java new file mode 100644 index 00000000..0c6b6454 --- /dev/null +++ b/spark-load/spark-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/DppUtils.java @@ -0,0 +1,299 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +package org.apache.doris.load.loadv2.dpp; + +import org.apache.doris.common.SparkDppException; +import org.apache.doris.sparkdpp.EtlJobConfig; + +import com.google.common.collect.Lists; +import org.apache.spark.sql.Row; +import org.apache.spark.sql.types.DataType; +import org.apache.spark.sql.types.DataTypes; +import org.apache.spark.sql.types.DecimalType; +import org.apache.spark.sql.types.StructField; +import org.apache.spark.sql.types.StructType; + +import java.math.BigDecimal; +import java.nio.ByteBuffer; +import java.nio.ByteOrder; +import java.util.ArrayList; +import java.util.Collections; +import java.util.Date; +import java.util.List; +import java.util.Set; +import java.util.zip.CRC32; + +public class DppUtils { + public static final String BUCKET_ID = "__bucketId__"; + + public static final int STRING_LENGTH_LIMIT = 1048576; + + public static Class getClassFromDataType(DataType dataType) { + if (dataType == null) { + return null; + } + if (dataType.equals(DataTypes.BooleanType)) { + return Boolean.class; + } else if (dataType.equals(DataTypes.ShortType)) { + return Short.class; + } else if (dataType.equals(DataTypes.IntegerType)) { + return Integer.class; + } else if (dataType.equals(DataTypes.LongType)) { + return Long.class; + } else if (dataType.equals(DataTypes.FloatType)) { + return Float.class; + } else if (dataType.equals(DataTypes.DoubleType)) { + return Double.class; + } else if (dataType.equals(DataTypes.DateType)) { + return Date.class; + } else if (dataType.equals(DataTypes.StringType)) { + return String.class; + } else if (dataType instanceof DecimalType) { + DecimalType decimalType = (DecimalType) dataType; + return BigDecimal.valueOf(decimalType.precision(), decimalType.scale()).getClass(); + } else if (dataType.equals(DataTypes.TimestampType)) { + return Long.class; + } + return null; + } + + public static Class getClassFromColumn(EtlJobConfig.EtlColumn column) throws SparkDppException { + switch (column.columnType) { + case "BOOLEAN": + return Boolean.class; + case "TINYINT": + case "SMALLINT": + return Short.class; + case "INT": + return Integer.class; + case "DATETIME": + case "DATETIMEV2": + return java.sql.Timestamp.class; + case "BIGINT": + return Long.class; + case "LARGEINT": + throw new SparkDppException("LARGEINT is not supported now"); + case "FLOAT": + return Float.class; + case "DOUBLE": + return Double.class; + case "DATE": + case "DATEV2": + return Date.class; + case "HLL": + case "CHAR": + case "VARCHAR": + case "STRING": + case "TEXT": + case "BITMAP": + case "OBJECT": + return String.class; + case "DECIMALV2": + case "DECIMAL32": + case "DECIMAL64": + case "DECIMAL128": + return BigDecimal.valueOf(column.precision, column.scale).getClass(); + default: + return String.class; + } + } + + public static DataType getDataTypeFromColumn(EtlJobConfig.EtlColumn 
column, boolean regardDistinctColumnAsBinary) { + DataType dataType = DataTypes.StringType; + switch (column.columnType) { + case "BOOLEAN": + dataType = DataTypes.StringType; + break; + case "TINYINT": + dataType = DataTypes.ByteType; + break; + case "SMALLINT": + dataType = DataTypes.ShortType; + break; + case "INT": + dataType = DataTypes.IntegerType; + break; + case "DATETIME": + case "DATETIMEV2": + dataType = DataTypes.TimestampType; + break; + case "BIGINT": + dataType = DataTypes.LongType; + break; + case "LARGEINT": + dataType = DataTypes.StringType; + break; + case "FLOAT": + dataType = DataTypes.FloatType; + break; + case "DOUBLE": + dataType = DataTypes.DoubleType; + break; + case "DATE": + case "DATEV2": + dataType = DataTypes.DateType; + break; + case "CHAR": + case "VARCHAR": + case "STRING": + case "TEXT": + case "OBJECT": + dataType = DataTypes.StringType; + break; + case "HLL": + case "BITMAP": + dataType = regardDistinctColumnAsBinary ? DataTypes.BinaryType : DataTypes.StringType; + break; + case "DECIMALV2": + case "DECIMAL32": + case "DECIMAL64": + case "DECIMAL128": + dataType = DecimalType.apply(column.precision, column.scale); + break; + default: + throw new RuntimeException("Reason: invalid column type:" + column); + } + return dataType; + } + + public static ByteBuffer getHashValue(Object o, DataType type) { + ByteBuffer buffer = ByteBuffer.allocate(8); + buffer.order(ByteOrder.LITTLE_ENDIAN); + if (o == null) { + buffer.putInt(0); + return buffer; + } + if (type.equals(DataTypes.ByteType)) { + buffer.put((byte) o); + } else if (type.equals(DataTypes.ShortType)) { + buffer.putShort((Short) o); + } else if (type.equals(DataTypes.IntegerType)) { + buffer.putInt((Integer) o); + } else if (type.equals(DataTypes.LongType)) { + buffer.putLong((Long) o); + } else if (type.equals(DataTypes.StringType)) { + try { + String str = String.valueOf(o); + buffer = ByteBuffer.wrap(str.getBytes("UTF-8")); + } catch (Exception e) { + throw new RuntimeException(e); + } + } else if (type.equals(DataTypes.BooleanType)) { + Boolean b = (Boolean) o; + byte value = (byte) (b ? 
1 : 0); + buffer.put(value); + } + // do not flip buffer when the buffer was created by wrap() + if (!type.equals(DataTypes.StringType)) { + buffer.flip(); + } + return buffer; + } + + public static long getHashValue(Row row, List distributeColumns, StructType dstTableSchema) { + CRC32 hashValue = new CRC32(); + for (String distColumn : distributeColumns) { + Object columnObject = row.get(row.fieldIndex(distColumn)); + ByteBuffer buffer = getHashValue(columnObject, dstTableSchema.apply(distColumn).dataType()); + hashValue.update(buffer.array(), 0, buffer.limit()); + } + return hashValue.getValue(); + } + + public static StructType replaceBinaryColsInSchema(Set binaryColumns, StructType dstSchema) { + List fields = new ArrayList<>(); + for (StructField originField : dstSchema.fields()) { + if (binaryColumns.contains(originField.name())) { + fields.add(DataTypes.createStructField(originField.name(), + DataTypes.BinaryType, originField.nullable())); + } else { + fields.add(DataTypes.createStructField(originField.name(), + originField.dataType(), originField.nullable())); + } + } + StructType ret = DataTypes.createStructType(fields); + return ret; + } + + public static StructType createDstTableSchema(List columns, + boolean addBucketIdColumn, boolean regardDistinctColumnAsBinary) { + List fields = new ArrayList<>(); + if (addBucketIdColumn) { + StructField bucketIdField = DataTypes.createStructField(BUCKET_ID, DataTypes.StringType, true); + fields.add(bucketIdField); + } + for (EtlJobConfig.EtlColumn column : columns) { + DataType structColumnType = getDataTypeFromColumn(column, regardDistinctColumnAsBinary); + StructField field = DataTypes.createStructField(column.columnName, structColumnType, column.isAllowNull); + fields.add(field); + } + StructType dstSchema = DataTypes.createStructType(fields); + return dstSchema; + } + + public static List parseColumnsFromPath(String filePath, List columnsFromPath) + throws SparkDppException { + if (columnsFromPath == null || columnsFromPath.isEmpty()) { + return Collections.emptyList(); + } + String[] strings = filePath.split("/"); + if (strings.length < 2) { + System.err.println("Fail to parse columnsFromPath, expected: " + columnsFromPath + + ", filePath: " + filePath); + throw new SparkDppException("Reason: Fail to parse columnsFromPath, expected: " + + columnsFromPath + ", filePath: " + filePath); + } + String[] columns = new String[columnsFromPath.size()]; + int size = 0; + for (int i = strings.length - 2; i >= 0; i--) { + String str = strings[i]; + if (str != null && str.isEmpty()) { + continue; + } + if (str == null || !str.contains("=")) { + System.err.println("Fail to parse columnsFromPath, expected: " + columnsFromPath + + ", filePath: " + filePath); + throw new SparkDppException("Reason: Fail to parse columnsFromPath, expected: " + + columnsFromPath + ", filePath: " + filePath); + } + String[] pair = str.split("=", 2); + if (pair.length != 2) { + System.err.println("Fail to parse columnsFromPath, expected: " + columnsFromPath + + ", filePath: " + filePath); + throw new SparkDppException("Reason: Fail to parse columnsFromPath, expected: " + + columnsFromPath + ", filePath: " + filePath); + } + int index = columnsFromPath.indexOf(pair[0]); + if (index == -1) { + continue; + } + columns[index] = pair[1]; + size++; + if (size >= columnsFromPath.size()) { + break; + } + } + if (size != columnsFromPath.size()) { + System.err.println("Fail to parse columnsFromPath, expected: " + columnsFromPath + + ", filePath: " + filePath); + throw new 
SparkDppException("Reason: Fail to parse columnsFromPath, expected: " + + columnsFromPath + ", filePath: " + filePath); + } + return Lists.newArrayList(columns); + } +} diff --git a/spark-load/spark-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/GlobalDictBuilder.java b/spark-load/spark-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/GlobalDictBuilder.java new file mode 100644 index 00000000..e19cfae8 --- /dev/null +++ b/spark-load/spark-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/GlobalDictBuilder.java @@ -0,0 +1,432 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +package org.apache.doris.load.loadv2.dpp; + +import org.apache.commons.collections.map.MultiValueMap; +import org.apache.commons.lang3.StringUtils; +import org.apache.spark.sql.AnalysisException; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Row; +import org.apache.spark.sql.SparkSession; +import org.apache.spark.sql.catalog.Column; +import org.apache.spark.sql.types.DataTypes; +import org.apache.spark.sql.types.StructField; +import org.apache.spark.sql.types.StructType; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.concurrent.Callable; +import java.util.concurrent.ExecutionException; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; +import java.util.concurrent.Future; +import java.util.stream.Collectors; + +/** + * used for build hive global dict and encode source hive table + * + * input: a source hive table + * output: a intermediate hive table whose distinct column is encode with int value + * + * usage example + * step1,create a intermediate hive table + * GlobalDictBuilder.createHiveIntermediateTable() + * step2, get distinct column's value + * GlobalDictBuilder.extractDistinctColumn() + * step3, build global dict + * GlobalDictBuilder.buildGlobalDict() + * step4, encode intermediate hive table with global dict + * GlobalDictBuilder.encodeDorisIntermediateHiveTable() + */ + +public class GlobalDictBuilder { + + protected static final Logger LOG = LoggerFactory.getLogger(GlobalDictBuilder.class); + + // name of the column in doris table which need to build global dict + // for example: some dict columns a,b,c + // case 1: all dict columns has no relation, then the map is as below + // [a=null, b=null, c=null] + // case 2: column a's value can reuse column b's value which means column a's value is a subset of column b's value + // [b=a,c=null] + private MultiValueMap dictColumn; + // target doris table columns in current spark load job + private List dorisOlapTableColumnList; + + // distinct 
columns which need to use map join to solve data skew in encodeDorisIntermediateHiveTable() + // we needn't to specify it until data skew happends + private List mapSideJoinColumns; + + // hive table datasource,format is db.table + private String sourceHiveDBTableName; + // user-specified filter when query sourceHiveDBTable + private String sourceHiveFilter; + // intermediate hive table to store the distinct value of distinct column + private String distinctKeyTableName; + // current doris table's global dict hive table + private String globalDictTableName; + + // used for next step to read + private String dorisIntermediateHiveTable; + private SparkSession spark; + + // key=doris column name,value=column type + private Map dorisColumnNameTypeMap = new HashMap<>(); + + // column in this list means need split distinct value and then encode respectively + // to avoid the performance bottleneck to transfer origin value to dict value + private List veryHighCardinalityColumn; + // determine the split num of new distinct value,better can be divisible by 1 + private int veryHighCardinalityColumnSplitNum; + + private ExecutorService pool; + + private StructType distinctValueSchema; + + public GlobalDictBuilder(MultiValueMap dictColumn, + List dorisOlapTableColumnList, + List mapSideJoinColumns, + String sourceHiveDBTableName, + String sourceHiveFilter, + String dorisHiveDB, + String distinctKeyTableName, + String globalDictTableName, + String dorisIntermediateHiveTable, + int buildConcurrency, + List veryHighCardinalityColumn, + int veryHighCardinalityColumnSplitNum, + SparkSession spark) { + this.dictColumn = dictColumn; + this.dorisOlapTableColumnList = dorisOlapTableColumnList; + this.mapSideJoinColumns = mapSideJoinColumns; + this.sourceHiveDBTableName = sourceHiveDBTableName; + this.sourceHiveFilter = sourceHiveFilter; + this.distinctKeyTableName = distinctKeyTableName; + this.globalDictTableName = globalDictTableName; + this.dorisIntermediateHiveTable = dorisIntermediateHiveTable; + this.spark = spark; + this.pool = Executors.newFixedThreadPool(buildConcurrency < 0 ? 
1 : buildConcurrency); + this.veryHighCardinalityColumn = veryHighCardinalityColumn; + this.veryHighCardinalityColumnSplitNum = veryHighCardinalityColumnSplitNum; + + spark.sql("use " + dorisHiveDB); + } + + public void createHiveIntermediateTable() throws AnalysisException { + Map sourceHiveTableColumn = spark.catalog() + .listColumns(sourceHiveDBTableName) + .collectAsList() + .stream().collect(Collectors.toMap(Column::name, Column::dataType)); + + Map sourceHiveTableColumnInLowercase = new HashMap<>(); + for (Map.Entry entry : sourceHiveTableColumn.entrySet()) { + sourceHiveTableColumnInLowercase.put(entry.getKey().toLowerCase(), entry.getValue().toLowerCase()); + } + + // check and get doris column type in hive + dorisOlapTableColumnList.stream().map(String::toLowerCase).forEach(columnName -> { + String columnType = sourceHiveTableColumnInLowercase.get(columnName); + if (StringUtils.isEmpty(columnType)) { + throw new RuntimeException(String.format("doris column %s not in source hive table", columnName)); + } + dorisColumnNameTypeMap.put(columnName, columnType); + }); + + spark.sql(String.format("drop table if exists %s ", dorisIntermediateHiveTable)); + // create IntermediateHiveTable + spark.sql(getCreateIntermediateHiveTableSql()); + + // insert data to IntermediateHiveTable + spark.sql(getInsertIntermediateHiveTableSql()); + } + + public void extractDistinctColumn() { + // create distinct tables + spark.sql(getCreateDistinctKeyTableSql()); + + // extract distinct column + List workerList = new ArrayList<>(); + // For the column in dictColumns's valueSet, their value is a subset of column in keyset, + // so we don't need to extract distinct value of column in valueSet + for (Object column : dictColumn.keySet()) { + workerList.add( + () -> spark.sql(getInsertDistinctKeyTableSql(column.toString(), dorisIntermediateHiveTable))); + } + + submitWorker(workerList); + } + + public void buildGlobalDict() throws ExecutionException, InterruptedException { + // create global dict hive table + spark.sql(getCreateGlobalDictHiveTableSql()); + + List globalDictBuildWorkers = new ArrayList<>(); + for (Object distinctColumnNameOrigin : dictColumn.keySet()) { + String distinctColumnNameTmp = distinctColumnNameOrigin.toString(); + globalDictBuildWorkers.add(() -> { + // get global dict max value + List maxGlobalDictValueRow + = spark.sql(getMaxGlobalDictValueSql(distinctColumnNameTmp)).collectAsList(); + if (maxGlobalDictValueRow.size() == 0) { + throw new RuntimeException(String.format("get max dict value failed: %s", distinctColumnNameTmp)); + } + + long maxDictValue = 0; + long minDictValue = 0; + Row row = maxGlobalDictValueRow.get(0); + if (row != null && row.get(0) != null) { + maxDictValue = (long) row.get(0); + minDictValue = (long) row.get(1); + } + LOG.info(" column " + distinctColumnNameTmp + " 's max value in dict is " + + maxDictValue + ", min value is " + minDictValue); + // maybe never happened, but we need detect it + if (minDictValue < 0) { + throw new RuntimeException(String.format(" column %s 's cardinality has exceed bigint's max value", + distinctColumnNameTmp)); + } + + if (veryHighCardinalityColumn.contains(distinctColumnNameTmp) + && veryHighCardinalityColumnSplitNum > 1) { + // split distinct key first and then encode with count + buildGlobalDictBySplit(maxDictValue, distinctColumnNameTmp); + } else { + // build global dict directly + spark.sql(getBuildGlobalDictSql(maxDictValue, distinctColumnNameTmp)); + } + + }); + } + submitWorker(globalDictBuildWorkers); + } + + // encode 
dorisIntermediateHiveTable's distinct column + public void encodeDorisIntermediateHiveTable() { + for (Object distinctColumnObj : dictColumn.keySet()) { + spark.sql(getEncodeDorisIntermediateHiveTableSql(distinctColumnObj.toString(), + (ArrayList) dictColumn.get(distinctColumnObj.toString()))); + } + } + + private String getCreateIntermediateHiveTableSql() { + StringBuilder sql = new StringBuilder(); + sql.append("create table if not exists ").append(dorisIntermediateHiveTable).append(" ( "); + + Set allDictColumn = new HashSet<>(); + allDictColumn.addAll(dictColumn.keySet()); + allDictColumn.addAll(dictColumn.values()); + dorisOlapTableColumnList.forEach(columnName -> { + sql.append(columnName).append(" "); + if (allDictColumn.contains(columnName)) { + sql.append(" string ,"); + } else { + sql.append(dorisColumnNameTypeMap.get(columnName)).append(" ,"); + } + }); + return sql.deleteCharAt(sql.length() - 1).append(" )").append(" stored as sequencefile ").toString(); + } + + private String getInsertIntermediateHiveTableSql() { + StringBuilder sql = new StringBuilder(); + sql.append("insert overwrite table ").append(dorisIntermediateHiveTable).append(" select "); + dorisOlapTableColumnList.forEach(columnName -> { + sql.append(columnName).append(" ,"); + }); + sql.deleteCharAt(sql.length() - 1) + .append(" from ").append(sourceHiveDBTableName); + if (!StringUtils.isEmpty(sourceHiveFilter)) { + sql.append(" where ").append(sourceHiveFilter); + } + return sql.toString(); + } + + private String getCreateDistinctKeyTableSql() { + return "create table if not exists " + distinctKeyTableName + + "(dict_key string) partitioned by (dict_column string) stored as sequencefile "; + } + + private String getInsertDistinctKeyTableSql(String distinctColumnName, String sourceHiveTable) { + StringBuilder sql = new StringBuilder(); + sql.append("insert overwrite table ").append(distinctKeyTableName) + .append(" partition(dict_column='").append(distinctColumnName).append("')") + .append(" select ").append(distinctColumnName) + .append(" from ").append(sourceHiveTable) + .append(" group by ").append(distinctColumnName); + return sql.toString(); + } + + private String getCreateGlobalDictHiveTableSql() { + return "create table if not exists " + globalDictTableName + + "(dict_key string, dict_value bigint) partitioned by(dict_column string) stored as sequencefile "; + } + + private String getMaxGlobalDictValueSql(String distinctColumnName) { + return "select max(dict_value) as max_value,min(dict_value) as min_value from " + + globalDictTableName + " where dict_column='" + distinctColumnName + "'"; + } + + private void buildGlobalDictBySplit(long maxGlobalDictValue, String distinctColumnName) { + // 1. get distinct value + Dataset newDistinctValue = spark.sql(getNewDistinctValue(distinctColumnName)); + + // 2. 
split the newDistinctValue to avoid window functions' single node bottleneck + Dataset[] splitedDistinctValue = newDistinctValue.randomSplit(getRandomSplitWeights()); + long currentMaxDictValue = maxGlobalDictValue; + Map distinctKeyMap = new HashMap<>(); + + for (int i = 0; i < splitedDistinctValue.length; i++) { + long currentDatasetStartDictValue = currentMaxDictValue; + long splitDistinctValueCount = splitedDistinctValue[i].count(); + currentMaxDictValue += splitDistinctValueCount; + String tmpDictTableName = String.format("%s_%s_tmp_dict_%s", i, + currentDatasetStartDictValue, distinctColumnName); + distinctKeyMap.put(tmpDictTableName, currentDatasetStartDictValue); + Dataset distinctValueFrame = spark.createDataFrame( + splitedDistinctValue[i].toJavaRDD(), getDistinctValueSchema()); + distinctValueFrame.createOrReplaceTempView(tmpDictTableName); + } + + spark.sql(getSplitBuildGlobalDictSql(distinctKeyMap, distinctColumnName)); + + } + + private String getSplitBuildGlobalDictSql(Map distinctKeyMap, String distinctColumnName) { + StringBuilder sql = new StringBuilder(); + sql.append("insert overwrite table ").append(globalDictTableName) + .append(" partition(dict_column='").append(distinctColumnName).append("') ") + .append(" select dict_key,dict_value from ").append(globalDictTableName) + .append(" where dict_column='").append(distinctColumnName).append("' "); + for (Map.Entry entry : distinctKeyMap.entrySet()) { + sql.append(" union all select dict_key, CAST((row_number() over(order by dict_key)) as BIGINT) ") + .append(String.format("+ CAST(%s as BIGINT) as dict_value from %s", + entry.getValue(), entry.getKey())); + } + return sql.toString(); + } + + private StructType getDistinctValueSchema() { + if (distinctValueSchema == null) { + List fieldList = new ArrayList<>(); + fieldList.add(DataTypes.createStructField("dict_key", DataTypes.StringType, false)); + distinctValueSchema = DataTypes.createStructType(fieldList); + } + return distinctValueSchema; + } + + private double[] getRandomSplitWeights() { + double[] weights = new double[veryHighCardinalityColumnSplitNum]; + double weight = 1 / Double.parseDouble(String.valueOf(veryHighCardinalityColumnSplitNum)); + Arrays.fill(weights, weight); + return weights; + } + + private String getBuildGlobalDictSql(long maxGlobalDictValue, String distinctColumnName) { + return "insert overwrite table " + globalDictTableName + " partition(dict_column='" + distinctColumnName + "') " + + " select dict_key,dict_value from " + globalDictTableName + + " where dict_column='" + distinctColumnName + "' " + + " union all select t1.dict_key as dict_key," + + "CAST((row_number() over(order by t1.dict_key)) as BIGINT) + " + + "CAST(" + maxGlobalDictValue + " as BIGINT) as dict_value from " + + "(select dict_key from " + distinctKeyTableName + + " where dict_column='" + distinctColumnName + "' and dict_key is not null)t1 left join " + + " (select dict_key,dict_value from " + globalDictTableName + + " where dict_column='" + distinctColumnName + "' )t2 " + + "on t1.dict_key = t2.dict_key where t2.dict_value is null"; + } + + private String getNewDistinctValue(String distinctColumnName) { + return "select t1.dict_key from " + + " (select dict_key from " + distinctKeyTableName + + " where dict_column='" + distinctColumnName + + "' and dict_key is not null)t1 left join " + + " (select dict_key,dict_value from " + globalDictTableName + + " where dict_column='" + distinctColumnName + "' )t2 " + + "on t1.dict_key = t2.dict_key where t2.dict_value is null"; + + } + + 
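+ // Illustrative sketch (hypothetical helper, not referenced elsewhere in this class): the SQL built
+ // by getBuildGlobalDictSql/getSplitBuildGlobalDictSql assigns each previously unseen dict_key a value
+ // of row_number() over the new keys plus the current maximum dict_value, so existing encodings are
+ // never changed. The same assignment rule, expressed in plain Java for a single batch of new keys:
+ private static Map<String, Long> assignNewDictValues(List<String> newKeys, long maxExistingDictValue) {
+ Map<String, Long> assigned = new HashMap<>();
+ // sort to mimic "order by dict_key" so the numbering is deterministic
+ List<String> sortedKeys = newKeys.stream().sorted().collect(Collectors.toList());
+ long nextValue = maxExistingDictValue;
+ for (String key : sortedKeys) {
+ assigned.put(key, ++nextValue);
+ }
+ return assigned;
+ }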
private String getEncodeDorisIntermediateHiveTableSql(String dictColumn, List childColumn) { + StringBuilder sql = new StringBuilder(); + sql.append("insert overwrite table ").append(dorisIntermediateHiveTable).append(" select "); + // using map join to solve distinct column data skew + // here is a spark sql hint + if (mapSideJoinColumns.size() != 0 && mapSideJoinColumns.contains(dictColumn)) { + sql.append(" /*+ BROADCAST (t) */ "); + } + dorisOlapTableColumnList.forEach(columnName -> { + if (dictColumn.equals(columnName)) { + sql.append("t.dict_value").append(" ,"); + // means the dictColumn is reused + } else if (childColumn != null && childColumn.contains(columnName)) { + sql.append(String.format(" if(%s is null, null, t.dict_value) ", columnName)).append(" ,"); + } else { + sql.append(dorisIntermediateHiveTable).append(".").append(columnName).append(" ,"); + } + }); + sql.deleteCharAt(sql.length() - 1) + .append(" from ") + .append(dorisIntermediateHiveTable) + .append(" LEFT OUTER JOIN ( select dict_key,dict_value from ").append(globalDictTableName) + .append(" where dict_column='").append(dictColumn).append("' ) t on ") + .append(dorisIntermediateHiveTable).append(".").append(dictColumn) + .append(" = t.dict_key "); + return sql.toString(); + } + + private void submitWorker(List workerList) { + try { + List> futureList = new ArrayList<>(); + for (GlobalDictBuildWorker globalDictBuildWorker : workerList) { + futureList.add(pool.submit(new Callable() { + @Override + public Boolean call() throws Exception { + try { + globalDictBuildWorker.work(); + return true; + } catch (Exception e) { + LOG.error("BuildGlobalDict failed", e); + return false; + } + } + })); + } + + LOG.info("begin to fetch worker result"); + for (Future future : futureList) { + if (!future.get()) { + throw new RuntimeException("detect one worker failed"); + } + } + LOG.info("fetch worker result complete"); + } catch (Exception e) { + LOG.error("submit worker failed", e); + throw new RuntimeException("submit worker failed", e); + } + } + + private interface GlobalDictBuildWorker { + void work(); + } +} diff --git a/spark-load/spark-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/MinimumCoverageRollupTreeBuilder.java b/spark-load/spark-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/MinimumCoverageRollupTreeBuilder.java new file mode 100644 index 00000000..0b54389a --- /dev/null +++ b/spark-load/spark-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/MinimumCoverageRollupTreeBuilder.java @@ -0,0 +1,127 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +package org.apache.doris.load.loadv2.dpp; + +import org.apache.doris.sparkdpp.EtlJobConfig; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; + +// Build RollupTree by using minimum coverage strategy, +// which is to find the index with the minimum columns that +// has all columns of rollup index as parent index node. +// Eg: +// There are three indexes: +// index1(c1, c2, c3, c4, c5) +// index2(c1, c2, c4) +// index3(c1, c2) +// index4(c3, c4) +// index5(c1, c2, c5) +// then the result tree is: +// index1 +// | \ \ +// index2 index4 index5 +// | +// index3 +// Now, if there are more than one indexes meet the column coverage requirement, +// have the same column size(eg: index2 vs index5), child rollup is preferred +// builded from the front index(eg: index3 is the child of index2). This can be +// further optimized based on the row number of the index. +public class MinimumCoverageRollupTreeBuilder implements RollupTreeBuilder { + public RollupTreeNode build(EtlJobConfig.EtlTable tableMeta) { + List indexes = tableMeta.indexes; + List indexMetas = new ArrayList<>(); + EtlJobConfig.EtlIndex baseIndex = null; + for (EtlJobConfig.EtlIndex indexMeta : indexes) { + if (indexMeta.isBaseIndex) { + baseIndex = indexMeta; + continue; + } + indexMetas.add(indexMeta); + } + List baseIndexColumns = baseIndex.columns; + List baseKeyColumns = new ArrayList<>(); + List baseValueColumns = new ArrayList<>(); + for (EtlJobConfig.EtlColumn columnMeta : baseIndexColumns) { + if (columnMeta.isKey) { + baseKeyColumns.add(columnMeta.columnName); + } else { + baseValueColumns.add(columnMeta.columnName); + } + } + RollupTreeNode root = new RollupTreeNode(); + root.parent = null; + root.keyColumnNames = baseKeyColumns; + root.valueColumnNames = baseValueColumns; + root.indexId = baseIndex.indexId; + root.indexMeta = baseIndex; + + // sort the index metas to make sure the column number decrease + Collections.sort(indexMetas, new EtlJobConfig.EtlIndexComparator().reversed()); + for (int i = 0; i < indexMetas.size(); ++i) { + List keyColumns = new ArrayList<>(); + List valueColumns = new ArrayList<>(); + for (EtlJobConfig.EtlColumn column : indexMetas.get(i).columns) { + if (column.isKey) { + keyColumns.add(column.columnName); + } else { + valueColumns.add(column.columnName); + } + } + if (!insertIndex(root, indexMetas.get(i), keyColumns, valueColumns)) { + throw new RuntimeException(String.format("can't find a parent rollup for rollup %s," + + " rollup tree is %s", indexMetas.get(i).toString(), root)); + } + } + return root; + } + + // DFS traverse to build the rollup tree + // return true means we find a parent rollup for current rollup table + private boolean insertIndex(RollupTreeNode root, EtlJobConfig.EtlIndex indexMeta, + List keyColumns, + List valueColumns) { + // find suitable parent rollup from current node's children + if (root.children != null) { + for (int i = root.children.size() - 1; i >= 0; i--) { + if (insertIndex(root.children.get(i), indexMeta, keyColumns, valueColumns)) { + return true; + } + } + } + + // find suitable parent rollup from current node + if (root.keyColumnNames.containsAll(keyColumns) && root.valueColumnNames.containsAll(valueColumns)) { + if (root.children == null) { + root.children = new ArrayList<>(); + } + RollupTreeNode newChild = new RollupTreeNode(); + newChild.keyColumnNames = keyColumns; + newChild.valueColumnNames = valueColumns; + newChild.indexMeta = indexMeta; + newChild.indexId = indexMeta.indexId; + newChild.parent = root; + 
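+ // the new node sits one level deeper than its parent; the level is used later to process
+ // the rollup tree level by level and to indent the output of toString()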
newChild.level = root.level + 1; + root.children.add(newChild); + return true; + } + + return false; + } +} diff --git a/spark-load/spark-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/RollupTreeBuilder.java b/spark-load/spark-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/RollupTreeBuilder.java new file mode 100644 index 00000000..acb0d4c9 --- /dev/null +++ b/spark-load/spark-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/RollupTreeBuilder.java @@ -0,0 +1,25 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +package org.apache.doris.load.loadv2.dpp; + +import org.apache.doris.sparkdpp.EtlJobConfig; + +// RollupTreeBuilder is used to get the RollupTree from the TableMeta +public abstract interface RollupTreeBuilder { + public RollupTreeNode build(EtlJobConfig.EtlTable tableMeta); +} diff --git a/spark-load/spark-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/RollupTreeNode.java b/spark-load/spark-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/RollupTreeNode.java new file mode 100644 index 00000000..a95482c2 --- /dev/null +++ b/spark-load/spark-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/RollupTreeNode.java @@ -0,0 +1,53 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +package org.apache.doris.load.loadv2.dpp; + +import org.apache.doris.sparkdpp.EtlJobConfig; + +import java.util.List; + +// Base and rollup indexes are managed by as a RollupTree in order to +// produce the rollup index data from the best-fit index to get better performance. 
+// The calculation will be done through preorder traversal +public class RollupTreeNode { + public RollupTreeNode parent; + public List children; + public long indexId; + public List keyColumnNames; + public List valueColumnNames; + public int level; + public EtlJobConfig.EtlIndex indexMeta; + + public String toString() { + StringBuilder builder = new StringBuilder(); + for (int i = 0; i < level; ++i) { + builder.append("-"); + } + builder.append("indexid: " + indexId + "\n"); + if (children != null && !children.isEmpty()) { + for (int i = 0; i < level; ++i) { + builder.append("-"); + } + builder.append("children:\n"); + for (RollupTreeNode child : children) { + builder.append(child.toString()); + } + } + return builder.toString(); + } +} diff --git a/spark-load/spark-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/SparkDpp.java b/spark-load/spark-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/SparkDpp.java new file mode 100644 index 00000000..b282d7d1 --- /dev/null +++ b/spark-load/spark-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/SparkDpp.java @@ -0,0 +1,1167 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +package org.apache.doris.load.loadv2.dpp; + +import org.apache.doris.common.SparkDppException; +import org.apache.doris.sparkdpp.DppResult; +import org.apache.doris.sparkdpp.EtlJobConfig; + +import com.google.common.base.Strings; +import com.google.common.collect.Maps; +import com.google.gson.Gson; +import org.apache.commons.collections.CollectionUtils; +import org.apache.commons.lang3.StringUtils; +import org.apache.commons.lang3.tuple.Pair; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FSDataOutputStream; +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.parquet.column.ParquetProperties.WriterVersion; +import org.apache.parquet.hadoop.ParquetWriter; +import org.apache.parquet.hadoop.metadata.CompressionCodecName; +import org.apache.spark.Partitioner; +import org.apache.spark.TaskContext; +import org.apache.spark.api.java.JavaPairRDD; +import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.api.java.function.FlatMapFunction; +import org.apache.spark.api.java.function.PairFlatMapFunction; +import org.apache.spark.api.java.function.VoidFunction; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Row; +import org.apache.spark.sql.RowFactory; +import org.apache.spark.sql.SparkSession; +import org.apache.spark.sql.catalyst.CatalystTypeConverters; +import org.apache.spark.sql.catalyst.InternalRow; +import org.apache.spark.sql.execution.datasources.parquet.ParquetWriteSupport; +import org.apache.spark.sql.functions; +import org.apache.spark.sql.types.DataTypes; +import org.apache.spark.sql.types.StructField; +import org.apache.spark.sql.types.StructType; +import org.apache.spark.storage.StorageLevel; +import org.apache.spark.util.LongAccumulator; +import org.apache.spark.util.SerializableConfiguration; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import scala.Tuple2; +import scala.collection.JavaConverters; + +import java.io.IOException; +import java.math.BigDecimal; +import java.math.BigInteger; +import java.nio.charset.Charset; +import java.nio.charset.StandardCharsets; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Iterator; +import java.util.LinkedList; +import java.util.List; +import java.util.Map; +import java.util.Queue; +import java.util.Set; +// This class is a Spark-based data preprocessing program, +// which will make use of the distributed compute framework of spark to +// do ETL job/sort/preaggregate jobs in spark job +// to boost the process of large amount of data load. +// the process steps are as following: +// 1. load data +// 1.1 load data from path/hive table +// 1.2 do the etl process +// 2. repartition data by using doris data model(partition and bucket) +// 3. process aggregation if needed +// 4. 
write data to parquet file + +public final class SparkDpp implements java.io.Serializable { + private static final Logger LOG = LoggerFactory.getLogger(SparkDpp.class); + + private static final String NULL_FLAG = "\\N"; + private static final String DPP_RESULT_FILE = "dpp_result.json"; + private static final String BITMAP_TYPE = "bitmap"; + private SparkSession spark = null; + private EtlJobConfig etlJobConfig = null; + private LongAccumulator abnormalRowAcc = null; + private LongAccumulator scannedRowsAcc = null; + private LongAccumulator fileNumberAcc = null; + private LongAccumulator fileSizeAcc = null; + private Map bucketKeyMap = new HashMap<>(); + // accumulator to collect invalid rows + private StringAccumulator invalidRows = new StringAccumulator(); + // save the hadoop configuration from spark session. + // because hadoop configuration is not serializable, + // we need to wrap it so that we can use it in executor. + private SerializableConfiguration serializableHadoopConf; + private DppResult dppResult = new DppResult(); + Map> tableToBitmapDictColumns = new HashMap<>(); + Map> tableToBinaryBitmapColumns = new HashMap<>(); + + // just for ut + public SparkDpp() { + } + + public SparkDpp(SparkSession spark, EtlJobConfig etlJobConfig, Map> tableToBitmapDictColumns, + Map> tableToBinaryBitmapColumns) { + this.spark = spark; + this.etlJobConfig = etlJobConfig; + if (tableToBitmapDictColumns != null) { + this.tableToBitmapDictColumns = tableToBitmapDictColumns; + } + if (tableToBinaryBitmapColumns != null) { + this.tableToBinaryBitmapColumns = tableToBinaryBitmapColumns; + } + } + + public void init() { + abnormalRowAcc = spark.sparkContext().longAccumulator("abnormalRowAcc"); + scannedRowsAcc = spark.sparkContext().longAccumulator("scannedRowsAcc"); + fileNumberAcc = spark.sparkContext().longAccumulator("fileNumberAcc"); + fileSizeAcc = spark.sparkContext().longAccumulator("fileSizeAcc"); + spark.sparkContext().register(invalidRows, "InvalidRowsAccumulator"); + this.serializableHadoopConf = new SerializableConfiguration(spark.sparkContext().hadoopConfiguration()); + } + + private JavaPairRDD, Object[]> processRDDAggregate(JavaPairRDD, Object[]> currentPairRDD, + RollupTreeNode curNode, + SparkRDDAggregator[] sparkRDDAggregators) + throws SparkDppException { + final boolean isDuplicateTable = !StringUtils.equalsIgnoreCase(curNode.indexMeta.indexType, "AGGREGATE") + && !StringUtils.equalsIgnoreCase(curNode.indexMeta.indexType, "UNIQUE"); + // Aggregate/UNIQUE table + if (!isDuplicateTable) { + int idx = 0; + for (int i = 0; i < curNode.indexMeta.columns.size(); i++) { + if (!curNode.indexMeta.columns.get(i).isKey) { + sparkRDDAggregators[idx] = SparkRDDAggregator.buildAggregator(curNode.indexMeta.columns.get(i)); + idx++; + } + } + + if (curNode.indexMeta.isBaseIndex) { + JavaPairRDD, Object[]> result = currentPairRDD.mapToPair( + new EncodeBaseAggregateTableFunction(sparkRDDAggregators)) + .reduceByKey(new AggregateReduceFunction(sparkRDDAggregators)); + return result; + } else { + JavaPairRDD, Object[]> result = currentPairRDD + .mapToPair(new EncodeRollupAggregateTableFunction( + getColumnIndexInParentRollup(curNode.keyColumnNames, curNode.valueColumnNames, + curNode.parent.keyColumnNames, curNode.parent.valueColumnNames))) + .reduceByKey(new AggregateReduceFunction(sparkRDDAggregators)); + return result; + } + // Duplicate Table + } else { + int idx = 0; + for (int i = 0; i < curNode.indexMeta.columns.size(); i++) { + if (!curNode.indexMeta.columns.get(i).isKey) { + // duplicate 
table doesn't need aggregator + // init a aggregator here just for keeping interface compatibility when writing data to HDFS + sparkRDDAggregators[idx] = new DefaultSparkRDDAggregator(); + idx++; + } + } + if (curNode.indexMeta.isBaseIndex) { + return currentPairRDD; + } else { + return currentPairRDD.mapToPair(new EncodeRollupAggregateTableFunction( + getColumnIndexInParentRollup(curNode.keyColumnNames, curNode.valueColumnNames, + curNode.parent.keyColumnNames, curNode.parent.valueColumnNames))); + } + } + } + + // write data to parquet file by using writing the parquet scheme of spark. + private void writeRepartitionAndSortedRDDToParquet(JavaPairRDD, Object[]> resultRDD, + String pathPattern, long tableId, + EtlJobConfig.EtlIndex indexMeta, + SparkRDDAggregator[] sparkRDDAggregators) { + // TODO(wb) should deal largeint as BigInteger instead of string when using biginteger as key, + // data type may affect sorting logic + StructType dstSchema = DppUtils.createDstTableSchema(indexMeta.columns, false, true); + + resultRDD.repartitionAndSortWithinPartitions(new BucketPartitioner(bucketKeyMap), new BucketComparator()) + .foreachPartition((VoidFunction, Object[]>>>) t -> { + // write the data to dst file + Configuration conf = new Configuration(serializableHadoopConf.value()); + FileSystem fs = FileSystem.get(new Path(etlJobConfig.outputPath).toUri(), conf); + String lastBucketKey = null; + ParquetWriter parquetWriter = null; + TaskContext taskContext = TaskContext.get(); + long taskAttemptId = taskContext.taskAttemptId(); + String dstPath = ""; + String tmpPath = ""; + + while (t.hasNext()) { + Tuple2, Object[]> pair = t.next(); + List keyColumns = pair._1(); + Object[] valueColumns = pair._2(); + if ((keyColumns.size() + valueColumns.length) <= 1) { + LOG.warn("invalid row:" + pair); + continue; + } + + String curBucketKey = keyColumns.get(0).toString(); + List columnObjects = new ArrayList<>(); + for (int i = 1; i < keyColumns.size(); ++i) { + columnObjects.add(keyColumns.get(i)); + } + for (int i = 0; i < valueColumns.length; ++i) { + columnObjects.add(sparkRDDAggregators[i].finalize(valueColumns[i])); + } + + Row rowWithoutBucketKey = RowFactory.create(columnObjects.toArray()); + // if the bucket key is new, it will belong to a new tablet + if (lastBucketKey == null || !curBucketKey.equals(lastBucketKey)) { + if (parquetWriter != null) { + parquetWriter.close(); + // rename tmpPath to path + try { + fs.rename(new Path(tmpPath), new Path(dstPath)); + } catch (IOException ioe) { + LOG.warn("rename from tmpPath" + tmpPath + " to dstPath:" + dstPath + + " failed. exception:" + ioe); + throw ioe; + } + } + // flush current writer and create a new writer + String[] bucketKey = curBucketKey.split("_"); + if (bucketKey.length != 2) { + LOG.warn("invalid bucket key:" + curBucketKey); + continue; + } + long partitionId = Long.parseLong(bucketKey[0]); + int bucketId = Integer.parseInt(bucketKey[1]); + dstPath = String.format(pathPattern, tableId, partitionId, indexMeta.indexId, bucketId, + indexMeta.schemaHash); + tmpPath = dstPath + "." 
+ taskAttemptId; + conf.setBoolean("spark.sql.parquet.writeLegacyFormat", false); + conf.setBoolean("spark.sql.parquet.int64AsTimestampMillis", false); + conf.setBoolean("spark.sql.parquet.int96AsTimestamp", true); + conf.setBoolean("spark.sql.parquet.binaryAsString", false); + conf.set("spark.sql.parquet.outputTimestampType", "INT96"); + ParquetWriteSupport.setSchema(dstSchema, conf); + ParquetWriteSupport parquetWriteSupport = new ParquetWriteSupport(); + parquetWriter = new ParquetWriter<>(new Path(tmpPath), parquetWriteSupport, + CompressionCodecName.SNAPPY, 256 * 1024 * 1024, 16 * 1024, 1024 * 1024, true, false, + WriterVersion.PARQUET_1_0, conf); + LOG.info("[HdfsOperate]>> initialize writer succeed! path:" + tmpPath); + lastBucketKey = curBucketKey; + } + Object[] array = columnObjects.toArray(); + Object[] catalystArr = new Object[array.length]; + for (int i = 0; i < array.length; i++) { + catalystArr[i] = CatalystTypeConverters.convertToCatalyst(array[i]); + } + InternalRow internalRow = InternalRow.apply( + JavaConverters.asScalaBufferConverter(Arrays.asList(catalystArr)).asScala() + .toSeq()); + parquetWriter.write(internalRow); + } + if (parquetWriter != null) { + parquetWriter.close(); + try { + fs.rename(new Path(tmpPath), new Path(dstPath)); + } catch (IOException ioe) { + LOG.warn("rename from tmpPath" + tmpPath + " to dstPath:" + dstPath + " failed. exception:" + + ioe); + throw ioe; + } + } + + }); + } + + // TODO(wb) one shuffle to calculate the rollup in the same level + private void processRollupTree(RollupTreeNode rootNode, + JavaPairRDD, Object[]> rootRDD, + long tableId, EtlJobConfig.EtlIndex baseIndex) throws SparkDppException { + Queue nodeQueue = new LinkedList<>(); + nodeQueue.offer(rootNode); + int currentLevel = 0; + // level travel the tree + Map, Object[]>> parentRDDMap = new HashMap<>(); + parentRDDMap.put(baseIndex.indexId, rootRDD); + Map, Object[]>> childrenRDDMap = new HashMap<>(); + String pathPattern = etlJobConfig.outputPath + "/" + etlJobConfig.outputFilePattern; + while (!nodeQueue.isEmpty()) { + RollupTreeNode curNode = nodeQueue.poll(); + LOG.info("start to process index:" + curNode.indexId); + if (curNode.children != null) { + for (RollupTreeNode child : curNode.children) { + nodeQueue.offer(child); + } + } + JavaPairRDD, Object[]> curRDD = null; + // column select for rollup + if (curNode.level != currentLevel) { + for (JavaPairRDD, Object[]> rdd : parentRDDMap.values()) { + rdd.unpersist(); + } + currentLevel = curNode.level; + parentRDDMap.clear(); + parentRDDMap = childrenRDDMap; + childrenRDDMap = new HashMap<>(); + } + + long parentIndexId = baseIndex.indexId; + if (curNode.parent != null) { + parentIndexId = curNode.parent.indexId; + } + + JavaPairRDD, Object[]> parentRDD = parentRDDMap.get(parentIndexId); + + // aggregate + SparkRDDAggregator[] sparkRDDAggregators = new SparkRDDAggregator[curNode.valueColumnNames.size()]; + curRDD = processRDDAggregate(parentRDD, curNode, sparkRDDAggregators); + + childrenRDDMap.put(curNode.indexId, curRDD); + + if (curNode.children != null && curNode.children.size() > 1) { + // if the children number larger than 1, persist the dataframe for performance + curRDD.persist(StorageLevel.MEMORY_AND_DISK()); + } + // repartition and write to hdfs + writeRepartitionAndSortedRDDToParquet(curRDD, pathPattern, tableId, curNode.indexMeta, sparkRDDAggregators); + } + } + + // get column index map from parent rollup to child rollup + // not consider bucketId here + private Pair getColumnIndexInParentRollup(List 
childRollupKeyColumns, + List childRollupValueColumns, + List parentRollupKeyColumns, + List parentRollupValueColumns) + throws SparkDppException { + List keyMap = new ArrayList<>(); + List valueMap = new ArrayList<>(); + // find column index in parent rollup schema + for (int i = 0; i < childRollupKeyColumns.size(); i++) { + for (int j = 0; j < parentRollupKeyColumns.size(); j++) { + if (StringUtils.equalsIgnoreCase(childRollupKeyColumns.get(i), parentRollupKeyColumns.get(j))) { + keyMap.add(j); + break; + } + } + } + + for (int i = 0; i < childRollupValueColumns.size(); i++) { + for (int j = 0; j < parentRollupValueColumns.size(); j++) { + if (StringUtils.equalsIgnoreCase(childRollupValueColumns.get(i), parentRollupValueColumns.get(j))) { + valueMap.add(j); + break; + } + } + } + + if (keyMap.size() != childRollupKeyColumns.size() || valueMap.size() != childRollupValueColumns.size()) { + throw new SparkDppException(String.format("column map index from child to parent has error," + + " key size src: %s, dst: %s; value size src: %s, dst: %s", + childRollupKeyColumns.size(), keyMap.size(), childRollupValueColumns.size(), valueMap.size())); + } + + return Pair.of(keyMap.toArray(new Integer[keyMap.size()]), valueMap.toArray(new Integer[valueMap.size()])); + } + + /** + * check decimal,char/varchar + */ + public boolean validateData(Object srcValue, EtlJobConfig.EtlColumn etlColumn, ColumnParser columnParser, Row row) { + + switch (etlColumn.columnType.toUpperCase()) { + case "DECIMALV2": + case "DECIMAL32": + case "DECIMAL64": + case "DECIMAL128": + // TODO(wb): support decimal round; see be DecimalV2Value::round + DecimalParser decimalParser = (DecimalParser) columnParser; + BigDecimal srcBigDecimal = (BigDecimal) srcValue; + if (srcValue != null && (decimalParser.getMaxValue().compareTo(srcBigDecimal) < 0 + || decimalParser.getMinValue().compareTo(srcBigDecimal) > 0)) { + LOG.warn(String.format("decimal value is not valid for defination, column=%s," + + " value=%s,precision=%s,scale=%s", + etlColumn.columnName, srcValue, srcBigDecimal.precision(), srcBigDecimal.scale())); + return false; + } + break; + case "CHAR": + case "VARCHAR": + // TODO(wb) padding char type + int strSize = 0; + if (srcValue != null && (strSize = srcValue.toString().getBytes(StandardCharsets.UTF_8).length) + > etlColumn.stringLength) { + LOG.warn(String.format("the length of input is too long than schema." + + " column_name:%s,input_str[%s],schema length:%s,actual length:%s", + etlColumn.columnName, row.toString(), etlColumn.stringLength, strSize)); + return false; + } + break; + case "STRING": + case "TEXT": + // TODO(zjf) padding string type + int strDataSize = 0; + if (srcValue != null && (strDataSize = srcValue.toString().getBytes(StandardCharsets.UTF_8).length) + > DppUtils.STRING_LENGTH_LIMIT) { + LOG.warn(String.format("The string type is limited to a maximum of %s bytes." 
+ + " column_name:%s,input_str[%s],actual length:%s", + DppUtils.STRING_LENGTH_LIMIT, etlColumn.columnName, row.toString(), strDataSize)); + return false; + } + break; + default: + return true; + } + return true; + } + + /** + * 1 project column and reorder column + * 2 validate data + * 3 fill tuple with partition column + */ + private JavaPairRDD, Object[]> fillTupleWithPartitionColumn(Dataset dataframe, + EtlJobConfig.EtlPartitionInfo partitionInfo, + List partitionKeyIndex, + List partitionRangeKeys, + List keyAndPartitionColumnNames, + List valueColumnNames, + StructType dstTableSchema, + EtlJobConfig.EtlIndex baseIndex, + List validPartitionIds) + throws SparkDppException { + List distributeColumns = partitionInfo.distributionColumnRefs; + Partitioner partitioner = new DorisRangePartitioner(partitionInfo, partitionKeyIndex, partitionRangeKeys); + Set validPartitionIndex = new HashSet<>(); + if (validPartitionIds == null) { + for (int i = 0; i < partitionInfo.partitions.size(); ++i) { + validPartitionIndex.add(i); + } + } else { + for (int i = 0; i < partitionInfo.partitions.size(); ++i) { + if (validPartitionIds.contains(partitionInfo.partitions.get(i).partitionId)) { + validPartitionIndex.add(i); + } + } + } + + Map parsers = Maps.newHashMap(); + for (EtlJobConfig.EtlColumn column : baseIndex.columns) { + parsers.put(column.columnName, ColumnParser.create(column)); + } + + // use PairFlatMapFunction instead of PairMapFunction because the there will be + // 0 or 1 output row for 1 input row + JavaPairRDD, Object[]> resultPairRDD = dataframe.toJavaRDD().flatMapToPair( + (PairFlatMapFunction, Object[]>) row -> { + List, Object[]>> result = new ArrayList<>(); + List keyAndPartitionColumns = new ArrayList<>(); + List keyColumns = new ArrayList<>(); + List valueColumns = new ArrayList<>(valueColumnNames.size()); + for (int i = 0; i < keyAndPartitionColumnNames.size(); i++) { + String columnName = keyAndPartitionColumnNames.get(i); + Object columnObject = row.get(row.fieldIndex(columnName)); + if (!validateData(columnObject, baseIndex.getColumn(columnName), + parsers.get(columnName), row)) { + abnormalRowAcc.add(1); + return result.iterator(); + } + keyAndPartitionColumns.add(columnObject); + + if (baseIndex.getColumn(columnName).isKey) { + keyColumns.add(columnObject); + } + } + + for (int i = 0; i < valueColumnNames.size(); i++) { + String columnName = valueColumnNames.get(i); + Object columnObject = row.get(row.fieldIndex(columnName)); + if (!validateData(columnObject, baseIndex.getColumn(columnName), + parsers.get(columnName), row)) { + abnormalRowAcc.add(1); + return result.iterator(); + } + valueColumns.add(columnObject); + } + + DppColumns key = new DppColumns(keyAndPartitionColumns); + int pid = partitioner.getPartition(key); + if (!validPartitionIndex.contains(pid)) { + LOG.warn("invalid partition for row:" + row + ", pid:" + pid); + abnormalRowAcc.add(1); + LOG.info("abnormalRowAcc:" + abnormalRowAcc); + if (abnormalRowAcc.value() < 5) { + LOG.info("add row to invalidRows:" + row.toString()); + invalidRows.add(row.toString()); + LOG.info("invalid rows contents:" + invalidRows.value()); + } + } else { + // TODO(wb) support lagreint for hash + long hashValue = DppUtils.getHashValue(row, distributeColumns, dstTableSchema); + int bucketId = (int) ((hashValue & 0xffffffffL) % partitionInfo.partitions.get(pid).bucketNum); + long partitionId = partitionInfo.partitions.get(pid).partitionId; + // bucketKey is partitionId_bucketId + String bucketKey = partitionId + "_" + bucketId; + + List 
tuple = new ArrayList<>(); + tuple.add(bucketKey); + tuple.addAll(keyColumns); + result.add(new Tuple2<>(tuple, valueColumns.toArray())); + } + return result.iterator(); + }); + + // use bucket number as the parallel number + int reduceNum = 0; + for (EtlJobConfig.EtlPartition partition : partitionInfo.partitions) { + for (int i = 0; i < partition.bucketNum; i++) { + bucketKeyMap.put(partition.partitionId + "_" + i, reduceNum); + reduceNum++; + } + } + + // print to system.out for easy to find log info + System.out.println("print bucket key map:" + bucketKeyMap.toString()); + + return resultPairRDD; + } + + // do the etl process + private Dataset convertSrcDataframeToDstDataframe(EtlJobConfig.EtlIndex baseIndex, + Dataset srcDataframe, StructType dstTableSchema, + EtlJobConfig.EtlFileGroup fileGroup) + throws SparkDppException { + Dataset dataframe = srcDataframe; + StructType srcSchema = dataframe.schema(); + Set srcColumnNames = new HashSet<>(); + for (StructField field : srcSchema.fields()) { + srcColumnNames.add(field.name()); + } + Map columnMappings = fileGroup.columnMappings; + // 1. process simple columns + Set mappingColumns = null; + if (columnMappings != null) { + mappingColumns = columnMappings.keySet(); + } + List dstColumnNames = new ArrayList<>(); + for (StructField dstField : dstTableSchema.fields()) { + dstColumnNames.add(dstField.name()); + EtlJobConfig.EtlColumn column = baseIndex.getColumn(dstField.name()); + if (!srcColumnNames.contains(dstField.name())) { + if (mappingColumns != null && mappingColumns.contains(dstField.name())) { + // mapping columns will be processed in next step + continue; + } + if (column.defaultValue != null) { + if (column.defaultValue.equals(NULL_FLAG)) { + dataframe = dataframe.withColumn(dstField.name(), functions.lit(null)); + } else { + dataframe = dataframe.withColumn(dstField.name(), functions.lit(column.defaultValue)); + } + } else if (column.isAllowNull) { + dataframe = dataframe.withColumn(dstField.name(), functions.lit(null)); + } else { + throw new SparkDppException("Reason: no data for column:" + dstField.name()); + } + } + if (column.columnType.equalsIgnoreCase("DATE")) { + dataframe = dataframe.withColumn(dstField.name(), + dataframe.col(dstField.name()).cast(DataTypes.DateType)); + } else if (column.columnType.equalsIgnoreCase("DATETIME")) { + dataframe = dataframe.withColumn(dstField.name(), + dataframe.col(dstField.name()).cast(DataTypes.TimestampType)); + } else if (column.columnType.equalsIgnoreCase("BOOLEAN")) { + dataframe = dataframe.withColumn(dstField.name(), + functions.when(functions.lower(dataframe.col(dstField.name())).equalTo("true"), "1") + .when(dataframe.col(dstField.name()).equalTo("1"), "1") + .otherwise("0")); + } else if (!column.columnType.equalsIgnoreCase(BITMAP_TYPE) + && !dstField.dataType().equals(DataTypes.StringType)) { + dataframe = dataframe.withColumn(dstField.name(), + dataframe.col(dstField.name()).cast(dstField.dataType())); + } else if (column.columnType.equalsIgnoreCase(BITMAP_TYPE) + && dstField.dataType().equals(DataTypes.BinaryType)) { + dataframe = dataframe.withColumn(dstField.name(), + dataframe.col(dstField.name()).cast(DataTypes.BinaryType)); + } + if (fileGroup.isNegative && !column.isKey) { + // negative load + // value will be convert te -1 * value + dataframe = dataframe.withColumn(dstField.name(), functions.expr("-1 *" + dstField.name())); + } + } + // 2. 
process the mapping columns + for (String mappingColumn : mappingColumns) { + String mappingDescription = columnMappings.get(mappingColumn).toDescription(); + if (mappingDescription.toLowerCase().contains("hll_hash")) { + continue; + } + // here should cast data type to dst column type + dataframe = dataframe.withColumn(mappingColumn, + functions.expr(mappingDescription).cast(dstTableSchema.apply(mappingColumn).dataType())); + } + return dataframe; + } + + private Dataset loadDataFromPath(SparkSession spark, + EtlJobConfig.EtlFileGroup fileGroup, + String fileUrl, + EtlJobConfig.EtlIndex baseIndex, + List columns) throws SparkDppException { + List columnValueFromPath = DppUtils.parseColumnsFromPath(fileUrl, fileGroup.columnsFromPath); + List dataSrcColumns = fileGroup.fileFieldNames; + if (dataSrcColumns == null) { + // if there is no source columns info + // use base index columns as source columns + dataSrcColumns = new ArrayList<>(); + for (EtlJobConfig.EtlColumn column : baseIndex.columns) { + dataSrcColumns.add(column.columnName); + } + } + // for getting schema to check source data + Map dstColumnNameToIndex = new HashMap(); + for (int i = 0; i < baseIndex.columns.size(); i++) { + dstColumnNameToIndex.put(baseIndex.columns.get(i).columnName, i); + } + List srcColumnsWithColumnsFromPath = new ArrayList<>(); + srcColumnsWithColumnsFromPath.addAll(dataSrcColumns); + if (fileGroup.columnsFromPath != null) { + srcColumnsWithColumnsFromPath.addAll(fileGroup.columnsFromPath); + } + + if ("parquet".equalsIgnoreCase(fileGroup.fileFormat)) { + // parquet had its own schema, just use it; perhaps we could add some validation in future. + Dataset dataFrame = spark.read().parquet(fileUrl); + if (!CollectionUtils.isEmpty(columnValueFromPath)) { + for (int k = 0; k < columnValueFromPath.size(); k++) { + dataFrame = dataFrame.withColumn( + fileGroup.columnsFromPath.get(k), functions.lit(columnValueFromPath.get(k))); + } + } + return dataFrame; + } + + if ("orc".equalsIgnoreCase(fileGroup.fileFormat)) { + Dataset dataFrame = spark.read().orc(fileUrl); + if (!CollectionUtils.isEmpty(columnValueFromPath)) { + for (int k = 0; k < columnValueFromPath.size(); k++) { + dataFrame = dataFrame.withColumn( + fileGroup.columnsFromPath.get(k), functions.lit(columnValueFromPath.get(k))); + } + } + return dataFrame; + } + + StructType srcSchema = createScrSchema(srcColumnsWithColumnsFromPath); + JavaRDD sourceDataRdd = spark.read().textFile(fileUrl).toJavaRDD(); + int columnSize = dataSrcColumns.size(); + List parsers = new ArrayList<>(); + for (EtlJobConfig.EtlColumn column : baseIndex.columns) { + parsers.add(ColumnParser.create(column)); + } + char separator = (char) fileGroup.columnSeparator.getBytes(Charset.forName("UTF-8"))[0]; + JavaRDD rowRDD = sourceDataRdd.flatMap( + record -> { + scannedRowsAcc.add(1); + String[] attributes = splitLine(record, separator); + List result = new ArrayList<>(); + boolean validRow = true; + if (attributes.length != columnSize) { + LOG.warn("invalid src schema, data columns:" + + attributes.length + ", file group columns:" + + columnSize + ", row:" + record); + validRow = false; + } else { + for (int i = 0; i < attributes.length; ++i) { + StructField field = srcSchema.apply(i); + String srcColumnName = field.name(); + if (attributes[i].equals(NULL_FLAG) && dstColumnNameToIndex.containsKey(srcColumnName)) { + if (baseIndex.columns.get(dstColumnNameToIndex.get(srcColumnName)).isAllowNull) { + attributes[i] = null; + } else { + LOG.warn("column name:" + srcColumnName + ", 
attribute: " + i + + " can not be null. row:" + record); + validRow = false; + break; + } + } + boolean isStrictMode = etlJobConfig.properties.strictMode; + if (isStrictMode) { + if (dstColumnNameToIndex.containsKey(srcColumnName)) { + int index = dstColumnNameToIndex.get(srcColumnName); + String type = columns.get(index).columnType; + if (type.equalsIgnoreCase("CHAR") + || type.equalsIgnoreCase("VARCHAR") + || fileGroup.columnMappings.containsKey(field.name())) { + continue; + } + ColumnParser parser = parsers.get(index); + boolean valid = parser.parse(attributes[i]); + if (!valid) { + validRow = false; + LOG.warn("invalid row:" + record + + ", attribute " + i + ": " + attributes[i] + " parsed failed"); + break; + } + } + } + } + } + if (validRow) { + Row row = null; + if (fileGroup.columnsFromPath == null) { + row = RowFactory.create(attributes); + } else { + // process columns from path + // append columns from path to the tail + List columnAttributes = new ArrayList<>(); + columnAttributes.addAll(Arrays.asList(attributes)); + columnAttributes.addAll(columnValueFromPath); + row = RowFactory.create(columnAttributes.toArray()); + } + result.add(row); + } else { + abnormalRowAcc.add(1); + // at most add 5 rows to invalidRows + if (abnormalRowAcc.value() <= 5) { + invalidRows.add(record); + } + } + return result.iterator(); + } + ); + + Dataset dataframe = spark.createDataFrame(rowRDD, srcSchema); + if (!Strings.isNullOrEmpty(fileGroup.where)) { + dataframe = dataframe.where(fileGroup.where); + } + return dataframe; + } + + private StructType createScrSchema(List srcColumns) { + List fields = new ArrayList<>(); + for (String srcColumn : srcColumns) { + // user StringType to load source data + StructField field = DataTypes.createStructField(srcColumn, DataTypes.StringType, true); + fields.add(field); + } + StructType srcSchema = DataTypes.createStructType(fields); + return srcSchema; + } + + // This method is to keep the splitting consistent with broker load / mini load + private String[] splitLine(String line, char sep) { + if (line == null || line.equals("")) { + return new String[0]; + } + int index = 0; + int lastIndex = 0; + // line-begin char and line-end char are considered to be 'delimeter' + List values = new ArrayList<>(); + for (int i = 0; i < line.length(); i++, index++) { + if (line.charAt(index) == sep) { + values.add(line.substring(lastIndex, index)); + lastIndex = index + 1; + } + } + values.add(line.substring(lastIndex, index)); + return values.toArray(new String[0]); + } + + // partition keys will be parsed into double from json + // so need to convert it to partition columns' type + private Object convertPartitionKey(Object srcValue, Class dstClass) throws SparkDppException { + if (dstClass.equals(Float.class) || dstClass.equals(Double.class)) { + return null; + } + if (srcValue instanceof Double) { + if (dstClass.equals(Short.class)) { + return ((Double) srcValue).shortValue(); + } else if (dstClass.equals(Integer.class)) { + return ((Double) srcValue).intValue(); + } else if (dstClass.equals(Long.class)) { + return ((Double) srcValue).longValue(); + } else if (dstClass.equals(BigInteger.class)) { + // TODO(wb) gson will cast origin value to double by default + // when the partition column is largeint, this will cause error data + // need fix it thoroughly + return new BigInteger(srcValue.toString()); + } else if (dstClass.equals(java.sql.Date.class) || dstClass.equals(java.util.Date.class)) { + double srcValueDouble = (double) srcValue; + return convertToJavaDate((int) 
srcValueDouble); + } else if (dstClass.equals(java.sql.Timestamp.class)) { + double srcValueDouble = (double) srcValue; + return convertToJavaDatetime((long) srcValueDouble); + } else { + // dst type is string + return srcValue.toString(); + } + } else { + LOG.warn("unsupport partition key:" + srcValue); + throw new SparkDppException("unsupport partition key:" + srcValue); + } + } + + private java.sql.Timestamp convertToJavaDatetime(long src) { + String dateTimeStr = Long.valueOf(src).toString(); + if (dateTimeStr.length() != 14) { + throw new RuntimeException("invalid input date format for SparkDpp"); + } + + String year = dateTimeStr.substring(0, 4); + String month = dateTimeStr.substring(4, 6); + String day = dateTimeStr.substring(6, 8); + String hour = dateTimeStr.substring(8, 10); + String min = dateTimeStr.substring(10, 12); + String sec = dateTimeStr.substring(12, 14); + + return java.sql.Timestamp.valueOf(String.format("%s-%s-%s %s:%s:%s", year, month, day, hour, min, sec)); + } + + private java.sql.Date convertToJavaDate(int originDate) { + int day = originDate & 0x1f; + originDate >>= 5; + int month = originDate & 0x0f; + originDate >>= 4; + int year = originDate; + return java.sql.Date.valueOf(String.format("%04d-%02d-%02d", year, month, day)); + } + + private List createPartitionRangeKeys( + EtlJobConfig.EtlPartitionInfo partitionInfo, List partitionKeySchema) throws SparkDppException { + List partitionRangeKeys = new ArrayList<>(); + for (EtlJobConfig.EtlPartition partition : partitionInfo.partitions) { + DorisRangePartitioner.PartitionRangeKey partitionRangeKey = new DorisRangePartitioner.PartitionRangeKey(); + List startKeyColumns = new ArrayList<>(); + for (int i = 0; i < partition.startKeys.size(); i++) { + Object value = partition.startKeys.get(i); + startKeyColumns.add(convertPartitionKey(value, partitionKeySchema.get(i))); + } + partitionRangeKey.startKeys = new DppColumns(startKeyColumns); + if (!partition.isMaxPartition) { + partitionRangeKey.isMaxPartition = false; + List endKeyColumns = new ArrayList<>(); + for (int i = 0; i < partition.endKeys.size(); i++) { + Object value = partition.endKeys.get(i); + endKeyColumns.add(convertPartitionKey(value, partitionKeySchema.get(i))); + } + partitionRangeKey.endKeys = new DppColumns(endKeyColumns); + } else { + partitionRangeKey.isMaxPartition = true; + } + partitionRangeKeys.add(partitionRangeKey); + } + return partitionRangeKeys; + } + + private Dataset loadDataFromFilePaths(SparkSession spark, + EtlJobConfig.EtlIndex baseIndex, + List filePaths, + EtlJobConfig.EtlFileGroup fileGroup, + StructType dstTableSchema) + throws SparkDppException, IOException { + Dataset fileGroupDataframe = null; + for (String filePath : filePaths) { + try { + FileSystem fs = FileSystem.get(new Path(filePath).toUri(), serializableHadoopConf.value()); + FileStatus[] fileStatuses = fs.globStatus(new Path(filePath)); + if (fileStatuses == null) { + throw new SparkDppException("fs list status failed: " + filePath); + } + for (FileStatus fileStatus : fileStatuses) { + if (fileStatus.isDirectory()) { + continue; + } + fileNumberAcc.add(1); + fileSizeAcc.add(fileStatus.getLen()); + } + } catch (Exception e) { + LOG.warn("parse path failed:" + filePath); + throw e; + } + if (fileGroup.columnSeparator == null) { + LOG.warn("invalid null column separator!"); + throw new SparkDppException("Reason: invalid null column separator!"); + } + Dataset dataframe = null; + + dataframe = loadDataFromPath(spark, fileGroup, filePath, baseIndex, baseIndex.columns); + 
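+ // the raw frame is loaded with all-string columns (or with the file format's own schema for
+ // parquet/orc); the conversion below fills default values, applies column mappings and casts
+ // the columns to the destination table types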
dataframe = convertSrcDataframeToDstDataframe(baseIndex, dataframe, dstTableSchema, fileGroup); + if (fileGroupDataframe == null) { + fileGroupDataframe = dataframe; + } else { + fileGroupDataframe.union(dataframe); + } + } + return fileGroupDataframe; + } + + private Dataset loadDataFromHiveTable(SparkSession spark, + String hiveDbTableName, + EtlJobConfig.EtlIndex baseIndex, + EtlJobConfig.EtlFileGroup fileGroup, + StructType dstTableSchema, + Set dictBitmapColumnSet, + Set binaryBitmapColumnsSet) throws SparkDppException { + // select base index columns from hive table + StringBuilder sql = new StringBuilder(); + sql.append("select "); + baseIndex.columns.forEach(column -> { + sql.append(column.columnName).append(","); + }); + sql.deleteCharAt(sql.length() - 1).append(" from ").append(hiveDbTableName); + if (!Strings.isNullOrEmpty(fileGroup.where)) { + sql.append(" where ").append(fileGroup.where); + } + + Dataset dataframe = spark.sql(sql.toString()); + // Note(wb): in current spark load implementation, spark load can't be consistent with doris BE; + // The reason is as follows + // For stream load in doris BE, it runs as follow steps: + // step 1: type check + // step 2: expression calculation + // step 3: strict mode check + // step 4: nullable column check + // BE can do the four steps row by row + // but spark load relies on spark to do step2, so it can only do step 1 for whole dataset + // and then do step 2 for whole dataset and so on; + // So in spark load, we first do step 1,3,4,and then do step 2. + dataframe = checkDataFromHiveWithStrictMode(dataframe, baseIndex, fileGroup.columnMappings.keySet(), + etlJobConfig.properties.strictMode, dstTableSchema, dictBitmapColumnSet, binaryBitmapColumnsSet); + dataframe = convertSrcDataframeToDstDataframe(baseIndex, dataframe, dstTableSchema, fileGroup); + return dataframe; + } + + private Dataset checkDataFromHiveWithStrictMode(Dataset dataframe, EtlJobConfig.EtlIndex baseIndex, + Set mappingColKeys, boolean isStrictMode, + StructType dstTableSchema, + Set dictBitmapColumnSet, + Set binaryBitmapColumnsSet) throws SparkDppException { + List columnNameNeedCheckArrayList = new ArrayList<>(); + List columnParserArrayList = new ArrayList<>(); + for (EtlJobConfig.EtlColumn column : baseIndex.columns) { + // note(wb): there are three data source for bitmap column + // case 1: global dict and binary data; needn't check + // case 2: bitmap hash function; this func is not supported in spark load now, so ignore it here + // case 3: origin value is a integer value; it should be checked use LongParser + if (StringUtils.equalsIgnoreCase(column.columnType, "bitmap")) { + if (dictBitmapColumnSet.contains(column.columnName.toLowerCase())) { + continue; + } + if (binaryBitmapColumnsSet.contains(column.columnName.toLowerCase())) { + continue; + } + columnNameNeedCheckArrayList.add(column); + columnParserArrayList.add(new BigIntParser()); + } else if (!StringUtils.equalsIgnoreCase(column.columnType, "varchar") + && !StringUtils.equalsIgnoreCase(column.columnType, "char") + && !mappingColKeys.contains(column.columnName)) { + columnNameNeedCheckArrayList.add(column); + columnParserArrayList.add(ColumnParser.create(column)); + } + } + + ColumnParser[] columnParserArray = columnParserArrayList.toArray(new ColumnParser[0]); + EtlJobConfig.EtlColumn[] columnNameArray = columnNameNeedCheckArrayList.toArray(new EtlJobConfig.EtlColumn[0]); + + StructType srcSchema = dataframe.schema(); + JavaRDD result = dataframe.toJavaRDD().flatMap(new FlatMapFunction() { + 
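+ // for each row: a null value in a non-nullable column rejects the row; a value that fails type
+ // parsing rejects the row in strict mode, otherwise it is replaced with null if the column allows
+ // null; rejected rows are counted and the first few are recorded in the invalid-row accumulator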
@Override + public Iterator call(Row row) throws Exception { + List result = new ArrayList<>(); + Set columnIndexNeedToRepalceNull = new HashSet(); + boolean validRow = true; + for (int i = 0; i < columnNameArray.length; i++) { + EtlJobConfig.EtlColumn column = columnNameArray[i]; + int fieldIndex = row.fieldIndex(column.columnName); + Object value = row.get(fieldIndex); + if (value == null && !column.isAllowNull) { + validRow = false; + LOG.warn("column:" + i + " can not be null. row:" + row.toString()); + break; + } + if (value != null && !columnParserArray[i].parse(value.toString())) { + if (isStrictMode) { + validRow = false; + LOG.warn(String.format("row parsed failed in strict mode, column name %s, src row %s", + column.columnName, row.toString())); + } else if (!column.isAllowNull) { + // a column parsed failed would be filled null, + // but if doris column is not allowed null, we should skip this row + validRow = false; + LOG.warn("column:" + i + " can not be null. row:" + row.toString()); + break; + } else { + columnIndexNeedToRepalceNull.add(fieldIndex); + } + } + } + if (!validRow) { + abnormalRowAcc.add(1); + // at most add 5 rows to invalidRows + if (abnormalRowAcc.value() <= 5) { + invalidRows.add(row.toString()); + } + } else if (columnIndexNeedToRepalceNull.size() != 0) { + Object[] newRow = new Object[row.size()]; + for (int i = 0; i < row.size(); i++) { + if (columnIndexNeedToRepalceNull.contains(i)) { + newRow[i] = null; + } else { + newRow[i] = row.get(i); + } + } + result.add(RowFactory.create(newRow)); + } else { + result.add(row); + } + return result.iterator(); + } + }); + + // here we just check data but not do cast, + // so data type should be same with src schema which is hive table schema + return spark.createDataFrame(result, srcSchema); + } + + private void process() throws Exception { + try { + for (Map.Entry entry : etlJobConfig.tables.entrySet()) { + Long tableId = entry.getKey(); + EtlJobConfig.EtlTable etlTable = entry.getValue(); + Set dictBitmapColumnSet = tableToBitmapDictColumns.getOrDefault(tableId, new HashSet<>()); + Set binaryBitmapColumnSet = tableToBinaryBitmapColumns.getOrDefault(tableId, new HashSet<>()); + + // get the base index meta + EtlJobConfig.EtlIndex baseIndex = null; + for (EtlJobConfig.EtlIndex indexMeta : etlTable.indexes) { + if (indexMeta.isBaseIndex) { + baseIndex = indexMeta; + break; + } + } + + // get key and partition column names and value column names separately + List keyAndPartitionColumnNames = new ArrayList<>(); + List valueColumnNames = new ArrayList<>(); + for (EtlJobConfig.EtlColumn etlColumn : baseIndex.columns) { + if (etlColumn.isKey) { + keyAndPartitionColumnNames.add(etlColumn.columnName); + } else { + if (etlTable.partitionInfo.partitionColumnRefs.contains(etlColumn.columnName)) { + keyAndPartitionColumnNames.add(etlColumn.columnName); + } + valueColumnNames.add(etlColumn.columnName); + } + } + + EtlJobConfig.EtlPartitionInfo partitionInfo = etlTable.partitionInfo; + List partitionKeyIndex = new ArrayList(); + List partitionKeySchema = new ArrayList<>(); + for (String key : partitionInfo.partitionColumnRefs) { + for (int i = 0; i < baseIndex.columns.size(); ++i) { + EtlJobConfig.EtlColumn column = baseIndex.columns.get(i); + if (column.columnName.equals(key)) { + partitionKeyIndex.add(keyAndPartitionColumnNames.indexOf(key)); + partitionKeySchema.add(DppUtils.getClassFromColumn(column)); + break; + } + } + } + List partitionRangeKeys + = createPartitionRangeKeys(partitionInfo, partitionKeySchema); + StructType 
dstTableSchema = DppUtils.createDstTableSchema(baseIndex.columns, false, false); + dstTableSchema = DppUtils.replaceBinaryColsInSchema(binaryBitmapColumnSet, dstTableSchema); + RollupTreeBuilder rollupTreeParser = new MinimumCoverageRollupTreeBuilder(); + RollupTreeNode rootNode = rollupTreeParser.build(etlTable); + LOG.info("Start to process rollup tree:" + rootNode); + + JavaPairRDD, Object[]> tablePairRDD = null; + for (EtlJobConfig.EtlFileGroup fileGroup : etlTable.fileGroups) { + List filePaths = fileGroup.filePaths; + Dataset fileGroupDataframe = null; + EtlJobConfig.SourceType sourceType = fileGroup.sourceType; + if (sourceType == EtlJobConfig.SourceType.FILE) { + fileGroupDataframe = loadDataFromFilePaths( + spark, baseIndex, filePaths, fileGroup, dstTableSchema); + } else if (sourceType == EtlJobConfig.SourceType.HIVE) { + fileGroupDataframe = loadDataFromHiveTable(spark, fileGroup.dppHiveDbTableName, + baseIndex, fileGroup, dstTableSchema, dictBitmapColumnSet, binaryBitmapColumnSet); + } else { + throw new RuntimeException("Unknown source type: " + sourceType.name()); + } + if (fileGroupDataframe == null) { + LOG.info("no data for file file group:" + fileGroup); + continue; + } + + JavaPairRDD, Object[]> ret = fillTupleWithPartitionColumn( + fileGroupDataframe, + partitionInfo, partitionKeyIndex, + partitionRangeKeys, + keyAndPartitionColumnNames, valueColumnNames, + dstTableSchema, baseIndex, fileGroup.partitions); + if (tablePairRDD == null) { + tablePairRDD = ret; + } else { + tablePairRDD.union(ret); + } + } + processRollupTree(rootNode, tablePairRDD, tableId, baseIndex); + } + LOG.info("invalid rows contents:" + invalidRows.value()); + dppResult.isSuccess = true; + dppResult.failedReason = ""; + } catch (Exception exception) { + LOG.warn("spark dpp failed for exception:" + exception); + dppResult.isSuccess = false; + dppResult.failedReason = exception.getMessage(); + throw exception; + } finally { + spark.stop(); + dppResult.normalRows = scannedRowsAcc.value() - abnormalRowAcc.value(); + dppResult.scannedRows = scannedRowsAcc.value(); + dppResult.fileNumber = fileNumberAcc.value(); + dppResult.fileSize = fileSizeAcc.value(); + dppResult.abnormalRows = abnormalRowAcc.value(); + dppResult.partialAbnormalRows = invalidRows.value(); + } + } + + private void writeDppResult(DppResult dppResult) throws Exception { + String outputPath = etlJobConfig.getOutputPath(); + String resultFilePath = outputPath + "/" + DPP_RESULT_FILE; + FileSystem fs = FileSystem.get(new Path(outputPath).toUri(), serializableHadoopConf.value()); + Path filePath = new Path(resultFilePath); + FSDataOutputStream outputStream = fs.create(filePath); + Gson gson = new Gson(); + outputStream.write(gson.toJson(dppResult).getBytes()); + outputStream.write('\n'); + outputStream.close(); + } + + public void doDpp() throws Exception { + try { + process(); + } catch (Exception e) { + throw e; + } finally { + // write dpp result to file in outputPath + writeDppResult(dppResult); + } + } +} diff --git a/spark-load/spark-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/SparkRDDAggregator.java b/spark-load/spark-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/SparkRDDAggregator.java new file mode 100644 index 00000000..0e140af1 --- /dev/null +++ b/spark-load/spark-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/SparkRDDAggregator.java @@ -0,0 +1,607 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +package org.apache.doris.load.loadv2.dpp; + +import org.apache.doris.common.SparkDppException; +import org.apache.doris.common.io.BitmapValue; +import org.apache.doris.common.io.Hll; +import org.apache.doris.sparkdpp.EtlJobConfig; + +import org.apache.commons.lang3.StringUtils; +import org.apache.commons.lang3.tuple.Pair; +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; +import org.apache.spark.Partitioner; +import org.apache.spark.api.java.function.Function2; +import org.apache.spark.api.java.function.PairFunction; +import scala.Tuple2; + +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; +import java.io.DataInputStream; +import java.io.DataOutputStream; +import java.io.IOException; +import java.io.Serializable; +import java.math.BigDecimal; +import java.math.BigInteger; +import java.util.ArrayList; +import java.util.Comparator; +import java.util.List; +import java.util.Map; + +// contains all class about spark aggregate + +public abstract class SparkRDDAggregator implements Serializable { + + T init(Object value) { + return (T) value; + } + + abstract T update(T v1, T v2); + + Object finalize(Object value) { + return value; + } + + public static SparkRDDAggregator buildAggregator(EtlJobConfig.EtlColumn column) throws SparkDppException { + String aggType = StringUtils.lowerCase(column.aggregationType); + String columnType = StringUtils.lowerCase(column.columnType); + switch (aggType) { + case "bitmap_union": + return new BitmapUnionAggregator(); + case "hll_union": + return new HllUnionAggregator(); + case "max": + switch (columnType) { + case "tinyint": + case "smallint": + case "int": + case "bigint": + case "float": + case "double": + case "decimalv2": + case "decimal32": + case "decimal64": + case "decimal128": + case "date": + case "datetime": + case "datev2": + case "datetimev2": + return new NumberMaxAggregator(); + case "char": + case "varchar": + return new StringMaxAggregator(); + case "largeint": + return new LargeIntMaxAggregator(); + default: + throw new SparkDppException( + String.format("unsupported max aggregator for column type:%s", columnType)); + } + case "min": + switch (columnType) { + case "tinyint": + case "smallint": + case "int": + case "bigint": + case "float": + case "double": + case "decimalv2": + case "decimal32": + case "decimal64": + case "decimal128": + case "date": + case "datetime": + case "datev2": + case "datetimev2": + return new NumberMinAggregator(); + case "char": + case "varchar": + return new StringMinAggregator(); + case "largeint": + return new LargeIntMinAggregator(); + default: + throw new SparkDppException( + String.format("unsupported min aggregator for column type:%s", columnType)); + } + case "sum": + switch (columnType) { + case "tinyint": + return new ByteSumAggregator(); + case 
"smallint": + return new ShortSumAggregator(); + case "int": + return new IntSumAggregator(); + case "bigint": + return new LongSumAggregator(); + case "float": + return new FloatSumAggregator(); + case "double": + return new DoubleSumAggregator(); + case "largeint": + return new LargeIntSumAggregator(); + case "decimalv2": + case "decimal32": + case "decimal64": + case "decimal128": + return new BigDecimalSumAggregator(); + default: + throw new SparkDppException( + String.format("unsupported sum aggregator for column type:%s", columnType)); + } + case "replace_if_not_null": + return new ReplaceIfNotNullAggregator(); + case "replace": + return new ReplaceAggregator(); + default: + throw new SparkDppException(String.format("unsupported aggregate type %s", aggType)); + } + } + +} + +// just used for duplicate table, default logic is enough +class DefaultSparkRDDAggregator extends SparkRDDAggregator { + + @Override + Object update(Object v1, Object v2) { + return null; + } +} + +// just encode value column,used for base rollup +class EncodeBaseAggregateTableFunction implements PairFunction, Object[]>, List, Object[]> { + + private SparkRDDAggregator[] valueAggregators; + + public EncodeBaseAggregateTableFunction(SparkRDDAggregator[] valueAggregators) { + this.valueAggregators = valueAggregators; + } + + + @Override + public Tuple2, Object[]> call(Tuple2, Object[]> srcPair) throws Exception { + for (int i = 0; i < srcPair._2().length; i++) { + srcPair._2()[i] = valueAggregators[i].init(srcPair._2()[i]); + } + return srcPair; + } +} + +// just map column from parent rollup index to child rollup index,used for child rollup +class EncodeRollupAggregateTableFunction + implements PairFunction, Object[]>, List, Object[]> { + + Pair columnIndexInParentRollup; + + public EncodeRollupAggregateTableFunction(Pair columnIndexInParentRollup) { + this.columnIndexInParentRollup = columnIndexInParentRollup; + } + + @Override + public Tuple2, Object[]> call(Tuple2, Object[]> parentRollupKeyValuePair) + throws Exception { + Integer[] keyColumnIndexMap = columnIndexInParentRollup.getKey(); + Integer[] valueColumnIndexMap = columnIndexInParentRollup.getValue(); + + List keys = new ArrayList(); + Object[] values = new Object[valueColumnIndexMap.length]; + + // deal bucket_id column + keys.add(parentRollupKeyValuePair._1().get(0)); + for (int i = 0; i < keyColumnIndexMap.length; i++) { + keys.add(parentRollupKeyValuePair._1().get(keyColumnIndexMap[i] + 1)); + } + + for (int i = 0; i < valueColumnIndexMap.length; i++) { + values[i] = parentRollupKeyValuePair._2()[valueColumnIndexMap[i]]; + } + return new Tuple2<>(keys, values); + } +} + +class AggregateReduceFunction implements Function2 { + + private SparkRDDAggregator[] valueAggregators; + + public AggregateReduceFunction(SparkRDDAggregator[] sparkDppAggregators) { + this.valueAggregators = sparkDppAggregators; + } + + @Override + public Object[] call(Object[] v1, Object[] v2) throws Exception { + Object[] result = new Object[valueAggregators.length]; + for (int i = 0; i < v1.length; i++) { + result[i] = valueAggregators[i].update(v1[i], v2[i]); + } + return result; + } +} + +class ReplaceAggregator extends SparkRDDAggregator { + + @Override + Object update(Object dst, Object src) { + return src; + } +} + +class ReplaceIfNotNullAggregator extends SparkRDDAggregator { + + @Override + Object update(Object dst, Object src) { + return src == null ? 
dst : src; + } +} + +class BitmapUnionAggregator extends SparkRDDAggregator { + private static final Logger LOG = LogManager.getLogger(BitmapUnionAggregator.class); + + @Override + BitmapValue init(Object value) { + try { + BitmapValue bitmapValue = new BitmapValue(); + if (value instanceof byte[]) { + bitmapValue.deserialize(new DataInputStream(new ByteArrayInputStream((byte[]) value))); + } else if (value != null) { + bitmapValue.add(Long.parseLong(value.toString())); + } + return bitmapValue; + } catch (Exception e) { + throw new RuntimeException("build bitmap value failed", e); + } + } + + @Override + BitmapValue update(BitmapValue v1, BitmapValue v2) { + BitmapValue newBitmapValue = new BitmapValue(); + if (v1 != null) { + newBitmapValue.or(v1); + } + if (v2 != null) { + newBitmapValue.or(v2); + } + return newBitmapValue; + } + + @Override + byte[] finalize(Object value) { + try { + ByteArrayOutputStream bos = new ByteArrayOutputStream(); + DataOutputStream outputStream = new DataOutputStream(bos); + ((BitmapValue) value).serialize(outputStream); + return bos.toByteArray(); + } catch (IOException ioException) { + LOG.warn("", ioException); + throw new RuntimeException(ioException); + } + } + +} + +class HllUnionAggregator extends SparkRDDAggregator { + private static final Logger LOG = LogManager.getLogger(HllUnionAggregator.class); + + @Override + Hll init(Object value) { + try { + Hll hll = new Hll(); + if (value instanceof byte[]) { + hll.deserialize(new DataInputStream(new ByteArrayInputStream((byte[]) value))); + } else if (value != null) { + hll.updateWithHash(value); + } + return hll; + } catch (Exception e) { + throw new RuntimeException("build hll value failed", e); + } + } + + @Override + Hll update(Hll v1, Hll v2) { + Hll newHll = new Hll(); + if (v1 != null) { + newHll.merge(v1); + } + if (v2 != null) { + newHll.merge(v2); + } + return newHll; + } + + @Override + byte[] finalize(Object value) { + try { + ByteArrayOutputStream bos = new ByteArrayOutputStream(); + DataOutputStream outputStream = new DataOutputStream(bos); + ((Hll) value).serialize(outputStream); + return bos.toByteArray(); + } catch (IOException ioException) { + LOG.warn("", ioException); + throw new RuntimeException(ioException); + } + } + +} + +class LargeIntMaxAggregator extends SparkRDDAggregator { + + BigInteger init(Object value) { + if (value == null) { + return null; + } + return new BigInteger(value.toString()); + } + + @Override + BigInteger update(BigInteger dst, BigInteger src) { + if (src == null) { + return dst; + } + if (dst == null) { + return src; + } + return dst.compareTo(src) > 0 ? dst : src; + } + + @Override + String finalize(Object value) { + BigInteger bigInteger = (BigInteger) value; + return bigInteger.toString(); + } +} + +class LargeIntMinAggregator extends LargeIntMaxAggregator { + + @Override + BigInteger update(BigInteger dst, BigInteger src) { + if (src == null) { + return dst; + } + if (dst == null) { + return src; + } + return dst.compareTo(src) < 0 ? dst : src; + } +} + +class LargeIntSumAggregator extends LargeIntMaxAggregator { + + @Override + BigInteger update(BigInteger dst, BigInteger src) { + if (src == null) { + return dst; + } + if (dst == null) { + return src; + } + return dst.add(src); + } +} + + +class NumberMaxAggregator extends SparkRDDAggregator { + + @Override + Object update(Object dst, Object src) { + if (src == null) { + return dst; + } + if (dst == null) { + return src; + } + return ((Comparable) dst).compareTo(src) > 0 ? 
dst : src; + } +} + + +class NumberMinAggregator extends SparkRDDAggregator { + + @Override + Object update(Object dst, Object src) { + if (src == null) { + return dst; + } + if (dst == null) { + return src; + } + return ((Comparable) dst).compareTo(src) < 0 ? dst : src; + } +} + +class LongSumAggregator extends SparkRDDAggregator { + + @Override + Long update(Long dst, Long src) { + if (src == null) { + return dst; + } + if (dst == null) { + return src; + } + return dst + src; + } +} + +class ShortSumAggregator extends SparkRDDAggregator { + + @Override + Short update(Short dst, Short src) { + if (src == null) { + return dst; + } + if (dst == null) { + return src; + } + int ret = dst + src; + // here may overflow, just keep the same logic with be + return (short) ret; + } +} + +class IntSumAggregator extends SparkRDDAggregator { + + @Override + Integer update(Integer dst, Integer src) { + if (src == null) { + return dst; + } + if (dst == null) { + return src; + } + long ret = Long.sum(dst, src); + // here may overflow, just keep the same logic with be + return (int) ret; + } +} + +class ByteSumAggregator extends SparkRDDAggregator { + + @Override + Byte update(Byte dst, Byte src) { + if (src == null) { + return dst; + } + if (dst == null) { + return src; + } + int ret = dst + src; + // here may overflow, just keep the same logic with be + return (byte) ret; + } +} + +class DoubleSumAggregator extends SparkRDDAggregator { + + @Override + strictfp Double update(Double dst, Double src) { + if (src == null) { + return dst; + } + if (dst == null) { + return src; + } + return dst + src; + } +} + +class FloatSumAggregator extends SparkRDDAggregator { + + @Override + strictfp Float update(Float dst, Float src) { + if (src == null) { + return dst; + } + if (dst == null) { + return src; + } + return dst + src; + } +} + +class StringMaxAggregator extends SparkRDDAggregator { + + @Override + String update(String dst, String src) { + if (src == null) { + return dst; + } + if (dst == null) { + return src; + } + return dst.compareTo(src) > 0 ? dst : src; + } +} + +class StringMinAggregator extends SparkRDDAggregator { + + @Override + String update(String dst, String src) { + if (src == null) { + return dst; + } + if (dst == null) { + return src; + } + return dst.compareTo(src) < 0 ? dst : src; + } +} + +class BigDecimalSumAggregator extends SparkRDDAggregator { + + + @Override + BigDecimal update(BigDecimal src, BigDecimal dst) { + if (src == null) { + return dst; + } + if (dst == null) { + return src; + } + return src.add(dst); + } +} + + +class BucketComparator implements Comparator>, Serializable { + + @Override + public int compare(List keyArray1, List keyArray2) { + int cmp = 0; + + for (int i = 0; i < keyArray1.size(); i++) { + Object key1 = keyArray1.get(i); + Object key2 = keyArray2.get(i); + if (key1 == key2) { + continue; + } + if (key1 == null || key2 == null) { + return key1 == null ? 
-1 : 1; + } + if (key1 instanceof Comparable && key2 instanceof Comparable) { + cmp = ((Comparable) key1).compareTo(key2); + } else { + throw new RuntimeException(String.format("uncomparable column type %s", key1.getClass().toString())); + } + if (cmp != 0) { + return cmp; + } + } + + return cmp; + } +} + +class BucketPartitioner extends Partitioner { + + private Map bucketKeyMap; + + public BucketPartitioner(Map bucketKeyMap) { + this.bucketKeyMap = bucketKeyMap; + } + + @Override + public int numPartitions() { + return bucketKeyMap.size(); + } + + @Override + public int getPartition(Object key) { + List rddKey = (List) key; + return bucketKeyMap.get(String.valueOf(rddKey.get(0))); + } +} diff --git a/spark-load/spark-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/StringAccumulator.java b/spark-load/spark-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/StringAccumulator.java new file mode 100644 index 00000000..428a9d42 --- /dev/null +++ b/spark-load/spark-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/StringAccumulator.java @@ -0,0 +1,65 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +package org.apache.doris.load.loadv2.dpp; + +import org.apache.spark.util.AccumulatorV2; + +import java.util.ArrayList; +import java.util.List; + +// This class is a accumulator of string based on AccumulatorV2 +// (https://spark.apache.org/docs/latest/api/java/org/apache/spark/util/AccumulatorV2.html). +// Spark does not provide string accumulator. +// +// This class is used to collect the invalid rows when doing etl. +public class StringAccumulator extends AccumulatorV2 { + private List strs = new ArrayList<>(); + + @Override + public boolean isZero() { + return strs.isEmpty(); + } + + @Override + public AccumulatorV2 copy() { + StringAccumulator newAccumulator = new StringAccumulator(); + newAccumulator.strs.addAll(this.strs); + return newAccumulator; + } + + @Override + public void reset() { + strs.clear(); + } + + @Override + public void add(String v) { + strs.add(v); + } + + @Override + public void merge(AccumulatorV2 other) { + StringAccumulator o = (StringAccumulator) other; + strs.addAll(o.strs); + } + + @Override + public String value() { + return strs.toString(); + } +} diff --git a/spark-load/spark-dpp/src/main/java/org/apache/doris/load/loadv2/etl/SparkEtlJob.java b/spark-load/spark-dpp/src/main/java/org/apache/doris/load/loadv2/etl/SparkEtlJob.java new file mode 100644 index 00000000..a359612e --- /dev/null +++ b/spark-load/spark-dpp/src/main/java/org/apache/doris/load/loadv2/etl/SparkEtlJob.java @@ -0,0 +1,288 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +package org.apache.doris.load.loadv2.etl; + +import org.apache.doris.common.SparkDppException; +import org.apache.doris.load.loadv2.dpp.GlobalDictBuilder; +import org.apache.doris.load.loadv2.dpp.SparkDpp; +import org.apache.doris.sparkdpp.EtlJobConfig; +import org.apache.doris.sparkdpp.EtlJobConfig.EtlColumn; +import org.apache.doris.sparkdpp.EtlJobConfig.EtlColumnMapping; +import org.apache.doris.sparkdpp.EtlJobConfig.EtlFileGroup; +import org.apache.doris.sparkdpp.EtlJobConfig.EtlIndex; +import org.apache.doris.sparkdpp.EtlJobConfig.EtlTable; + +import com.google.common.collect.Lists; +import com.google.common.collect.Maps; +import com.google.common.collect.Sets; +import com.google.common.io.CharStreams; +import org.apache.commons.collections.map.MultiValueMap; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.spark.SparkConf; +import org.apache.spark.deploy.SparkHadoopUtil; +import org.apache.spark.sql.SparkSession; +import org.apache.spark.sql.functions; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.DataInputStream; +import java.io.IOException; +import java.io.InputStreamReader; +import java.util.List; +import java.util.Map; +import java.util.Set; + +/** + * SparkEtlJob is responsible for global dict building, data partition, data sort and data aggregation. + * 1. init job config + * 2. check if job has bitmap_dict function columns + * 3. build global dict if step 2 is true + * 4. 
dpp (data partition, data sort and data aggregation) + */ +public class SparkEtlJob { + private static final Logger LOG = LoggerFactory.getLogger(SparkEtlJob.class); + + private static final String BITMAP_DICT_FUNC = "bitmap_dict"; + private static final String TO_BITMAP_FUNC = "to_bitmap"; + private static final String BITMAP_HASH = "bitmap_hash"; + private static final String BINARY_BITMAP = "binary_bitmap"; + + private String jobConfigFilePath; + private EtlJobConfig etlJobConfig; + private Set hiveSourceTables; + private Map> tableToBitmapDictColumns; + private Map> tableToBinaryBitmapColumns; + private final SparkConf conf; + private SparkSession spark; + + private SparkEtlJob(String jobConfigFilePath) { + this.jobConfigFilePath = jobConfigFilePath; + this.etlJobConfig = null; + this.hiveSourceTables = Sets.newHashSet(); + this.tableToBitmapDictColumns = Maps.newHashMap(); + this.tableToBinaryBitmapColumns = Maps.newHashMap(); + conf = new SparkConf(); + } + + private void initSpark() { + //serialization conf + conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer"); + conf.set("spark.kryo.registrator", "org.apache.doris.load.loadv2.dpp.DorisKryoRegistrator"); + conf.set("spark.kryo.registrationRequired", "false"); + spark = SparkSession.builder().enableHiveSupport().config(conf).getOrCreate(); + } + + private void initSparkConfigs(Map configs) { + if (configs == null) { + return; + } + for (Map.Entry entry : configs.entrySet()) { + conf.set(entry.getKey(), entry.getValue()); + conf.set("spark.hadoop." + entry.getKey(), entry.getValue()); + } + } + + private void initConfig() throws IOException { + if (LOG.isDebugEnabled()) { + LOG.debug("job config file path: " + jobConfigFilePath); + } + Configuration hadoopConf = SparkHadoopUtil.get().newConfiguration(this.conf); + String jsonConfig; + Path path = new Path(jobConfigFilePath); + try (FileSystem fs = path.getFileSystem(hadoopConf); DataInputStream in = fs.open(path)) { + jsonConfig = CharStreams.toString(new InputStreamReader(in)); + } + if (LOG.isDebugEnabled()) { + LOG.debug("rdd read json config: " + jsonConfig); + } + etlJobConfig = EtlJobConfig.configFromJson(jsonConfig); + if (LOG.isDebugEnabled()) { + LOG.debug("etl job config: " + etlJobConfig); + } + } + + /* + * 1. check bitmap column + * 2. fill tableToBitmapDictColumns + * 3. 
remove bitmap_dict and to_bitmap mapping from columnMappings + */ + private void checkConfig() throws Exception { + for (Map.Entry entry : etlJobConfig.tables.entrySet()) { + boolean isHiveSource = false; + Set bitmapDictColumns = Sets.newHashSet(); + Set binaryBitmapColumns = Sets.newHashSet(); + + for (EtlFileGroup fileGroup : entry.getValue().fileGroups) { + if (fileGroup.sourceType == EtlJobConfig.SourceType.HIVE) { + isHiveSource = true; + } + Map newColumnMappings = Maps.newHashMap(); + for (Map.Entry mappingEntry : fileGroup.columnMappings.entrySet()) { + String columnName = mappingEntry.getKey(); + String exprStr = mappingEntry.getValue().toDescription(); + String funcName = functions.expr(exprStr).expr().prettyName(); + if (funcName.equalsIgnoreCase(BITMAP_HASH)) { + throw new SparkDppException("spark load not support bitmap_hash now"); + } + if (funcName.equalsIgnoreCase(BINARY_BITMAP)) { + binaryBitmapColumns.add(columnName.toLowerCase()); + } else if (funcName.equalsIgnoreCase(BITMAP_DICT_FUNC)) { + bitmapDictColumns.add(columnName.toLowerCase()); + } else if (!funcName.equalsIgnoreCase(TO_BITMAP_FUNC)) { + newColumnMappings.put(mappingEntry.getKey(), mappingEntry.getValue()); + } + } + // reset new columnMappings + fileGroup.columnMappings = newColumnMappings; + } + if (isHiveSource) { + hiveSourceTables.add(entry.getKey()); + } + if (!bitmapDictColumns.isEmpty()) { + tableToBitmapDictColumns.put(entry.getKey(), bitmapDictColumns); + } + if (!binaryBitmapColumns.isEmpty()) { + tableToBinaryBitmapColumns.put(entry.getKey(), binaryBitmapColumns); + } + } + LOG.info("init hiveSourceTables: " + hiveSourceTables + + ",tableToBitmapDictColumns: " + tableToBitmapDictColumns); + + // spark etl must have only one table with bitmap type column to process. 
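+ // the global dict build in processData() only handles a single source hive table, so more than
+ // one hive source table, or more than one table with bitmap_dict / binary_bitmap columns, is
+ // rejected up front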
+ if (hiveSourceTables.size() > 1 + || tableToBitmapDictColumns.size() > 1 + || tableToBinaryBitmapColumns.size() > 1) { + throw new Exception("spark etl job must have only one hive table with bitmap type column to process"); + } + } + + private void processDpp() throws Exception { + SparkDpp sparkDpp = new SparkDpp(spark, etlJobConfig, tableToBitmapDictColumns, tableToBinaryBitmapColumns); + sparkDpp.init(); + sparkDpp.doDpp(); + } + + private String buildGlobalDictAndEncodeSourceTable(EtlTable table, long tableId) { + // dict column map + MultiValueMap dictColumnMap = new MultiValueMap(); + for (String dictColumn : tableToBitmapDictColumns.get(tableId)) { + dictColumnMap.put(dictColumn, null); + } + + // doris schema + List dorisOlapTableColumnList = Lists.newArrayList(); + for (EtlIndex etlIndex : table.indexes) { + if (etlIndex.isBaseIndex) { + for (EtlColumn column : etlIndex.columns) { + dorisOlapTableColumnList.add(column.columnName); + } + } + } + + // hive db and tables + EtlFileGroup fileGroup = table.fileGroups.get(0); + String sourceHiveDBTableName = fileGroup.hiveDbTableName; + String dorisHiveDB = sourceHiveDBTableName.split("\\.")[0]; + String taskId = etlJobConfig.outputPath.substring(etlJobConfig.outputPath.lastIndexOf("/") + 1); + String globalDictTableName = String.format(EtlJobConfig.GLOBAL_DICT_TABLE_NAME, tableId); + String distinctKeyTableName = String.format(EtlJobConfig.DISTINCT_KEY_TABLE_NAME, tableId, taskId); + String dorisIntermediateHiveTable = String.format( + EtlJobConfig.DORIS_INTERMEDIATE_HIVE_TABLE_NAME, tableId, taskId); + String sourceHiveFilter = fileGroup.where; + + // others + List mapSideJoinColumns = Lists.newArrayList(); + int buildConcurrency = 1; + List veryHighCardinalityColumn = Lists.newArrayList(); + int veryHighCardinalityColumnSplitNum = 1; + + LOG.info("global dict builder args, dictColumnMap: " + dictColumnMap + + ", dorisOlapTableColumnList: " + dorisOlapTableColumnList + + ", sourceHiveDBTableName: " + sourceHiveDBTableName + + ", sourceHiveFilter: " + sourceHiveFilter + + ", distinctKeyTableName: " + distinctKeyTableName + + ", globalDictTableName: " + globalDictTableName + + ", dorisIntermediateHiveTable: " + dorisIntermediateHiveTable); + try { + GlobalDictBuilder globalDictBuilder = new GlobalDictBuilder(dictColumnMap, dorisOlapTableColumnList, + mapSideJoinColumns, sourceHiveDBTableName, sourceHiveFilter, dorisHiveDB, distinctKeyTableName, + globalDictTableName, dorisIntermediateHiveTable, buildConcurrency, veryHighCardinalityColumn, + veryHighCardinalityColumnSplitNum, spark); + globalDictBuilder.createHiveIntermediateTable(); + globalDictBuilder.extractDistinctColumn(); + globalDictBuilder.buildGlobalDict(); + globalDictBuilder.encodeDorisIntermediateHiveTable(); + } catch (Exception e) { + throw new RuntimeException(e); + } + + return String.format("%s.%s", dorisHiveDB, dorisIntermediateHiveTable); + } + + private void processData() throws Exception { + if (!hiveSourceTables.isEmpty()) { + // only one table + long tableId = -1; + EtlTable table = null; + for (Map.Entry entry : etlJobConfig.tables.entrySet()) { + tableId = entry.getKey(); + table = entry.getValue(); + break; + } + + // init hive configs like metastore service + EtlFileGroup fileGroup = table.fileGroups.get(0); + initSparkConfigs(fileGroup.hiveTableProperties); + fileGroup.dppHiveDbTableName = fileGroup.hiveDbTableName; + + // build global dict and encode source hive table if has bitmap dict columns + if (!tableToBitmapDictColumns.isEmpty() && 
tableToBitmapDictColumns.containsKey(tableId)) { + String dorisIntermediateHiveDbTableName = buildGlobalDictAndEncodeSourceTable(table, tableId); + // set with dorisIntermediateHiveDbTable + fileGroup.dppHiveDbTableName = dorisIntermediateHiveDbTableName; + } + } + + initSpark(); + // data partition sort and aggregation + processDpp(); + } + + private void run() throws Exception { + initConfig(); + checkConfig(); + processData(); + } + + public static void main(String[] args) { + if (args.length < 1) { + System.err.println("missing job config file path arg"); + System.exit(-1); + } + + try { + new SparkEtlJob(args[0]).run(); + } catch (Exception e) { + System.err.println("spark etl job run failed"); + LOG.warn("", e); + System.exit(-1); + } + } +} diff --git a/spark-load/spark-dpp/src/test/java/org/apache/doris/load/loadv2/dpp/ColumnParserTest.java b/spark-load/spark-dpp/src/test/java/org/apache/doris/load/loadv2/dpp/ColumnParserTest.java new file mode 100644 index 00000000..9091686c --- /dev/null +++ b/spark-load/spark-dpp/src/test/java/org/apache/doris/load/loadv2/dpp/ColumnParserTest.java @@ -0,0 +1,136 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
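+// Each parser is checked against a normal value and values just outside the column type's range
+// (for example 128 for TINYINT and 32768 for SMALLINT); FLOAT/DOUBLE must reject Infinity and NaN,
+// and DECIMAL/STRING must enforce the configured precision, scale and length.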
+ +package org.apache.doris.load.loadv2.dpp; + +import org.apache.doris.sparkdpp.EtlJobConfig; + +import org.junit.Assert; +import org.junit.Test; + +public class ColumnParserTest { + + // TODO(wb) try to keep ut consistent with be's ut + @Test + public void testBoundCheck() { + // tinyint + TinyIntParser tinyIntParser = new TinyIntParser(); + // 1 normal + String tinyint = "100"; + Assert.assertTrue(tinyIntParser.parse(tinyint)); + // 2 upper + String tinyintUpper = "128"; + Assert.assertFalse(tinyIntParser.parse(tinyintUpper)); + // 3 lower + String tinyintLower = "-129"; + Assert.assertFalse(tinyIntParser.parse(tinyintLower)); + + // smallint + SmallIntParser smallIntParser = new SmallIntParser(); + // 1 normal + String smallint = "100"; + Assert.assertTrue(smallIntParser.parse(smallint)); + // 2 upper + String smallintUpper = "32768"; + Assert.assertFalse(smallIntParser.parse(smallintUpper)); + // 3 lower + String smallintLower = "-32769"; + Assert.assertFalse(smallIntParser.parse(smallintLower)); + + // int + IntParser intParser = new IntParser(); + // 1 normal + String intValue = "100"; + Assert.assertTrue(intParser.parse(intValue)); + // 2 upper + String intUpper = "2147483648"; + Assert.assertFalse(intParser.parse(intUpper)); + // 3 lower + String intLower = "-2147483649"; + Assert.assertFalse(intParser.parse(intLower)); + + // bigint + BigIntParser bigIntParser = new BigIntParser(); + // 1 normal + String bigint = "100"; + Assert.assertTrue(bigIntParser.parse(bigint)); + // 2 upper + String bigintUpper = "9223372036854775808"; + Assert.assertFalse(bigIntParser.parse(bigintUpper)); + // 3 lower + String bigintLower = "-9223372036854775809"; + Assert.assertFalse(bigIntParser.parse(bigintLower)); + + // largeint + LargeIntParser largeIntParser = new LargeIntParser(); + // 1 normal + String largeint = "100"; + Assert.assertTrue(largeIntParser.parse(largeint)); + // 2 upper + String largeintUpper = "170141183460469231731687303715884105728"; + Assert.assertFalse(largeIntParser.parse(largeintUpper)); + // 3 lower + String largeintLower = "-170141183460469231731687303715884105729"; + Assert.assertFalse(largeIntParser.parse(largeintLower)); + + // float + FloatParser floatParser = new FloatParser(); + // normal + String floatValue = "1.1"; + Assert.assertTrue(floatParser.parse(floatValue)); + // inf + String inf = "Infinity"; + Assert.assertFalse(floatParser.parse(inf)); + // nan + String nan = "NaN"; + // failed + Assert.assertFalse(floatParser.parse(nan)); + + // double + DoubleParser doubleParser = new DoubleParser(); + // normal + Assert.assertTrue(doubleParser.parse(floatValue)); + // inf + Assert.assertFalse(doubleParser.parse(inf)); + // nan + Assert.assertFalse(doubleParser.parse(nan)); + + // decimal + EtlJobConfig.EtlColumn etlColumn = new EtlJobConfig.EtlColumn(); + etlColumn.precision = 5; + etlColumn.scale = 3; + DecimalParser decimalParser = new DecimalParser(etlColumn); + // normal + String decimalValue = "10.333"; + Assert.assertTrue(decimalParser.parse(decimalValue)); + // overflow + String decimalOverflow = "1000.3333333333"; + Assert.assertFalse(decimalParser.parse(decimalOverflow)); + + // string + EtlJobConfig.EtlColumn stringColumn = new EtlJobConfig.EtlColumn(); + stringColumn.stringLength = 3; + StringParser stringParser = new StringParser(stringColumn); + // normal + String stringnormal = "a"; + Assert.assertTrue(stringParser.parse(stringnormal)); + // overflow + String stringoverflow = "中文"; + Assert.assertFalse(stringParser.parse(stringoverflow)); + } + +} diff 
--git a/spark-load/spark-dpp/src/test/java/org/apache/doris/load/loadv2/dpp/DorisRangePartitionerTest.java b/spark-load/spark-dpp/src/test/java/org/apache/doris/load/loadv2/dpp/DorisRangePartitionerTest.java new file mode 100644 index 00000000..131018ed --- /dev/null +++ b/spark-load/spark-dpp/src/test/java/org/apache/doris/load/loadv2/dpp/DorisRangePartitionerTest.java @@ -0,0 +1,135 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +package org.apache.doris.load.loadv2.dpp; + +import org.apache.doris.sparkdpp.EtlJobConfig; + +import org.junit.Assert; +import org.junit.Test; + +import java.util.ArrayList; +import java.util.List; + +public class DorisRangePartitionerTest { + + @Test + public void testRangePartitioner() { + List startKeys = new ArrayList<>(); + startKeys.add(new Integer(0)); + List endKeys = new ArrayList<>(); + endKeys.add(new Integer(100)); + EtlJobConfig.EtlPartition partition1 = new EtlJobConfig.EtlPartition( + 10000, startKeys, endKeys, false, 3); + + List startKeys2 = new ArrayList<>(); + startKeys2.add(new Integer(100)); + List endKeys2 = new ArrayList<>(); + endKeys2.add(new Integer(200)); + EtlJobConfig.EtlPartition partition2 = new EtlJobConfig.EtlPartition( + 10001, startKeys2, endKeys2, false, 4); + + List startKeys3 = new ArrayList<>(); + startKeys3.add(new Integer(200)); + List endKeys3 = new ArrayList<>(); + endKeys3.add(new Integer(300)); + EtlJobConfig.EtlPartition partition3 = new EtlJobConfig.EtlPartition( + 10002, startKeys3, endKeys3, false, 5); + + List partitions = new ArrayList<>(); + partitions.add(partition1); + partitions.add(partition2); + partitions.add(partition3); + + List partitionColumns = new ArrayList<>(); + partitionColumns.add("id"); + List bucketColumns = new ArrayList<>(); + bucketColumns.add("key"); + EtlJobConfig.EtlPartitionInfo partitionInfo = new EtlJobConfig.EtlPartitionInfo( + "RANGE", partitionColumns, bucketColumns, partitions); + List partitionRangeKeys = new ArrayList<>(); + for (EtlJobConfig.EtlPartition partition : partitions) { + DorisRangePartitioner.PartitionRangeKey partitionRangeKey = new DorisRangePartitioner.PartitionRangeKey(); + partitionRangeKey.isMaxPartition = false; + partitionRangeKey.startKeys = new DppColumns(partition.startKeys); + partitionRangeKey.endKeys = new DppColumns(partition.endKeys); + partitionRangeKeys.add(partitionRangeKey); + } + List partitionKeyIndexes = new ArrayList<>(); + partitionKeyIndexes.add(0); + DorisRangePartitioner rangePartitioner = new DorisRangePartitioner(partitionInfo, partitionKeyIndexes, partitionRangeKeys); + int num = rangePartitioner.numPartitions(); + Assert.assertEquals(3, num); + + List fields1 = new ArrayList<>(); + fields1.add(-100); + fields1.add("name"); + DppColumns record1 = new DppColumns(fields1); + int id1 
= rangePartitioner.getPartition(record1); + Assert.assertEquals(-1, id1); + + List fields2 = new ArrayList<>(); + fields2.add(10); + fields2.add("name"); + DppColumns record2 = new DppColumns(fields2); + int id2 = rangePartitioner.getPartition(record2); + Assert.assertEquals(0, id2); + + List fields3 = new ArrayList<>(); + fields3.add(110); + fields3.add("name"); + DppColumns record3 = new DppColumns(fields3); + int id3 = rangePartitioner.getPartition(record3); + Assert.assertEquals(1, id3); + + List fields4 = new ArrayList<>(); + fields4.add(210); + fields4.add("name"); + DppColumns record4 = new DppColumns(fields4); + int id4 = rangePartitioner.getPartition(record4); + Assert.assertEquals(2, id4); + + List fields5 = new ArrayList<>(); + fields5.add(310); + fields5.add("name"); + DppColumns record5 = new DppColumns(fields5); + int id5 = rangePartitioner.getPartition(record5); + Assert.assertEquals(-1, id5); + } + + @Test + public void testUnpartitionedPartitioner() { + List bucketColumns = new ArrayList<>(); + bucketColumns.add("key"); + EtlJobConfig.EtlPartitionInfo partitionInfo = new EtlJobConfig.EtlPartitionInfo( + "UNPARTITIONED", null, bucketColumns, null); + List partitionSchema = new ArrayList<>(); + partitionSchema.add(Integer.class); + List partitionKeyIndexes = new ArrayList<>(); + partitionKeyIndexes.add(0); + DorisRangePartitioner rangePartitioner = new DorisRangePartitioner(partitionInfo, partitionKeyIndexes, null); + int num = rangePartitioner.numPartitions(); + Assert.assertEquals(1, num); + + List fields = new ArrayList<>(); + fields.add(100); + fields.add("name"); + DppColumns record = new DppColumns(fields); + int id = rangePartitioner.getPartition(record); + Assert.assertEquals(0, id); + } +} diff --git a/spark-load/spark-dpp/src/test/java/org/apache/doris/load/loadv2/dpp/DppUtilsTest.java b/spark-load/spark-dpp/src/test/java/org/apache/doris/load/loadv2/dpp/DppUtilsTest.java new file mode 100644 index 00000000..e7cea5d0 --- /dev/null +++ b/spark-load/spark-dpp/src/test/java/org/apache/doris/load/loadv2/dpp/DppUtilsTest.java @@ -0,0 +1,238 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
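+// Covers the type mappings in DppUtils: Spark DataType to Java class, Doris column type to Java class
+// and Spark DataType, destination schema construction (including the leading __bucketId__ column),
+// and extraction of column values from key=value segments of a file path.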
+ +package org.apache.doris.load.loadv2.dpp; + +import org.apache.doris.sparkdpp.EtlJobConfig; + +import org.apache.spark.sql.types.DataType; +import org.apache.spark.sql.types.DataTypes; +import org.apache.spark.sql.types.StructType; +import org.junit.Assert; +import org.junit.Test; + +import java.math.BigDecimal; +import java.util.ArrayList; +import java.util.Date; +import java.util.List; + +public class DppUtilsTest { + + @Test + public void testGetClassFromDataType() { + DppUtils dppUtils = new DppUtils(); + + Class stringResult = dppUtils.getClassFromDataType(DataTypes.StringType); + Assert.assertEquals(String.class, stringResult); + + Class booleanResult = dppUtils.getClassFromDataType(DataTypes.BooleanType); + Assert.assertEquals(Boolean.class, booleanResult); + + Class shortResult = dppUtils.getClassFromDataType(DataTypes.ShortType); + Assert.assertEquals(Short.class, shortResult); + + Class integerResult = dppUtils.getClassFromDataType(DataTypes.IntegerType); + Assert.assertEquals(Integer.class, integerResult); + + Class longResult = dppUtils.getClassFromDataType(DataTypes.LongType); + Assert.assertEquals(Long.class, longResult); + + Class floatResult = dppUtils.getClassFromDataType(DataTypes.FloatType); + Assert.assertEquals(Float.class, floatResult); + + Class doubleResult = dppUtils.getClassFromDataType(DataTypes.DoubleType); + Assert.assertEquals(Double.class, doubleResult); + + Class dateResult = dppUtils.getClassFromDataType(DataTypes.DateType); + Assert.assertEquals(Date.class, dateResult); + } + + @Test + public void testGetClassFromColumn() { + DppUtils dppUtils = new DppUtils(); + + try { + EtlJobConfig.EtlColumn column = new EtlJobConfig.EtlColumn(); + column.columnType = "CHAR"; + Class charResult = dppUtils.getClassFromColumn(column); + Assert.assertEquals(String.class, charResult); + + column.columnType = "HLL"; + Class hllResult = dppUtils.getClassFromColumn(column); + Assert.assertEquals(String.class, hllResult); + + column.columnType = "OBJECT"; + Class objectResult = dppUtils.getClassFromColumn(column); + Assert.assertEquals(String.class, objectResult); + + column.columnType = "BOOLEAN"; + Class booleanResult = dppUtils.getClassFromColumn(column); + Assert.assertEquals(Boolean.class, booleanResult); + + column.columnType = "TINYINT"; + Class tinyResult = dppUtils.getClassFromColumn(column); + Assert.assertEquals(Short.class, tinyResult); + + column.columnType = "SMALLINT"; + Class smallResult = dppUtils.getClassFromColumn(column); + Assert.assertEquals(Short.class, smallResult); + + column.columnType = "INT"; + Class integerResult = dppUtils.getClassFromColumn(column); + Assert.assertEquals(Integer.class, integerResult); + + column.columnType = "DATETIME"; + Class datetimeResult = dppUtils.getClassFromColumn(column); + Assert.assertEquals(java.sql.Timestamp.class, datetimeResult); + + column.columnType = "FLOAT"; + Class floatResult = dppUtils.getClassFromColumn(column); + Assert.assertEquals(Float.class, floatResult); + + column.columnType = "DOUBLE"; + Class doubleResult = dppUtils.getClassFromColumn(column); + Assert.assertEquals(Double.class, doubleResult); + + column.columnType = "DATE"; + Class dateResult = dppUtils.getClassFromColumn(column); + Assert.assertEquals(Date.class, dateResult); + + column.columnType = "DECIMALV2"; + column.precision = 10; + column.scale = 2; + Class decimalResult = dppUtils.getClassFromColumn(column); + Assert.assertEquals(BigDecimal.valueOf(10, 2).getClass(), decimalResult); + } catch (Exception e) { + 
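+ // note: Assert.assertFalse(false) below always passes, so an unexpected exception is swallowed
+ // here; the other tests in this class fail explicitly with Assert.assertTrue(false) in their catch blocks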
Assert.assertFalse(false); + } + + } + + @Test + public void testGetDataTypeFromColumn() { + DppUtils dppUtils = new DppUtils(); + + try { + EtlJobConfig.EtlColumn column = new EtlJobConfig.EtlColumn(); + column.columnType = "VARCHAR"; + DataType stringResult = dppUtils.getDataTypeFromColumn(column, false); + Assert.assertEquals(DataTypes.StringType, stringResult); + + column.columnType = "CHAR"; + DataType charResult = dppUtils.getDataTypeFromColumn(column, false); + Assert.assertEquals(DataTypes.StringType, charResult); + + column.columnType = "HLL"; + DataType hllResult = dppUtils.getDataTypeFromColumn(column, false); + Assert.assertEquals(DataTypes.StringType, hllResult); + + column.columnType = "OBJECT"; + DataType objectResult = dppUtils.getDataTypeFromColumn(column, false); + Assert.assertEquals(DataTypes.StringType, objectResult); + + column.columnType = "BOOLEAN"; + DataType booleanResult = dppUtils.getDataTypeFromColumn(column, false); + Assert.assertEquals(DataTypes.StringType, booleanResult); + + column.columnType = "TINYINT"; + DataType tinyResult = dppUtils.getDataTypeFromColumn(column, false); + Assert.assertEquals(DataTypes.ByteType, tinyResult); + + column.columnType = "SMALLINT"; + DataType smallResult = dppUtils.getDataTypeFromColumn(column, false); + Assert.assertEquals(DataTypes.ShortType, smallResult); + + column.columnType = "INT"; + DataType integerResult = dppUtils.getDataTypeFromColumn(column, false); + Assert.assertEquals(DataTypes.IntegerType, integerResult); + + column.columnType = "BIGINT"; + DataType longResult = dppUtils.getDataTypeFromColumn(column, false); + Assert.assertEquals(DataTypes.LongType, longResult); + + column.columnType = "DATETIME"; + DataType datetimeResult = dppUtils.getDataTypeFromColumn(column, false); + Assert.assertEquals(DataTypes.TimestampType, datetimeResult); + + column.columnType = "FLOAT"; + DataType floatResult = dppUtils.getDataTypeFromColumn(column, false); + Assert.assertEquals(DataTypes.FloatType, floatResult); + + column.columnType = "DOUBLE"; + DataType doubleResult = dppUtils.getDataTypeFromColumn(column, false); + Assert.assertEquals(DataTypes.DoubleType, doubleResult); + + column.columnType = "DATE"; + DataType dateResult = dppUtils.getDataTypeFromColumn(column, false); + Assert.assertEquals(DataTypes.DateType, dateResult); + } catch (Exception e) { + Assert.assertTrue(false); + } + } + + @Test + public void testCreateDstTableSchema() { + DppUtils dppUtils = new DppUtils(); + + EtlJobConfig.EtlColumn column1 = new EtlJobConfig.EtlColumn( + "column1", "INT", + true, true, + "NONE", "0", + 0, 0, 0); + EtlJobConfig.EtlColumn column2 = new EtlJobConfig.EtlColumn( + "column2", "SMALLINT", + true, true, + "NONE", "0", + 0, 0, 0); + List columns = new ArrayList<>(); + columns.add(column1); + columns.add(column2); + + try { + StructType schema = dppUtils.createDstTableSchema(columns, false, false); + Assert.assertEquals(2, schema.fieldNames().length); + Assert.assertEquals("column1", schema.fieldNames()[0]); + Assert.assertEquals("column2", schema.fieldNames()[1]); + + StructType schema2 = dppUtils.createDstTableSchema(columns, true, false); + Assert.assertEquals(3, schema2.fieldNames().length); + Assert.assertEquals("__bucketId__", schema2.fieldNames()[0]); + Assert.assertEquals("column1", schema2.fieldNames()[1]); + Assert.assertEquals("column2", schema2.fieldNames()[2]); + } catch (Exception e) { + Assert.assertTrue(false); + } + } + + @Test + public void testParseColumnsFromPath() { + DppUtils dppUtils = new DppUtils(); + + 
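+ // columns from path are read from the key=value segments of the file path; for the path below
+ // the expected values are "beijing" and "2020-04-10"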
String path = "/path/to/file/city=beijing/date=2020-04-10/data"; + List columnFromPaths = new ArrayList<>(); + columnFromPaths.add("city"); + columnFromPaths.add("date"); + try { + List columnFromPathValues = dppUtils.parseColumnsFromPath(path, columnFromPaths); + Assert.assertEquals(2, columnFromPathValues.size()); + Assert.assertEquals("beijing", columnFromPathValues.get(0)); + Assert.assertEquals("2020-04-10", columnFromPathValues.get(1)); + } catch (Exception e) { + Assert.assertTrue(false); + } + } +} diff --git a/spark-load/spark-dpp/src/test/java/org/apache/doris/load/loadv2/dpp/MinimumCoverageRollupTreeBuilderTest.java b/spark-load/spark-dpp/src/test/java/org/apache/doris/load/loadv2/dpp/MinimumCoverageRollupTreeBuilderTest.java new file mode 100644 index 00000000..90c95cf0 --- /dev/null +++ b/spark-load/spark-dpp/src/test/java/org/apache/doris/load/loadv2/dpp/MinimumCoverageRollupTreeBuilderTest.java @@ -0,0 +1,109 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +package org.apache.doris.load.loadv2.dpp; + +import org.apache.doris.sparkdpp.EtlJobConfig; + +import org.junit.Assert; +import org.junit.Test; + +import java.util.ArrayList; +import java.util.List; + +public class MinimumCoverageRollupTreeBuilderTest { + + @Test + public void testBuild() { + EtlJobConfig.EtlColumn column1 = new EtlJobConfig.EtlColumn( + "column1", "INT", + true, true, + "NONE", "0", + 0, 0, 0); + EtlJobConfig.EtlColumn column2 = new EtlJobConfig.EtlColumn( + "column2", "SMALLINT", + true, true, + "NONE", "0", + 0, 0, 0); + EtlJobConfig.EtlColumn column3 = new EtlJobConfig.EtlColumn( + "column3", "VARCHAR", + true, true, + "NONE", "", + 0, 0, 0); + EtlJobConfig.EtlColumn column4 = new EtlJobConfig.EtlColumn( + "column4", "INT", + true, false, + "SUM", "", + 0, 0, 0); + List baseColumns = new ArrayList<>(); + baseColumns.add(column1); + baseColumns.add(column2); + baseColumns.add(column3); + baseColumns.add(column4); + EtlJobConfig.EtlIndex baseIndex = new EtlJobConfig.EtlIndex(10000, + baseColumns, 12345, "DUPLICATE", true); + List roll1Columns = new ArrayList<>(); + roll1Columns.add(column1); + roll1Columns.add(column2); + roll1Columns.add(column4); + EtlJobConfig.EtlIndex roll1Index = new EtlJobConfig.EtlIndex(10001, + roll1Columns, 12346, "AGGREGATE", false); + List roll2Columns = new ArrayList<>(); + roll2Columns.add(column1); + roll2Columns.add(column4); + EtlJobConfig.EtlIndex roll2Index = new EtlJobConfig.EtlIndex(10002, + roll2Columns, 12347, "AGGREGATE", false); + + List roll3Columns = new ArrayList<>(); + roll3Columns.add(column3); + roll3Columns.add(column4); + EtlJobConfig.EtlIndex roll3Index = new EtlJobConfig.EtlIndex(10003, + roll3Columns, 12348, "AGGREGATE", false); + + List indexes = new ArrayList<>(); + indexes.add(baseIndex); + 
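+ // expected minimum coverage tree: the base index (10000) directly covers roll1 (10001) and
+ // roll3 (10003); roll2 (10002) is attached under roll1 because roll1's columns already cover it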
indexes.add(roll1Index); + indexes.add(roll2Index); + indexes.add(roll3Index); + EtlJobConfig.EtlTable table = new EtlJobConfig.EtlTable(indexes, null); + + MinimumCoverageRollupTreeBuilder builder = new MinimumCoverageRollupTreeBuilder(); + RollupTreeNode resultNode = builder.build(table); + Assert.assertEquals(resultNode.parent, null); + Assert.assertEquals(resultNode.indexId, 10000); + Assert.assertEquals(resultNode.level, 0); + Assert.assertEquals(resultNode.children.size(), 2); + + RollupTreeNode index1Node = resultNode.children.get(0); + Assert.assertEquals(index1Node.parent.indexId, 10000); + Assert.assertEquals(index1Node.indexId, 10001); + Assert.assertEquals(index1Node.level, 1); + Assert.assertEquals(index1Node.children.size(), 1); + + RollupTreeNode index3Node = resultNode.children.get(1); + Assert.assertEquals(index3Node.parent.indexId, 10000); + Assert.assertEquals(index3Node.indexId, 10003); + Assert.assertEquals(index3Node.level, 1); + Assert.assertEquals(index3Node.children, null); + + RollupTreeNode index2Node = index1Node.children.get(0); + Assert.assertEquals(index2Node.parent.indexId, 10001); + Assert.assertEquals(index2Node.indexId, 10002); + Assert.assertEquals(index2Node.level, 2); + Assert.assertEquals(index2Node.children, null); + } +} diff --git a/spark-load/spark-dpp/src/test/java/org/apache/doris/load/loadv2/dpp/SparkDppTest.java b/spark-load/spark-dpp/src/test/java/org/apache/doris/load/loadv2/dpp/SparkDppTest.java new file mode 100644 index 00000000..7522a69c --- /dev/null +++ b/spark-load/spark-dpp/src/test/java/org/apache/doris/load/loadv2/dpp/SparkDppTest.java @@ -0,0 +1,67 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
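+// Exercises SparkDpp.validateData: a DECIMALV2(3,2) column accepts values in [-9.99, 9.99], and the
+// VARCHAR length check counts UTF-8 bytes, so a length of 3 accepts "中" (3 bytes) but rejects "中a" (4 bytes).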
+ +package org.apache.doris.load.loadv2.dpp; + +import org.apache.doris.sparkdpp.EtlJobConfig; + +import org.apache.spark.sql.RowFactory; +import org.junit.Assert; +import org.junit.Test; + +import java.math.BigDecimal; + +public class SparkDppTest { + + @Test + public void testValidateData() { + SparkDpp sparkDpp = new SparkDpp(); + + // decimal + EtlJobConfig.EtlColumn etlColumn = new EtlJobConfig.EtlColumn(); + etlColumn.columnType = "DECIMALV2"; + etlColumn.precision = 3; + etlColumn.scale = 2; + + DecimalParser decimalParser = new DecimalParser(etlColumn); + // test max/min + Assert.assertEquals(decimalParser.getMaxValue().toString(), "9.99"); + Assert.assertEquals(decimalParser.getMinValue().toString(), "-9.99"); + // normal + BigDecimal bigDecimal = new BigDecimal("1.21"); + Assert.assertTrue(sparkDpp.validateData(bigDecimal, etlColumn, decimalParser, RowFactory.create(bigDecimal))); + // failed + BigDecimal bigDecimalFailed = new BigDecimal("10"); + Assert.assertFalse(sparkDpp.validateData(bigDecimalFailed, etlColumn, decimalParser, RowFactory.create(bigDecimalFailed))); + + // string + EtlJobConfig.EtlColumn stringColumn = new EtlJobConfig.EtlColumn(); + stringColumn.stringLength = 3; + stringColumn.columnType = "VARCHAR"; + StringParser stringParser = new StringParser(stringColumn); + // normal + String normalString = "a1"; + Assert.assertTrue(sparkDpp.validateData(normalString, stringColumn, stringParser, RowFactory.create(normalString))); + // cn normal + String normalStringCN = "中"; + Assert.assertTrue(sparkDpp.validateData(normalStringCN, stringColumn, stringParser, RowFactory.create(normalStringCN))); + // cn failed + String failedStringCN = "中a"; + Assert.assertFalse(sparkDpp.validateData(failedStringCN, stringColumn, stringParser, RowFactory.create(failedStringCN))); + } + +} diff --git a/spark-load/spark-dpp/src/test/java/org/apache/doris/load/loadv2/etl/SparkEtlJobTest.java b/spark-load/spark-dpp/src/test/java/org/apache/doris/load/loadv2/etl/SparkEtlJobTest.java new file mode 100644 index 00000000..0ea7f660 --- /dev/null +++ b/spark-load/spark-dpp/src/test/java/org/apache/doris/load/loadv2/etl/SparkEtlJobTest.java @@ -0,0 +1,194 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
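A note on the DECIMALV2 bounds asserted in SparkDppTest above: for precision p and scale s the representable range is ±(10^(p−s) − 10^(−s)), which for precision 3 / scale 2 gives ±9.99, matching the test's expectations. The snippet below is a minimal, self-contained sketch of that arithmetic; maxForDecimal is a hypothetical helper, not DecimalParser's actual API.

import java.math.BigDecimal;

public class DecimalBoundsSketch {
    // max = 10^(precision - scale) - 10^(-scale); min = -max
    static BigDecimal maxForDecimal(int precision, int scale) {
        BigDecimal integerPart = BigDecimal.TEN.pow(precision - scale); // 10^(p-s)
        BigDecimal ulp = BigDecimal.ONE.movePointLeft(scale);           // 10^(-s)
        return integerPart.subtract(ulp);
    }

    public static void main(String[] args) {
        System.out.println(maxForDecimal(3, 2));          // 9.99
        System.out.println(maxForDecimal(3, 2).negate()); // -9.99
    }
}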
+ +package org.apache.doris.load.loadv2.etl; + +import org.apache.doris.common.jmockit.Deencapsulation; +import org.apache.doris.sparkdpp.EtlJobConfig; +import org.apache.doris.sparkdpp.EtlJobConfig.EtlColumn; +import org.apache.doris.sparkdpp.EtlJobConfig.EtlColumnMapping; +import org.apache.doris.sparkdpp.EtlJobConfig.EtlFileGroup; +import org.apache.doris.sparkdpp.EtlJobConfig.EtlIndex; +import org.apache.doris.sparkdpp.EtlJobConfig.EtlJobProperty; +import org.apache.doris.sparkdpp.EtlJobConfig.EtlPartition; +import org.apache.doris.sparkdpp.EtlJobConfig.EtlPartitionInfo; +import org.apache.doris.sparkdpp.EtlJobConfig.EtlTable; + +import com.google.common.collect.Lists; +import com.google.common.collect.Maps; +import mockit.Expectations; +import mockit.Mocked; +import org.apache.hadoop.fs.FSDataInputStream; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.fs.PositionedReadable; +import org.apache.hadoop.fs.Seekable; +import org.junit.Assert; +import org.junit.Before; +import org.junit.Test; + +import java.io.ByteArrayInputStream; +import java.io.EOFException; +import java.io.IOException; +import java.util.List; +import java.util.Map; +import java.util.Set; + +public class SparkEtlJobTest { + private long tableId; + private long index1Id; + private long index2Id; + private long partition1Id; + private long partition2Id; + private EtlJobConfig etlJobConfig; + + @Before + public void setUp() { + tableId = 0L; + index1Id = 1L; + index2Id = 2L; + partition1Id = 3L; + partition2Id = 4L; + + // indexes + EtlColumn k1 = new EtlColumn("k1", "INT", false, true, "NONE", "0", 0, 0, 0); + EtlColumn k2 = new EtlColumn("k2", "VARCHAR", false, true, "NONE", "0", 10, 0, 0); + EtlColumn v1 = new EtlColumn("v1", "BIGINT", false, false, "NONE", "0", 0, 0, 0); + EtlIndex index1 = new EtlIndex(index1Id, Lists.newArrayList(k1, k2, v1), 666666, "DUPLICATE", true); + v1 = new EtlColumn("v1", "BIGINT", false, false, "SUM", "0", 0, 0, 0); + EtlIndex index2 = new EtlIndex(index2Id, Lists.newArrayList(k1, v1), 888888, "AGGREGATE", true); + List indexes = Lists.newArrayList(index1, index2); + // partition info + List partitions = Lists.newArrayList(); + partitions.add(new EtlPartition(partition1Id, Lists.newArrayList(0), Lists.newArrayList(100), false, 2)); + partitions.add(new EtlPartition(partition2Id, Lists.newArrayList(100), Lists.newArrayList(), true, 3)); + EtlPartitionInfo partitionInfo = new EtlPartitionInfo("RANGE", Lists.newArrayList("k1"), Lists.newArrayList("k2"), partitions); + EtlTable table = new EtlTable(indexes, partitionInfo); + // file group + Map columnMappings = Maps.newHashMap(); + columnMappings.put("k1", new EtlColumnMapping("k1 + 1")); + table.addFileGroup(new EtlFileGroup(EtlJobConfig.SourceType.FILE, Lists.newArrayList("hdfs://127.0.0.1:10000/file"), + Lists.newArrayList(), Lists.newArrayList(), "\t", "\n", false, null, + Maps.newHashMap(), "", Lists.newArrayList(partition1Id, partition2Id))); + // tables + Map tables = Maps.newHashMap(); + tables.put(tableId, table); + // others + String outputFilePattern = "V1.label0.%d.%d.%d.%d.%d.parquet"; + String label = "label0"; + EtlJobProperty properties = new EtlJobProperty(); + properties.strictMode = false; + properties.timezone = "Asia/Shanghai"; + etlJobConfig = new EtlJobConfig(tables, outputFilePattern, label, properties); + } + + @Test + public void testInitConfig(@Mocked FileSystem fs) throws IOException { + new Expectations() { + { + fs.open(new 
Path("hdfs://127.0.0.1:10000/jobconfig.json")); + result = new FSDataInputStream(new SeekableByteArrayInputStream(etlJobConfig.configToJson().getBytes())); + } + }; + + SparkEtlJob job = Deencapsulation.newInstance(SparkEtlJob.class, "hdfs://127.0.0.1:10000/jobconfig.json"); + Deencapsulation.invoke(job, "initConfig"); + EtlJobConfig parsedConfig = Deencapsulation.getField(job, "etlJobConfig"); + Assert.assertTrue(parsedConfig.tables.containsKey(tableId)); + EtlTable table = parsedConfig.tables.get(tableId); + Assert.assertEquals(2, table.indexes.size()); + Assert.assertEquals(2, table.partitionInfo.partitions.size()); + Assert.assertEquals(false, parsedConfig.properties.strictMode); + Assert.assertEquals("label0", parsedConfig.label); + } + + @Test + public void testCheckConfigWithoutBitmapDictColumns() { + SparkEtlJob job = Deencapsulation.newInstance(SparkEtlJob.class, "hdfs://127.0.0.1:10000/jobconfig.json"); + Deencapsulation.setField(job, "etlJobConfig", etlJobConfig); + Deencapsulation.invoke(job, "checkConfig"); + Map> tableToBitmapDictColumns = Deencapsulation.getField(job, "tableToBitmapDictColumns"); + // check bitmap dict columns empty + Assert.assertTrue(tableToBitmapDictColumns.isEmpty()); + } + + @Test + public void testCheckConfigWithBitmapDictColumns() { + SparkEtlJob job = Deencapsulation.newInstance(SparkEtlJob.class, "hdfs://127.0.0.1:10000/jobconfig.json"); + EtlTable table = etlJobConfig.tables.get(tableId); + table.indexes.get(0).columns.add( + new EtlColumn("v2", "BITMAP", false, false, "BITMAP_UNION", "0", 0, 0, 0) + ); + EtlFileGroup fileGroup = table.fileGroups.get(0); + fileGroup.sourceType = EtlJobConfig.SourceType.HIVE; + fileGroup.columnMappings.put( + "v2", new EtlColumnMapping("bitmap_dict", Lists.newArrayList("v2")) + ); + Deencapsulation.setField(job, "etlJobConfig", etlJobConfig); + Deencapsulation.invoke(job, "checkConfig"); + // check hive source + Set hiveSourceTables = Deencapsulation.getField(job, "hiveSourceTables"); + Assert.assertTrue(hiveSourceTables.contains(tableId)); + // check bitmap dict columns has v2 + Map> tableToBitmapDictColumns = Deencapsulation.getField(job, "tableToBitmapDictColumns"); + Assert.assertTrue(tableToBitmapDictColumns.containsKey(tableId)); + Assert.assertTrue(tableToBitmapDictColumns.get(tableId).contains("v2")); + // check remove v2 bitmap_dict func mapping from file group column mappings + Assert.assertFalse(table.fileGroups.get(0).columnMappings.containsKey("v2")); + } + + private static class SeekableByteArrayInputStream extends ByteArrayInputStream implements Seekable, PositionedReadable { + public SeekableByteArrayInputStream(byte[] buf) { + super(buf); + } + + public void seek(long position) { + if (position < 0 || position >= buf.length) { + throw new IllegalArgumentException("pos = " + position + " length = " + buf.length); + } + this.pos = (int) position; + } + + public long getPos() { + return this.pos; + } + + @Override + public boolean seekToNewSource(long targetPos) throws IOException { + return false; + } + + @Override + public int read(long position, byte[] buffer, int offset, int length) throws IOException { + this.seek(position); + return this.read(buffer, offset, length); + } + + @Override + public void readFully(long position, byte[] buffer, int offset, int length) throws IOException { + if (position + length > buf.length) { + throw new EOFException("End of file reached before reading fully."); + } + System.arraycopy(buf, (int) position, buffer, offset, length); + } + + @Override + public void 
readFully(long position, byte[] buffer) throws IOException { + readFully(position, buffer, 0, buffer.length); + } + } +} diff --git a/spark-load/spark-load-core/pom.xml b/spark-load/spark-load-core/pom.xml new file mode 100644 index 00000000..d305ed92 --- /dev/null +++ b/spark-load/spark-load-core/pom.xml @@ -0,0 +1,94 @@ + + + 4.0.0 + + org.apache.doris + spark-load + ${revision} + + + spark-load-core + + + 8 + 8 + UTF-8 + + + + + com.fasterxml.jackson.core + jackson-databind + + + org.projectlombok + lombok + provided + + + commons-cli + commons-cli + + + com.google.guava + guava + + + org.apache.spark + spark-launcher_${scala.major.version} + + + org.apache.spark + spark-core_${scala.major.version} + + + org.apache.hadoop + hadoop-client + + + + + org.apache.spark + spark-catalyst_${scala.major.version} + + + org.apache.hadoop + hadoop-common + + + org.apache.hadoop + hadoop-client + + + ${project.groupId} + fe-common + + + org.apache.logging.log4j + log4j-core + + + + org.apache.logging.log4j + log4j-api + + + + org.apache.logging.log4j + log4j-slf4j-impl + + + + org.slf4j + slf4j-api + + + + + + + + + \ No newline at end of file diff --git a/spark-load/spark-load-core/src/main/java/org/apache/doris/SparkLoadRunner.java b/spark-load/spark-load-core/src/main/java/org/apache/doris/SparkLoadRunner.java new file mode 100644 index 00000000..b6efc4e1 --- /dev/null +++ b/spark-load/spark-load-core/src/main/java/org/apache/doris/SparkLoadRunner.java @@ -0,0 +1,129 @@ +package org.apache.doris; + +import org.apache.doris.common.CommandLineOptions; +import org.apache.doris.config.JobConfig; +import org.apache.doris.load.LoadManager; +import org.apache.doris.load.job.Loader; +import org.apache.doris.load.job.Recoverable; +import org.apache.doris.util.JsonUtils; + +import com.google.common.base.Preconditions; +import com.google.common.base.Strings; +import io.netty.util.internal.logging.InternalLoggerFactory; +import io.netty.util.internal.logging.Log4JLoggerFactory; +import org.apache.commons.cli.CommandLine; +import org.apache.commons.cli.CommandLineParser; +import org.apache.commons.cli.DefaultParser; +import org.apache.commons.cli.Options; +import org.apache.commons.cli.ParseException; +import org.apache.commons.lang3.StringUtils; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; + +import java.io.File; +import java.io.IOException; + +public class SparkLoadRunner { + + private static final Logger LOG = LogManager.getLogger(SparkLoadRunner.class); + + public static final String SPARK_LOAD_HOME = System.getenv("SPARK_LOAD_HOME"); + + static { + InternalLoggerFactory.setDefaultFactory(Log4JLoggerFactory.INSTANCE); + } + + public static void main(String[] args) { + + if (StringUtils.isBlank(SPARK_LOAD_HOME)) { + System.err.println("env SPARK_LOAD_HOME is not set."); + System.exit(-1); + } + + CommandLineOptions cmdOptions = parseArgs(args); + if (Strings.isNullOrEmpty(cmdOptions.getConfigPath())) { + System.err.println("config path is empty"); + System.exit(-1); + } + + JobConfig jobConfig = readConfig(cmdOptions.getConfigPath()); + try { + checkConfig(jobConfig); + } catch (IllegalArgumentException e) { + System.err.println("check config failed, msg: " + e.getMessage()); + System.exit(-1); + } + + LoadManager loadManager = LoadManager.getInstance(); + Loader loader = loadManager.createLoader(jobConfig, cmdOptions.getRecovery()); + try { + + loader.prepare(); + do { + if (loader instanceof Recoverable) { + if (((Recoverable) loader).canBeRecovered()) { + LOG.info("recovery 
check passed, start prepare recovery."); + ((Recoverable) loader).prepareRecover(); + break; + } + } + loader.execute(); + } while (false); + + loader.afterFinished(); + + } catch (Exception e) { + loader.afterFailed(e); + LOG.error("start load failed", e); + System.err.println("start load failed, exit."); + System.exit(-1); + } + + } + + private static CommandLineOptions parseArgs(String[] args) { + CommandLineParser parser = new DefaultParser(); + Options options = new Options(); + options.addOption("c", "config", true, "Spark load config file"); + options.addOption("r", "recovery", false, "Recovery mode"); + CommandLine cmd = null; + try { + cmd = parser.parse(options, args); + } catch (ParseException e) { + System.err.println("failed to parse argument, exit."); + System.exit(-1); + } + + if (cmd.hasOption('c') || cmd.hasOption("config")) { + String configPath = cmd.getOptionValue("config"); + boolean recovery = cmd.hasOption('r') || cmd.hasOption("recovery"); + return new CommandLineOptions(configPath, recovery); + } + + throw new IllegalArgumentException(); + + } + + private static JobConfig readConfig(String path) { + JobConfig jobConfig = null; + try { + jobConfig = JsonUtils.readValue(new File(path), JobConfig.class); + } catch (IOException e) { + LOG.error("failed to read config file", e); + System.err.println("failed to read config file, exit."); + System.exit(-1); + } + return jobConfig; + } + + private static void checkConfig(JobConfig jobConfig) { + jobConfig.checkFeAddress(); + Preconditions.checkArgument(StringUtils.isNoneBlank(jobConfig.getLabel()), "label is empty"); + Preconditions.checkArgument(StringUtils.isNoneBlank(jobConfig.getUser()), "user is empty"); + Preconditions.checkArgument(jobConfig.getPassword() != null, "password cannot be null"); + Preconditions.checkArgument(StringUtils.isNoneBlank(jobConfig.getDatabase()), "database is empty"); + jobConfig.checkTaskInfo(); + jobConfig.checkSparkInfo(); + } + +} diff --git a/spark-load/spark-load-core/src/main/java/org/apache/doris/client/DorisClient.java b/spark-load/spark-load-core/src/main/java/org/apache/doris/client/DorisClient.java new file mode 100644 index 00000000..c4fc8154 --- /dev/null +++ b/spark-load/spark-load-core/src/main/java/org/apache/doris/client/DorisClient.java @@ -0,0 +1,185 @@ +package org.apache.doris.client; + +import org.apache.doris.common.LoadInfo; +import org.apache.doris.common.ResponseEntity; +import org.apache.doris.common.SparkLoadException; +import org.apache.doris.common.meta.LoadInfoResponse; +import org.apache.doris.common.meta.LoadMeta; +import org.apache.doris.util.HttpUtils; +import org.apache.doris.util.JsonUtils; + +import org.apache.http.HttpHeaders; +import org.apache.http.HttpStatus; +import org.apache.http.client.methods.CloseableHttpResponse; +import org.apache.http.client.methods.HttpGet; +import org.apache.http.client.methods.HttpPost; +import org.apache.http.client.methods.HttpRequestBase; +import org.apache.http.client.utils.URIBuilder; +import org.apache.http.entity.StringEntity; +import org.apache.http.impl.client.CloseableHttpClient; + +import java.io.IOException; +import java.net.URI; +import java.net.URISyntaxException; +import java.nio.charset.StandardCharsets; +import java.util.Arrays; +import java.util.Base64; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.stream.Collectors; + +public class DorisClient { + + private static volatile FeClient FE; + private static BeClient BE; + + public static FeClient 
getFeClient(String feAddresses, String user, String password) { + if (FE == null) { + synchronized (FeClient.class) { + if (FE == null) { + FE = new FeClient(feAddresses, user, password); + } + } + } + return FE; + } + + public static class FeClient { + + public static final String BASE_URL = "http://%s%s"; + + public static final String RAW_LOAD_URL_PATTERN = "/api/spark_load/%s/%s"; + + public static final String CREATE_ACTION = "_create"; + + public static final String UPDATE_ACTION = "_update"; + + public static final String GET_LOAD_INFO = "/api/%s/_load_info"; + + private final List feNodes; + + private final String auth; + + public FeClient(String feAddresses, String user, String password) { + this.feNodes = parseFeNodes(feAddresses); + this.auth = parseAuth(user, password); + } + + private List parseFeNodes(String feAddresses) { + String[] feArr = feAddresses.split(","); + if (feArr.length == 0) { + throw new IllegalArgumentException(); + } + return Arrays.stream(feArr).collect(Collectors.toList()); + } + + private String parseAuth(String user, String password) { + return Base64.getEncoder().encodeToString((user + ":" + password).getBytes(StandardCharsets.UTF_8)); + } + + public LoadMeta createSparkLoad(String db, Map> tableToPartition, String label, + Map properties) throws SparkLoadException { + try { + String path = String.format(RAW_LOAD_URL_PATTERN, db, CREATE_ACTION); + HttpPost httpPost = new HttpPost(); + addCommonHeaders(httpPost); + Map params = new HashMap<>(); + params.put("label", label); + params.put("tableToPartition", tableToPartition); + params.put("properties", properties); + httpPost.setEntity(new StringEntity(JsonUtils.writeValueAsString(params))); + String content = executeRequest(httpPost, path, null); + ResponseEntity res = JsonUtils.readValue(content, ResponseEntity.class); + if (res.getCode() != 0) { + throw new SparkLoadException(String.format("create load failed, code: %d, msg: %s, reason: %s", + res.getCode(), res.getMsg(), res.getData().isNull() ? 
null : res.getData().asText())); + } + return JsonUtils.readValue(res.getData().traverse(), LoadMeta.class); + } catch (IOException | URISyntaxException e) { + throw new SparkLoadException("create spark load failed", e); + } + } + + private void addCommonHeaders(HttpRequestBase req) { + req.setHeader(HttpHeaders.AUTHORIZATION, "Basic " + auth); + } + + private String executeRequest(HttpRequestBase req, String apiPath, Map params) + throws IOException, URISyntaxException { + try (CloseableHttpClient client = HttpUtils.getClient()) { + for (String feNode : feNodes) { + String url = String.format(BASE_URL, feNode, apiPath); + URIBuilder uriBuilder = new URIBuilder(URI.create(url)); + if (params != null && !params.isEmpty()) { + params.forEach(uriBuilder::addParameter); + } + req.setURI(uriBuilder.build()); + addCommonHeaders(req); + CloseableHttpResponse res; + try { + res = client.execute(req); + } catch (IOException e) { + + continue; + } + if (res.getStatusLine().getStatusCode() != HttpStatus.SC_OK) { + continue; + } + return HttpUtils.getEntityContent(res.getEntity()); + } + } + return null; + } + + public void updateSparkLoad(String db, Long loadId, Map statusInfo) + throws SparkLoadException { + + String path = String.format(RAW_LOAD_URL_PATTERN, db, UPDATE_ACTION); + HttpPost httpPost = new HttpPost(); + addCommonHeaders(httpPost); + Map params = new HashMap<>(); + params.put("loadId", loadId); + params.put("statusInfo", statusInfo); + try { + httpPost.setEntity(new StringEntity(JsonUtils.writeValueAsString(params))); + String content = executeRequest(httpPost, path, null); + ResponseEntity res = JsonUtils.readValue(content, ResponseEntity.class); + if (res.getCode() != 0) { + throw new SparkLoadException(String.format("update load failed, code: %d, msg: %s, reason: %s", + res.getCode(), res.getMsg(), res.getData().isNull() ? 
null : res.getData().asText())); + } + } catch (IOException | URISyntaxException e) { + throw new SparkLoadException("update spark load failed", e); + } + + } + + public LoadInfo getLoadInfo(String db, String label) throws SparkLoadException { + + String path = String.format(GET_LOAD_INFO, db); + HttpGet httpGet = new HttpGet(); + addCommonHeaders(httpGet); + try { + Map params = new HashMap<>(); + params.put("label", label); + String content = executeRequest(httpGet, path, params); + LoadInfoResponse res = JsonUtils.readValue(content, LoadInfoResponse.class); + if (!"ok".equalsIgnoreCase(res.getStatus())) { + throw new SparkLoadException(String.format("get load info failed, status: %s, msg: %s, jobInfo: %s", + res.getStatus(), res.getMsg(), JsonUtils.writeValueAsString(res.getJobInfo()))); + } + return res.getJobInfo(); + } catch (IOException | URISyntaxException e) { + throw new SparkLoadException("update spark load failed", e); + } + + } + + } + + private static class BeClient { + + } + +} diff --git a/spark-load/spark-load-core/src/main/java/org/apache/doris/common/CommandLineOptions.java b/spark-load/spark-load-core/src/main/java/org/apache/doris/common/CommandLineOptions.java new file mode 100644 index 00000000..2a849bb5 --- /dev/null +++ b/spark-load/spark-load-core/src/main/java/org/apache/doris/common/CommandLineOptions.java @@ -0,0 +1,16 @@ +package org.apache.doris.common; + +import lombok.Getter; + +@Getter +public class CommandLineOptions { + + private final String configPath; + + private final Boolean recovery; + + public CommandLineOptions(String configPath, Boolean recovery) { + this.configPath = configPath; + this.recovery = recovery; + } +} diff --git a/spark-load/spark-load-core/src/main/java/org/apache/doris/common/Constants.java b/spark-load/spark-load-core/src/main/java/org/apache/doris/common/Constants.java new file mode 100644 index 00000000..ef807f69 --- /dev/null +++ b/spark-load/spark-load-core/src/main/java/org/apache/doris/common/Constants.java @@ -0,0 +1,7 @@ +package org.apache.doris.common; + +public interface Constants { + + String HIVE_METASTORE_URIS = "hive.metastore.uris"; + +} diff --git a/spark-load/spark-load-core/src/main/java/org/apache/doris/common/DppResult.java b/spark-load/spark-load-core/src/main/java/org/apache/doris/common/DppResult.java new file mode 100644 index 00000000..1574e5ae --- /dev/null +++ b/spark-load/spark-load-core/src/main/java/org/apache/doris/common/DppResult.java @@ -0,0 +1,54 @@ +package org.apache.doris.common; + +import com.fasterxml.jackson.annotation.JsonProperty; + +import java.io.Serializable; + +public class DppResult implements Serializable { + + @JsonProperty(value = "is_success", required = true) + public boolean isSuccess; + + @JsonProperty(value = "failed_reason", required = true) + public String failedReason; + + @JsonProperty(value = "scanned_rows", required = true) + public long scannedRows; + + @JsonProperty(value = "file_number", required = true) + public long fileNumber; + + @JsonProperty(value = "file_size", required = true) + public long fileSize; + + @JsonProperty(value = "normal_rows", required = true) + public long normalRows; + + @JsonProperty(value = "abnormal_rows", required = true) + public long abnormalRows; + + @JsonProperty(value = "unselect_rows", required = true) + public long unselectRows; + + // only part of abnormal rows will be returned + @JsonProperty("partial_abnormal_rows") + public String partialAbnormalRows; + + @JsonProperty("scanned_bytes") + public long scannedBytes; + + public 
DppResult() { + isSuccess = true; + failedReason = ""; + scannedRows = 0; + fileNumber = 0; + fileSize = 0; + normalRows = 0; + abnormalRows = 0; + unselectRows = 0; + partialAbnormalRows = ""; + scannedBytes = 0; + } + + +} \ No newline at end of file diff --git a/spark-load/spark-load-core/src/main/java/org/apache/doris/common/JobStatus.java b/spark-load/spark-load-core/src/main/java/org/apache/doris/common/JobStatus.java new file mode 100644 index 00000000..2f149b6c --- /dev/null +++ b/spark-load/spark-load-core/src/main/java/org/apache/doris/common/JobStatus.java @@ -0,0 +1,9 @@ +package org.apache.doris.common; + +public enum JobStatus { + + RUNNING, + FAILED, + SUCCESS + +} diff --git a/spark-load/spark-load-core/src/main/java/org/apache/doris/common/LoadInfo.java b/spark-load/spark-load-core/src/main/java/org/apache/doris/common/LoadInfo.java new file mode 100644 index 00000000..4a45900d --- /dev/null +++ b/spark-load/spark-load-core/src/main/java/org/apache/doris/common/LoadInfo.java @@ -0,0 +1,18 @@ +package org.apache.doris.common; + +import lombok.Data; + +import java.util.List; + +@Data +public class LoadInfo { + + private String dbName; + private List tblNames; + private String label; + private String clusterName; + private String state; + private String failMsg; + private String trackingUrl; + +} \ No newline at end of file diff --git a/spark-load/spark-load-core/src/main/java/org/apache/doris/common/LoadMode.java b/spark-load/spark-load-core/src/main/java/org/apache/doris/common/LoadMode.java new file mode 100644 index 00000000..31ac0c59 --- /dev/null +++ b/spark-load/spark-load-core/src/main/java/org/apache/doris/common/LoadMode.java @@ -0,0 +1,5 @@ +package org.apache.doris.common; + +public enum LoadMode { + PUSH, PULL; +} diff --git a/spark-load/spark-load-core/src/main/java/org/apache/doris/common/ResponseEntity.java b/spark-load/spark-load-core/src/main/java/org/apache/doris/common/ResponseEntity.java new file mode 100644 index 00000000..ca03e35b --- /dev/null +++ b/spark-load/spark-load-core/src/main/java/org/apache/doris/common/ResponseEntity.java @@ -0,0 +1,14 @@ +package org.apache.doris.common; + +import com.fasterxml.jackson.databind.JsonNode; +import lombok.Data; + +@Data +public class ResponseEntity { + + private Integer code; + private String msg; + private JsonNode data; + private Integer count; + +} diff --git a/spark-load/spark-load-core/src/main/java/org/apache/doris/common/SparkLoadException.java b/spark-load/spark-load-core/src/main/java/org/apache/doris/common/SparkLoadException.java new file mode 100644 index 00000000..bdde745d --- /dev/null +++ b/spark-load/spark-load-core/src/main/java/org/apache/doris/common/SparkLoadException.java @@ -0,0 +1,12 @@ +package org.apache.doris.common; + +public class SparkLoadException extends Exception { + + public SparkLoadException(String message) { + super(message); + } + + public SparkLoadException(String message, Throwable cause) { + super(message, cause); + } +} diff --git a/spark-load/spark-load-core/src/main/java/org/apache/doris/common/meta/LoadInfoResponse.java b/spark-load/spark-load-core/src/main/java/org/apache/doris/common/meta/LoadInfoResponse.java new file mode 100644 index 00000000..0bc64406 --- /dev/null +++ b/spark-load/spark-load-core/src/main/java/org/apache/doris/common/meta/LoadInfoResponse.java @@ -0,0 +1,14 @@ +package org.apache.doris.common.meta; + +import org.apache.doris.common.LoadInfo; + +import lombok.Data; + +@Data +public class LoadInfoResponse { + + private String status; + private 
String msg; + private LoadInfo jobInfo; + +} diff --git a/spark-load/spark-load-core/src/main/java/org/apache/doris/common/meta/LoadMeta.java b/spark-load/spark-load-core/src/main/java/org/apache/doris/common/meta/LoadMeta.java new file mode 100644 index 00000000..1c62edb2 --- /dev/null +++ b/spark-load/spark-load-core/src/main/java/org/apache/doris/common/meta/LoadMeta.java @@ -0,0 +1,72 @@ +package org.apache.doris.common.meta; + +import org.apache.doris.common.Constants; +import org.apache.doris.config.JobConfig; +import org.apache.doris.sparkdpp.EtlJobConfig; + +import lombok.Data; + +import java.util.Arrays; +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.stream.Collectors; + +@Data +public class LoadMeta { + + private Long loadId; + private Long txnId; + private Long dbId; + private Long signature; + private Map tableMeta; + + public EtlJobConfig getEtlJobConfig(JobConfig jobConfig) { + Map tables = new HashMap<>(); + getTableMeta().forEach((name, meta) -> { + EtlJobConfig.EtlTable etlTable = new EtlJobConfig.EtlTable(meta.getIndexes().stream().map( + TableMeta.EtlIndex::toEtlIndex).collect(Collectors.toList()), + meta.getPartitionInfo().toEtlPartitionInfo()); + JobConfig.TaskInfo taskInfo = jobConfig.getLoadTasks().get(name); + EtlJobConfig.EtlFileGroup fileGroup; + Map columnMappingMap = taskInfo.toEtlColumnMappingMap(); + List partitionIds = meta.getPartitionInfo().partitions.stream() + .map(p -> p.partitionId).collect(Collectors.toList()); + switch (taskInfo.getType()) { + case HIVE: + Map properties = new HashMap<>(jobConfig.getHadoopProperties()); + properties.put(Constants.HIVE_METASTORE_URIS, taskInfo.getHiveMetastoreUris()); + fileGroup = + new EtlJobConfig.EtlFileGroup(EtlJobConfig.SourceType.HIVE, taskInfo.getHiveFullTableName(), + properties, false, columnMappingMap, taskInfo.getWhere(), + partitionIds); + break; + case FILE: + List columnList = + Arrays.stream(taskInfo.getColumns().split(",")).collect(Collectors.toList()); + List columnFromPathList = taskInfo.getColumnFromPath() == null ? 
Collections.emptyList() : + Arrays.stream(taskInfo.getColumnFromPath().split(",")).collect(Collectors.toList()); + fileGroup = + new EtlJobConfig.EtlFileGroup(EtlJobConfig.SourceType.FILE, taskInfo.getPaths(), columnList, + columnFromPathList, taskInfo.getFieldSep(), taskInfo.getLineDelim(), false, + taskInfo.getFormat(), columnMappingMap, taskInfo.getWhere(), partitionIds); + break; + default: + throw new IllegalArgumentException("Unsupported task type: " + taskInfo.getType()); + } + etlTable.addFileGroup(fileGroup); + tables.put(meta.getId(), etlTable); + }); + String outputFilePattern = EtlJobConfig.getOutputFilePattern(jobConfig.getLabel(), + EtlJobConfig.FilePatternVersion.V1); + String label = jobConfig.getLabel(); + EtlJobConfig.EtlJobProperty properties = new EtlJobConfig.EtlJobProperty(); + EtlJobConfig etlJobConfig = new EtlJobConfig(tables, outputFilePattern, label, properties); + etlJobConfig.outputPath = + EtlJobConfig.getOutputPath(jobConfig.getSpark().getWorkingDir(), getDbId(), label, + getSignature()); + return etlJobConfig; + } + +} diff --git a/spark-load/spark-load-core/src/main/java/org/apache/doris/common/meta/TableMeta.java b/spark-load/spark-load-core/src/main/java/org/apache/doris/common/meta/TableMeta.java new file mode 100644 index 00000000..2abaea20 --- /dev/null +++ b/spark-load/spark-load-core/src/main/java/org/apache/doris/common/meta/TableMeta.java @@ -0,0 +1,67 @@ +package org.apache.doris.common.meta; + +import com.google.gson.annotations.SerializedName; +import lombok.Data; + +import org.apache.doris.sparkdpp.EtlJobConfig; + +import java.io.Serializable; +import java.util.List; +import java.util.stream.Collectors; + +@Data +public class TableMeta { + + private Long id; + private List indexes; + private EtlPartitionInfo partitionInfo; + + public static class EtlIndex implements Serializable { + public long indexId; + public List columns; + public int schemaHash; + public String indexType; + public boolean isBaseIndex; + + public EtlIndex() { + + } + + public EtlJobConfig.EtlIndex toEtlIndex() { + return new EtlJobConfig.EtlIndex(indexId, columns, schemaHash, indexType, isBaseIndex); + } + + } + + public static class EtlPartitionInfo implements Serializable { + public String partitionType; + public List partitionColumnRefs; + public List distributionColumnRefs; + public List partitions; + + public EtlPartitionInfo() { + } + + public EtlJobConfig.EtlPartitionInfo toEtlPartitionInfo() { + return new EtlJobConfig.EtlPartitionInfo(partitionType, partitionColumnRefs, distributionColumnRefs, + partitions.stream().map(EtlPartition::toEtlPartition).collect(Collectors.toList())); + } + + } + + public static class EtlPartition implements Serializable { + public long partitionId; + public List startKeys; + public List endKeys; + public boolean isMaxPartition; + public int bucketNum; + + public EtlPartition() { + } + + public EtlJobConfig.EtlPartition toEtlPartition() { + return new EtlJobConfig.EtlPartition(partitionId, startKeys, endKeys, isMaxPartition, bucketNum); + } + } + +} diff --git a/spark-load/spark-load-core/src/main/java/org/apache/doris/config/JobConfig.java b/spark-load/spark-load-core/src/main/java/org/apache/doris/config/JobConfig.java new file mode 100644 index 00000000..5a7cd63e --- /dev/null +++ b/spark-load/spark-load-core/src/main/java/org/apache/doris/config/JobConfig.java @@ -0,0 +1,188 @@ +package org.apache.doris.config; + +import org.apache.doris.SparkLoadRunner; +import org.apache.doris.common.LoadMode; +import 
org.apache.doris.sparkdpp.EtlJobConfig; + +import com.fasterxml.jackson.annotation.JsonProperty; +import com.google.common.base.Preconditions; +import lombok.Data; +import org.apache.commons.lang3.StringUtils; + +import java.io.File; +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +@Data +public class JobConfig { + + @JsonProperty(required = true) + private String feAddresses; + + @JsonProperty(required = true) + private String label; + + @JsonProperty(required = true) + private String user; + + @JsonProperty(required = true) + private String password; + + @JsonProperty(required = true) + private String database; + + @JsonProperty(required = true) + private Map loadTasks; + + @JsonProperty(required = true) + private SparkInfo spark; + + private LoadMode loadMode = LoadMode.PULL; + + private Map hadoopProperties = Collections.emptyMap(); + + private Map jobProperties = Collections.emptyMap(); + + private Map env = Collections.emptyMap(); + + @Data + public static class TaskInfo { + + private TaskType type; + + private String hiveMetastoreUris; + + private String hiveDatabase; + + private String hiveTable; + + private List paths; + + private String format; + + private String columns; + + private String columnFromPath; + + private String fieldSep; + + private String lineDelim = "\n"; + + private List columnMappings = Collections.emptyList(); + + private String where; + + private List targetPartitions = Collections.emptyList(); + + public String getHiveFullTableName() { + return hiveDatabase + "." + hiveTable; + } + + public Map toEtlColumnMappingMap() { + Map map = new HashMap<>(); + for (String columnMapping : columnMappings) { + String[] arr = columnMapping.split("="); + map.put(arr[0], new EtlJobConfig.EtlColumnMapping(arr[1])); + } + return map; + } + + } + + @Data + public static class SparkInfo { + + private String sparkHome; + + private String workingDir; + + private String master; + + private String deployMode; + + private Integer numExecutors; + + private Integer executorCores; + + private String executorMemory; + + private String driverMemory; + + private String dppJarPath = + SparkLoadRunner.SPARK_LOAD_HOME + "/spark-dpp-1.0-SNAPSHOT-jar-with-dependencies.jar"; + + private Map properties = Collections.emptyMap(); + + } + + public void checkFeAddress() { + Preconditions.checkArgument(StringUtils.isNoneBlank(getFeAddresses()), "feAddress is empty"); + String[] feAddressArr = getFeAddresses().split(","); + if (feAddressArr.length == 0) { + throw new IllegalArgumentException("feAddress format is incorrect"); + } + for (String feAddress : feAddressArr) { + String[] arr = feAddress.split(":"); + if (arr.length != 2) { + throw new IllegalArgumentException("feAddress format is incorrect"); + } + } + } + + public void checkTaskInfo() { + Map tasks = getLoadTasks(); + Preconditions.checkArgument(!tasks.isEmpty(), "loadTasks is empty"); + for (Map.Entry entry : tasks.entrySet()) { + TaskInfo taskInfo = entry.getValue(); + switch (taskInfo.getType()) { + case HIVE: + Preconditions.checkArgument(StringUtils.isNoneBlank(taskInfo.getHiveDatabase()), + "hive database is empty"); + Preconditions.checkArgument(StringUtils.isNoneBlank(taskInfo.getHiveTable()), "hive table is empty"); + break; + case FILE: + Preconditions.checkArgument(taskInfo.getPaths() != null && !taskInfo.getPaths().isEmpty(), + "file path is empty"); + Preconditions.checkArgument( + StringUtils.equalsAnyIgnoreCase(taskInfo.getFormat(), "parquet", "orc", "csv"), + "format 
only support parquet or orc or csv"); + if ("csv".equalsIgnoreCase(taskInfo.getFormat())) { + Preconditions.checkArgument(StringUtils.isNoneBlank(taskInfo.getFieldSep()), + "field separator is empty"); + } + break; + default: + throw new IllegalArgumentException("task type only supports hive or file"); + } + } + } + + public void checkSparkInfo() { + SparkInfo sparkInfo = getSpark(); + Preconditions.checkArgument(StringUtils.isNoneBlank(sparkInfo.getSparkHome()), + "spark config item sparkHome is empty"); + Preconditions.checkArgument(StringUtils.isNoneBlank(sparkInfo.getWorkingDir()), + "spark config item workingDir is empty"); + Preconditions.checkArgument( + StringUtils.equalsAnyIgnoreCase(sparkInfo.getMaster(), "yarn", "standalone", "local"), + "spark master only supports yarn or standalone or local "); + Preconditions.checkArgument( + StringUtils.equalsAnyIgnoreCase(sparkInfo.getDeployMode(), "cluster", "client"), + "spark deployMode only supports cluster or client "); + if ("local".equalsIgnoreCase(sparkInfo.getMaster())) { + Preconditions.checkArgument("client".equalsIgnoreCase(sparkInfo.getDeployMode()), + "local master only supports client mode"); + } + if (LoadMode.PULL == getLoadMode()) { + if (StringUtils.isBlank(getSpark().getDppJarPath())) { + throw new IllegalArgumentException("dpp jar file path is empty "); + } + if (!new File(getSpark().getDppJarPath()).exists()) { + throw new IllegalArgumentException("dpp jar file is not exists, path: " + getSpark().getDppJarPath()); + } + } + } + +} diff --git a/spark-load/spark-load-core/src/main/java/org/apache/doris/config/TaskType.java b/spark-load/spark-load-core/src/main/java/org/apache/doris/config/TaskType.java new file mode 100644 index 00000000..e05c88ab --- /dev/null +++ b/spark-load/spark-load-core/src/main/java/org/apache/doris/config/TaskType.java @@ -0,0 +1,8 @@ +package org.apache.doris.config; + +public enum TaskType { + + HIVE, + FILE + +} diff --git a/spark-load/spark-load-core/src/main/java/org/apache/doris/load/JobMonitor.java b/spark-load/spark-load-core/src/main/java/org/apache/doris/load/JobMonitor.java new file mode 100644 index 00000000..7817cfa0 --- /dev/null +++ b/spark-load/spark-load-core/src/main/java/org/apache/doris/load/JobMonitor.java @@ -0,0 +1,13 @@ +package org.apache.doris.load; + +public class JobMonitor { + + public void registerJob() { + + } + + private void startListen() { + + } + +} diff --git a/spark-load/spark-load-core/src/main/java/org/apache/doris/load/LoadManager.java b/spark-load/spark-load-core/src/main/java/org/apache/doris/load/LoadManager.java new file mode 100644 index 00000000..f0fa6546 --- /dev/null +++ b/spark-load/spark-load-core/src/main/java/org/apache/doris/load/LoadManager.java @@ -0,0 +1,37 @@ +package org.apache.doris.load; + +import org.apache.doris.config.JobConfig; +import org.apache.doris.load.job.Loader; +import org.apache.doris.load.job.PullLoader; + +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; + +public class LoadManager { + + private static final Logger LOG = LogManager.getLogger(LoadManager.class); + + private static volatile LoadManager INSTANCE = null; + + public static LoadManager getInstance() { + if (INSTANCE == null) { + synchronized (LoadManager.class) { + if (INSTANCE == null) { + INSTANCE = new LoadManager(); + } + } + } + return INSTANCE; + } + + public Loader createLoader(JobConfig jobConfig, Boolean isRecoveryMode) { + switch (jobConfig.getLoadMode()) { + case PULL: + return new PullLoader(jobConfig, isRecoveryMode); + case 
PUSH: + default: + throw new UnsupportedOperationException(); + } + } + +} diff --git a/spark-load/spark-load-core/src/main/java/org/apache/doris/load/TransactionManager.java b/spark-load/spark-load-core/src/main/java/org/apache/doris/load/TransactionManager.java new file mode 100644 index 00000000..a940db8a --- /dev/null +++ b/spark-load/spark-load-core/src/main/java/org/apache/doris/load/TransactionManager.java @@ -0,0 +1,17 @@ +package org.apache.doris.load; + +public class TransactionManager { + + public long beginTxn() { + return -1L; + } + + public void commitTxn(long txnId) { + + } + + public void abortTxn(long txnId) { + + } + +} diff --git a/spark-load/spark-load-core/src/main/java/org/apache/doris/load/job/Loader.java b/spark-load/spark-load-core/src/main/java/org/apache/doris/load/job/Loader.java new file mode 100644 index 00000000..0dc4f136 --- /dev/null +++ b/spark-load/spark-load-core/src/main/java/org/apache/doris/load/job/Loader.java @@ -0,0 +1,92 @@ +package org.apache.doris.load.job; + +import org.apache.doris.common.JobStatus; +import org.apache.doris.common.SparkLoadException; +import org.apache.doris.config.JobConfig; + +import lombok.Getter; +import org.apache.spark.launcher.SparkAppHandle; +import org.apache.spark.launcher.SparkLauncher; + +import java.io.File; +import java.io.IOException; +import java.time.Duration; +import java.util.HashMap; +import java.util.Map; +import java.util.concurrent.locks.LockSupport; + +public abstract class Loader { + + private static final String SPARK_HADOOP_PREFIX = "spark.hadoop."; + + protected JobConfig jobConfig; + + protected boolean isRecoveryMode = false; + + @Getter + protected SparkAppHandle appHandle; + + @Getter + protected JobStatus jobStatus = JobStatus.RUNNING; + + protected final Map statusInfo = new HashMap<>(); + + public abstract void prepare() throws SparkLoadException; + + public void execute() throws SparkLoadException { + try { + appHandle = submitSparkJob(getMainClass(), getAppArgs(), getLogPath()); + } catch (IOException e) { + throw new SparkLoadException("submit spark job failed", e); + } + do { + if (SparkAppHandle.State.FAILED == appHandle.getState() + || SparkAppHandle.State.KILLED == appHandle.getState() + || SparkAppHandle.State.FINISHED == appHandle.getState()) { + if (SparkAppHandle.State.FAILED == appHandle.getState() + || SparkAppHandle.State.KILLED == appHandle.getState()) { + statusInfo.put("msg", + String.format("spark job run failed, appId: %s, state: %s", appHandle.getAppId(), + appHandle.getState())); + jobStatus = JobStatus.FAILED; + } else { + jobStatus = JobStatus.SUCCESS; + } + break; + } + statusInfo.put("appId", appHandle.getAppId()); + LockSupport.parkNanos(Duration.ofSeconds(5).toNanos()); + } while (true); + } + + private SparkAppHandle submitSparkJob(String mainClass, String[] appArgs, String logPath) throws IOException { + File logFile = new File(logPath); + if (!logFile.getParentFile().exists()) { + logFile.getParentFile().mkdir(); + } + JobConfig.SparkInfo sparkInfo = jobConfig.getSpark(); + SparkLauncher launcher = new SparkLauncher(jobConfig.getEnv()) + .setMaster(sparkInfo.getMaster()) + .setDeployMode(sparkInfo.getDeployMode()) + .setAppName("spark-load-" + jobConfig.getLabel()) + .setAppResource(sparkInfo.getDppJarPath()) + .setSparkHome(sparkInfo.getSparkHome()) + .setMainClass(mainClass) + .addAppArgs(appArgs) + .redirectError(logFile); + sparkInfo.getProperties().forEach(launcher::setConf); + jobConfig.getHadoopProperties().forEach((k, v) -> 
launcher.setConf(SPARK_HADOOP_PREFIX + k, v)); + return launcher.startApplication(); + } + + protected abstract String getMainClass(); + + protected abstract String[] getAppArgs(); + + protected abstract String getLogPath(); + + public abstract void afterFinished() throws SparkLoadException; + + public abstract void afterFailed(Exception e); + +} diff --git a/spark-load/spark-load-core/src/main/java/org/apache/doris/load/job/PullLoader.java b/spark-load/spark-load-core/src/main/java/org/apache/doris/load/job/PullLoader.java new file mode 100644 index 00000000..a57736fc --- /dev/null +++ b/spark-load/spark-load-core/src/main/java/org/apache/doris/load/job/PullLoader.java @@ -0,0 +1,350 @@ +package org.apache.doris.load.job; + +import org.apache.doris.SparkLoadRunner; +import org.apache.doris.client.DorisClient; +import org.apache.doris.common.DppResult; +import org.apache.doris.common.JobStatus; +import org.apache.doris.common.LoadInfo; +import org.apache.doris.common.SparkLoadException; +import org.apache.doris.common.meta.LoadMeta; +import org.apache.doris.common.meta.TableMeta; +import org.apache.doris.config.JobConfig; +import org.apache.doris.sparkdpp.EtlJobConfig; +import org.apache.doris.util.DateUtils; +import org.apache.doris.util.HadoopUtils; +import org.apache.doris.util.JsonUtils; + +import org.apache.commons.lang3.StringUtils; +import org.apache.hadoop.fs.FileStatus; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; +import org.apache.spark.launcher.SparkAppHandle; + +import java.io.File; +import java.io.IOException; +import java.time.Duration; +import java.util.HashMap; +import java.util.List; +import java.util.Locale; +import java.util.Map; +import java.util.Set; +import java.util.concurrent.locks.LockSupport; +import java.util.stream.Collectors; + +public class PullLoader extends Loader implements Recoverable { + + private static final Logger LOG = LogManager.getLogger(PullLoader.class); + + private static final String LOAD_META_JSON = "load_meta.json"; + + private static final String DPP_RESULT_JSON = "dpp_result.json"; + + private static final String SPARK_ETL_JOB_CLASS = "org.apache.doris.load.loadv2.etl.SparkEtlJob"; + + private LoadMeta loadMeta; + + private EtlJobConfig etlJobConfig; + + public PullLoader(JobConfig jobConfig, Boolean isRecoveryMode) { + this.jobConfig = jobConfig; + this.isRecoveryMode = isRecoveryMode; + } + + @Override + public void prepare() throws SparkLoadException { + DorisClient.FeClient feClient = DorisClient.getFeClient(jobConfig.getFeAddresses(), jobConfig.getUser(), + jobConfig.getPassword()); + Map> tableToPartition = jobConfig.getLoadTasks().entrySet().stream() + .collect(Collectors.toMap(Map.Entry::getKey, entry -> entry.getValue().getTargetPartitions())); + loadMeta = feClient.createSparkLoad(jobConfig.getDatabase(), tableToPartition, jobConfig.getLabel(), + jobConfig.getJobProperties()); + etlJobConfig = loadMeta.getEtlJobConfig(jobConfig); + } + + @Override + public void execute() throws SparkLoadException { + + try { + cleanOutputPath(); + } catch (IOException e) { + throw new SparkLoadException("clean output path failed", e); + } + uploadMetaInfo(loadMeta, etlJobConfig.getOutputPath()); + + String etlJobConfPath = etlJobConfig.outputPath + "/configs/jobconfig.json"; + try { + HadoopUtils.createFile(jobConfig, etlJobConfig.configToJson(), etlJobConfPath, true); + } catch (IOException e) { + LOG.error("create job config file failed", e); + throw new SparkLoadException("create job config file failed", e); + } + + 
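+        // At this point the ETL plan has been serialized to <outputPath>/configs/jobconfig.json;
+        // the same path is passed to the Spark application via getAppArgs(), and
+        // SparkEtlJob.initConfig() reads it back from HDFS when the ETL job starts.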
JobConfig.SparkInfo spark = jobConfig.getSpark(); + + String logDir = SparkLoadRunner.SPARK_LOAD_HOME + "/log"; + File file = new File(logDir); + if (!file.exists()) { + file.mkdir(); + } + + LOG.info("submit spark job in mode: " + spark.getMaster() + "-" + spark.getDeployMode()); + + super.execute(); + boolean isRunning = true; + do { + if (SparkAppHandle.State.FAILED == appHandle.getState() + || SparkAppHandle.State.KILLED == appHandle.getState() + || SparkAppHandle.State.FINISHED == appHandle.getState()) { + isRunning = false; + if (SparkAppHandle.State.FAILED == appHandle.getState() + || SparkAppHandle.State.KILLED == appHandle.getState()) { + statusInfo.put("msg", + String.format("spark job run failed, appId: %s, state: %s", appHandle.getAppId(), + appHandle.getState())); + LOG.error("spark job run failed, appId: " + appHandle.getAppId() + ", state: " + + appHandle.getState()); + jobStatus = JobStatus.FAILED; + } + LOG.info("spark job run finished, appId: " + appHandle.getAppId() + ", state: " + appHandle.getState()); + } + statusInfo.put("appId", appHandle.getAppId()); + } while (isRunning); + + jobStatus = JobStatus.SUCCESS; + + } + + @Override + public void afterFinished() throws SparkLoadException { + DorisClient.FeClient feClient = DorisClient.getFeClient(jobConfig.getFeAddresses(), jobConfig.getUser(), + jobConfig.getPassword()); + statusInfo.put("status", jobStatus.name()); + statusInfo.put("msg", ""); + statusInfo.put("appId", appHandle == null ? null : appHandle.getAppId()); + try { + String dppResultStr = null; + int checkCnt = 0; + while (checkCnt < 3) { + dppResultStr = getDppResultString(); + if (dppResultStr != null) { + break; + } + checkCnt++; + LockSupport.parkNanos(Duration.ofMillis(500).toNanos()); + } + if (dppResultStr == null) { + throw new SparkLoadException("get dpp result str failed"); + } + statusInfo.put("dppResult", dppResultStr); + statusInfo.put("filePathToSize", JsonUtils.writeValueAsString(getFilePathToSize())); + statusInfo.put("hadoopProperties", JsonUtils.writeValueAsString(jobConfig.getHadoopProperties())); + } catch (IOException e) { + throw new SparkLoadException("update job status failed", e); + } + feClient.updateSparkLoad(jobConfig.getDatabase(), loadMeta.getLoadId(), statusInfo); + do { + LoadInfo loadInfo = feClient.getLoadInfo(jobConfig.getDatabase(), jobConfig.getLabel()); + switch (loadInfo.getState().toUpperCase(Locale.ROOT)) { + case "FINISHED": + LOG.info("loading job finished"); + try { + cleanOutputPath(); + } catch (IOException e) { + LOG.warn("clean output path failed", e); + } + return; + case "CANCELLED": + LOG.error("loading job failed, failed msg: " + loadInfo.getFailMsg()); + throw new SparkLoadException("loading job failed, " + loadInfo.getFailMsg()); + default: + LOG.info("load job unfinished, state: " + loadInfo.getState()); + break; + } + LockSupport.parkNanos(Duration.ofSeconds(15).toNanos()); + } while (true); + } + + @Override + public void afterFailed(Exception e) { + if (loadMeta == null) { + LOG.info("load job not start, skip update."); + return; + } + DorisClient.FeClient feClient = DorisClient.getFeClient(jobConfig.getFeAddresses(), jobConfig.getUser(), + jobConfig.getPassword()); + statusInfo.put("status", jobStatus.name()); + statusInfo.put("msg", e.getMessage()); + statusInfo.put("appId", appHandle == null ? 
null : appHandle.getAppId()); + try { + feClient.updateSparkLoad(jobConfig.getDatabase(), loadMeta.getLoadId(), statusInfo); + } catch (SparkLoadException ex) { + LOG.warn("update load failed status failed", ex); + } + } + + @Override + public boolean canBeRecovered() throws SparkLoadException { + if (isRecoveryMode) { + String outputPath = etlJobConfig.getOutputPath(); + String parentOutputPath = outputPath.substring(0, StringUtils.lastIndexOf(outputPath, "/")); + try { + if (HadoopUtils.exists(jobConfig, parentOutputPath)) { + FileStatus[] fileStatuses = HadoopUtils.list(jobConfig, parentOutputPath); + if (fileStatuses.length != 1) { + return false; + } + fileStatuses = HadoopUtils.list(jobConfig, fileStatuses[0].getPath().toString()); + boolean hasDppResult = false; + for (FileStatus fileStatus : fileStatuses) { + String fileName = fileStatus.getPath().getName(); + if (DPP_RESULT_JSON.equalsIgnoreCase(fileName)) { + hasDppResult = true; + String content = HadoopUtils.readFile(jobConfig, fileStatus.getPath().toString()); + DppResult dppResult = JsonUtils.readValue(content, DppResult.class); + if (!checkDppResult(dppResult)) { + LOG.info("previous etl job is failed, cannot be recovered"); + return false; + } + } + // check meta consist + if (LOAD_META_JSON.equalsIgnoreCase(fileName)) { + String content = HadoopUtils.readFile(jobConfig, fileStatus.getPath().toString()); + LoadMeta oldLoadMeta = JsonUtils.readValue(content, LoadMeta.class); + for (Map.Entry entry : loadMeta.getTableMeta().entrySet()) { + TableMeta tableMeta = entry.getValue(); + TableMeta oldTableMeta = oldLoadMeta.getTableMeta().get(entry.getKey()); + // index count is not consistent + if (oldTableMeta == null + || oldTableMeta.getIndexes().size() != tableMeta.getIndexes().size()) { + LOG.info("index size mismatch, cannot be recovered"); + return false; + } + Map indexMap = tableMeta.getIndexes().stream() + .collect(Collectors.toMap(etlIndex -> etlIndex.indexId, + TableMeta.EtlIndex::toEtlIndex)); + Map oldIndexMap = oldTableMeta.getIndexes().stream() + .collect(Collectors.toMap(etlIndex -> etlIndex.indexId, + TableMeta.EtlIndex::toEtlIndex)); + for (Map.Entry indexEntry : indexMap.entrySet()) { + EtlJobConfig.EtlIndex index = indexEntry.getValue(); + EtlJobConfig.EtlIndex oldIndex = oldIndexMap.get(indexEntry.getKey()); + // index not exists or index mismatch + if (oldIndex == null || oldIndex.indexId != index.indexId + || oldIndex.schemaHash != index.schemaHash) { + LOG.info("index mismatch, old index: " + oldIndex + ", now index: " + index + + ", cannot be recovered"); + return false; + } + } + // check partition consistent + Set partitionSet = tableMeta.getPartitionInfo().partitions.stream().map( + p -> p.partitionId).collect(Collectors.toSet()); + Set oldPartitionSet = oldTableMeta.getPartitionInfo().partitions.stream().map( + p -> p.partitionId).collect(Collectors.toSet()); + if (oldPartitionSet.size() != partitionSet.size()) { + LOG.info("partition size mismatch, old partition size: " + oldPartitionSet.size() + + ", now partition size: " + partitionSet.size() + + ", cannot be recovered"); + return false; + } + for (Long partitionId : partitionSet) { + if (!oldPartitionSet.contains(partitionId)) { + LOG.info("partition id mismatch, partition id: " + partitionId + + ", cannot be recovered"); + return false; + } + } + } + } + } + return hasDppResult; + } + } catch (IOException e) { + throw new SparkLoadException("check recovery failed", e); + } + } + return false; + } + + @Override + public void prepareRecover() throws 
SparkLoadException { + String outputPath = etlJobConfig.getOutputPath(); + String parentOutputPath = outputPath.substring(0, StringUtils.lastIndexOf(outputPath, "/")); + try { + FileStatus[] fileStatuses = HadoopUtils.list(jobConfig, parentOutputPath); + HadoopUtils.move(jobConfig, fileStatuses[0].getPath().toString(), outputPath); + HadoopUtils.delete(jobConfig, outputPath + "/load_meta.json"); + uploadMetaInfo(loadMeta, etlJobConfig.getOutputPath()); + } catch (IOException e) { + throw new SparkLoadException("prepare recovery failed", e); + } + } + + private boolean checkDppResult(DppResult dppResult) { + if (!dppResult.isSuccess) { + return false; + } + int maxFilterRatio = Integer.parseInt(jobConfig.getJobProperties().getOrDefault("max_filter_ratio", "0")); + return dppResult.abnormalRows <= (dppResult.abnormalRows + dppResult.normalRows) * maxFilterRatio; + } + + private void uploadMetaInfo(LoadMeta metaInfo, String outputPath) throws SparkLoadException { + try { + if (!HadoopUtils.exists(jobConfig, outputPath)) { + HadoopUtils.mkdir(jobConfig, outputPath); + } + HadoopUtils.createFile(jobConfig, JsonUtils.writeValueAsBytes(metaInfo), + outputPath + "/load_meta.json", true); + } catch (IOException e) { + throw new SparkLoadException("upload load meta failed", e); + } + } + + @Override + protected String getMainClass() { + return SPARK_ETL_JOB_CLASS; + } + + @Override + protected String[] getAppArgs() { + return new String[] {etlJobConfig.outputPath + "/configs/jobconfig.json"}; + } + + @Override + protected String getLogPath() { + String formattedNow = DateUtils.getFormattedNow(DateUtils.NUMBER_FORMATER); + return SparkLoadRunner.SPARK_LOAD_HOME + "/logs/" + jobConfig.getLabel() + "-" + formattedNow + ".log"; + } + + public void cleanOutputPath() throws IOException { + if (HadoopUtils.exists(jobConfig, etlJobConfig.outputPath)) { + LOG.info("clean output: " + etlJobConfig.outputPath); + HadoopUtils.delete(jobConfig, etlJobConfig.outputPath); + } + } + + private String getDppResultString() throws SparkLoadException { + try { + return HadoopUtils.readFile(jobConfig, etlJobConfig.outputPath + "/dpp_result.json"); + } catch (IOException e) { + throw new SparkLoadException("get dpp result failed", e); + } + } + + private Map getFilePathToSize() throws SparkLoadException { + Map filePathToSize = new HashMap<>(); + try { + FileStatus[] fileStatuses = HadoopUtils.list(jobConfig, etlJobConfig.outputPath); + for (FileStatus fileStatus : fileStatuses) { + if (fileStatus.isDirectory()) { + continue; + } + filePathToSize.put(fileStatus.getPath().toString(), fileStatus.getLen()); + } + } catch (IOException e) { + throw new SparkLoadException("get dpp result failed", e); + } + return filePathToSize; + } + +} diff --git a/spark-load/spark-load-core/src/main/java/org/apache/doris/load/job/Recoverable.java b/spark-load/spark-load-core/src/main/java/org/apache/doris/load/job/Recoverable.java new file mode 100644 index 00000000..32661e8a --- /dev/null +++ b/spark-load/spark-load-core/src/main/java/org/apache/doris/load/job/Recoverable.java @@ -0,0 +1,11 @@ +package org.apache.doris.load.job; + +import org.apache.doris.common.SparkLoadException; + +public interface Recoverable { + + boolean canBeRecovered() throws SparkLoadException; + + void prepareRecover() throws SparkLoadException; + +} diff --git a/spark-load/spark-load-core/src/main/java/org/apache/doris/util/DateUtils.java b/spark-load/spark-load-core/src/main/java/org/apache/doris/util/DateUtils.java new file mode 100644 index 00000000..8d0c3179 
--- /dev/null +++ b/spark-load/spark-load-core/src/main/java/org/apache/doris/util/DateUtils.java @@ -0,0 +1,21 @@ +package org.apache.doris.util; + +import java.time.LocalDateTime; +import java.time.ZoneId; +import java.time.format.DateTimeFormatter; + +public class DateUtils { + + public static final DateTimeFormatter NORMAL_FORMATER = + DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss").withZone( + ZoneId.systemDefault()); + + public static final DateTimeFormatter NUMBER_FORMATER = + DateTimeFormatter.ofPattern("yyyyMMddHHmmss").withZone( + ZoneId.systemDefault()); + + public static String getFormattedNow(DateTimeFormatter formatter) { + return formatter.format(LocalDateTime.now(ZoneId.systemDefault())); + } + +} diff --git a/spark-load/spark-load-core/src/main/java/org/apache/doris/util/HadoopUtils.java b/spark-load/spark-load-core/src/main/java/org/apache/doris/util/HadoopUtils.java new file mode 100644 index 00000000..387a6b1e --- /dev/null +++ b/spark-load/spark-load-core/src/main/java/org/apache/doris/util/HadoopUtils.java @@ -0,0 +1,99 @@ +package org.apache.doris.util; + +import org.apache.doris.config.JobConfig; + +import org.apache.commons.lang3.StringUtils; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FSDataInputStream; +import org.apache.hadoop.fs.FSDataOutputStream; +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.fs.permission.FsPermission; + +import java.io.BufferedReader; +import java.io.IOException; +import java.io.InputStreamReader; +import java.nio.charset.StandardCharsets; +import java.util.Map; + +public class HadoopUtils { + + private static final String DEFAULT_FS_KEY = "fs.defaultFS"; + + private static FileSystem getFs(JobConfig config) throws IOException { + Configuration conf = new Configuration(); + Map props = config.getHadoopProperties(); + props.forEach(conf::set); + String defaultFs = props.getOrDefault(DEFAULT_FS_KEY, ""); + if (StringUtils.isBlank(defaultFs)) { + throw new IllegalArgumentException("fs.defaultFS is not set"); + } + return FileSystem.get(conf); + } + + public static void createFile(JobConfig config, String content, String path, Boolean overwrite) throws IOException { + try (FileSystem fs = getFs(config)) { + FSDataOutputStream outputStream = fs.create(new Path(path), overwrite); + outputStream.write(content.getBytes(StandardCharsets.UTF_8)); + outputStream.close(); + } + } + + public static void createFile(JobConfig config, byte[] contentBytes, String path, Boolean overwrite) + throws IOException { + try (FileSystem fs = getFs(config)) { + FSDataOutputStream outputStream = fs.create(new Path(path), overwrite); + outputStream.write(contentBytes); + outputStream.close(); + } + } + + public static void delete(JobConfig config, String path) throws IOException { + try (FileSystem fs = getFs(config)) { + fs.delete(new Path(path), true); + } + } + + public static boolean exists(JobConfig config, String path) throws IOException { + try (FileSystem fs = getFs(config)) { + return fs.exists(new Path(path)); + } + } + + public static FileStatus[] list(JobConfig config, String path) throws IOException { + try (FileSystem fs = getFs(config)) { + return fs.listStatus(new Path(path)); + } + } + + public static String readFile(JobConfig config, String path) throws IOException { + try (FileSystem fs = getFs(config)) { + Path p = new Path(path); + if (fs.exists(p) && fs.getFileStatus(p).isFile()) { + FSDataInputStream inputStream = 
fs.open(p); + BufferedReader reader = new BufferedReader(new InputStreamReader(inputStream)); + StringBuilder sb = new StringBuilder(); + String line; + while ((line = reader.readLine()) != null) { + sb.append(line); + } + return sb.toString(); + } + } + return null; + } + + public static void move(JobConfig config, String src, String dst) throws IOException { + try (FileSystem fs = getFs(config)) { + fs.rename(new Path(src), new Path(dst)); + } + } + + public static void mkdir(JobConfig config, String path) throws IOException { + try (FileSystem fs = getFs(config)) { + fs.mkdirs(new Path(path), new FsPermission(644)); + } + } + +} diff --git a/spark-load/spark-load-core/src/main/java/org/apache/doris/util/HttpUtils.java b/spark-load/spark-load-core/src/main/java/org/apache/doris/util/HttpUtils.java new file mode 100644 index 00000000..7df6c0e9 --- /dev/null +++ b/spark-load/spark-load-core/src/main/java/org/apache/doris/util/HttpUtils.java @@ -0,0 +1,42 @@ +package org.apache.doris.util; + +import org.apache.http.HttpEntity; +import org.apache.http.client.config.RequestConfig; +import org.apache.http.impl.client.CloseableHttpClient; +import org.apache.http.impl.client.HttpClients; + +import java.io.BufferedReader; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; + +public class HttpUtils { + + public static final int DEFAULT_CONN_TIMEOUT = 60 * 1000; + public static final int DEFAULT_SO_TIMEOUT = 60 * 1000; + + public static CloseableHttpClient getClient() { + return getClient(DEFAULT_CONN_TIMEOUT, DEFAULT_SO_TIMEOUT); + } + + public static CloseableHttpClient getClient(int connectionTimeout, int socketTimeout) { + RequestConfig requestConfig = RequestConfig.custom() + .setConnectTimeout(connectionTimeout) + .setSocketTimeout(socketTimeout) + .build(); + return HttpClients.custom().setDefaultRequestConfig(requestConfig).build(); + } + + public static String getEntityContent(HttpEntity entity) throws IOException { + StringBuilder sb = new StringBuilder(); + try (InputStream is = entity.getContent(); + BufferedReader reader = new BufferedReader(new InputStreamReader(is))) { + String line; + while ((line = reader.readLine()) != null) { + sb.append(line); + } + } + return sb.toString(); + } + +} diff --git a/spark-load/spark-load-core/src/main/java/org/apache/doris/util/JsonUtils.java b/spark-load/spark-load-core/src/main/java/org/apache/doris/util/JsonUtils.java new file mode 100644 index 00000000..5cd94300 --- /dev/null +++ b/spark-load/spark-load-core/src/main/java/org/apache/doris/util/JsonUtils.java @@ -0,0 +1,49 @@ +package org.apache.doris.util; + +import com.fasterxml.jackson.core.JsonParser; +import com.fasterxml.jackson.core.JsonProcessingException; +import com.fasterxml.jackson.core.type.TypeReference; +import com.fasterxml.jackson.databind.MapperFeature; +import com.fasterxml.jackson.databind.ObjectMapper; +import com.fasterxml.jackson.databind.json.JsonMapper; + +import java.io.File; +import java.io.IOException; + +/** + * json utilities + */ +public class JsonUtils { + + private static final ObjectMapper MAPPER = + JsonMapper.builder().enable(MapperFeature.ACCEPT_CASE_INSENSITIVE_ENUMS).build(); + + public static T readValue(String s, Class clazz) throws JsonProcessingException { + return MAPPER.readValue(s, clazz); + } + + public static T readValue(String s, TypeReference ref) throws JsonProcessingException { + return MAPPER.readValue(s, ref); + } + + public static T readValue(File file, Class clazz) throws IOException { + return 
MAPPER.readValue(file, clazz); + } + + public static T readValue(JsonParser parser, Class clazz) throws IOException { + return MAPPER.readValue(parser, clazz); + } + + public static T readValue(JsonParser parser, TypeReference ref) throws IOException { + return MAPPER.readValue(parser, ref); + } + + public static String writeValueAsString(Object o) throws JsonProcessingException { + return MAPPER.writeValueAsString(o); + } + + public static byte[] writeValueAsBytes(Object o) throws JsonProcessingException { + return MAPPER.writeValueAsBytes(o); + } + +} diff --git a/spark-load/spark-load-core/src/main/resources/log4j.properties b/spark-load/spark-load-core/src/main/resources/log4j.properties new file mode 100644 index 00000000..1c90987e --- /dev/null +++ b/spark-load/spark-load-core/src/main/resources/log4j.properties @@ -0,0 +1,8 @@ +log4j.rootLogger=INFO,console +log4j.additivity.org.apache=true +log4j.appender.console=org.apache.log4j.ConsoleAppender +log4j.appender.console.Threshold=INFO +log4j.appender.console.ImmediateFlush=true +log4j.appender.console.Target=System.out +log4j.appender.console.layout=org.apache.log4j.PatternLayout +log4j.appender.console.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss,SSS} %p (%t|%tid) [%C{1}.%M():%L] %m%n \ No newline at end of file From abe3f2db586f124c8a0b91625492e10798ea3525 Mon Sep 17 00:00:00 2001 From: gnehil Date: Mon, 20 May 2024 17:15:28 +0800 Subject: [PATCH 02/45] add v2 type cast --- .../doris/load/loadv2/dpp/SparkDpp.java | 110 ++++++++++++------ 1 file changed, 74 insertions(+), 36 deletions(-) diff --git a/spark-load/spark-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/SparkDpp.java b/spark-load/spark-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/SparkDpp.java index b282d7d1..401c45fa 100644 --- a/spark-load/spark-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/SparkDpp.java +++ b/spark-load/spark-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/SparkDpp.java @@ -25,6 +25,7 @@ import com.google.common.collect.Maps; import com.google.gson.Gson; import org.apache.commons.collections.CollectionUtils; +import org.apache.commons.collections4.IteratorUtils; import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.tuple.Pair; import org.apache.hadoop.conf.Configuration; @@ -76,6 +77,7 @@ import java.util.Map; import java.util.Queue; import java.util.Set; +import java.util.stream.Collectors; // This class is a Spark-based data preprocessing program, // which will make use of the distributed compute framework of spark to // do ETL job/sort/preaggregate jobs in spark job @@ -153,17 +155,15 @@ private JavaPairRDD, Object[]> processRDDAggregate(JavaPairRDD, Object[]> result = currentPairRDD.mapToPair( + return currentPairRDD.mapToPair( new EncodeBaseAggregateTableFunction(sparkRDDAggregators)) .reduceByKey(new AggregateReduceFunction(sparkRDDAggregators)); - return result; } else { - JavaPairRDD, Object[]> result = currentPairRDD + return currentPairRDD .mapToPair(new EncodeRollupAggregateTableFunction( getColumnIndexInParentRollup(curNode.keyColumnNames, curNode.valueColumnNames, curNode.parent.keyColumnNames, curNode.parent.valueColumnNames))) .reduceByKey(new AggregateReduceFunction(sparkRDDAggregators)); - return result; } // Duplicate Table } else { @@ -190,7 +190,7 @@ private JavaPairRDD, Object[]> processRDDAggregate(JavaPairRDD, Object[]> resultRDD, String pathPattern, long tableId, EtlJobConfig.EtlIndex indexMeta, - SparkRDDAggregator[] sparkRDDAggregators) { + SparkRDDAggregator[] sparkRDDAggregators) { 
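The JsonUtils helper above wraps a single Jackson JsonMapper configured with case-insensitive enum handling. A minimal usage sketch follows; the Result class is a stand-in for DppResult, whose full field list is not reproduced here.

    import com.fasterxml.jackson.core.JsonProcessingException;
    import com.fasterxml.jackson.databind.MapperFeature;
    import com.fasterxml.jackson.databind.ObjectMapper;
    import com.fasterxml.jackson.databind.json.JsonMapper;

    // Round-trips a small payload through the same mapper configuration used above.
    public class JsonUtilsUsageSketch {

        public static class Result {
            public boolean isSuccess;
            public long normalRows;
            public long abnormalRows;
        }

        private static final ObjectMapper MAPPER =
                JsonMapper.builder().enable(MapperFeature.ACCEPT_CASE_INSENSITIVE_ENUMS).build();

        public static void main(String[] args) throws JsonProcessingException {
            String json = "{\"isSuccess\":true,\"normalRows\":100,\"abnormalRows\":2}";
            Result result = MAPPER.readValue(json, Result.class);  // deserialize
            System.out.println(MAPPER.writeValueAsString(result)); // serialize back
        }
    }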
// TODO(wb) should deal largeint as BigInteger instead of string when using biginteger as key, // data type may affect sorting logic StructType dstSchema = DppUtils.createDstTableSchema(indexMeta.columns, false, true); @@ -225,9 +225,8 @@ private void writeRepartitionAndSortedRDDToParquet(JavaPairRDD, Obj columnObjects.add(sparkRDDAggregators[i].finalize(valueColumns[i])); } - Row rowWithoutBucketKey = RowFactory.create(columnObjects.toArray()); // if the bucket key is new, it will belong to a new tablet - if (lastBucketKey == null || !curBucketKey.equals(lastBucketKey)) { + if (!curBucketKey.equals(lastBucketKey)) { if (parquetWriter != null) { parquetWriter.close(); // rename tmpPath to path @@ -351,18 +350,18 @@ private Pair getColumnIndexInParentRollup(List chi List keyMap = new ArrayList<>(); List valueMap = new ArrayList<>(); // find column index in parent rollup schema - for (int i = 0; i < childRollupKeyColumns.size(); i++) { + for (String childRollupKeyColumn : childRollupKeyColumns) { for (int j = 0; j < parentRollupKeyColumns.size(); j++) { - if (StringUtils.equalsIgnoreCase(childRollupKeyColumns.get(i), parentRollupKeyColumns.get(j))) { + if (StringUtils.equalsIgnoreCase(childRollupKeyColumn, parentRollupKeyColumns.get(j))) { keyMap.add(j); break; } } } - for (int i = 0; i < childRollupValueColumns.size(); i++) { + for (String childRollupValueColumn : childRollupValueColumns) { for (int j = 0; j < parentRollupValueColumns.size(); j++) { - if (StringUtils.equalsIgnoreCase(childRollupValueColumns.get(i), parentRollupValueColumns.get(j))) { + if (StringUtils.equalsIgnoreCase(childRollupValueColumn, parentRollupValueColumns.get(j))) { valueMap.add(j); break; } @@ -375,7 +374,7 @@ private Pair getColumnIndexInParentRollup(List chi childRollupKeyColumns.size(), keyMap.size(), childRollupValueColumns.size(), valueMap.size())); } - return Pair.of(keyMap.toArray(new Integer[keyMap.size()]), valueMap.toArray(new Integer[valueMap.size()])); + return Pair.of(keyMap.toArray(new Integer[0]), valueMap.toArray(new Integer[0])); } /** @@ -472,13 +471,13 @@ private JavaPairRDD, Object[]> fillTupleWithPartitionColumn(Dataset List keyAndPartitionColumns = new ArrayList<>(); List keyColumns = new ArrayList<>(); List valueColumns = new ArrayList<>(valueColumnNames.size()); - for (int i = 0; i < keyAndPartitionColumnNames.size(); i++) { - String columnName = keyAndPartitionColumnNames.get(i); + for (String columnName : keyAndPartitionColumnNames) { Object columnObject = row.get(row.fieldIndex(columnName)); if (!validateData(columnObject, baseIndex.getColumn(columnName), parsers.get(columnName), row)) { + LOG.info("invalid row: " + row); abnormalRowAcc.add(1); - return result.iterator(); + return IteratorUtils.emptyIterator(); } keyAndPartitionColumns.add(columnObject); @@ -487,13 +486,12 @@ private JavaPairRDD, Object[]> fillTupleWithPartitionColumn(Dataset } } - for (int i = 0; i < valueColumnNames.size(); i++) { - String columnName = valueColumnNames.get(i); + for (String columnName : valueColumnNames) { Object columnObject = row.get(row.fieldIndex(columnName)); if (!validateData(columnObject, baseIndex.getColumn(columnName), parsers.get(columnName), row)) { abnormalRowAcc.add(1); - return result.iterator(); + return IteratorUtils.emptyIterator(); } valueColumns.add(columnObject); } @@ -545,6 +543,7 @@ private Dataset convertSrcDataframeToDstDataframe(EtlJobConfig.EtlIndex bas Dataset srcDataframe, StructType dstTableSchema, EtlJobConfig.EtlFileGroup fileGroup) throws SparkDppException { + Dataset 
dataframe = srcDataframe; StructType srcSchema = dataframe.schema(); Set srcColumnNames = new HashSet<>(); @@ -578,10 +577,11 @@ private Dataset convertSrcDataframeToDstDataframe(EtlJobConfig.EtlIndex bas throw new SparkDppException("Reason: no data for column:" + dstField.name()); } } - if (column.columnType.equalsIgnoreCase("DATE")) { + if (column.columnType.equalsIgnoreCase("DATE") || column.columnType.equalsIgnoreCase("DATEV2")) { dataframe = dataframe.withColumn(dstField.name(), dataframe.col(dstField.name()).cast(DataTypes.DateType)); - } else if (column.columnType.equalsIgnoreCase("DATETIME")) { + } else if (column.columnType.equalsIgnoreCase("DATETIME") + || column.columnType.equalsIgnoreCase("DATETIMEV2")) { dataframe = dataframe.withColumn(dstField.name(), dataframe.col(dstField.name()).cast(DataTypes.TimestampType)); } else if (column.columnType.equalsIgnoreCase("BOOLEAN")) { @@ -637,8 +637,7 @@ private Dataset loadDataFromPath(SparkSession spark, for (int i = 0; i < baseIndex.columns.size(); i++) { dstColumnNameToIndex.put(baseIndex.columns.get(i).columnName, i); } - List srcColumnsWithColumnsFromPath = new ArrayList<>(); - srcColumnsWithColumnsFromPath.addAll(dataSrcColumns); + List srcColumnsWithColumnsFromPath = new ArrayList<>(dataSrcColumns); if (fileGroup.columnsFromPath != null) { srcColumnsWithColumnsFromPath.addAll(fileGroup.columnsFromPath); } @@ -652,6 +651,9 @@ private Dataset loadDataFromPath(SparkSession spark, fileGroup.columnsFromPath.get(k), functions.lit(columnValueFromPath.get(k))); } } + if (!Strings.isNullOrEmpty(fileGroup.where)) { + dataFrame = dataFrame.where(fileGroup.where); + } return dataFrame; } @@ -663,6 +665,9 @@ private Dataset loadDataFromPath(SparkSession spark, fileGroup.columnsFromPath.get(k), functions.lit(columnValueFromPath.get(k))); } } + if (!Strings.isNullOrEmpty(fileGroup.where)) { + dataFrame = dataFrame.where(fileGroup.where); + } return dataFrame; } @@ -673,7 +678,7 @@ private Dataset loadDataFromPath(SparkSession spark, for (EtlJobConfig.EtlColumn column : baseIndex.columns) { parsers.add(ColumnParser.create(column)); } - char separator = (char) fileGroup.columnSeparator.getBytes(Charset.forName("UTF-8"))[0]; + char separator = (char) fileGroup.columnSeparator.getBytes(StandardCharsets.UTF_8)[0]; JavaRDD rowRDD = sourceDataRdd.flatMap( record -> { scannedRowsAcc.add(1); @@ -759,13 +764,12 @@ private StructType createScrSchema(List srcColumns) { StructField field = DataTypes.createStructField(srcColumn, DataTypes.StringType, true); fields.add(field); } - StructType srcSchema = DataTypes.createStructType(fields); - return srcSchema; + return DataTypes.createStructType(fields); } // This method is to keep the splitting consistent with broker load / mini load private String[] splitLine(String line, char sep) { - if (line == null || line.equals("")) { + if (line == null || line.isEmpty()) { return new String[0]; } int index = 0; @@ -784,7 +788,7 @@ private String[] splitLine(String line, char sep) { // partition keys will be parsed into double from json // so need to convert it to partition columns' type - private Object convertPartitionKey(Object srcValue, Class dstClass) throws SparkDppException { + private Object convertPartitionKey(Object srcValue, Class dstClass, boolean isV2Type) throws SparkDppException { if (dstClass.equals(Float.class) || dstClass.equals(Double.class)) { return null; } @@ -805,6 +809,9 @@ private Object convertPartitionKey(Object srcValue, Class dstClass) throws Spark return convertToJavaDate((int) 
srcValueDouble); } else if (dstClass.equals(java.sql.Timestamp.class)) { double srcValueDouble = (double) srcValue; + if (isV2Type) { + return convertV2ToJavaDatetime((long) srcValueDouble); + } return convertToJavaDatetime((long) srcValueDouble); } else { // dst type is string @@ -819,7 +826,7 @@ private Object convertPartitionKey(Object srcValue, Class dstClass) throws Spark private java.sql.Timestamp convertToJavaDatetime(long src) { String dateTimeStr = Long.valueOf(src).toString(); if (dateTimeStr.length() != 14) { - throw new RuntimeException("invalid input date format for SparkDpp"); + throw new RuntimeException("invalid input date format for SparkDpp, src: " + dateTimeStr); } String year = dateTimeStr.substring(0, 4); @@ -832,6 +839,24 @@ private java.sql.Timestamp convertToJavaDatetime(long src) { return java.sql.Timestamp.valueOf(String.format("%s-%s-%s %s:%s:%s", year, month, day, hour, min, sec)); } + private java.sql.Timestamp convertV2ToJavaDatetime(long src) { + String dateTimeStr = Long.valueOf(src).toString(); + if (dateTimeStr.length() != 18) { + throw new RuntimeException("invalid input date format for SparkDpp, src: " + dateTimeStr); + } + + long year = (src >> 46); + long month = (src >> 42) & ((1L << 4) - 1); + long day = (src >> 37) & ((1L << 5) - 1); + long hour = (src >> 32) & ((1L << 5) - 1); + long min = (src >> 26) & ((1L << 6) - 1); + long sec = (src >> 20) & ((1L << 6) - 1); + long ms = src & ((1L << 20) - 1); + + return java.sql.Timestamp.valueOf( + String.format("%d-%02d-%02d %02d:%02d:%02d.%d", year, month, day, hour, min, sec, ms)); + } + private java.sql.Date convertToJavaDate(int originDate) { int day = originDate & 0x1f; originDate >>= 5; @@ -842,14 +867,17 @@ private java.sql.Date convertToJavaDate(int originDate) { } private List createPartitionRangeKeys( - EtlJobConfig.EtlPartitionInfo partitionInfo, List partitionKeySchema) throws SparkDppException { + EtlJobConfig.EtlPartitionInfo partitionInfo, List> partitionKeySchema, + Map partitionKeyIndexToType) throws SparkDppException { List partitionRangeKeys = new ArrayList<>(); for (EtlJobConfig.EtlPartition partition : partitionInfo.partitions) { DorisRangePartitioner.PartitionRangeKey partitionRangeKey = new DorisRangePartitioner.PartitionRangeKey(); List startKeyColumns = new ArrayList<>(); for (int i = 0; i < partition.startKeys.size(); i++) { Object value = partition.startKeys.get(i); - startKeyColumns.add(convertPartitionKey(value, partitionKeySchema.get(i))); + boolean isV2Type = + partitionKeyIndexToType.get(i) != null && partitionKeyIndexToType.get(i).endsWith("V2"); + startKeyColumns.add(convertPartitionKey(value, partitionKeySchema.get(i), isV2Type)); } partitionRangeKey.startKeys = new DppColumns(startKeyColumns); if (!partition.isMaxPartition) { @@ -857,7 +885,9 @@ private List createPartitionRangeKeys( List endKeyColumns = new ArrayList<>(); for (int i = 0; i < partition.endKeys.size(); i++) { Object value = partition.endKeys.get(i); - endKeyColumns.add(convertPartitionKey(value, partitionKeySchema.get(i))); + boolean isV2Type = + partitionKeyIndexToType.get(i) != null && partitionKeyIndexToType.get(i).endsWith("V2"); + endKeyColumns.add(convertPartitionKey(value, partitionKeySchema.get(i), isV2Type)); } partitionRangeKey.endKeys = new DppColumns(endKeyColumns); } else { @@ -929,6 +959,7 @@ private Dataset loadDataFromHiveTable(SparkSession spark, } Dataset dataframe = spark.sql(sql.toString()); + dataframe.show(); // Note(wb): in current spark load implementation, spark load can't be 
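The convertV2ToJavaDatetime change above unpacks a DATETIMEV2 partition key from one 64-bit integer. The sketch below spells out the implied bit layout (year 18 bits, month 4, day 5, hour 5, minute 6, second 6, fractional part 20) so the shifts can be checked in isolation; the pack() helper exists only for this demonstration and is not taken from the patch.

    // Bit layout assumed by the decode shown above:
    // | year:18 | month:4 | day:5 | hour:5 | minute:6 | second:6 | fraction:20 |
    public class DateTimeV2LayoutSketch {

        static long pack(long year, long month, long day, long hour, long min, long sec, long fraction) {
            return (year << 46) | (month << 42) | (day << 37) | (hour << 32)
                    | (min << 26) | (sec << 20) | fraction;
        }

        static String decode(long src) {
            long year = src >> 46;
            long month = (src >> 42) & ((1L << 4) - 1);
            long day = (src >> 37) & ((1L << 5) - 1);
            long hour = (src >> 32) & ((1L << 5) - 1);
            long min = (src >> 26) & ((1L << 6) - 1);
            long sec = (src >> 20) & ((1L << 6) - 1);
            long fraction = src & ((1L << 20) - 1);
            return String.format("%d-%02d-%02d %02d:%02d:%02d.%d", year, month, day, hour, min, sec, fraction);
        }

        public static void main(String[] args) {
            long packed = pack(2024, 5, 21, 17, 15, 59, 123456);
            System.out.println(decode(packed)); // 2024-05-21 17:15:59.123456
        }
    }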
consistent with doris BE; // The reason is as follows // For stream load in doris BE, it runs as follow steps: @@ -1016,7 +1047,8 @@ public Iterator call(Row row) throws Exception { if (abnormalRowAcc.value() <= 5) { invalidRows.add(row.toString()); } - } else if (columnIndexNeedToRepalceNull.size() != 0) { + } else if (!columnIndexNeedToRepalceNull.isEmpty()) { + scannedRowsAcc.add(1); Object[] newRow = new Object[row.size()]; for (int i = 0; i < row.size(); i++) { if (columnIndexNeedToRepalceNull.contains(i)) { @@ -1027,6 +1059,7 @@ public Iterator call(Row row) throws Exception { } result.add(RowFactory.create(newRow)); } else { + scannedRowsAcc.add(1); result.add(row); } return result.iterator(); @@ -1071,7 +1104,7 @@ private void process() throws Exception { EtlJobConfig.EtlPartitionInfo partitionInfo = etlTable.partitionInfo; List partitionKeyIndex = new ArrayList(); - List partitionKeySchema = new ArrayList<>(); + List> partitionKeySchema = new ArrayList<>(); for (String key : partitionInfo.partitionColumnRefs) { for (int i = 0; i < baseIndex.columns.size(); ++i) { EtlJobConfig.EtlColumn column = baseIndex.columns.get(i); @@ -1082,8 +1115,15 @@ private void process() throws Exception { } } } + Map columnToType = baseIndex.columns.stream().collect( + Collectors.toMap(etlColumn -> etlColumn.columnName, etlColumn -> etlColumn.columnType)); + Map partitionKeyIndexToType = new HashMap<>(); + for (int i = 0; i < partitionInfo.partitionColumnRefs.size(); i++) { + String partitionColumn = partitionInfo.partitionColumnRefs.get(i); + partitionKeyIndexToType.put(i, columnToType.get(partitionColumn)); + } List partitionRangeKeys - = createPartitionRangeKeys(partitionInfo, partitionKeySchema); + = createPartitionRangeKeys(partitionInfo, partitionKeySchema, partitionKeyIndexToType); StructType dstTableSchema = DppUtils.createDstTableSchema(baseIndex.columns, false, false); dstTableSchema = DppUtils.replaceBinaryColsInSchema(binaryBitmapColumnSet, dstTableSchema); RollupTreeBuilder rollupTreeParser = new MinimumCoverageRollupTreeBuilder(); @@ -1157,8 +1197,6 @@ private void writeDppResult(DppResult dppResult) throws Exception { public void doDpp() throws Exception { try { process(); - } catch (Exception e) { - throw e; } finally { // write dpp result to file in outputPath writeDppResult(dppResult); From 795238765679e84424d49b4df92058438c03ece2 Mon Sep 17 00:00:00 2001 From: gnehil Date: Tue, 21 May 2024 17:15:59 +0800 Subject: [PATCH 03/45] check hll type and bitmap type mapping --- .../apache/doris/common/meta/LoadMeta.java | 69 ++++++++++++++++++- .../doris/common/meta/LoadMetaTest.java | 57 +++++++++++++++ 2 files changed, 123 insertions(+), 3 deletions(-) create mode 100644 spark-load/spark-load-core/src/test/java/org/apache/doris/common/meta/LoadMetaTest.java diff --git a/spark-load/spark-load-core/src/main/java/org/apache/doris/common/meta/LoadMeta.java b/spark-load/spark-load-core/src/main/java/org/apache/doris/common/meta/LoadMeta.java index 1c62edb2..1f019bcb 100644 --- a/spark-load/spark-load-core/src/main/java/org/apache/doris/common/meta/LoadMeta.java +++ b/spark-load/spark-load-core/src/main/java/org/apache/doris/common/meta/LoadMeta.java @@ -1,9 +1,11 @@ package org.apache.doris.common.meta; import org.apache.doris.common.Constants; +import org.apache.doris.exception.SparkLoadException; import org.apache.doris.config.JobConfig; import org.apache.doris.sparkdpp.EtlJobConfig; +import com.google.common.annotations.VisibleForTesting; import lombok.Data; import java.util.Arrays; @@ -11,6 
+13,9 @@ import java.util.HashMap; import java.util.List; import java.util.Map; +import java.util.Optional; +import java.util.regex.Matcher; +import java.util.regex.Pattern; import java.util.stream.Collectors; @Data @@ -22,15 +27,18 @@ public class LoadMeta { private Long signature; private Map tableMeta; - public EtlJobConfig getEtlJobConfig(JobConfig jobConfig) { + public EtlJobConfig getEtlJobConfig(JobConfig jobConfig) throws SparkLoadException { Map tables = new HashMap<>(); - getTableMeta().forEach((name, meta) -> { + for (Map.Entry entry : getTableMeta().entrySet()) { + String name = entry.getKey(); + TableMeta meta = entry.getValue(); EtlJobConfig.EtlTable etlTable = new EtlJobConfig.EtlTable(meta.getIndexes().stream().map( TableMeta.EtlIndex::toEtlIndex).collect(Collectors.toList()), meta.getPartitionInfo().toEtlPartitionInfo()); JobConfig.TaskInfo taskInfo = jobConfig.getLoadTasks().get(name); EtlJobConfig.EtlFileGroup fileGroup; Map columnMappingMap = taskInfo.toEtlColumnMappingMap(); + checkMapping(etlTable, columnMappingMap); List partitionIds = meta.getPartitionInfo().partitions.stream() .map(p -> p.partitionId).collect(Collectors.toList()); switch (taskInfo.getType()) { @@ -57,7 +65,7 @@ public EtlJobConfig getEtlJobConfig(JobConfig jobConfig) { } etlTable.addFileGroup(fileGroup); tables.put(meta.getId(), etlTable); - }); + } String outputFilePattern = EtlJobConfig.getOutputFilePattern(jobConfig.getLabel(), EtlJobConfig.FilePatternVersion.V1); String label = jobConfig.getLabel(); @@ -69,4 +77,59 @@ public EtlJobConfig getEtlJobConfig(JobConfig jobConfig) { return etlJobConfig; } + @VisibleForTesting + public void checkMapping(EtlJobConfig.EtlTable etlTable, + Map columnMappingMap) throws SparkLoadException { + Optional baseIdx = etlTable.indexes.stream().filter(idx -> idx.isBaseIndex).findFirst(); + if (baseIdx.isPresent()) { + EtlJobConfig.EtlIndex etlIndex = baseIdx.get(); + for (EtlJobConfig.EtlColumn column : etlIndex.columns) { + if ("HLL".equalsIgnoreCase(column.columnType)) { + EtlJobConfig.EtlColumnMapping mapping = columnMappingMap.get(column.columnName); + checkHllMapping(column.columnName, mapping); + } + if ("BITMAP".equalsIgnoreCase(column.columnType)) { + EtlJobConfig.EtlColumnMapping mapping = columnMappingMap.get(column.columnName); + checkBitmapMapping(column.columnName, mapping); + } + } + } + } + + private void checkHllMapping(String columnName, EtlJobConfig.EtlColumnMapping mapping) throws SparkLoadException { + if (mapping == null) { + throw new SparkLoadException(""); + } + Pattern pattern = Pattern.compile("(\\w+)\\(.*\\)"); + Matcher matcher = pattern.matcher(mapping.expr); + if (matcher.find()) { + if ("hll_hash".equalsIgnoreCase(matcher.group(1)) + || "hll_empty".equalsIgnoreCase(matcher.group(1))) { + return; + } + throw new SparkLoadException("HLL column must use hll function, like " + columnName + "=hll_hash(xxx) or " + + columnName + "=hll_empty()"); + } + } + + private void checkBitmapMapping(String columnName, EtlJobConfig.EtlColumnMapping mapping) + throws SparkLoadException { + if (mapping == null) { + throw new SparkLoadException(""); + } + Pattern pattern = Pattern.compile("(\\w+)\\(.*\\)"); + Matcher matcher = pattern.matcher(mapping.expr); + if (matcher.find()) { + if ("to_bitmap".equalsIgnoreCase(matcher.group(1)) || "bitmap_hash".equalsIgnoreCase(matcher.group(1)) + || "bitmap_dict".equalsIgnoreCase(matcher.group(1)) + || "binary_bitmap".equalsIgnoreCase(matcher.group(1))) { + return; + } + throw new SparkLoadException( + "BITMAP 
column must use bitmap function, like " + columnName + "=to_bitmap(xxx) or " + + columnName + "=bitmap_hash() or " + columnName + "=bitmap_dict() or " + + columnName + "=binary_bitmap()"); + } + } + } diff --git a/spark-load/spark-load-core/src/test/java/org/apache/doris/common/meta/LoadMetaTest.java b/spark-load/spark-load-core/src/test/java/org/apache/doris/common/meta/LoadMetaTest.java new file mode 100644 index 00000000..025c662a --- /dev/null +++ b/spark-load/spark-load-core/src/test/java/org/apache/doris/common/meta/LoadMetaTest.java @@ -0,0 +1,57 @@ +package org.apache.doris.common.meta; + + +import org.apache.doris.exception.SparkLoadException; +import org.apache.doris.sparkdpp.EtlJobConfig; + +import org.junit.Assert; +import org.junit.Rule; +import org.junit.Test; +import org.junit.rules.ExpectedException; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +public class LoadMetaTest { + + @Rule + public ExpectedException thrown = ExpectedException.none(); + + @Test + public void checkMapping() throws SparkLoadException { + + List columns = new ArrayList<>(); + columns.add(new EtlJobConfig.EtlColumn("id", "BIGINT", false, true, "NONE", null, 0, 10, 0)); + columns.add(new EtlJobConfig.EtlColumn("c1", "HLL", true, false, "NONE", null, 0, 10, 0)); + columns.add(new EtlJobConfig.EtlColumn("c2", "BITMAP", true, false, "NONE", null, 0, 10, 0)); + + EtlJobConfig.EtlIndex etlIndex = new EtlJobConfig.EtlIndex(1, columns, 1, "DUPLICATE", true); + EtlJobConfig.EtlPartition etlPartition = + new EtlJobConfig.EtlPartition(1L, Collections.singletonList(0), Collections.singletonList(1), true, 1); + EtlJobConfig.EtlPartitionInfo etlPartitionInfo = + new EtlJobConfig.EtlPartitionInfo("RANGE", Collections.singletonList("id"), + Collections.singletonList("id"), Collections.singletonList(etlPartition)); + + EtlJobConfig.EtlTable etlTable = new EtlJobConfig.EtlTable(Collections.singletonList(etlIndex), + etlPartitionInfo); + + LoadMeta loadMeta = new LoadMeta(); + + Map columnMappingMap = new HashMap<>(); + columnMappingMap.put("c2", new EtlJobConfig.EtlColumnMapping("to_bitmap(c1)")); + Assert.assertThrows(SparkLoadException.class, () -> loadMeta.checkMapping(etlTable, columnMappingMap)); + + Map columnMappingMap1 = new HashMap<>(); + columnMappingMap1.put("c1", new EtlJobConfig.EtlColumnMapping("hll_hash(c1)")); + Assert.assertThrows(SparkLoadException.class, () -> loadMeta.checkMapping(etlTable, columnMappingMap1)); + + Map columnMappingMap2 = new HashMap<>(); + columnMappingMap2.put("c1", new EtlJobConfig.EtlColumnMapping("hll_hash(c1)")); + columnMappingMap2.put("c2", new EtlJobConfig.EtlColumnMapping("to_bitmap(c1)")); + loadMeta.checkMapping(etlTable, columnMappingMap2); + + } +} \ No newline at end of file From b1c983d4f57624270ee6d20679842d072c991aae Mon Sep 17 00:00:00 2001 From: gnehil Date: Tue, 21 May 2024 17:16:26 +0800 Subject: [PATCH 04/45] remove duplicate code --- .../org/apache/doris/load/job/PullLoader.java | 32 +++++-------------- 1 file changed, 8 insertions(+), 24 deletions(-) diff --git a/spark-load/spark-load-core/src/main/java/org/apache/doris/load/job/PullLoader.java b/spark-load/spark-load-core/src/main/java/org/apache/doris/load/job/PullLoader.java index a57736fc..ca770516 100644 --- a/spark-load/spark-load-core/src/main/java/org/apache/doris/load/job/PullLoader.java +++ b/spark-load/spark-load-core/src/main/java/org/apache/doris/load/job/PullLoader.java @@ -3,12 +3,12 @@ import 
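Patch 03 above validates HLL and BITMAP column mappings by pulling the outer function name out of the mapping expression with the pattern (\w+)\(.*\) and comparing it against an allow list. A standalone check of that extraction, with illustrative inputs:

    import java.util.Arrays;
    import java.util.List;
    import java.util.regex.Matcher;
    import java.util.regex.Pattern;

    // Extracts the outer function name from a column mapping expression,
    // mirroring the pattern used by checkHllMapping and checkBitmapMapping above.
    public class MappingFunctionNameSketch {

        private static final Pattern FUNC = Pattern.compile("(\\w+)\\(.*\\)");

        static String functionName(String expr) {
            Matcher matcher = FUNC.matcher(expr);
            return matcher.find() ? matcher.group(1) : null;
        }

        public static void main(String[] args) {
            List<String> allowedForHll = Arrays.asList("hll_hash", "hll_empty");
            String expr = "hll_hash(user_id)";  // illustrative mapping expression
            String name = functionName(expr);   // -> "hll_hash"
            System.out.println(name + " allowed: " + allowedForHll.contains(name));
        }
    }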
org.apache.doris.SparkLoadRunner; import org.apache.doris.client.DorisClient; import org.apache.doris.common.DppResult; -import org.apache.doris.common.JobStatus; import org.apache.doris.common.LoadInfo; -import org.apache.doris.common.SparkLoadException; +import org.apache.doris.common.enums.JobStatus; import org.apache.doris.common.meta.LoadMeta; import org.apache.doris.common.meta.TableMeta; import org.apache.doris.config.JobConfig; +import org.apache.doris.exception.SparkLoadException; import org.apache.doris.sparkdpp.EtlJobConfig; import org.apache.doris.util.DateUtils; import org.apache.doris.util.HadoopUtils; @@ -18,7 +18,6 @@ import org.apache.hadoop.fs.FileStatus; import org.apache.log4j.LogManager; import org.apache.log4j.Logger; -import org.apache.spark.launcher.SparkAppHandle; import java.io.File; import java.io.IOException; @@ -59,6 +58,7 @@ public void prepare() throws SparkLoadException { loadMeta = feClient.createSparkLoad(jobConfig.getDatabase(), tableToPartition, jobConfig.getLabel(), jobConfig.getJobProperties()); etlJobConfig = loadMeta.getEtlJobConfig(jobConfig); + jobStatus = JobStatus.SUCCESS; } @Override @@ -87,30 +87,14 @@ public void execute() throws SparkLoadException { file.mkdir(); } - LOG.info("submit spark job in mode: " + spark.getMaster() + "-" + spark.getDeployMode()); + LOG.info("submit spark job on master: " + spark.getMaster() + ", deployMode: " + spark.getDeployMode()); super.execute(); - boolean isRunning = true; - do { - if (SparkAppHandle.State.FAILED == appHandle.getState() - || SparkAppHandle.State.KILLED == appHandle.getState() - || SparkAppHandle.State.FINISHED == appHandle.getState()) { - isRunning = false; - if (SparkAppHandle.State.FAILED == appHandle.getState() - || SparkAppHandle.State.KILLED == appHandle.getState()) { - statusInfo.put("msg", - String.format("spark job run failed, appId: %s, state: %s", appHandle.getAppId(), - appHandle.getState())); - LOG.error("spark job run failed, appId: " + appHandle.getAppId() + ", state: " - + appHandle.getState()); - jobStatus = JobStatus.FAILED; - } - LOG.info("spark job run finished, appId: " + appHandle.getAppId() + ", state: " + appHandle.getState()); - } - statusInfo.put("appId", appHandle.getAppId()); - } while (isRunning); - jobStatus = JobStatus.SUCCESS; + if (jobStatus == JobStatus.FAILED) { + throw new SparkLoadException("spark job run failed, msg: " + statusInfo.get("msg")); + } + LOG.info("spark job run finished."); } From 689cefe0ff9721f375214560a044e32ee855babc Mon Sep 17 00:00:00 2001 From: gnehil Date: Tue, 21 May 2024 17:18:10 +0800 Subject: [PATCH 05/45] package refactor --- .../src/main/java/org/apache/doris/client/DorisClient.java | 2 +- .../java/org/apache/doris/common/{ => enums}/JobStatus.java | 2 +- .../java/org/apache/doris/common/{ => enums}/LoadMode.java | 2 +- .../src/main/java/org/apache/doris/config/JobConfig.java | 3 ++- .../doris/{common => exception}/SparkLoadException.java | 2 +- .../src/main/java/org/apache/doris/load/job/Loader.java | 4 ++-- .../src/main/java/org/apache/doris/load/job/Recoverable.java | 2 +- 7 files changed, 9 insertions(+), 8 deletions(-) rename spark-load/spark-load-core/src/main/java/org/apache/doris/common/{ => enums}/JobStatus.java (62%) rename spark-load/spark-load-core/src/main/java/org/apache/doris/common/{ => enums}/LoadMode.java (51%) rename spark-load/spark-load-core/src/main/java/org/apache/doris/{common => exception}/SparkLoadException.java (86%) diff --git 
a/spark-load/spark-load-core/src/main/java/org/apache/doris/client/DorisClient.java b/spark-load/spark-load-core/src/main/java/org/apache/doris/client/DorisClient.java index c4fc8154..1125fcce 100644 --- a/spark-load/spark-load-core/src/main/java/org/apache/doris/client/DorisClient.java +++ b/spark-load/spark-load-core/src/main/java/org/apache/doris/client/DorisClient.java @@ -2,7 +2,7 @@ import org.apache.doris.common.LoadInfo; import org.apache.doris.common.ResponseEntity; -import org.apache.doris.common.SparkLoadException; +import org.apache.doris.exception.SparkLoadException; import org.apache.doris.common.meta.LoadInfoResponse; import org.apache.doris.common.meta.LoadMeta; import org.apache.doris.util.HttpUtils; diff --git a/spark-load/spark-load-core/src/main/java/org/apache/doris/common/JobStatus.java b/spark-load/spark-load-core/src/main/java/org/apache/doris/common/enums/JobStatus.java similarity index 62% rename from spark-load/spark-load-core/src/main/java/org/apache/doris/common/JobStatus.java rename to spark-load/spark-load-core/src/main/java/org/apache/doris/common/enums/JobStatus.java index 2f149b6c..b9900e3c 100644 --- a/spark-load/spark-load-core/src/main/java/org/apache/doris/common/JobStatus.java +++ b/spark-load/spark-load-core/src/main/java/org/apache/doris/common/enums/JobStatus.java @@ -1,4 +1,4 @@ -package org.apache.doris.common; +package org.apache.doris.common.enums; public enum JobStatus { diff --git a/spark-load/spark-load-core/src/main/java/org/apache/doris/common/LoadMode.java b/spark-load/spark-load-core/src/main/java/org/apache/doris/common/enums/LoadMode.java similarity index 51% rename from spark-load/spark-load-core/src/main/java/org/apache/doris/common/LoadMode.java rename to spark-load/spark-load-core/src/main/java/org/apache/doris/common/enums/LoadMode.java index 31ac0c59..9eb9be6c 100644 --- a/spark-load/spark-load-core/src/main/java/org/apache/doris/common/LoadMode.java +++ b/spark-load/spark-load-core/src/main/java/org/apache/doris/common/enums/LoadMode.java @@ -1,4 +1,4 @@ -package org.apache.doris.common; +package org.apache.doris.common.enums; public enum LoadMode { PUSH, PULL; diff --git a/spark-load/spark-load-core/src/main/java/org/apache/doris/config/JobConfig.java b/spark-load/spark-load-core/src/main/java/org/apache/doris/config/JobConfig.java index 5a7cd63e..5245c62f 100644 --- a/spark-load/spark-load-core/src/main/java/org/apache/doris/config/JobConfig.java +++ b/spark-load/spark-load-core/src/main/java/org/apache/doris/config/JobConfig.java @@ -1,7 +1,8 @@ package org.apache.doris.config; import org.apache.doris.SparkLoadRunner; -import org.apache.doris.common.LoadMode; +import org.apache.doris.common.Constants; +import org.apache.doris.common.enums.LoadMode; import org.apache.doris.sparkdpp.EtlJobConfig; import com.fasterxml.jackson.annotation.JsonProperty; diff --git a/spark-load/spark-load-core/src/main/java/org/apache/doris/common/SparkLoadException.java b/spark-load/spark-load-core/src/main/java/org/apache/doris/exception/SparkLoadException.java similarity index 86% rename from spark-load/spark-load-core/src/main/java/org/apache/doris/common/SparkLoadException.java rename to spark-load/spark-load-core/src/main/java/org/apache/doris/exception/SparkLoadException.java index bdde745d..cc2b043e 100644 --- a/spark-load/spark-load-core/src/main/java/org/apache/doris/common/SparkLoadException.java +++ b/spark-load/spark-load-core/src/main/java/org/apache/doris/exception/SparkLoadException.java @@ -1,4 +1,4 @@ -package 
org.apache.doris.common; +package org.apache.doris.exception; public class SparkLoadException extends Exception { diff --git a/spark-load/spark-load-core/src/main/java/org/apache/doris/load/job/Loader.java b/spark-load/spark-load-core/src/main/java/org/apache/doris/load/job/Loader.java index 0dc4f136..aed937c0 100644 --- a/spark-load/spark-load-core/src/main/java/org/apache/doris/load/job/Loader.java +++ b/spark-load/spark-load-core/src/main/java/org/apache/doris/load/job/Loader.java @@ -1,7 +1,7 @@ package org.apache.doris.load.job; -import org.apache.doris.common.JobStatus; -import org.apache.doris.common.SparkLoadException; +import org.apache.doris.common.enums.JobStatus; +import org.apache.doris.exception.SparkLoadException; import org.apache.doris.config.JobConfig; import lombok.Getter; diff --git a/spark-load/spark-load-core/src/main/java/org/apache/doris/load/job/Recoverable.java b/spark-load/spark-load-core/src/main/java/org/apache/doris/load/job/Recoverable.java index 32661e8a..54acdda6 100644 --- a/spark-load/spark-load-core/src/main/java/org/apache/doris/load/job/Recoverable.java +++ b/spark-load/spark-load-core/src/main/java/org/apache/doris/load/job/Recoverable.java @@ -1,6 +1,6 @@ package org.apache.doris.load.job; -import org.apache.doris.common.SparkLoadException; +import org.apache.doris.exception.SparkLoadException; public interface Recoverable { From 79d3dac19476a5bfb010730c5edf078e42f07b21 Mon Sep 17 00:00:00 2001 From: gnehil Date: Tue, 21 May 2024 17:18:35 +0800 Subject: [PATCH 06/45] optimize spark app state check --- .../src/main/java/org/apache/doris/load/job/Loader.java | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/spark-load/spark-load-core/src/main/java/org/apache/doris/load/job/Loader.java b/spark-load/spark-load-core/src/main/java/org/apache/doris/load/job/Loader.java index aed937c0..18da95f1 100644 --- a/spark-load/spark-load-core/src/main/java/org/apache/doris/load/job/Loader.java +++ b/spark-load/spark-load-core/src/main/java/org/apache/doris/load/job/Loader.java @@ -40,9 +40,7 @@ public void execute() throws SparkLoadException { throw new SparkLoadException("submit spark job failed", e); } do { - if (SparkAppHandle.State.FAILED == appHandle.getState() - || SparkAppHandle.State.KILLED == appHandle.getState() - || SparkAppHandle.State.FINISHED == appHandle.getState()) { + if (appHandle.getState().isFinal()) { if (SparkAppHandle.State.FAILED == appHandle.getState() || SparkAppHandle.State.KILLED == appHandle.getState()) { statusInfo.put("msg", From 37477650a893e3e6c03b493f82f4af6e3f3a16e7 Mon Sep 17 00:00:00 2001 From: gnehil Date: Tue, 21 May 2024 17:18:54 +0800 Subject: [PATCH 07/45] enhance spark master check --- .../org/apache/doris/config/JobConfig.java | 23 +++++++++++++++---- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/spark-load/spark-load-core/src/main/java/org/apache/doris/config/JobConfig.java b/spark-load/spark-load-core/src/main/java/org/apache/doris/config/JobConfig.java index 5245c62f..42884a05 100644 --- a/spark-load/spark-load-core/src/main/java/org/apache/doris/config/JobConfig.java +++ b/spark-load/spark-load-core/src/main/java/org/apache/doris/config/JobConfig.java @@ -11,6 +11,7 @@ import org.apache.commons.lang3.StringUtils; import java.io.File; +import java.net.URI; import java.util.Collections; import java.util.HashMap; import java.util.List; @@ -141,7 +142,8 @@ public void checkTaskInfo() { case HIVE: Preconditions.checkArgument(StringUtils.isNoneBlank(taskInfo.getHiveDatabase()), "hive 
database is empty"); - Preconditions.checkArgument(StringUtils.isNoneBlank(taskInfo.getHiveTable()), "hive table is empty"); + Preconditions.checkArgument(StringUtils.isNoneBlank(taskInfo.getHiveTable()), + "hive table is empty"); break; case FILE: Preconditions.checkArgument(taskInfo.getPaths() != null && !taskInfo.getPaths().isEmpty(), @@ -166,15 +168,14 @@ public void checkSparkInfo() { "spark config item sparkHome is empty"); Preconditions.checkArgument(StringUtils.isNoneBlank(sparkInfo.getWorkingDir()), "spark config item workingDir is empty"); - Preconditions.checkArgument( - StringUtils.equalsAnyIgnoreCase(sparkInfo.getMaster(), "yarn", "standalone", "local"), + Preconditions.checkArgument(checkSparkMaster(sparkInfo.getMaster()), "spark master only supports yarn or standalone or local "); Preconditions.checkArgument( StringUtils.equalsAnyIgnoreCase(sparkInfo.getDeployMode(), "cluster", "client"), "spark deployMode only supports cluster or client "); - if ("local".equalsIgnoreCase(sparkInfo.getMaster())) { + if (!"yarn".equalsIgnoreCase(sparkInfo.getMaster())) { Preconditions.checkArgument("client".equalsIgnoreCase(sparkInfo.getDeployMode()), - "local master only supports client mode"); + "standalone and local master only supports client mode"); } if (LoadMode.PULL == getLoadMode()) { if (StringUtils.isBlank(getSpark().getDppJarPath())) { @@ -186,4 +187,16 @@ public void checkSparkInfo() { } } + private boolean checkSparkMaster(String master) { + if (StringUtils.isBlank(master)) { + return false; + } + if ("yarn".equalsIgnoreCase(master) || master.startsWith("local")) { + return true; + } + URI uri = URI.create(master); + return Constants.SPARK_STANDALONE_SCHEME.equalsIgnoreCase(uri.getScheme()) + && StringUtils.isNoneBlank(uri.getHost()) && uri.getPort() != -1; + } + } From 59fc6d2d18f3bd0aa9a385fb91b2a74451e2c531 Mon Sep 17 00:00:00 2001 From: gnehil Date: Tue, 21 May 2024 17:19:15 +0800 Subject: [PATCH 08/45] add junit dependency --- spark-load/spark-load-core/pom.xml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/spark-load/spark-load-core/pom.xml b/spark-load/spark-load-core/pom.xml index d305ed92..8c516836 100644 --- a/spark-load/spark-load-core/pom.xml +++ b/spark-load/spark-load-core/pom.xml @@ -84,6 +84,11 @@ org.slf4j slf4j-api + + org.junit.vintage + junit-vintage-engine + test + From 7108c90af37b51bb206bdaba2d8504147dd558e0 Mon Sep 17 00:00:00 2001 From: gnehil Date: Tue, 21 May 2024 17:19:28 +0800 Subject: [PATCH 09/45] add const --- .../src/main/java/org/apache/doris/common/Constants.java | 1 + 1 file changed, 1 insertion(+) diff --git a/spark-load/spark-load-core/src/main/java/org/apache/doris/common/Constants.java b/spark-load/spark-load-core/src/main/java/org/apache/doris/common/Constants.java index ef807f69..60aec476 100644 --- a/spark-load/spark-load-core/src/main/java/org/apache/doris/common/Constants.java +++ b/spark-load/spark-load-core/src/main/java/org/apache/doris/common/Constants.java @@ -3,5 +3,6 @@ public interface Constants { String HIVE_METASTORE_URIS = "hive.metastore.uris"; + String SPARK_STANDALONE_SCHEME = "spark"; } From 812c7844f02f049c997822bdfc840bc4fa9fe6b2 Mon Sep 17 00:00:00 2001 From: gnehil Date: Tue, 21 May 2024 17:20:27 +0800 Subject: [PATCH 10/45] add git ignored item and remove files which should not be uploaded --- .gitignore | 10 + spark-load/spark-dpp/.flattened-pom.xml | 409 ------------------ .../spark-dpp/dependency-reduced-pom.xml | 237 ---------- 3 files changed, 10 insertions(+), 646 deletions(-) delete mode 100644 
spark-load/spark-dpp/.flattened-pom.xml delete mode 100644 spark-load/spark-dpp/dependency-reduced-pom.xml diff --git a/.gitignore b/.gitignore index 4413142c..71e2e797 100644 --- a/.gitignore +++ b/.gitignore @@ -8,6 +8,16 @@ spark-doris-connector/output/ spark-doris-connector/target/ spark-doris-connector/.idea/ +spark-load/spark-dpp/dependency-reduced-pom.xml +spark-load/spark-dpp/output/ +spark-load/spark-dpp/target/ +spark-load/spark-dpp/.idea/ + +spark-load/spark-load-core/dependency-reduced-pom.xml +spark-load/spark-load-core/output/ +spark-load/spark-load-core/target/ +spark-load/spark-load-core/.idea/ + ### Java template # Compiled class file diff --git a/spark-load/spark-dpp/.flattened-pom.xml b/spark-load/spark-dpp/.flattened-pom.xml deleted file mode 100644 index 0e8b1d66..00000000 --- a/spark-load/spark-dpp/.flattened-pom.xml +++ /dev/null @@ -1,409 +0,0 @@ - - - - 4.0.0 - org.apache.doris - spark-dpp - 1.2-SNAPSHOT - The Apache Software Foundation provides support for the Apache community of open-source software projects. - The Apache projects are characterized by a collaborative, consensus based development process, an open and - pragmatic software license, and a desire to create high quality software that leads the way in its field. - We consider ourselves not simply a group of projects sharing a server, but rather a community of developers - and users. - https://doris.apache.org/spark-dpp/ - - The Apache Software Foundation - https://www.apache.org/ - - - - Apache 2.0 License - https://www.apache.org/licenses/LICENSE-2.0.html - repo - - - - - Dev Mailing List - dev-subscribe@doris.apache.org - dev-unsubscribe@doris.apache.org - dev@doris.apache.org - - - Commits Mailing List - commits-subscribe@doris.apache.org - commits-unsubscribe@doris.apache.org - commits@doris.apache.org - - - - scm:git:https://git@github.com/apache/doris.git/spark-dpp - scm:git:https://git@github.com/apache/doris.git/spark-dpp - scm:git:https://git@github.com/apache/doris.git/spark-dpp - - - GitHub - https://github.com/apache/doris/issues - - - - apache.releases.https - Apache Release Distribution Repository - https://repository.apache.org/service/local/staging/deploy/maven2 - - - apache.snapshots.https - Apache Development Snapshot Repository - https://repository.apache.org/content/repositories/snapshots - - - - 4.7.2 - https://repository.apache.org/service/local/staging/deploy/maven2 - https://repository.apache.org/content/repositories/snapshots - 1.3.2 - https://sonarcloud.io - 2.15.2 - 2.0.3 - https://www.apache.org/images/asf_logo_wide_2016.png - 4.9.3 - 2.0.6 - 1.13 - 2.3.0 - Apache Release Distribution Repository - 1.0.1 - 1.5 - 0.4.6 - 1 - false - 1.22 - 9.35 - 6.5.1 - 1.2.0 - 3.4.1 - 2.18.0 - 1.12.669 - 3.4.0 - 2.1.1 - 3.1.5 - 6.4.5 - 4.0.2 - 0.8.13 - 1.22.0 - 3.0.9 - 1.70 - com.google.protobuf:protoc:3.24.3 - ${fe.dir}/../thirdparty - 2.9.3 - 2.22.2 - 6.7.2 - 4.5.13 - 1.1.1 - 1.33.0 - 2.10.1 - 9.4 - 2.7.4-11 - 15.0.2 - 3.4.0 - 3.42.0 - posix - 1.9.7 - 2.3 - 0.2.14 - 1.60.1 - 2.7 - 12.2.0.1 - 4.0.2 - 2.1 - 2.0 - 3.1.3 - 3.8.9.Final - 1.13.1 - apache - 1.2 - 4.4.15 - UTF-8 - 1.18.24 - 1.8 - 1.8.4 - 1.10.0 - 2022-12-11T19:18:10Z - 1.0.4 - 435 - shade-format-flatbuffers - 2.2 - 2.3.9 - 3.9.1 - source-release - 3.3.6 - 2.10.1 - 3.0.0 - 3.10.6.Final - 3.24.3 - 3.9 - 2.0.1.Final - 1.11.3 - 5.8.2 - Apache Development Snapshot Repository - 3.18.2-GA - 3.2.2 - 1.8 - 3.7.0 - 0.2.3 - 0.45.2-public - 1.8 - 1.4 - 2.7.13 - 4.1.104.Final - 0.8.10 - 2.8.1 - 2.6 - 1.2-SNAPSHOT - 1.4.3 - 8.5.86 - true - 2.3.2 
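Patch 07 above widens the Spark master check from a fixed string comparison to accepting yarn, any local master, and a standalone spark://host:port address. A self-contained sketch of that kind of validation follows; the scheme constant and method name are illustrative and not tied to the JobConfig class.

    import java.net.URI;

    // Accepts "yarn", any "local[...]" form, and "spark://host:port"; rejects the rest.
    public class SparkMasterCheckSketch {

        private static final String STANDALONE_SCHEME = "spark";

        static boolean isSupportedMaster(String master) {
            if (master == null || master.trim().isEmpty()) {
                return false;
            }
            if ("yarn".equalsIgnoreCase(master) || master.startsWith("local")) {
                return true;
            }
            URI uri = URI.create(master);
            return STANDALONE_SCHEME.equalsIgnoreCase(uri.getScheme())
                    && uri.getHost() != null && uri.getPort() != -1;
        }

        public static void main(String[] args) {
            System.out.println(isSupportedMaster("yarn"));               // true
            System.out.println(isSupportedMaster("local[4]"));           // true
            System.out.println(isSupportedMaster("spark://node1:7077")); // true
            System.out.println(isSupportedMaster("mesos://node1:5050")); // false
        }
    }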
- 2.12.10 - 9.4.53.v20231009 - 0.6.0-incubating - 32.1.2-jre - 2.1 - 1.5.1 - 1.12.0 - 3.0.0-8 - github - 2.2 - UTF-8 - 3.4.4 - 1.34.0 - 1.11-8 - 2.4.0 - 202 - 0.14.1 - 2.18.0 - 1.49 - 3.2.5 - 1.1.10.5 - 0.16.0 - 3.1.0 - 1.4.3 - io.grpc:protoc-gen-grpc-java:1.34.0 - 18.3.14-doris-SNAPSHOT - 1.5.4 - 1.2.5 - /Users/gnehil/doris/fe/spark-dpp/../../ - 0.11-a-czt02-cdh - 3.24.3 - 1.7 - - - - org.apache.doris - fe-common - 1.2-SNAPSHOT - compile - - - commons-codec - commons-codec - 1.13 - compile - - - org.apache.commons - commons-lang3 - 3.9 - compile - - - com.google.code.gson - gson - 2.10.1 - compile - - - org.apache.spark - spark-core_2.12 - 3.4.1 - provided - - - log4j - log4j - - - org.slf4j - slf4j-log4j12 - - - org.eclipse.jetty - jetty-util - - - io.netty - netty - - - - - io.netty - netty-all - 4.1.104.Final - compile - - - org.apache.spark - spark-sql_2.12 - 3.4.1 - provided - - - org.apache.arrow - arrow-vector - - - - - org.apache.hadoop - hadoop-common - 3.3.6 - compile - - - jdk.tools - jdk.tools - - - org.eclipse.jetty - jetty-util - - - org.eclipse.jetty - jetty-servlet - - - io.netty - netty-all - - - log4j - log4j - - - - - org.apache.parquet - parquet-column - 1.13.1 - compile - - - org.apache.parquet - parquet-hadoop - 1.13.1 - compile - - - org.apache.parquet - parquet-common - 1.13.1 - compile - - - commons-collections - commons-collections - 3.2.2 - compile - - - org.scala-lang - scala-library - 2.12.10 - provided - - - com.esotericsoftware - kryo-shaded - 4.0.2 - compile - - - org.apache.spark - spark-catalyst_2.12 - 3.4.1 - provided - - - com.google.guava - guava - 32.1.2-jre - compile - - - org.apache.logging.log4j - log4j-core - 2.18.0 - compile - - - org.apache.logging.log4j - log4j-slf4j-impl - 2.18.0 - compile - - - org.apache.logging.log4j - log4j-core - - - - - org.apache.logging.log4j - log4j-1.2-api - 2.18.0 - compile - - - org.awaitility - awaitility - 4.2.0 - compile - - - - - - always - - snapshots - apache snapshots maven repo https - https://repository.apache.org/content/repositories/snapshots/ - - - - false - - apache.snapshots - Apache Snapshot Repository - https://repository.apache.org/snapshots - - - diff --git a/spark-load/spark-dpp/dependency-reduced-pom.xml b/spark-load/spark-dpp/dependency-reduced-pom.xml deleted file mode 100644 index 7bf4da08..00000000 --- a/spark-load/spark-dpp/dependency-reduced-pom.xml +++ /dev/null @@ -1,237 +0,0 @@ - - - - fe - org.apache.doris - ${revision} - - 4.0.0 - spark-dpp - - spark-dpp-${project.version} - - - maven-surefire-plugin - - ${fe_ut_parallel} - false - -javaagent:${settings.localRepository}/org/jmockit/jmockit/${jmockit.version}/jmockit-${jmockit.version}.jar @{argLine} - - - - maven-dependency-plugin - - - copy-dependencies - package - - copy-dependencies - - - ${project.build.directory}/lib - false - false - true - runtime - ${skip.plugin} - - - - - - maven-assembly-plugin - - - make-assembly - package - - single - - - - - - - org.apache.doris.load.loadv2.etl.SparkEtlJob - - - - jar-with-dependencies - - - - - org.codehaus.mojo - cobertura-maven-plugin - 2.7 - - - 1024m - - - - - maven-clean-plugin - 3.1.0 - - - auto-clean - initialize - - clean - - - - - - maven-shade-plugin - - - package - - shade - - - - - - - com.google.code.findbugs:* - org.slf4j:* - - - - - org.roaringbitmap - org.apache.doris.shaded.org.roaringbitmap - com.google.guava - org.apache.doris.shaded.com.google.guava - - - - - - maven-javadoc-plugin - - true - - - - - - - org.apache.spark - spark-core_2.12 - 3.4.1 - provided - - - 
log4j - log4j - - - slf4j-log4j12 - org.slf4j - - - jetty-util - org.eclipse.jetty - - - netty - io.netty - - - - - org.apache.spark - spark-sql_2.12 - 3.4.1 - provided - - - arrow-vector - org.apache.arrow - - - - - org.scala-lang - scala-library - 2.12.10 - provided - - - org.apache.spark - spark-catalyst_2.12 - 3.4.1 - provided - - - org.junit.jupiter - junit-jupiter-engine - 5.8.2 - test - - - junit-platform-engine - org.junit.platform - - - junit-jupiter-api - org.junit.jupiter - - - apiguardian-api - org.apiguardian - - - - - org.junit.vintage - junit-vintage-engine - 5.8.2 - test - - - junit - junit - - - junit-platform-engine - org.junit.platform - - - apiguardian-api - org.apiguardian - - - - - org.junit.jupiter - junit-jupiter-params - 5.8.2 - test - - - junit-jupiter-api - org.junit.jupiter - - - apiguardian-api - org.apiguardian - - - - - org.jmockit - jmockit - 1.49 - test - - - - 1 - ${basedir}/../../ - - From 4935a56f4d2b27e9631316d644c4d744671a1405 Mon Sep 17 00:00:00 2001 From: gnehil Date: Thu, 23 May 2024 18:16:04 +0800 Subject: [PATCH 11/45] change dpp module --- .../{spark-dpp => spark-load-dpp}/pom.xml | 43 ++++++++++++++++++- .../doris/common/SparkDppException.java | 0 .../doris/load/loadv2/dpp/ColumnParser.java | 0 .../load/loadv2/dpp/DorisKryoRegistrator.java | 0 .../loadv2/dpp/DorisRangePartitioner.java | 0 .../doris/load/loadv2/dpp/DppColumns.java | 0 .../doris/load/loadv2/dpp/DppUtils.java | 0 .../load/loadv2/dpp/GlobalDictBuilder.java | 0 .../dpp/MinimumCoverageRollupTreeBuilder.java | 0 .../load/loadv2/dpp/RollupTreeBuilder.java | 0 .../doris/load/loadv2/dpp/RollupTreeNode.java | 0 .../doris/load/loadv2/dpp/SparkDpp.java | 0 .../load/loadv2/dpp/SparkRDDAggregator.java | 0 .../load/loadv2/dpp/StringAccumulator.java | 0 .../doris/load/loadv2/etl/SparkEtlJob.java | 0 .../load/loadv2/dpp/ColumnParserTest.java | 0 .../loadv2/dpp/DorisRangePartitionerTest.java | 0 .../doris/load/loadv2/dpp/DppUtilsTest.java | 0 .../MinimumCoverageRollupTreeBuilderTest.java | 0 .../doris/load/loadv2/dpp/SparkDppTest.java | 0 .../load/loadv2/etl/SparkEtlJobTest.java | 0 21 files changed, 41 insertions(+), 2 deletions(-) rename spark-load/{spark-dpp => spark-load-dpp}/pom.xml (87%) rename spark-load/{spark-dpp => spark-load-dpp}/src/main/java/org/apache/doris/common/SparkDppException.java (100%) rename spark-load/{spark-dpp => spark-load-dpp}/src/main/java/org/apache/doris/load/loadv2/dpp/ColumnParser.java (100%) rename spark-load/{spark-dpp => spark-load-dpp}/src/main/java/org/apache/doris/load/loadv2/dpp/DorisKryoRegistrator.java (100%) rename spark-load/{spark-dpp => spark-load-dpp}/src/main/java/org/apache/doris/load/loadv2/dpp/DorisRangePartitioner.java (100%) rename spark-load/{spark-dpp => spark-load-dpp}/src/main/java/org/apache/doris/load/loadv2/dpp/DppColumns.java (100%) rename spark-load/{spark-dpp => spark-load-dpp}/src/main/java/org/apache/doris/load/loadv2/dpp/DppUtils.java (100%) rename spark-load/{spark-dpp => spark-load-dpp}/src/main/java/org/apache/doris/load/loadv2/dpp/GlobalDictBuilder.java (100%) rename spark-load/{spark-dpp => spark-load-dpp}/src/main/java/org/apache/doris/load/loadv2/dpp/MinimumCoverageRollupTreeBuilder.java (100%) rename spark-load/{spark-dpp => spark-load-dpp}/src/main/java/org/apache/doris/load/loadv2/dpp/RollupTreeBuilder.java (100%) rename spark-load/{spark-dpp => spark-load-dpp}/src/main/java/org/apache/doris/load/loadv2/dpp/RollupTreeNode.java (100%) rename spark-load/{spark-dpp => 
spark-load-dpp}/src/main/java/org/apache/doris/load/loadv2/dpp/SparkDpp.java (100%) rename spark-load/{spark-dpp => spark-load-dpp}/src/main/java/org/apache/doris/load/loadv2/dpp/SparkRDDAggregator.java (100%) rename spark-load/{spark-dpp => spark-load-dpp}/src/main/java/org/apache/doris/load/loadv2/dpp/StringAccumulator.java (100%) rename spark-load/{spark-dpp => spark-load-dpp}/src/main/java/org/apache/doris/load/loadv2/etl/SparkEtlJob.java (100%) rename spark-load/{spark-dpp => spark-load-dpp}/src/test/java/org/apache/doris/load/loadv2/dpp/ColumnParserTest.java (100%) rename spark-load/{spark-dpp => spark-load-dpp}/src/test/java/org/apache/doris/load/loadv2/dpp/DorisRangePartitionerTest.java (100%) rename spark-load/{spark-dpp => spark-load-dpp}/src/test/java/org/apache/doris/load/loadv2/dpp/DppUtilsTest.java (100%) rename spark-load/{spark-dpp => spark-load-dpp}/src/test/java/org/apache/doris/load/loadv2/dpp/MinimumCoverageRollupTreeBuilderTest.java (100%) rename spark-load/{spark-dpp => spark-load-dpp}/src/test/java/org/apache/doris/load/loadv2/dpp/SparkDppTest.java (100%) rename spark-load/{spark-dpp => spark-load-dpp}/src/test/java/org/apache/doris/load/loadv2/etl/SparkEtlJobTest.java (100%) diff --git a/spark-load/spark-dpp/pom.xml b/spark-load/spark-load-dpp/pom.xml similarity index 87% rename from spark-load/spark-dpp/pom.xml rename to spark-load/spark-load-dpp/pom.xml index cc4516f1..a84414b1 100644 --- a/spark-load/spark-dpp/pom.xml +++ b/spark-load/spark-load-dpp/pom.xml @@ -25,7 +25,7 @@ under the License. ${revision} spark-load - spark-dpp + spark-load-dpp jar @@ -142,10 +142,49 @@ under the License. org.slf4j slf4j-api + + org.apache.hadoop + hadoop-aws + + + org.slf4j + slf4j-log4j12 + + + log4j + log4j + + + servlet-api + javax.servlet + + + + com.amazonaws + aws-java-sdk-s3 + + + com.amazonaws + aws-java-sdk-bundle + + + + + com.amazonaws + aws-java-sdk-s3 + + + com.amazonaws + aws-java-sdk-glue + + + com.amazonaws + aws-java-sdk-dynamodb + - spark-dpp-${project.version} + spark-load-dpp-${project.version} diff --git a/spark-load/spark-dpp/src/main/java/org/apache/doris/common/SparkDppException.java b/spark-load/spark-load-dpp/src/main/java/org/apache/doris/common/SparkDppException.java similarity index 100% rename from spark-load/spark-dpp/src/main/java/org/apache/doris/common/SparkDppException.java rename to spark-load/spark-load-dpp/src/main/java/org/apache/doris/common/SparkDppException.java diff --git a/spark-load/spark-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/ColumnParser.java b/spark-load/spark-load-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/ColumnParser.java similarity index 100% rename from spark-load/spark-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/ColumnParser.java rename to spark-load/spark-load-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/ColumnParser.java diff --git a/spark-load/spark-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/DorisKryoRegistrator.java b/spark-load/spark-load-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/DorisKryoRegistrator.java similarity index 100% rename from spark-load/spark-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/DorisKryoRegistrator.java rename to spark-load/spark-load-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/DorisKryoRegistrator.java diff --git a/spark-load/spark-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/DorisRangePartitioner.java b/spark-load/spark-load-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/DorisRangePartitioner.java similarity index 100% rename 
from spark-load/spark-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/DorisRangePartitioner.java rename to spark-load/spark-load-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/DorisRangePartitioner.java diff --git a/spark-load/spark-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/DppColumns.java b/spark-load/spark-load-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/DppColumns.java similarity index 100% rename from spark-load/spark-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/DppColumns.java rename to spark-load/spark-load-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/DppColumns.java diff --git a/spark-load/spark-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/DppUtils.java b/spark-load/spark-load-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/DppUtils.java similarity index 100% rename from spark-load/spark-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/DppUtils.java rename to spark-load/spark-load-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/DppUtils.java diff --git a/spark-load/spark-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/GlobalDictBuilder.java b/spark-load/spark-load-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/GlobalDictBuilder.java similarity index 100% rename from spark-load/spark-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/GlobalDictBuilder.java rename to spark-load/spark-load-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/GlobalDictBuilder.java diff --git a/spark-load/spark-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/MinimumCoverageRollupTreeBuilder.java b/spark-load/spark-load-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/MinimumCoverageRollupTreeBuilder.java similarity index 100% rename from spark-load/spark-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/MinimumCoverageRollupTreeBuilder.java rename to spark-load/spark-load-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/MinimumCoverageRollupTreeBuilder.java diff --git a/spark-load/spark-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/RollupTreeBuilder.java b/spark-load/spark-load-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/RollupTreeBuilder.java similarity index 100% rename from spark-load/spark-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/RollupTreeBuilder.java rename to spark-load/spark-load-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/RollupTreeBuilder.java diff --git a/spark-load/spark-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/RollupTreeNode.java b/spark-load/spark-load-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/RollupTreeNode.java similarity index 100% rename from spark-load/spark-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/RollupTreeNode.java rename to spark-load/spark-load-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/RollupTreeNode.java diff --git a/spark-load/spark-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/SparkDpp.java b/spark-load/spark-load-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/SparkDpp.java similarity index 100% rename from spark-load/spark-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/SparkDpp.java rename to spark-load/spark-load-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/SparkDpp.java diff --git a/spark-load/spark-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/SparkRDDAggregator.java b/spark-load/spark-load-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/SparkRDDAggregator.java similarity index 100% rename from spark-load/spark-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/SparkRDDAggregator.java rename to 
spark-load/spark-load-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/SparkRDDAggregator.java diff --git a/spark-load/spark-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/StringAccumulator.java b/spark-load/spark-load-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/StringAccumulator.java similarity index 100% rename from spark-load/spark-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/StringAccumulator.java rename to spark-load/spark-load-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/StringAccumulator.java diff --git a/spark-load/spark-dpp/src/main/java/org/apache/doris/load/loadv2/etl/SparkEtlJob.java b/spark-load/spark-load-dpp/src/main/java/org/apache/doris/load/loadv2/etl/SparkEtlJob.java similarity index 100% rename from spark-load/spark-dpp/src/main/java/org/apache/doris/load/loadv2/etl/SparkEtlJob.java rename to spark-load/spark-load-dpp/src/main/java/org/apache/doris/load/loadv2/etl/SparkEtlJob.java diff --git a/spark-load/spark-dpp/src/test/java/org/apache/doris/load/loadv2/dpp/ColumnParserTest.java b/spark-load/spark-load-dpp/src/test/java/org/apache/doris/load/loadv2/dpp/ColumnParserTest.java similarity index 100% rename from spark-load/spark-dpp/src/test/java/org/apache/doris/load/loadv2/dpp/ColumnParserTest.java rename to spark-load/spark-load-dpp/src/test/java/org/apache/doris/load/loadv2/dpp/ColumnParserTest.java diff --git a/spark-load/spark-dpp/src/test/java/org/apache/doris/load/loadv2/dpp/DorisRangePartitionerTest.java b/spark-load/spark-load-dpp/src/test/java/org/apache/doris/load/loadv2/dpp/DorisRangePartitionerTest.java similarity index 100% rename from spark-load/spark-dpp/src/test/java/org/apache/doris/load/loadv2/dpp/DorisRangePartitionerTest.java rename to spark-load/spark-load-dpp/src/test/java/org/apache/doris/load/loadv2/dpp/DorisRangePartitionerTest.java diff --git a/spark-load/spark-dpp/src/test/java/org/apache/doris/load/loadv2/dpp/DppUtilsTest.java b/spark-load/spark-load-dpp/src/test/java/org/apache/doris/load/loadv2/dpp/DppUtilsTest.java similarity index 100% rename from spark-load/spark-dpp/src/test/java/org/apache/doris/load/loadv2/dpp/DppUtilsTest.java rename to spark-load/spark-load-dpp/src/test/java/org/apache/doris/load/loadv2/dpp/DppUtilsTest.java diff --git a/spark-load/spark-dpp/src/test/java/org/apache/doris/load/loadv2/dpp/MinimumCoverageRollupTreeBuilderTest.java b/spark-load/spark-load-dpp/src/test/java/org/apache/doris/load/loadv2/dpp/MinimumCoverageRollupTreeBuilderTest.java similarity index 100% rename from spark-load/spark-dpp/src/test/java/org/apache/doris/load/loadv2/dpp/MinimumCoverageRollupTreeBuilderTest.java rename to spark-load/spark-load-dpp/src/test/java/org/apache/doris/load/loadv2/dpp/MinimumCoverageRollupTreeBuilderTest.java diff --git a/spark-load/spark-dpp/src/test/java/org/apache/doris/load/loadv2/dpp/SparkDppTest.java b/spark-load/spark-load-dpp/src/test/java/org/apache/doris/load/loadv2/dpp/SparkDppTest.java similarity index 100% rename from spark-load/spark-dpp/src/test/java/org/apache/doris/load/loadv2/dpp/SparkDppTest.java rename to spark-load/spark-load-dpp/src/test/java/org/apache/doris/load/loadv2/dpp/SparkDppTest.java diff --git a/spark-load/spark-dpp/src/test/java/org/apache/doris/load/loadv2/etl/SparkEtlJobTest.java b/spark-load/spark-load-dpp/src/test/java/org/apache/doris/load/loadv2/etl/SparkEtlJobTest.java similarity index 100% rename from spark-load/spark-dpp/src/test/java/org/apache/doris/load/loadv2/etl/SparkEtlJobTest.java rename to 
spark-load/spark-load-dpp/src/test/java/org/apache/doris/load/loadv2/etl/SparkEtlJobTest.java From 8731d501b4bc852d483a48ada221f349cd3f6b36 Mon Sep 17 00:00:00 2001 From: gnehil Date: Thu, 23 May 2024 18:16:30 +0800 Subject: [PATCH 12/45] rename fs util --- ...{HadoopUtils.java => FileSystemUtils.java} | 55 ++++++++++--------- 1 file changed, 28 insertions(+), 27 deletions(-) rename spark-load/spark-load-core/src/main/java/org/apache/doris/util/{HadoopUtils.java => FileSystemUtils.java} (63%) diff --git a/spark-load/spark-load-core/src/main/java/org/apache/doris/util/HadoopUtils.java b/spark-load/spark-load-core/src/main/java/org/apache/doris/util/FileSystemUtils.java similarity index 63% rename from spark-load/spark-load-core/src/main/java/org/apache/doris/util/HadoopUtils.java rename to spark-load/spark-load-core/src/main/java/org/apache/doris/util/FileSystemUtils.java index 387a6b1e..a4998b79 100644 --- a/spark-load/spark-load-core/src/main/java/org/apache/doris/util/HadoopUtils.java +++ b/spark-load/spark-load-core/src/main/java/org/apache/doris/util/FileSystemUtils.java @@ -2,7 +2,6 @@ import org.apache.doris.config.JobConfig; -import org.apache.commons.lang3.StringUtils; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FSDataInputStream; import org.apache.hadoop.fs.FSDataOutputStream; @@ -17,24 +16,19 @@ import java.nio.charset.StandardCharsets; import java.util.Map; -public class HadoopUtils { +public class FileSystemUtils { - private static final String DEFAULT_FS_KEY = "fs.defaultFS"; - - private static FileSystem getFs(JobConfig config) throws IOException { + private static FileSystem getFs(JobConfig config, Path path) throws IOException { Configuration conf = new Configuration(); Map props = config.getHadoopProperties(); props.forEach(conf::set); - String defaultFs = props.getOrDefault(DEFAULT_FS_KEY, ""); - if (StringUtils.isBlank(defaultFs)) { - throw new IllegalArgumentException("fs.defaultFS is not set"); - } - return FileSystem.get(conf); + return FileSystem.get(path.toUri(), conf); } public static void createFile(JobConfig config, String content, String path, Boolean overwrite) throws IOException { - try (FileSystem fs = getFs(config)) { - FSDataOutputStream outputStream = fs.create(new Path(path), overwrite); + Path p = new Path(path); + try (FileSystem fs = getFs(config, p)) { + FSDataOutputStream outputStream = fs.create(p, overwrite); outputStream.write(content.getBytes(StandardCharsets.UTF_8)); outputStream.close(); } @@ -42,34 +36,38 @@ public static void createFile(JobConfig config, String content, String path, Boo public static void createFile(JobConfig config, byte[] contentBytes, String path, Boolean overwrite) throws IOException { - try (FileSystem fs = getFs(config)) { - FSDataOutputStream outputStream = fs.create(new Path(path), overwrite); + Path p = new Path(path); + try (FileSystem fs = getFs(config, p)) { + FSDataOutputStream outputStream = fs.create(p, overwrite); outputStream.write(contentBytes); outputStream.close(); } } public static void delete(JobConfig config, String path) throws IOException { - try (FileSystem fs = getFs(config)) { - fs.delete(new Path(path), true); + Path p = new Path(path); + try (FileSystem fs = getFs(config, p)) { + fs.delete(p, true); } } public static boolean exists(JobConfig config, String path) throws IOException { - try (FileSystem fs = getFs(config)) { - return fs.exists(new Path(path)); + Path p = new Path(path); + try (FileSystem fs = getFs(config, p)) { + return fs.exists(p); } } public static 
FileStatus[] list(JobConfig config, String path) throws IOException { - try (FileSystem fs = getFs(config)) { - return fs.listStatus(new Path(path)); + Path p = new Path(path); + try (FileSystem fs = getFs(config, p)) { + return fs.listStatus(p); } } public static String readFile(JobConfig config, String path) throws IOException { - try (FileSystem fs = getFs(config)) { - Path p = new Path(path); + Path p = new Path(path); + try (FileSystem fs = getFs(config, p)) { if (fs.exists(p) && fs.getFileStatus(p).isFile()) { FSDataInputStream inputStream = fs.open(p); BufferedReader reader = new BufferedReader(new InputStreamReader(inputStream)); @@ -80,19 +78,22 @@ public static String readFile(JobConfig config, String path) throws IOException } return sb.toString(); } + throw new UnsupportedOperationException("read file is not exist or is not a file, path: " + path); } - return null; } public static void move(JobConfig config, String src, String dst) throws IOException { - try (FileSystem fs = getFs(config)) { - fs.rename(new Path(src), new Path(dst)); + Path srcPath = new Path(src); + Path dstpath = new Path(dst); + try (FileSystem fs = getFs(config, srcPath)) { + fs.rename(srcPath, dstpath); } } public static void mkdir(JobConfig config, String path) throws IOException { - try (FileSystem fs = getFs(config)) { - fs.mkdirs(new Path(path), new FsPermission(644)); + Path p = new Path(path); + try (FileSystem fs = getFs(config, p)) { + fs.mkdirs(p, new FsPermission(644)); } } From e1eedc076fa97211d650a0cde3d8ae80779a972e Mon Sep 17 00:00:00 2001 From: gnehil Date: Thu, 23 May 2024 18:16:50 +0800 Subject: [PATCH 13/45] rename default app jar name --- .../src/main/java/org/apache/doris/config/JobConfig.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spark-load/spark-load-core/src/main/java/org/apache/doris/config/JobConfig.java b/spark-load/spark-load-core/src/main/java/org/apache/doris/config/JobConfig.java index 42884a05..7bba2fc6 100644 --- a/spark-load/spark-load-core/src/main/java/org/apache/doris/config/JobConfig.java +++ b/spark-load/spark-load-core/src/main/java/org/apache/doris/config/JobConfig.java @@ -113,7 +113,7 @@ public static class SparkInfo { private String driverMemory; private String dppJarPath = - SparkLoadRunner.SPARK_LOAD_HOME + "/spark-dpp-1.0-SNAPSHOT-jar-with-dependencies.jar"; + SparkLoadRunner.SPARK_LOAD_HOME + "/spark-load-dpp-1.0-SNAPSHOT-jar-with-dependencies.jar"; private Map properties = Collections.emptyMap(); From 262a3b91f62902dabcb10f780c213bbc4fd999ef Mon Sep 17 00:00:00 2001 From: gnehil Date: Thu, 23 May 2024 18:17:08 +0800 Subject: [PATCH 14/45] add hadoop aws dependency --- spark-load/spark-load-core/pom.xml | 39 ++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) diff --git a/spark-load/spark-load-core/pom.xml b/spark-load/spark-load-core/pom.xml index 8c516836..8421506b 100644 --- a/spark-load/spark-load-core/pom.xml +++ b/spark-load/spark-load-core/pom.xml @@ -94,6 +94,45 @@ + + org.apache.hadoop + hadoop-aws + + + org.slf4j + slf4j-log4j12 + + + log4j + log4j + + + servlet-api + javax.servlet + + + + com.amazonaws + aws-java-sdk-s3 + + + com.amazonaws + aws-java-sdk-bundle + + + + + com.amazonaws + aws-java-sdk-s3 + + + com.amazonaws + aws-java-sdk-glue + + + com.amazonaws + aws-java-sdk-dynamodb + \ No newline at end of file From d46f4ef9a0951c3934fdd17e05530d4822ee7b9b Mon Sep 17 00:00:00 2001 From: gnehil Date: Thu, 23 May 2024 18:17:15 +0800 Subject: [PATCH 15/45] add hadoop aws dependency --- spark-load/pom.xml | 55 
+++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 54 insertions(+), 1 deletion(-) diff --git a/spark-load/pom.xml b/spark-load/pom.xml index 1ae40197..29ee383c 100644 --- a/spark-load/pom.xml +++ b/spark-load/pom.xml @@ -10,7 +10,7 @@ pom spark-load-core - spark-dpp + spark-load-dpp @@ -36,6 +36,7 @@ 2.17.1 2.0.7 1.2 + 1.12.669 @@ -151,6 +152,55 @@ + + org.apache.hadoop + hadoop-aws + ${hadoop.version} + + + org.slf4j + slf4j-log4j12 + + + log4j + log4j + + + servlet-api + javax.servlet + + + + com.amazonaws + aws-java-sdk-s3 + + + com.amazonaws + aws-java-sdk-bundle + + + + + com.amazonaws + aws-java-sdk-s3 + ${aws-java-sdk.version} + + + com.amazonaws + aws-java-sdk-glue + ${aws-java-sdk.version} + + + com.amazonaws + aws-java-sdk-dynamodb + ${aws-java-sdk.version} + + + + com.amazonaws + aws-java-sdk-logs + ${aws-java-sdk.version} + org.apache.parquet parquet-column @@ -229,6 +279,9 @@ httpclient5 ${httpclient5.version} + + + org.junit.jupiter From 24f4be8e53cfef38adfef5899dd799bf35e0eb91 Mon Sep 17 00:00:00 2001 From: gnehil Date: Thu, 23 May 2024 18:18:35 +0800 Subject: [PATCH 16/45] change exception message --- .../org/apache/doris/load/job/PullLoader.java | 52 ++++++++++--------- 1 file changed, 27 insertions(+), 25 deletions(-) diff --git a/spark-load/spark-load-core/src/main/java/org/apache/doris/load/job/PullLoader.java b/spark-load/spark-load-core/src/main/java/org/apache/doris/load/job/PullLoader.java index ca770516..54b4253b 100644 --- a/spark-load/spark-load-core/src/main/java/org/apache/doris/load/job/PullLoader.java +++ b/spark-load/spark-load-core/src/main/java/org/apache/doris/load/job/PullLoader.java @@ -11,7 +11,7 @@ import org.apache.doris.exception.SparkLoadException; import org.apache.doris.sparkdpp.EtlJobConfig; import org.apache.doris.util.DateUtils; -import org.apache.doris.util.HadoopUtils; +import org.apache.doris.util.FileSystemUtils; import org.apache.doris.util.JsonUtils; import org.apache.commons.lang3.StringUtils; @@ -58,7 +58,6 @@ public void prepare() throws SparkLoadException { loadMeta = feClient.createSparkLoad(jobConfig.getDatabase(), tableToPartition, jobConfig.getLabel(), jobConfig.getJobProperties()); etlJobConfig = loadMeta.getEtlJobConfig(jobConfig); - jobStatus = JobStatus.SUCCESS; } @Override @@ -73,9 +72,8 @@ public void execute() throws SparkLoadException { String etlJobConfPath = etlJobConfig.outputPath + "/configs/jobconfig.json"; try { - HadoopUtils.createFile(jobConfig, etlJobConfig.configToJson(), etlJobConfPath, true); + FileSystemUtils.createFile(jobConfig, etlJobConfig.configToJson(), etlJobConfPath, true); } catch (IOException e) { - LOG.error("create job config file failed", e); throw new SparkLoadException("create job config file failed", e); } @@ -109,12 +107,16 @@ public void afterFinished() throws SparkLoadException { String dppResultStr = null; int checkCnt = 0; while (checkCnt < 3) { - dppResultStr = getDppResultString(); + try { + dppResultStr = getDppResultString(); + } catch (UnsupportedOperationException e) { + LOG.warn("retry get dpp result", e); + checkCnt++; + LockSupport.parkNanos(Duration.ofMillis(500).toNanos()); + } if (dppResultStr != null) { break; } - checkCnt++; - LockSupport.parkNanos(Duration.ofMillis(500).toNanos()); } if (dppResultStr == null) { throw new SparkLoadException("get dpp result str failed"); @@ -130,7 +132,7 @@ public void afterFinished() throws SparkLoadException { LoadInfo loadInfo = feClient.getLoadInfo(jobConfig.getDatabase(), jobConfig.getLabel()); switch 
(loadInfo.getState().toUpperCase(Locale.ROOT)) { case "FINISHED": - LOG.info("loading job finished"); + LOG.info("load job finished."); try { cleanOutputPath(); } catch (IOException e) { @@ -138,8 +140,7 @@ public void afterFinished() throws SparkLoadException { } return; case "CANCELLED": - LOG.error("loading job failed, failed msg: " + loadInfo.getFailMsg()); - throw new SparkLoadException("loading job failed, " + loadInfo.getFailMsg()); + throw new SparkLoadException("load job failed, " + loadInfo.getFailMsg()); default: LOG.info("load job unfinished, state: " + loadInfo.getState()); break; @@ -172,18 +173,18 @@ public boolean canBeRecovered() throws SparkLoadException { String outputPath = etlJobConfig.getOutputPath(); String parentOutputPath = outputPath.substring(0, StringUtils.lastIndexOf(outputPath, "/")); try { - if (HadoopUtils.exists(jobConfig, parentOutputPath)) { - FileStatus[] fileStatuses = HadoopUtils.list(jobConfig, parentOutputPath); + if (FileSystemUtils.exists(jobConfig, parentOutputPath)) { + FileStatus[] fileStatuses = FileSystemUtils.list(jobConfig, parentOutputPath); if (fileStatuses.length != 1) { return false; } - fileStatuses = HadoopUtils.list(jobConfig, fileStatuses[0].getPath().toString()); + fileStatuses = FileSystemUtils.list(jobConfig, fileStatuses[0].getPath().toString()); boolean hasDppResult = false; for (FileStatus fileStatus : fileStatuses) { String fileName = fileStatus.getPath().getName(); if (DPP_RESULT_JSON.equalsIgnoreCase(fileName)) { hasDppResult = true; - String content = HadoopUtils.readFile(jobConfig, fileStatus.getPath().toString()); + String content = FileSystemUtils.readFile(jobConfig, fileStatus.getPath().toString()); DppResult dppResult = JsonUtils.readValue(content, DppResult.class); if (!checkDppResult(dppResult)) { LOG.info("previous etl job is failed, cannot be recovered"); @@ -192,7 +193,7 @@ public boolean canBeRecovered() throws SparkLoadException { } // check meta consist if (LOAD_META_JSON.equalsIgnoreCase(fileName)) { - String content = HadoopUtils.readFile(jobConfig, fileStatus.getPath().toString()); + String content = FileSystemUtils.readFile(jobConfig, fileStatus.getPath().toString()); LoadMeta oldLoadMeta = JsonUtils.readValue(content, LoadMeta.class); for (Map.Entry entry : loadMeta.getTableMeta().entrySet()) { TableMeta tableMeta = entry.getValue(); @@ -255,10 +256,11 @@ public void prepareRecover() throws SparkLoadException { String outputPath = etlJobConfig.getOutputPath(); String parentOutputPath = outputPath.substring(0, StringUtils.lastIndexOf(outputPath, "/")); try { - FileStatus[] fileStatuses = HadoopUtils.list(jobConfig, parentOutputPath); - HadoopUtils.move(jobConfig, fileStatuses[0].getPath().toString(), outputPath); - HadoopUtils.delete(jobConfig, outputPath + "/load_meta.json"); + FileStatus[] fileStatuses = FileSystemUtils.list(jobConfig, parentOutputPath); + FileSystemUtils.move(jobConfig, fileStatuses[0].getPath().toString(), outputPath); + FileSystemUtils.delete(jobConfig, outputPath + "/load_meta.json"); uploadMetaInfo(loadMeta, etlJobConfig.getOutputPath()); + jobStatus = JobStatus.SUCCESS; } catch (IOException e) { throw new SparkLoadException("prepare recovery failed", e); } @@ -274,10 +276,10 @@ private boolean checkDppResult(DppResult dppResult) { private void uploadMetaInfo(LoadMeta metaInfo, String outputPath) throws SparkLoadException { try { - if (!HadoopUtils.exists(jobConfig, outputPath)) { - HadoopUtils.mkdir(jobConfig, outputPath); + if (!FileSystemUtils.exists(jobConfig, outputPath)) { + 
FileSystemUtils.mkdir(jobConfig, outputPath); } - HadoopUtils.createFile(jobConfig, JsonUtils.writeValueAsBytes(metaInfo), + FileSystemUtils.createFile(jobConfig, JsonUtils.writeValueAsBytes(metaInfo), outputPath + "/load_meta.json", true); } catch (IOException e) { throw new SparkLoadException("upload load meta failed", e); @@ -301,15 +303,15 @@ protected String getLogPath() { } public void cleanOutputPath() throws IOException { - if (HadoopUtils.exists(jobConfig, etlJobConfig.outputPath)) { + if (FileSystemUtils.exists(jobConfig, etlJobConfig.outputPath)) { LOG.info("clean output: " + etlJobConfig.outputPath); - HadoopUtils.delete(jobConfig, etlJobConfig.outputPath); + FileSystemUtils.delete(jobConfig, etlJobConfig.outputPath); } } private String getDppResultString() throws SparkLoadException { try { - return HadoopUtils.readFile(jobConfig, etlJobConfig.outputPath + "/dpp_result.json"); + return FileSystemUtils.readFile(jobConfig, etlJobConfig.outputPath + "/dpp_result.json"); } catch (IOException e) { throw new SparkLoadException("get dpp result failed", e); } @@ -318,7 +320,7 @@ private String getDppResultString() throws SparkLoadException { private Map getFilePathToSize() throws SparkLoadException { Map filePathToSize = new HashMap<>(); try { - FileStatus[] fileStatuses = HadoopUtils.list(jobConfig, etlJobConfig.outputPath); + FileStatus[] fileStatuses = FileSystemUtils.list(jobConfig, etlJobConfig.outputPath); for (FileStatus fileStatus : fileStatuses) { if (fileStatus.isDirectory()) { continue; From 2b78bad891f657822723122315c0eda238c120d9 Mon Sep 17 00:00:00 2001 From: gnehil Date: Thu, 23 May 2024 18:18:54 +0800 Subject: [PATCH 17/45] add load cancel method --- .../main/java/org/apache/doris/load/job/Loader.java | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/spark-load/spark-load-core/src/main/java/org/apache/doris/load/job/Loader.java b/spark-load/spark-load-core/src/main/java/org/apache/doris/load/job/Loader.java index 18da95f1..dc4014e5 100644 --- a/spark-load/spark-load-core/src/main/java/org/apache/doris/load/job/Loader.java +++ b/spark-load/spark-load-core/src/main/java/org/apache/doris/load/job/Loader.java @@ -77,6 +77,17 @@ private SparkAppHandle submitSparkJob(String mainClass, String[] appArgs, String return launcher.startApplication(); } + public void cancel() { + if (jobStatus == JobStatus.RUNNING) { + if (appHandle != null) { + appHandle.kill(); + } + } else if (jobStatus == JobStatus.SUCCESS) { + jobStatus = JobStatus.FAILED; + afterFailed(new SparkLoadException("load client cancelled.")); + } + } + protected abstract String getMainClass(); protected abstract String[] getAppArgs(); From cdf9318e0ca90d55afae9bbf790aaed7a55b7b91 Mon Sep 17 00:00:00 2001 From: gnehil Date: Thu, 23 May 2024 18:19:06 +0800 Subject: [PATCH 18/45] add shutdown hook --- .../src/main/java/org/apache/doris/SparkLoadRunner.java | 1 + 1 file changed, 1 insertion(+) diff --git a/spark-load/spark-load-core/src/main/java/org/apache/doris/SparkLoadRunner.java b/spark-load/spark-load-core/src/main/java/org/apache/doris/SparkLoadRunner.java index b6efc4e1..36d68ba7 100644 --- a/spark-load/spark-load-core/src/main/java/org/apache/doris/SparkLoadRunner.java +++ b/spark-load/spark-load-core/src/main/java/org/apache/doris/SparkLoadRunner.java @@ -56,6 +56,7 @@ public static void main(String[] args) { LoadManager loadManager = LoadManager.getInstance(); Loader loader = loadManager.createLoader(jobConfig, cmdOptions.getRecovery()); + Runtime.getRuntime().addShutdownHook(new 
Thread(loader::cancel)); try { loader.prepare(); From 85aa2481b2e7ba48e633e18a50fc8ad00519693d Mon Sep 17 00:00:00 2001 From: gnehil Date: Wed, 12 Jun 2024 16:32:42 +0800 Subject: [PATCH 19/45] complete cancel job --- .../main/java/org/apache/doris/load/job/Loader.java | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/spark-load/spark-load-core/src/main/java/org/apache/doris/load/job/Loader.java b/spark-load/spark-load-core/src/main/java/org/apache/doris/load/job/Loader.java index dc4014e5..93e88f61 100644 --- a/spark-load/spark-load-core/src/main/java/org/apache/doris/load/job/Loader.java +++ b/spark-load/spark-load-core/src/main/java/org/apache/doris/load/job/Loader.java @@ -80,12 +80,15 @@ private SparkAppHandle submitSparkJob(String mainClass, String[] appArgs, String public void cancel() { if (jobStatus == JobStatus.RUNNING) { if (appHandle != null) { - appHandle.kill(); + try { + appHandle.stop(); + } catch (Exception e) { + appHandle.kill(); + } } - } else if (jobStatus == JobStatus.SUCCESS) { - jobStatus = JobStatus.FAILED; - afterFailed(new SparkLoadException("load client cancelled.")); } + jobStatus = JobStatus.FAILED; + afterFailed(new SparkLoadException("load client cancelled.")); } protected abstract String getMainClass(); From 8a5c2857501b704406c14fb58b01db517249aeeb Mon Sep 17 00:00:00 2001 From: gnehil Date: Wed, 12 Jun 2024 16:33:58 +0800 Subject: [PATCH 20/45] add default config value --- .../main/java/org/apache/doris/config/JobConfig.java | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/spark-load/spark-load-core/src/main/java/org/apache/doris/config/JobConfig.java b/spark-load/spark-load-core/src/main/java/org/apache/doris/config/JobConfig.java index 7bba2fc6..c7c179a2 100644 --- a/spark-load/spark-load-core/src/main/java/org/apache/doris/config/JobConfig.java +++ b/spark-load/spark-load-core/src/main/java/org/apache/doris/config/JobConfig.java @@ -96,13 +96,18 @@ public Map toEtlColumnMappingMap() { @Data public static class SparkInfo { + private static final String DEFAULT_DEPLOY_MODE = "client"; + + private static final String DEFAULT_DPP_JAR_PATH = + SparkLoadRunner.SPARK_LOAD_HOME + "/app/spark-load-dpp-1.0-SNAPSHOT.jar"; + private String sparkHome; private String workingDir; private String master; - private String deployMode; + private String deployMode = DEFAULT_DEPLOY_MODE; private Integer numExecutors; @@ -112,8 +117,7 @@ public static class SparkInfo { private String driverMemory; - private String dppJarPath = - SparkLoadRunner.SPARK_LOAD_HOME + "/spark-load-dpp-1.0-SNAPSHOT-jar-with-dependencies.jar"; + private String dppJarPath = DEFAULT_DPP_JAR_PATH; private Map properties = Collections.emptyMap(); From 3bfb068902e86dc022483499e4d8f8262a0137a0 Mon Sep 17 00:00:00 2001 From: gnehil Date: Wed, 12 Jun 2024 16:34:32 +0800 Subject: [PATCH 21/45] rename and remove useless getInstance method --- .../org/apache/doris/load/LoadManager.java | 37 ------------------- .../org/apache/doris/load/LoaderFactory.java | 19 ++++++++++ 2 files changed, 19 insertions(+), 37 deletions(-) delete mode 100644 spark-load/spark-load-core/src/main/java/org/apache/doris/load/LoadManager.java create mode 100644 spark-load/spark-load-core/src/main/java/org/apache/doris/load/LoaderFactory.java diff --git a/spark-load/spark-load-core/src/main/java/org/apache/doris/load/LoadManager.java b/spark-load/spark-load-core/src/main/java/org/apache/doris/load/LoadManager.java deleted file mode 100644 index f0fa6546..00000000 --- 
a/spark-load/spark-load-core/src/main/java/org/apache/doris/load/LoadManager.java +++ /dev/null @@ -1,37 +0,0 @@ -package org.apache.doris.load; - -import org.apache.doris.config.JobConfig; -import org.apache.doris.load.job.Loader; -import org.apache.doris.load.job.PullLoader; - -import org.apache.log4j.LogManager; -import org.apache.log4j.Logger; - -public class LoadManager { - - private static final Logger LOG = LogManager.getLogger(LoadManager.class); - - private static volatile LoadManager INSTANCE = null; - - public static LoadManager getInstance() { - if (INSTANCE == null) { - synchronized (LoadManager.class) { - if (INSTANCE == null) { - INSTANCE = new LoadManager(); - } - } - } - return INSTANCE; - } - - public Loader createLoader(JobConfig jobConfig, Boolean isRecoveryMode) { - switch (jobConfig.getLoadMode()) { - case PULL: - return new PullLoader(jobConfig, isRecoveryMode); - case PUSH: - default: - throw new UnsupportedOperationException(); - } - } - -} diff --git a/spark-load/spark-load-core/src/main/java/org/apache/doris/load/LoaderFactory.java b/spark-load/spark-load-core/src/main/java/org/apache/doris/load/LoaderFactory.java new file mode 100644 index 00000000..196d65ba --- /dev/null +++ b/spark-load/spark-load-core/src/main/java/org/apache/doris/load/LoaderFactory.java @@ -0,0 +1,19 @@ +package org.apache.doris.load; + +import org.apache.doris.config.JobConfig; +import org.apache.doris.load.job.Loader; +import org.apache.doris.load.job.PullLoader; + +public class LoaderFactory { + + public static Loader createLoader(JobConfig jobConfig, Boolean isRecoveryMode) { + switch (jobConfig.getLoadMode()) { + case PULL: + return new PullLoader(jobConfig, isRecoveryMode); + case PUSH: + default: + throw new UnsupportedOperationException(); + } + } + +} From 38edddfa96c6638094e67077e2b5310e46c30940 Mon Sep 17 00:00:00 2001 From: gnehil Date: Wed, 12 Jun 2024 16:34:52 +0800 Subject: [PATCH 22/45] fill columns and column from path as empty list --- .../org/apache/doris/common/meta/LoadMeta.java | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/spark-load/spark-load-core/src/main/java/org/apache/doris/common/meta/LoadMeta.java b/spark-load/spark-load-core/src/main/java/org/apache/doris/common/meta/LoadMeta.java index 1f019bcb..8ecb138a 100644 --- a/spark-load/spark-load-core/src/main/java/org/apache/doris/common/meta/LoadMeta.java +++ b/spark-load/spark-load-core/src/main/java/org/apache/doris/common/meta/LoadMeta.java @@ -7,6 +7,7 @@ import com.google.common.annotations.VisibleForTesting; import lombok.Data; +import org.apache.commons.lang3.StringUtils; import java.util.Arrays; import java.util.Collections; @@ -51,10 +52,15 @@ public EtlJobConfig getEtlJobConfig(JobConfig jobConfig) throws SparkLoadExcepti partitionIds); break; case FILE: - List columnList = - Arrays.stream(taskInfo.getColumns().split(",")).collect(Collectors.toList()); - List columnFromPathList = taskInfo.getColumnFromPath() == null ? 
Collections.emptyList() : - Arrays.stream(taskInfo.getColumnFromPath().split(",")).collect(Collectors.toList()); + List columnList = Collections.emptyList(); + if (StringUtils.isNoneBlank(taskInfo.getColumns())) { + columnList = Arrays.stream(taskInfo.getColumns().split(",")).collect(Collectors.toList()); + } + List columnFromPathList = Collections.emptyList(); + if (StringUtils.isNoneBlank(taskInfo.getColumnFromPath())) { + columnFromPathList = + Arrays.stream(taskInfo.getColumnFromPath().split(",")).collect(Collectors.toList()); + } fileGroup = new EtlJobConfig.EtlFileGroup(EtlJobConfig.SourceType.FILE, taskInfo.getPaths(), columnList, columnFromPathList, taskInfo.getFieldSep(), taskInfo.getLineDelim(), false, @@ -79,7 +85,7 @@ public EtlJobConfig getEtlJobConfig(JobConfig jobConfig) throws SparkLoadExcepti @VisibleForTesting public void checkMapping(EtlJobConfig.EtlTable etlTable, - Map columnMappingMap) throws SparkLoadException { + Map columnMappingMap) throws SparkLoadException { Optional baseIdx = etlTable.indexes.stream().filter(idx -> idx.isBaseIndex).findFirst(); if (baseIdx.isPresent()) { EtlJobConfig.EtlIndex etlIndex = baseIdx.get(); From 78257afeb6e59f3a37a00decf6fa216ed7a05e59 Mon Sep 17 00:00:00 2001 From: gnehil Date: Wed, 12 Jun 2024 16:35:39 +0800 Subject: [PATCH 23/45] loader factory --- .../src/main/java/org/apache/doris/SparkLoadRunner.java | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/spark-load/spark-load-core/src/main/java/org/apache/doris/SparkLoadRunner.java b/spark-load/spark-load-core/src/main/java/org/apache/doris/SparkLoadRunner.java index 36d68ba7..1c6b0e5d 100644 --- a/spark-load/spark-load-core/src/main/java/org/apache/doris/SparkLoadRunner.java +++ b/spark-load/spark-load-core/src/main/java/org/apache/doris/SparkLoadRunner.java @@ -2,7 +2,7 @@ import org.apache.doris.common.CommandLineOptions; import org.apache.doris.config.JobConfig; -import org.apache.doris.load.LoadManager; +import org.apache.doris.load.LoaderFactory; import org.apache.doris.load.job.Loader; import org.apache.doris.load.job.Recoverable; import org.apache.doris.util.JsonUtils; @@ -54,8 +54,7 @@ public static void main(String[] args) { System.exit(-1); } - LoadManager loadManager = LoadManager.getInstance(); - Loader loader = loadManager.createLoader(jobConfig, cmdOptions.getRecovery()); + Loader loader = LoaderFactory.createLoader(jobConfig, cmdOptions.getRecovery()); Runtime.getRuntime().addShutdownHook(new Thread(loader::cancel)); try { From 17b4773c26f421b8487b15f78cc3f76e36cc22f9 Mon Sep 17 00:00:00 2001 From: gnehil Date: Wed, 12 Jun 2024 16:36:09 +0800 Subject: [PATCH 24/45] package --- spark-load/pom.xml | 66 +++++++------- spark-load/spark-load-core/pom.xml | 2 +- spark-load/spark-load-dist/pom.xml | 85 +++++++++++++++++++ .../src/main/assembly/assembly.xml | 53 ++++++++++++ .../src/main/bin/spark-load.sh | 39 +++++++++ 5 files changed, 209 insertions(+), 36 deletions(-) create mode 100644 spark-load/spark-load-dist/pom.xml create mode 100644 spark-load/spark-load-dist/src/main/assembly/assembly.xml create mode 100644 spark-load/spark-load-dist/src/main/bin/spark-load.sh diff --git a/spark-load/pom.xml b/spark-load/pom.xml index 29ee383c..742e143e 100644 --- a/spark-load/pom.xml +++ b/spark-load/pom.xml @@ -11,11 +11,12 @@ spark-load-core spark-load-dpp + spark-load-dist - 8 - 8 + 1.8 + 1.8 UTF-8 1.0-SNAPSHOT 1.2-SNAPSHOT @@ -27,7 +28,7 @@ 3.2.2 4.0.2 32.1.2-jre - 2.16.1 + 2.14.2 1.18.30 1.4 5.2.1 @@ -42,7 +43,7 @@ - ${project.groupId} + org.apache.doris 
fe-common ${doris.fe.version} @@ -66,6 +67,18 @@ org.slf4j slf4j-api + + com.fasterxml.jackson.core + jackson-core + + + com.fasterxml.jackson.core + jackson-databind + + + com.fasterxml.jackson.core + jackson-annotations + @@ -281,7 +294,6 @@ - org.junit.jupiter @@ -388,36 +400,20 @@ - - - org.apache.maven.plugins - maven-shade-plugin - 3.2.1 - - - - com.google.code.findbugs:* - org.slf4j:* - org.scala-lang:* - - - - - org.apache.hadoop - org.apache.doris.shaded.org.apache.hadoop - - - - - - package - - shade - - - - - + + + + org.apache.maven.plugins + maven-assembly-plugin + 3.1.1 + + + org.apache.maven.plugins + maven-shade-plugin + 3.2.1 + + + \ No newline at end of file diff --git a/spark-load/spark-load-core/pom.xml b/spark-load/spark-load-core/pom.xml index 8421506b..a9f2e5f2 100644 --- a/spark-load/spark-load-core/pom.xml +++ b/spark-load/spark-load-core/pom.xml @@ -62,7 +62,7 @@ hadoop-client - ${project.groupId} + org.apache.doris fe-common diff --git a/spark-load/spark-load-dist/pom.xml b/spark-load/spark-load-dist/pom.xml new file mode 100644 index 00000000..4f074f54 --- /dev/null +++ b/spark-load/spark-load-dist/pom.xml @@ -0,0 +1,85 @@ + + + 4.0.0 + + org.apache.doris + spark-load + ${revision} + + + pom + + spark-load-dist + + + 8 + 8 + UTF-8 + + + + + org.apache.doris + spark-load-core + ${revision} + + + org.apache.doris + spark-load-dpp + ${revision} + + + + + + + org.apache.maven.plugins + maven-dependency-plugin + 3.0.2 + + false + false + + + org.apache.doris + spark-load-dpp + ${project.version} + ${project.build.directory}/app + + + + + + + copy + + package + + + + + org.apache.maven.plugins + maven-assembly-plugin + + + bin + + single + + package + + ${project.parent.artifactId}-${project.version} + + src/main/assembly/assembly.xml + + ${project.parent.build.directory} + + + + + + + + \ No newline at end of file diff --git a/spark-load/spark-load-dist/src/main/assembly/assembly.xml b/spark-load/spark-load-dist/src/main/assembly/assembly.xml new file mode 100644 index 00000000..a6e2d292 --- /dev/null +++ b/spark-load/spark-load-dist/src/main/assembly/assembly.xml @@ -0,0 +1,53 @@ + + bin + + tar.gz + + true + ${project.parent.artifactId}-${project.version}-bin + + + + false + runtime + true + lib + + org.apache.doris:spark-load-dpp + + + + + + + ./src/main/bin + bin + + spark-load.sh + + unix + 0755 + + + ${project.build.directory}/lib + lib + 0755 + + + ${project.build.directory}/app + app + 0755 + + + ${project.build.directory}/../src/main/resources + conf + unix + 0755 + + *.yml + *.properties + logback*.xml + + + + \ No newline at end of file diff --git a/spark-load/spark-load-dist/src/main/bin/spark-load.sh b/spark-load/spark-load-dist/src/main/bin/spark-load.sh new file mode 100644 index 00000000..a7472427 --- /dev/null +++ b/spark-load/spark-load-dist/src/main/bin/spark-load.sh @@ -0,0 +1,39 @@ +#!/bin/bash + +if [ -z ${SPARK_LOAD_HOME} ]; then + cur_dir=$(dirname "$0")/../ + SPARK_LOAD_HOME=$(pwd ${cur_dir}) +fi + +export SPARK_LOAD_HOME + +if [[ -z "${JAVA_HOME}" ]]; then + if ! command -v java &>/dev/null; then + JAVA="" + else + JAVA="$(command -v java)" + fi +else + JAVA="${JAVA_HOME}/bin/java" +fi + +if [[ ! 
-x "${JAVA}" ]]; then + echo "The JAVA_HOME environment variable is not set correctly" + echo "This environment variable is required to run this program" + echo "Note: JAVA_HOME should point to a JDK and not a JRE" + echo "You can set JAVA_HOME in the fe.conf configuration file" + exit 1 +fi + +SPARK_LOAD_CORE_JAR= +for f in "${SPARK_LOAD_HOME}/lib"/*.jar; do + if [[ "${f}" == "spark-load-core"*".jar" ]]; then + SPARK_LOAD_CORE_JAR="${f}" + continue + fi + CLASSPATH="${f}:${CLASSPATH}" +done +CLASSPATH="${SPARK_LOAD_CORE_JAR}:${CLASSPATH}" +export CLASSPATH="${SPARK_LOAD_CORE_JAR}/conf:${CLASSPATH}:${SPARK_LOAD_CORE_JAR}/lib" + +${JAVA} org.apache.doris.SparkLoadRunner "$@" \ No newline at end of file From dcd7d8e997d08f53277a88bbc1be7cb74a6f8d27 Mon Sep 17 00:00:00 2001 From: gnehil Date: Tue, 18 Jun 2024 14:34:01 +0800 Subject: [PATCH 25/45] add license header --- .gitignore | 5 ++ .../doris/spark/sql/TestSparkConnector.scala | 73 +++++++++++++++++-- spark-load/pom.xml | 18 +++++ spark-load/spark-load-core/pom.xml | 18 +++++ .../org/apache/doris/SparkLoadRunner.java | 17 +++++ .../doris/common/CommandLineOptions.java | 17 +++++ .../org/apache/doris/common/Constants.java | 17 +++++ .../org/apache/doris/common/DppResult.java | 20 ++++- .../org/apache/doris/common/LoadInfo.java | 19 ++++- .../apache/doris/common/ResponseEntity.java | 17 +++++ .../apache/doris/common/enums/JobStatus.java | 17 +++++ .../apache/doris/common/enums/LoadMode.java | 17 +++++ .../doris/common/meta/LoadInfoResponse.java | 17 +++++ .../apache/doris/common/meta/LoadMeta.java | 19 ++++- .../apache/doris/common/meta/TableMeta.java | 22 +++++- .../org/apache/doris/config/JobConfig.java | 17 +++++ .../org/apache/doris/config/TaskType.java | 17 +++++ .../doris/exception/SparkLoadException.java | 17 +++++ .../org/apache/doris/load/JobMonitor.java | 13 ---- .../org/apache/doris/load/LoaderFactory.java | 17 +++++ .../apache/doris/load/TransactionManager.java | 17 ----- .../org/apache/doris/load/job/Loader.java | 19 ++++- .../org/apache/doris/load/job/PullLoader.java | 17 +++++ .../apache/doris/load/job/Recoverable.java | 17 +++++ .../java/org/apache/doris/util/DateUtils.java | 17 +++++ .../apache/doris/util/FileSystemUtils.java | 17 +++++ .../java/org/apache/doris/util/HttpUtils.java | 17 +++++ .../java/org/apache/doris/util/JsonUtils.java | 17 +++++ .../src/main/resources/log4j.properties | 17 +++++ .../doris/common/meta/LoadMetaTest.java | 17 +++++ spark-load/spark-load-dist/pom.xml | 18 +++++ .../src/main/assembly/assembly.xml | 19 +++++ .../src/main/bin/spark-load.sh | 16 ++++ 33 files changed, 574 insertions(+), 45 deletions(-) delete mode 100644 spark-load/spark-load-core/src/main/java/org/apache/doris/load/JobMonitor.java delete mode 100644 spark-load/spark-load-core/src/main/java/org/apache/doris/load/TransactionManager.java diff --git a/.gitignore b/.gitignore index 71e2e797..252ad80b 100644 --- a/.gitignore +++ b/.gitignore @@ -13,10 +13,15 @@ spark-load/spark-dpp/output/ spark-load/spark-dpp/target/ spark-load/spark-dpp/.idea/ +spark-load/target spark-load/spark-load-core/dependency-reduced-pom.xml spark-load/spark-load-core/output/ spark-load/spark-load-core/target/ spark-load/spark-load-core/.idea/ +spark-load/spark-load-dist/dependency-reduced-pom.xml +spark-load/spark-load-dist/target/ +spark-load/spark-load-dpp/dependency-reduced-pom.xml +spark-load/spark-load-dpp/target/ ### Java template diff --git a/spark-doris-connector/src/test/scala/org/apache/doris/spark/sql/TestSparkConnector.scala 
b/spark-doris-connector/src/test/scala/org/apache/doris/spark/sql/TestSparkConnector.scala index a5e756c1..c564f789 100644 --- a/spark-doris-connector/src/test/scala/org/apache/doris/spark/sql/TestSparkConnector.scala +++ b/spark-doris-connector/src/test/scala/org/apache/doris/spark/sql/TestSparkConnector.scala @@ -17,19 +17,19 @@ package org.apache.doris.spark.sql -import org.apache.spark.sql.SparkSession +import org.apache.spark.sql.types.{StringType, StructField, StructType} +import org.apache.spark.sql.{SaveMode, SparkSession} import org.apache.spark.{SparkConf, SparkContext} -import org.junit.Ignore -import org.junit.Test +import org.junit.{Ignore, Test} // This test need real connect info to run. // Set the connect info before comment out this @Ignore @Ignore class TestSparkConnector { - val dorisFeNodes = "your_fe_host:8030" + val dorisFeNodes = "10.16.10.6:8939" val dorisUser = "root" val dorisPwd = "" - val dorisTable = "test.test_tbl" + val dorisTable = "test.dwd_test" val kafkaServers = "" val kafkaTopics = "" @@ -111,11 +111,70 @@ class TestSparkConnector { .option("doris.fenodes", dorisFeNodes) .option("user", dorisUser) .option("password", dorisPwd) - .option("sink.batch.size",2) - .option("sink.max-retries",2) + .option("sink.batch.size", 2) + .option("sink.max-retries", 2) .start().awaitTermination() spark.stop() } + @Test + def sqlReadTest(): Unit = { + + val spark = SparkSession.builder() + .master("local") + .getOrCreate() + spark.sql( + s""" + |CREATE TEMPORARY VIEW t + |USING doris + |OPTIONS( + | "table.identifier"="${dorisTable}", + | "fenodes"="${dorisFeNodes}", + | "user"="${dorisUser}", + | "password"="${dorisPwd}" + |) + |""".stripMargin) + + spark.sql( + """ + |select * from t where dt = '2023-06-15' + |""".stripMargin) + // .explain() + .show(false) + + } + + @Test + def jsonDataWriteTest(): Unit = { + val schema = StructType(Array( + StructField("batch_id", StringType, true), + StructField("gen_uuid", StringType, true), + StructField("keyword", StringType, true), + StructField("step", StringType, true), + StructField("title", StringType, true), + StructField("original_keyword", StringType, true), + StructField("host_ip", StringType, true), + StructField("modify_at", StringType, true) + )) + val sparkSession = SparkSession.builder().appName("JSON DATA READ").master("local[*]").getOrCreate() + val df = sparkSession.read.schema(schema).json("/Users/gnehil/Downloads/social_google_trends_keyword_v2_fdc.json").coalesce(1) + // df.show(2) + df.write.format("doris").mode(SaveMode.Append).option( + "doris.table.identifier", "test.social_google_trends_keyword_v2_fdc_20240506" + ).option( + "doris.fenodes", "10.16.10.6:48733" + ).option("user", "root").option("password", "" + // ).option("doris.write.fields", fieldsString + ).option("sink.properties.format", "json" + ).option("sink.batch.size", 100000 + // ).option("doris.request.connect.timeout.ms", DORIS_REQUEST_CONNECT_TIMEOUT_MS + ).option( + "doris.query.port", 49733 + ).option( + "sink.max-retries", "1" + ).save() + sparkSession.stop() + } + } diff --git a/spark-load/pom.xml b/spark-load/pom.xml index 742e143e..6803a92d 100644 --- a/spark-load/pom.xml +++ b/spark-load/pom.xml @@ -1,4 +1,22 @@ + diff --git a/spark-load/spark-load-core/pom.xml b/spark-load/spark-load-core/pom.xml index a9f2e5f2..b1a31e82 100644 --- a/spark-load/spark-load-core/pom.xml +++ b/spark-load/spark-load-core/pom.xml @@ -1,4 +1,22 @@ + diff --git a/spark-load/spark-load-core/src/main/java/org/apache/doris/SparkLoadRunner.java 
b/spark-load/spark-load-core/src/main/java/org/apache/doris/SparkLoadRunner.java index 1c6b0e5d..7f56b27d 100644 --- a/spark-load/spark-load-core/src/main/java/org/apache/doris/SparkLoadRunner.java +++ b/spark-load/spark-load-core/src/main/java/org/apache/doris/SparkLoadRunner.java @@ -1,3 +1,20 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + package org.apache.doris; import org.apache.doris.common.CommandLineOptions; diff --git a/spark-load/spark-load-core/src/main/java/org/apache/doris/common/CommandLineOptions.java b/spark-load/spark-load-core/src/main/java/org/apache/doris/common/CommandLineOptions.java index 2a849bb5..8c66abcb 100644 --- a/spark-load/spark-load-core/src/main/java/org/apache/doris/common/CommandLineOptions.java +++ b/spark-load/spark-load-core/src/main/java/org/apache/doris/common/CommandLineOptions.java @@ -1,3 +1,20 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + package org.apache.doris.common; import lombok.Getter; diff --git a/spark-load/spark-load-core/src/main/java/org/apache/doris/common/Constants.java b/spark-load/spark-load-core/src/main/java/org/apache/doris/common/Constants.java index 60aec476..fb7498a3 100644 --- a/spark-load/spark-load-core/src/main/java/org/apache/doris/common/Constants.java +++ b/spark-load/spark-load-core/src/main/java/org/apache/doris/common/Constants.java @@ -1,3 +1,20 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + package org.apache.doris.common; public interface Constants { diff --git a/spark-load/spark-load-core/src/main/java/org/apache/doris/common/DppResult.java b/spark-load/spark-load-core/src/main/java/org/apache/doris/common/DppResult.java index 1574e5ae..a27445bb 100644 --- a/spark-load/spark-load-core/src/main/java/org/apache/doris/common/DppResult.java +++ b/spark-load/spark-load-core/src/main/java/org/apache/doris/common/DppResult.java @@ -1,3 +1,20 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + package org.apache.doris.common; import com.fasterxml.jackson.annotation.JsonProperty; @@ -50,5 +67,4 @@ public DppResult() { scannedBytes = 0; } - -} \ No newline at end of file +} diff --git a/spark-load/spark-load-core/src/main/java/org/apache/doris/common/LoadInfo.java b/spark-load/spark-load-core/src/main/java/org/apache/doris/common/LoadInfo.java index 4a45900d..1c7e904c 100644 --- a/spark-load/spark-load-core/src/main/java/org/apache/doris/common/LoadInfo.java +++ b/spark-load/spark-load-core/src/main/java/org/apache/doris/common/LoadInfo.java @@ -1,3 +1,20 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ package org.apache.doris.common; import lombok.Data; @@ -15,4 +32,4 @@ public class LoadInfo { private String failMsg; private String trackingUrl; -} \ No newline at end of file +} diff --git a/spark-load/spark-load-core/src/main/java/org/apache/doris/common/ResponseEntity.java b/spark-load/spark-load-core/src/main/java/org/apache/doris/common/ResponseEntity.java index ca03e35b..a5a3f149 100644 --- a/spark-load/spark-load-core/src/main/java/org/apache/doris/common/ResponseEntity.java +++ b/spark-load/spark-load-core/src/main/java/org/apache/doris/common/ResponseEntity.java @@ -1,3 +1,20 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + package org.apache.doris.common; import com.fasterxml.jackson.databind.JsonNode; diff --git a/spark-load/spark-load-core/src/main/java/org/apache/doris/common/enums/JobStatus.java b/spark-load/spark-load-core/src/main/java/org/apache/doris/common/enums/JobStatus.java index b9900e3c..6493b36b 100644 --- a/spark-load/spark-load-core/src/main/java/org/apache/doris/common/enums/JobStatus.java +++ b/spark-load/spark-load-core/src/main/java/org/apache/doris/common/enums/JobStatus.java @@ -1,3 +1,20 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + package org.apache.doris.common.enums; public enum JobStatus { diff --git a/spark-load/spark-load-core/src/main/java/org/apache/doris/common/enums/LoadMode.java b/spark-load/spark-load-core/src/main/java/org/apache/doris/common/enums/LoadMode.java index 9eb9be6c..d86b0738 100644 --- a/spark-load/spark-load-core/src/main/java/org/apache/doris/common/enums/LoadMode.java +++ b/spark-load/spark-load-core/src/main/java/org/apache/doris/common/enums/LoadMode.java @@ -1,3 +1,20 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + package org.apache.doris.common.enums; public enum LoadMode { diff --git a/spark-load/spark-load-core/src/main/java/org/apache/doris/common/meta/LoadInfoResponse.java b/spark-load/spark-load-core/src/main/java/org/apache/doris/common/meta/LoadInfoResponse.java index 0bc64406..60f28e9f 100644 --- a/spark-load/spark-load-core/src/main/java/org/apache/doris/common/meta/LoadInfoResponse.java +++ b/spark-load/spark-load-core/src/main/java/org/apache/doris/common/meta/LoadInfoResponse.java @@ -1,3 +1,20 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + package org.apache.doris.common.meta; import org.apache.doris.common.LoadInfo; diff --git a/spark-load/spark-load-core/src/main/java/org/apache/doris/common/meta/LoadMeta.java b/spark-load/spark-load-core/src/main/java/org/apache/doris/common/meta/LoadMeta.java index 8ecb138a..6ac4ff71 100644 --- a/spark-load/spark-load-core/src/main/java/org/apache/doris/common/meta/LoadMeta.java +++ b/spark-load/spark-load-core/src/main/java/org/apache/doris/common/meta/LoadMeta.java @@ -1,8 +1,25 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ package org.apache.doris.common.meta; import org.apache.doris.common.Constants; -import org.apache.doris.exception.SparkLoadException; import org.apache.doris.config.JobConfig; +import org.apache.doris.exception.SparkLoadException; import org.apache.doris.sparkdpp.EtlJobConfig; import com.google.common.annotations.VisibleForTesting; diff --git a/spark-load/spark-load-core/src/main/java/org/apache/doris/common/meta/TableMeta.java b/spark-load/spark-load-core/src/main/java/org/apache/doris/common/meta/TableMeta.java index 2abaea20..0b2621d6 100644 --- a/spark-load/spark-load-core/src/main/java/org/apache/doris/common/meta/TableMeta.java +++ b/spark-load/spark-load-core/src/main/java/org/apache/doris/common/meta/TableMeta.java @@ -1,10 +1,26 @@ -package org.apache.doris.common.meta; +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. -import com.google.gson.annotations.SerializedName; -import lombok.Data; +package org.apache.doris.common.meta; import org.apache.doris.sparkdpp.EtlJobConfig; +import lombok.Data; + import java.io.Serializable; import java.util.List; import java.util.stream.Collectors; diff --git a/spark-load/spark-load-core/src/main/java/org/apache/doris/config/JobConfig.java b/spark-load/spark-load-core/src/main/java/org/apache/doris/config/JobConfig.java index c7c179a2..5e80d45c 100644 --- a/spark-load/spark-load-core/src/main/java/org/apache/doris/config/JobConfig.java +++ b/spark-load/spark-load-core/src/main/java/org/apache/doris/config/JobConfig.java @@ -1,3 +1,20 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
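// JobConfig, whose header is updated below, later gains a checkHadoopProperties() validation
// (patch 27 in this series). A property map that satisfies the simple-authentication branch of
// that check could look like the following sketch; the defaultFS URI and user name are
// placeholders, not shipped defaults:
//
//   Map<String, String> hadoopProperties = new HashMap<>();
//   hadoopProperties.put("fs.defaultFS", "hdfs://nameservice1");   // always required
//   hadoopProperties.put("hadoop.username", "hadoop");             // required when not using kerberos
//   // Kerberos variant: set "hadoop.security.authentication" to "kerberos" and make
//   // "hadoop.kerberos.principal" / "hadoop.kerberos.keytab" point at files that exist locally.
//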
+ package org.apache.doris.config; import org.apache.doris.SparkLoadRunner; diff --git a/spark-load/spark-load-core/src/main/java/org/apache/doris/config/TaskType.java b/spark-load/spark-load-core/src/main/java/org/apache/doris/config/TaskType.java index e05c88ab..ba3283ea 100644 --- a/spark-load/spark-load-core/src/main/java/org/apache/doris/config/TaskType.java +++ b/spark-load/spark-load-core/src/main/java/org/apache/doris/config/TaskType.java @@ -1,3 +1,20 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + package org.apache.doris.config; public enum TaskType { diff --git a/spark-load/spark-load-core/src/main/java/org/apache/doris/exception/SparkLoadException.java b/spark-load/spark-load-core/src/main/java/org/apache/doris/exception/SparkLoadException.java index cc2b043e..d25aca87 100644 --- a/spark-load/spark-load-core/src/main/java/org/apache/doris/exception/SparkLoadException.java +++ b/spark-load/spark-load-core/src/main/java/org/apache/doris/exception/SparkLoadException.java @@ -1,3 +1,20 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ package org.apache.doris.exception; public class SparkLoadException extends Exception { diff --git a/spark-load/spark-load-core/src/main/java/org/apache/doris/load/JobMonitor.java b/spark-load/spark-load-core/src/main/java/org/apache/doris/load/JobMonitor.java deleted file mode 100644 index 7817cfa0..00000000 --- a/spark-load/spark-load-core/src/main/java/org/apache/doris/load/JobMonitor.java +++ /dev/null @@ -1,13 +0,0 @@ -package org.apache.doris.load; - -public class JobMonitor { - - public void registerJob() { - - } - - private void startListen() { - - } - -} diff --git a/spark-load/spark-load-core/src/main/java/org/apache/doris/load/LoaderFactory.java b/spark-load/spark-load-core/src/main/java/org/apache/doris/load/LoaderFactory.java index 196d65ba..0b0fc786 100644 --- a/spark-load/spark-load-core/src/main/java/org/apache/doris/load/LoaderFactory.java +++ b/spark-load/spark-load-core/src/main/java/org/apache/doris/load/LoaderFactory.java @@ -1,3 +1,20 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + package org.apache.doris.load; import org.apache.doris.config.JobConfig; diff --git a/spark-load/spark-load-core/src/main/java/org/apache/doris/load/TransactionManager.java b/spark-load/spark-load-core/src/main/java/org/apache/doris/load/TransactionManager.java deleted file mode 100644 index a940db8a..00000000 --- a/spark-load/spark-load-core/src/main/java/org/apache/doris/load/TransactionManager.java +++ /dev/null @@ -1,17 +0,0 @@ -package org.apache.doris.load; - -public class TransactionManager { - - public long beginTxn() { - return -1L; - } - - public void commitTxn(long txnId) { - - } - - public void abortTxn(long txnId) { - - } - -} diff --git a/spark-load/spark-load-core/src/main/java/org/apache/doris/load/job/Loader.java b/spark-load/spark-load-core/src/main/java/org/apache/doris/load/job/Loader.java index 93e88f61..d80caab0 100644 --- a/spark-load/spark-load-core/src/main/java/org/apache/doris/load/job/Loader.java +++ b/spark-load/spark-load-core/src/main/java/org/apache/doris/load/job/Loader.java @@ -1,8 +1,25 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. + package org.apache.doris.load.job; import org.apache.doris.common.enums.JobStatus; -import org.apache.doris.exception.SparkLoadException; import org.apache.doris.config.JobConfig; +import org.apache.doris.exception.SparkLoadException; import lombok.Getter; import org.apache.spark.launcher.SparkAppHandle; diff --git a/spark-load/spark-load-core/src/main/java/org/apache/doris/load/job/PullLoader.java b/spark-load/spark-load-core/src/main/java/org/apache/doris/load/job/PullLoader.java index 54b4253b..b22c7820 100644 --- a/spark-load/spark-load-core/src/main/java/org/apache/doris/load/job/PullLoader.java +++ b/spark-load/spark-load-core/src/main/java/org/apache/doris/load/job/PullLoader.java @@ -1,3 +1,20 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + package org.apache.doris.load.job; import org.apache.doris.SparkLoadRunner; diff --git a/spark-load/spark-load-core/src/main/java/org/apache/doris/load/job/Recoverable.java b/spark-load/spark-load-core/src/main/java/org/apache/doris/load/job/Recoverable.java index 54acdda6..ccfd461a 100644 --- a/spark-load/spark-load-core/src/main/java/org/apache/doris/load/job/Recoverable.java +++ b/spark-load/spark-load-core/src/main/java/org/apache/doris/load/job/Recoverable.java @@ -1,3 +1,20 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + package org.apache.doris.load.job; import org.apache.doris.exception.SparkLoadException; diff --git a/spark-load/spark-load-core/src/main/java/org/apache/doris/util/DateUtils.java b/spark-load/spark-load-core/src/main/java/org/apache/doris/util/DateUtils.java index 8d0c3179..7305ef76 100644 --- a/spark-load/spark-load-core/src/main/java/org/apache/doris/util/DateUtils.java +++ b/spark-load/spark-load-core/src/main/java/org/apache/doris/util/DateUtils.java @@ -1,3 +1,20 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + package org.apache.doris.util; import java.time.LocalDateTime; diff --git a/spark-load/spark-load-core/src/main/java/org/apache/doris/util/FileSystemUtils.java b/spark-load/spark-load-core/src/main/java/org/apache/doris/util/FileSystemUtils.java index a4998b79..da54c601 100644 --- a/spark-load/spark-load-core/src/main/java/org/apache/doris/util/FileSystemUtils.java +++ b/spark-load/spark-load-core/src/main/java/org/apache/doris/util/FileSystemUtils.java @@ -1,3 +1,20 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + package org.apache.doris.util; import org.apache.doris.config.JobConfig; diff --git a/spark-load/spark-load-core/src/main/java/org/apache/doris/util/HttpUtils.java b/spark-load/spark-load-core/src/main/java/org/apache/doris/util/HttpUtils.java index 7df6c0e9..d1da38d3 100644 --- a/spark-load/spark-load-core/src/main/java/org/apache/doris/util/HttpUtils.java +++ b/spark-load/spark-load-core/src/main/java/org/apache/doris/util/HttpUtils.java @@ -1,3 +1,20 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ package org.apache.doris.util; import org.apache.http.HttpEntity; diff --git a/spark-load/spark-load-core/src/main/java/org/apache/doris/util/JsonUtils.java b/spark-load/spark-load-core/src/main/java/org/apache/doris/util/JsonUtils.java index 5cd94300..3d33e85b 100644 --- a/spark-load/spark-load-core/src/main/java/org/apache/doris/util/JsonUtils.java +++ b/spark-load/spark-load-core/src/main/java/org/apache/doris/util/JsonUtils.java @@ -1,3 +1,20 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + package org.apache.doris.util; import com.fasterxml.jackson.core.JsonParser; diff --git a/spark-load/spark-load-core/src/main/resources/log4j.properties b/spark-load/spark-load-core/src/main/resources/log4j.properties index 1c90987e..c1e97855 100644 --- a/spark-load/spark-load-core/src/main/resources/log4j.properties +++ b/spark-load/spark-load-core/src/main/resources/log4j.properties @@ -1,3 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + log4j.rootLogger=INFO,console log4j.additivity.org.apache=true log4j.appender.console=org.apache.log4j.ConsoleAppender diff --git a/spark-load/spark-load-core/src/test/java/org/apache/doris/common/meta/LoadMetaTest.java b/spark-load/spark-load-core/src/test/java/org/apache/doris/common/meta/LoadMetaTest.java index 025c662a..79c95739 100644 --- a/spark-load/spark-load-core/src/test/java/org/apache/doris/common/meta/LoadMetaTest.java +++ b/spark-load/spark-load-core/src/test/java/org/apache/doris/common/meta/LoadMetaTest.java @@ -1,3 +1,20 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + package org.apache.doris.common.meta; diff --git a/spark-load/spark-load-dist/pom.xml b/spark-load/spark-load-dist/pom.xml index 4f074f54..21e5d319 100644 --- a/spark-load/spark-load-dist/pom.xml +++ b/spark-load/spark-load-dist/pom.xml @@ -1,4 +1,22 @@ + diff --git a/spark-load/spark-load-dist/src/main/assembly/assembly.xml b/spark-load/spark-load-dist/src/main/assembly/assembly.xml index a6e2d292..71b9a3ae 100644 --- a/spark-load/spark-load-dist/src/main/assembly/assembly.xml +++ b/spark-load/spark-load-dist/src/main/assembly/assembly.xml @@ -1,3 +1,22 @@ + + bin diff --git a/spark-load/spark-load-dist/src/main/bin/spark-load.sh b/spark-load/spark-load-dist/src/main/bin/spark-load.sh index a7472427..241dd432 100644 --- a/spark-load/spark-load-dist/src/main/bin/spark-load.sh +++ b/spark-load/spark-load-dist/src/main/bin/spark-load.sh @@ -1,4 +1,20 @@ #!/bin/bash +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. if [ -z ${SPARK_LOAD_HOME} ]; then cur_dir=$(dirname "$0")/../ From 90dc710b42a4b2ce3ad02ede709d67af5f1e312d Mon Sep 17 00:00:00 2001 From: gnehil Date: Tue, 18 Jun 2024 15:31:01 +0800 Subject: [PATCH 26/45] build script --- spark-load/build.sh | 175 ++++++++++++++++++ spark-load/pom.xml | 5 + spark-load/spark-load-dpp/pom.xml | 8 +- .../doris/load/loadv2/dpp/SparkDpp.java | 3 +- 4 files changed, 187 insertions(+), 4 deletions(-) diff --git a/spark-load/build.sh b/spark-load/build.sh index e69de29b..88c2aeba 100644 --- a/spark-load/build.sh +++ b/spark-load/build.sh @@ -0,0 +1,175 @@ +#!/usr/bin/env bash +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
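# The script below is interactive: it prompts for the target Spark version and then calls Maven
# with the matching build profiles. Assuming the spark3 / scala_2.12 profiles it selects, a
# non-interactive equivalent is roughly:
#
#   cd spark-load
#   mvn clean package -Pspark3,scala_2.12
#
# (For Spark 2.x the profiles become spark2,scala_2.11; the script itself uses ${MVN_BIN},
# typically resolved by env.sh.)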
+ +############################################################## +# This script is used to compile Spark-Doris-Connector +# Usage: +# sh build.sh +# +############################################################## + +# Bugzilla 37848: When no TTY is available, don't output to console +have_tty=0 +# shellcheck disable=SC2006 +if [[ "`tty`" != "not a tty" ]]; then + have_tty=1 +fi + +# Bugzilla 37848: When no TTY is available, don't output to console +have_tty=0 +# shellcheck disable=SC2006 +if [[ "`tty`" != "not a tty" ]]; then + have_tty=1 +fi + + # Only use colors if connected to a terminal +if [[ ${have_tty} -eq 1 ]]; then + PRIMARY=$(printf '\033[38;5;082m') + RED=$(printf '\033[31m') + GREEN=$(printf '\033[32m') + YELLOW=$(printf '\033[33m') + BLUE=$(printf '\033[34m') + BOLD=$(printf '\033[1m') + RESET=$(printf '\033[0m') +else + PRIMARY="" + RED="" + GREEN="" + YELLOW="" + BLUE="" + BOLD="" + RESET="" +fi + +echo_r () { + # Color red: Error, Failed + [[ $# -ne 1 ]] && return 1 + # shellcheck disable=SC2059 + printf "[%sDoris%s] %s$1%s\n" $BLUE $RESET $RED $RESET +} + +echo_g () { + # Color green: Success + [[ $# -ne 1 ]] && return 1 + # shellcheck disable=SC2059 + printf "[%sDoris%s] %s$1%s\n" $BLUE $RESET $GREEN $RESET +} + +echo_y () { + # Color yellow: Warning + [[ $# -ne 1 ]] && return 1 + # shellcheck disable=SC2059 + printf "[%sDoris%s] %s$1%s\n" $BLUE $RESET $YELLOW $RESET +} + +echo_w () { + # Color yellow: White + [[ $# -ne 1 ]] && return 1 + # shellcheck disable=SC2059 + printf "[%sDoris%s] %s$1%s\n" $BLUE $RESET $WHITE $RESET +} + +# OS specific support. $var _must_ be set to either true or false. +cygwin=false +os400=false +# shellcheck disable=SC2006 +case "`uname`" in +CYGWIN*) cygwin=true;; +OS400*) os400=true;; +esac + +# resolve links - $0 may be a softlink +PRG="$0" + +while [[ -h "$PRG" ]]; do + # shellcheck disable=SC2006 + ls=`ls -ld "$PRG"` + # shellcheck disable=SC2006 + link=`expr "$ls" : '.*-> \(.*\)$'` + if expr "$link" : '/.*' > /dev/null; then + PRG="$link" + else + # shellcheck disable=SC2006 + PRG=`dirname "$PRG"`/"$link" + fi +done + +# Get standard environment variables +# shellcheck disable=SC2006 +ROOT=$(cd "$(dirname "$PRG")" &>/dev/null && pwd) +export DORIS_HOME=$(cd "$ROOT/../" &>/dev/null && pwd) + +. "${DORIS_HOME}"/env.sh + +# include custom environment variables +if [[ -f ${DORIS_HOME}/custom_env.sh ]]; then + . "${DORIS_HOME}"/custom_env.sh +fi + +selectSpark() { + echo 'Spark-Load supports multiple versions of spark. Which version do you need ?' + select spark in "2.x" "3.x" "other" + do + case $spark in + "2.x") + return 1 + ;; + "3.x") + return 2 + ;; + *) + echo "invalid selected, exit.." + exit 1 + ;; + esac + done +} + +SPARK_VERSION=0 +selectSpark +SparkVer=$? +if [ ${SparkVer} -eq 1 ]; then + SPARK_VERSION="spark2" + SCALA_VERSION="scala_2.11" +elif [ ${SparkVer} -eq 2 ]; then + SPARK_VERSION="spark3" + SCALA_VERSION="scala_2.12" +fi + +echo_g " spark load run based on : ${SPARK_VERSION} and ${SCALA_VERSION}" +echo_g " build starting..." + +${MVN_BIN} clean package -P${SPARK_VERSION},${SCALA_VERSION} "$@" + +EXIT_CODE=$? +if [ $EXIT_CODE -eq 0 ]; then + DIST_DIR=${DORIS_HOME}/dist + [ ! 
-d "$DIST_DIR" ] && mkdir "$DIST_DIR" + dist_jar=$(ls "${ROOT}"/target | grep "spark-load-") + rm -rf "${DIST_DIR}"/"${dist_jar}" + cp "${ROOT}"/target/"${dist_jar}" "$DIST_DIR" + + echo_g "*****************************************************************" + echo_g "Successfully build Spark-Load" + echo_g "dist: $DIST_DIR/$dist_jar " + echo_g "*****************************************************************" + exit 0; +else + echo_r "Failed build Spark-Load" + exit $EXIT_CODE; +fi diff --git a/spark-load/pom.xml b/spark-load/pom.xml index 6803a92d..1816125f 100644 --- a/spark-load/pom.xml +++ b/spark-load/pom.xml @@ -430,6 +430,11 @@ maven-shade-plugin 3.2.1 + + org.apache.maven.plugins + maven-surefire-plugin + 2.22.2 + diff --git a/spark-load/spark-load-dpp/pom.xml b/spark-load/spark-load-dpp/pom.xml index a84414b1..af5ca5c7 100644 --- a/spark-load/spark-load-dpp/pom.xml +++ b/spark-load/spark-load-dpp/pom.xml @@ -27,6 +27,10 @@ under the License. spark-load-dpp jar + + 1 + -Xmx512m + ${project.groupId} @@ -192,9 +196,9 @@ under the License. maven-surefire-plugin set larger, eg, 3, to reduce the time or running FE unit tests<--> - + ${fe_ut_parallel} not reuse forked jvm, so that each unit test will run in separate jvm. to avoid singleton confict<--> - + false -javaagent:${settings.localRepository}/org/jmockit/jmockit/${jmockit.version}/jmockit-${jmockit.version}.jar @{argLine} diff --git a/spark-load/spark-load-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/SparkDpp.java b/spark-load/spark-load-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/SparkDpp.java index 401c45fa..08137c70 100644 --- a/spark-load/spark-load-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/SparkDpp.java +++ b/spark-load/spark-load-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/SparkDpp.java @@ -25,7 +25,7 @@ import com.google.common.collect.Maps; import com.google.gson.Gson; import org.apache.commons.collections.CollectionUtils; -import org.apache.commons.collections4.IteratorUtils; +import org.apache.commons.collections.IteratorUtils; import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.tuple.Pair; import org.apache.hadoop.conf.Configuration; @@ -65,7 +65,6 @@ import java.io.IOException; import java.math.BigDecimal; import java.math.BigInteger; -import java.nio.charset.Charset; import java.nio.charset.StandardCharsets; import java.util.ArrayList; import java.util.Arrays; From b1f5607bec2cc622a82489b57b84ff41c8e7877c Mon Sep 17 00:00:00 2001 From: gnehil Date: Thu, 4 Jul 2024 16:33:56 +0800 Subject: [PATCH 27/45] add kerberos login --- .../org/apache/doris/common/Constants.java | 4 ++ .../org/apache/doris/config/JobConfig.java | 28 +++++++++++++ .../apache/doris/util/FileSystemUtils.java | 39 ++++++++++++++++--- 3 files changed, 66 insertions(+), 5 deletions(-) diff --git a/spark-load/spark-load-core/src/main/java/org/apache/doris/common/Constants.java b/spark-load/spark-load-core/src/main/java/org/apache/doris/common/Constants.java index fb7498a3..20130549 100644 --- a/spark-load/spark-load-core/src/main/java/org/apache/doris/common/Constants.java +++ b/spark-load/spark-load-core/src/main/java/org/apache/doris/common/Constants.java @@ -21,5 +21,9 @@ public interface Constants { String HIVE_METASTORE_URIS = "hive.metastore.uris"; String SPARK_STANDALONE_SCHEME = "spark"; + String HADOOP_AUTH_KERBEROS = "kerberos"; + String HADOOP_SECURITY_AUTHENTICATION = "hadoop.security.authentication"; + String HADOOP_KERBEROS_PRINCIPAL = "hadoop.kerberos.principal"; + String 
HADOOP_KERBEROS_KEYTAB = "hadoop.kerberos.keytab"; } diff --git a/spark-load/spark-load-core/src/main/java/org/apache/doris/config/JobConfig.java b/spark-load/spark-load-core/src/main/java/org/apache/doris/config/JobConfig.java index 5e80d45c..9ffc4408 100644 --- a/spark-load/spark-load-core/src/main/java/org/apache/doris/config/JobConfig.java +++ b/spark-load/spark-load-core/src/main/java/org/apache/doris/config/JobConfig.java @@ -220,4 +220,32 @@ private boolean checkSparkMaster(String master) { && StringUtils.isNoneBlank(uri.getHost()) && uri.getPort() != -1; } + public void checkHadoopProperties() { + if (hadoopProperties == null || hadoopProperties.isEmpty()) { + return; + } + if (!hadoopProperties.containsKey("fs.defaultFS")) { + throw new IllegalArgumentException("fs.defaultFS is empty"); + } + // check auth + if (hadoopProperties.containsKey("hadoop.security.authentication") + && StringUtils.equalsIgnoreCase(hadoopProperties.get("hadoop.security.authentication"), "kerberos")) { + if (hadoopProperties.containsKey("hadoop.kerberos.principal") + && hadoopProperties.containsKey("hadoop.kerberos.keytab")) { + if (!FileUtils.getFile(hadoopProperties.get("hadoop.kerberos.principal")).exists()) { + throw new IllegalArgumentException("hadoop kerberos principal file is not exists, path: " + + hadoopProperties.get("hadoop.kerberos.principal")); + } + if (!FileUtils.getFile(hadoopProperties.get("hadoop.kerberos.keytab")).exists()) { + throw new IllegalArgumentException("hadoop kerberos keytab file is not exists, path: " + + hadoopProperties.get("hadoop.kerberos.keytab")); + } + } + } else { + if (!hadoopProperties.containsKey("hadoop.username")) { + throw new IllegalArgumentException("hadoop username is empty"); + } + } + } + } diff --git a/spark-load/spark-load-core/src/main/java/org/apache/doris/util/FileSystemUtils.java b/spark-load/spark-load-core/src/main/java/org/apache/doris/util/FileSystemUtils.java index da54c601..2e6b5880 100644 --- a/spark-load/spark-load-core/src/main/java/org/apache/doris/util/FileSystemUtils.java +++ b/spark-load/spark-load-core/src/main/java/org/apache/doris/util/FileSystemUtils.java @@ -17,29 +17,33 @@ package org.apache.doris.util; +import org.apache.doris.common.Constants; import org.apache.doris.config.JobConfig; +import org.apache.commons.lang3.StringUtils; import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.CommonConfigurationKeysPublic; import org.apache.hadoop.fs.FSDataInputStream; import org.apache.hadoop.fs.FSDataOutputStream; import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.fs.permission.FsPermission; +import org.apache.hadoop.security.UserGroupInformation; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; import java.io.BufferedReader; import java.io.IOException; import java.io.InputStreamReader; import java.nio.charset.StandardCharsets; -import java.util.Map; public class FileSystemUtils { + private static final Logger LOG = LogManager.getLogger(FileSystemUtils.class); + private static FileSystem getFs(JobConfig config, Path path) throws IOException { - Configuration conf = new Configuration(); - Map props = config.getHadoopProperties(); - props.forEach(conf::set); - return FileSystem.get(path.toUri(), conf); + return FileSystem.get(path.toUri(), getConf(config)); } public static void createFile(JobConfig config, String content, String path, Boolean overwrite) throws IOException { @@ -114,4 +118,29 @@ public static void 
mkdir(JobConfig config, String path) throws IOException { } } + public static void kerberosLogin(JobConfig jobConfig) throws IOException { + Configuration conf = getConf(jobConfig); + conf.set(CommonConfigurationKeysPublic.HADOOP_SECURITY_AUTHORIZATION, "true"); + conf.set(CommonConfigurationKeysPublic.HADOOP_KERBEROS_KEYTAB_LOGIN_AUTORENEWAL_ENABLED, "true"); + UserGroupInformation.setConfiguration(conf); + String keytab = jobConfig.getHadoopProperties().get(Constants.HADOOP_KERBEROS_KEYTAB); + String principal = jobConfig.getHadoopProperties().get(Constants.HADOOP_KERBEROS_PRINCIPAL); + try { + UserGroupInformation ugi = UserGroupInformation.getLoginUser(); + if (ugi.hasKerberosCredentials() && StringUtils.equals(ugi.getUserName(), principal)) { + ugi.checkTGTAndReloginFromKeytab(); + return; + } + } catch (IOException e) { + LOG.warn("A SecurityException occurs with kerberos, do login immediately.", e); + } + UserGroupInformation.loginUserFromKeytab(principal, keytab); + } + + private static Configuration getConf(JobConfig jobConfig) { + Configuration conf = new Configuration(); + jobConfig.getHadoopProperties().forEach(conf::set); + return conf; + } + } From 3c36eb81a2aee18d4e47d75244f4b47cfe6afcfc Mon Sep 17 00:00:00 2001 From: gnehil Date: Thu, 4 Jul 2024 16:34:38 +0800 Subject: [PATCH 28/45] add schema version to index meta --- .../main/java/org/apache/doris/common/meta/TableMeta.java | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/spark-load/spark-load-core/src/main/java/org/apache/doris/common/meta/TableMeta.java b/spark-load/spark-load-core/src/main/java/org/apache/doris/common/meta/TableMeta.java index 0b2621d6..3e97b97a 100644 --- a/spark-load/spark-load-core/src/main/java/org/apache/doris/common/meta/TableMeta.java +++ b/spark-load/spark-load-core/src/main/java/org/apache/doris/common/meta/TableMeta.java @@ -17,7 +17,8 @@ package org.apache.doris.common.meta; -import org.apache.doris.sparkdpp.EtlJobConfig; + +import org.apache.doris.config.EtlJobConfig; import lombok.Data; @@ -38,13 +39,14 @@ public static class EtlIndex implements Serializable { public int schemaHash; public String indexType; public boolean isBaseIndex; + public int schemaVersion; public EtlIndex() { } public EtlJobConfig.EtlIndex toEtlIndex() { - return new EtlJobConfig.EtlIndex(indexId, columns, schemaHash, indexType, isBaseIndex); + return new EtlJobConfig.EtlIndex(indexId, columns, schemaHash, indexType, isBaseIndex, schemaVersion); } } From 2f161b3bc72bc64ba067f4251a9b69bf926ab0b5 Mon Sep 17 00:00:00 2001 From: gnehil Date: Thu, 4 Jul 2024 16:35:14 +0800 Subject: [PATCH 29/45] add shutdown hoot for cancel load --- .../src/main/java/org/apache/doris/SparkLoadRunner.java | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/spark-load/spark-load-core/src/main/java/org/apache/doris/SparkLoadRunner.java b/spark-load/spark-load-core/src/main/java/org/apache/doris/SparkLoadRunner.java index 7f56b27d..a3207636 100644 --- a/spark-load/spark-load-core/src/main/java/org/apache/doris/SparkLoadRunner.java +++ b/spark-load/spark-load-core/src/main/java/org/apache/doris/SparkLoadRunner.java @@ -72,7 +72,10 @@ public static void main(String[] args) { } Loader loader = LoaderFactory.createLoader(jobConfig, cmdOptions.getRecovery()); - Runtime.getRuntime().addShutdownHook(new Thread(loader::cancel)); + Runtime.getRuntime().addShutdownHook(new Thread(() -> { + LOG.info("Shutting down..."); + loader.cancel(); + })); try { loader.prepare(); From 
901d974259e46c03c4226f6658abbba0365ade5b Mon Sep 17 00:00:00 2001 From: gnehil Date: Thu, 4 Jul 2024 16:36:29 +0800 Subject: [PATCH 30/45] refactor recovery schema change check --- .../apache/doris/common/io/BitmapValue.java | 403 +++++ .../org/apache/doris/common/io/Codec.java | 56 + .../java/org/apache/doris/common/io/Hll.java | 374 +++++ .../apache/doris/common/io/Roaring64Map.java | 1412 +++++++++++++++++ .../apache/doris/common/jmockit/AutoType.java | 60 + .../common/jmockit/ConstructorReflection.java | 160 ++ .../doris/common/jmockit/Deencapsulation.java | 63 + .../doris/common/jmockit/FieldReflection.java | 287 ++++ .../common/jmockit/GeneratedClasses.java | 48 + .../common/jmockit/MethodReflection.java | 163 ++ .../common/jmockit/ParameterReflection.java | 167 ++ .../jmockit/ThrowOfCheckedException.java | 22 + .../org/apache/doris/config/EtlJobConfig.java | 488 ++++++ .../org/apache/doris/load/job/PullLoader.java | 24 +- 14 files changed, 3722 insertions(+), 5 deletions(-) create mode 100644 spark-load/spark-load-common/src/main/java/org/apache/doris/common/io/BitmapValue.java create mode 100644 spark-load/spark-load-common/src/main/java/org/apache/doris/common/io/Codec.java create mode 100644 spark-load/spark-load-common/src/main/java/org/apache/doris/common/io/Hll.java create mode 100644 spark-load/spark-load-common/src/main/java/org/apache/doris/common/io/Roaring64Map.java create mode 100644 spark-load/spark-load-common/src/main/java/org/apache/doris/common/jmockit/AutoType.java create mode 100644 spark-load/spark-load-common/src/main/java/org/apache/doris/common/jmockit/ConstructorReflection.java create mode 100644 spark-load/spark-load-common/src/main/java/org/apache/doris/common/jmockit/Deencapsulation.java create mode 100644 spark-load/spark-load-common/src/main/java/org/apache/doris/common/jmockit/FieldReflection.java create mode 100644 spark-load/spark-load-common/src/main/java/org/apache/doris/common/jmockit/GeneratedClasses.java create mode 100644 spark-load/spark-load-common/src/main/java/org/apache/doris/common/jmockit/MethodReflection.java create mode 100644 spark-load/spark-load-common/src/main/java/org/apache/doris/common/jmockit/ParameterReflection.java create mode 100644 spark-load/spark-load-common/src/main/java/org/apache/doris/common/jmockit/ThrowOfCheckedException.java create mode 100644 spark-load/spark-load-common/src/main/java/org/apache/doris/config/EtlJobConfig.java diff --git a/spark-load/spark-load-common/src/main/java/org/apache/doris/common/io/BitmapValue.java b/spark-load/spark-load-common/src/main/java/org/apache/doris/common/io/BitmapValue.java new file mode 100644 index 00000000..04bb368f --- /dev/null +++ b/spark-load/spark-load-common/src/main/java/org/apache/doris/common/io/BitmapValue.java @@ -0,0 +1,403 @@ +package org.apache.doris.common.io; + +import org.roaringbitmap.Util; + +import java.io.DataInput; +import java.io.DataOutput; +import java.io.IOException; + +public class BitmapValue { + + public static final int EMPTY = 0; + public static final int SINGLE32 = 1; + public static final int BITMAP32 = 2; + public static final int SINGLE64 = 3; + public static final int BITMAP64 = 4; + + public static final int SINGLE_VALUE = 1; + public static final int BITMAP_VALUE = 2; + + public static final long UNSIGNED_32BIT_INT_MAX_VALUE = 4294967295L; + + private int bitmapType; + private long singleValue; + private Roaring64Map bitmap; + + public BitmapValue() { + bitmapType = EMPTY; + } + + public void add(int value) { + 
add(Util.toUnsignedLong(value)); + } + + public void add(long value) { + switch (bitmapType) { // CHECKSTYLE IGNORE THIS LINE: missing switch default + case EMPTY: + singleValue = value; + bitmapType = SINGLE_VALUE; + break; + case SINGLE_VALUE: + if (this.singleValue != value) { + bitmap = new Roaring64Map(); + bitmap.add(value); + bitmap.add(singleValue); + bitmapType = BITMAP_VALUE; + } + break; + case BITMAP_VALUE: + bitmap.addLong(value); + break; + } + } + + public boolean contains(int value) { + return contains(Util.toUnsignedLong(value)); + } + + public boolean contains(long value) { + switch (bitmapType) { + case EMPTY: + return false; + case SINGLE_VALUE: + return singleValue == value; + case BITMAP_VALUE: + return bitmap.contains(value); + default: + return false; + } + } + + public long cardinality() { + switch (bitmapType) { // CHECKSTYLE IGNORE THIS LINE: missing switch default + case EMPTY: + return 0; + case SINGLE_VALUE: + return 1; + case BITMAP_VALUE: + return bitmap.getLongCardinality(); + } + return 0; + } + + public void serialize(DataOutput output) throws IOException { + switch (bitmapType) { // CHECKSTYLE IGNORE THIS LINE: missing switch default + case EMPTY: + output.writeByte(EMPTY); + break; + case SINGLE_VALUE: + // is 32-bit enough + // FE is big end but BE is little end. + if (isLongValue32bitEnough(singleValue)) { + output.write(SINGLE32); + output.writeInt(Integer.reverseBytes((int) singleValue)); + } else { + output.writeByte(SINGLE64); + output.writeLong(Long.reverseBytes(singleValue)); + } + break; + case BITMAP_VALUE: + bitmap.serialize(output); + break; + } + } + + public void deserialize(DataInput input) throws IOException { + clear(); + int bitmapType = input.readByte(); + switch (bitmapType) { + case EMPTY: + break; + case SINGLE32: + singleValue = Util.toUnsignedLong(Integer.reverseBytes(input.readInt())); + this.bitmapType = SINGLE_VALUE; + break; + case SINGLE64: + singleValue = Long.reverseBytes(input.readLong()); + this.bitmapType = SINGLE_VALUE; + break; + case BITMAP32: + case BITMAP64: + bitmap = bitmap == null ? new Roaring64Map() : bitmap; + bitmap.deserialize(input, bitmapType); + this.bitmapType = BITMAP_VALUE; + break; + default: + throw new RuntimeException(String.format("unknown bitmap type %s ", bitmapType)); + } + } + + // In-place bitwise AND (intersection) operation. The current bitmap is modified. + public void and(BitmapValue other) { + switch (other.bitmapType) { // CHECKSTYLE IGNORE THIS LINE: missing switch default + case EMPTY: + clear(); + break; + case SINGLE_VALUE: + switch (this.bitmapType) { // CHECKSTYLE IGNORE THIS LINE: missing switch default + case EMPTY: + break; + case SINGLE_VALUE: + if (this.singleValue != other.singleValue) { + clear(); + } + break; + case BITMAP_VALUE: + if (!this.bitmap.contains(other.singleValue)) { + clear(); + } else { + clear(); + this.singleValue = other.singleValue; + this.bitmapType = SINGLE_VALUE; + } + break; + } + break; + case BITMAP_VALUE: + switch (this.bitmapType) { // CHECKSTYLE IGNORE THIS LINE: missing switch default + case EMPTY: + break; + case SINGLE_VALUE: + if (!other.bitmap.contains(this.singleValue)) { + clear(); + } + break; + case BITMAP_VALUE: + this.bitmap.and(other.bitmap); + convertToSmallerType(); + break; + } + break; + } + } + + // In-place bitwise OR (union) operation. The current bitmap is modified. 
+ public void or(BitmapValue other) { + switch (other.bitmapType) { // CHECKSTYLE IGNORE THIS LINE: missing switch default + case EMPTY: + break; + case SINGLE_VALUE: + add(other.singleValue); + break; + case BITMAP_VALUE: + switch (this.bitmapType) { // CHECKSTYLE IGNORE THIS LINE: missing switch default + case EMPTY: + // deep copy the bitmap in case of multi-rollups update the bitmap repeatedly + this.bitmap = new Roaring64Map(); + this.bitmap.or(other.bitmap); + this.bitmapType = BITMAP_VALUE; + break; + case SINGLE_VALUE: + this.bitmap = new Roaring64Map(); + this.bitmap.or(other.bitmap); + this.bitmap.add(this.singleValue); + this.bitmapType = BITMAP_VALUE; + break; + case BITMAP_VALUE: + this.bitmap.or(other.bitmap); + break; + } + break; + } + } + + public void remove(long value) { + switch (this.bitmapType) { // CHECKSTYLE IGNORE THIS LINE: missing switch default + case EMPTY: + break; + case SINGLE_VALUE: + if (this.singleValue == value) { + clear(); + } + break; + case BITMAP_VALUE: + this.bitmap.removeLong(value); + convertToSmallerType(); + break; + } + } + + // In-place bitwise ANDNOT (difference) operation. The current bitmap is modified + public void not(BitmapValue other) { + switch (other.bitmapType) { // CHECKSTYLE IGNORE THIS LINE: missing switch default + case EMPTY: + break; + case SINGLE_VALUE: + remove(other.singleValue); + break; + case BITMAP_VALUE: + switch (this.bitmapType) { // CHECKSTYLE IGNORE THIS LINE: missing switch default + case EMPTY: + break; + case SINGLE_VALUE: + if (other.bitmap.contains(this.singleValue)) { + clear(); + } + break; + case BITMAP_VALUE: + this.bitmap.andNot(other.bitmap); + convertToSmallerType(); + break; + } + break; + } + } + + // In-place bitwise XOR (symmetric difference) operation. The current bitmap is modified + public void xor(BitmapValue other) { + switch (other.bitmapType) { // CHECKSTYLE IGNORE THIS LINE: missing switch default + case EMPTY: + break; + case SINGLE_VALUE: + switch (this.bitmapType) { // CHECKSTYLE IGNORE THIS LINE: missing switch default + case EMPTY: + add(other.singleValue); + break; + case SINGLE_VALUE: + if (this.singleValue != other.singleValue) { + add(other.singleValue); + } else { + clear(); + } + break; + case BITMAP_VALUE: + if (!this.bitmap.contains(other.singleValue)) { + this.bitmap.add(other.singleValue); + } else { + this.bitmap.removeLong(other.singleValue); + convertToSmallerType(); + } + break; + } + break; + case BITMAP_VALUE: + switch (this.bitmapType) { // CHECKSTYLE IGNORE THIS LINE: missing switch default + case EMPTY: + this.bitmap = other.bitmap; + this.bitmapType = BITMAP_VALUE; + break; + case SINGLE_VALUE: + this.bitmap = other.bitmap; + this.bitmapType = BITMAP_VALUE; + if (this.bitmap.contains(this.singleValue)) { + this.bitmap.removeLong(this.singleValue); + } else { + this.bitmap.add(this.bitmapType); + } + break; + case BITMAP_VALUE: + this.bitmap.xor(other.bitmap); + convertToSmallerType(); + break; + } + break; + } + } + + @Override + public boolean equals(Object other) { + if (other == null || !(other instanceof BitmapValue)) { + return false; + } + boolean ret = false; + if (this.bitmapType != ((BitmapValue) other).bitmapType) { + return false; + } + switch (((BitmapValue) other).bitmapType) { // CHECKSTYLE IGNORE THIS LINE: missing switch default + case EMPTY: + ret = true; + break; + case SINGLE_VALUE: + ret = this.singleValue == ((BitmapValue) other).singleValue; + break; + case BITMAP_VALUE: + ret = bitmap.equals(((BitmapValue) other).bitmap); + } + return ret; + } 
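    // Illustrative usage sketch of this class (values and stream types are arbitrary):
    //
    //   BitmapValue a = new BitmapValue();
    //   a.add(1L);
    //   a.add(2L);                       // switches from SINGLE_VALUE to BITMAP_VALUE
    //   BitmapValue b = new BitmapValue();
    //   b.add(2L);
    //   b.add(3L);
    //   a.or(b);                         // in-place union: a now holds {1, 2, 3}
    //   assert a.cardinality() == 3 && a.contains(3L);
    //
    //   ByteArrayOutputStream bos = new ByteArrayOutputStream();
    //   a.serialize(new DataOutputStream(bos));   // writes the type byte followed by the payload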
+ + /** + * usage note: + * now getSizeInBytes is different from be' impl + * The reason is that java's roaring didn't implement method #shrinkToFit but be's getSizeInBytes need it + * Implementing java's shrinkToFit means refactor roaring whose fields are all unaccess in Doris Fe's package + * That would be an another big project + */ + // TODO(wb): keep getSizeInBytes consistent with be and refactor roaring + public long getSizeInBytes() { + long size = 0; + switch (bitmapType) { // CHECKSTYLE IGNORE THIS LINE: missing switch default + case EMPTY: + size = 1; + break; + case SINGLE_VALUE: + if (isLongValue32bitEnough(singleValue)) { + size = 1 + 4; + } else { + size = 1 + 8; + } + break; + case BITMAP_VALUE: + size = 1 + bitmap.getSizeInBytes(); + } + return size; + } + + @Override + public String toString() { + String toStringStr = "{}"; + switch (bitmapType) { // CHECKSTYLE IGNORE THIS LINE: missing switch default + case EMPTY: + break; + case SINGLE_VALUE: + toStringStr = String.format("{%s}", singleValue); + break; + case BITMAP_VALUE: + toStringStr = this.bitmap.toString(); + break; + } + return toStringStr; + } + + public void clear() { + this.bitmapType = EMPTY; + this.singleValue = -1; + this.bitmap = null; + } + + private void convertToSmallerType() { + if (bitmapType == BITMAP_VALUE) { + if (bitmap.getLongCardinality() == 0) { + this.bitmap = null; + this.bitmapType = EMPTY; + } else if (bitmap.getLongCardinality() == 1) { + this.singleValue = bitmap.select(0); + this.bitmapType = SINGLE_VALUE; + this.bitmap = null; + } + } + } + + private boolean isLongValue32bitEnough(long value) { + return value <= UNSIGNED_32BIT_INT_MAX_VALUE; + } + + // just for ut + public int getBitmapType() { + return bitmapType; + } + + // just for ut + public boolean is32BitsEnough() { + switch (bitmapType) { + case EMPTY: + return true; + case SINGLE_VALUE: + return isLongValue32bitEnough(singleValue); + case BITMAP_VALUE: + return bitmap.is32BitsEnough(); + default: + return false; + } + } +} \ No newline at end of file diff --git a/spark-load/spark-load-common/src/main/java/org/apache/doris/common/io/Codec.java b/spark-load/spark-load-common/src/main/java/org/apache/doris/common/io/Codec.java new file mode 100644 index 00000000..2d783a3f --- /dev/null +++ b/spark-load/spark-load-common/src/main/java/org/apache/doris/common/io/Codec.java @@ -0,0 +1,56 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
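// A quick round-trip sketch of the variable-length encoding implemented in the Codec class
// below (LEB128-style: low 7 bits per byte, high bit set on all but the last byte; negative
// values are not supported). For example, 300 encodes to the two bytes 0xAC 0x02:
//
//   ByteArrayOutputStream bos = new ByteArrayOutputStream();
//   Codec.encodeVarint64(300L, new DataOutputStream(bos));                    // 0xAC, 0x02
//   DataInput in = new DataInputStream(new ByteArrayInputStream(bos.toByteArray()));
//   long decoded = Codec.decodeVarint64(in);                                  // 300
//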
+ +package org.apache.doris.common.io; + +import java.io.DataInput; +import java.io.DataOutput; +import java.io.IOException; + +public class Codec { + + // not support encode negative value now + public static void encodeVarint64(long source, DataOutput out) throws IOException { + assert source >= 0; + short B = 128; // CHECKSTYLE IGNORE THIS LINE + + while (source >= B) { + out.write((int) (source & (B - 1) | B)); + source = source >> 7; + } + out.write((int) (source & (B - 1))); + } + + // not support decode negative value now + public static long decodeVarint64(DataInput in) throws IOException { + long result = 0; + int shift = 0; + short B = 128; // CHECKSTYLE IGNORE THIS LINE + + while (true) { + int oneByte = in.readUnsignedByte(); + boolean isEnd = (oneByte & B) == 0; + result = result | ((long) (oneByte & B - 1) << (shift * 7)); + if (isEnd) { + break; + } + shift++; + } + + return result; + } +} diff --git a/spark-load/spark-load-common/src/main/java/org/apache/doris/common/io/Hll.java b/spark-load/spark-load-common/src/main/java/org/apache/doris/common/io/Hll.java new file mode 100644 index 00000000..8f8042ee --- /dev/null +++ b/spark-load/spark-load-common/src/main/java/org/apache/doris/common/io/Hll.java @@ -0,0 +1,374 @@ +package org.apache.doris.common.io; + +import org.apache.commons.codec.binary.StringUtils; + +import java.io.DataInput; +import java.io.DataOutput; +import java.io.IOException; +import java.math.BigInteger; +import java.util.HashSet; +import java.util.Set; + +public class Hll { + + public static final byte HLL_DATA_EMPTY = 0; + public static final byte HLL_DATA_EXPLICIT = 1; + public static final byte HLL_DATA_SPARSE = 2; + public static final byte HLL_DATA_FULL = 3; + + public static final int HLL_COLUMN_PRECISION = 14; + public static final int HLL_ZERO_COUNT_BITS = (64 - HLL_COLUMN_PRECISION); + public static final int HLL_EXPLICIT_INT64_NUM = 160; + public static final int HLL_SPARSE_THRESHOLD = 4096; + public static final int HLL_REGISTERS_COUNT = 16 * 1024; + public static final long M64 = 0xc6a4a7935bd1e995L; + public static final int R64 = 47; + public static final int SEED = 0xadc83b19; + private int type; + private Set hashSet; + private byte[] registers; + + public Hll() { + type = HLL_DATA_EMPTY; + this.hashSet = new HashSet<>(); + } + + public static byte getLongTailZeroNum(long hashValue) { + if (hashValue == 0) { + return 0; + } + long value = 1L; + byte idx = 0; + for (; ; idx++) { + if ((value & hashValue) != 0) { + return idx; + } + value = value << 1; + if (idx == 62) { + break; + } + } + return idx; + } + + private static long getLittleEndianLong(final byte[] data, final int index) { + return (((long) data[index] & 0xff)) + | (((long) data[index + 1] & 0xff) << 8) + | (((long) data[index + 2] & 0xff) << 16) + | (((long) data[index + 3] & 0xff) << 24) + | (((long) data[index + 4] & 0xff) << 32) + | (((long) data[index + 5] & 0xff) << 40) + | (((long) data[index + 6] & 0xff) << 48) + | (((long) data[index + 7] & 0xff) << 56); + } + + public static long hash64(final byte[] data, final int length, final int seed) { + long h = (seed & 0xffffffffL) ^ (length * M64); + final int nblocks = length >> 3; + + // body + for (int i = 0; i < nblocks; i++) { + final int index = (i << 3); + long k = getLittleEndianLong(data, index); + + k *= M64; + k ^= k >>> R64; + k *= M64; + + h ^= k; + h *= M64; + } + + final int index = (nblocks << 3); + switch (length - index) { // CHECKSTYLE IGNORE THIS LINE: missing switch default + case 7: + h ^= ((long) 
data[index + 6] & 0xff) << 48; + case 6: // CHECKSTYLE IGNORE THIS LINE: fall through + h ^= ((long) data[index + 5] & 0xff) << 40; + case 5: // CHECKSTYLE IGNORE THIS LINE: fall through + h ^= ((long) data[index + 4] & 0xff) << 32; + case 4: // CHECKSTYLE IGNORE THIS LINE: fall through + h ^= ((long) data[index + 3] & 0xff) << 24; + case 3: // CHECKSTYLE IGNORE THIS LINE: fall through + h ^= ((long) data[index + 2] & 0xff) << 16; + case 2: // CHECKSTYLE IGNORE THIS LINE: fall through + h ^= ((long) data[index + 1] & 0xff) << 8; + case 1: // CHECKSTYLE IGNORE THIS LINE: fall through + h ^= ((long) data[index] & 0xff); + h *= M64; + } + + h ^= h >>> R64; + h *= M64; + h ^= h >>> R64; + + return h; + } + + private void convertExplicitToRegister() { + assert this.type == HLL_DATA_EXPLICIT; + registers = new byte[HLL_REGISTERS_COUNT]; + for (Long value : hashSet) { + updateRegisters(value); + } + hashSet.clear(); + } + + private void updateRegisters(long hashValue) { + int idx; + // hash value less than zero means we get a unsigned long + // so need to transfer to BigInter to mod + if (hashValue < 0) { + BigInteger unint64HashValue = new BigInteger(Long.toUnsignedString(hashValue)); + unint64HashValue = unint64HashValue.mod(new BigInteger(Long.toUnsignedString(HLL_REGISTERS_COUNT))); + idx = unint64HashValue.intValue(); + } else { + idx = (int) (hashValue % HLL_REGISTERS_COUNT); + } + + hashValue >>>= HLL_COLUMN_PRECISION; + hashValue |= (1L << HLL_ZERO_COUNT_BITS); + byte firstOneBit = (byte) (getLongTailZeroNum(hashValue) + 1); + registers[idx] = registers[idx] > firstOneBit ? registers[idx] : firstOneBit; + } + + private void mergeRegisters(byte[] other) { + for (int i = 0; i < HLL_REGISTERS_COUNT; i++) { + this.registers[i] = this.registers[i] > other[i] ? 
this.registers[i] : other[i]; + } + } + + public void updateWithHash(Object value) { + byte[] v = StringUtils.getBytesUtf8(String.valueOf(value)); + update(hash64(v, v.length, SEED)); + } + + public void update(long hashValue) { + switch (this.type) { // CHECKSTYLE IGNORE THIS LINE: missing switch default + case HLL_DATA_EMPTY: + hashSet.add(hashValue); + type = HLL_DATA_EXPLICIT; + break; + case HLL_DATA_EXPLICIT: + if (hashSet.size() < HLL_EXPLICIT_INT64_NUM) { + hashSet.add(hashValue); + break; + } + convertExplicitToRegister(); + type = HLL_DATA_FULL; + case HLL_DATA_SPARSE: // CHECKSTYLE IGNORE THIS LINE: fall through + case HLL_DATA_FULL: + updateRegisters(hashValue); + break; + } + } + + public void merge(Hll other) { + if (other.type == HLL_DATA_EMPTY) { + return; + } + switch (this.type) { // CHECKSTYLE IGNORE THIS LINE: missing switch default + case HLL_DATA_EMPTY: + this.type = other.type; + switch (other.type) { // CHECKSTYLE IGNORE THIS LINE: missing switch default + case HLL_DATA_EXPLICIT: + this.hashSet.addAll(other.hashSet); + break; + case HLL_DATA_SPARSE: + case HLL_DATA_FULL: + this.registers = new byte[HLL_REGISTERS_COUNT]; + System.arraycopy(other.registers, 0, this.registers, 0, HLL_REGISTERS_COUNT); + break; + } + break; + case HLL_DATA_EXPLICIT: + switch (other.type) { // CHECKSTYLE IGNORE THIS LINE: missing switch default + case HLL_DATA_EXPLICIT: + this.hashSet.addAll(other.hashSet); + if (this.hashSet.size() > HLL_EXPLICIT_INT64_NUM) { + convertExplicitToRegister(); + this.type = HLL_DATA_FULL; + } + break; + case HLL_DATA_SPARSE: + case HLL_DATA_FULL: + convertExplicitToRegister(); + mergeRegisters(other.registers); + this.type = HLL_DATA_FULL; + break; + } + break; + case HLL_DATA_SPARSE: + case HLL_DATA_FULL: + switch (other.type) { // CHECKSTYLE IGNORE THIS LINE: missing switch default + case HLL_DATA_EXPLICIT: + for (long value : other.hashSet) { + update(value); + } + break; + case HLL_DATA_SPARSE: + case HLL_DATA_FULL: + mergeRegisters(other.registers); + break; + } + break; + } + } + + public void serialize(DataOutput output) throws IOException { + switch (type) { // CHECKSTYLE IGNORE THIS LINE: missing switch default + case HLL_DATA_EMPTY: + output.writeByte(type); + break; + case HLL_DATA_EXPLICIT: + output.writeByte(type); + output.writeByte(hashSet.size()); + for (long value : hashSet) { + output.writeLong(Long.reverseBytes(value)); + } + break; + case HLL_DATA_SPARSE: + case HLL_DATA_FULL: + int nonZeroRegisterNum = 0; + for (int i = 0; i < HLL_REGISTERS_COUNT; i++) { + if (registers[i] != 0) { + nonZeroRegisterNum++; + } + } + if (nonZeroRegisterNum > HLL_SPARSE_THRESHOLD) { + output.writeByte(HLL_DATA_FULL); + for (byte value : registers) { + output.writeByte(value); + } + } else { + output.writeByte(HLL_DATA_SPARSE); + output.writeInt(Integer.reverseBytes(nonZeroRegisterNum)); + for (int i = 0; i < HLL_REGISTERS_COUNT; i++) { + if (registers[i] != 0) { + output.writeShort(Short.reverseBytes((short) i)); + output.writeByte(registers[i]); + } + } + } + break; + } + } + + public boolean deserialize(DataInput input) throws IOException { + assert type == HLL_DATA_EMPTY; + + if (input == null) { + return false; + } + + this.type = input.readByte(); + switch (this.type) { + case HLL_DATA_EMPTY: + break; + case HLL_DATA_EXPLICIT: + int hashSetSize = input.readUnsignedByte(); + for (int i = 0; i < hashSetSize; i++) { + update(Long.reverseBytes(input.readLong())); + } + assert this.type == HLL_DATA_EXPLICIT; + break; + case HLL_DATA_SPARSE: + int 
sparseDataSize = Integer.reverseBytes(input.readInt()); + this.registers = new byte[HLL_REGISTERS_COUNT]; + for (int i = 0; i < sparseDataSize; i++) { + int idx = Short.reverseBytes(input.readShort()); + byte value = input.readByte(); + registers[idx] = value; + } + break; + case HLL_DATA_FULL: + this.registers = new byte[HLL_REGISTERS_COUNT]; + for (int i = 0; i < HLL_REGISTERS_COUNT; i++) { + registers[i] = input.readByte(); + } + break; + default: + return false; + } + + return true; + } + + // use strictfp to force java follow IEEE 754 to deal float point strictly + public strictfp long estimateCardinality() { + if (type == HLL_DATA_EMPTY) { + return 0; + } + if (type == HLL_DATA_EXPLICIT) { + return hashSet.size(); + } + + int numStreams = HLL_REGISTERS_COUNT; + float alpha = 0; + + if (numStreams == 16) { + alpha = 0.673f; + } else if (numStreams == 32) { + alpha = 0.697f; + } else if (numStreams == 64) { + alpha = 0.709f; + } else { + alpha = 0.7213f / (1 + 1.079f / numStreams); + } + + float harmonicMean = 0; + int numZeroRegisters = 0; + + for (int i = 0; i < HLL_REGISTERS_COUNT; i++) { + harmonicMean += Math.pow(2.0f, -registers[i]); + + if (registers[i] == 0) { + numZeroRegisters++; + } + } + + harmonicMean = 1.0f / harmonicMean; + double estimate = alpha * numStreams * numStreams * harmonicMean; + + if (estimate <= numStreams * 2.5 && numZeroRegisters != 0) { + estimate = numStreams * Math.log(((float) numStreams) / ((float) numZeroRegisters)); + } else if (numStreams == 16384 && estimate < 72000) { + double bias = 5.9119 * 1.0e-18 * (estimate * estimate * estimate * estimate) + - 1.4253 * 1.0e-12 * (estimate * estimate * estimate) + + 1.2940 * 1.0e-7 * (estimate * estimate) + - 5.2921 * 1.0e-3 * estimate + + 83.3216; + estimate -= estimate * (bias / 100); + } + + return (long) (estimate + 0.5); + } + + public int maxSerializedSize() { + switch (type) { + case HLL_DATA_EMPTY: + default: + return 1; + case HLL_DATA_EXPLICIT: + return 2 + hashSet.size() * 8; + case HLL_DATA_SPARSE: + case HLL_DATA_FULL: + return 1 + HLL_REGISTERS_COUNT; + } + } + + // just for ut + public int getType() { + return type; + } + + // For convert to statistics used Hll128 + public byte[] getRegisters() { + return registers; + } + + // For convert to statistics used Hll128 + public Set getHashSet() { + return hashSet; + } +} \ No newline at end of file diff --git a/spark-load/spark-load-common/src/main/java/org/apache/doris/common/io/Roaring64Map.java b/spark-load/spark-load-common/src/main/java/org/apache/doris/common/io/Roaring64Map.java new file mode 100644 index 00000000..85db5853 --- /dev/null +++ b/spark-load/spark-load-common/src/main/java/org/apache/doris/common/io/Roaring64Map.java @@ -0,0 +1,1412 @@ +package org.apache.doris.common.io; + +import org.roaringbitmap.BitmapDataProvider; +import org.roaringbitmap.BitmapDataProviderSupplier; +import org.roaringbitmap.IntConsumer; +import org.roaringbitmap.IntIterator; +import org.roaringbitmap.InvalidRoaringFormat; +import org.roaringbitmap.RoaringBitmap; +import org.roaringbitmap.RoaringBitmapSupplier; +import org.roaringbitmap.Util; +import org.roaringbitmap.buffer.MutableRoaringBitmap; +import org.roaringbitmap.longlong.ImmutableLongBitmapDataProvider; +import org.roaringbitmap.longlong.LongConsumer; +import org.roaringbitmap.longlong.LongIterator; + +import java.io.DataInput; +import java.io.DataOutput; +import java.io.IOException; +import java.math.BigInteger; +import java.util.AbstractMap; +import java.util.Arrays; +import 
java.util.Comparator; +import java.util.Iterator; +import java.util.Map; +import java.util.NavigableMap; +import java.util.Objects; +import java.util.SortedMap; +import java.util.TreeMap; + +public class Roaring64Map { + + private static final boolean DEFAULT_ORDER_IS_SIGNED = false; + private static final boolean DEFAULT_CARDINALITIES_ARE_CACHED = true; + /** + * the constant 2^64 + */ + private static final BigInteger TWO_64 = BigInteger.ONE.shiftLeft(64); + // Not final to enable initialization in Externalizable.readObject + private NavigableMap highToBitmap; + // If true, we handle longs a plain java longs: -1 if right before 0 + // If false, we handle longs as unsigned longs: 0 has no predecessor and Long.MAX_VALUE + 1L is + // expressed as a + // negative long + private boolean signedLongs = false; + private BitmapDataProviderSupplier supplier; + // By default, we cache cardinalities + private transient boolean doCacheCardinalities = true; + // Prevent recomputing all cardinalities when requesting consecutive ranks + private transient int firstHighNotValid = highestHigh() + 1; + // This boolean needs firstHighNotValid == Integer.MAX_VALUE to be allowed to be true + // If false, it means nearly all cumulated cardinalities are valid, except high=Integer.MAX_VALUE + // If true, it means all cumulated cardinalities are valid, even high=Integer.MAX_VALUE + private transient boolean allValid = false; + // TODO: I would prefer not managing arrays myself + private transient long[] sortedCumulatedCardinality = new long[0]; + private transient int[] sortedHighs = new int[0]; + // We guess consecutive .addLong will be on proximate longs: we remember the bitmap attached to + // this bucket in order + // to skip the indirection + private transient Map.Entry latestAddedHigh = null; + + /** + * By default, we consider longs are unsigned longs: normal longs: 0 is the lowest possible long. + * Long.MAX_VALUE is followed by Long.MIN_VALUE. -1L is the highest possible value + */ + public Roaring64Map() { + this(DEFAULT_ORDER_IS_SIGNED); + } + + /** + * By default, use RoaringBitmap as underlyings {@link BitmapDataProvider} + * + * @param signedLongs true if longs has to be ordered as plain java longs. False to handle them as + * unsigned 64bits long (as RoaringBitmap with unsigned integers) + */ + public Roaring64Map(boolean signedLongs) { + this(signedLongs, DEFAULT_CARDINALITIES_ARE_CACHED); + } + + /** + * By default, use RoaringBitmap as underlyings {@link BitmapDataProvider} + * + * @param signedLongs true if longs has to be ordered as plain java longs. False to handle them as + * unsigned 64bits long (as RoaringBitmap with unsigned integers) + * @param cacheCardinalities true if cardinalities have to be cached. It will prevent many + * iteration along the NavigableMap + */ + public Roaring64Map(boolean signedLongs, boolean cacheCardinalities) { + this(signedLongs, cacheCardinalities, new RoaringBitmapSupplier()); + } + + /** + * By default, longs are managed as unsigned longs and cardinalities are cached. + * + * @param supplier provide the logic to instantiate new {@link BitmapDataProvider}, typically + * instantiated once per high. + */ + public Roaring64Map(BitmapDataProviderSupplier supplier) { + this(DEFAULT_ORDER_IS_SIGNED, DEFAULT_CARDINALITIES_ARE_CACHED, supplier); + } + + /** + * By default, we activating cardinalities caching. + * + * @param signedLongs true if longs has to be ordered as plain java longs. 
False to handle them as + * unsigned 64bits long (as RoaringBitmap with unsigned integers) + * @param supplier provide the logic to instantiate new {@link BitmapDataProvider}, typically + * instantiated once per high. + */ + public Roaring64Map(boolean signedLongs, BitmapDataProviderSupplier supplier) { + this(signedLongs, DEFAULT_CARDINALITIES_ARE_CACHED, supplier); + } + + /** + * @param signedLongs true if longs has to be ordered as plain java longs. False to handle them as + * unsigned 64bits long (as RoaringBitmap with unsigned integers) + * @param cacheCardinalities true if cardinalities have to be cached. It will prevent many + * iteration along the NavigableMap + * @param supplier provide the logic to instantiate new {@link BitmapDataProvider}, typically + * instantiated once per high. + */ + public Roaring64Map(boolean signedLongs, boolean cacheCardinalities, + BitmapDataProviderSupplier supplier) { + this.signedLongs = signedLongs; + this.supplier = supplier; + + if (signedLongs) { + highToBitmap = new TreeMap<>(); + } else { + highToBitmap = new TreeMap<>(unsignedComparator()); + } + + this.doCacheCardinalities = cacheCardinalities; + resetPerfHelpers(); + } + + // From Arrays.binarySearch (Comparator). Check with org.roaringbitmap.Util.unsignedBinarySearch + private static int unsignedBinarySearch(int[] a, int fromIndex, int toIndex, int key, + Comparator c) { + int low = fromIndex; + int high = toIndex - 1; + + while (low <= high) { + int mid = (low + high) >>> 1; + int midVal = a[mid]; + int cmp = c.compare(midVal, key); + if (cmp < 0) { + low = mid + 1; + } else if (cmp > 0) { + high = mid - 1; + } else { + return mid; // key found + } + } + return -(low + 1); // key not found. + } + + /** + * Generate a bitmap with the specified values set to true. The provided longs values don't have + * to be in sorted order, but it may be preferable to sort them from a performance point of view. + * + * @param dat set values + * @return a new bitmap + */ + public static Roaring64Map bitmapOf(final long... dat) { + final Roaring64Map ans = new Roaring64Map(); + ans.add(dat); + return ans; + } + + /** + * @param id any long, positive or negative + * @return an int holding the 32 highest order bits of information of the input long + */ + public static int high(long id) { + return (int) (id >> 32); + } + + /** + * @param id any long, positive or negative + * @return an int holding the 32 lowest order bits of information of the input long + */ + public static int low(long id) { + return (int) id; + } + + /** + * @param high an integer representing the highest order bits of the output long + * @param low an integer representing the lowest order bits of the output long + * @return a long packing together the integers as computed by + * {@link #high(long)} and {@link #low(long)} + */ + // https://stackoverflow.com/questions/12772939/java-storing-two-ints-in-a-long + public static long pack(int high, int low) { + return (((long) high) << 32) | (low & 0xffffffffL); + } + + /** + * @param signedLongs true if long put in a {@link Roaring64Map} should be considered as + * signed long. 
+ * @return the int representing the highest value which can be set as high value in a + */ + public static int highestHigh(boolean signedLongs) { + if (signedLongs) { + return Integer.MAX_VALUE; + } else { + return -1; + } + } + + /** + * @return A comparator for unsigned longs: a negative long is a long greater than Long.MAX_VALUE + */ + public static Comparator unsignedComparator() { + return new Comparator() { + + @Override + public int compare(Integer o1, Integer o2) { + return compareUnsigned(o1, o2); + } + }; + } + + /** + * Compares two {@code int} values numerically treating the values as unsigned. + * + * @param x the first {@code int} to compare + * @param y the second {@code int} to compare + * @return the value {@code 0} if {@code x == y}; a value less than {@code 0} if {@code x < y} as + * unsigned values; and a value greater than {@code 0} if {@code x > y} as unsigned values + * @since 1.8 + */ + // Duplicated from jdk8 Integer.compareUnsigned + public static int compareUnsigned(int x, int y) { + return Integer.compare(x + Integer.MIN_VALUE, y + Integer.MIN_VALUE); + } + + /** + * JDK8 Long.toUnsignedString was too complex to backport. Go for a slow version relying on + * BigInteger + */ + // https://stackoverflow.com/questions/7031198/java-signed-long-to-unsigned-long-string + static String toUnsignedString(long l) { + BigInteger b = BigInteger.valueOf(l); + if (b.signum() < 0) { + b = b.add(TWO_64); + } + return b.toString(); + } + + private void resetPerfHelpers() { + firstHighNotValid = highestHigh(signedLongs) + 1; + allValid = false; + + sortedCumulatedCardinality = new long[0]; + sortedHighs = new int[0]; + + latestAddedHigh = null; + } + + // Package-friendly: for the sake of unit-testing + // @VisibleForTesting + NavigableMap getHighToBitmap() { + return highToBitmap; + } + + // Package-friendly: for the sake of unit-testing + // @VisibleForTesting + int getLowestInvalidHigh() { + return firstHighNotValid; + } + + // Package-friendly: for the sake of unit-testing + // @VisibleForTesting + long[] getSortedCumulatedCardinality() { + return sortedCumulatedCardinality; + } + + /** + * Add the value to the container (set the value to "true"), whether it already appears or not. + *

+ * Java lacks native unsigned longs but the x argument is considered to be unsigned. Within + * bitmaps, numbers are ordered according to {@link Long#compareUnsigned}. We order the numbers + * like 0, 1, ..., 9223372036854775807, -9223372036854775808, -9223372036854775807,..., -1. + * + * @param x long value + */ + public void addLong(long x) { + int high = high(x); + int low = low(x); + + // Copy the reference to prevent race-condition + Map.Entry local = latestAddedHigh; + + BitmapDataProvider bitmap; + if (local != null && local.getKey().intValue() == high) { + bitmap = local.getValue(); + } else { + bitmap = highToBitmap.get(high); + if (bitmap == null) { + bitmap = newRoaringBitmap(); + pushBitmapForHigh(high, bitmap); + } + latestAddedHigh = new AbstractMap.SimpleImmutableEntry<>(high, bitmap); + } + bitmap.add(low); + + invalidateAboveHigh(high); + } + + /** + * Add the integer value to the container (set the value to "true"), whether it already appears or + * not. + *
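A minimal usage sketch of the unsigned ordering described above (illustrative only; values are arbitrary):

    Roaring64Map bitmap = new Roaring64Map();   // defaults: unsigned order, cached cardinalities
    bitmap.addLong(1L);
    bitmap.addLong(-1L);                        // stored as 2^64 - 1, the unsigned-largest value
    bitmap.addInt(-1);                          // widened to 4294967295L, not to -1L
    // bitmap.getLongCardinality() == 3 and bitmap.contains(4294967295L) == true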

+ * Javac lacks native unsigned integers but the x argument is considered to be unsigned. Within + * bitmaps, numbers are ordered according to {@link Integer#compareUnsigned}. We order the numbers + * like 0, 1, ..., 2147483647, -2147483648, -2147483647,..., -1. + * + * @param x integer value + */ + public void addInt(int x) { + addLong(Util.toUnsignedLong(x)); + } + + private BitmapDataProvider newRoaringBitmap() { + return supplier.newEmpty(); + } + + private void invalidateAboveHigh(int high) { + // The cardinalities after this bucket may not be valid anymore + if (compare(firstHighNotValid, high) > 0) { + // High was valid up to now + firstHighNotValid = high; + + int indexNotValid = binarySearch(sortedHighs, firstHighNotValid); + + final int indexAfterWhichToReset; + if (indexNotValid >= 0) { + indexAfterWhichToReset = indexNotValid; + } else { + // We have invalidate a high not already present: added a value for a brand new high + indexAfterWhichToReset = -indexNotValid - 1; + } + + // This way, sortedHighs remains sorted, without making a new/shorter array + Arrays.fill(sortedHighs, indexAfterWhichToReset, sortedHighs.length, highestHigh()); + } + allValid = false; + } + + private int compare(int x, int y) { + if (signedLongs) { + return Integer.compare(x, y); + } else { + return compareUnsigned(x, y); + } + } + + private void pushBitmapForHigh(int high, BitmapDataProvider bitmap) { + // TODO .size is too slow + // int nbHighBefore = highToBitmap.headMap(high).size(); + + BitmapDataProvider previous = highToBitmap.put(high, bitmap); + assert previous == null : "Should push only not-existing high"; + } + + /** + * Returns the number of distinct integers added to the bitmap (e.g., number of bits set). + * + * @return the cardinality + */ + public long getLongCardinality() { + if (doCacheCardinalities) { + if (highToBitmap.isEmpty()) { + return 0L; + } + int indexOk = ensureCumulatives(highestHigh()); + + // ensureCumulatives may have removed empty bitmaps + if (highToBitmap.isEmpty()) { + return 0L; + } + + + return sortedCumulatedCardinality[indexOk - 1]; + } else { + long cardinality = 0L; + for (BitmapDataProvider bitmap : highToBitmap.values()) { + cardinality += bitmap.getLongCardinality(); + } + return cardinality; + } + } + + /** + * @return the cardinality as an int + * @throws UnsupportedOperationException if the cardinality does not fit in an int + */ + public int getIntCardinality() throws UnsupportedOperationException { + long cardinality = getLongCardinality(); + + if (cardinality > Integer.MAX_VALUE) { + // TODO: we should handle cardinality fitting in an unsigned int + throw new UnsupportedOperationException( + "Can not call .getIntCardinality as the cardinality is bigger than Integer.MAX_VALUE"); + } + + return (int) cardinality; + } + + /** + * Return the jth value stored in this bitmap. 
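As a small worked example of select and rank under unsigned ordering (values arbitrary):

    Roaring64Map m = Roaring64Map.bitmapOf(10L, 20L, -1L);
    // unsigned order is 10, 20, 18446744073709551615 (that is, -1L)
    m.select(0);      // 10
    m.select(2);      // -1L, the unsigned-largest stored value
    m.rankLong(20L);  // 2, since two stored values are <= 20 in unsigned order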
+ * + * @param j index of the value + * @return the value + * @throws IllegalArgumentException if j is out of the bounds of the bitmap cardinality + */ + public long select(final long j) throws IllegalArgumentException { + if (!doCacheCardinalities) { + return selectNoCache(j); + } + + // Ensure all cumulatives as we we have straightforward way to know in advance the high of the + // j-th value + int indexOk = ensureCumulatives(highestHigh()); + + if (highToBitmap.isEmpty()) { + return throwSelectInvalidIndex(j); + } + + // Use normal binarySearch as cardinality does not depends on considering longs signed or + // unsigned + // We need sortedCumulatedCardinality not to contain duplicated, else binarySearch may return + // any of the duplicates: we need to ensure it holds no high associated to an empty bitmap + int position = Arrays.binarySearch(sortedCumulatedCardinality, 0, indexOk, j); + + if (position >= 0) { + if (position == indexOk - 1) { + // .select has been called on this.getCardinality + return throwSelectInvalidIndex(j); + } + + // There is a bucket leading to this cardinality: the j-th element is the first element of + // next bucket + int high = sortedHighs[position + 1]; + BitmapDataProvider nextBitmap = highToBitmap.get(high); + return pack(high, nextBitmap.select(0)); + } else { + // There is no bucket with this cardinality + int insertionPoint = -position - 1; + + final long previousBucketCardinality; + if (insertionPoint == 0) { + previousBucketCardinality = 0L; + } else if (insertionPoint >= indexOk) { + return throwSelectInvalidIndex(j); + } else { + previousBucketCardinality = sortedCumulatedCardinality[insertionPoint - 1]; + } + + // We get a 'select' query for a single bitmap: should fit in an int + final int givenBitmapSelect = (int) (j - previousBucketCardinality); + + int high = sortedHighs[insertionPoint]; + BitmapDataProvider lowBitmap = highToBitmap.get(high); + int low = lowBitmap.select(givenBitmapSelect); + + return pack(high, low); + } + } + + // For benchmarks: compute without using cardinalities cache + // https://github.com/RoaringBitmap/CRoaring/blob/master/cpp/roaring64map.hh + private long selectNoCache(long j) { + long left = j; + + for (Map.Entry entry : highToBitmap.entrySet()) { + long lowCardinality = entry.getValue().getCardinality(); + + if (left >= lowCardinality) { + left -= lowCardinality; + } else { + // It is legit for left to be negative + int leftAsUnsignedInt = (int) left; + return pack(entry.getKey(), entry.getValue().select(leftAsUnsignedInt)); + } + } + + return throwSelectInvalidIndex(j); + } + + private long throwSelectInvalidIndex(long j) { + // see org.roaringbitmap.buffer.ImmutableRoaringBitmap.select(int) + throw new IllegalArgumentException( + "select " + j + " when the cardinality is " + this.getLongCardinality()); + } + + /** + * For better performance, consider the Use the {@link #forEach forEach} method. + * + * @return a custom iterator over set bits, the bits are traversed in ascending sorted order + */ + public Iterator iterator() { + final LongIterator it = getLongIterator(); + + return new Iterator() { + + @Override + public boolean hasNext() { + return it.hasNext(); + } + + @Override + public Long next() { + return it.next(); + } + + @Override + public void remove() { + // TODO? 
+ throw new UnsupportedOperationException(); + } + }; + } + + public void forEach(final LongConsumer lc) { + for (final Map.Entry highEntry : highToBitmap.entrySet()) { + highEntry.getValue().forEach(new IntConsumer() { + + @Override + public void accept(int low) { + lc.accept(pack(highEntry.getKey(), low)); + } + }); + } + } + + public long rankLong(long id) { + int high = high(id); + int low = low(id); + + if (!doCacheCardinalities) { + return rankLongNoCache(high, low); + } + + int indexOk = ensureCumulatives(high); + + int highPosition = binarySearch(sortedHighs, 0, indexOk, high); + + if (highPosition >= 0) { + // There is a bucket holding this item + + final long previousBucketCardinality; + if (highPosition == 0) { + previousBucketCardinality = 0; + } else { + previousBucketCardinality = sortedCumulatedCardinality[highPosition - 1]; + } + + BitmapDataProvider lowBitmap = highToBitmap.get(sortedHighs[highPosition]); + + // Rank is previous cardinality plus rank in current bitmap + return previousBucketCardinality + lowBitmap.rankLong(low); + } else { + // There is no bucket holding this item: insertionPoint is previous bitmap + int insertionPoint = -highPosition - 1; + + if (insertionPoint == 0) { + // this key is before all inserted keys + return 0; + } else { + // The rank is the cardinality of this previous bitmap + return sortedCumulatedCardinality[insertionPoint - 1]; + } + } + } + + // https://github.com/RoaringBitmap/CRoaring/blob/master/cpp/roaring64map.hh + private long rankLongNoCache(int high, int low) { + long result = 0L; + + BitmapDataProvider lastBitmap = highToBitmap.get(high); + if (lastBitmap == null) { + // There is no value with same high: the rank is a sum of cardinalities + for (Map.Entry bitmap : highToBitmap.entrySet()) { + if (bitmap.getKey().intValue() > high) { + break; + } else { + result += bitmap.getValue().getLongCardinality(); + } + } + } else { + for (BitmapDataProvider bitmap : highToBitmap.values()) { + if (bitmap == lastBitmap) { + result += bitmap.rankLong(low); + break; + } else { + result += bitmap.getLongCardinality(); + } + } + } + + return result; + } + + /** + * @param high for which high bucket should we compute the cardinality + * @return the highest validatedIndex + */ + protected int ensureCumulatives(int high) { + if (allValid) { + // the whole array is valid (up-to its actual length, not its capacity) + return highToBitmap.size(); + } else if (compare(high, firstHighNotValid) < 0) { + // The high is strictly below the first not valid: it is valid + + // sortedHighs may have only a subset of valid values on the right. 
However, these invalid + // values have been set to maxValue, and we are here as high < firstHighNotValid ==> high < + // maxHigh() + int position = binarySearch(sortedHighs, high); + + if (position >= 0) { + // This high has a bitmap: +1 as this index will be used as right (excluded) bound in a + // binary-search + return position + 1; + } else { + // This high has no bitmap: it could be between 2 highs with bitmaps + int insertionPosition = -position - 1; + return insertionPosition; + } + } else { + + // For each deprecated buckets + SortedMap tailMap = + highToBitmap.tailMap(firstHighNotValid, true); + + // TODO .size on tailMap make an iterator: arg + int indexOk = highToBitmap.size() - tailMap.size(); + + // TODO: It should be possible to compute indexOk based on sortedHighs array + // assert indexOk == binarySearch(sortedHighs, firstHighNotValid); + + Iterator> it = tailMap.entrySet().iterator(); + while (it.hasNext()) { + Map.Entry e = it.next(); + int currentHigh = e.getKey(); + + if (compare(currentHigh, high) > 0) { + // No need to compute more than needed + break; + } else if (e.getValue().isEmpty()) { + // highToBitmap can not be modified as we iterate over it + if (latestAddedHigh != null && latestAddedHigh.getKey().intValue() == currentHigh) { + // Dismiss the cached bitmap as it is removed from the NavigableMap + latestAddedHigh = null; + } + it.remove(); + } else { + ensureOne(e, currentHigh, indexOk); + + // We have added one valid cardinality + indexOk++; + } + + } + + if (highToBitmap.isEmpty() || indexOk == highToBitmap.size()) { + // We have compute all cardinalities + allValid = true; + } + + return indexOk; + } + } + + private int binarySearch(int[] array, int key) { + if (signedLongs) { + return Arrays.binarySearch(array, key); + } else { + return unsignedBinarySearch(array, 0, array.length, key, + unsignedComparator()); + } + } + + private int binarySearch(int[] array, int from, int to, int key) { + if (signedLongs) { + return Arrays.binarySearch(array, from, to, key); + } else { + return unsignedBinarySearch(array, from, to, key, unsignedComparator()); + } + } + + private void ensureOne(Map.Entry e, int currentHigh, int indexOk) { + // sortedHighs are valid only up to some index + assert indexOk <= sortedHighs.length : indexOk + " is bigger than " + sortedHighs.length; + + final int index; + if (indexOk == 0) { + if (sortedHighs.length == 0) { + index = -1; + // } else if (sortedHighs[0] == currentHigh) { + // index = 0; + } else { + index = -1; + } + } else if (indexOk < sortedHighs.length) { + index = -indexOk - 1; + } else { + index = -sortedHighs.length - 1; + } + assert index == binarySearch(sortedHighs, 0, indexOk, currentHigh) : "Computed " + index + + " differs from dummy binary-search index: " + + binarySearch(sortedHighs, 0, indexOk, currentHigh); + + if (index >= 0) { + // This would mean calling .ensureOne is useless: should never got here at the first time + throw new IllegalStateException("Unexpectedly found " + currentHigh + " in " + + Arrays.toString(sortedHighs) + " strictly before index" + indexOk); + } else { + int insertionPosition = -index - 1; + + // This is a new key + if (insertionPosition >= sortedHighs.length) { + int previousSize = sortedHighs.length; + + // TODO softer growing factor + int newSize = Math.min(Integer.MAX_VALUE, sortedHighs.length * 2 + 1); + + // Insertion at the end + sortedHighs = Arrays.copyOf(sortedHighs, newSize); + sortedCumulatedCardinality = Arrays.copyOf(sortedCumulatedCardinality, newSize); + + // Not actually 
needed. But simplify the reading of array content + Arrays.fill(sortedHighs, previousSize, sortedHighs.length, highestHigh()); + Arrays.fill(sortedCumulatedCardinality, previousSize, sortedHighs.length, Long.MAX_VALUE); + } + sortedHighs[insertionPosition] = currentHigh; + + final long previousCardinality; + if (insertionPosition >= 1) { + previousCardinality = sortedCumulatedCardinality[insertionPosition - 1]; + } else { + previousCardinality = 0; + } + + sortedCumulatedCardinality[insertionPosition] = + previousCardinality + e.getValue().getLongCardinality(); + + if (currentHigh == highestHigh()) { + // We are already on the highest high. Do not set allValid as it is set anyway out of the + // loop + firstHighNotValid = currentHigh; + } else { + // The first not valid is the next high + // TODO: The entry comes from a NavigableMap: it may be quite cheap to know the next high + firstHighNotValid = currentHigh + 1; + } + } + } + + private int highestHigh() { + return highestHigh(signedLongs); + } + + /** + * In-place bitwise OR (union) operation. The current bitmap is modified. + * + * @param x2 other bitmap + */ + public void or(final Roaring64Map x2) { + boolean firstBucket = true; + + for (Map.Entry e2 : x2.highToBitmap.entrySet()) { + // Keep object to prevent auto-boxing + Integer high = e2.getKey(); + + BitmapDataProvider lowBitmap1 = this.highToBitmap.get(high); + + BitmapDataProvider lowBitmap2 = e2.getValue(); + + // TODO Reviewers: is it a good idea to rely on BitmapDataProvider except in methods + // expecting an actual MutableRoaringBitmap? + // TODO This code may lead to closing a buffer Bitmap in current Navigable even if current is + // not on buffer + if ((lowBitmap1 == null || lowBitmap1 instanceof RoaringBitmap) + && lowBitmap2 instanceof RoaringBitmap) { + if (lowBitmap1 == null) { + // Clone to prevent future modification of this modifying the input Bitmap + RoaringBitmap lowBitmap2Clone = ((RoaringBitmap) lowBitmap2).clone(); + + pushBitmapForHigh(high, lowBitmap2Clone); + } else { + ((RoaringBitmap) lowBitmap1).or((RoaringBitmap) lowBitmap2); + } + } else if ((lowBitmap1 == null || lowBitmap1 instanceof MutableRoaringBitmap) + && lowBitmap2 instanceof MutableRoaringBitmap) { + if (lowBitmap1 == null) { + // Clone to prevent future modification of this modifying the input Bitmap + BitmapDataProvider lowBitmap2Clone = ((MutableRoaringBitmap) lowBitmap2).clone(); + pushBitmapForHigh(high, lowBitmap2Clone); + } else { + ((MutableRoaringBitmap) lowBitmap1).or((MutableRoaringBitmap) lowBitmap2); + } + } else { + throw new UnsupportedOperationException( + ".or is not between " + this.getClass() + " and " + lowBitmap2.getClass()); + } + + if (firstBucket) { + firstBucket = false; + + // Invalidate the lowest high as lowest not valid + firstHighNotValid = Math.min(firstHighNotValid, high); + allValid = false; + } + } + } + + /** + * In-place bitwise XOR (symmetric difference) operation. The current bitmap is modified. + * + * @param x2 other bitmap + */ + public void xor(final Roaring64Map x2) { + boolean firstBucket = true; + + for (Map.Entry e2 : x2.highToBitmap.entrySet()) { + // Keep object to prevent auto-boxing + Integer high = e2.getKey(); + + BitmapDataProvider lowBitmap1 = this.highToBitmap.get(high); + + BitmapDataProvider lowBitmap2 = e2.getValue(); + + // TODO Reviewers: is it a good idea to rely on BitmapDataProvider except in methods + // expecting an actual MutableRoaringBitmap? 
+ // TODO This code may lead to closing a buffer Bitmap in current Navigable even if current is + // not on buffer + if ((lowBitmap1 == null || lowBitmap1 instanceof RoaringBitmap) + && lowBitmap2 instanceof RoaringBitmap) { + if (lowBitmap1 == null) { + // Clone to prevent future modification of this modifying the input Bitmap + RoaringBitmap lowBitmap2Clone = ((RoaringBitmap) lowBitmap2).clone(); + + pushBitmapForHigh(high, lowBitmap2Clone); + } else { + ((RoaringBitmap) lowBitmap1).xor((RoaringBitmap) lowBitmap2); + } + } else if ((lowBitmap1 == null || lowBitmap1 instanceof MutableRoaringBitmap) + && lowBitmap2 instanceof MutableRoaringBitmap) { + if (lowBitmap1 == null) { + // Clone to prevent future modification of this modifying the input Bitmap + BitmapDataProvider lowBitmap2Clone = ((MutableRoaringBitmap) lowBitmap2).clone(); + + pushBitmapForHigh(high, lowBitmap2Clone); + } else { + ((MutableRoaringBitmap) lowBitmap1).xor((MutableRoaringBitmap) lowBitmap2); + } + } else { + throw new UnsupportedOperationException( + ".or is not between " + this.getClass() + " and " + lowBitmap2.getClass()); + } + + if (firstBucket) { + firstBucket = false; + + // Invalidate the lowest high as lowest not valid + firstHighNotValid = Math.min(firstHighNotValid, high); + allValid = false; + } + } + } + + /** + * In-place bitwise AND (intersection) operation. The current bitmap is modified. + * + * @param x2 other bitmap + */ + public void and(final Roaring64Map x2) { + boolean firstBucket = true; + + Iterator> thisIterator = highToBitmap.entrySet().iterator(); + while (thisIterator.hasNext()) { + Map.Entry e1 = thisIterator.next(); + + // Keep object to prevent auto-boxing + Integer high = e1.getKey(); + + BitmapDataProvider lowBitmap2 = x2.highToBitmap.get(high); + + if (lowBitmap2 == null) { + // None of given high values are present in x2 + thisIterator.remove(); + } else { + BitmapDataProvider lowBitmap1 = e1.getValue(); + + if (lowBitmap2 instanceof RoaringBitmap && lowBitmap1 instanceof RoaringBitmap) { + ((RoaringBitmap) lowBitmap1).and((RoaringBitmap) lowBitmap2); + } else if (lowBitmap2 instanceof MutableRoaringBitmap + && lowBitmap1 instanceof MutableRoaringBitmap) { + ((MutableRoaringBitmap) lowBitmap1).and((MutableRoaringBitmap) lowBitmap2); + } else { + throw new UnsupportedOperationException( + ".and is not between " + this.getClass() + " and " + lowBitmap1.getClass()); + } + } + + if (firstBucket) { + firstBucket = false; + + // Invalidate the lowest high as lowest not valid + firstHighNotValid = Math.min(firstHighNotValid, high); + allValid = false; + } + } + } + + /** + * In-place bitwise ANDNOT (difference) operation. The current bitmap is modified. 
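A small worked sketch of the in-place set operations (or, xor, and, andNot; values arbitrary):

    Roaring64Map a = Roaring64Map.bitmapOf(1L, 2L, 3L);
    Roaring64Map b = Roaring64Map.bitmapOf(2L, 3L, 4L);
    a.andNot(b);      // a becomes {1}
    // starting again from a = {1,2,3}: a.and(b) -> {2,3}, a.or(b) -> {1,2,3,4}, a.xor(b) -> {1,4}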
+ * + * @param x2 other bitmap + */ + public void andNot(final Roaring64Map x2) { + boolean firstBucket = true; + + Iterator> thisIterator = highToBitmap.entrySet().iterator(); + while (thisIterator.hasNext()) { + Map.Entry e1 = thisIterator.next(); + + // Keep object to prevent auto-boxing + Integer high = e1.getKey(); + + BitmapDataProvider lowBitmap2 = x2.highToBitmap.get(high); + + if (lowBitmap2 != null) { + BitmapDataProvider lowBitmap1 = e1.getValue(); + + if (lowBitmap2 instanceof RoaringBitmap && lowBitmap1 instanceof RoaringBitmap) { + ((RoaringBitmap) lowBitmap1).andNot((RoaringBitmap) lowBitmap2); + } else if (lowBitmap2 instanceof MutableRoaringBitmap + && lowBitmap1 instanceof MutableRoaringBitmap) { + ((MutableRoaringBitmap) lowBitmap1).andNot((MutableRoaringBitmap) lowBitmap2); + } else { + throw new UnsupportedOperationException( + ".and is not between " + this.getClass() + " and " + lowBitmap1.getClass()); + } + } + + if (firstBucket) { + firstBucket = false; + + // Invalidate the lowest high as lowest not valid + firstHighNotValid = Math.min(firstHighNotValid, high); + allValid = false; + } + } + } + + /** + * A string describing the bitmap. + * + * @return the string + */ + @Override + public String toString() { + final StringBuilder answer = new StringBuilder(); + final LongIterator i = this.getLongIterator(); + answer.append("{"); + if (i.hasNext()) { + if (signedLongs) { + answer.append(i.next()); + } else { + answer.append(toUnsignedString(i.next())); + } + } + while (i.hasNext()) { + answer.append(","); + // to avoid using too much memory, we limit the size + if (answer.length() > 0x80000) { + answer.append("..."); + break; + } + if (signedLongs) { + answer.append(i.next()); + } else { + answer.append(toUnsignedString(i.next())); + } + + } + answer.append("}"); + return answer.toString(); + } + + /** + * For better performance, consider the Use the {@link #forEach forEach} method. + * + * @return a custom iterator over set bits, the bits are traversed in ascending sorted order + */ + public LongIterator getLongIterator() { + final Iterator> it = highToBitmap.entrySet().iterator(); + + return toIterator(it, false); + } + + protected LongIterator toIterator(final Iterator> it, + final boolean reversed) { + return new LongIterator() { + + protected int currentKey; + protected IntIterator currentIt; + + @Override + public boolean hasNext() { + if (currentIt == null) { + // Were initially empty + if (!moveToNextEntry(it)) { + return false; + } + } + + while (true) { + if (currentIt.hasNext()) { + return true; + } else { + if (!moveToNextEntry(it)) { + return false; + } + } + } + } + + /** + * + * @param it the underlying iterator which has to be moved to next long + * @return true if we MAY have more entries. 
false if there is definitely nothing more + */ + private boolean moveToNextEntry(Iterator> it) { + if (it.hasNext()) { + Map.Entry next = it.next(); + currentKey = next.getKey(); + if (reversed) { + currentIt = next.getValue().getReverseIntIterator(); + } else { + currentIt = next.getValue().getIntIterator(); + } + + // We may have more long + return true; + } else { + // We know there is nothing more + return false; + } + } + + @Override + public long next() { + if (hasNext()) { + return pack(currentKey, currentIt.next()); + } else { + throw new IllegalStateException("empty"); + } + } + + @Override + public LongIterator clone() { + throw new UnsupportedOperationException("TODO"); + } + }; + } + + public boolean contains(long x) { + int high = high(x); + BitmapDataProvider lowBitmap = highToBitmap.get(high); + if (lowBitmap == null) { + return false; + } + + int low = low(x); + return lowBitmap.contains(low); + } + + public int getSizeInBytes() { + return (int) getLongSizeInBytes(); + } + + public long getLongSizeInBytes() { + long size = 8; + + // Size of containers + size += highToBitmap.values().stream().mapToLong(p -> p.getLongSizeInBytes()).sum(); + + // Size of Map data-structure: we consider each TreeMap entry costs 40 bytes + // http://java-performance.info/memory-consumption-of-java-data-types-2/ + size += 8 + 40 * highToBitmap.size(); + + // Size of (boxed) Integers used as keys + size += 16 * highToBitmap.size(); + + // The cache impacts the size in heap + size += 8 * sortedCumulatedCardinality.length; + size += 4 * sortedHighs.length; + + return size; + } + + public boolean isEmpty() { + return getLongCardinality() == 0L; + } + + public ImmutableLongBitmapDataProvider limit(long x) { + throw new UnsupportedOperationException("TODO"); + } + + /** + * Use a run-length encoding where it is estimated as more space efficient + * + * @return whether a change was applied + */ + public boolean runOptimize() { + boolean hasChanged = false; + for (BitmapDataProvider lowBitmap : highToBitmap.values()) { + if (lowBitmap instanceof RoaringBitmap) { + hasChanged |= ((RoaringBitmap) lowBitmap).runOptimize(); + } else if (lowBitmap instanceof MutableRoaringBitmap) { + hasChanged |= ((MutableRoaringBitmap) lowBitmap).runOptimize(); + } + } + return hasChanged; + } + + public long serializedSizeInBytes() { + long nbBytes = 0L; + + // .writeBoolean for signedLongs boolean + nbBytes += 1; + + // .writeInt for number of different high values + nbBytes += 4; + + for (Map.Entry entry : highToBitmap.entrySet()) { + // .writeInt for high + nbBytes += 4; + + // The low bitmap size in bytes + nbBytes += entry.getValue().serializedSizeInBytes(); + } + + return nbBytes; + } + + /** + * reset to an empty bitmap; result occupies as much space a newly created bitmap. + */ + public void clear() { + this.highToBitmap.clear(); + resetPerfHelpers(); + } + + /** + * Return the set values as an array, if the cardinality is smaller than 2147483648. The long + * values are in sorted order. + * + * @return array representing the set values. 
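For example, given a populated Roaring64Map named bitmap (a sketch; the lambda form assumes a Java 8 target):

    if (bitmap.getLongCardinality() <= Integer.MAX_VALUE) {
        long[] values = bitmap.toArray();          // unsigned-sorted copy of all set values
    }
    bitmap.forEach(v -> System.out.println(v));    // streaming alternative that avoids the copy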
+ */ + public long[] toArray() { + long cardinality = this.getLongCardinality(); + if (cardinality > Integer.MAX_VALUE) { + throw new IllegalStateException("The cardinality does not fit in an array"); + } + + final long[] array = new long[(int) cardinality]; + + int pos = 0; + LongIterator it = getLongIterator(); + + while (it.hasNext()) { + array[pos++] = it.next(); + } + return array; + } + + /* ------------------ method below from Roaring64NavigableMap and being overwritten ----------------------------- */ + + /** + * Set all the specified values to true. This can be expected to be slightly faster than calling + * "add" repeatedly. The provided integers values don't have to be in sorted order, but it may be + * preferable to sort them from a performance point of view. + * + * @param dat set values + */ + public void add(long... dat) { + for (long oneLong : dat) { + addLong(oneLong); + } + } + + /** + * Add to the current bitmap all longs in [rangeStart,rangeEnd). + * + * @param rangeStart inclusive beginning of range + * @param rangeEnd exclusive ending of range + */ + public void add(final long rangeStart, final long rangeEnd) { + int startHigh = high(rangeStart); + int startLow = low(rangeStart); + + int endHigh = high(rangeEnd); + int endLow = low(rangeEnd); + + for (int high = startHigh; high <= endHigh; high++) { + final int currentStartLow; + if (startHigh == high) { + // The whole range starts in this bucket + currentStartLow = startLow; + } else { + // Add the bucket from the beginning + currentStartLow = 0; + } + + long startLowAsLong = Util.toUnsignedLong(currentStartLow); + + final long endLowAsLong; + if (endHigh == high) { + // The whole range ends in this bucket + endLowAsLong = Util.toUnsignedLong(endLow); + } else { + // Add the bucket until the end: we have a +1 as, in RoaringBitmap.add(long,long), the end + // is excluded + endLowAsLong = Util.toUnsignedLong(-1) + 1; + } + + if (endLowAsLong > startLowAsLong) { + // Initialize the bitmap only if there is access data to write + BitmapDataProvider bitmap = highToBitmap.get(high); + if (bitmap == null) { + bitmap = new MutableRoaringBitmap(); + pushBitmapForHigh(high, bitmap); + } + + if (bitmap instanceof RoaringBitmap) { + ((RoaringBitmap) bitmap).add(startLowAsLong, endLowAsLong); + } else if (bitmap instanceof MutableRoaringBitmap) { + ((MutableRoaringBitmap) bitmap).add(startLowAsLong, endLowAsLong); + } else { + throw new UnsupportedOperationException("TODO. 
Not for " + bitmap.getClass()); + } + } + } + + invalidateAboveHigh(startHigh); + } + + + + /*---------------------------- method below is new written for doris's own bitmap --------------------------------*/ + + public LongIterator getReverseLongIterator() { + return toIterator(highToBitmap.descendingMap().entrySet().iterator(), true); + } + + /*--------------- method below fetched from org.roaringbitmap.longlong RoaringIntPacking -----------------------*/ + + public void removeLong(long x) { + int high = high(x); + + BitmapDataProvider bitmap = highToBitmap.get(high); + + if (bitmap != null) { + int low = low(x); + bitmap.remove(low); + + // Invalidate only if actually modified + invalidateAboveHigh(high); + } + + } + + public void trim() { + for (BitmapDataProvider bitmap : highToBitmap.values()) { + bitmap.trim(); + } + } + + @Override + public int hashCode() { + return highToBitmap.hashCode(); + } + + @Override + public boolean equals(Object obj) { + if (this == obj) { + return true; + } + if (obj == null) { + return false; + } + if (getClass() != obj.getClass()) { + return false; + } + Roaring64Map other = (Roaring64Map) obj; + return Objects.equals(highToBitmap, other.highToBitmap); + } + + /** + * Add the value if it is not already present, otherwise remove it. + * + * @param x long value + */ + public void flip(final long x) { + int high = high(x); + BitmapDataProvider lowBitmap = highToBitmap.get(high); + if (lowBitmap == null) { + // The value is not added: add it without any flip specific code + addLong(x); + } else { + int low = low(x); + + // .flip is not in BitmapDataProvider contract + // TODO Is it relevant to calling .flip with a cast? + if (lowBitmap instanceof RoaringBitmap) { + ((RoaringBitmap) lowBitmap).flip(low); + } else if (lowBitmap instanceof MutableRoaringBitmap) { + ((MutableRoaringBitmap) lowBitmap).flip(low); + } else { + // Fallback to a manual flip + if (lowBitmap.contains(low)) { + lowBitmap.remove(low); + } else { + lowBitmap.add(low); + } + } + } + + invalidateAboveHigh(high); + } + + /** + * Serialize this bitmap. + *

+ * Unlike RoaringBitmap, there is no specification for now: it may change from one java version + * to another, and from one RoaringBitmap version to another. + *

+ * Consider calling {@link #runOptimize} before serialization to improve compression. + *
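A round-trip sketch using java.io streams; it assumes a non-empty bitmap (serialize writes nothing for an empty map) and that BitmapValue holds the type constants written as the leading byte, which the caller of deserialize is expected to have consumed:

    ByteArrayOutputStream bos = new ByteArrayOutputStream();
    bitmap.serialize(new DataOutputStream(bos));    // leading byte: BITMAP32 or BITMAP64

    DataInputStream dis = new DataInputStream(new ByteArrayInputStream(bos.toByteArray()));
    int bitmapType = dis.readByte();                // consume the type byte first
    Roaring64Map copy = new Roaring64Map();
    copy.deserialize(dis, bitmapType);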

+ * The current bitmap is not modified. + * + * @param out the DataOutput stream + * @throws IOException Signals that an I/O exception has occurred. + */ + public void serialize(DataOutput out) throws IOException { + if (highToBitmap.size() == 0) { + return; + } + if (is32BitsEnough()) { + out.write(BitmapValue.BITMAP32); + highToBitmap.get(0).serialize(out); + return; + } + + out.write(BitmapValue.BITMAP64); + Codec.encodeVarint64(highToBitmap.size(), out); + + for (Map.Entry entry : highToBitmap.entrySet()) { + // write the high part in little-endian order so the C++ BE can read it, avoiding issues when the value is larger than 32 bits + out.writeInt(Integer.reverseBytes(entry.getKey().intValue())); + entry.getValue().serialize(out); + } + } + + /** + * Deserialize (retrieve) this bitmap. + *

+ * Unlike RoaringBitmap, there is no specification for now: it may change from one java version to + * another, and from one RoaringBitmap version to another. + *

+ * The current bitmap is overwritten. + * + * @param in the DataInput stream + * @throws IOException Signals that an I/O exception has occurred. + */ + public void deserialize(DataInput in, int bitmapType) throws IOException { + this.clear(); + highToBitmap = new TreeMap<>(); + + if (bitmapType == BitmapValue.BITMAP32) { + RoaringBitmap provider = new RoaringBitmap(); + provider.deserialize(in); + highToBitmap.put(0, provider); + return; + } + + if (bitmapType != BitmapValue.BITMAP64) { + throw new InvalidRoaringFormat("invalid bitmap type"); + } + + long nbHighs = Codec.decodeVarint64(in); + for (int i = 0; i < nbHighs; i++) { + // keep the same behavior with little-end serialize + int high = Integer.reverseBytes(in.readInt()); + RoaringBitmap provider = new RoaringBitmap(); + provider.deserialize(in); + highToBitmap.put(high, provider); + } + + resetPerfHelpers(); + } + + public boolean is32BitsEnough() { + return highToBitmap.size() == 1 && highToBitmap.get(0) != null; + } + +} diff --git a/spark-load/spark-load-common/src/main/java/org/apache/doris/common/jmockit/AutoType.java b/spark-load/spark-load-common/src/main/java/org/apache/doris/common/jmockit/AutoType.java new file mode 100644 index 00000000..eeebc76e --- /dev/null +++ b/spark-load/spark-load-common/src/main/java/org/apache/doris/common/jmockit/AutoType.java @@ -0,0 +1,60 @@ +/* + * Copyright (c) 2006 JMockit developers + * This file is subject to the terms of the MIT license (see LICENSE.txt). + */ + +package org.apache.doris.common.jmockit; + +import java.util.HashMap; +import java.util.Map; + +/** + * Helper class to convert type between Java's wrapper type and primitive type + * There are 8 wrapper/primitive types in Java: + * |Wrapped Type |Primitive Type + * -------------------------------------- + * |Boolean |boolean + * |Character |char + * |Byte |byte + * |Short |short + * |Integer |int + * |Float |float + * |Long |long + * |Double |double + */ +public class AutoType { + private static final Map, Class> PRIMITIVE_TO_WRAPPER = new HashMap(); + private static final Map, Class> WRAPPER_TO_PRIMITIVE = new HashMap(); + + public static boolean isWrapperOfPrimitiveType(Class type) { + return WRAPPER_TO_PRIMITIVE.containsKey(type); + } + + public static Class getPrimitiveType(Class wrapperType) { + return WRAPPER_TO_PRIMITIVE.get(wrapperType); + } + + public static Class getWrapperType(Class primitiveType) { + return PRIMITIVE_TO_WRAPPER.get(primitiveType); + } + + static { + WRAPPER_TO_PRIMITIVE.put(Boolean.class, Boolean.TYPE); + WRAPPER_TO_PRIMITIVE.put(Character.class, Character.TYPE); + WRAPPER_TO_PRIMITIVE.put(Byte.class, Byte.TYPE); + WRAPPER_TO_PRIMITIVE.put(Short.class, Short.TYPE); + WRAPPER_TO_PRIMITIVE.put(Integer.class, Integer.TYPE); + WRAPPER_TO_PRIMITIVE.put(Float.class, Float.TYPE); + WRAPPER_TO_PRIMITIVE.put(Long.class, Long.TYPE); + WRAPPER_TO_PRIMITIVE.put(Double.class, Double.TYPE); + + PRIMITIVE_TO_WRAPPER.put(Boolean.TYPE, Boolean.class); + PRIMITIVE_TO_WRAPPER.put(Character.TYPE, Character.class); + PRIMITIVE_TO_WRAPPER.put(Byte.TYPE, Byte.class); + PRIMITIVE_TO_WRAPPER.put(Short.TYPE, Short.class); + PRIMITIVE_TO_WRAPPER.put(Integer.TYPE, Integer.class); + PRIMITIVE_TO_WRAPPER.put(Float.TYPE, Float.class); + PRIMITIVE_TO_WRAPPER.put(Long.TYPE, Long.class); + PRIMITIVE_TO_WRAPPER.put(Double.TYPE, Double.class); + } +} diff --git a/spark-load/spark-load-common/src/main/java/org/apache/doris/common/jmockit/ConstructorReflection.java 
b/spark-load/spark-load-common/src/main/java/org/apache/doris/common/jmockit/ConstructorReflection.java new file mode 100644 index 00000000..2fce0e56 --- /dev/null +++ b/spark-load/spark-load-common/src/main/java/org/apache/doris/common/jmockit/ConstructorReflection.java @@ -0,0 +1,160 @@ +/* + * Copyright (c) 2006 JMockit developers + * This file is subject to the terms of the MIT license (see LICENSE.txt). + */ + +package org.apache.doris.common.jmockit; + +import java.lang.reflect.AccessibleObject; +import java.lang.reflect.Constructor; +import java.lang.reflect.InvocationTargetException; + +/** + * Modify from mockit.internal.util.ConstructorReflection JMockit v1.13 + * Util class to invoke constructor of specified class. + */ +public final class ConstructorReflection { + + private ConstructorReflection() { + } + + /** + * invoke the {@constructor} with parameters {@initArgs}. + */ + public static T invoke(Constructor constructor, Object... initArgs) { + if (constructor == null || initArgs == null) { + throw new IllegalArgumentException(); + } + makeAccessible(constructor); + + try { + return constructor.newInstance(initArgs); + } catch (InstantiationException e) { + throw new RuntimeException(e); + } catch (IllegalAccessException e) { + throw new RuntimeException(e); + } catch (InvocationTargetException e) { + Throwable cause = e.getCause(); + if (cause instanceof Error) { + throw (Error) cause; + } else if (cause instanceof RuntimeException) { + throw (RuntimeException) cause; + } else { + throw new IllegalStateException("Should never get here", cause); + } + } + } + + /** + * invoke the constructor with parameters {@nonNullArgs Object...}. + */ + public static T newInstance(Class aClass, Object... nonNullArgs) { + if (aClass == null || nonNullArgs == null) { + throw new IllegalArgumentException(); + } else { + Class[] argTypes = ParameterReflection.getArgumentTypesFromArgumentValues(nonNullArgs); + Constructor constructor = findCompatibleConstructor(aClass, argTypes); + return invoke(constructor, nonNullArgs); + } + } + + /** + * invoke the constructor with no parameters of {@aClass Class}. + */ + private static T newInstance(Class aClass) { + return (T) newInstance((Class) aClass, ParameterReflection.NO_PARAMETERS); + } + + /** + * invoke the default constructor of {@aClass Class}. + * if the default constructor is not available, try to invoke the one constructor with no parameters. + */ + public static T newInstanceUsingDefaultConstructor(Class aClass) { + if (aClass == null) { + throw new IllegalArgumentException(); + } + try { + return aClass.newInstance(); + } catch (InstantiationException e) { + throw new RuntimeException(e); + } catch (IllegalAccessException e) { + return newInstance(aClass); + } + } + + /** + * invoke the default constructor of {@aClass Class}. + */ + public static T newInstanceUsingDefaultConstructorIfAvailable(Class aClass) { + if (aClass == null) { + throw new IllegalArgumentException(); + } + try { + return aClass.newInstance(); + } catch (InstantiationException e) { + return null; + } catch (IllegalAccessException e) { + return null; + } + } + + /** + * invoke inner-class constructor with outer-class instance {@outerInstance} and parameters {@nonNullArgs}. + */ + public static T newInnerInstance(Class innerClass, Object outerInstance, Object... 
nonNullArgs) { + if (innerClass == null || outerInstance == null || nonNullArgs == null) { + throw new IllegalArgumentException(); + } else { + Object[] initArgs = ParameterReflection.argumentsWithExtraFirstValue(nonNullArgs, outerInstance); + return newInstance(innerClass, initArgs); + } + } + + /** + * Get non-inner-class constructor with {@argTypes Class[]}. + * if more than one constructor was found, choose the more specific one. (i.e. constructor with parameters that have more concrete types is more specific) + * if no constructor was found, will check if {@theClass} is a inner class. Then a IllegalArgumentException exception will be thrown. + */ + private static Constructor findCompatibleConstructor(Class theClass, Class[] argTypes) { + if (theClass == null || argTypes == null) { + throw new IllegalArgumentException(); + } + Constructor found = null; + Class[] foundParameters = null; + Constructor[] declaredConstructors = theClass.getDeclaredConstructors(); + Constructor[] declaredConstructorsArray = declaredConstructors; + + for (Constructor declaredConstructor : declaredConstructorsArray) { + Class[] declaredParamTypes = declaredConstructor.getParameterTypes(); + int gap = declaredParamTypes.length - argTypes.length; + if (gap == 0 && (ParameterReflection.matchesParameterTypes(declaredParamTypes, argTypes) + || ParameterReflection.acceptsArgumentTypes(declaredParamTypes, argTypes)) + && (found == null || ParameterReflection.hasMoreSpecificTypes(declaredParamTypes, foundParameters))) { + found = (Constructor) declaredConstructor; + foundParameters = declaredParamTypes; + } + } + + if (found != null) { + return found; + } else { + Class declaringClass = theClass.getDeclaringClass(); + Class[] paramTypes = declaredConstructors[0].getParameterTypes(); + // check if this constructor is belong to a inner class + // the parameter[0] of inner class's constructor is a instance of outer class + if (paramTypes[0] == declaringClass && paramTypes.length > argTypes.length) { + throw new IllegalArgumentException("Invalid instantiation of inner class; use newInnerInstance instead"); + } else { + String argTypesDesc = ParameterReflection.getParameterTypesDescription(argTypes); + throw new IllegalArgumentException("No compatible constructor found: " + theClass.getSimpleName() + argTypesDesc); + } + } + } + + // ensure that field is accessible + public static void makeAccessible(AccessibleObject classMember) { + if (!classMember.isAccessible()) { + classMember.setAccessible(true); + } + } +} diff --git a/spark-load/spark-load-common/src/main/java/org/apache/doris/common/jmockit/Deencapsulation.java b/spark-load/spark-load-common/src/main/java/org/apache/doris/common/jmockit/Deencapsulation.java new file mode 100644 index 00000000..5fb33717 --- /dev/null +++ b/spark-load/spark-load-common/src/main/java/org/apache/doris/common/jmockit/Deencapsulation.java @@ -0,0 +1,63 @@ +/* + * Copyright (c) 2006 JMockit developers + * This file is subject to the terms of the MIT license (see LICENSE.txt). 
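The Deencapsulation facade defined below delegates to FieldReflection, MethodReflection and ConstructorReflection; a typical test-side sketch (the class, field and method names here are hypothetical):

    MyService svc = Deencapsulation.newInstance(MyService.class);   // non-public constructor is fine
    Deencapsulation.setField(svc, "retryCount", 3);                 // private instance field, set by name
    int retries = Deencapsulation.getField(svc, "retryCount");
    Deencapsulation.invoke(svc, "reconnect");                       // private no-arg method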
+ */ + +package org.apache.doris.common.jmockit; + +/** + * Modify from mockit.internal.util.Deencapsulation JMockit ver1.13 + */ +public final class Deencapsulation { + private Deencapsulation() { + } + + public static T getField(Object objectWithField, String fieldName) { + return FieldReflection.getField(objectWithField.getClass(), fieldName, objectWithField); + } + + public static T getField(Object objectWithField, Class fieldType) { + return FieldReflection.getField(objectWithField.getClass(), fieldType, objectWithField); + } + + public static T getField(Class classWithStaticField, String fieldName) { + return FieldReflection.getField(classWithStaticField, fieldName, null); + } + + public static T getField(Class classWithStaticField, Class fieldType) { + return FieldReflection.getField(classWithStaticField, fieldType, null); + } + + public static void setField(Object objectWithField, String fieldName, Object fieldValue) { + FieldReflection.setField(objectWithField.getClass(), objectWithField, fieldName, fieldValue); + } + + public static void setField(Object objectWithField, Object fieldValue) { + FieldReflection.setField(objectWithField.getClass(), objectWithField, null, fieldValue); + } + + public static void setField(Class classWithStaticField, String fieldName, Object fieldValue) { + FieldReflection.setField(classWithStaticField, null, fieldName, fieldValue); + } + + public static void setField(Class classWithStaticField, Object fieldValue) { + FieldReflection.setField(classWithStaticField, null, null, fieldValue); + } + + public static T invoke(Object objectWithMethod, String methodName, Object... nonNullArgs) { + Class theClass = objectWithMethod.getClass(); + return MethodReflection.invoke(theClass, objectWithMethod, methodName, nonNullArgs); + } + + public static T invoke(Class classWithStaticMethod, String methodName, Object... nonNullArgs) { + return MethodReflection.invoke(classWithStaticMethod, null, methodName, nonNullArgs); + } + + public static T newInstance(Class classToInstantiate, Object... nonNullArgs) { + return ConstructorReflection.newInstance(classToInstantiate, nonNullArgs); + } + + public static T newInnerInstance(Class innerClassToInstantiate, Object outerClassInstance, Object... nonNullArgs) { + return ConstructorReflection.newInnerInstance(innerClassToInstantiate, outerClassInstance, nonNullArgs); + } +} diff --git a/spark-load/spark-load-common/src/main/java/org/apache/doris/common/jmockit/FieldReflection.java b/spark-load/spark-load-common/src/main/java/org/apache/doris/common/jmockit/FieldReflection.java new file mode 100644 index 00000000..f37aedee --- /dev/null +++ b/spark-load/spark-load-common/src/main/java/org/apache/doris/common/jmockit/FieldReflection.java @@ -0,0 +1,287 @@ +/* + * Copyright (c) 2006 JMockit developers + * This file is subject to the terms of the MIT license (see LICENSE.txt). + */ + +package org.apache.doris.common.jmockit; + +import java.lang.reflect.AccessibleObject; +import java.lang.reflect.Field; +import java.lang.reflect.Modifier; +import java.lang.reflect.ParameterizedType; +import java.lang.reflect.Type; +import java.lang.reflect.TypeVariable; + + +/** + * Modify from mockit.internal.util.FieldReflection JMockit v1.13 + * Util class to set and get the value of specified field. + */ +public final class FieldReflection { + private FieldReflection() { + } + + /** + * Get field's value with field's name. 
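For instance (a sketch; counter, holder and their fields are hypothetical):

    // by name: reads the private field "count", walking up the superclass chain if necessary
    Integer count = FieldReflection.getField(counter.getClass(), "count", counter);
    // by type: finds the single List-typed field of holder and assigns a new value to it
    FieldReflection.setField(holder.getClass(), holder, null, new java.util.ArrayList<String>());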
+ */ + public static T getField(Class theClass, String fieldName, Object targetObject) { + if (theClass == null || fieldName == null || targetObject == null) { + throw new IllegalStateException(); + } + Field field = getDeclaredField(theClass, fieldName, targetObject != null); + return getFieldValue(field, targetObject); + } + + /** + * Get field's value with field's type. + */ + public static T getField(Class theClass, Class fieldType, Object targetObject) { + if (theClass == null || fieldType == null) { + throw new IllegalStateException(); + } + Field field = getDeclaredField(theClass, fieldType, targetObject != null, false); + return getFieldValue(field, targetObject); + } + + /** + * Get field's value with field's type. + */ + public static T getField(Class theClass, Type fieldType, Object targetObject) { + if (theClass == null || fieldType == null) { + throw new IllegalStateException(); + } + Field field = getDeclaredField(theClass, fieldType, targetObject != null, false); + return getFieldValue(field, targetObject); + } + + /** + * Modify field's value in targetObject. + * If {@fieldName String} is null, will try to set field with field's type. + */ + public static Field setField(Class theClass, Object targetObject, String fieldName, Object fieldValue) { + if (theClass == null) { + throw new IllegalArgumentException(); + } + boolean instanceField = targetObject != null; + Field field; + if (fieldName != null) { + field = getDeclaredField(theClass, fieldName, instanceField); + } else { + if (fieldValue == null) { + throw new IllegalArgumentException("Missing field value when setting field by type"); + } + + field = getDeclaredField(theClass, fieldValue.getClass(), instanceField, true); + } + + setFieldValue(field, targetObject, fieldValue); + return field; + } + + /** + * Get field by field's name. + * If no field is found in this class, it will continue to look up its super class. + * If {@instanceField boolean} is true, will only search for the non-static field. + */ + private static Field getDeclaredField(Class theClass, String fieldName, boolean instanceField) { + if (theClass == null || fieldName == null) { + throw new IllegalStateException(); + } + try { + return theClass.getDeclaredField(fieldName); + } catch (NoSuchFieldException e) { + Class superClass = theClass.getSuperclass(); + if (superClass != null && superClass != Object.class) { + return getDeclaredField(superClass, fieldName, instanceField); + } else { + String kind = instanceField ? "instance" : "static"; + throw new IllegalArgumentException("No " + kind + " field of name \"" + fieldName + "\" found in " + theClass); + } + } + } + + /** + * Get field by field's type. + * If no field is found in this class, it will continue to look up its super class. + * If {@instanceField boolean} is true, will only search for the non-static field. + * If {@forAssignment boolean} is true, will compare its super type with desiredType. + */ + private static Field getDeclaredField(Class theClass, Type desiredType, boolean instanceField, boolean forAssignment) { + if (theClass == null || desiredType == null) { + throw new IllegalStateException(); + } + Field found = getDeclaredFieldInSingleClass(theClass, desiredType, instanceField, forAssignment); + if (found == null) { + Class superClass = theClass.getSuperclass(); + if (superClass != null && superClass != Object.class) { + return getDeclaredField(superClass, desiredType, instanceField, forAssignment); + } else { + StringBuilder errorMsg = new StringBuilder(instanceField ? 
"Instance" : "Static"); + String typeName = getTypeName(desiredType); + errorMsg.append(" field of type ").append(typeName).append(" not found in ").append(theClass); + throw new IllegalArgumentException(errorMsg.toString()); + } + } else { + return found; + } + } + + /** + * Get field by field's type. + * There is only one field is expected to be found in a single class. + * If {@instanceField boolean} is true, will only search for the non-static field. + * If {@forAssignment boolean} is true, will compare its super type with desiredType. + * If more than one field are found, a IllegalArgumentException will be thrown. + */ + private static Field getDeclaredFieldInSingleClass(Class theClass, Type desiredType, boolean instanceField, boolean forAssignment) { + if (theClass == null || desiredType == null) { + throw new IllegalStateException(); + } + Field found = null; + Field[] fields = theClass.getDeclaredFields(); + + for (Field field : fields) { + if (!field.isSynthetic()) { + Type fieldType = field.getGenericType(); + if (instanceField != Modifier.isStatic(field.getModifiers()) && isCompatibleFieldType(fieldType, desiredType, forAssignment)) { + if (found != null) { + String message = errorMessageForMoreThanOneFieldFound(desiredType, instanceField, forAssignment, found, field); + throw new IllegalArgumentException(message); + } + + found = field; + } + } + } + + return found; + } + + /** + * return true if the {@fieldType} is compatible with {@desiredType}. + * If {@forAssignment} is true, will compare its super type with desiredType. + * If {@forAssignment} is false, will also compare it with desiredType's super type. + */ + private static boolean isCompatibleFieldType(Type fieldType, Type desiredType, boolean forAssignment) { + if (fieldType == null || desiredType == null) { + throw new IllegalStateException(); + } + Class fieldClass = getClassType(fieldType); + Class desiredClass = getClassType(desiredType); + if (isSameType(desiredClass, fieldClass)) { + return true; + } else if (forAssignment) { + return fieldClass.isAssignableFrom(desiredClass); + } else { + return desiredClass.isAssignableFrom(fieldClass) || fieldClass.isAssignableFrom(desiredClass); + } + } + + private static String errorMessageForMoreThanOneFieldFound(Type desiredFieldType, boolean instanceField, boolean forAssignment, Field firstField, Field secondField) { + return "More than one " + (instanceField ? "instance" : "static") + " field " + (forAssignment ? "to" : "from") + + " which a value of type " + + getTypeName(desiredFieldType) + (forAssignment ? " can be assigned" : " can be read") + " exists in " + + secondField.getDeclaringClass() + ": " + firstField.getName() + ", " + secondField.getName(); + } + + private static String getTypeName(Type type) { + if (type == null) { + throw new IllegalStateException(); + } + Class classType = getClassType(type); + Class primitiveType = AutoType.getPrimitiveType(classType); + if (primitiveType != null) { + return primitiveType + " or " + classType.getSimpleName(); + } else { + String name = classType.getName(); + return name.startsWith("java.lang.") ? name.substring(10) : name; + } + } + + /** + * Get field in {@targetObject Object}. 
+ */ + private static T getFieldValue(Field field, Object targetObject) { + if (field == null) { + throw new IllegalStateException(); + } + makeAccessible(field); + + try { + return (T) field.get(targetObject); + } catch (IllegalAccessException e) { + throw new RuntimeException(e); + } + } + + /** + * Modify field with value in {@targetObject Object}. + */ + public static void setFieldValue(Field field, Object targetObject, Object value) { + if (field == null) { + throw new IllegalStateException(); + } + try { + if (Modifier.isStatic(field.getModifiers()) && Modifier.isFinal(field.getModifiers())) { + throw new IllegalArgumentException("Do not allow to set static final field"); + } else { + makeAccessible(field); + field.set(targetObject, value); + } + + } catch (IllegalAccessException e) { + throw new RuntimeException(e); + } + } + + /* + private static void setStaticFinalField(Field field, Object value) throws IllegalAccessException { + if (field == null) { + throw new IllegalStateException(); + } + Field modifiersField; + try { + modifiersField = Field.class.getDeclaredField("modifiers"); + } catch (NoSuchFieldException e) { + throw new RuntimeException(e); + } + + modifiersField.setAccessible(true); + int nonFinalModifiers = modifiersField.getInt(field) - 16; + modifiersField.setInt(field, nonFinalModifiers); + FieldAccessor accessor = ReflectionFactory.getReflectionFactory().newFieldAccessor(field, false); + accessor.set((Object)null, value); + } + */ + + public static Class getClassType(Type declaredType) { + while (!(declaredType instanceof Class)) { + if (declaredType instanceof ParameterizedType) { + return (Class) ((ParameterizedType) declaredType).getRawType(); + } + + if (!(declaredType instanceof TypeVariable)) { + throw new IllegalArgumentException("Type of unexpected kind: " + declaredType); + } + + declaredType = ((TypeVariable) declaredType).getBounds()[0]; + } + + return (Class) declaredType; + } + + // ensure that field is accessible + public static void makeAccessible(AccessibleObject classMember) { + if (!classMember.isAccessible()) { + classMember.setAccessible(true); + } + } + + // return true if the two types are same type. + private static boolean isSameType(Class firstType, Class secondType) { + return firstType == secondType + || firstType.isPrimitive() && firstType == AutoType.getPrimitiveType(secondType) + || secondType.isPrimitive() && secondType == AutoType.getPrimitiveType(firstType); + } + +} diff --git a/spark-load/spark-load-common/src/main/java/org/apache/doris/common/jmockit/GeneratedClasses.java b/spark-load/spark-load-common/src/main/java/org/apache/doris/common/jmockit/GeneratedClasses.java new file mode 100644 index 00000000..1aae3418 --- /dev/null +++ b/spark-load/spark-load-common/src/main/java/org/apache/doris/common/jmockit/GeneratedClasses.java @@ -0,0 +1,48 @@ +/* + * Copyright (c) 2006 JMockit developers + * This file is subject to the terms of the MIT license (see LICENSE.txt). 
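The type-based lookups above resolve a uniquely typed field anywhere in the class hierarchy and refuse to guess when the type is ambiguous. A sketch with a hypothetical Holder class (not part of the patch) showing both outcomes:

import org.apache.doris.common.jmockit.FieldReflection;

public class FieldByTypeExample {

    // Hypothetical target: exactly one String field, but two int fields.
    static class Holder {
        private String name = "a";
        private int x = 1;
        private int y = 2;
    }

    public static void main(String[] args) {
        Holder holder = new Holder();

        // Unique match: the only String field is resolved and read.
        Object name = FieldReflection.getField(Holder.class, String.class, holder);    // "a"

        // Setting by value type uses the same lookup (fieldName == null).
        FieldReflection.setField(Holder.class, holder, null, "b");
        Object renamed = FieldReflection.getField(Holder.class, String.class, holder); // "b"
        System.out.println(name + " -> " + renamed);

        // Ambiguous match: two int fields, so the lookup throws instead of guessing.
        try {
            FieldReflection.getField(Holder.class, int.class, holder);
        } catch (IllegalArgumentException e) {
            System.out.println(e.getMessage()); // "More than one instance field ... : x, y"
        }
    }
}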
+ */ + +package org.apache.doris.common.jmockit; + +import java.lang.reflect.Proxy; + +/** + * Modify from mockit.internal.util.GeneratedClasses JMockit v1.13 + * Helper class to return type of mocked-object + */ +public final class GeneratedClasses { + private static final String IMPLCLASS_PREFIX = "$Impl_"; + private static final String SUBCLASS_PREFIX = "$Subclass_"; + + private GeneratedClasses() { + } + + static boolean isGeneratedImplementationClass(Class mockedType) { + return isGeneratedImplementationClass(mockedType.getName()); + } + + static boolean isGeneratedImplementationClass(String className) { + return className.contains(IMPLCLASS_PREFIX); + } + + static boolean isGeneratedSubclass(String className) { + return className.contains(SUBCLASS_PREFIX); + } + + static boolean isGeneratedClass(String className) { + return isGeneratedSubclass(className) || isGeneratedImplementationClass(className); + } + + static Class getMockedClassOrInterfaceType(Class aClass) { + if (!Proxy.isProxyClass(aClass) && !isGeneratedImplementationClass(aClass)) { + return isGeneratedSubclass(aClass.getName()) ? aClass.getSuperclass() : aClass; + } else { + return aClass.getInterfaces()[0]; + } + } + + static Class getMockedClass(Object mock) { + return getMockedClassOrInterfaceType(mock.getClass()); + } +} diff --git a/spark-load/spark-load-common/src/main/java/org/apache/doris/common/jmockit/MethodReflection.java b/spark-load/spark-load-common/src/main/java/org/apache/doris/common/jmockit/MethodReflection.java new file mode 100644 index 00000000..749e2e7c --- /dev/null +++ b/spark-load/spark-load-common/src/main/java/org/apache/doris/common/jmockit/MethodReflection.java @@ -0,0 +1,163 @@ +/* + * Copyright (c) 2006 JMockit developers + * This file is subject to the terms of the MIT license (see LICENSE.txt). + */ + +package org.apache.doris.common.jmockit; + +import java.lang.reflect.AccessibleObject; +import java.lang.reflect.InvocationTargetException; +import java.lang.reflect.Method; +import java.lang.reflect.Modifier; + +/** + * Modify from mockit.internal.util.MethodReflection JMockit v1.13 + * Util class to get and invoke method from specified class. + */ +public final class MethodReflection { + private MethodReflection() { + } + + public static T invoke(Class theClass, Object targetInstance, String methodName, Object... methodArgs) { + if (theClass == null || methodName == null) { + throw new IllegalArgumentException(); + } + boolean staticMethod = targetInstance == null; + Class[] argTypes = ParameterReflection.getArgumentTypesFromArgumentValues(methodArgs); + Method method = staticMethod ? findCompatibleStaticMethod(theClass, methodName, argTypes) : findCompatibleMethod(theClass, methodName, argTypes); + if (staticMethod && !Modifier.isStatic(method.getModifiers())) { + throw new IllegalArgumentException("Attempted to invoke non-static method without an instance to invoke it on"); + } else { + T result = invoke(targetInstance, method, methodArgs); + return result; + } + } + + public static T invoke(Object targetInstance, Method method, Object... 
methodArgs) { + if (method == null || methodArgs == null) { + throw new IllegalArgumentException(); + } + makeAccessible(method); + + try { + return (T) method.invoke(targetInstance, methodArgs); + } catch (IllegalAccessException e) { + throw new RuntimeException(e); + } catch (IllegalArgumentException e) { + throw new IllegalArgumentException("Failure to invoke method: " + method, e); + } catch (InvocationTargetException e) { + Throwable cause = e.getCause(); + if (cause instanceof Error) { + throw (Error) cause; + } else if (cause instanceof RuntimeException) { + throw (RuntimeException) cause; + } else { + ThrowOfCheckedException.doThrow((Exception) cause); + return null; + } + } + } + + /** + * Get a static method with {@methodName String} and {@argTypes Class[]}. + * If no method was found, a IllegalArgumentException will be thrown. + */ + private static Method findCompatibleStaticMethod(Class theClass, String methodName, Class[] argTypes) { + if (theClass == null || methodName == null || argTypes == null) { + throw new IllegalArgumentException(); + } + Method methodFound = findCompatibleMethodInClass(theClass, methodName, argTypes); + if (methodFound != null) { + return methodFound; + } else { + String argTypesDesc = ParameterReflection.getParameterTypesDescription(argTypes); + throw new IllegalArgumentException("No compatible static method found: " + methodName + argTypesDesc); + } + } + + /** + * Get a non-static method with {@methodName String} and {@argTypes Class[]}. + */ + public static Method findCompatibleMethod(Class theClass, String methodName, Class[] argTypes) { + if (theClass == null || methodName == null || argTypes == null) { + throw new IllegalArgumentException(); + } + Method methodFound = findCompatibleMethodIfAvailable(theClass, methodName, argTypes); + if (methodFound != null) { + return methodFound; + } else { + String argTypesDesc = ParameterReflection.getParameterTypesDescription(argTypes); + throw new IllegalArgumentException("No compatible method found: " + methodName + argTypesDesc); + } + } + + /** + * Get method with {@methodName String} and {@argTypes Class[]} from {@theClass Class}. + * If more than one method is found, choose the more specific one. (i.e. method with parameters that have more concrete types is more specific) + */ + private static Method findCompatibleMethodInClass(Class theClass, String methodName, Class[] argTypes) { + if (theClass == null || methodName == null || argTypes == null) { + throw new IllegalArgumentException(); + } + Method found = null; + Class[] foundParamTypes = null; + Method[] methods = theClass.getDeclaredMethods(); + + for (Method declaredMethod : methods) { + if (declaredMethod.getName().equals(methodName)) { + Class[] declaredParamTypes = declaredMethod.getParameterTypes(); + int gap = declaredParamTypes.length - argTypes.length; + if (gap == 0 && (ParameterReflection.matchesParameterTypes(declaredParamTypes, argTypes) + || ParameterReflection.acceptsArgumentTypes(declaredParamTypes, argTypes)) + && (foundParamTypes == null + || ParameterReflection.hasMoreSpecificTypes(declaredParamTypes, foundParamTypes))) { + found = declaredMethod; + foundParamTypes = declaredParamTypes; + } + } + } + + return found; + } + + /** + * Get method with {@methodName String} and {@argTypes Class[]} from {@theClass Class} as well as its super class. + * If more than one method is found, choose the more specify one. (i.e. 
choose the method with parameters that have more concrete types) + */ + private static Method findCompatibleMethodIfAvailable(Class theClass, String methodName, Class[] argTypes) { + if (theClass == null || methodName == null || argTypes == null) { + throw new IllegalArgumentException(); + } + Method methodFound = null; + + while (true) { + Method compatibleMethod = findCompatibleMethodInClass(theClass, methodName, argTypes); + if (compatibleMethod != null && (methodFound == null || ParameterReflection.hasMoreSpecificTypes(compatibleMethod.getParameterTypes(), methodFound.getParameterTypes()))) { + methodFound = compatibleMethod; + } + + Class superClass = theClass.getSuperclass(); + if (superClass == null || superClass == Object.class) { + return methodFound; + } + + theClass = superClass; + } + } + + + + // ensure that field is accessible + public static void makeAccessible(AccessibleObject classMember) { + if (!classMember.isAccessible()) { + classMember.setAccessible(true); + } + } + + // return true if the two types are same type. + private static boolean isSameType(Class firstType, Class secondType) { + return firstType == secondType + || firstType.isPrimitive() && firstType == AutoType.getPrimitiveType(secondType) + || secondType.isPrimitive() && secondType == AutoType.getPrimitiveType(firstType); + } +} diff --git a/spark-load/spark-load-common/src/main/java/org/apache/doris/common/jmockit/ParameterReflection.java b/spark-load/spark-load-common/src/main/java/org/apache/doris/common/jmockit/ParameterReflection.java new file mode 100644 index 00000000..84a54dfd --- /dev/null +++ b/spark-load/spark-load-common/src/main/java/org/apache/doris/common/jmockit/ParameterReflection.java @@ -0,0 +1,167 @@ +/* + * Copyright (c) 2006 JMockit developers + * This file is subject to the terms of the MIT license (see LICENSE.txt). + */ + +package org.apache.doris.common.jmockit; + +import java.util.regex.Pattern; + +/** + * Modify from mockit.internal.util.ParameterReflection JMockit v1.13 + * Util class to verify parameter of methods. + */ +public final class ParameterReflection { + public static final Class[] NO_PARAMETERS = new Class[0]; + + public static final Pattern JAVA_LANG = Pattern.compile("java.lang.", 16); + + private ParameterReflection() { + } + + /** + * check if every member in {@declaredTypes} is completely equal to the corresponding member {@specifiedTypes}. + */ + static boolean matchesParameterTypes(Class[] declaredTypes, Class[] specifiedTypes) { + if (declaredTypes == null || specifiedTypes == null) { + throw new IllegalArgumentException(); + } + for (int i = 0; i < declaredTypes.length; ++i) { + Class declaredType = declaredTypes[i]; + Class specifiedType = specifiedTypes[i]; + if (!isSameType(declaredType, specifiedType)) { + return false; + } + } + + return true; + } + + /** + * check if every member in {@paramTypes} is acceptable to the corresponding member in {@argTypes}. + */ + static boolean acceptsArgumentTypes(Class[] paramTypes, Class[] argTypes) { + if (paramTypes == null || argTypes == null) { + throw new IllegalArgumentException(); + } + for (int i = 0; i < paramTypes.length; ++i) { + Class parType = paramTypes[i]; + Class argType = argTypes[i]; + if (!isSameType(parType, argType) && !parType.isAssignableFrom(argType)) { + return false; + } + } + + return true; + } + + /** + * Get all types from objects {@args}. + */ + static Class[] getArgumentTypesFromArgumentValues(Object... 
args) { + if (args == null) { + throw new IllegalArgumentException(); + } + if (args.length == 0) { + return NO_PARAMETERS; + } else { + Class[] argTypes = new Class[args.length]; + + for (int i = 0; i < args.length; ++i) { + argTypes[i] = getArgumentTypeFromArgumentValue(i, args); + } + + return argTypes; + } + } + + /** + * Get type from {@args} by index. + */ + static Class getArgumentTypeFromArgumentValue(int i, Object[] args) { + Object arg = args[i]; + if (arg == null) { + throw new IllegalArgumentException("Invalid null value passed as argument " + i); + } else { + Class argType; + if (arg instanceof Class) { + argType = (Class) arg; + args[i] = null; + } else { + argType = GeneratedClasses.getMockedClass(arg); + } + + return argType; + } + } + + /** + * return true if {@currentTypes} is more specific than {@previousTypes}. + */ + static boolean hasMoreSpecificTypes(Class[] currentTypes, Class[] previousTypes) { + if (currentTypes == null || previousTypes == null) { + throw new IllegalArgumentException(); + } + for (int i = 0; i < currentTypes.length; ++i) { + Class current = wrappedIfPrimitive(currentTypes[i]); + Class previous = wrappedIfPrimitive(previousTypes[i]); + if (current != previous && previous.isAssignableFrom(current)) { + return true; + } + } + + return false; + } + + /** + * return the type names of {@paramTypes} wrapped in brackets. + */ + static String getParameterTypesDescription(Class[] paramTypes) { + if (paramTypes == null) { + throw new IllegalArgumentException(); + } + StringBuilder paramTypesDesc = new StringBuilder(200); + paramTypesDesc.append('('); + String sep = ""; + + for (Class paramType : paramTypes) { + String typeName = JAVA_LANG.matcher(paramType.getCanonicalName()).replaceAll(""); + paramTypesDesc.append(sep).append(typeName); + sep = ", "; + } + + paramTypesDesc.append(')'); + return paramTypesDesc.toString(); + } + + /** + * return real parameters array of inner-class belong to the outer-class instance {@firstValue Object}. + * the parameter[0] of a inner-class constructor is always the instance of its outer-class. + */ + static Object[] argumentsWithExtraFirstValue(Object[] args, Object firstValue) { + Object[] args2 = new Object[1 + args.length]; + args2[0] = firstValue; + System.arraycopy(args, 0, args2, 1, args.length); + return args2; + } + + // return wrapped type if its type is primitive. + private static Class wrappedIfPrimitive(Class parameterType) { + if (parameterType.isPrimitive()) { + Class wrapperType = AutoType.getWrapperType(parameterType); + + assert wrapperType != null; + + return wrapperType; + } else { + return parameterType; + } + } + + // return true if the two types are same type. + private static boolean isSameType(Class firstType, Class secondType) { + return firstType == secondType + || firstType.isPrimitive() && firstType == AutoType.getPrimitiveType(secondType) + || secondType.isPrimitive() && secondType == AutoType.getPrimitiveType(firstType); + } +} diff --git a/spark-load/spark-load-common/src/main/java/org/apache/doris/common/jmockit/ThrowOfCheckedException.java b/spark-load/spark-load-common/src/main/java/org/apache/doris/common/jmockit/ThrowOfCheckedException.java new file mode 100644 index 00000000..4dfc44ae --- /dev/null +++ b/spark-load/spark-load-common/src/main/java/org/apache/doris/common/jmockit/ThrowOfCheckedException.java @@ -0,0 +1,22 @@ +/* + * Copyright (c) 2006 JMockit developers + * This file is subject to the terms of the MIT license (see LICENSE.txt). 
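MethodReflection resolves a method from the argument values (their types come from ParameterReflection) and prefers the most specific compatible overload, falling back to superclasses when nothing matches in the declaring class. A usage sketch against a hypothetical Formatter class (illustrative only, not taken from the patch):

import org.apache.doris.common.jmockit.MethodReflection;

public class MethodInvokeExample {

    // Hypothetical target with two private overloads and a private static method.
    static class Formatter {
        private String render(Object value) {
            return "object:" + value;
        }

        private String render(String value) {
            return "string:" + value;
        }

        private static String version() {
            return "v1";
        }
    }

    public static void main(String[] args) {
        Formatter formatter = new Formatter();

        // Argument types come from the values; the more specific overload
        // render(String) wins over render(Object).
        Object rendered = MethodReflection.invoke(Formatter.class, formatter, "render", "abc");

        // A null target instance switches to static method lookup.
        Object version = MethodReflection.invoke(Formatter.class, null, "version");

        System.out.println(rendered + ", " + version); // string:abc, v1
    }
}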
+ */ + +package org.apache.doris.common.jmockit; + +/** + * Modify from mockit.internal.reflection.ThrowOfCheckedException JMockit v1.13 + */ +public final class ThrowOfCheckedException { + private static Exception exceptionToThrow; + + ThrowOfCheckedException() throws Exception { + throw exceptionToThrow; + } + + public static synchronized void doThrow(Exception checkedException) { + exceptionToThrow = checkedException; + ConstructorReflection.newInstanceUsingDefaultConstructor(ThrowOfCheckedException.class); + } +} diff --git a/spark-load/spark-load-common/src/main/java/org/apache/doris/config/EtlJobConfig.java b/spark-load/spark-load-common/src/main/java/org/apache/doris/config/EtlJobConfig.java new file mode 100644 index 00000000..93c4c6c0 --- /dev/null +++ b/spark-load/spark-load-common/src/main/java/org/apache/doris/config/EtlJobConfig.java @@ -0,0 +1,488 @@ +package org.apache.doris.config; + +import com.fasterxml.jackson.annotation.JsonCreator; +import com.fasterxml.jackson.annotation.JsonIgnore; +import com.fasterxml.jackson.annotation.JsonProperty; +import com.fasterxml.jackson.core.JsonProcessingException; +import com.fasterxml.jackson.databind.json.JsonMapper; +import com.google.common.base.Strings; +import com.google.common.collect.ImmutableMap; +import com.google.common.collect.Lists; + +import java.io.Serializable; +import java.util.Comparator; +import java.util.List; +import java.util.Map; + +public class EtlJobConfig implements Serializable { + // global dict + public static final String GLOBAL_DICT_TABLE_NAME = "doris_global_dict_table_%d"; + public static final String DISTINCT_KEY_TABLE_NAME = "doris_distinct_key_table_%d_%s"; + public static final String DORIS_INTERMEDIATE_HIVE_TABLE_NAME = "doris_intermediate_hive_table_%d_%s"; + // tableId.partitionId.indexId.bucket.schemaHash + public static final String TABLET_META_FORMAT = "%d.%d.%d.%d.%d"; + public static final String ETL_OUTPUT_FILE_FORMAT = "parquet"; + // dpp result + public static final String DPP_RESULT_NAME = "dpp_result.json"; + // hdfsEtlPath/jobs/dbId/loadLabel/PendingTaskSignature + private static final String ETL_OUTPUT_PATH_FORMAT = "%s/jobs/%d/%s/%d"; + private static final String ETL_OUTPUT_FILE_NAME_DESC_V1 = + "version.label.tableId.partitionId.indexId.bucket.schemaHash.parquet"; + @JsonProperty(value = "tables") + public Map tables; + @JsonProperty(value = "outputPath") + public String outputPath; + @JsonProperty(value = "outputFilePattern") + public String outputFilePattern; + @JsonProperty(value = "label") + public String label; + @JsonProperty(value = "properties") + public EtlJobProperty properties; + @JsonProperty(value = "configVersion") + public ConfigVersion configVersion; + + /** + * for json deserialize + */ + public EtlJobConfig() { + } + + public EtlJobConfig(Map tables, String outputFilePattern, String label, EtlJobProperty properties) { + this.tables = tables; + // set outputPath when submit etl job + this.outputPath = null; + this.outputFilePattern = outputFilePattern; + this.label = label; + this.properties = properties; + this.configVersion = ConfigVersion.V1; + } + + public static String getOutputPath(String hdfsEtlPath, long dbId, String loadLabel, long taskSignature) { + return String.format(ETL_OUTPUT_PATH_FORMAT, hdfsEtlPath, dbId, loadLabel, taskSignature); + } + + public static String getOutputFilePattern(String loadLabel, FilePatternVersion filePatternVersion) { + return String.format("%s.%s.%s.%s", filePatternVersion.name(), loadLabel, TABLET_META_FORMAT, + 
ETL_OUTPUT_FILE_FORMAT); + } + + public static String getDppResultFilePath(String outputPath) { + return outputPath + "/" + DPP_RESULT_NAME; + } + + public static String getTabletMetaStr(String filePath) throws Exception { + String fileName = filePath.substring(filePath.lastIndexOf("/") + 1); + String[] fileNameArr = fileName.split("\\."); + // check file version + switch (FilePatternVersion.valueOf(fileNameArr[0])) { + case V1: + // version.label.tableId.partitionId.indexId.bucket.schemaHash.parquet + if (fileNameArr.length != ETL_OUTPUT_FILE_NAME_DESC_V1.split("\\.").length) { + throw new Exception( + "etl output file name error, format: " + ETL_OUTPUT_FILE_NAME_DESC_V1 + ", name: " + + fileName); + } + long tableId = Long.parseLong(fileNameArr[2]); + long partitionId = Long.parseLong(fileNameArr[3]); + long indexId = Long.parseLong(fileNameArr[4]); + int bucket = Integer.parseInt(fileNameArr[5]); + int schemaHash = Integer.parseInt(fileNameArr[6]); + // tableId.partitionId.indexId.bucket.schemaHash + return String.format(TABLET_META_FORMAT, tableId, partitionId, indexId, bucket, schemaHash); + default: + throw new Exception("etl output file version error. version: " + fileNameArr[0]); + } + } + + public static EtlJobConfig configFromJson(String jsonConfig) throws JsonProcessingException { + JsonMapper mapper = JsonMapper.builder().build(); + return mapper.readValue(jsonConfig, EtlJobConfig.class); + } + + @Override + public String toString() { + return "EtlJobConfig{" + "tables=" + tables + ", outputPath='" + outputPath + '\'' + ", outputFilePattern='" + + outputFilePattern + '\'' + ", label='" + label + '\'' + ", properties=" + properties + ", version=" + + configVersion + '}'; + } + + public String getOutputPath() { + return outputPath; + } + + public String configToJson() throws JsonProcessingException { + JsonMapper mapper = JsonMapper.builder().build(); + return mapper.writeValueAsString(this); + } + + public enum ConfigVersion { + V1 + } + + public enum FilePatternVersion { + V1 + } + + public enum SourceType { + FILE, HIVE + } + + public static class EtlJobProperty implements Serializable { + @JsonProperty(value = "strictMode") + public boolean strictMode; + @JsonProperty(value = "timezone") + public String timezone; + + @Override + public String toString() { + return "EtlJobProperty{" + "strictMode=" + strictMode + ", timezone='" + timezone + '\'' + '}'; + } + } + + public static class EtlTable implements Serializable { + @JsonProperty(value = "indexes") + public List indexes; + @JsonProperty(value = "partitionInfo") + public EtlPartitionInfo partitionInfo; + @JsonProperty(value = "fileGroups") + public List fileGroups; + + /** + * for json deserialize + */ + public EtlTable() { + } + + public EtlTable(List etlIndexes, EtlPartitionInfo etlPartitionInfo) { + this.indexes = etlIndexes; + this.partitionInfo = etlPartitionInfo; + this.fileGroups = Lists.newArrayList(); + } + + public void addFileGroup(EtlFileGroup etlFileGroup) { + fileGroups.add(etlFileGroup); + } + + @Override + public String toString() { + return "EtlTable{" + "indexes=" + indexes + ", partitionInfo=" + partitionInfo + ", fileGroups=" + + fileGroups + '}'; + } + } + + public static class EtlColumn implements Serializable { + @JsonProperty(value = "columnName") + public String columnName; + @JsonProperty(value = "columnType") + public String columnType; + @JsonProperty(value = "isAllowNull") + public boolean isAllowNull; + @JsonProperty(value = "isKey") + public boolean isKey; + @JsonProperty(value = 
"aggregationType") + public String aggregationType; + @JsonProperty(value = "defaultValue") + public String defaultValue; + @JsonProperty(value = "stringLength") + public int stringLength; + @JsonProperty(value = "precision") + public int precision; + @JsonProperty(value = "scale") + public int scale; + @JsonProperty(value = "defineExpr") + public String defineExpr; + + // for unit test + public EtlColumn() { + } + + public EtlColumn(String columnName, String columnType, boolean isAllowNull, boolean isKey, + String aggregationType, String defaultValue, int stringLength, int precision, int scale) { + this.columnName = columnName; + this.columnType = columnType; + this.isAllowNull = isAllowNull; + this.isKey = isKey; + this.aggregationType = aggregationType; + this.defaultValue = defaultValue; + this.stringLength = stringLength; + this.precision = precision; + this.scale = scale; + this.defineExpr = null; + } + + @Override + public String toString() { + return "EtlColumn{" + "columnName='" + columnName + '\'' + ", columnType='" + columnType + '\'' + + ", isAllowNull=" + isAllowNull + ", isKey=" + isKey + ", aggregationType='" + aggregationType + + '\'' + ", defaultValue='" + defaultValue + '\'' + ", stringLength=" + stringLength + + ", precision=" + precision + ", scale=" + scale + ", defineExpr='" + defineExpr + '\'' + '}'; + } + } + + public static class EtlIndexComparator implements Comparator { + @Override + public int compare(EtlIndex a, EtlIndex b) { + int diff = a.columns.size() - b.columns.size(); + if (diff == 0) { + return 0; + } else if (diff > 0) { + return 1; + } else { + return -1; + } + } + } + + public static class EtlIndex implements Serializable { + @JsonProperty(value = "indexId") + public long indexId; + @JsonProperty(value = "columns") + public List columns; + @JsonProperty(value = "schemaHash") + public int schemaHash; + @JsonProperty(value = "indexType") + public String indexType; + @JsonProperty(value = "isBaseIndex") + public boolean isBaseIndex; + @JsonProperty(value = "schemaVersion") + public int schemaVersion; + + /** + * for json deserialize + */ + public EtlIndex() { + } + + public EtlIndex(long indexId, List etlColumns, int schemaHash, String indexType, boolean isBaseIndex, + int schemaVersion) { + this.indexId = indexId; + this.columns = etlColumns; + this.schemaHash = schemaHash; + this.indexType = indexType; + this.isBaseIndex = isBaseIndex; + this.schemaVersion = schemaVersion; + } + + public EtlColumn getColumn(String name) { + for (EtlColumn column : columns) { + if (column.columnName.equals(name)) { + return column; + } + } + return null; + } + + @Override + public String toString() { + return "EtlIndex{" + "indexId=" + indexId + ", columns=" + columns + ", schemaHash=" + schemaHash + + ", indexType='" + indexType + '\'' + ", isBaseIndex=" + isBaseIndex + ", schemaVersion=" + + schemaVersion + '}'; + } + } + + public static class EtlPartitionInfo implements Serializable { + @JsonProperty(value = "partitionType") + public String partitionType; + @JsonProperty(value = "partitionColumnRefs") + public List partitionColumnRefs; + @JsonProperty(value = "distributionColumnRefs") + public List distributionColumnRefs; + @JsonProperty(value = "partitions") + public List partitions; + + /** + * for json deserialize + */ + public EtlPartitionInfo() { + } + + public EtlPartitionInfo(String partitionType, List partitionColumnRefs, + List distributionColumnRefs, List etlPartitions) { + this.partitionType = partitionType; + this.partitionColumnRefs = 
partitionColumnRefs; + this.distributionColumnRefs = distributionColumnRefs; + this.partitions = etlPartitions; + } + + @Override + public String toString() { + return "EtlPartitionInfo{" + "partitionType='" + partitionType + '\'' + ", partitionColumnRefs=" + + partitionColumnRefs + ", distributionColumnRefs=" + distributionColumnRefs + ", partitions=" + + partitions + '}'; + } + } + + public static class EtlPartition implements Serializable { + @JsonProperty(value = "partitionId") + public long partitionId; + @JsonProperty(value = "startKeys") + public List startKeys; + @JsonProperty(value = "endKeys") + public List endKeys; + @JsonProperty(value = "isMaxPartition") + public boolean isMaxPartition; + @JsonProperty(value = "bucketNum") + public int bucketNum; + + /** + * for json deserialize + */ + public EtlPartition() { + } + + public EtlPartition(long partitionId, List startKeys, List endKeys, boolean isMaxPartition, + int bucketNum) { + this.partitionId = partitionId; + this.startKeys = startKeys; + this.endKeys = endKeys; + this.isMaxPartition = isMaxPartition; + this.bucketNum = bucketNum; + } + + @Override + public String toString() { + return "EtlPartition{" + "partitionId=" + partitionId + ", startKeys=" + startKeys + ", endKeys=" + + endKeys + ", isMaxPartition=" + isMaxPartition + ", bucketNum=" + bucketNum + '}'; + } + } + + public static class EtlFileGroup implements Serializable { + @JsonProperty(value = "sourceType") + public SourceType sourceType = SourceType.FILE; + @JsonProperty(value = "filePaths") + public List filePaths; + @JsonProperty(value = "fileFieldNames") + public List fileFieldNames; + @JsonProperty(value = "columnsFromPath") + public List columnsFromPath; + @JsonProperty(value = "columnSeparator") + public String columnSeparator; + @JsonProperty(value = "lineDelimiter") + public String lineDelimiter; + @JsonProperty(value = "isNegative") + public boolean isNegative; + @JsonProperty(value = "fileFormat") + public String fileFormat; + @JsonProperty(value = "columnMappings") + public Map columnMappings; + @JsonProperty(value = "where") + public String where; + @JsonProperty(value = "partitions") + public List partitions; + @JsonProperty(value = "hiveDbTableName") + public String hiveDbTableName; + @JsonProperty(value = "hiveTableProperties") + public Map hiveTableProperties; + + // hive db table used in dpp, not serialized + // set with hiveDbTableName (no bitmap column) or IntermediateHiveTable (created by global dict builder) + // in spark etl job + @JsonIgnore + public String dppHiveDbTableName; + + public EtlFileGroup() { + } + + // for data infile path + public EtlFileGroup(SourceType sourceType, List filePaths, List fileFieldNames, + List columnsFromPath, String columnSeparator, String lineDelimiter, + boolean isNegative, String fileFormat, Map columnMappings, + String where, List partitions) { + this.sourceType = sourceType; + this.filePaths = filePaths; + this.fileFieldNames = fileFieldNames; + this.columnsFromPath = columnsFromPath; + this.columnSeparator = Strings.isNullOrEmpty(columnSeparator) ? 
"\t" : columnSeparator; + this.lineDelimiter = lineDelimiter; + this.isNegative = isNegative; + this.fileFormat = fileFormat; + this.columnMappings = columnMappings; + this.where = where; + this.partitions = partitions; + } + + // for data from table + public EtlFileGroup(SourceType sourceType, String hiveDbTableName, Map hiveTableProperties, + boolean isNegative, Map columnMappings, String where, + List partitions) { + this.sourceType = sourceType; + this.hiveDbTableName = hiveDbTableName; + this.hiveTableProperties = hiveTableProperties; + this.isNegative = isNegative; + this.columnMappings = columnMappings; + this.where = where; + this.partitions = partitions; + } + + @Override + public String toString() { + return "EtlFileGroup{" + "sourceType=" + sourceType + ", filePaths=" + filePaths + ", fileFieldNames=" + + fileFieldNames + ", columnsFromPath=" + columnsFromPath + ", columnSeparator='" + columnSeparator + + '\'' + ", lineDelimiter='" + lineDelimiter + '\'' + ", isNegative=" + isNegative + + ", fileFormat='" + fileFormat + '\'' + ", columnMappings=" + columnMappings + ", where='" + where + + '\'' + ", partitions=" + partitions + ", hiveDbTableName='" + hiveDbTableName + '\'' + + ", hiveTableProperties=" + hiveTableProperties + '}'; + } + } + + /** + * FunctionCallExpr = functionName(args) + * For compatibility with old designed functions used in Hadoop MapReduce etl + *

+ * expr is more general, like k1 + 1, not just FunctionCall + */ + public static class EtlColumnMapping implements Serializable { + + private static Map functionMap = + new ImmutableMap.Builder().put("md5sum", "md5").build(); + + @JsonProperty(value = "functionName") + public String functionName; + @JsonProperty(value = "args") + public List args; + @JsonProperty(value = "expr") + public String expr; + + public EtlColumnMapping() { + } + + public EtlColumnMapping(String functionName, List args) { + this.functionName = functionName; + this.args = args; + } + + public EtlColumnMapping(String expr) { + this.expr = expr; + } + + public String toDescription() { + StringBuilder sb = new StringBuilder(); + if (functionName == null) { + sb.append(expr); + } else { + if (functionMap.containsKey(functionName)) { + sb.append(functionMap.get(functionName)); + } else { + sb.append(functionName); + } + sb.append("("); + if (args != null) { + for (String arg : args) { + sb.append(arg); + sb.append(","); + } + } + sb.deleteCharAt(sb.length() - 1); + sb.append(")"); + } + return sb.toString(); + } + + @Override + public String toString() { + return "EtlColumnMapping{" + "functionName='" + functionName + '\'' + ", args=" + args + ", expr=" + expr + + '}'; + } + } + +} \ No newline at end of file diff --git a/spark-load/spark-load-core/src/main/java/org/apache/doris/load/job/PullLoader.java b/spark-load/spark-load-core/src/main/java/org/apache/doris/load/job/PullLoader.java index b22c7820..53e7e742 100644 --- a/spark-load/spark-load-core/src/main/java/org/apache/doris/load/job/PullLoader.java +++ b/spark-load/spark-load-core/src/main/java/org/apache/doris/load/job/PullLoader.java @@ -202,6 +202,9 @@ public boolean canBeRecovered() throws SparkLoadException { if (DPP_RESULT_JSON.equalsIgnoreCase(fileName)) { hasDppResult = true; String content = FileSystemUtils.readFile(jobConfig, fileStatus.getPath().toString()); + if (StringUtils.isBlank(content)) { + return false; + } DppResult dppResult = JsonUtils.readValue(content, DppResult.class); if (!checkDppResult(dppResult)) { LOG.info("previous etl job is failed, cannot be recovered"); @@ -211,6 +214,9 @@ public boolean canBeRecovered() throws SparkLoadException { // check meta consist if (LOAD_META_JSON.equalsIgnoreCase(fileName)) { String content = FileSystemUtils.readFile(jobConfig, fileStatus.getPath().toString()); + if (StringUtils.isBlank(content)) { + return false; + } LoadMeta oldLoadMeta = JsonUtils.readValue(content, LoadMeta.class); for (Map.Entry entry : loadMeta.getTableMeta().entrySet()) { TableMeta tableMeta = entry.getValue(); @@ -230,11 +236,19 @@ public boolean canBeRecovered() throws SparkLoadException { for (Map.Entry indexEntry : indexMap.entrySet()) { EtlJobConfig.EtlIndex index = indexEntry.getValue(); EtlJobConfig.EtlIndex oldIndex = oldIndexMap.get(indexEntry.getKey()); - // index not exists or index mismatch - if (oldIndex == null || oldIndex.indexId != index.indexId - || oldIndex.schemaHash != index.schemaHash) { - LOG.info("index mismatch, old index: " + oldIndex + ", now index: " + index - + ", cannot be recovered"); + // index not exists + if (oldIndex == null) { + LOG.info("index " + index.indexId + " is not exists in previous meta"); + return false; + } + // index mismatch + if (oldIndex.schemaHash != index.schemaHash + || oldIndex.schemaVersion != index.schemaVersion) { + LOG.info("index " + index.indexId + " has changed, " + + "old schemaHash: " + oldIndex.schemaHash + " and schemaVersion: " + + oldIndex.schemaVersion + "current 
schemaHash: " + + index.schemaHash + " and schemaVersion: " + + index.schemaVersion + ", cannot be recovered"); return false; } } From c5ef91ad2dc1f2f233268ffaa57ff7e7a88c3f54 Mon Sep 17 00:00:00 2001 From: gnehil Date: Thu, 4 Jul 2024 16:37:25 +0800 Subject: [PATCH 31/45] rename load name --- .../org/apache/doris/client/DorisClient.java | 18 +++++++++++------- .../org/apache/doris/load/job/PullLoader.java | 15 ++++++++++++--- 2 files changed, 23 insertions(+), 10 deletions(-) diff --git a/spark-load/spark-load-core/src/main/java/org/apache/doris/client/DorisClient.java b/spark-load/spark-load-core/src/main/java/org/apache/doris/client/DorisClient.java index 1125fcce..8345279f 100644 --- a/spark-load/spark-load-core/src/main/java/org/apache/doris/client/DorisClient.java +++ b/spark-load/spark-load-core/src/main/java/org/apache/doris/client/DorisClient.java @@ -2,12 +2,13 @@ import org.apache.doris.common.LoadInfo; import org.apache.doris.common.ResponseEntity; -import org.apache.doris.exception.SparkLoadException; import org.apache.doris.common.meta.LoadInfoResponse; import org.apache.doris.common.meta.LoadMeta; +import org.apache.doris.exception.SparkLoadException; import org.apache.doris.util.HttpUtils; import org.apache.doris.util.JsonUtils; +import org.apache.commons.lang3.StringUtils; import org.apache.http.HttpHeaders; import org.apache.http.HttpStatus; import org.apache.http.client.methods.CloseableHttpResponse; @@ -49,7 +50,7 @@ public static class FeClient { public static final String BASE_URL = "http://%s%s"; - public static final String RAW_LOAD_URL_PATTERN = "/api/spark_load/%s/%s"; + public static final String INGESTION_LOAD_URL_PATTERN = "/api/ingestion_load/%s/%s"; public static final String CREATE_ACTION = "_create"; @@ -78,10 +79,10 @@ private String parseAuth(String user, String password) { return Base64.getEncoder().encodeToString((user + ":" + password).getBytes(StandardCharsets.UTF_8)); } - public LoadMeta createSparkLoad(String db, Map> tableToPartition, String label, - Map properties) throws SparkLoadException { + public LoadMeta createIngestionLoad(String db, Map> tableToPartition, String label, + Map properties) throws SparkLoadException { try { - String path = String.format(RAW_LOAD_URL_PATTERN, db, CREATE_ACTION); + String path = String.format(INGESTION_LOAD_URL_PATTERN, db, CREATE_ACTION); HttpPost httpPost = new HttpPost(); addCommonHeaders(httpPost); Map params = new HashMap<>(); @@ -90,6 +91,9 @@ public LoadMeta createSparkLoad(String db, Map> tableToPart params.put("properties", properties); httpPost.setEntity(new StringEntity(JsonUtils.writeValueAsString(params))); String content = executeRequest(httpPost, path, null); + if (StringUtils.isBlank(content)) { + throw new SparkLoadException(String.format("request create load failed, path: %s", path)); + } ResponseEntity res = JsonUtils.readValue(content, ResponseEntity.class); if (res.getCode() != 0) { throw new SparkLoadException(String.format("create load failed, code: %d, msg: %s, reason: %s", @@ -132,10 +136,10 @@ private String executeRequest(HttpRequestBase req, String apiPath, Map statusInfo) + public void updateIngestionLoad(String db, Long loadId, Map statusInfo) throws SparkLoadException { - String path = String.format(RAW_LOAD_URL_PATTERN, db, UPDATE_ACTION); + String path = String.format(INGESTION_LOAD_URL_PATTERN, db, UPDATE_ACTION); HttpPost httpPost = new HttpPost(); addCommonHeaders(httpPost); Map params = new HashMap<>(); diff --git 
a/spark-load/spark-load-core/src/main/java/org/apache/doris/load/job/PullLoader.java b/spark-load/spark-load-core/src/main/java/org/apache/doris/load/job/PullLoader.java index 53e7e742..23a0e829 100644 --- a/spark-load/spark-load-core/src/main/java/org/apache/doris/load/job/PullLoader.java +++ b/spark-load/spark-load-core/src/main/java/org/apache/doris/load/job/PullLoader.java @@ -19,6 +19,7 @@ import org.apache.doris.SparkLoadRunner; import org.apache.doris.client.DorisClient; +import org.apache.doris.common.Constants; import org.apache.doris.common.DppResult; import org.apache.doris.common.LoadInfo; import org.apache.doris.common.enums.JobStatus; @@ -72,9 +73,17 @@ public void prepare() throws SparkLoadException { jobConfig.getPassword()); Map> tableToPartition = jobConfig.getLoadTasks().entrySet().stream() .collect(Collectors.toMap(Map.Entry::getKey, entry -> entry.getValue().getTargetPartitions())); - loadMeta = feClient.createSparkLoad(jobConfig.getDatabase(), tableToPartition, jobConfig.getLabel(), + loadMeta = feClient.createIngestionLoad(jobConfig.getDatabase(), tableToPartition, jobConfig.getLabel(), jobConfig.getJobProperties()); etlJobConfig = loadMeta.getEtlJobConfig(jobConfig); + if (Constants.HADOOP_AUTH_KERBEROS.equalsIgnoreCase( + jobConfig.getHadoopProperties().get(Constants.HADOOP_SECURITY_AUTHENTICATION))) { + try { + FileSystemUtils.kerberosLogin(jobConfig); + } catch (IOException e) { + throw new SparkLoadException("login with kerberos auth failed", e); + } + } } @Override @@ -144,7 +153,7 @@ public void afterFinished() throws SparkLoadException { } catch (IOException e) { throw new SparkLoadException("update job status failed", e); } - feClient.updateSparkLoad(jobConfig.getDatabase(), loadMeta.getLoadId(), statusInfo); + feClient.updateIngestionLoad(jobConfig.getDatabase(), loadMeta.getLoadId(), statusInfo); do { LoadInfo loadInfo = feClient.getLoadInfo(jobConfig.getDatabase(), jobConfig.getLabel()); switch (loadInfo.getState().toUpperCase(Locale.ROOT)) { @@ -178,7 +187,7 @@ public void afterFailed(Exception e) { statusInfo.put("msg", e.getMessage()); statusInfo.put("appId", appHandle == null ? 
null : appHandle.getAppId()); try { - feClient.updateSparkLoad(jobConfig.getDatabase(), loadMeta.getLoadId(), statusInfo); + feClient.updateIngestionLoad(jobConfig.getDatabase(), loadMeta.getLoadId(), statusInfo); } catch (SparkLoadException ex) { LOG.warn("update load failed status failed", ex); } From 336dbf7a828a1e4e558f539534fbe1eb4b949504 Mon Sep 17 00:00:00 2001 From: gnehil Date: Thu, 4 Jul 2024 16:46:36 +0800 Subject: [PATCH 32/45] copy reference classes from doris fe common and remove fe common dependency --- .gitignore | 5 -- .../spark/load/RecordBatchInputStream.java | 2 +- .../doris/spark/load/StreamLoader.scala | 6 ++- spark-load/pom.xml | 48 +++---------------- spark-load/spark-load-common/pom.xml | 39 +++++++++++++++ .../org/apache/doris/common/DppResult.java | 3 ++ .../apache/doris/common/io/BitmapValue.java | 3 ++ .../org/apache/doris/common/io/Codec.java | 3 ++ .../java/org/apache/doris/common/io/Hll.java | 3 ++ .../apache/doris/common/io/Roaring64Map.java | 3 ++ .../apache/doris/common/jmockit/AutoType.java | 28 ++++++----- .../common/jmockit/ConstructorReflection.java | 11 +++-- .../doris/common/jmockit/Deencapsulation.java | 2 + .../doris/common/jmockit/FieldReflection.java | 3 +- .../common/jmockit/GeneratedClasses.java | 2 + .../common/jmockit/MethodReflection.java | 15 ++++-- .../common/jmockit/ParameterReflection.java | 2 + .../org/apache/doris/config/EtlJobConfig.java | 4 +- spark-load/spark-load-core/pom.xml | 9 ++-- .../org/apache/doris/SparkLoadRunner.java | 11 ++--- .../apache/doris/common/meta/LoadMeta.java | 2 +- .../org/apache/doris/config/JobConfig.java | 2 +- .../org/apache/doris/load/job/PullLoader.java | 2 +- .../doris/common/meta/LoadMetaTest.java | 4 +- spark-load/spark-load-dpp/pom.xml | 11 ++--- .../doris/load/loadv2/dpp/ColumnParser.java | 5 +- .../loadv2/dpp/DorisRangePartitioner.java | 2 +- .../doris/load/loadv2/dpp/DppUtils.java | 4 +- .../dpp/MinimumCoverageRollupTreeBuilder.java | 2 +- .../load/loadv2/dpp/RollupTreeBuilder.java | 2 +- .../doris/load/loadv2/dpp/RollupTreeNode.java | 2 +- .../doris/load/loadv2/dpp/SparkDpp.java | 9 ++-- .../load/loadv2/dpp/SparkRDDAggregator.java | 2 +- .../doris/load/loadv2/etl/SparkEtlJob.java | 12 ++--- .../load/loadv2/dpp/ColumnParserTest.java | 2 +- .../loadv2/dpp/DorisRangePartitionerTest.java | 2 +- .../doris/load/loadv2/dpp/DppUtilsTest.java | 2 +- .../MinimumCoverageRollupTreeBuilderTest.java | 10 ++-- .../doris/load/loadv2/dpp/SparkDppTest.java | 2 +- .../load/loadv2/etl/SparkEtlJobTest.java | 22 ++++----- 40 files changed, 168 insertions(+), 135 deletions(-) create mode 100644 spark-load/spark-load-common/pom.xml rename spark-load/{spark-load-core => spark-load-common}/src/main/java/org/apache/doris/common/DppResult.java (96%) diff --git a/.gitignore b/.gitignore index 252ad80b..86b9ad2d 100644 --- a/.gitignore +++ b/.gitignore @@ -8,11 +8,6 @@ spark-doris-connector/output/ spark-doris-connector/target/ spark-doris-connector/.idea/ -spark-load/spark-dpp/dependency-reduced-pom.xml -spark-load/spark-dpp/output/ -spark-load/spark-dpp/target/ -spark-load/spark-dpp/.idea/ - spark-load/target spark-load/spark-load-core/dependency-reduced-pom.xml spark-load/spark-load-core/output/ diff --git a/spark-doris-connector/src/main/java/org/apache/doris/spark/load/RecordBatchInputStream.java b/spark-doris-connector/src/main/java/org/apache/doris/spark/load/RecordBatchInputStream.java index b6264edc..3b6be497 100644 --- a/spark-doris-connector/src/main/java/org/apache/doris/spark/load/RecordBatchInputStream.java +++ 
b/spark-doris-connector/src/main/java/org/apache/doris/spark/load/RecordBatchInputStream.java @@ -87,7 +87,7 @@ public int read() throws IOException { if (read < 0) { return -1; } else { - return bytes[0] & 0xFF; + return bytes[0]; } } diff --git a/spark-doris-connector/src/main/scala/org/apache/doris/spark/load/StreamLoader.scala b/spark-doris-connector/src/main/scala/org/apache/doris/spark/load/StreamLoader.scala index faa08d7d..06bb56ff 100644 --- a/spark-doris-connector/src/main/scala/org/apache/doris/spark/load/StreamLoader.scala +++ b/spark-doris-connector/src/main/scala/org/apache/doris/spark/load/StreamLoader.scala @@ -498,12 +498,14 @@ class StreamLoader(settings: SparkSettings, isStreaming: Boolean) extends Loader val loadResponse: StreamLoadResponse = StreamLoadResponse(code, msg, content) if (loadResponse.code != HttpStatus.SC_OK) { - throw new StreamLoadException(String.format("stream load error, http status:%d, msg:%s", - new Integer(loadResponse.code), loadResponse.msg)) + LOG.error(s"Stream load http status is not OK, status: ${loadResponse.code}, response: $loadResponse") + throw new StreamLoadException(String.format("stream load error, http status:%d, response:%s", + new Integer(loadResponse.code), loadResponse)) } else { try { val respContent = MAPPER.readValue(loadResponse.content, classOf[RespContent]) if (!respContent.isSuccess) { + LOG.error(s"Stream load status is not success, status:${respContent.getStatus}, response:$loadResponse") throw new StreamLoadException(String.format("stream load error, load status:%s, response:%s", respContent.getStatus, loadResponse)) } LOG.info("Stream load Response:{}", loadResponse) diff --git a/spark-load/pom.xml b/spark-load/pom.xml index 1816125f..60a41168 100644 --- a/spark-load/pom.xml +++ b/spark-load/pom.xml @@ -30,6 +30,7 @@ spark-load-core spark-load-dpp spark-load-dist + spark-load-common @@ -37,7 +38,6 @@ 1.8 UTF-8 1.0-SNAPSHOT - 1.2-SNAPSHOT 1.13 3.9 3.3.6 @@ -56,49 +56,11 @@ 2.0.7 1.2 1.12.669 + 0.8.13 - - org.apache.doris - fe-common - ${doris.fe.version} - - - org.apache.logging.log4j - log4j-1.2-api - - - org.apache.logging.log4j - log4j-api - - - org.apache.logging.log4j - log4j-core - - - commons-logging - commons-logging - - - org.slf4j - slf4j-api - - - com.fasterxml.jackson.core - jackson-core - - - com.fasterxml.jackson.core - jackson-databind - - - com.fasterxml.jackson.core - jackson-annotations - - - commons-codec @@ -372,7 +334,11 @@ commons-logging ${commons-logging.version} - + + org.roaringbitmap + RoaringBitmap + ${RoaringBitmap.version} + diff --git a/spark-load/spark-load-common/pom.xml b/spark-load/spark-load-common/pom.xml new file mode 100644 index 00000000..cbab0271 --- /dev/null +++ b/spark-load/spark-load-common/pom.xml @@ -0,0 +1,39 @@ + + + 4.0.0 + + org.apache.doris + spark-load + ${revision} + + + spark-load-common + + + 8 + 8 + UTF-8 + + + + + com.fasterxml.jackson.core + jackson-databind + + + com.google.guava + guava + + + org.roaringbitmap + RoaringBitmap + + + commons-codec + commons-codec + + + + \ No newline at end of file diff --git a/spark-load/spark-load-core/src/main/java/org/apache/doris/common/DppResult.java b/spark-load/spark-load-common/src/main/java/org/apache/doris/common/DppResult.java similarity index 96% rename from spark-load/spark-load-core/src/main/java/org/apache/doris/common/DppResult.java rename to spark-load/spark-load-common/src/main/java/org/apache/doris/common/DppResult.java index a27445bb..3daa6541 100644 --- 
a/spark-load/spark-load-core/src/main/java/org/apache/doris/common/DppResult.java +++ b/spark-load/spark-load-common/src/main/java/org/apache/doris/common/DppResult.java @@ -21,6 +21,9 @@ import java.io.Serializable; +/** + * Copied from Apache Doris org.apache.doris.sparkdpp.DppResult + */ public class DppResult implements Serializable { @JsonProperty(value = "is_success", required = true) diff --git a/spark-load/spark-load-common/src/main/java/org/apache/doris/common/io/BitmapValue.java b/spark-load/spark-load-common/src/main/java/org/apache/doris/common/io/BitmapValue.java index 04bb368f..d6fd410b 100644 --- a/spark-load/spark-load-common/src/main/java/org/apache/doris/common/io/BitmapValue.java +++ b/spark-load/spark-load-common/src/main/java/org/apache/doris/common/io/BitmapValue.java @@ -6,6 +6,9 @@ import java.io.DataOutput; import java.io.IOException; +/** + * Copied from Apache Doris + */ public class BitmapValue { public static final int EMPTY = 0; diff --git a/spark-load/spark-load-common/src/main/java/org/apache/doris/common/io/Codec.java b/spark-load/spark-load-common/src/main/java/org/apache/doris/common/io/Codec.java index 2d783a3f..3c57a0f1 100644 --- a/spark-load/spark-load-common/src/main/java/org/apache/doris/common/io/Codec.java +++ b/spark-load/spark-load-common/src/main/java/org/apache/doris/common/io/Codec.java @@ -21,6 +21,9 @@ import java.io.DataOutput; import java.io.IOException; +/** + * Copied from Apache Doris + */ public class Codec { // not support encode negative value now diff --git a/spark-load/spark-load-common/src/main/java/org/apache/doris/common/io/Hll.java b/spark-load/spark-load-common/src/main/java/org/apache/doris/common/io/Hll.java index 8f8042ee..427543f8 100644 --- a/spark-load/spark-load-common/src/main/java/org/apache/doris/common/io/Hll.java +++ b/spark-load/spark-load-common/src/main/java/org/apache/doris/common/io/Hll.java @@ -9,6 +9,9 @@ import java.util.HashSet; import java.util.Set; +/** + * Copied from Apache Doris + */ public class Hll { public static final byte HLL_DATA_EMPTY = 0; diff --git a/spark-load/spark-load-common/src/main/java/org/apache/doris/common/io/Roaring64Map.java b/spark-load/spark-load-common/src/main/java/org/apache/doris/common/io/Roaring64Map.java index 85db5853..67b1e765 100644 --- a/spark-load/spark-load-common/src/main/java/org/apache/doris/common/io/Roaring64Map.java +++ b/spark-load/spark-load-common/src/main/java/org/apache/doris/common/io/Roaring64Map.java @@ -27,6 +27,9 @@ import java.util.SortedMap; import java.util.TreeMap; +/** + * Copied from Apache Doris + */ public class Roaring64Map { private static final boolean DEFAULT_ORDER_IS_SIGNED = false; diff --git a/spark-load/spark-load-common/src/main/java/org/apache/doris/common/jmockit/AutoType.java b/spark-load/spark-load-common/src/main/java/org/apache/doris/common/jmockit/AutoType.java index eeebc76e..f65a9fdf 100644 --- a/spark-load/spark-load-common/src/main/java/org/apache/doris/common/jmockit/AutoType.java +++ b/spark-load/spark-load-common/src/main/java/org/apache/doris/common/jmockit/AutoType.java @@ -19,25 +19,15 @@ * |Short |short * |Integer |int * |Float |float - * |Long |long + * |Long |longFieldReflection * |Double |double + *

+ * Copied from Apache Doris */ public class AutoType { private static final Map, Class> PRIMITIVE_TO_WRAPPER = new HashMap(); private static final Map, Class> WRAPPER_TO_PRIMITIVE = new HashMap(); - public static boolean isWrapperOfPrimitiveType(Class type) { - return WRAPPER_TO_PRIMITIVE.containsKey(type); - } - - public static Class getPrimitiveType(Class wrapperType) { - return WRAPPER_TO_PRIMITIVE.get(wrapperType); - } - - public static Class getWrapperType(Class primitiveType) { - return PRIMITIVE_TO_WRAPPER.get(primitiveType); - } - static { WRAPPER_TO_PRIMITIVE.put(Boolean.class, Boolean.TYPE); WRAPPER_TO_PRIMITIVE.put(Character.class, Character.TYPE); @@ -57,4 +47,16 @@ public static Class getWrapperType(Class primitiveType) { PRIMITIVE_TO_WRAPPER.put(Long.TYPE, Long.class); PRIMITIVE_TO_WRAPPER.put(Double.TYPE, Double.class); } + + public static boolean isWrapperOfPrimitiveType(Class type) { + return WRAPPER_TO_PRIMITIVE.containsKey(type); + } + + public static Class getPrimitiveType(Class wrapperType) { + return WRAPPER_TO_PRIMITIVE.get(wrapperType); + } + + public static Class getWrapperType(Class primitiveType) { + return PRIMITIVE_TO_WRAPPER.get(primitiveType); + } } diff --git a/spark-load/spark-load-common/src/main/java/org/apache/doris/common/jmockit/ConstructorReflection.java b/spark-load/spark-load-common/src/main/java/org/apache/doris/common/jmockit/ConstructorReflection.java index 2fce0e56..4b437ce4 100644 --- a/spark-load/spark-load-common/src/main/java/org/apache/doris/common/jmockit/ConstructorReflection.java +++ b/spark-load/spark-load-common/src/main/java/org/apache/doris/common/jmockit/ConstructorReflection.java @@ -12,6 +12,8 @@ /** * Modify from mockit.internal.util.ConstructorReflection JMockit v1.13 * Util class to invoke constructor of specified class. + *
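The AutoType table touched by the hunk above is the primitive/wrapper mapping that the isSameType checks in the reflection helpers rely on. A quick illustration (not part of the patch):

import org.apache.doris.common.jmockit.AutoType;

public class AutoTypeExample {
    public static void main(String[] args) {
        System.out.println(AutoType.isWrapperOfPrimitiveType(Integer.class)); // true
        System.out.println(AutoType.getPrimitiveType(Long.class));            // long
        System.out.println(AutoType.getWrapperType(boolean.class));           // class java.lang.Boolean
    }
}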

+ * Copied from Apache Doris */ public final class ConstructorReflection { @@ -129,7 +131,8 @@ private static Constructor findCompatibleConstructor(Class theClass, C int gap = declaredParamTypes.length - argTypes.length; if (gap == 0 && (ParameterReflection.matchesParameterTypes(declaredParamTypes, argTypes) || ParameterReflection.acceptsArgumentTypes(declaredParamTypes, argTypes)) - && (found == null || ParameterReflection.hasMoreSpecificTypes(declaredParamTypes, foundParameters))) { + && + (found == null || ParameterReflection.hasMoreSpecificTypes(declaredParamTypes, foundParameters))) { found = (Constructor) declaredConstructor; foundParameters = declaredParamTypes; } @@ -143,10 +146,12 @@ private static Constructor findCompatibleConstructor(Class theClass, C // check if this constructor is belong to a inner class // the parameter[0] of inner class's constructor is a instance of outer class if (paramTypes[0] == declaringClass && paramTypes.length > argTypes.length) { - throw new IllegalArgumentException("Invalid instantiation of inner class; use newInnerInstance instead"); + throw new IllegalArgumentException( + "Invalid instantiation of inner class; use newInnerInstance instead"); } else { String argTypesDesc = ParameterReflection.getParameterTypesDescription(argTypes); - throw new IllegalArgumentException("No compatible constructor found: " + theClass.getSimpleName() + argTypesDesc); + throw new IllegalArgumentException( + "No compatible constructor found: " + theClass.getSimpleName() + argTypesDesc); } } } diff --git a/spark-load/spark-load-common/src/main/java/org/apache/doris/common/jmockit/Deencapsulation.java b/spark-load/spark-load-common/src/main/java/org/apache/doris/common/jmockit/Deencapsulation.java index 5fb33717..74362e0c 100644 --- a/spark-load/spark-load-common/src/main/java/org/apache/doris/common/jmockit/Deencapsulation.java +++ b/spark-load/spark-load-common/src/main/java/org/apache/doris/common/jmockit/Deencapsulation.java @@ -7,6 +7,8 @@ /** * Modify from mockit.internal.util.Deencapsulation JMockit ver1.13 + *
<p>
+ * Copied from Apache Doris */ public final class Deencapsulation { private Deencapsulation() { diff --git a/spark-load/spark-load-common/src/main/java/org/apache/doris/common/jmockit/FieldReflection.java b/spark-load/spark-load-common/src/main/java/org/apache/doris/common/jmockit/FieldReflection.java index f37aedee..04c6d9cd 100644 --- a/spark-load/spark-load-common/src/main/java/org/apache/doris/common/jmockit/FieldReflection.java +++ b/spark-load/spark-load-common/src/main/java/org/apache/doris/common/jmockit/FieldReflection.java @@ -12,10 +12,11 @@ import java.lang.reflect.Type; import java.lang.reflect.TypeVariable; - /** * Modify from mockit.internal.util.FieldReflection JMockit v1.13 * Util class to set and get the value of specified field. + *
<p>
+ * Copied from Apache Doris */ public final class FieldReflection { private FieldReflection() { diff --git a/spark-load/spark-load-common/src/main/java/org/apache/doris/common/jmockit/GeneratedClasses.java b/spark-load/spark-load-common/src/main/java/org/apache/doris/common/jmockit/GeneratedClasses.java index 1aae3418..1281f4ed 100644 --- a/spark-load/spark-load-common/src/main/java/org/apache/doris/common/jmockit/GeneratedClasses.java +++ b/spark-load/spark-load-common/src/main/java/org/apache/doris/common/jmockit/GeneratedClasses.java @@ -10,6 +10,8 @@ /** * Modify from mockit.internal.util.GeneratedClasses JMockit v1.13 * Helper class to return type of mocked-object + *
<p>
+ * Copied from Apache Doris */ public final class GeneratedClasses { private static final String IMPLCLASS_PREFIX = "$Impl_"; diff --git a/spark-load/spark-load-common/src/main/java/org/apache/doris/common/jmockit/MethodReflection.java b/spark-load/spark-load-common/src/main/java/org/apache/doris/common/jmockit/MethodReflection.java index 749e2e7c..293e9816 100644 --- a/spark-load/spark-load-common/src/main/java/org/apache/doris/common/jmockit/MethodReflection.java +++ b/spark-load/spark-load-common/src/main/java/org/apache/doris/common/jmockit/MethodReflection.java @@ -13,6 +13,8 @@ /** * Modify from mockit.internal.util.MethodReflection JMockit v1.13 * Util class to get and invoke method from specified class. + *
<p>
+ * Copied from Apache Doris */ public final class MethodReflection { private MethodReflection() { @@ -24,9 +26,11 @@ public static T invoke(Class theClass, Object targetInstance, String meth } boolean staticMethod = targetInstance == null; Class[] argTypes = ParameterReflection.getArgumentTypesFromArgumentValues(methodArgs); - Method method = staticMethod ? findCompatibleStaticMethod(theClass, methodName, argTypes) : findCompatibleMethod(theClass, methodName, argTypes); + Method method = staticMethod ? findCompatibleStaticMethod(theClass, methodName, argTypes) : + findCompatibleMethod(theClass, methodName, argTypes); if (staticMethod && !Modifier.isStatic(method.getModifiers())) { - throw new IllegalArgumentException("Attempted to invoke non-static method without an instance to invoke it on"); + throw new IllegalArgumentException( + "Attempted to invoke non-static method without an instance to invoke it on"); } else { T result = invoke(targetInstance, method, methodArgs); return result; @@ -110,7 +114,7 @@ private static Method findCompatibleMethodInClass(Class theClass, String meth if (gap == 0 && (ParameterReflection.matchesParameterTypes(declaredParamTypes, argTypes) || ParameterReflection.acceptsArgumentTypes(declaredParamTypes, argTypes)) && (foundParamTypes == null - || ParameterReflection.hasMoreSpecificTypes(declaredParamTypes, foundParamTypes))) { + || ParameterReflection.hasMoreSpecificTypes(declaredParamTypes, foundParamTypes))) { found = declaredMethod; foundParamTypes = declaredParamTypes; } @@ -132,7 +136,9 @@ private static Method findCompatibleMethodIfAvailable(Class theClass, String while (true) { Method compatibleMethod = findCompatibleMethodInClass(theClass, methodName, argTypes); - if (compatibleMethod != null && (methodFound == null || ParameterReflection.hasMoreSpecificTypes(compatibleMethod.getParameterTypes(), methodFound.getParameterTypes()))) { + if (compatibleMethod != null && (methodFound == null || + ParameterReflection.hasMoreSpecificTypes(compatibleMethod.getParameterTypes(), + methodFound.getParameterTypes()))) { methodFound = compatibleMethod; } @@ -146,7 +152,6 @@ private static Method findCompatibleMethodIfAvailable(Class theClass, String } - // ensure that field is accessible public static void makeAccessible(AccessibleObject classMember) { if (!classMember.isAccessible()) { diff --git a/spark-load/spark-load-common/src/main/java/org/apache/doris/common/jmockit/ParameterReflection.java b/spark-load/spark-load-common/src/main/java/org/apache/doris/common/jmockit/ParameterReflection.java index 84a54dfd..6a6efc11 100644 --- a/spark-load/spark-load-common/src/main/java/org/apache/doris/common/jmockit/ParameterReflection.java +++ b/spark-load/spark-load-common/src/main/java/org/apache/doris/common/jmockit/ParameterReflection.java @@ -10,6 +10,8 @@ /** * Modify from mockit.internal.util.ParameterReflection JMockit v1.13 * Util class to verify parameter of methods. + *
<p>
+ * Copied from Apache Doris */ public final class ParameterReflection { public static final Class[] NO_PARAMETERS = new Class[0]; diff --git a/spark-load/spark-load-common/src/main/java/org/apache/doris/config/EtlJobConfig.java b/spark-load/spark-load-common/src/main/java/org/apache/doris/config/EtlJobConfig.java index 93c4c6c0..99e679f1 100644 --- a/spark-load/spark-load-common/src/main/java/org/apache/doris/config/EtlJobConfig.java +++ b/spark-load/spark-load-common/src/main/java/org/apache/doris/config/EtlJobConfig.java @@ -1,6 +1,5 @@ package org.apache.doris.config; -import com.fasterxml.jackson.annotation.JsonCreator; import com.fasterxml.jackson.annotation.JsonIgnore; import com.fasterxml.jackson.annotation.JsonProperty; import com.fasterxml.jackson.core.JsonProcessingException; @@ -14,6 +13,9 @@ import java.util.List; import java.util.Map; +/** + * Copied from Apache Doris org.apache.doris.sparkdpp.EtlJobConfig + */ public class EtlJobConfig implements Serializable { // global dict public static final String GLOBAL_DICT_TABLE_NAME = "doris_global_dict_table_%d"; diff --git a/spark-load/spark-load-core/pom.xml b/spark-load/spark-load-core/pom.xml index b1a31e82..2878ad4d 100644 --- a/spark-load/spark-load-core/pom.xml +++ b/spark-load/spark-load-core/pom.xml @@ -36,6 +36,11 @@ + + org.apache.doris + spark-load-common + ${revision} + com.fasterxml.jackson.core jackson-databind @@ -79,10 +84,6 @@ org.apache.hadoop hadoop-client - - org.apache.doris - fe-common - org.apache.logging.log4j log4j-core diff --git a/spark-load/spark-load-core/src/main/java/org/apache/doris/SparkLoadRunner.java b/spark-load/spark-load-core/src/main/java/org/apache/doris/SparkLoadRunner.java index a3207636..a8d64dcf 100644 --- a/spark-load/spark-load-core/src/main/java/org/apache/doris/SparkLoadRunner.java +++ b/spark-load/spark-load-core/src/main/java/org/apache/doris/SparkLoadRunner.java @@ -114,13 +114,9 @@ private static CommandLineOptions parseArgs(String[] args) { System.exit(-1); } - if (cmd.hasOption('c') || cmd.hasOption("config")) { - String configPath = cmd.getOptionValue("config"); - boolean recovery = cmd.hasOption('r') || cmd.hasOption("recovery"); - return new CommandLineOptions(configPath, recovery); - } - - throw new IllegalArgumentException(); + String configPath = cmd.getOptionValue("config"); + boolean recovery = cmd.hasOption('r') || cmd.hasOption("recovery"); + return new CommandLineOptions(configPath, recovery); } @@ -144,6 +140,7 @@ private static void checkConfig(JobConfig jobConfig) { Preconditions.checkArgument(StringUtils.isNoneBlank(jobConfig.getDatabase()), "database is empty"); jobConfig.checkTaskInfo(); jobConfig.checkSparkInfo(); + jobConfig.checkHadoopProperties(); } } diff --git a/spark-load/spark-load-core/src/main/java/org/apache/doris/common/meta/LoadMeta.java b/spark-load/spark-load-core/src/main/java/org/apache/doris/common/meta/LoadMeta.java index 6ac4ff71..fd56cd97 100644 --- a/spark-load/spark-load-core/src/main/java/org/apache/doris/common/meta/LoadMeta.java +++ b/spark-load/spark-load-core/src/main/java/org/apache/doris/common/meta/LoadMeta.java @@ -18,9 +18,9 @@ package org.apache.doris.common.meta; import org.apache.doris.common.Constants; +import org.apache.doris.config.EtlJobConfig; import org.apache.doris.config.JobConfig; import org.apache.doris.exception.SparkLoadException; -import org.apache.doris.sparkdpp.EtlJobConfig; import com.google.common.annotations.VisibleForTesting; import lombok.Data; diff --git 
a/spark-load/spark-load-core/src/main/java/org/apache/doris/config/JobConfig.java b/spark-load/spark-load-core/src/main/java/org/apache/doris/config/JobConfig.java index 9ffc4408..2eaaed21 100644 --- a/spark-load/spark-load-core/src/main/java/org/apache/doris/config/JobConfig.java +++ b/spark-load/spark-load-core/src/main/java/org/apache/doris/config/JobConfig.java @@ -20,11 +20,11 @@ import org.apache.doris.SparkLoadRunner; import org.apache.doris.common.Constants; import org.apache.doris.common.enums.LoadMode; -import org.apache.doris.sparkdpp.EtlJobConfig; import com.fasterxml.jackson.annotation.JsonProperty; import com.google.common.base.Preconditions; import lombok.Data; +import org.apache.commons.io.FileUtils; import org.apache.commons.lang3.StringUtils; import java.io.File; diff --git a/spark-load/spark-load-core/src/main/java/org/apache/doris/load/job/PullLoader.java b/spark-load/spark-load-core/src/main/java/org/apache/doris/load/job/PullLoader.java index 23a0e829..a2bcf7bc 100644 --- a/spark-load/spark-load-core/src/main/java/org/apache/doris/load/job/PullLoader.java +++ b/spark-load/spark-load-core/src/main/java/org/apache/doris/load/job/PullLoader.java @@ -25,9 +25,9 @@ import org.apache.doris.common.enums.JobStatus; import org.apache.doris.common.meta.LoadMeta; import org.apache.doris.common.meta.TableMeta; +import org.apache.doris.config.EtlJobConfig; import org.apache.doris.config.JobConfig; import org.apache.doris.exception.SparkLoadException; -import org.apache.doris.sparkdpp.EtlJobConfig; import org.apache.doris.util.DateUtils; import org.apache.doris.util.FileSystemUtils; import org.apache.doris.util.JsonUtils; diff --git a/spark-load/spark-load-core/src/test/java/org/apache/doris/common/meta/LoadMetaTest.java b/spark-load/spark-load-core/src/test/java/org/apache/doris/common/meta/LoadMetaTest.java index 79c95739..7546583a 100644 --- a/spark-load/spark-load-core/src/test/java/org/apache/doris/common/meta/LoadMetaTest.java +++ b/spark-load/spark-load-core/src/test/java/org/apache/doris/common/meta/LoadMetaTest.java @@ -18,8 +18,8 @@ package org.apache.doris.common.meta; +import org.apache.doris.config.EtlJobConfig; import org.apache.doris.exception.SparkLoadException; -import org.apache.doris.sparkdpp.EtlJobConfig; import org.junit.Assert; import org.junit.Rule; @@ -45,7 +45,7 @@ public void checkMapping() throws SparkLoadException { columns.add(new EtlJobConfig.EtlColumn("c1", "HLL", true, false, "NONE", null, 0, 10, 0)); columns.add(new EtlJobConfig.EtlColumn("c2", "BITMAP", true, false, "NONE", null, 0, 10, 0)); - EtlJobConfig.EtlIndex etlIndex = new EtlJobConfig.EtlIndex(1, columns, 1, "DUPLICATE", true); + EtlJobConfig.EtlIndex etlIndex = new EtlJobConfig.EtlIndex(1, columns, 1, "DUPLICATE", true, 1); EtlJobConfig.EtlPartition etlPartition = new EtlJobConfig.EtlPartition(1L, Collections.singletonList(0), Collections.singletonList(1), true, 1); EtlJobConfig.EtlPartitionInfo etlPartitionInfo = diff --git a/spark-load/spark-load-dpp/pom.xml b/spark-load/spark-load-dpp/pom.xml index af5ca5c7..67647cff 100644 --- a/spark-load/spark-load-dpp/pom.xml +++ b/spark-load/spark-load-dpp/pom.xml @@ -33,8 +33,9 @@ under the License. - ${project.groupId} - fe-common + org.apache.doris + spark-load-common + ${revision} @@ -46,11 +47,7 @@ under the License. 
org.apache.commons commons-lang3 - - - - - + diff --git a/spark-load/spark-load-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/ColumnParser.java b/spark-load/spark-load-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/ColumnParser.java index 84ef9ba8..d639b31f 100644 --- a/spark-load/spark-load-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/ColumnParser.java +++ b/spark-load/spark-load-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/ColumnParser.java @@ -18,7 +18,7 @@ package org.apache.doris.load.loadv2.dpp; import org.apache.doris.common.SparkDppException; -import org.apache.doris.sparkdpp.EtlJobConfig; +import org.apache.doris.config.EtlJobConfig; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -33,8 +33,6 @@ // Parser to validate value for different type public abstract class ColumnParser implements Serializable { - protected static final Logger LOG = LoggerFactory.getLogger(ColumnParser.class); - // thread safe formatter public static final DateTimeFormatter DATE_FORMATTER = new DateTimeFormatterBuilder() .appendPattern("uuuu-MM-dd") @@ -42,6 +40,7 @@ public abstract class ColumnParser implements Serializable { public static final DateTimeFormatter DATE_TIME_FORMATTER = new DateTimeFormatterBuilder() .appendPattern("uuuu-MM-dd HH:mm:ss") .toFormatter(); + protected static final Logger LOG = LoggerFactory.getLogger(ColumnParser.class); public static ColumnParser create(EtlJobConfig.EtlColumn etlColumn) throws SparkDppException { String columnType = etlColumn.columnType; diff --git a/spark-load/spark-load-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/DorisRangePartitioner.java b/spark-load/spark-load-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/DorisRangePartitioner.java index 05f2bdcc..9fd413db 100644 --- a/spark-load/spark-load-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/DorisRangePartitioner.java +++ b/spark-load/spark-load-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/DorisRangePartitioner.java @@ -17,7 +17,7 @@ package org.apache.doris.load.loadv2.dpp; -import org.apache.doris.sparkdpp.EtlJobConfig; +import org.apache.doris.config.EtlJobConfig; import org.apache.spark.Partitioner; diff --git a/spark-load/spark-load-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/DppUtils.java b/spark-load/spark-load-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/DppUtils.java index 0c6b6454..bf190408 100644 --- a/spark-load/spark-load-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/DppUtils.java +++ b/spark-load/spark-load-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/DppUtils.java @@ -18,7 +18,7 @@ package org.apache.doris.load.loadv2.dpp; import org.apache.doris.common.SparkDppException; -import org.apache.doris.sparkdpp.EtlJobConfig; +import org.apache.doris.config.EtlJobConfig; import com.google.common.collect.Lists; import org.apache.spark.sql.Row; @@ -231,7 +231,7 @@ public static StructType replaceBinaryColsInSchema(Set binaryColumns, St } public static StructType createDstTableSchema(List columns, - boolean addBucketIdColumn, boolean regardDistinctColumnAsBinary) { + boolean addBucketIdColumn, boolean regardDistinctColumnAsBinary) { List fields = new ArrayList<>(); if (addBucketIdColumn) { StructField bucketIdField = DataTypes.createStructField(BUCKET_ID, DataTypes.StringType, true); diff --git a/spark-load/spark-load-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/MinimumCoverageRollupTreeBuilder.java b/spark-load/spark-load-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/MinimumCoverageRollupTreeBuilder.java index 
0b54389a..ca89ab8d 100644 --- a/spark-load/spark-load-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/MinimumCoverageRollupTreeBuilder.java +++ b/spark-load/spark-load-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/MinimumCoverageRollupTreeBuilder.java @@ -17,7 +17,7 @@ package org.apache.doris.load.loadv2.dpp; -import org.apache.doris.sparkdpp.EtlJobConfig; +import org.apache.doris.config.EtlJobConfig; import java.util.ArrayList; import java.util.Collections; diff --git a/spark-load/spark-load-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/RollupTreeBuilder.java b/spark-load/spark-load-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/RollupTreeBuilder.java index acb0d4c9..16ce92b8 100644 --- a/spark-load/spark-load-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/RollupTreeBuilder.java +++ b/spark-load/spark-load-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/RollupTreeBuilder.java @@ -17,7 +17,7 @@ package org.apache.doris.load.loadv2.dpp; -import org.apache.doris.sparkdpp.EtlJobConfig; +import org.apache.doris.config.EtlJobConfig; // RollupTreeBuilder is used to get the RollupTree from the TableMeta public abstract interface RollupTreeBuilder { diff --git a/spark-load/spark-load-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/RollupTreeNode.java b/spark-load/spark-load-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/RollupTreeNode.java index a95482c2..ec3129f3 100644 --- a/spark-load/spark-load-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/RollupTreeNode.java +++ b/spark-load/spark-load-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/RollupTreeNode.java @@ -17,7 +17,7 @@ package org.apache.doris.load.loadv2.dpp; -import org.apache.doris.sparkdpp.EtlJobConfig; +import org.apache.doris.config.EtlJobConfig; import java.util.List; diff --git a/spark-load/spark-load-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/SparkDpp.java b/spark-load/spark-load-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/SparkDpp.java index 08137c70..325b39c8 100644 --- a/spark-load/spark-load-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/SparkDpp.java +++ b/spark-load/spark-load-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/SparkDpp.java @@ -17,9 +17,9 @@ package org.apache.doris.load.loadv2.dpp; +import org.apache.doris.common.DppResult; import org.apache.doris.common.SparkDppException; -import org.apache.doris.sparkdpp.DppResult; -import org.apache.doris.sparkdpp.EtlJobConfig; +import org.apache.doris.config.EtlJobConfig; import com.google.common.base.Strings; import com.google.common.collect.Maps; @@ -95,6 +95,8 @@ public final class SparkDpp implements java.io.Serializable { private static final String NULL_FLAG = "\\N"; private static final String DPP_RESULT_FILE = "dpp_result.json"; private static final String BITMAP_TYPE = "bitmap"; + Map> tableToBitmapDictColumns = new HashMap<>(); + Map> tableToBinaryBitmapColumns = new HashMap<>(); private SparkSession spark = null; private EtlJobConfig etlJobConfig = null; private LongAccumulator abnormalRowAcc = null; @@ -109,8 +111,6 @@ public final class SparkDpp implements java.io.Serializable { // we need to wrap it so that we can use it in executor. 
private SerializableConfiguration serializableHadoopConf; private DppResult dppResult = new DppResult(); - Map> tableToBitmapDictColumns = new HashMap<>(); - Map> tableToBinaryBitmapColumns = new HashMap<>(); // just for ut public SparkDpp() { @@ -252,6 +252,7 @@ private void writeRepartitionAndSortedRDDToParquet(JavaPairRDD, Obj conf.setBoolean("spark.sql.parquet.int64AsTimestampMillis", false); conf.setBoolean("spark.sql.parquet.int96AsTimestamp", true); conf.setBoolean("spark.sql.parquet.binaryAsString", false); + conf.setBoolean("spark.sql.parquet.fieldId.write.enabled", true); conf.set("spark.sql.parquet.outputTimestampType", "INT96"); ParquetWriteSupport.setSchema(dstSchema, conf); ParquetWriteSupport parquetWriteSupport = new ParquetWriteSupport(); diff --git a/spark-load/spark-load-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/SparkRDDAggregator.java b/spark-load/spark-load-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/SparkRDDAggregator.java index 0e140af1..e06dc2df 100644 --- a/spark-load/spark-load-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/SparkRDDAggregator.java +++ b/spark-load/spark-load-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/SparkRDDAggregator.java @@ -20,7 +20,7 @@ import org.apache.doris.common.SparkDppException; import org.apache.doris.common.io.BitmapValue; import org.apache.doris.common.io.Hll; -import org.apache.doris.sparkdpp.EtlJobConfig; +import org.apache.doris.config.EtlJobConfig; import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.tuple.Pair; diff --git a/spark-load/spark-load-dpp/src/main/java/org/apache/doris/load/loadv2/etl/SparkEtlJob.java b/spark-load/spark-load-dpp/src/main/java/org/apache/doris/load/loadv2/etl/SparkEtlJob.java index a359612e..03300014 100644 --- a/spark-load/spark-load-dpp/src/main/java/org/apache/doris/load/loadv2/etl/SparkEtlJob.java +++ b/spark-load/spark-load-dpp/src/main/java/org/apache/doris/load/loadv2/etl/SparkEtlJob.java @@ -18,14 +18,14 @@ package org.apache.doris.load.loadv2.etl; import org.apache.doris.common.SparkDppException; +import org.apache.doris.config.EtlJobConfig; +import org.apache.doris.config.EtlJobConfig.EtlColumn; +import org.apache.doris.config.EtlJobConfig.EtlColumnMapping; +import org.apache.doris.config.EtlJobConfig.EtlFileGroup; +import org.apache.doris.config.EtlJobConfig.EtlIndex; +import org.apache.doris.config.EtlJobConfig.EtlTable; import org.apache.doris.load.loadv2.dpp.GlobalDictBuilder; import org.apache.doris.load.loadv2.dpp.SparkDpp; -import org.apache.doris.sparkdpp.EtlJobConfig; -import org.apache.doris.sparkdpp.EtlJobConfig.EtlColumn; -import org.apache.doris.sparkdpp.EtlJobConfig.EtlColumnMapping; -import org.apache.doris.sparkdpp.EtlJobConfig.EtlFileGroup; -import org.apache.doris.sparkdpp.EtlJobConfig.EtlIndex; -import org.apache.doris.sparkdpp.EtlJobConfig.EtlTable; import com.google.common.collect.Lists; import com.google.common.collect.Maps; diff --git a/spark-load/spark-load-dpp/src/test/java/org/apache/doris/load/loadv2/dpp/ColumnParserTest.java b/spark-load/spark-load-dpp/src/test/java/org/apache/doris/load/loadv2/dpp/ColumnParserTest.java index 9091686c..41de92ae 100644 --- a/spark-load/spark-load-dpp/src/test/java/org/apache/doris/load/loadv2/dpp/ColumnParserTest.java +++ b/spark-load/spark-load-dpp/src/test/java/org/apache/doris/load/loadv2/dpp/ColumnParserTest.java @@ -17,7 +17,7 @@ package org.apache.doris.load.loadv2.dpp; -import org.apache.doris.sparkdpp.EtlJobConfig; +import org.apache.doris.config.EtlJobConfig; import 
org.junit.Assert; import org.junit.Test; diff --git a/spark-load/spark-load-dpp/src/test/java/org/apache/doris/load/loadv2/dpp/DorisRangePartitionerTest.java b/spark-load/spark-load-dpp/src/test/java/org/apache/doris/load/loadv2/dpp/DorisRangePartitionerTest.java index 131018ed..0e865ddd 100644 --- a/spark-load/spark-load-dpp/src/test/java/org/apache/doris/load/loadv2/dpp/DorisRangePartitionerTest.java +++ b/spark-load/spark-load-dpp/src/test/java/org/apache/doris/load/loadv2/dpp/DorisRangePartitionerTest.java @@ -17,7 +17,7 @@ package org.apache.doris.load.loadv2.dpp; -import org.apache.doris.sparkdpp.EtlJobConfig; +import org.apache.doris.config.EtlJobConfig; import org.junit.Assert; import org.junit.Test; diff --git a/spark-load/spark-load-dpp/src/test/java/org/apache/doris/load/loadv2/dpp/DppUtilsTest.java b/spark-load/spark-load-dpp/src/test/java/org/apache/doris/load/loadv2/dpp/DppUtilsTest.java index e7cea5d0..30f7a30b 100644 --- a/spark-load/spark-load-dpp/src/test/java/org/apache/doris/load/loadv2/dpp/DppUtilsTest.java +++ b/spark-load/spark-load-dpp/src/test/java/org/apache/doris/load/loadv2/dpp/DppUtilsTest.java @@ -17,7 +17,7 @@ package org.apache.doris.load.loadv2.dpp; -import org.apache.doris.sparkdpp.EtlJobConfig; +import org.apache.doris.config.EtlJobConfig; import org.apache.spark.sql.types.DataType; import org.apache.spark.sql.types.DataTypes; diff --git a/spark-load/spark-load-dpp/src/test/java/org/apache/doris/load/loadv2/dpp/MinimumCoverageRollupTreeBuilderTest.java b/spark-load/spark-load-dpp/src/test/java/org/apache/doris/load/loadv2/dpp/MinimumCoverageRollupTreeBuilderTest.java index 90c95cf0..33c0dba3 100644 --- a/spark-load/spark-load-dpp/src/test/java/org/apache/doris/load/loadv2/dpp/MinimumCoverageRollupTreeBuilderTest.java +++ b/spark-load/spark-load-dpp/src/test/java/org/apache/doris/load/loadv2/dpp/MinimumCoverageRollupTreeBuilderTest.java @@ -17,7 +17,7 @@ package org.apache.doris.load.loadv2.dpp; -import org.apache.doris.sparkdpp.EtlJobConfig; +import org.apache.doris.config.EtlJobConfig; import org.junit.Assert; import org.junit.Test; @@ -55,24 +55,24 @@ public void testBuild() { baseColumns.add(column3); baseColumns.add(column4); EtlJobConfig.EtlIndex baseIndex = new EtlJobConfig.EtlIndex(10000, - baseColumns, 12345, "DUPLICATE", true); + baseColumns, 12345, "DUPLICATE", true, 1); List roll1Columns = new ArrayList<>(); roll1Columns.add(column1); roll1Columns.add(column2); roll1Columns.add(column4); EtlJobConfig.EtlIndex roll1Index = new EtlJobConfig.EtlIndex(10001, - roll1Columns, 12346, "AGGREGATE", false); + roll1Columns, 12346, "AGGREGATE", false, 1); List roll2Columns = new ArrayList<>(); roll2Columns.add(column1); roll2Columns.add(column4); EtlJobConfig.EtlIndex roll2Index = new EtlJobConfig.EtlIndex(10002, - roll2Columns, 12347, "AGGREGATE", false); + roll2Columns, 12347, "AGGREGATE", false, 1); List roll3Columns = new ArrayList<>(); roll3Columns.add(column3); roll3Columns.add(column4); EtlJobConfig.EtlIndex roll3Index = new EtlJobConfig.EtlIndex(10003, - roll3Columns, 12348, "AGGREGATE", false); + roll3Columns, 12348, "AGGREGATE", false, 1); List indexes = new ArrayList<>(); indexes.add(baseIndex); diff --git a/spark-load/spark-load-dpp/src/test/java/org/apache/doris/load/loadv2/dpp/SparkDppTest.java b/spark-load/spark-load-dpp/src/test/java/org/apache/doris/load/loadv2/dpp/SparkDppTest.java index 7522a69c..31ad8d36 100644 --- a/spark-load/spark-load-dpp/src/test/java/org/apache/doris/load/loadv2/dpp/SparkDppTest.java +++ 
b/spark-load/spark-load-dpp/src/test/java/org/apache/doris/load/loadv2/dpp/SparkDppTest.java @@ -17,7 +17,7 @@ package org.apache.doris.load.loadv2.dpp; -import org.apache.doris.sparkdpp.EtlJobConfig; +import org.apache.doris.config.EtlJobConfig; import org.apache.spark.sql.RowFactory; import org.junit.Assert; diff --git a/spark-load/spark-load-dpp/src/test/java/org/apache/doris/load/loadv2/etl/SparkEtlJobTest.java b/spark-load/spark-load-dpp/src/test/java/org/apache/doris/load/loadv2/etl/SparkEtlJobTest.java index 0ea7f660..aa50de59 100644 --- a/spark-load/spark-load-dpp/src/test/java/org/apache/doris/load/loadv2/etl/SparkEtlJobTest.java +++ b/spark-load/spark-load-dpp/src/test/java/org/apache/doris/load/loadv2/etl/SparkEtlJobTest.java @@ -18,15 +18,15 @@ package org.apache.doris.load.loadv2.etl; import org.apache.doris.common.jmockit.Deencapsulation; -import org.apache.doris.sparkdpp.EtlJobConfig; -import org.apache.doris.sparkdpp.EtlJobConfig.EtlColumn; -import org.apache.doris.sparkdpp.EtlJobConfig.EtlColumnMapping; -import org.apache.doris.sparkdpp.EtlJobConfig.EtlFileGroup; -import org.apache.doris.sparkdpp.EtlJobConfig.EtlIndex; -import org.apache.doris.sparkdpp.EtlJobConfig.EtlJobProperty; -import org.apache.doris.sparkdpp.EtlJobConfig.EtlPartition; -import org.apache.doris.sparkdpp.EtlJobConfig.EtlPartitionInfo; -import org.apache.doris.sparkdpp.EtlJobConfig.EtlTable; +import org.apache.doris.config.EtlJobConfig; +import org.apache.doris.config.EtlJobConfig.EtlColumn; +import org.apache.doris.config.EtlJobConfig.EtlColumnMapping; +import org.apache.doris.config.EtlJobConfig.EtlFileGroup; +import org.apache.doris.config.EtlJobConfig.EtlIndex; +import org.apache.doris.config.EtlJobConfig.EtlJobProperty; +import org.apache.doris.config.EtlJobConfig.EtlPartition; +import org.apache.doris.config.EtlJobConfig.EtlPartitionInfo; +import org.apache.doris.config.EtlJobConfig.EtlTable; import com.google.common.collect.Lists; import com.google.common.collect.Maps; @@ -68,9 +68,9 @@ public void setUp() { EtlColumn k1 = new EtlColumn("k1", "INT", false, true, "NONE", "0", 0, 0, 0); EtlColumn k2 = new EtlColumn("k2", "VARCHAR", false, true, "NONE", "0", 10, 0, 0); EtlColumn v1 = new EtlColumn("v1", "BIGINT", false, false, "NONE", "0", 0, 0, 0); - EtlIndex index1 = new EtlIndex(index1Id, Lists.newArrayList(k1, k2, v1), 666666, "DUPLICATE", true); + EtlIndex index1 = new EtlIndex(index1Id, Lists.newArrayList(k1, k2, v1), 666666, "DUPLICATE", true, 1); v1 = new EtlColumn("v1", "BIGINT", false, false, "SUM", "0", 0, 0, 0); - EtlIndex index2 = new EtlIndex(index2Id, Lists.newArrayList(k1, v1), 888888, "AGGREGATE", true); + EtlIndex index2 = new EtlIndex(index2Id, Lists.newArrayList(k1, v1), 888888, "AGGREGATE", true,1 ); List indexes = Lists.newArrayList(index1, index2); // partition info List partitions = Lists.newArrayList(); From 8c5a7900f0b2a36e08250ba8184055d6e9fea4ba Mon Sep 17 00:00:00 2001 From: gnehil Date: Fri, 5 Jul 2024 11:19:13 +0800 Subject: [PATCH 33/45] add license header --- spark-load/build.sh | 2 +- .../apache/doris/common/io/BitmapValue.java | 19 ++++++++++++++++++- .../java/org/apache/doris/common/io/Hll.java | 19 ++++++++++++++++++- .../apache/doris/common/io/Roaring64Map.java | 17 +++++++++++++++++ .../org/apache/doris/config/EtlJobConfig.java | 19 ++++++++++++++++++- .../org/apache/doris/client/DorisClient.java | 17 +++++++++++++++++ .../{config => common/enums}/TaskType.java | 2 +- .../org/apache/doris/config/JobConfig.java | 1 + 8 files changed, 91 insertions(+), 5 
deletions(-) rename spark-load/spark-load-core/src/main/java/org/apache/doris/{config => common/enums}/TaskType.java (95%) diff --git a/spark-load/build.sh b/spark-load/build.sh index 88c2aeba..a8ca1c73 100644 --- a/spark-load/build.sh +++ b/spark-load/build.sh @@ -17,7 +17,7 @@ # under the License. ############################################################## -# This script is used to compile Spark-Doris-Connector +# This script is used to compile Spark-Load # Usage: # sh build.sh # diff --git a/spark-load/spark-load-common/src/main/java/org/apache/doris/common/io/BitmapValue.java b/spark-load/spark-load-common/src/main/java/org/apache/doris/common/io/BitmapValue.java index d6fd410b..db4a65c2 100644 --- a/spark-load/spark-load-common/src/main/java/org/apache/doris/common/io/BitmapValue.java +++ b/spark-load/spark-load-common/src/main/java/org/apache/doris/common/io/BitmapValue.java @@ -1,3 +1,20 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + package org.apache.doris.common.io; import org.roaringbitmap.Util; @@ -403,4 +420,4 @@ public boolean is32BitsEnough() { return false; } } -} \ No newline at end of file +} diff --git a/spark-load/spark-load-common/src/main/java/org/apache/doris/common/io/Hll.java b/spark-load/spark-load-common/src/main/java/org/apache/doris/common/io/Hll.java index 427543f8..a28ea1d8 100644 --- a/spark-load/spark-load-common/src/main/java/org/apache/doris/common/io/Hll.java +++ b/spark-load/spark-load-common/src/main/java/org/apache/doris/common/io/Hll.java @@ -1,3 +1,20 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ package org.apache.doris.common.io; import org.apache.commons.codec.binary.StringUtils; @@ -374,4 +391,4 @@ public byte[] getRegisters() { public Set getHashSet() { return hashSet; } -} \ No newline at end of file +} diff --git a/spark-load/spark-load-common/src/main/java/org/apache/doris/common/io/Roaring64Map.java b/spark-load/spark-load-common/src/main/java/org/apache/doris/common/io/Roaring64Map.java index 67b1e765..33237983 100644 --- a/spark-load/spark-load-common/src/main/java/org/apache/doris/common/io/Roaring64Map.java +++ b/spark-load/spark-load-common/src/main/java/org/apache/doris/common/io/Roaring64Map.java @@ -1,3 +1,20 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + package org.apache.doris.common.io; import org.roaringbitmap.BitmapDataProvider; diff --git a/spark-load/spark-load-common/src/main/java/org/apache/doris/config/EtlJobConfig.java b/spark-load/spark-load-common/src/main/java/org/apache/doris/config/EtlJobConfig.java index 99e679f1..8fa4db47 100644 --- a/spark-load/spark-load-common/src/main/java/org/apache/doris/config/EtlJobConfig.java +++ b/spark-load/spark-load-common/src/main/java/org/apache/doris/config/EtlJobConfig.java @@ -1,3 +1,20 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + package org.apache.doris.config; import com.fasterxml.jackson.annotation.JsonIgnore; @@ -487,4 +504,4 @@ public String toString() { } } -} \ No newline at end of file +} diff --git a/spark-load/spark-load-core/src/main/java/org/apache/doris/client/DorisClient.java b/spark-load/spark-load-core/src/main/java/org/apache/doris/client/DorisClient.java index 8345279f..b25d5ee7 100644 --- a/spark-load/spark-load-core/src/main/java/org/apache/doris/client/DorisClient.java +++ b/spark-load/spark-load-core/src/main/java/org/apache/doris/client/DorisClient.java @@ -1,3 +1,20 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + package org.apache.doris.client; import org.apache.doris.common.LoadInfo; diff --git a/spark-load/spark-load-core/src/main/java/org/apache/doris/config/TaskType.java b/spark-load/spark-load-core/src/main/java/org/apache/doris/common/enums/TaskType.java similarity index 95% rename from spark-load/spark-load-core/src/main/java/org/apache/doris/config/TaskType.java rename to spark-load/spark-load-core/src/main/java/org/apache/doris/common/enums/TaskType.java index ba3283ea..e6ebf9e0 100644 --- a/spark-load/spark-load-core/src/main/java/org/apache/doris/config/TaskType.java +++ b/spark-load/spark-load-core/src/main/java/org/apache/doris/common/enums/TaskType.java @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. -package org.apache.doris.config; +package org.apache.doris.common.enums; public enum TaskType { diff --git a/spark-load/spark-load-core/src/main/java/org/apache/doris/config/JobConfig.java b/spark-load/spark-load-core/src/main/java/org/apache/doris/config/JobConfig.java index 2eaaed21..1ab2f231 100644 --- a/spark-load/spark-load-core/src/main/java/org/apache/doris/config/JobConfig.java +++ b/spark-load/spark-load-core/src/main/java/org/apache/doris/config/JobConfig.java @@ -20,6 +20,7 @@ import org.apache.doris.SparkLoadRunner; import org.apache.doris.common.Constants; import org.apache.doris.common.enums.LoadMode; +import org.apache.doris.common.enums.TaskType; import com.fasterxml.jackson.annotation.JsonProperty; import com.google.common.base.Preconditions; From 15bdf4d57d97679104bd1fed81e9c4487b1a3e59 Mon Sep 17 00:00:00 2001 From: gnehil Date: Thu, 18 Jul 2024 10:05:16 +0800 Subject: [PATCH 34/45] add gson dependency for EtlJobConfig --- spark-load/pom.xml | 13 ++ spark-load/spark-load-common/pom.xml | 6 +- .../org/apache/doris/config/EtlJobConfig.java | 148 +++++++++--------- spark-load/spark-load-dpp/pom.xml | 6 +- 4 files changed, 99 insertions(+), 74 deletions(-) diff --git a/spark-load/pom.xml b/spark-load/pom.xml index 60a41168..7c95b668 100644 --- a/spark-load/pom.xml +++ b/spark-load/pom.xml @@ -57,6 +57,7 @@ 1.2 1.12.669 0.8.13 + 2.9.1 @@ -339,6 +340,18 @@ RoaringBitmap ${RoaringBitmap.version} + + + + com.google.code.gson + gson + ${gson.version} + + + org.apache.doris + spark-load-common + ${revision} + diff --git a/spark-load/spark-load-common/pom.xml b/spark-load/spark-load-common/pom.xml index cbab0271..3d6660e7 100644 --- a/spark-load/spark-load-common/pom.xml +++ b/spark-load/spark-load-common/pom.xml @@ -10,7 +10,7 @@ spark-load-common - + jar 8 8 @@ -22,6 +22,10 @@ com.fasterxml.jackson.core jackson-databind + + com.google.code.gson + gson + com.google.guava guava diff --git a/spark-load/spark-load-common/src/main/java/org/apache/doris/config/EtlJobConfig.java 
b/spark-load/spark-load-common/src/main/java/org/apache/doris/config/EtlJobConfig.java index 8fa4db47..9cca8650 100644 --- a/spark-load/spark-load-common/src/main/java/org/apache/doris/config/EtlJobConfig.java +++ b/spark-load/spark-load-common/src/main/java/org/apache/doris/config/EtlJobConfig.java @@ -17,13 +17,15 @@ package org.apache.doris.config; -import com.fasterxml.jackson.annotation.JsonIgnore; -import com.fasterxml.jackson.annotation.JsonProperty; -import com.fasterxml.jackson.core.JsonProcessingException; -import com.fasterxml.jackson.databind.json.JsonMapper; + import com.google.common.base.Strings; import com.google.common.collect.ImmutableMap; import com.google.common.collect.Lists; +import com.google.gson.ExclusionStrategy; +import com.google.gson.FieldAttributes; +import com.google.gson.Gson; +import com.google.gson.GsonBuilder; +import com.google.gson.annotations.SerializedName; import java.io.Serializable; import java.util.Comparator; @@ -47,17 +49,17 @@ public class EtlJobConfig implements Serializable { private static final String ETL_OUTPUT_PATH_FORMAT = "%s/jobs/%d/%s/%d"; private static final String ETL_OUTPUT_FILE_NAME_DESC_V1 = "version.label.tableId.partitionId.indexId.bucket.schemaHash.parquet"; - @JsonProperty(value = "tables") + @SerializedName(value = "tables") public Map tables; - @JsonProperty(value = "outputPath") + @SerializedName(value = "outputPath") public String outputPath; - @JsonProperty(value = "outputFilePattern") + @SerializedName(value = "outputFilePattern") public String outputFilePattern; - @JsonProperty(value = "label") + @SerializedName(value = "label") public String label; - @JsonProperty(value = "properties") + @SerializedName(value = "properties") public EtlJobProperty properties; - @JsonProperty(value = "configVersion") + @SerializedName(value = "configVersion") public ConfigVersion configVersion; /** @@ -113,9 +115,14 @@ public static String getTabletMetaStr(String filePath) throws Exception { } } - public static EtlJobConfig configFromJson(String jsonConfig) throws JsonProcessingException { - JsonMapper mapper = JsonMapper.builder().build(); - return mapper.readValue(jsonConfig, EtlJobConfig.class); + public static EtlJobConfig configFromJson(String jsonConfig) { + return new Gson().fromJson(jsonConfig, EtlJobConfig.class); + } + + public String configToJson() { + Gson gson = + new GsonBuilder().addDeserializationExclusionStrategy(new HiddenAnnotationExclusionStrategy()).create(); + return gson.toJson(this); } @Override @@ -129,11 +136,6 @@ public String getOutputPath() { return outputPath; } - public String configToJson() throws JsonProcessingException { - JsonMapper mapper = JsonMapper.builder().build(); - return mapper.writeValueAsString(this); - } - public enum ConfigVersion { V1 } @@ -147,9 +149,9 @@ public enum SourceType { } public static class EtlJobProperty implements Serializable { - @JsonProperty(value = "strictMode") + @SerializedName(value = "strictMode") public boolean strictMode; - @JsonProperty(value = "timezone") + @SerializedName(value = "timezone") public String timezone; @Override @@ -159,11 +161,11 @@ public String toString() { } public static class EtlTable implements Serializable { - @JsonProperty(value = "indexes") + @SerializedName(value = "indexes") public List indexes; - @JsonProperty(value = "partitionInfo") + @SerializedName(value = "partitionInfo") public EtlPartitionInfo partitionInfo; - @JsonProperty(value = "fileGroups") + @SerializedName(value = "fileGroups") public List fileGroups; /** @@ -190,25 
+192,25 @@ public String toString() { } public static class EtlColumn implements Serializable { - @JsonProperty(value = "columnName") + @SerializedName(value = "columnName") public String columnName; - @JsonProperty(value = "columnType") + @SerializedName(value = "columnType") public String columnType; - @JsonProperty(value = "isAllowNull") + @SerializedName(value = "isAllowNull") public boolean isAllowNull; - @JsonProperty(value = "isKey") + @SerializedName(value = "isKey") public boolean isKey; - @JsonProperty(value = "aggregationType") + @SerializedName(value = "aggregationType") public String aggregationType; - @JsonProperty(value = "defaultValue") + @SerializedName(value = "defaultValue") public String defaultValue; - @JsonProperty(value = "stringLength") + @SerializedName(value = "stringLength") public int stringLength; - @JsonProperty(value = "precision") + @SerializedName(value = "precision") public int precision; - @JsonProperty(value = "scale") + @SerializedName(value = "scale") public int scale; - @JsonProperty(value = "defineExpr") + @SerializedName(value = "defineExpr") public String defineExpr; // for unit test @@ -253,17 +255,17 @@ public int compare(EtlIndex a, EtlIndex b) { } public static class EtlIndex implements Serializable { - @JsonProperty(value = "indexId") + @SerializedName(value = "indexId") public long indexId; - @JsonProperty(value = "columns") + @SerializedName(value = "columns") public List columns; - @JsonProperty(value = "schemaHash") + @SerializedName(value = "schemaHash") public int schemaHash; - @JsonProperty(value = "indexType") + @SerializedName(value = "indexType") public String indexType; - @JsonProperty(value = "isBaseIndex") + @SerializedName(value = "isBaseIndex") public boolean isBaseIndex; - @JsonProperty(value = "schemaVersion") + @SerializedName(value = "schemaVersion") public int schemaVersion; /** @@ -300,13 +302,13 @@ public String toString() { } public static class EtlPartitionInfo implements Serializable { - @JsonProperty(value = "partitionType") + @SerializedName(value = "partitionType") public String partitionType; - @JsonProperty(value = "partitionColumnRefs") + @SerializedName(value = "partitionColumnRefs") public List partitionColumnRefs; - @JsonProperty(value = "distributionColumnRefs") + @SerializedName(value = "distributionColumnRefs") public List distributionColumnRefs; - @JsonProperty(value = "partitions") + @SerializedName(value = "partitions") public List partitions; /** @@ -332,15 +334,15 @@ public String toString() { } public static class EtlPartition implements Serializable { - @JsonProperty(value = "partitionId") + @SerializedName(value = "partitionId") public long partitionId; - @JsonProperty(value = "startKeys") + @SerializedName(value = "startKeys") public List startKeys; - @JsonProperty(value = "endKeys") + @SerializedName(value = "endKeys") public List endKeys; - @JsonProperty(value = "isMaxPartition") + @SerializedName(value = "isMaxPartition") public boolean isMaxPartition; - @JsonProperty(value = "bucketNum") + @SerializedName(value = "bucketNum") public int bucketNum; /** @@ -366,42 +368,38 @@ public String toString() { } public static class EtlFileGroup implements Serializable { - @JsonProperty(value = "sourceType") + @SerializedName(value = "sourceType") public SourceType sourceType = SourceType.FILE; - @JsonProperty(value = "filePaths") + @SerializedName(value = "filePaths") public List filePaths; - @JsonProperty(value = "fileFieldNames") + @SerializedName(value = "fileFieldNames") public List fileFieldNames; - 
@JsonProperty(value = "columnsFromPath") + @SerializedName(value = "columnsFromPath") public List columnsFromPath; - @JsonProperty(value = "columnSeparator") + @SerializedName(value = "columnSeparator") public String columnSeparator; - @JsonProperty(value = "lineDelimiter") + @SerializedName(value = "lineDelimiter") public String lineDelimiter; - @JsonProperty(value = "isNegative") + @SerializedName(value = "isNegative") public boolean isNegative; - @JsonProperty(value = "fileFormat") + @SerializedName(value = "fileFormat") public String fileFormat; - @JsonProperty(value = "columnMappings") + @SerializedName(value = "columnMappings") public Map columnMappings; - @JsonProperty(value = "where") + @SerializedName(value = "where") public String where; - @JsonProperty(value = "partitions") + @SerializedName(value = "partitions") public List partitions; - @JsonProperty(value = "hiveDbTableName") + @SerializedName(value = "hiveDbTableName") public String hiveDbTableName; - @JsonProperty(value = "hiveTableProperties") + @SerializedName(value = "hiveTableProperties") public Map hiveTableProperties; // hive db table used in dpp, not serialized // set with hiveDbTableName (no bitmap column) or IntermediateHiveTable (created by global dict builder) // in spark etl job - @JsonIgnore public String dppHiveDbTableName; - public EtlFileGroup() { - } - // for data infile path public EtlFileGroup(SourceType sourceType, List filePaths, List fileFieldNames, List columnsFromPath, String columnSeparator, String lineDelimiter, @@ -455,16 +453,13 @@ public static class EtlColumnMapping implements Serializable { private static Map functionMap = new ImmutableMap.Builder().put("md5sum", "md5").build(); - @JsonProperty(value = "functionName") + @SerializedName(value = "functionName") public String functionName; - @JsonProperty(value = "args") + @SerializedName(value = "args") public List args; - @JsonProperty(value = "expr") + @SerializedName(value = "expr") public String expr; - public EtlColumnMapping() { - } - public EtlColumnMapping(String functionName, List args) { this.functionName = functionName; this.args = args; @@ -504,4 +499,15 @@ public String toString() { } } + public static class HiddenAnnotationExclusionStrategy implements ExclusionStrategy { + public boolean shouldSkipField(FieldAttributes f) { + return f.getAnnotation(SerializedName.class) == null; + } + + @Override + public boolean shouldSkipClass(Class clazz) { + return false; + } + } + } diff --git a/spark-load/spark-load-dpp/pom.xml b/spark-load/spark-load-dpp/pom.xml index 67647cff..a81f76f2 100644 --- a/spark-load/spark-load-dpp/pom.xml +++ b/spark-load/spark-load-dpp/pom.xml @@ -35,7 +35,6 @@ under the License. org.apache.doris spark-load-common - ${revision} @@ -182,7 +181,10 @@ under the License. 
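[Editorial note, not part of the patch] The EtlJobConfig change above swaps the Jackson annotations for Gson's @SerializedName and registers a HiddenAnnotationExclusionStrategy so that only annotated fields take part in (de)serialization; that is what keeps the dpp-only field dppHiveDbTableName (previously marked @JsonIgnore) out of the config JSON. Below is a minimal, self-contained sketch of the same Gson pattern; the Config class and its values are hypothetical, and only the Gson API usage mirrors the patch.

// Minimal sketch: skip every field lacking @SerializedName, as the patch does for EtlJobConfig.
import com.google.gson.ExclusionStrategy;
import com.google.gson.FieldAttributes;
import com.google.gson.Gson;
import com.google.gson.GsonBuilder;
import com.google.gson.annotations.SerializedName;

public class SerializedNameOnlyDemo {

    static class Config {
        @SerializedName("label")
        String label = "test_label";

        // No @SerializedName: the strategy below keeps this out of the JSON,
        // the same way dppHiveDbTableName is excluded after the patch.
        String dppHiveDbTableName = "tmp_hive_table";
    }

    public static void main(String[] args) {
        ExclusionStrategy onlyAnnotated = new ExclusionStrategy() {
            @Override
            public boolean shouldSkipField(FieldAttributes f) {
                return f.getAnnotation(SerializedName.class) == null;
            }

            @Override
            public boolean shouldSkipClass(Class<?> clazz) {
                return false;
            }
        };
        Gson gson = new GsonBuilder()
                .addSerializationExclusionStrategy(onlyAnnotated)
                .create();
        System.out.println(gson.toJson(new Config())); // prints {"label":"test_label"}
    }
}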
com.amazonaws aws-java-sdk-dynamodb - + + com.google.code.gson + gson + spark-load-dpp-${project.version} From 12b7b203d2743bcb0e139b05f0bee1445a58c507 Mon Sep 17 00:00:00 2001 From: gnehil Date: Tue, 23 Jul 2024 16:37:12 +0800 Subject: [PATCH 35/45] fix start script --- spark-load/spark-load-dist/src/main/bin/spark-load.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/spark-load/spark-load-dist/src/main/bin/spark-load.sh b/spark-load/spark-load-dist/src/main/bin/spark-load.sh index 241dd432..9097dd24 100644 --- a/spark-load/spark-load-dist/src/main/bin/spark-load.sh +++ b/spark-load/spark-load-dist/src/main/bin/spark-load.sh @@ -18,7 +18,7 @@ if [ -z ${SPARK_LOAD_HOME} ]; then cur_dir=$(dirname "$0")/../ - SPARK_LOAD_HOME=$(pwd ${cur_dir}) + SPARK_LOAD_HOME=$(readlink -f ${cur_dir}) fi export SPARK_LOAD_HOME @@ -43,7 +43,7 @@ fi SPARK_LOAD_CORE_JAR= for f in "${SPARK_LOAD_HOME}/lib"/*.jar; do - if [[ "${f}" == "spark-load-core"*".jar" ]]; then + if [[ $(basename "${f}") == "spark-load-core"*".jar" ]]; then SPARK_LOAD_CORE_JAR="${f}" continue fi From fe964d2f6e7b2824b6f86268e00d690088b9e95b Mon Sep 17 00:00:00 2001 From: gnehil Date: Mon, 29 Jul 2024 15:10:42 +0800 Subject: [PATCH 36/45] serialize by jackson --- .../src/main/java/org/apache/doris/util/JsonUtils.java | 0 .../java/org/apache/doris/load/loadv2/dpp/SparkDpp.java | 6 +++--- 2 files changed, 3 insertions(+), 3 deletions(-) rename spark-load/{spark-load-core => spark-load-common}/src/main/java/org/apache/doris/util/JsonUtils.java (100%) diff --git a/spark-load/spark-load-core/src/main/java/org/apache/doris/util/JsonUtils.java b/spark-load/spark-load-common/src/main/java/org/apache/doris/util/JsonUtils.java similarity index 100% rename from spark-load/spark-load-core/src/main/java/org/apache/doris/util/JsonUtils.java rename to spark-load/spark-load-common/src/main/java/org/apache/doris/util/JsonUtils.java diff --git a/spark-load/spark-load-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/SparkDpp.java b/spark-load/spark-load-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/SparkDpp.java index 325b39c8..1e8a46be 100644 --- a/spark-load/spark-load-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/SparkDpp.java +++ b/spark-load/spark-load-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/SparkDpp.java @@ -20,10 +20,10 @@ import org.apache.doris.common.DppResult; import org.apache.doris.common.SparkDppException; import org.apache.doris.config.EtlJobConfig; +import org.apache.doris.util.JsonUtils; import com.google.common.base.Strings; import com.google.common.collect.Maps; -import com.google.gson.Gson; import org.apache.commons.collections.CollectionUtils; import org.apache.commons.collections.IteratorUtils; import org.apache.commons.lang3.StringUtils; @@ -1076,6 +1076,7 @@ private void process() throws Exception { for (Map.Entry entry : etlJobConfig.tables.entrySet()) { Long tableId = entry.getKey(); EtlJobConfig.EtlTable etlTable = entry.getValue(); + LOG.info("etlTable:" + etlTable); Set dictBitmapColumnSet = tableToBitmapDictColumns.getOrDefault(tableId, new HashSet<>()); Set binaryBitmapColumnSet = tableToBinaryBitmapColumns.getOrDefault(tableId, new HashSet<>()); @@ -1188,8 +1189,7 @@ private void writeDppResult(DppResult dppResult) throws Exception { FileSystem fs = FileSystem.get(new Path(outputPath).toUri(), serializableHadoopConf.value()); Path filePath = new Path(resultFilePath); FSDataOutputStream outputStream = fs.create(filePath); - Gson gson = new Gson(); - 
outputStream.write(gson.toJson(dppResult).getBytes()); + outputStream.write(JsonUtils.writeValueAsBytes(dppResult)); outputStream.write('\n'); outputStream.close(); } From 19eec534fff3e89fb8c9238c0b7c21a559cc3932 Mon Sep 17 00:00:00 2001 From: gnehil Date: Mon, 29 Jul 2024 15:15:05 +0800 Subject: [PATCH 37/45] change dep version var --- spark-load/pom.xml | 6 +++--- spark-load/spark-load-core/pom.xml | 2 +- spark-load/spark-load-dist/pom.xml | 4 ++-- spark-load/spark-load-dpp/pom.xml | 1 + 4 files changed, 7 insertions(+), 6 deletions(-) diff --git a/spark-load/pom.xml b/spark-load/pom.xml index 7c95b668..9757d3f6 100644 --- a/spark-load/pom.xml +++ b/spark-load/pom.xml @@ -27,10 +27,10 @@ ${revision} pom + spark-load-common spark-load-core spark-load-dpp spark-load-dist - spark-load-common @@ -348,9 +348,9 @@ ${gson.version} - org.apache.doris + ${project.groupId} spark-load-common - ${revision} + ${project.version} diff --git a/spark-load/spark-load-core/pom.xml b/spark-load/spark-load-core/pom.xml index 2878ad4d..cfa66d57 100644 --- a/spark-load/spark-load-core/pom.xml +++ b/spark-load/spark-load-core/pom.xml @@ -39,7 +39,7 @@ org.apache.doris spark-load-common - ${revision} + ${project.version} com.fasterxml.jackson.core diff --git a/spark-load/spark-load-dist/pom.xml b/spark-load/spark-load-dist/pom.xml index 21e5d319..01dcad98 100644 --- a/spark-load/spark-load-dist/pom.xml +++ b/spark-load/spark-load-dist/pom.xml @@ -41,12 +41,12 @@ org.apache.doris spark-load-core - ${revision} + ${project.version} org.apache.doris spark-load-dpp - ${revision} + ${project.version} diff --git a/spark-load/spark-load-dpp/pom.xml b/spark-load/spark-load-dpp/pom.xml index a81f76f2..de21da68 100644 --- a/spark-load/spark-load-dpp/pom.xml +++ b/spark-load/spark-load-dpp/pom.xml @@ -35,6 +35,7 @@ under the License. org.apache.doris spark-load-common + ${project.version} From 912dfd03c4ed6e3f95aa554f1d1bfefeb9287ee8 Mon Sep 17 00:00:00 2001 From: gnehil Date: Mon, 29 Jul 2024 16:05:41 +0800 Subject: [PATCH 38/45] add license header --- .../doris/spark/sql/TestSparkConnector.scala | 73 ++----------------- spark-load/spark-load-common/pom.xml | 19 +++++ 2 files changed, 26 insertions(+), 66 deletions(-) diff --git a/spark-doris-connector/src/test/scala/org/apache/doris/spark/sql/TestSparkConnector.scala b/spark-doris-connector/src/test/scala/org/apache/doris/spark/sql/TestSparkConnector.scala index c564f789..a5e756c1 100644 --- a/spark-doris-connector/src/test/scala/org/apache/doris/spark/sql/TestSparkConnector.scala +++ b/spark-doris-connector/src/test/scala/org/apache/doris/spark/sql/TestSparkConnector.scala @@ -17,19 +17,19 @@ package org.apache.doris.spark.sql -import org.apache.spark.sql.types.{StringType, StructField, StructType} -import org.apache.spark.sql.{SaveMode, SparkSession} +import org.apache.spark.sql.SparkSession import org.apache.spark.{SparkConf, SparkContext} -import org.junit.{Ignore, Test} +import org.junit.Ignore +import org.junit.Test // This test need real connect info to run. 
// Set the connect info before comment out this @Ignore @Ignore class TestSparkConnector { - val dorisFeNodes = "10.16.10.6:8939" + val dorisFeNodes = "your_fe_host:8030" val dorisUser = "root" val dorisPwd = "" - val dorisTable = "test.dwd_test" + val dorisTable = "test.test_tbl" val kafkaServers = "" val kafkaTopics = "" @@ -111,70 +111,11 @@ class TestSparkConnector { .option("doris.fenodes", dorisFeNodes) .option("user", dorisUser) .option("password", dorisPwd) - .option("sink.batch.size", 2) - .option("sink.max-retries", 2) + .option("sink.batch.size",2) + .option("sink.max-retries",2) .start().awaitTermination() spark.stop() } - @Test - def sqlReadTest(): Unit = { - - val spark = SparkSession.builder() - .master("local") - .getOrCreate() - spark.sql( - s""" - |CREATE TEMPORARY VIEW t - |USING doris - |OPTIONS( - | "table.identifier"="${dorisTable}", - | "fenodes"="${dorisFeNodes}", - | "user"="${dorisUser}", - | "password"="${dorisPwd}" - |) - |""".stripMargin) - - spark.sql( - """ - |select * from t where dt = '2023-06-15' - |""".stripMargin) - // .explain() - .show(false) - - } - - @Test - def jsonDataWriteTest(): Unit = { - val schema = StructType(Array( - StructField("batch_id", StringType, true), - StructField("gen_uuid", StringType, true), - StructField("keyword", StringType, true), - StructField("step", StringType, true), - StructField("title", StringType, true), - StructField("original_keyword", StringType, true), - StructField("host_ip", StringType, true), - StructField("modify_at", StringType, true) - )) - val sparkSession = SparkSession.builder().appName("JSON DATA READ").master("local[*]").getOrCreate() - val df = sparkSession.read.schema(schema).json("/Users/gnehil/Downloads/social_google_trends_keyword_v2_fdc.json").coalesce(1) - // df.show(2) - df.write.format("doris").mode(SaveMode.Append).option( - "doris.table.identifier", "test.social_google_trends_keyword_v2_fdc_20240506" - ).option( - "doris.fenodes", "10.16.10.6:48733" - ).option("user", "root").option("password", "" - // ).option("doris.write.fields", fieldsString - ).option("sink.properties.format", "json" - ).option("sink.batch.size", 100000 - // ).option("doris.request.connect.timeout.ms", DORIS_REQUEST_CONNECT_TIMEOUT_MS - ).option( - "doris.query.port", 49733 - ).option( - "sink.max-retries", "1" - ).save() - sparkSession.stop() - } - } diff --git a/spark-load/spark-load-common/pom.xml b/spark-load/spark-load-common/pom.xml index 3d6660e7..dbd36f57 100644 --- a/spark-load/spark-load-common/pom.xml +++ b/spark-load/spark-load-common/pom.xml @@ -1,4 +1,23 @@ + + From b3d702321d96b9400bbcfc3047dd36bd4f030def Mon Sep 17 00:00:00 2001 From: gnehil Date: Fri, 2 Aug 2024 14:36:53 +0800 Subject: [PATCH 39/45] move working dir option to root --- .../src/main/java/org/apache/doris/SparkLoadRunner.java | 2 ++ .../main/java/org/apache/doris/common/meta/LoadMeta.java | 2 +- .../src/main/java/org/apache/doris/config/JobConfig.java | 9 ++++----- 3 files changed, 7 insertions(+), 6 deletions(-) diff --git a/spark-load/spark-load-core/src/main/java/org/apache/doris/SparkLoadRunner.java b/spark-load/spark-load-core/src/main/java/org/apache/doris/SparkLoadRunner.java index a8d64dcf..85c17a19 100644 --- a/spark-load/spark-load-core/src/main/java/org/apache/doris/SparkLoadRunner.java +++ b/spark-load/spark-load-core/src/main/java/org/apache/doris/SparkLoadRunner.java @@ -138,6 +138,8 @@ private static void checkConfig(JobConfig jobConfig) { Preconditions.checkArgument(StringUtils.isNoneBlank(jobConfig.getUser()), "user is 
empty"); Preconditions.checkArgument(jobConfig.getPassword() != null, "password cannot be null"); Preconditions.checkArgument(StringUtils.isNoneBlank(jobConfig.getDatabase()), "database is empty"); + Preconditions.checkArgument(StringUtils.isNoneBlank(jobConfig.getWorkingDir()), + "spark config item workingDir is empty"); jobConfig.checkTaskInfo(); jobConfig.checkSparkInfo(); jobConfig.checkHadoopProperties(); diff --git a/spark-load/spark-load-core/src/main/java/org/apache/doris/common/meta/LoadMeta.java b/spark-load/spark-load-core/src/main/java/org/apache/doris/common/meta/LoadMeta.java index fd56cd97..6009f092 100644 --- a/spark-load/spark-load-core/src/main/java/org/apache/doris/common/meta/LoadMeta.java +++ b/spark-load/spark-load-core/src/main/java/org/apache/doris/common/meta/LoadMeta.java @@ -95,7 +95,7 @@ public EtlJobConfig getEtlJobConfig(JobConfig jobConfig) throws SparkLoadExcepti EtlJobConfig.EtlJobProperty properties = new EtlJobConfig.EtlJobProperty(); EtlJobConfig etlJobConfig = new EtlJobConfig(tables, outputFilePattern, label, properties); etlJobConfig.outputPath = - EtlJobConfig.getOutputPath(jobConfig.getSpark().getWorkingDir(), getDbId(), label, + EtlJobConfig.getOutputPath(jobConfig.getWorkingDir(), getDbId(), label, getSignature()); return etlJobConfig; } diff --git a/spark-load/spark-load-core/src/main/java/org/apache/doris/config/JobConfig.java b/spark-load/spark-load-core/src/main/java/org/apache/doris/config/JobConfig.java index 1ab2f231..5dc805f8 100644 --- a/spark-load/spark-load-core/src/main/java/org/apache/doris/config/JobConfig.java +++ b/spark-load/spark-load-core/src/main/java/org/apache/doris/config/JobConfig.java @@ -53,6 +53,9 @@ public class JobConfig { @JsonProperty(required = true) private String database; + @JsonProperty(required = true) + private String workingDir; + @JsonProperty(required = true) private Map loadTasks; @@ -86,7 +89,7 @@ public static class TaskInfo { private String columnFromPath; - private String fieldSep; + private String fieldSep = "\t"; private String lineDelim = "\n"; @@ -121,8 +124,6 @@ public static class SparkInfo { private String sparkHome; - private String workingDir; - private String master; private String deployMode = DEFAULT_DEPLOY_MODE; @@ -188,8 +189,6 @@ public void checkSparkInfo() { SparkInfo sparkInfo = getSpark(); Preconditions.checkArgument(StringUtils.isNoneBlank(sparkInfo.getSparkHome()), "spark config item sparkHome is empty"); - Preconditions.checkArgument(StringUtils.isNoneBlank(sparkInfo.getWorkingDir()), - "spark config item workingDir is empty"); Preconditions.checkArgument(checkSparkMaster(sparkInfo.getMaster()), "spark master only supports yarn or standalone or local "); Preconditions.checkArgument( From 50f1d1ae01378554cfb0b3ad3da8da37eeec60c7 Mon Sep 17 00:00:00 2001 From: gnehil Date: Fri, 2 Aug 2024 15:58:33 +0800 Subject: [PATCH 40/45] add ut --- spark-load/spark-load-common/pom.xml | 5 + .../apache/doris/config/EtlJobConfigTest.java | 96 +++++++++++++++++++ 2 files changed, 101 insertions(+) create mode 100644 spark-load/spark-load-common/src/test/java/org/apache/doris/config/EtlJobConfigTest.java diff --git a/spark-load/spark-load-common/pom.xml b/spark-load/spark-load-common/pom.xml index dbd36f57..4a0e96b7 100644 --- a/spark-load/spark-load-common/pom.xml +++ b/spark-load/spark-load-common/pom.xml @@ -57,6 +57,11 @@ commons-codec commons-codec + + org.junit.jupiter + junit-jupiter-engine + test + \ No newline at end of file diff --git 
a/spark-load/spark-load-common/src/test/java/org/apache/doris/config/EtlJobConfigTest.java b/spark-load/spark-load-common/src/test/java/org/apache/doris/config/EtlJobConfigTest.java new file mode 100644 index 00000000..ed911162 --- /dev/null +++ b/spark-load/spark-load-common/src/test/java/org/apache/doris/config/EtlJobConfigTest.java @@ -0,0 +1,96 @@ +package org.apache.doris.config; + +import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.Test; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +class EtlJobConfigTest { + + @Test + void getOutputPath() { + String outputPath = EtlJobConfig.getOutputPath("hdfs://127.0.0.1/spark-load", 10001L, "test", 123L); + Assertions.assertEquals("hdfs://127.0.0.1/spark-load/jobs/10001/test/123", outputPath); + } + + @Test + void getOutputFilePattern() { + String outputFilePattern = EtlJobConfig.getOutputFilePattern("test", EtlJobConfig.FilePatternVersion.V1); + Assertions.assertEquals("V1.test.%d.%d.%d.%d.%d.parquet", outputFilePattern); + } + + @Test + void configFromJson() { + List etlIndexes = new ArrayList<>(); + List etlColumns = new ArrayList<>(); + EtlJobConfig.EtlColumn etlColumn0 = new EtlJobConfig.EtlColumn("c0", "INT", false, true, "NONE", "0", 0, 0, 0); + EtlJobConfig.EtlColumn etlColumn1 = + new EtlJobConfig.EtlColumn("c1", "VARCHAR", true, false, "NONE", "\\N", 10, 0, 0); + etlColumns.add(etlColumn0); + etlColumns.add(etlColumn1); + EtlJobConfig.EtlIndex etlIndex = new EtlJobConfig.EtlIndex(1L, etlColumns, 123, "DUPLICATE", true, 0); + etlIndexes.add(etlIndex); + EtlJobConfig.EtlPartitionInfo etlPartitionInfo = + new EtlJobConfig.EtlPartitionInfo("UNPARTITIONED", Collections.emptyList(), + Collections.singletonList("c0"), Collections.singletonList( + new EtlJobConfig.EtlPartition(0, Collections.emptyList(), Collections.emptyList(), true, 0))); + EtlJobConfig.EtlTable table = new EtlJobConfig.EtlTable(etlIndexes, etlPartitionInfo); + Map tables = new HashMap<>(); + tables.put(123L, table); + String outputFilePattern = EtlJobConfig.getOutputFilePattern("test", EtlJobConfig.FilePatternVersion.V1); + EtlJobConfig.EtlJobProperty properties = new EtlJobConfig.EtlJobProperty(); + EtlJobConfig jobConfig = new EtlJobConfig(tables, outputFilePattern, "test", properties); + Assertions.assertEquals(jobConfig.configToJson(), + EtlJobConfig.configFromJson("{\"tables\":{\"123\":{\"indexes\":[{\"indexId\":1,\"columns\":[{\"columnName\":\"c0\"," + + "\"columnType\":\"INT\",\"isAllowNull\":false,\"isKey\":true,\"aggregationType\":\"NONE\"," + + "\"defaultValue\":\"0\",\"stringLength\":0,\"precision\":0,\"scale\":0}," + + "{\"columnName\":\"c1\",\"columnType\":\"VARCHAR\",\"isAllowNull\":true,\"isKey\":false," + + "\"aggregationType\":\"NONE\",\"defaultValue\":\"\\\\N\",\"stringLength\":10,\"precision\":0," + + "\"scale\":0}],\"schemaHash\":123,\"indexType\":\"DUPLICATE\",\"isBaseIndex\":true," + + "\"schemaVersion\":0}],\"partitionInfo\":{\"partitionType\":\"UNPARTITIONED\"," + + "\"partitionColumnRefs\":[],\"distributionColumnRefs\":[\"c0\"],\"partitions\":" + + "[{\"partitionId\":0,\"startKeys\":[],\"endKeys\":[],\"isMaxPartition\":true,\"bucketNum\":0}]}," + + "\"fileGroups\":[]}},\"outputFilePattern\":\"V1.test.%d.%d.%d.%d.%d.parquet\"," + + "\"label\":\"test\",\"properties\":{\"strictMode\":false},\"configVersion\":\"V1\"}").configToJson()); + } + + @Test + void configToJson() { + List etlIndexes = new ArrayList<>(); + List etlColumns = new 
ArrayList<>(); + EtlJobConfig.EtlColumn etlColumn0 = new EtlJobConfig.EtlColumn("c0", "INT", false, true, "NONE", "0", 0, 0, 0); + EtlJobConfig.EtlColumn etlColumn1 = + new EtlJobConfig.EtlColumn("c1", "VARCHAR", true, false, "NONE", "\\N", 10, 0, 0); + etlColumns.add(etlColumn0); + etlColumns.add(etlColumn1); + EtlJobConfig.EtlIndex etlIndex = new EtlJobConfig.EtlIndex(1L, etlColumns, 123, "DUPLICATE", true, 0); + etlIndexes.add(etlIndex); + EtlJobConfig.EtlPartitionInfo etlPartitionInfo = + new EtlJobConfig.EtlPartitionInfo("UNPARTITIONED", Collections.emptyList(), + Collections.singletonList("c0"), Collections.singletonList( + new EtlJobConfig.EtlPartition(0, Collections.emptyList(), Collections.emptyList(), true, 0))); + EtlJobConfig.EtlTable table = new EtlJobConfig.EtlTable(etlIndexes, etlPartitionInfo); + Map tables = new HashMap<>(); + tables.put(123L, table); + String outputFilePattern = EtlJobConfig.getOutputFilePattern("test", EtlJobConfig.FilePatternVersion.V1); + EtlJobConfig.EtlJobProperty properties = new EtlJobConfig.EtlJobProperty(); + EtlJobConfig jobConfig = new EtlJobConfig(tables, outputFilePattern, "test", properties); + Assertions.assertEquals( + "{\"tables\":{\"123\":{\"indexes\":[{\"indexId\":1,\"columns\":[{\"columnName\":\"c0\"," + + "\"columnType\":\"INT\",\"isAllowNull\":false,\"isKey\":true,\"aggregationType\":\"NONE\"," + + "\"defaultValue\":\"0\",\"stringLength\":0,\"precision\":0,\"scale\":0}," + + "{\"columnName\":\"c1\",\"columnType\":\"VARCHAR\",\"isAllowNull\":true,\"isKey\":false," + + "\"aggregationType\":\"NONE\",\"defaultValue\":\"\\\\N\",\"stringLength\":10,\"precision\":0," + + "\"scale\":0}],\"schemaHash\":123,\"indexType\":\"DUPLICATE\",\"isBaseIndex\":true," + + "\"schemaVersion\":0}],\"partitionInfo\":{\"partitionType\":\"UNPARTITIONED\"," + + "\"partitionColumnRefs\":[],\"distributionColumnRefs\":[\"c0\"],\"partitions\":" + + "[{\"partitionId\":0,\"startKeys\":[],\"endKeys\":[],\"isMaxPartition\":true,\"bucketNum\":0}]}," + + "\"fileGroups\":[]}},\"outputFilePattern\":\"V1.test.%d.%d.%d.%d.%d.parquet\"," + + "\"label\":\"test\",\"properties\":{\"strictMode\":false},\"configVersion\":\"V1\"}", + jobConfig.configToJson()); + } +} \ No newline at end of file From c34286b3de0507ea8440bca903b5ab05bf373fe7 Mon Sep 17 00:00:00 2001 From: gnehil Date: Wed, 7 Aug 2024 18:08:45 +0800 Subject: [PATCH 41/45] add ut --- spark-load/pom.xml | 8 - .../org/apache/doris/common/DppResult.java | 34 +- spark-load/spark-load-core/pom.xml | 30 +- .../org/apache/doris/config/JobConfig.java | 31 +- .../org/apache/doris/load/job/PullLoader.java | 2 +- .../apache/doris/client/DorisClientTest.java | 384 ++++++++++++++++++ .../doris/common/meta/LoadMetaTest.java | 22 +- .../apache/doris/config/JobConfigTest.java | 184 +++++++++ .../apache/doris/load/LoaderFactoryTest.java | 30 ++ .../apache/doris/load/job/PullLoaderTest.java | 219 ++++++++++ .../org/apache/doris/util/DateUtilsTest.java | 24 ++ 11 files changed, 924 insertions(+), 44 deletions(-) create mode 100644 spark-load/spark-load-core/src/test/java/org/apache/doris/client/DorisClientTest.java create mode 100644 spark-load/spark-load-core/src/test/java/org/apache/doris/config/JobConfigTest.java create mode 100644 spark-load/spark-load-core/src/test/java/org/apache/doris/load/LoaderFactoryTest.java create mode 100644 spark-load/spark-load-core/src/test/java/org/apache/doris/load/job/PullLoaderTest.java create mode 100644 spark-load/spark-load-core/src/test/java/org/apache/doris/util/DateUtilsTest.java diff --git 
a/spark-load/pom.xml b/spark-load/pom.xml index 9757d3f6..25593bdb 100644 --- a/spark-load/pom.xml +++ b/spark-load/pom.xml @@ -274,7 +274,6 @@ ${httpclient5.version} - org.junit.jupiter @@ -282,13 +281,6 @@ ${junit.version} test - - - org.junit.vintage - junit-vintage-engine - ${junit.version} - test - org.junit.jupiter diff --git a/spark-load/spark-load-common/src/main/java/org/apache/doris/common/DppResult.java b/spark-load/spark-load-common/src/main/java/org/apache/doris/common/DppResult.java index 3daa6541..7a2a9cb4 100644 --- a/spark-load/spark-load-common/src/main/java/org/apache/doris/common/DppResult.java +++ b/spark-load/spark-load-common/src/main/java/org/apache/doris/common/DppResult.java @@ -17,6 +17,7 @@ package org.apache.doris.common; +import com.fasterxml.jackson.annotation.JsonCreator; import com.fasterxml.jackson.annotation.JsonProperty; import java.io.Serializable; @@ -26,35 +27,25 @@ */ public class DppResult implements Serializable { - @JsonProperty(value = "is_success", required = true) public boolean isSuccess; - @JsonProperty(value = "failed_reason", required = true) public String failedReason; - @JsonProperty(value = "scanned_rows", required = true) public long scannedRows; - @JsonProperty(value = "file_number", required = true) public long fileNumber; - @JsonProperty(value = "file_size", required = true) public long fileSize; - @JsonProperty(value = "normal_rows", required = true) public long normalRows; - @JsonProperty(value = "abnormal_rows", required = true) public long abnormalRows; - @JsonProperty(value = "unselect_rows", required = true) public long unselectRows; // only part of abnormal rows will be returned - @JsonProperty("partial_abnormal_rows") public String partialAbnormalRows; - @JsonProperty("scanned_bytes") public long scannedBytes; public DppResult() { @@ -70,4 +61,27 @@ public DppResult() { scannedBytes = 0; } + @JsonCreator + public DppResult(@JsonProperty(value = "is_success", required = true) boolean isSuccess, + @JsonProperty(value = "failed_reason", required = true) String failedReason, + @JsonProperty(value = "scanned_rows", required = true) long scannedRows, + @JsonProperty(value = "file_number", required = true) long fileNumber, + @JsonProperty(value = "file_size", required = true) long fileSize, + @JsonProperty(value = "normal_rows", required = true) long normalRows, + @JsonProperty(value = "abnormal_rows", required = true) long abnormalRows, + @JsonProperty(value = "unselect_rows", required = true) long unselectRows, + @JsonProperty("partial_abnormal_rows") String partialAbnormalRows, + @JsonProperty("scanned_bytes") long scannedBytes) { + this.isSuccess = isSuccess; + this.failedReason = failedReason; + this.scannedRows = scannedRows; + this.fileNumber = fileNumber; + this.fileSize = fileSize; + this.normalRows = normalRows; + this.abnormalRows = abnormalRows; + this.unselectRows = unselectRows; + this.partialAbnormalRows = partialAbnormalRows; + this.scannedBytes = scannedBytes; + } + } diff --git a/spark-load/spark-load-core/pom.xml b/spark-load/spark-load-core/pom.xml index cfa66d57..d9828d78 100644 --- a/spark-load/spark-load-core/pom.xml +++ b/spark-load/spark-load-core/pom.xml @@ -33,6 +33,8 @@ 8 8 UTF-8 + 1 + -Xmx512m @@ -104,8 +106,13 @@ slf4j-api - org.junit.vintage - junit-vintage-engine + org.junit.jupiter + junit-jupiter-engine + test + + + org.jmockit + jmockit test @@ -154,4 +161,23 @@ + + + + + org.apache.maven.plugins + maven-surefire-plugin + + set larger, eg, 3, to reduce the time or running FE unit tests<--> + 
${fe_ut_parallel} + not reuse forked jvm, so that each unit test will run in separate jvm. to avoid singleton confict<--> + false + + -javaagent:${settings.localRepository}/org/jmockit/jmockit/${jmockit.version}/jmockit-${jmockit.version}.jar @{argLine} + + + + + + \ No newline at end of file diff --git a/spark-load/spark-load-core/src/main/java/org/apache/doris/config/JobConfig.java b/spark-load/spark-load-core/src/main/java/org/apache/doris/config/JobConfig.java index 5dc805f8..e5d79416 100644 --- a/spark-load/spark-load-core/src/main/java/org/apache/doris/config/JobConfig.java +++ b/spark-load/spark-load-core/src/main/java/org/apache/doris/config/JobConfig.java @@ -175,7 +175,7 @@ public void checkTaskInfo() { StringUtils.equalsAnyIgnoreCase(taskInfo.getFormat(), "parquet", "orc", "csv"), "format only support parquet or orc or csv"); if ("csv".equalsIgnoreCase(taskInfo.getFormat())) { - Preconditions.checkArgument(StringUtils.isNoneBlank(taskInfo.getFieldSep()), + Preconditions.checkArgument(StringUtils.isNoneEmpty(taskInfo.getFieldSep()), "field separator is empty"); } break; @@ -190,19 +190,19 @@ public void checkSparkInfo() { Preconditions.checkArgument(StringUtils.isNoneBlank(sparkInfo.getSparkHome()), "spark config item sparkHome is empty"); Preconditions.checkArgument(checkSparkMaster(sparkInfo.getMaster()), - "spark master only supports yarn or standalone or local "); + "spark master only supports yarn or standalone or local"); Preconditions.checkArgument( StringUtils.equalsAnyIgnoreCase(sparkInfo.getDeployMode(), "cluster", "client"), - "spark deployMode only supports cluster or client "); + "spark deployMode only supports cluster or client"); if (!"yarn".equalsIgnoreCase(sparkInfo.getMaster())) { Preconditions.checkArgument("client".equalsIgnoreCase(sparkInfo.getDeployMode()), "standalone and local master only supports client mode"); } if (LoadMode.PULL == getLoadMode()) { - if (StringUtils.isBlank(getSpark().getDppJarPath())) { - throw new IllegalArgumentException("dpp jar file path is empty "); + if (StringUtils.isBlank(sparkInfo.getDppJarPath())) { + throw new IllegalArgumentException("dpp jar file path is empty"); } - if (!new File(getSpark().getDppJarPath()).exists()) { + if (!new File(sparkInfo.getDppJarPath()).exists()) { throw new IllegalArgumentException("dpp jar file is not exists, path: " + getSpark().getDppJarPath()); } } @@ -230,17 +230,20 @@ public void checkHadoopProperties() { // check auth if (hadoopProperties.containsKey("hadoop.security.authentication") && StringUtils.equalsIgnoreCase(hadoopProperties.get("hadoop.security.authentication"), "kerberos")) { - if (hadoopProperties.containsKey("hadoop.kerberos.principal") - && hadoopProperties.containsKey("hadoop.kerberos.keytab")) { - if (!FileUtils.getFile(hadoopProperties.get("hadoop.kerberos.principal")).exists()) { - throw new IllegalArgumentException("hadoop kerberos principal file is not exists, path: " - + hadoopProperties.get("hadoop.kerberos.principal")); + if (hadoopProperties.containsKey("hadoop.kerberos.principal")) { + if (StringUtils.isBlank(hadoopProperties.get("hadoop.kerberos.principal"))) { + throw new IllegalArgumentException("hadoop kerberos principal is empty"); } - if (!FileUtils.getFile(hadoopProperties.get("hadoop.kerberos.keytab")).exists()) { - throw new IllegalArgumentException("hadoop kerberos keytab file is not exists, path: " - + hadoopProperties.get("hadoop.kerberos.keytab")); + if (hadoopProperties.containsKey("hadoop.kerberos.keytab")) { + if 
(!FileUtils.getFile(hadoopProperties.get("hadoop.kerberos.keytab")).exists()) { + throw new IllegalArgumentException("hadoop kerberos keytab file is not exists, path: " + + hadoopProperties.get("hadoop.kerberos.keytab")); + } + return; } + throw new IllegalArgumentException("hadoop.kerberos.keytab is not set"); } + throw new IllegalArgumentException("hadoop.kerberos.principal is not set"); } else { if (!hadoopProperties.containsKey("hadoop.username")) { throw new IllegalArgumentException("hadoop username is empty"); diff --git a/spark-load/spark-load-core/src/main/java/org/apache/doris/load/job/PullLoader.java b/spark-load/spark-load-core/src/main/java/org/apache/doris/load/job/PullLoader.java index a2bcf7bc..a048e955 100644 --- a/spark-load/spark-load-core/src/main/java/org/apache/doris/load/job/PullLoader.java +++ b/spark-load/spark-load-core/src/main/java/org/apache/doris/load/job/PullLoader.java @@ -255,7 +255,7 @@ public boolean canBeRecovered() throws SparkLoadException { || oldIndex.schemaVersion != index.schemaVersion) { LOG.info("index " + index.indexId + " has changed, " + "old schemaHash: " + oldIndex.schemaHash + " and schemaVersion: " - + oldIndex.schemaVersion + "current schemaHash: " + + oldIndex.schemaVersion + " current schemaHash: " + index.schemaHash + " and schemaVersion: " + index.schemaVersion + ", cannot be recovered"); return false; diff --git a/spark-load/spark-load-core/src/test/java/org/apache/doris/client/DorisClientTest.java b/spark-load/spark-load-core/src/test/java/org/apache/doris/client/DorisClientTest.java new file mode 100644 index 00000000..a59bf4b3 --- /dev/null +++ b/spark-load/spark-load-core/src/test/java/org/apache/doris/client/DorisClientTest.java @@ -0,0 +1,384 @@ +package org.apache.doris.client; + +import org.apache.doris.common.LoadInfo; +import org.apache.doris.common.meta.LoadInfoResponse; +import org.apache.doris.common.meta.LoadMeta; +import org.apache.doris.common.meta.TableMeta; +import org.apache.doris.config.EtlJobConfig; +import org.apache.doris.exception.SparkLoadException; +import org.apache.doris.util.JsonUtils; + +import com.fasterxml.jackson.core.JsonProcessingException; +import mockit.Mock; +import mockit.MockUp; +import org.apache.http.Header; +import org.apache.http.HeaderIterator; +import org.apache.http.HttpEntity; +import org.apache.http.HttpStatus; +import org.apache.http.HttpVersion; +import org.apache.http.ProtocolVersion; +import org.apache.http.StatusLine; +import org.apache.http.client.ClientProtocolException; +import org.apache.http.client.methods.CloseableHttpResponse; +import org.apache.http.client.methods.HttpUriRequest; +import org.apache.http.entity.StringEntity; +import org.apache.http.impl.client.CloseableHttpClient; +import org.apache.http.message.BasicStatusLine; +import org.apache.http.params.HttpParams; +import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.Test; + +import java.io.IOException; +import java.io.UnsupportedEncodingException; +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Locale; +import java.util.Map; + +class DorisClientTest { + + @Test + public void getFeClient() { + Assertions.assertThrows(IllegalArgumentException.class, () -> DorisClient.getFeClient("", "", "")); + Assertions.assertThrows(IllegalArgumentException.class, () -> DorisClient.getFeClient("127.0.0.1", "", "")); + Assertions.assertThrows(IllegalArgumentException.class, () -> DorisClient.getFeClient("127.0.0.1:", "", "")); + 
Assertions.assertThrows(IllegalArgumentException.class, () -> DorisClient.getFeClient(":8030", "", "")); + Assertions.assertDoesNotThrow(() -> DorisClient.getFeClient("127.0.0.1:8030", "", "")); + } + + @Test + public void createIngestionLoad() throws SparkLoadException, JsonProcessingException { + + DorisClient.FeClient feClient = new DorisClient.FeClient("127.0.0.1:8030", "", ""); + + new MockUp(CloseableHttpClient.class) { + @Mock + public CloseableHttpResponse execute( + final HttpUriRequest request) throws IOException, ClientProtocolException { + MockedCloseableHttpResponse response = new MockedCloseableHttpResponse(); + response.setStatusCode(HttpStatus.SC_BAD_REQUEST); + return response; + } + }; + Assertions.assertThrows(SparkLoadException.class, () -> feClient.createIngestionLoad("db", new HashMap<>(), "test", new HashMap<>())); + + new MockUp(CloseableHttpClient.class) { + @Mock + public CloseableHttpResponse execute( + final HttpUriRequest request) throws IOException, ClientProtocolException { + MockedCloseableHttpResponse response = new MockedCloseableHttpResponse(); + response.setStatusCode(HttpStatus.SC_OK); + response.setEntity(new StringEntity("{\"code\":1,\"msg\":\"\",\"data\":{},\"count\":0}")); + return response; + } + }; + Assertions.assertThrows(SparkLoadException.class, () -> feClient.createIngestionLoad("db", new HashMap<>(), "test", new HashMap<>())); + + new MockUp(CloseableHttpClient.class) { + @Mock + public CloseableHttpResponse execute( + final HttpUriRequest request) throws IOException, ClientProtocolException { + MockedCloseableHttpResponse response = new MockedCloseableHttpResponse(); + response.setStatusCode(HttpStatus.SC_OK); + response.setEntity(new StringEntity("{\"code\":0,\"msg\":\"\",\"data\":{\"loadId\":1,\"txnId\":1," + + "\"dbId\":1,\"signature\":1,\"tableMeta\":{\"tbl1\":{\"id\":1," + + "\"indexes\":[{\"indexId\":0,\"columns\":[{\"columnName\":\"c0\",\"columnType\":\"INT\"," + + "\"isAllowNull\":true,\"isKey\":true,\"aggregationType\":\"NONE\",\"defaultValue\":\"0\"," + + "\"stringLength\":0,\"precision\":0,\"scale\":0,\"defineExpr\":null}],\"schemaHash\":0," + + "\"indexType\":null,\"isBaseIndex\":false,\"schemaVersion\":0}],\"partitionInfo\":" + + "{\"partitionType\":\"UNPARTITIONED\",\"partitionColumnRefs\":[],\"distributionColumnRefs\":[]," + + "\"partitions\":[{\"partitionId\":1,\"startKeys\":[],\"endKeys\":[],\"isMaxPartition\":true," + + "\"bucketNum\":1}]}}}},\"count\":0}")); + return response; + } + }; + + LoadMeta loadMeta = new LoadMeta(); + loadMeta.setLoadId(1L); + loadMeta.setTxnId(1L); + loadMeta.setDbId(1L); + loadMeta.setSignature(1L); + Map tableMetaMap = new HashMap<>(); + TableMeta tableMeta = new TableMeta(); + tableMeta.setId(1L); + List indexList = new ArrayList<>(); + TableMeta.EtlIndex index = new TableMeta.EtlIndex(); + List columnList = new ArrayList<>(); + EtlJobConfig.EtlColumn column = new EtlJobConfig.EtlColumn(); + column.columnName = "c0"; + column.columnType = "INT"; + column.defaultValue = "0"; + column.isAllowNull = true; + column.aggregationType = "NONE"; + column.isKey = true; + columnList.add(column); + index.columns = columnList; + indexList.add(index); + tableMeta.setIndexes(indexList); + TableMeta.EtlPartitionInfo partitionInfo = new TableMeta.EtlPartitionInfo(); + TableMeta.EtlPartition partition = new TableMeta.EtlPartition(); + partition.partitionId = 1; + partition.bucketNum = 1; + partition.startKeys = Collections.emptyList(); + partition.endKeys = Collections.emptyList(); + partition.isMaxPartition = 
true; + partitionInfo.partitions = Collections.singletonList(partition); + partitionInfo.partitionType = "UNPARTITIONED"; + partitionInfo.partitionColumnRefs = new ArrayList<>(); + partitionInfo.distributionColumnRefs = new ArrayList<>(); + tableMeta.setPartitionInfo(partitionInfo); + tableMetaMap.put("tbl1", tableMeta); + loadMeta.setTableMeta(tableMetaMap); + Assertions.assertEquals(JsonUtils.writeValueAsString(loadMeta), + JsonUtils.writeValueAsString(feClient.createIngestionLoad("db", new HashMap<>(), "test", new HashMap<>()))); + + } + + @Test + public void updateIngestionLoad() { + + DorisClient.FeClient feClient = new DorisClient.FeClient("127.0.0.1:8030", "", ""); + + new MockUp(CloseableHttpClient.class) { + @Mock + public CloseableHttpResponse execute( + final HttpUriRequest request) throws IOException, ClientProtocolException { + MockedCloseableHttpResponse response = new MockedCloseableHttpResponse(); + response.setStatusCode(HttpStatus.SC_BAD_REQUEST); + return response; + } + }; + Assertions.assertThrows(SparkLoadException.class, () -> feClient.updateIngestionLoad("db", 1L, new HashMap<>())); + + new MockUp(CloseableHttpClient.class) { + @Mock + public CloseableHttpResponse execute( + final HttpUriRequest request) throws IOException, ClientProtocolException { + MockedCloseableHttpResponse response = new MockedCloseableHttpResponse(); + response.setStatusCode(HttpStatus.SC_OK); + response.setEntity(new StringEntity("{\"code\":1,\"msg\":\"\",\"data\":{},\"count\":0}")); + return response; + } + }; + Assertions.assertThrows(SparkLoadException.class, () -> feClient.updateIngestionLoad("db", 1L, new HashMap<>())); + + new MockUp(CloseableHttpClient.class) { + @Mock + public CloseableHttpResponse execute( + final HttpUriRequest request) throws IOException, ClientProtocolException { + MockedCloseableHttpResponse response = new MockedCloseableHttpResponse(); + response.setStatusCode(HttpStatus.SC_OK); + response.setEntity(new StringEntity("{\"code\":0,\"msg\":\"\",\"data\":{},\"count\":0}")); + return response; + } + }; + Assertions.assertDoesNotThrow(() -> feClient.updateIngestionLoad("db", 1L, new HashMap<>())); + + } + + @Test + public void getLoadInfo() throws SparkLoadException, JsonProcessingException { + + DorisClient.FeClient feClient = new DorisClient.FeClient("127.0.0.1:8030", "", ""); + + new MockUp(CloseableHttpClient.class) { + @Mock + public CloseableHttpResponse execute( + final HttpUriRequest request) throws IOException, ClientProtocolException { + MockedCloseableHttpResponse response = new MockedCloseableHttpResponse(); + response.setStatusCode(HttpStatus.SC_BAD_REQUEST); + return response; + } + }; + Assertions.assertThrows(SparkLoadException.class, () -> feClient.getLoadInfo("db", "test")); + + new MockUp(CloseableHttpClient.class) { + @Mock + public CloseableHttpResponse execute( + final HttpUriRequest request) throws IOException, ClientProtocolException { + MockedCloseableHttpResponse response = new MockedCloseableHttpResponse(); + response.setStatusCode(HttpStatus.SC_OK); + response.setEntity(new StringEntity("{\"status\":\"err\",\"msg\":\"\",\"jobInfo\":{\"dbName\":\"db\"," + + "\"tblNames\":[\"tbl1\"],\"label\":\"test\",\"clusterName\":\"default\",\"state\":\"FINISHED\"," + + "\"failMsg\":\"\",\"trackingUrl\":\"\"}}")); + return response; + } + }; + Assertions.assertThrows(SparkLoadException.class, () -> feClient.getLoadInfo("db", "test")); + + new MockUp(CloseableHttpClient.class) { + @Mock + public CloseableHttpResponse execute( + final HttpUriRequest 
request) throws IOException, ClientProtocolException { + MockedCloseableHttpResponse response = new MockedCloseableHttpResponse(); + response.setStatusCode(HttpStatus.SC_OK); + response.setEntity(new StringEntity("{\"status\":\"ok\",\"msg\":\"\",\"jobInfo\":{\"dbName\":\"db\"," + + "\"tblNames\":[\"tbl1\"],\"label\":\"test\",\"clusterName\":\"default\",\"state\":\"FINISHED\"," + + "\"failMsg\":\"\",\"trackingUrl\":\"\"}}")); + return response; + } + }; + Assertions.assertEquals("{\"dbName\":\"db\",\"tblNames\":[\"tbl1\"],\"label\":\"test\"," + + "\"clusterName\":\"default\",\"state\":\"FINISHED\",\"failMsg\":\"\",\"trackingUrl\":\"\"}", + JsonUtils.writeValueAsString(feClient.getLoadInfo("db", "test"))); + + } + + private class MockedCloseableHttpResponse implements CloseableHttpResponse { + + private StatusLine statusLine; + private HttpEntity entity; + + @Override + public void close() throws IOException { + + } + + @Override + public StatusLine getStatusLine() { + return statusLine; + } + + @Override + public void setStatusLine(StatusLine statusline) { + this.statusLine = statusline; + } + + @Override + public void setStatusLine(ProtocolVersion ver, int code) { + this.statusLine = new BasicStatusLine(ver, code, ""); + } + + @Override + public void setStatusLine(ProtocolVersion ver, int code, String reason) { + this.statusLine = new BasicStatusLine(ver, code, reason); + } + + @Override + public void setStatusCode(int code) throws IllegalStateException { + if (this.statusLine == null) { + this.statusLine = new BasicStatusLine(HttpVersion.HTTP_1_1, code, ""); + } else { + this.statusLine = new BasicStatusLine(statusLine.getProtocolVersion(), code, statusLine.getReasonPhrase()); + } + } + + @Override + public void setReasonPhrase(String reason) throws IllegalStateException { + if (this.statusLine == null) { + this.statusLine = new BasicStatusLine(HttpVersion.HTTP_1_1, HttpStatus.SC_OK, reason); + } else { + this.statusLine = new BasicStatusLine(statusLine.getProtocolVersion(), statusLine.getStatusCode(), reason); + } + } + + @Override + public HttpEntity getEntity() { + return entity; + } + + @Override + public void setEntity(HttpEntity entity) { + this.entity = entity; + } + + @Override + public Locale getLocale() { + return null; + } + + @Override + public void setLocale(Locale loc) { + + } + + @Override + public ProtocolVersion getProtocolVersion() { + return HttpVersion.HTTP_1_1; + } + + @Override + public boolean containsHeader(String name) { + return false; + } + + @Override + public Header[] getHeaders(String name) { + return new Header[0]; + } + + @Override + public Header getFirstHeader(String name) { + return null; + } + + @Override + public Header getLastHeader(String name) { + return null; + } + + @Override + public Header[] getAllHeaders() { + return new Header[0]; + } + + @Override + public void addHeader(Header header) { + + } + + @Override + public void addHeader(String name, String value) { + + } + + @Override + public void setHeader(Header header) { + + } + + @Override + public void setHeader(String name, String value) { + + } + + @Override + public void setHeaders(Header[] headers) { + + } + + @Override + public void removeHeader(Header header) { + + } + + @Override + public void removeHeaders(String name) { + + } + + @Override + public HeaderIterator headerIterator() { + return null; + } + + @Override + public HeaderIterator headerIterator(String name) { + return null; + } + + @Override + public HttpParams getParams() { + return null; + } + + @Override + public void 
setParams(HttpParams params) { + + } + } + + +} \ No newline at end of file diff --git a/spark-load/spark-load-core/src/test/java/org/apache/doris/common/meta/LoadMetaTest.java b/spark-load/spark-load-core/src/test/java/org/apache/doris/common/meta/LoadMetaTest.java index 7546583a..00bb6517 100644 --- a/spark-load/spark-load-core/src/test/java/org/apache/doris/common/meta/LoadMetaTest.java +++ b/spark-load/spark-load-core/src/test/java/org/apache/doris/common/meta/LoadMetaTest.java @@ -19,12 +19,11 @@ import org.apache.doris.config.EtlJobConfig; +import org.apache.doris.config.JobConfig; import org.apache.doris.exception.SparkLoadException; -import org.junit.Assert; -import org.junit.Rule; -import org.junit.Test; -import org.junit.rules.ExpectedException; +import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.Test; import java.util.ArrayList; import java.util.Collections; @@ -34,9 +33,6 @@ public class LoadMetaTest { - @Rule - public ExpectedException thrown = ExpectedException.none(); - @Test public void checkMapping() throws SparkLoadException { @@ -59,11 +55,11 @@ public void checkMapping() throws SparkLoadException { Map columnMappingMap = new HashMap<>(); columnMappingMap.put("c2", new EtlJobConfig.EtlColumnMapping("to_bitmap(c1)")); - Assert.assertThrows(SparkLoadException.class, () -> loadMeta.checkMapping(etlTable, columnMappingMap)); + Assertions.assertThrows(SparkLoadException.class, () -> loadMeta.checkMapping(etlTable, columnMappingMap)); Map columnMappingMap1 = new HashMap<>(); columnMappingMap1.put("c1", new EtlJobConfig.EtlColumnMapping("hll_hash(c1)")); - Assert.assertThrows(SparkLoadException.class, () -> loadMeta.checkMapping(etlTable, columnMappingMap1)); + Assertions.assertThrows(SparkLoadException.class, () -> loadMeta.checkMapping(etlTable, columnMappingMap1)); Map columnMappingMap2 = new HashMap<>(); columnMappingMap2.put("c1", new EtlJobConfig.EtlColumnMapping("hll_hash(c1)")); @@ -71,4 +67,12 @@ public void checkMapping() throws SparkLoadException { loadMeta.checkMapping(etlTable, columnMappingMap2); } + + + @Test + void getEtlJobConfig() throws SparkLoadException { + JobConfig jobConfig = new JobConfig(); + LoadMeta loadMeta = new LoadMeta(); + loadMeta.getEtlJobConfig(jobConfig); + } } \ No newline at end of file diff --git a/spark-load/spark-load-core/src/test/java/org/apache/doris/config/JobConfigTest.java b/spark-load/spark-load-core/src/test/java/org/apache/doris/config/JobConfigTest.java new file mode 100644 index 00000000..197ac30a --- /dev/null +++ b/spark-load/spark-load-core/src/test/java/org/apache/doris/config/JobConfigTest.java @@ -0,0 +1,184 @@ +package org.apache.doris.config; + + +import org.apache.doris.common.enums.TaskType; + +import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.Test; + +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.Paths; +import java.util.Collections; +import java.util.HashMap; +import java.util.Map; + +public class JobConfigTest { + + @Test + public void checkFeAddress() { + + JobConfig jobConfig = new JobConfig(); + jobConfig.setFeAddresses(""); + IllegalArgumentException e1 = + Assertions.assertThrows(IllegalArgumentException.class, jobConfig::checkFeAddress); + Assertions.assertEquals("feAddress is empty", e1.getMessage()); + + jobConfig.setFeAddresses("127.0.0.1"); + IllegalArgumentException e2 = + Assertions.assertThrows(IllegalArgumentException.class, jobConfig::checkFeAddress, + "feAddress format is incorrect"); + 
Assertions.assertEquals("feAddress format is incorrect", e2.getMessage()); + + jobConfig.setFeAddresses("127.0.0.1,127.0.0.2"); + IllegalArgumentException e3 = + Assertions.assertThrows(IllegalArgumentException.class, jobConfig::checkFeAddress, + "feAddress format is incorrect"); + Assertions.assertEquals("feAddress format is incorrect", e3.getMessage()); + + jobConfig.setFeAddresses("127.0.0.1:8030"); + Assertions.assertDoesNotThrow(jobConfig::checkFeAddress); + + } + + @Test + public void checkTaskInfo() { + + JobConfig jobConfig = new JobConfig(); + + jobConfig.setLoadTasks(new HashMap<>()); + IllegalArgumentException e1 = + Assertions.assertThrows(IllegalArgumentException.class, jobConfig::checkTaskInfo); + Assertions.assertEquals("loadTasks is empty", e1.getMessage()); + + Map loadTasks1 = new HashMap<>(); + JobConfig.TaskInfo taskInfo1 = new JobConfig.TaskInfo(); + taskInfo1.setType(TaskType.FILE); + loadTasks1.put("task1", taskInfo1); + jobConfig.setLoadTasks(loadTasks1); + IllegalArgumentException e2 = + Assertions.assertThrows(IllegalArgumentException.class, jobConfig::checkTaskInfo); + Assertions.assertEquals("file path is empty", e2.getMessage()); + + Map loadTasks2 = new HashMap<>(); + JobConfig.TaskInfo taskInfo2 = new JobConfig.TaskInfo(); + taskInfo2.setType(TaskType.FILE); + taskInfo2.setPaths(Collections.singletonList("test")); + taskInfo2.setFormat("sequence"); + loadTasks2.put("task2", taskInfo2); + jobConfig.setLoadTasks(loadTasks2); + IllegalArgumentException e3 = + Assertions.assertThrows(IllegalArgumentException.class, jobConfig::checkTaskInfo); + Assertions.assertEquals("format only support parquet or orc or csv", e3.getMessage()); + + taskInfo2.setFormat("csv"); + Assertions.assertDoesNotThrow(jobConfig::checkTaskInfo); + + Map loadTasks3 = new HashMap<>(); + JobConfig.TaskInfo taskInfo3 = new JobConfig.TaskInfo(); + taskInfo3.setType(TaskType.HIVE); + loadTasks3.put("task3", taskInfo3); + jobConfig.setLoadTasks(loadTasks3); + IllegalArgumentException e4 = + Assertions.assertThrows(IllegalArgumentException.class, jobConfig::checkTaskInfo); + Assertions.assertEquals("hive database is empty", e4.getMessage()); + + taskInfo3.setHiveDatabase("db"); + Assertions.assertThrows(IllegalArgumentException.class, jobConfig::checkTaskInfo, "hive table is empty"); + + taskInfo3.setHiveTable("tbl"); + Assertions.assertDoesNotThrow(jobConfig::checkTaskInfo); + + } + + @Test + public void checkSparkInfo() throws IOException { + + JobConfig jobConfig = new JobConfig(); + JobConfig.SparkInfo sparkInfo = new JobConfig.SparkInfo(); + jobConfig.setSpark(sparkInfo); + IllegalArgumentException e1 = + Assertions.assertThrows(IllegalArgumentException.class, jobConfig::checkSparkInfo); + Assertions.assertEquals("spark config item sparkHome is empty", e1.getMessage()); + + sparkInfo.setSparkHome("test"); + IllegalArgumentException e2 = + Assertions.assertThrows(IllegalArgumentException.class, jobConfig::checkSparkInfo); + Assertions.assertEquals("spark master only supports yarn or standalone or local", e2.getMessage()); + + sparkInfo.setMaster("local"); + sparkInfo.setDeployMode("abc"); + IllegalArgumentException e3 = + Assertions.assertThrows(IllegalArgumentException.class, jobConfig::checkSparkInfo); + Assertions.assertEquals("spark deployMode only supports cluster or client", e3.getMessage()); + + sparkInfo.setMaster("spark://127.0.0.1:7077"); + sparkInfo.setDeployMode("cluster"); + IllegalArgumentException e4 = + Assertions.assertThrows(IllegalArgumentException.class, 
jobConfig::checkSparkInfo); + Assertions.assertEquals("standalone and local master only supports client mode", e4.getMessage()); + + sparkInfo.setMaster("yarn"); + sparkInfo.setDeployMode("cluster"); + IllegalArgumentException e5 = + Assertions.assertThrows(IllegalArgumentException.class, jobConfig::checkSparkInfo); + Assertions.assertEquals("dpp jar file is not exists, path: null/app/spark-load-dpp-1.0-SNAPSHOT.jar", e5.getMessage()); + + sparkInfo.setDppJarPath(""); + IllegalArgumentException e6 = + Assertions.assertThrows(IllegalArgumentException.class, jobConfig::checkSparkInfo); + Assertions.assertEquals("dpp jar file path is empty", e6.getMessage()); + + Path path = Files.createTempFile(null, null); + sparkInfo.setDppJarPath(path.toAbsolutePath().toString()); + Assertions.assertDoesNotThrow(jobConfig::checkSparkInfo); + + } + + @Test + public void checkHadoopProperties() throws IOException { + + JobConfig jobConfig = new JobConfig(); + Map hadoopProperties = new HashMap<>(); + jobConfig.setHadoopProperties(hadoopProperties); + + hadoopProperties.put("abc", "123"); + IllegalArgumentException e1 = + Assertions.assertThrows(IllegalArgumentException.class, jobConfig::checkHadoopProperties); + Assertions.assertEquals("fs.defaultFS is empty", e1.getMessage()); + + hadoopProperties.put("fs.defaultFS", "test"); + IllegalArgumentException e2 = + Assertions.assertThrows(IllegalArgumentException.class, jobConfig::checkHadoopProperties); + Assertions.assertEquals("hadoop username is empty", e2.getMessage()); + + hadoopProperties.put("hadoop.username", "hadoop"); + Assertions.assertDoesNotThrow(jobConfig::checkHadoopProperties); + + hadoopProperties.put("hadoop.security.authentication", "kerberos"); + IllegalArgumentException e3 = + Assertions.assertThrows(IllegalArgumentException.class, jobConfig::checkHadoopProperties); + Assertions.assertEquals("hadoop.kerberos.principal is not set", e3.getMessage()); + + hadoopProperties.put("hadoop.kerberos.principal", ""); + IllegalArgumentException e4 = + Assertions.assertThrows(IllegalArgumentException.class, jobConfig::checkHadoopProperties); + Assertions.assertEquals("hadoop kerberos principal is empty", e4.getMessage()); + + hadoopProperties.put("hadoop.kerberos.principal", "spark@DORIS.ORG"); + IllegalArgumentException e5 = + Assertions.assertThrows(IllegalArgumentException.class, jobConfig::checkHadoopProperties); + Assertions.assertEquals("hadoop.kerberos.keytab is not set", e5.getMessage()); + + hadoopProperties.put("hadoop.kerberos.keytab", "test"); + IllegalArgumentException e6 = + Assertions.assertThrows(IllegalArgumentException.class, jobConfig::checkHadoopProperties); + Assertions.assertEquals("hadoop kerberos keytab file is not exists, path: test", e6.getMessage()); + + Path path = Files.createTempFile("spark", ".keytab"); + hadoopProperties.put("hadoop.kerberos.keytab", path.toAbsolutePath().toString()); + Assertions.assertDoesNotThrow(jobConfig::checkHadoopProperties); + + } +} \ No newline at end of file diff --git a/spark-load/spark-load-core/src/test/java/org/apache/doris/load/LoaderFactoryTest.java b/spark-load/spark-load-core/src/test/java/org/apache/doris/load/LoaderFactoryTest.java new file mode 100644 index 00000000..ea5f0e87 --- /dev/null +++ b/spark-load/spark-load-core/src/test/java/org/apache/doris/load/LoaderFactoryTest.java @@ -0,0 +1,30 @@ +package org.apache.doris.load; + +import org.apache.doris.common.enums.LoadMode; +import org.apache.doris.config.JobConfig; +import org.apache.doris.load.job.Loader; +import 
org.apache.doris.load.job.PullLoader; + +import org.junit.jupiter.api.Assertions; +import static org.junit.jupiter.api.Assertions.*; +import org.junit.jupiter.api.Test; + +class LoaderFactoryTest { + + @Test + void createLoader() { + + JobConfig jobConfig = new JobConfig(); + jobConfig.setLoadMode(null); + Assertions.assertThrows(NullPointerException.class, () -> LoaderFactory.createLoader(jobConfig, false)); + + jobConfig.setLoadMode(LoadMode.PUSH); + Assertions.assertThrows(UnsupportedOperationException.class, () -> LoaderFactory.createLoader(jobConfig, false)); + + jobConfig.setLoadMode(LoadMode.PULL); + Assertions.assertDoesNotThrow(() -> LoaderFactory.createLoader(jobConfig, false)); + Loader loader = LoaderFactory.createLoader(jobConfig, false);; + Assertions.assertInstanceOf(PullLoader.class, loader); + + } +} \ No newline at end of file diff --git a/spark-load/spark-load-core/src/test/java/org/apache/doris/load/job/PullLoaderTest.java b/spark-load/spark-load-core/src/test/java/org/apache/doris/load/job/PullLoaderTest.java new file mode 100644 index 00000000..187f2744 --- /dev/null +++ b/spark-load/spark-load-core/src/test/java/org/apache/doris/load/job/PullLoaderTest.java @@ -0,0 +1,219 @@ +package org.apache.doris.load.job; + +import org.apache.doris.client.DorisClient; +import org.apache.doris.common.enums.TaskType; +import org.apache.doris.common.meta.LoadMeta; +import org.apache.doris.common.meta.TableMeta; +import org.apache.doris.config.EtlJobConfig; +import org.apache.doris.config.JobConfig; +import org.apache.doris.exception.SparkLoadException; +import org.apache.doris.load.LoaderFactory; + +import com.fasterxml.jackson.core.JsonProcessingException; +import com.fasterxml.jackson.databind.json.JsonMapper; +import mockit.Mock; +import mockit.MockUp; +import org.apache.commons.io.FileUtils; +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertInstanceOf; +import static org.junit.jupiter.api.Assertions.assertThrows; +import static org.junit.jupiter.api.Assertions.assertTrue; +import org.junit.jupiter.api.Test; + +import java.io.File; +import java.io.IOException; +import java.nio.file.Files; +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +class PullLoaderTest { + + @Test + void canBeRecovered() throws SparkLoadException, IOException { + + JobConfig jobConfig = new JobConfig(); + jobConfig.setFeAddresses("127.0.0.1:8080"); + Map loadTasks = new HashMap<>(); + JobConfig.TaskInfo taskInfo = new JobConfig.TaskInfo(); + taskInfo.setType(TaskType.FILE); + taskInfo.setPaths(Collections.singletonList("test")); + loadTasks.put("tbl1", taskInfo); + jobConfig.setLoadTasks(loadTasks); + jobConfig.setLabel("test"); + File file = new File(System.getProperty("java.io.tmpdir")); + jobConfig.setWorkingDir(file.getAbsolutePath()); + + new MockUp() { + @Mock + public LoadMeta createIngestionLoad(String db, Map> tableToPartition, String label, + Map properties) { + LoadMeta loadMeta = new LoadMeta(); + loadMeta.setLoadId(1L); + loadMeta.setTxnId(1L); + loadMeta.setDbId(1L); + loadMeta.setSignature(1L); + Map tableMetaMap = new HashMap<>(); + TableMeta tableMeta = new TableMeta(); + tableMeta.setId(1L); + List indexList = new ArrayList<>(); + TableMeta.EtlIndex index = new TableMeta.EtlIndex(); + List columnList = new ArrayList<>(); + EtlJobConfig.EtlColumn column = new EtlJobConfig.EtlColumn(); + column.columnName = "c0"; + column.columnType = "INT"; 
+ column.defaultValue = "0"; + column.isAllowNull = true; + column.aggregationType = "NONE"; + column.isKey = true; + columnList.add(column); + index.columns = columnList; + indexList.add(index); + tableMeta.setIndexes(indexList); + TableMeta.EtlPartitionInfo partitionInfo = new TableMeta.EtlPartitionInfo(); + TableMeta.EtlPartition partition = new TableMeta.EtlPartition(); + partition.partitionId = 1; + partition.bucketNum = 1; + partition.startKeys = Collections.emptyList(); + partition.endKeys = Collections.emptyList(); + partition.isMaxPartition = true; + partitionInfo.partitions = Collections.singletonList(partition); + partitionInfo.partitionType = "UNPARTITIONED"; + partitionInfo.partitionColumnRefs = new ArrayList<>(); + partitionInfo.distributionColumnRefs = new ArrayList<>(); + tableMeta.setPartitionInfo(partitionInfo); + tableMetaMap.put("tbl1", tableMeta); + loadMeta.setTableMeta(tableMetaMap); + try { + System.out.println(JsonMapper.builder().build().writeValueAsString(loadMeta)); + } catch (JsonProcessingException e) { + throw new RuntimeException(e); + } + return loadMeta; + } + }; + Loader loader = LoaderFactory.createLoader(jobConfig, true); + assertInstanceOf(Recoverable.class, loader); + loader.prepare(); + assertFalse(((Recoverable)loader).canBeRecovered()); + + File file1 = new File(System.getProperty("java.io.tmpdir") + "/jobs/1/test"); + try { + + file1.mkdirs(); + assertFalse(((Recoverable)loader).canBeRecovered()); + + File file2 = new File(System.getProperty("java.io.tmpdir") + "/jobs/1/test/1"); + file2.mkdirs(); + assertFalse(((Recoverable)loader).canBeRecovered()); + + File file3 = new File(System.getProperty("java.io.tmpdir") + "/jobs/1/test/1/dpp_result.json"); + Files.write(file3.toPath(), Collections.singletonList("")); + assertFalse(((Recoverable)loader).canBeRecovered()); + + Files.write(file3.toPath(), Collections.singletonList("test")); + assertThrows(SparkLoadException.class, () -> ((Recoverable)loader).canBeRecovered()); + + Files.write(file3.toPath(), Collections.singletonList("{}")); + assertThrows(SparkLoadException.class, () -> ((Recoverable)loader).canBeRecovered()); + + Files.write(file3.toPath(), Collections.singletonList("{\"is_success\":false,\"failed_reason\":\"\"," + + "\"scanned_rows\":0,\"file_number\":0,\"file_size\":0,\"normal_rows\":0,\"abnormal_rows\":0," + + "\"unselect_rows\":0,\"partial_abnormal_rows\":\"\",\"scanned_bytes\":0}\n")); + assertFalse(((Recoverable)loader).canBeRecovered()); + + Files.write(file3.toPath(), Collections.singletonList("{\"is_success\":true,\"failed_reason\":\"\"," + + "\"scanned_rows\":0,\"file_number\":0,\"file_size\":0,\"normal_rows\":0,\"abnormal_rows\":0," + + "\"unselect_rows\":0,\"partial_abnormal_rows\":\"\",\"scanned_bytes\":0}\n")); + + File file4 = new File(System.getProperty("java.io.tmpdir") + "/jobs/1/test/1/load_meta.json"); + Files.write(file4.toPath(), Collections.singletonList("")); + assertFalse(((Recoverable)loader).canBeRecovered()); + + Files.write(file4.toPath(), Collections.singletonList("{\"loadId\":1,\"txnId\":1,\"dbId\":1,\"signature\":1," + + "\"tableMeta\":{\"tbl1\":{\"id\":1,\"indexes\":[],\"partitionInfo\":{\"partitionType\":" + + "\"UNPARTITIONED\",\"partitionColumnRefs\":[],\"distributionColumnRefs\":[],\"partitions\":" + + "[{\"partitionId\":1,\"startKeys\":[],\"endKeys\":[],\"isMaxPartition\":true,\"bucketNum\":1}]}" + + "}}}\n")); + assertFalse(((Recoverable)loader).canBeRecovered()); + + Files.write(file4.toPath(), 
Collections.singletonList("{\"loadId\":1,\"txnId\":1,\"dbId\":1,\"signature\":1," + + "\"tableMeta\":{\"tbl2\":{\"id\":1,\"indexes\":[{\"indexId\":0,\"columns\":[{\"columnName\":\"c0\"," + + "\"columnType\":\"INT\",\"isAllowNull\":true,\"isKey\":true,\"aggregationType\":\"NONE\"," + + "\"defaultValue\":\"0\",\"stringLength\":0,\"precision\":0,\"scale\":0,\"defineExpr\":null}]," + + "\"schemaHash\":0,\"indexType\":null,\"isBaseIndex\":false,\"schemaVersion\":0}],\"partitionInfo\":" + + "{\"partitionType\":\"UNPARTITIONED\",\"partitionColumnRefs\":[],\"distributionColumnRefs\":[]," + + "\"partitions\":[{\"partitionId\":1,\"startKeys\":[],\"endKeys\":[],\"isMaxPartition\":true," + + "\"bucketNum\":1}]}}}}")); + assertFalse(((Recoverable)loader).canBeRecovered()); + + Files.write(file4.toPath(), Collections.singletonList("{\"loadId\":1,\"txnId\":1,\"dbId\":1,\"signature\":1," + + "\"tableMeta\":{\"tbl1\":{\"id\":1,\"indexes\":[{\"indexId\":1,\"columns\":[{\"columnName\":\"c0\"," + + "\"columnType\":\"INT\",\"isAllowNull\":true,\"isKey\":true,\"aggregationType\":\"NONE\"," + + "\"defaultValue\":\"0\",\"stringLength\":0,\"precision\":0,\"scale\":0,\"defineExpr\":null}]," + + "\"schemaHash\":0,\"indexType\":null,\"isBaseIndex\":false,\"schemaVersion\":0}],\"partitionInfo\":" + + "{\"partitionType\":\"UNPARTITIONED\",\"partitionColumnRefs\":[],\"distributionColumnRefs\":[]," + + "\"partitions\":[{\"partitionId\":1,\"startKeys\":[],\"endKeys\":[],\"isMaxPartition\":true," + + "\"bucketNum\":1}]}}}}")); + assertFalse(((Recoverable)loader).canBeRecovered()); + + Files.write(file4.toPath(), Collections.singletonList("{\"loadId\":1,\"txnId\":1,\"dbId\":1,\"signature\":1," + + "\"tableMeta\":{\"tbl1\":{\"id\":1,\"indexes\":[{\"indexId\":0,\"columns\":[{\"columnName\":\"c0\"," + + "\"columnType\":\"INT\",\"isAllowNull\":true,\"isKey\":true,\"aggregationType\":\"NONE\"," + + "\"defaultValue\":\"0\",\"stringLength\":0,\"precision\":0,\"scale\":0,\"defineExpr\":null}]," + + "\"schemaHash\":1,\"indexType\":null,\"isBaseIndex\":false,\"schemaVersion\":0}],\"partitionInfo\":" + + "{\"partitionType\":\"UNPARTITIONED\",\"partitionColumnRefs\":[],\"distributionColumnRefs\":[]," + + "\"partitions\":[{\"partitionId\":1,\"startKeys\":[],\"endKeys\":[],\"isMaxPartition\":true," + + "\"bucketNum\":1}]}}}}")); + assertFalse(((Recoverable)loader).canBeRecovered()); + + Files.write(file4.toPath(), Collections.singletonList("{\"loadId\":1,\"txnId\":1,\"dbId\":1,\"signature\":1," + + "\"tableMeta\":{\"tbl1\":{\"id\":1,\"indexes\":[{\"indexId\":0,\"columns\":[{\"columnName\":\"c0\"," + + "\"columnType\":\"INT\",\"isAllowNull\":true,\"isKey\":true,\"aggregationType\":\"NONE\"," + + "\"defaultValue\":\"0\",\"stringLength\":0,\"precision\":0,\"scale\":0,\"defineExpr\":null}]," + + "\"schemaHash\":0,\"indexType\":null,\"isBaseIndex\":false,\"schemaVersion\":1}],\"partitionInfo\":" + + "{\"partitionType\":\"UNPARTITIONED\",\"partitionColumnRefs\":[],\"distributionColumnRefs\":[]," + + "\"partitions\":[{\"partitionId\":1,\"startKeys\":[],\"endKeys\":[],\"isMaxPartition\":true," + + "\"bucketNum\":1}]}}}}")); + assertFalse(((Recoverable)loader).canBeRecovered()); + + Files.write(file4.toPath(), Collections.singletonList("{\"loadId\":1,\"txnId\":1,\"dbId\":1,\"signature\":1," + + "\"tableMeta\":{\"tbl1\":{\"id\":1,\"indexes\":[{\"indexId\":0,\"columns\":[{\"columnName\":\"c0\"," + + "\"columnType\":\"INT\",\"isAllowNull\":true,\"isKey\":true,\"aggregationType\":\"NONE\"," + + 
"\"defaultValue\":\"0\",\"stringLength\":0,\"precision\":0,\"scale\":0,\"defineExpr\":null}]," + + "\"schemaHash\":0,\"indexType\":null,\"isBaseIndex\":false,\"schemaVersion\":0}],\"partitionInfo\":" + + "{\"partitionType\":\"UNPARTITIONED\",\"partitionColumnRefs\":[],\"distributionColumnRefs\":[]," + + "\"partitions\":[{\"partitionId\":1,\"startKeys\":[],\"endKeys\":[],\"isMaxPartition\":true," + + "\"bucketNum\":1},{\"partitionId\":2,\"startKeys\":[],\"endKeys\":[],\"isMaxPartition\":true," + + "\"bucketNum\":1}]}}}}")); + assertFalse(((Recoverable)loader).canBeRecovered()); + + Files.write(file4.toPath(), Collections.singletonList("{\"loadId\":1,\"txnId\":1,\"dbId\":1,\"signature\":1," + + "\"tableMeta\":{\"tbl1\":{\"id\":1,\"indexes\":[{\"indexId\":0,\"columns\":[{\"columnName\":\"c0\"," + + "\"columnType\":\"INT\",\"isAllowNull\":true,\"isKey\":true,\"aggregationType\":\"NONE\"," + + "\"defaultValue\":\"0\",\"stringLength\":0,\"precision\":0,\"scale\":0,\"defineExpr\":null}]," + + "\"schemaHash\":0,\"indexType\":null,\"isBaseIndex\":false,\"schemaVersion\":0}],\"partitionInfo\":" + + "{\"partitionType\":\"UNPARTITIONED\",\"partitionColumnRefs\":[],\"distributionColumnRefs\":[]," + + "\"partitions\":[{\"partitionId\":2,\"startKeys\":[],\"endKeys\":[],\"isMaxPartition\":true," + + "\"bucketNum\":1}]}}}}")); + assertFalse(((Recoverable)loader).canBeRecovered()); + + Files.write(file4.toPath(), Collections.singletonList("{\"loadId\":1,\"txnId\":1,\"dbId\":1,\"signature\":1," + + "\"tableMeta\":{\"tbl1\":{\"id\":1,\"indexes\":[{\"indexId\":0,\"columns\":[{\"columnName\":\"c0\"," + + "\"columnType\":\"INT\",\"isAllowNull\":true,\"isKey\":true,\"aggregationType\":\"NONE\"," + + "\"defaultValue\":\"0\",\"stringLength\":0,\"precision\":0,\"scale\":0,\"defineExpr\":null}]," + + "\"schemaHash\":0,\"indexType\":null,\"isBaseIndex\":false,\"schemaVersion\":0}],\"partitionInfo\":" + + "{\"partitionType\":\"UNPARTITIONED\",\"partitionColumnRefs\":[],\"distributionColumnRefs\":[]," + + "\"partitions\":[{\"partitionId\":1,\"startKeys\":[],\"endKeys\":[],\"isMaxPartition\":true," + + "\"bucketNum\":1}]}}}}")); + assertTrue(((Recoverable)loader).canBeRecovered()); + + } finally { + // delete ${java.io.tmpdir}/jobs on exit + FileUtils.deleteDirectory(file1.getParentFile().getParentFile()); + } + + } +} \ No newline at end of file diff --git a/spark-load/spark-load-core/src/test/java/org/apache/doris/util/DateUtilsTest.java b/spark-load/spark-load-core/src/test/java/org/apache/doris/util/DateUtilsTest.java new file mode 100644 index 00000000..32527aa6 --- /dev/null +++ b/spark-load/spark-load-core/src/test/java/org/apache/doris/util/DateUtilsTest.java @@ -0,0 +1,24 @@ +package org.apache.doris.util; + +import mockit.Mock; +import mockit.MockUp; +import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.Test; + +import java.time.LocalDateTime; +import java.time.ZoneId; + +class DateUtilsTest { + + @Test + void getFormattedNow() { + new MockUp() { + @Mock + public LocalDateTime now(ZoneId zoneId) { + return LocalDateTime.of(2024,8,1,12,34,56); + } + }; + Assertions.assertEquals("2024-08-01 12:34:56", DateUtils.getFormattedNow(DateUtils.NORMAL_FORMATER)); + Assertions.assertEquals("20240801123456", DateUtils.getFormattedNow(DateUtils.NUMBER_FORMATER)); + } +} \ No newline at end of file From 28a394615d1c56bae7a0739e52f1ed5da3a98efc Mon Sep 17 00:00:00 2001 From: gnehil Date: Fri, 9 Aug 2024 14:17:49 +0800 Subject: [PATCH 42/45] api path change --- 
.../src/main/java/org/apache/doris/client/DorisClient.java | 7 ++++--- .../src/main/java/org/apache/doris/common/Constants.java | 2 ++ 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/spark-load/spark-load-core/src/main/java/org/apache/doris/client/DorisClient.java b/spark-load/spark-load-core/src/main/java/org/apache/doris/client/DorisClient.java index b25d5ee7..1b93be2a 100644 --- a/spark-load/spark-load-core/src/main/java/org/apache/doris/client/DorisClient.java +++ b/spark-load/spark-load-core/src/main/java/org/apache/doris/client/DorisClient.java @@ -17,6 +17,7 @@ package org.apache.doris.client; +import org.apache.doris.common.Constants; import org.apache.doris.common.LoadInfo; import org.apache.doris.common.ResponseEntity; import org.apache.doris.common.meta.LoadInfoResponse; @@ -67,7 +68,7 @@ public static class FeClient { public static final String BASE_URL = "http://%s%s"; - public static final String INGESTION_LOAD_URL_PATTERN = "/api/ingestion_load/%s/%s"; + public static final String INGESTION_LOAD_URL_PATTERN = "/api/ingestion_load/%s/%s/%s"; public static final String CREATE_ACTION = "_create"; @@ -99,7 +100,7 @@ private String parseAuth(String user, String password) { public LoadMeta createIngestionLoad(String db, Map> tableToPartition, String label, Map properties) throws SparkLoadException { try { - String path = String.format(INGESTION_LOAD_URL_PATTERN, db, CREATE_ACTION); + String path = String.format(INGESTION_LOAD_URL_PATTERN, Constants.DEFAULT_CATALOG, db, CREATE_ACTION); HttpPost httpPost = new HttpPost(); addCommonHeaders(httpPost); Map params = new HashMap<>(); @@ -156,7 +157,7 @@ private String executeRequest(HttpRequestBase req, String apiPath, Map statusInfo) throws SparkLoadException { - String path = String.format(INGESTION_LOAD_URL_PATTERN, db, UPDATE_ACTION); + String path = String.format(INGESTION_LOAD_URL_PATTERN, Constants.DEFAULT_CATALOG, db, UPDATE_ACTION); HttpPost httpPost = new HttpPost(); addCommonHeaders(httpPost); Map params = new HashMap<>(); diff --git a/spark-load/spark-load-core/src/main/java/org/apache/doris/common/Constants.java b/spark-load/spark-load-core/src/main/java/org/apache/doris/common/Constants.java index 20130549..a3e4803e 100644 --- a/spark-load/spark-load-core/src/main/java/org/apache/doris/common/Constants.java +++ b/spark-load/spark-load-core/src/main/java/org/apache/doris/common/Constants.java @@ -26,4 +26,6 @@ public interface Constants { String HADOOP_KERBEROS_PRINCIPAL = "hadoop.kerberos.principal"; String HADOOP_KERBEROS_KEYTAB = "hadoop.kerberos.keytab"; + String DEFAULT_CATALOG = "internal"; + } From 41865dca9b3aee0bf9d88696ee88508e2cf5a4a6 Mon Sep 17 00:00:00 2001 From: gnehil Date: Wed, 14 Aug 2024 14:44:02 +0800 Subject: [PATCH 43/45] change http client dep --- spark-load/pom.xml | 8 ++++---- spark-load/spark-load-core/pom.xml | 4 ++++ spark-load/spark-load-dpp/pom.xml | 10 +++++----- 3 files changed, 13 insertions(+), 9 deletions(-) diff --git a/spark-load/pom.xml b/spark-load/pom.xml index 25593bdb..6909e137 100644 --- a/spark-load/pom.xml +++ b/spark-load/pom.xml @@ -49,7 +49,7 @@ 2.14.2 1.18.30 1.4 - 5.2.1 + 4.5.13 5.8.2 1.49 2.17.1 @@ -269,9 +269,9 @@ - org.apache.httpcomponents.client5 - httpclient5 - ${httpclient5.version} + org.apache.httpcomponents + httpclient + ${httpclient.version} diff --git a/spark-load/spark-load-core/pom.xml b/spark-load/spark-load-core/pom.xml index d9828d78..fbe3edaf 100644 --- a/spark-load/spark-load-core/pom.xml +++ b/spark-load/spark-load-core/pom.xml @@ -120,6 
From 41865dca9b3aee0bf9d88696ee88508e2cf5a4a6 Mon Sep 17 00:00:00 2001
From: gnehil
Date: Wed, 14 Aug 2024 14:44:02 +0800
Subject: [PATCH 43/45] change http client dep

---
 spark-load/pom.xml                 |  8 ++++----
 spark-load/spark-load-core/pom.xml |  4 ++++
 spark-load/spark-load-dpp/pom.xml  | 10 +++++-----
 3 files changed, 13 insertions(+), 9 deletions(-)

diff --git a/spark-load/pom.xml b/spark-load/pom.xml
index 25593bdb..6909e137 100644
--- a/spark-load/pom.xml
+++ b/spark-load/pom.xml
@@ -49,7 +49,7 @@
         2.14.2
         1.18.30
         1.4
-        <httpclient5.version>5.2.1</httpclient5.version>
+        <httpclient.version>4.5.13</httpclient.version>
         5.8.2
         1.49
         2.17.1
@@ -269,9 +269,9 @@
         <dependency>
-            <groupId>org.apache.httpcomponents.client5</groupId>
-            <artifactId>httpclient5</artifactId>
-            <version>${httpclient5.version}</version>
+            <groupId>org.apache.httpcomponents</groupId>
+            <artifactId>httpclient</artifactId>
+            <version>${httpclient.version}</version>
         </dependency>
diff --git a/spark-load/spark-load-core/pom.xml b/spark-load/spark-load-core/pom.xml
index d9828d78..fbe3edaf 100644
--- a/spark-load/spark-load-core/pom.xml
+++ b/spark-load/spark-load-core/pom.xml
@@ -120,6 +120,10 @@
         </dependency>
+        <dependency>
+            <groupId>org.apache.httpcomponents</groupId>
+            <artifactId>httpclient</artifactId>
+        </dependency>
         <dependency>
             <groupId>org.apache.hadoop</groupId>
             <artifactId>hadoop-aws</artifactId>
diff --git a/spark-load/spark-load-dpp/pom.xml b/spark-load/spark-load-dpp/pom.xml
index de21da68..0d6ab52b 100644
--- a/spark-load/spark-load-dpp/pom.xml
+++ b/spark-load/spark-load-dpp/pom.xml
@@ -107,11 +107,11 @@ under the License.
             <scope>test</scope>
         </dependency>
-        <dependency>
-            <groupId>org.junit.vintage</groupId>
-            <artifactId>junit-vintage-engine</artifactId>
-            <scope>test</scope>
-        </dependency>
+
+
+
+
+
         <dependency>
             <groupId>org.junit.jupiter</groupId>

From 9cbe4d39782ec6b0b1728480a3a094dc4881f35e Mon Sep 17 00:00:00 2001
From: gnehil
Date: Wed, 14 Aug 2024 14:44:30 +0800
Subject: [PATCH 44/45] add fe client http res content empty check

---
 .../java/org/apache/doris/client/DorisClient.java | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/spark-load/spark-load-core/src/main/java/org/apache/doris/client/DorisClient.java b/spark-load/spark-load-core/src/main/java/org/apache/doris/client/DorisClient.java
index 1b93be2a..8d46617a 100644
--- a/spark-load/spark-load-core/src/main/java/org/apache/doris/client/DorisClient.java
+++ b/spark-load/spark-load-core/src/main/java/org/apache/doris/client/DorisClient.java
@@ -86,8 +86,12 @@ public FeClient(String feAddresses, String user, String password) {
         }
 
         private List<String> parseFeNodes(String feAddresses) {
+            if (StringUtils.isBlank(feAddresses)) {
+                throw new IllegalArgumentException();
+            }
             String[] feArr = feAddresses.split(",");
-            if (feArr.length == 0) {
+            if (Arrays.stream(feArr).map(x -> x.split(":"))
+                    .anyMatch(x -> x.length != 2 || x[0].isEmpty() || x[1].isEmpty())) {
                 throw new IllegalArgumentException();
             }
             return Arrays.stream(feArr).collect(Collectors.toList());
         }
@@ -142,7 +146,6 @@ private String executeRequest(HttpRequestBase req, String apiPath, Map<String,
             try {
                 httpPost.setEntity(new StringEntity(JsonUtils.writeValueAsString(params)));
                 String content = executeRequest(httpPost, path, null);
+                if (StringUtils.isBlank(content)) {
+                    throw new SparkLoadException(String.format("request update load failed, path: %s", path));
+                }
                 ResponseEntity res = JsonUtils.readValue(content, ResponseEntity.class);
                 if (res.getCode() != 0) {
                     throw new SparkLoadException(String.format("update load failed, code: %d, msg: %s, reason: %s",
@@ -186,6 +192,9 @@ public LoadInfo getLoadInfo(String db, String label) throws SparkLoadException {
             Map<String, String> params = new HashMap<>();
             params.put("label", label);
             String content = executeRequest(httpGet, path, params);
+            if (StringUtils.isBlank(content)) {
+                throw new SparkLoadException(String.format("request get load info failed, path: %s", path));
+            }
             LoadInfoResponse res = JsonUtils.readValue(content, LoadInfoResponse.class);
             if (!"ok".equalsIgnoreCase(res.getStatus())) {
                 throw new SparkLoadException(String.format("get load info failed, status: %s, msg: %s, jobInfo: %s",

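The parseFeNodes change in the patch above tightens validation from a simple array-length check to a per-node host:port check. Below is a standalone rendering of that rule for experimentation, separate from the patch; the class and method names are ours, and StringUtils.isBlank is swapped for a plain null/empty check so the snippet carries no commons-lang3 dependency.

import java.util.Arrays;
import java.util.List;
import java.util.stream.Collectors;

// Illustrative only: mirrors the FE address validation added in this patch.
public class FeAddressSketch {
    static List<String> parse(String feAddresses) {
        if (feAddresses == null || feAddresses.trim().isEmpty()) {
            throw new IllegalArgumentException();
        }
        String[] feArr = feAddresses.split(",");
        // Every comma-separated node must be exactly host:port with both parts non-empty.
        if (Arrays.stream(feArr).map(x -> x.split(":"))
                .anyMatch(x -> x.length != 2 || x[0].isEmpty() || x[1].isEmpty())) {
            throw new IllegalArgumentException();
        }
        return Arrays.stream(feArr).collect(Collectors.toList());
    }

    public static void main(String[] args) {
        System.out.println(parse("127.0.0.1:8030,127.0.0.2:8030")); // accepted
        System.out.println(parse("127.0.0.1:"));                    // throws IllegalArgumentException
    }
}
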
From e8f05007626263cef70742f22d66d354496e1616 Mon Sep 17 00:00:00 2001
From: gnehil
Date: Wed, 14 Aug 2024 17:11:14 +0800
Subject: [PATCH 45/45] add mow check

---
 .../org/apache/doris/client/DorisClient.java  | 35 +++++++-
 .../org/apache/doris/config/JobConfig.java    | 13 +++
 .../apache/doris/client/DorisClientTest.java  | 83 ++++++++++++++++++-
 .../apache/doris/config/JobConfigTest.java    | 23 +++++
 4 files changed, 146 insertions(+), 8 deletions(-)

diff --git a/spark-load/spark-load-core/src/main/java/org/apache/doris/client/DorisClient.java b/spark-load/spark-load-core/src/main/java/org/apache/doris/client/DorisClient.java
index 8d46617a..3a4831c1 100644
--- a/spark-load/spark-load-core/src/main/java/org/apache/doris/client/DorisClient.java
+++ b/spark-load/spark-load-core/src/main/java/org/apache/doris/client/DorisClient.java
@@ -76,6 +76,8 @@ public static class FeClient {
 
         public static final String GET_LOAD_INFO = "/api/%s/_load_info";
 
+        public static final String GET_DDL = "/api/_get_ddl";
+
         private final List<String> feNodes;
 
         private final String auth;
@@ -87,12 +89,12 @@ public FeClient(String feAddresses, String user, String password) {
 
         private List<String> parseFeNodes(String feAddresses) {
             if (StringUtils.isBlank(feAddresses)) {
-                throw new IllegalArgumentException();
+                throw new IllegalArgumentException("feAddresses is empty");
             }
             String[] feArr = feAddresses.split(",");
             if (Arrays.stream(feArr).map(x -> x.split(":"))
                     .anyMatch(x -> x.length != 2 || x[0].isEmpty() || x[1].isEmpty())) {
-                throw new IllegalArgumentException();
+                throw new IllegalArgumentException("feAddresses contains invalid format, " + feAddresses);
             }
             return Arrays.stream(feArr).collect(Collectors.toList());
         }
@@ -178,7 +180,7 @@ public void updateIngestionLoad(String db, Long loadId, Map<String, String> stat
                         res.getCode(), res.getMsg(), res.getData().isNull() ? null : res.getData().asText()));
             }
         } catch (IOException | URISyntaxException e) {
-            throw new SparkLoadException("update spark load failed", e);
+            throw new SparkLoadException("update load failed", e);
         }
     }
@@ -202,7 +204,32 @@ public LoadInfo getLoadInfo(String db, String label) throws SparkLoadException {
             }
             return res.getJobInfo();
         } catch (IOException | URISyntaxException e) {
-            throw new SparkLoadException("update spark load failed", e);
+            throw new SparkLoadException("get load info failed", e);
+        }
+
+    }
+
+        public String getDDL(String db, String table) throws SparkLoadException {
+
+            HttpGet httpGet = new HttpGet();
+            addCommonHeaders(httpGet);
+            try {
+                Map<String, String> params = new HashMap<>();
+                params.put("db", db);
+                params.put("table", table);
+                String content = executeRequest(httpGet, GET_DDL, params);
+                if (StringUtils.isBlank(content)) {
+                    throw new SparkLoadException(String.format("request get ddl failed, path: %s", GET_DDL));
+                }
+                ResponseEntity res = JsonUtils.readValue(content, ResponseEntity.class);
+                if (res.getCode() != 0 || !res.getData().has("create_table")
+                        || res.getData().get("create_table").isEmpty()) {
+                    throw new SparkLoadException(String.format("get ddl failed, status: %s, msg: %s, data: %s",
+                            res.getCode(), res.getMsg(), JsonUtils.writeValueAsString(res.getData())));
+                }
+                return res.getData().get("create_table").get(0).asText();
+            } catch (IOException | URISyntaxException e) {
+                throw new SparkLoadException("get ddl failed", e);
         }
 
     }
diff --git a/spark-load/spark-load-core/src/main/java/org/apache/doris/config/JobConfig.java b/spark-load/spark-load-core/src/main/java/org/apache/doris/config/JobConfig.java
index e5d79416..fb2f5ccb 100644
--- a/spark-load/spark-load-core/src/main/java/org/apache/doris/config/JobConfig.java
+++ b/spark-load/spark-load-core/src/main/java/org/apache/doris/config/JobConfig.java
@@ -18,9 +18,11 @@
 package org.apache.doris.config;
 
 import org.apache.doris.SparkLoadRunner;
+import org.apache.doris.client.DorisClient;
 import org.apache.doris.common.Constants;
 import org.apache.doris.common.enums.LoadMode;
 import org.apache.doris.common.enums.TaskType;
+import org.apache.doris.exception.SparkLoadException;
 
 import com.fasterxml.jackson.annotation.JsonProperty;
 import com.google.common.base.Preconditions;
@@ -30,6 +32,7 @@
 import java.io.File;
 import java.net.URI;
+import java.sql.DriverManager;
 import java.util.Collections;
 import java.util.HashMap;
 import java.util.List;
@@ -160,6 +163,16 @@ public void checkTaskInfo() {
         Map<String, TaskInfo> tasks = getLoadTasks();
         Preconditions.checkArgument(!tasks.isEmpty(), "loadTasks is empty");
         for (Map.Entry<String, TaskInfo> entry : tasks.entrySet()) {
+            String table = entry.getKey();
+            try {
+                DorisClient.FeClient feClient = DorisClient.getFeClient(feAddresses, user, password);
+                String ddl = feClient.getDDL(database, table);
+                if (StringUtils.isNoneBlank(ddl) && ddl.contains("\"enable_unique_key_merge_on_write\" = \"true\"")) {
+                    throw new IllegalArgumentException("Merge On Write is not supported");
+                }
+            } catch (SparkLoadException e) {
+                throw new IllegalArgumentException("check table failed", e);
+            }
             TaskInfo taskInfo = entry.getValue();
             switch (taskInfo.getType()) {
                 case HIVE:
diff --git a/spark-load/spark-load-core/src/test/java/org/apache/doris/client/DorisClientTest.java b/spark-load/spark-load-core/src/test/java/org/apache/doris/client/DorisClientTest.java
index a59bf4b3..b5d7d386 100644
--- a/spark-load/spark-load-core/src/test/java/org/apache/doris/client/DorisClientTest.java
+++ b/spark-load/spark-load-core/src/test/java/org/apache/doris/client/DorisClientTest.java
@@ -41,10 +41,18 @@ class DorisClientTest {
 
     @Test
     public void getFeClient() {
-        Assertions.assertThrows(IllegalArgumentException.class, () -> DorisClient.getFeClient("", "", ""));
-        Assertions.assertThrows(IllegalArgumentException.class, () -> DorisClient.getFeClient("127.0.0.1", "", ""));
-        Assertions.assertThrows(IllegalArgumentException.class, () -> DorisClient.getFeClient("127.0.0.1:", "", ""));
-        Assertions.assertThrows(IllegalArgumentException.class, () -> DorisClient.getFeClient(":8030", "", ""));
+        IllegalArgumentException e1 =
+                Assertions.assertThrows(IllegalArgumentException.class, () -> DorisClient.getFeClient("", "", ""));
+        Assertions.assertEquals("feAddresses is empty", e1.getMessage());
+        IllegalArgumentException e2 = Assertions.assertThrows(IllegalArgumentException.class,
+                () -> DorisClient.getFeClient("127.0.0.1", "", ""));
+        Assertions.assertEquals("feAddresses contains invalid format, 127.0.0.1", e2.getMessage());
+        IllegalArgumentException e3 = Assertions.assertThrows(IllegalArgumentException.class,
+                () -> DorisClient.getFeClient("127.0.0.1:", "", ""));
+        Assertions.assertEquals("feAddresses contains invalid format, 127.0.0.1:", e3.getMessage());
+        IllegalArgumentException e4 =
+                Assertions.assertThrows(IllegalArgumentException.class, () -> DorisClient.getFeClient(":8030", "", ""));
+        Assertions.assertEquals("feAddresses contains invalid format, :8030", e4.getMessage());
         Assertions.assertDoesNotThrow(() -> DorisClient.getFeClient("127.0.0.1:8030", "", ""));
     }
 
@@ -226,6 +234,73 @@ public CloseableHttpResponse execute(
     }
 
+    @Test
+    public void getDDL() {
+
+        DorisClient.FeClient feClient = new DorisClient.FeClient("127.0.0.1:8030", "", "");
+
+        new MockUp(CloseableHttpClient.class) {
+            @Mock
+            public CloseableHttpResponse execute(
+                    final HttpUriRequest request) throws IOException, ClientProtocolException {
+                MockedCloseableHttpResponse response = new MockedCloseableHttpResponse();
+                response.setStatusCode(HttpStatus.SC_BAD_REQUEST);
+                return response;
+            }
+        };
+        SparkLoadException e1 =
+                Assertions.assertThrows(SparkLoadException.class, () -> feClient.getDDL("db", "test"));
+        Assertions.assertEquals("request get ddl failed, path: /api/_get_ddl", e1.getMessage());
+
+        new MockUp(CloseableHttpClient.class) {
+            @Mock
+            public CloseableHttpResponse execute(
+                    final HttpUriRequest request) throws IOException, ClientProtocolException {
+                MockedCloseableHttpResponse response = new MockedCloseableHttpResponse();
+                response.setStatusCode(HttpStatus.SC_OK);
+                response.setEntity(new StringEntity("{\"code\":1,\"msg\":\"\",\"data\":{},\"count\":0}"));
+                return response;
+            }
+        };
+        SparkLoadException e2 =
+                Assertions.assertThrows(SparkLoadException.class, () -> feClient.getDDL("db", "test"));
+        Assertions.assertEquals("get ddl failed, status: 1, msg: , data: {}", e2.getMessage());
+
+        new MockUp(CloseableHttpClient.class) {
+            @Mock
+            public CloseableHttpResponse execute(
+                    final HttpUriRequest request) throws IOException, ClientProtocolException {
+                MockedCloseableHttpResponse response = new MockedCloseableHttpResponse();
+                response.setStatusCode(HttpStatus.SC_OK);
+                response.setEntity(new StringEntity("{\"code\":0,\"msg\":\"\",\"data\":{},\"count\":0}"));
+                return response;
+            }
+        };
+        SparkLoadException e3 =
+                Assertions.assertThrows(SparkLoadException.class, () -> feClient.getDDL("db", "test"));
+        Assertions.assertEquals("get ddl failed, status: 0, msg: , data: {}", e3.getMessage());
+
+        new MockUp(CloseableHttpClient.class) {
+            @Mock
+            public CloseableHttpResponse execute(
+                    final HttpUriRequest request) throws IOException, ClientProtocolException {
+                MockedCloseableHttpResponse response = new MockedCloseableHttpResponse();
+                response.setStatusCode(HttpStatus.SC_OK);
+                response.setEntity(new StringEntity("{\"code\":0,\"msg\":\"\"," +
+                        "\"data\":{\"create_table\": [\"CREATE TABLE `tbl1` (\\n `k1` int(11) NULL " +
+                        "COMMENT \\\"\\\",\\n `k2` int(11) NULL COMMENT \\\"\\\"\\n) ENGINE=OLAP\\n" +
+                        "DUPLICATE KEY(`k1`, `k2`)\\nCOMMENT \\\"OLAP\\\"\\nDISTRIBUTED BY HASH(`k1`) BUCKETS 1\\n" +
+                        "PROPERTIES (\\n\\\"replication_num\\\" = \\\"1\\\",\\n\\\"version_info\\\" = \\\"1,0\\\",\\n" +
+                        "\\\"in_memory\\\" = \\\"false\\\",\\n\\\"storage_format\\\" = \\\"DEFAULT\\\"\\n);\"]\n}," +
+                        "\"count\":0}"));
+                return response;
+            }
+        };
+        Assertions.assertDoesNotThrow(() -> feClient.getDDL("db", "test"));
+
+
+    }
+
     private class MockedCloseableHttpResponse implements CloseableHttpResponse {
 
         private StatusLine statusLine;
diff --git a/spark-load/spark-load-core/src/test/java/org/apache/doris/config/JobConfigTest.java b/spark-load/spark-load-core/src/test/java/org/apache/doris/config/JobConfigTest.java
index 197ac30a..bc1691a0 100644
--- a/spark-load/spark-load-core/src/test/java/org/apache/doris/config/JobConfigTest.java
+++ b/spark-load/spark-load-core/src/test/java/org/apache/doris/config/JobConfigTest.java
@@ -1,8 +1,12 @@
 package org.apache.doris.config;
 
+import org.apache.doris.client.DorisClient;
 import org.apache.doris.common.enums.TaskType;
+import org.apache.doris.exception.SparkLoadException;
 
+import mockit.Mock;
+import mockit.MockUp;
 import org.junit.jupiter.api.Assertions;
 import org.junit.jupiter.api.Test;
 
@@ -46,12 +50,21 @@ public void checkFeAddress() {
     public void checkTaskInfo() {
 
         JobConfig jobConfig = new JobConfig();
+        jobConfig.setFeAddresses("127.0.0.1:8030");
         jobConfig.setLoadTasks(new HashMap<>());
         IllegalArgumentException e1 =
                 Assertions.assertThrows(IllegalArgumentException.class, jobConfig::checkTaskInfo);
         Assertions.assertEquals("loadTasks is empty", e1.getMessage());
 
+        new MockUp(DorisClient.FeClient.class) {
+            @Mock
+            public String getDDL(String db, String table) throws SparkLoadException {
+                return "create table tbl1 (col1 int, col2 int, col3 int, col4 int) unique key (col1) properties (" +
+                        "\"enable_unique_key_merge_on_write\" = \"false\")";
+            }
+        };
+
         Map<String, JobConfig.TaskInfo> loadTasks1 = new HashMap<>();
         JobConfig.TaskInfo taskInfo1 = new JobConfig.TaskInfo();
         taskInfo1.setType(TaskType.FILE);
@@ -90,6 +103,16 @@ public void checkTaskInfo() {
         taskInfo3.setHiveTable("tbl");
         Assertions.assertDoesNotThrow(jobConfig::checkTaskInfo);
 
+        new MockUp(DorisClient.FeClient.class) {
+            @Mock
+            public String getDDL(String db, String table) throws SparkLoadException {
+                return "create table tbl1 (col1 int, col2 int, col3 int, col4 int) unique key (col1) properties (" +
+                        "\"enable_unique_key_merge_on_write\" = \"true\")";
+            }
+        };
+        IllegalArgumentException e5 =
+                Assertions.assertThrows(IllegalArgumentException.class, jobConfig::checkTaskInfo);
+
     }
 
     @Test