Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Adding Apache Hudi into the openctest framework #26

Open
wants to merge 13 commits into
base: main
Choose a base branch
from
11 changes: 10 additions & 1 deletion core/add_project.sh
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,14 @@ function setup_alluxio() {
mvn clean install -DskipTests -Dcheckstyle.skip -Dlicense.skip -Dfindbugs.skip -Dmaven.javadoc.skip=true
}

# Clone the ctest-instrumented Hudi fork (if not already present),
# switch to the injection branch, and build the hudi-common module.
# Guards each cd so a failed clone/missing directory cannot cause
# git/mvn to run against the wrong working directory.
function setup_hudi() {
[ ! -d "app/ctest-hudi" ] && git clone https://github.com/jessicahuang523/hudi app/ctest-hudi
cd app/ctest-hudi || exit 1
git fetch && git checkout ctest-injection
cd hudi-common || exit 1
mvn clean install -DskipTests -Dcheckstyle.skip
}

function usage() {
echo "Usage: add_project.sh <main project>"
exit 1
Expand All @@ -64,7 +72,8 @@ function main() {
hbase) setup_hbase ;;
zookeeper) setup_zookeeper ;;
alluxio) setup_alluxio ;;
*) echo "Unexpected project: $project - only support hadoop, hbase, zookeeper and alluxio." ;;
hudi) setup_hudi ;;
*) echo "Unexpected project: $project - only support hadoop, hbase, zookeeper, alluxio and hudi." ;;
esac
fi
}
Expand Down
14 changes: 12 additions & 2 deletions core/ctest_const.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,18 +12,22 @@
# Module identifiers for the supported projects under test.
# (HCOMMON and HDFS are defined earlier in this file.)
HBASE = "hbase-server"
ZOOKEEPER = "zookeeper-server"
ALLUXIO = "alluxio-core"
HUDI = "hudi-common"

# Local checkout locations of the ctest-instrumented clones,
# rooted at APP_DIR (defined earlier in this file). These names
# match the clone targets used by core/add_project.sh.
CTEST_HADOOP_DIR = os.path.join(APP_DIR, "ctest-hadoop")
CTEST_HBASE_DIR = os.path.join(APP_DIR, "ctest-hbase")
CTEST_ZK_DIR = os.path.join(APP_DIR, "ctest-zookeeper")
CTEST_ALLUXIO_DIR = os.path.join(APP_DIR, "ctest-alluxio")
CTEST_HUDI_DIR = os.path.join(APP_DIR, "ctest-hudi")


# Maps each project identifier to its repository checkout directory.
# Hadoop common and HDFS share the single hadoop checkout.
PROJECT_DIR = {
    HCOMMON: CTEST_HADOOP_DIR,
    HDFS: CTEST_HADOOP_DIR,
    HBASE: CTEST_HBASE_DIR,
    ZOOKEEPER: CTEST_ZK_DIR,
    ALLUXIO: CTEST_ALLUXIO_DIR,
    HUDI: CTEST_HUDI_DIR
}


Expand All @@ -34,6 +38,7 @@
HBASE: "hbase-server",
ZOOKEEPER: "zookeeper-server",
ALLUXIO: "core",
HUDI: "hudi-common"
}


Expand All @@ -58,6 +63,7 @@
os.path.join(CTEST_ALLUXIO_DIR, MODULE_SUBDIR[ALLUXIO], "server/worker", SUREFIRE_SUBDIR),
os.path.join(CTEST_ALLUXIO_DIR, MODULE_SUBDIR[ALLUXIO], "server/master", SUREFIRE_SUBDIR),
],
HUDI: [os.path.join(CTEST_HUDI_DIR, MODULE_SUBDIR[HUDI], SUREFIRE_SUBDIR)]
}

# default or deprecate conf path
Expand All @@ -74,7 +80,8 @@
HDFS: os.path.join(DEFAULT_CONF_DIR, HDFS + "-default.tsv"),
HBASE: os.path.join(DEFAULT_CONF_DIR, HBASE + "-default.tsv"),
ALLUXIO: os.path.join(DEFAULT_CONF_DIR, ALLUXIO + "-default.tsv"),
ZOOKEEPER: os.path.join(DEFAULT_CONF_DIR, ZOOKEEPER + "-default.tsv")
ZOOKEEPER: os.path.join(DEFAULT_CONF_DIR, ZOOKEEPER + "-default.tsv"),
HUDI: os.path.join(DEFAULT_CONF_DIR, HUDI + "-default.tsv")
}


Expand All @@ -96,7 +103,10 @@
],
ALLUXIO: [
os.path.join(CTEST_ALLUXIO_DIR, "core/alluxio-ctest.properties")
]
],
HUDI: [
os.path.join(CTEST_HUDI_DIR, "hudi-common/src/main/resources/hudi-ctest.conf")
],
}


Expand Down
28 changes: 28 additions & 0 deletions core/default_configs/hudi-common-default.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
hoodie.archivelog.folder archived path under the meta folder, to store archived timeline instants at.
hoodie.table.type COPY_ON_WRITE The table type for the underlying data, for this write. This can’t change between writes.
hoodie.metastore.enable false Use metastore server to store hoodie table metadata
hoodie.filesystem.operation.retry.enable false Enabled to handle list/get/delete etc file system performance issue.
hoodie.consistency.check.enabled false Enabled to handle S3 eventual consistency issue. This property is no longer required since S3 is now strongly consistent. Will be removed in the future releases.
hoodie.timeline.layout.version N/A Version of timeline used, by the table.
hoodie.bootstrap.base.path N/A Base path of the dataset that needs to be bootstrapped as a Hudi table
hoodie.bootstrap.index.enable	true	Whether or not, this is a bootstrapped table, with bootstrap base data and a mapping index defined, default true.
hoodie.filesystem.view.type MEMORY File system view provides APIs for viewing the files on the underlying lake storage, as file groups and file slices. This config controls how such a view is held. Options include MEMORY,SPILLABLE_DISK,EMBEDDED_KV_STORE,REMOTE_ONLY,REMOTE_FIRST which provide different trade offs for memory usage and API request performance.
hoodie.filesystem.view.secondary.type MEMORY Specifies the secondary form of storage for file system view, if the primary (e.g timeline server) is unavailable.
hoodie.filesystem.view.remote.port 26754 Port to serve file system view queries, when remote. We expect this to be rarely hand configured.
hoodie.filesystem.view.incr.timeline.sync.enable false Controls whether or not, the file system view is incrementally updated as new actions are performed on the timeline.
hoodie.filesystem.view.rocksdb.base.path /tmp/hoodie_timeline_rocksdb Path on local storage to use, when storing file system view in embedded kv store/rocksdb.
hoodie.bootstrap.index.class org.apache.hudi.common.bootstrap.index.HFileBootstrapIndex Implementation to use, for mapping base files to bootstrap base file, that contain actual data.
hoodie.compaction.payload.class org.apache.hudi.common.model.OverwriteWithLatestAvroPayload Payload class to use for performing compactions, i.e merge delta logs with current base file and then produce a new base file.
hoodie.filesystem.view.spillable.mem 104857600 Amount of memory to be used in bytes for holding file system view, before spilling to disk.
hoodie.filesystem.view.spillable.compaction.mem.fraction 0.8 Fraction of the file system view memory, to be used for holding compaction related metadata.
hoodie.filesystem.view.spillable.bootstrap.base.file.mem.fraction 0.05 Fraction of the file system view memory, to be used for holding mapping to bootstrap base files.
hoodie.filesystem.view.spillable.replaced.mem.fraction 0.01 Fraction of the file system view memory, to be used for holding replace commit related metadata.
hoodie.filesystem.view.spillable.clustering.mem.fraction 0.01 Fraction of the file system view memory, to be used for holding clustering related metadata.
hoodie.filesystem.view.spillable.dir /tmp/ Path on local storage to use, when file system view is held in a spillable map.
hoodie.common.spillable.diskmap.type	BITCASK	When handling input data that cannot be held in memory, to merge with a file on storage, a spillable diskmap is employed. By default, we use a persistent hashmap based loosely on bitcask, that offers O(1) inserts, lookups. Change this to prefer using rocksDB, for handling the spill.
hoodie.common.diskmap.compression.enabled true Turn on compression for BITCASK disk map used by the External Spillable Map
hoodie.filesystem.operation.retry.max_interval_ms 2000 Maximum amount of time (in ms), to wait for next retry.
hoodie.filesystem.operation.retry.max_numbers 4 Maximum number of retry actions to perform, with exponential backoff.
hoodie.filesystem.operation.retry.initial_interval_ms 100 Amount of time (in ms) to wait, before retry to do operations on storage.
hoodie.table.name N/A Table name to register to Hive metastore
hoodie.table.precombine.field N/A Field used in preCombining before actual write. By default, when two records have the same key value, the largest value for the precombine field determined by Object.compareTo(..), is picked.
Loading